Spaces:
Runtime error
yourusername
committed on
Commit
•
66a6dc0
1 Parent(s):
9f5a755
:beers: cheers
- app.py +91 -0
- deepafx_st/__init__.py +4 -0
- deepafx_st/callbacks/audio.py +184 -0
- deepafx_st/callbacks/ckpt.py +33 -0
- deepafx_st/callbacks/params.py +87 -0
- deepafx_st/callbacks/plotting.py +126 -0
- deepafx_st/data/audio.py +177 -0
- deepafx_st/data/augmentations.py +235 -0
- deepafx_st/data/dataset.py +344 -0
- deepafx_st/data/proxy.py +181 -0
- deepafx_st/data/style.py +62 -0
- deepafx_st/metrics.py +157 -0
- deepafx_st/models/baselines.py +280 -0
- deepafx_st/models/controller.py +75 -0
- deepafx_st/models/efficient_net/LICENSE +202 -0
- deepafx_st/models/efficient_net/__init__.py +9 -0
- deepafx_st/models/efficient_net/model.py +419 -0
- deepafx_st/models/efficient_net/utils.py +616 -0
- deepafx_st/models/encoder.py +113 -0
- deepafx_st/models/mobilenetv2.py +226 -0
- deepafx_st/probes/cdpam_encoder.py +68 -0
- deepafx_st/probes/probe_system.py +307 -0
- deepafx_st/probes/random_mel.py +93 -0
- deepafx_st/processors/autodiff/__init__.py +0 -0
- deepafx_st/processors/autodiff/channel.py +28 -0
- deepafx_st/processors/autodiff/compressor.py +169 -0
- deepafx_st/processors/autodiff/fir.py +68 -0
- deepafx_st/processors/autodiff/peq.py +274 -0
- deepafx_st/processors/autodiff/signal.py +194 -0
- deepafx_st/processors/dsp/compressor.py +177 -0
- deepafx_st/processors/dsp/peq.py +323 -0
- deepafx_st/processors/processor.py +87 -0
- deepafx_st/processors/proxy/channel.py +130 -0
- deepafx_st/processors/proxy/proxy_system.py +289 -0
- deepafx_st/processors/proxy/tcn.py +199 -0
- deepafx_st/processors/spsa/channel.py +179 -0
- deepafx_st/processors/spsa/eps_scheduler.py +32 -0
- deepafx_st/processors/spsa/spsa_func.py +131 -0
- deepafx_st/system.py +563 -0
- deepafx_st/utils.py +277 -0
- deepafx_st/version.py +6 -0
- packages.txt +3 -0
- requirements.txt +3 -0
app.py
ADDED
@@ -0,0 +1,91 @@
import gradio as gr
import numpy as np
import resampy
import torch
import torchaudio
from huggingface_hub import hf_hub_download

from deepafx_st.system import System
from deepafx_st.utils import DSPMode

system = System.load_from_checkpoint(
    hf_hub_download("nateraw/deepafx-st-libritts-autodiff", "lit_model.ckpt"), batch_size=1
).eval()

gpu = torch.cuda.is_available()

if gpu:
    system.to("cuda")


def process(input_path, reference_path):
    # load audio data
    x, x_sr = torchaudio.load(input_path)
    r, r_sr = torchaudio.load(reference_path)

    # resample if needed
    if x_sr != 24000:
        print("Resampling to 24000 Hz...")
        x_24000 = torch.tensor(resampy.resample(x.view(-1).numpy(), x_sr, 24000))
        x_24000 = x_24000.view(1, -1)
    else:
        x_24000 = x

    if r_sr != 24000:
        print("Resampling to 24000 Hz...")
        r_24000 = torch.tensor(resampy.resample(r.view(-1).numpy(), r_sr, 24000))
        r_24000 = r_24000.view(1, -1)
    else:
        r_24000 = r

    # take the first 5 seconds and peak normalize to -12 dBFS
    x_24000 = x_24000[0:1, : 24000 * 5]
    x_24000 /= x_24000.abs().max()
    x_24000 *= 10 ** (-12 / 20.0)
    x_24000 = x_24000.view(1, 1, -1)

    # take the first 5 seconds and peak normalize to -12 dBFS
    r_24000 = r_24000[0:1, : 24000 * 5]
    r_24000 /= r_24000.abs().max()
    r_24000 *= 10 ** (-12 / 20.0)
    r_24000 = r_24000.view(1, 1, -1)

    if gpu:
        x_24000 = x_24000.to("cuda")
        r_24000 = r_24000.to("cuda")

    with torch.no_grad():
        y_hat, p, e = system(x_24000, r_24000)

    y_hat = y_hat.view(1, -1)
    y_hat /= y_hat.abs().max()
    x_24000 /= x_24000.abs().max()

    # squeeze to (T,), convert to numpy, and convert to int16
    out_audio = (32767 * y_hat).squeeze(0).detach().cpu().numpy().astype(np.int16)

    return 24000, out_audio


gr.Interface(
    fn=process,
    inputs=[gr.Audio(type="filepath"), gr.Audio(type="filepath")],
    outputs="audio",
    examples=[
        [
            hf_hub_download("nateraw/examples", "voice_raw.wav", repo_type="dataset", cache_dir="./data"),
            hf_hub_download("nateraw/examples", "voice_produced.wav", repo_type="dataset", cache_dir="./data"),
        ],
    ],
    title="DeepAFx-ST",
    description=(
        "Gradio demo for DeepAFx-ST for style transfer of audio effects with differentiable signal processing. To use it, simply"
        " upload your audio files or choose from one of the examples. Read more at the links below."
    ),
    article=(
        "<div style='text-align: center;'><a href='https://github.com/adobe-research/DeepAFx-ST' target='_blank'>Github Repo</a>"
        " <center><img src='https://visitor-badge.glitch.me/badge?page_id=nateraw_deepafx-st' alt='visitor"
        " badge'></center></div>"
    ),
    allow_flagging="never",
).launch()
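For quick local testing, the prediction function above can also be called outside of the Gradio UI. This is a minimal sketch, not part of this commit: "input.wav" and "reference.wav" are placeholder paths, and scipy.io.wavfile is used only as one convenient way to write the int16 output.

# Minimal sketch (not part of this commit): call process() directly.
# "input.wav" and "reference.wav" are placeholder paths.
import scipy.io.wavfile

sr, out_audio = process("input.wav", "reference.wav")  # returns 24000 and an int16 numpy array
scipy.io.wavfile.write("styled_output.wav", sr, out_audio)
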
deepafx_st/__init__.py
ADDED
@@ -0,0 +1,4 @@
#!/usr/bin/env python
"""Top-level module for deepafx_st"""

from .version import version as __version__
deepafx_st/callbacks/audio.py
ADDED
@@ -0,0 +1,184 @@
import auraloss
import numpy as np
import pytorch_lightning as pl

from deepafx_st.callbacks.plotting import plot_multi_spectrum
from deepafx_st.metrics import (
    LoudnessError,
    SpectralCentroidError,
    CrestFactorError,
    PESQ,
    MelSpectralDistance,
)


class LogAudioCallback(pl.callbacks.Callback):
    def __init__(self, num_examples=4, peak_normalize=True, sample_rate=22050):
        super().__init__()
        self.num_examples = num_examples
        self.peak_normalize = peak_normalize

        self.metrics = {
            "PESQ": PESQ(sample_rate),
            "MRSTFT": auraloss.freq.MultiResolutionSTFTLoss(
                fft_sizes=[32, 128, 512, 2048, 8192, 32768],
                hop_sizes=[16, 64, 256, 1024, 4096, 16384],
                win_lengths=[32, 128, 512, 2048, 8192, 32768],
                w_sc=0.0,
                w_phs=0.0,
                w_lin_mag=1.0,
                w_log_mag=1.0,
            ),
            "MSD": MelSpectralDistance(sample_rate),
            "SCE": SpectralCentroidError(sample_rate),
            "CFE": CrestFactorError(),
            "LUFS": LoudnessError(sample_rate),
        }

        self.outputs = []

    def on_validation_batch_end(
        self,
        trainer,
        pl_module,
        outputs,
        batch,
        batch_idx,
        dataloader_idx,
    ):
        """Called when the validation batch ends."""

        if outputs is not None:
            examples = np.min([self.num_examples, outputs["x"].shape[0]])
            self.outputs.append(outputs)

            if batch_idx == 0:
                for n in range(examples):
                    if batch_idx == 0:
                        self.log_audio(
                            outputs,
                            n,
                            pl_module.hparams.sample_rate,
                            pl_module.hparams.val_length,
                            trainer.global_step,
                            trainer.logger,
                        )

    def on_validation_end(self, trainer, pl_module):
        metrics = {
            "PESQ": [],
            "MRSTFT": [],
            "MSD": [],
            "SCE": [],
            "CFE": [],
            "LUFS": [],
        }
        for output in self.outputs:
            for metric_name, metric in self.metrics.items():
                try:
                    val = metric(output["y_hat"], output["y"])
                    metrics[metric_name].append(val)
                except:
                    pass

        # log final mean metrics
        for metric_name, metric in metrics.items():
            val = np.mean(metric)
            trainer.logger.experiment.add_scalar(
                f"metrics/{metric_name}", val, trainer.global_step
            )

        # clear outputs
        self.outputs = []

    def compute_metrics(self, metrics_dict, outputs, batch_idx, global_step):
        # extract audio
        y = outputs["y"][batch_idx, ...].float()
        y_hat = outputs["y_hat"][batch_idx, ...].float()

        # compute all metrics
        for metric_name, metric in self.metrics.items():
            try:
                val = metric(y_hat.view(1, 1, -1), y.view(1, 1, -1))
                metrics_dict[metric_name].append(val)
            except:
                pass

    def log_audio(self, outputs, batch_idx, sample_rate, n_fft, global_step, logger):
        x = outputs["x"][batch_idx, ...].float()
        y = outputs["y"][batch_idx, ...].float()
        y_hat = outputs["y_hat"][batch_idx, ...].float()

        if self.peak_normalize:
            x /= x.abs().max()
            y /= y.abs().max()
            y_hat /= y_hat.abs().max()

        logger.experiment.add_audio(
            f"x/{batch_idx+1}",
            x[0:1, :],
            global_step,
            sample_rate=sample_rate,
        )

        logger.experiment.add_audio(
            f"y/{batch_idx+1}",
            y[0:1, :],
            global_step,
            sample_rate=sample_rate,
        )

        logger.experiment.add_audio(
            f"y_hat/{batch_idx+1}",
            y_hat[0:1, :],
            global_step,
            sample_rate=sample_rate,
        )

        if "y_ref" in outputs:
            y_ref = outputs["y_ref"][batch_idx, ...].float()

            if self.peak_normalize:
                y_ref /= y_ref.abs().max()

            logger.experiment.add_audio(
                f"y_ref/{batch_idx+1}",
                y_ref[0:1, :],
                global_step,
                sample_rate=sample_rate,
            )
        logger.experiment.add_image(
            f"spec/{batch_idx+1}",
            compare_spectra(
                y_hat[0:1, :],
                y[0:1, :],
                x[0:1, :],
                sample_rate=sample_rate,
                n_fft=n_fft,
            ),
            global_step,
        )


def compare_spectra(
    deepafx_y_hat, y, x, baseline_y_hat=None, sample_rate=44100, n_fft=16384
):
    legend = ["Corrupted"]
    signals = [x]
    if baseline_y_hat is not None:
        legend.append("Baseline")
        signals.append(baseline_y_hat)

    legend.append("DeepAFx")
    signals.append(deepafx_y_hat)
    legend.append("Target")
    signals.append(y)

    image = plot_multi_spectrum(
        ys=signals,
        legend=legend,
        sample_rate=sample_rate,
        n_fft=n_fft,
    )

    return image
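A minimal sketch of how the callback above could be attached to a PyTorch Lightning Trainer; the Trainer arguments and the `system`/`dm` names are illustrative, not taken from this commit.

# Rough sketch: attach the audio logging callback during validation.
import pytorch_lightning as pl
from deepafx_st.callbacks.audio import LogAudioCallback

trainer = pl.Trainer(
    max_epochs=1,  # illustrative setting
    callbacks=[LogAudioCallback(num_examples=4, sample_rate=24000)],
)
# trainer.fit(system, datamodule=dm)  # `system` and `dm` are assumed to exist elsewhere
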
deepafx_st/callbacks/ckpt.py
ADDED
@@ -0,0 +1,33 @@
import os
import sys
import shutil
import pytorch_lightning as pl


class CopyPretrainedCheckpoints(pl.callbacks.Callback):
    def __init__(self):
        super().__init__()

    def on_fit_start(self, trainer, pl_module):
        """Before training, move the pre-trained checkpoints
        to the current checkpoint directory.

        """
        # copy any pre-trained checkpoints to new directory
        if pl_module.hparams.processor_model == "proxy":
            pretrained_ckpt_dir = os.path.join(
                pl_module.logger.experiment.log_dir, "pretrained_checkpoints"
            )
            if not os.path.isdir(pretrained_ckpt_dir):
                os.makedirs(pretrained_ckpt_dir)
            cp_proxy_ckpts = []
            for proxy_ckpt in pl_module.hparams.proxy_ckpts:
                new_ckpt = shutil.copy(
                    proxy_ckpt,
                    pretrained_ckpt_dir,
                )
                cp_proxy_ckpts.append(new_ckpt)
                print(f"Moved checkpoint to {new_ckpt}.")
            # overwrite to the paths in current experiment logs
            pl_module.hparams.proxy_ckpts = cp_proxy_ckpts
            print(pl_module.hparams.proxy_ckpts)
deepafx_st/callbacks/params.py
ADDED
@@ -0,0 +1,87 @@
import numpy as np
import pytorch_lightning as pl
import matplotlib.pyplot as plt

import deepafx_st.utils as utils


class LogParametersCallback(pl.callbacks.Callback):
    def __init__(self, num_examples=4):
        super().__init__()
        self.num_examples = num_examples

    def on_validation_epoch_start(self, trainer, pl_module):
        """At the start of validation init storage for parameters."""
        self.params = []

    def on_validation_batch_end(
        self,
        trainer,
        pl_module,
        outputs,
        batch,
        batch_idx,
        dataloader_idx,
    ):
        """Called when the validation batch ends.

        Here we log the parameters only from the first batch.

        """
        if outputs is not None and batch_idx == 0:
            examples = np.min([self.num_examples, outputs["x"].shape[0]])
            for n in range(examples):
                self.log_parameters(
                    outputs,
                    n,
                    pl_module.processor.ports,
                    trainer.global_step,
                    trainer.logger,
                    True if batch_idx == 0 else False,
                )

    def on_validation_epoch_end(self, trainer, pl_module):
        pass

    def log_parameters(self, outputs, batch_idx, ports, global_step, logger, log=True):
        p = outputs["p"][batch_idx, ...]

        table = ""

        # table += f"""## {plugin["name"]}\n"""
        table += "| Index| Name | Value | Units | Min | Max | Default | Raw Value | \n"
        table += "|------|------|------:|:------|----:|----:|--------:| ---------:| \n"

        start_idx = 0
        # set plugin parameters based on provided normalized parameters
        for port_list in ports:
            for pidx, port in enumerate(port_list):
                param_max = port["max"]
                param_min = port["min"]
                param_name = port["name"]
                param_default = port["default"]
                param_units = port["units"]

                param_val = p[start_idx]
                denorm_val = utils.denormalize(param_val, param_max, param_min)

                # add values to table in row
                table += f"| {start_idx + 1} | {param_name} "
                if np.abs(denorm_val) > 10:
                    table += f"| {denorm_val:0.1f} "
                    table += f"| {param_units} "
                    table += f"| {param_min:0.1f} | {param_max:0.1f} "
                    table += f"| {param_default:0.1f} "
                else:
                    table += f"| {denorm_val:0.3f} "
                    table += f"| {param_units} "
                    table += f"| {param_min:0.3f} | {param_max:0.3f} "
                    table += f"| {param_default:0.3f} "

                table += f"| {np.squeeze(param_val):0.2f} | \n"
                start_idx += 1

        table += "\n\n"

        if log:
            logger.experiment.add_text(f"params/{batch_idx+1}", table, global_step)
deepafx_st/callbacks/plotting.py
ADDED
@@ -0,0 +1,126 @@
import io
import torch
import PIL.Image
import numpy as np
import scipy.signal
import librosa.display
import matplotlib.pyplot as plt

from torch.functional import Tensor
from torchvision.transforms import ToTensor


def compute_comparison_spectrogram(
    x: np.ndarray,
    y: np.ndarray,
    sample_rate: float = 44100,
    n_fft: int = 2048,
    hop_length: int = 1024,
) -> Tensor:
    X = librosa.stft(x, n_fft=n_fft, hop_length=hop_length)
    X_db = librosa.amplitude_to_db(np.abs(X), ref=np.max)

    Y = librosa.stft(y, n_fft=n_fft, hop_length=hop_length)
    Y_db = librosa.amplitude_to_db(np.abs(Y), ref=np.max)

    fig, axs = plt.subplots(figsize=(9, 6), nrows=2)
    img = librosa.display.specshow(
        X_db,
        ax=axs[0],
        hop_length=hop_length,
        x_axis="time",
        y_axis="log",
        sr=sample_rate,
    )
    # fig.colorbar(img, ax=axs[0])
    img = librosa.display.specshow(
        Y_db,
        ax=axs[1],
        hop_length=hop_length,
        x_axis="time",
        y_axis="log",
        sr=sample_rate,
    )
    # fig.colorbar(img, ax=axs[1])

    plt.tight_layout()

    buf = io.BytesIO()
    plt.savefig(buf, format="jpeg")
    buf.seek(0)
    image = PIL.Image.open(buf)
    image = ToTensor()(image)
    plt.close("all")

    return image


def plot_multi_spectrum(
    ys=None,
    Hs=None,
    legend=[],
    title="Spectrum",
    filename=None,
    sample_rate=44100,
    n_fft=1024,
    zero_mean=False,
):

    if Hs is None:
        Hs = []
        for y in ys:
            X = get_average_spectrum(y, n_fft)
            X_sm = smooth_spectrum(X)
            Hs.append(X_sm)

    bin_width = (sample_rate / 2) / (n_fft // 2)
    freqs = np.arange(0, (sample_rate / 2) + bin_width, step=bin_width)

    fig, ax1 = plt.subplots()

    for idx, H in enumerate(Hs):
        H = np.nan_to_num(H)
        H = np.clip(H, 0, np.max(H))
        H_dB = 20 * np.log10(H + 1e-8)
        if zero_mean:
            H_dB -= np.mean(H_dB)
        if "Target" in legend[idx]:
            ax1.plot(freqs, H_dB, linestyle="--", color="k")
        else:
            ax1.plot(freqs, H_dB)

    plt.legend(legend)

    ax1.set_xscale("log")
    ax1.set_ylim([-80, 0])
    ax1.set_xlim([100, 11000])
    plt.title(title)
    plt.ylabel("Magnitude (dB)")
    plt.xlabel("Frequency (Hz)")
    plt.grid(c="lightgray", which="both")

    if filename is not None:
        plt.savefig(f"{filename}.png", dpi=300)

    plt.tight_layout()

    buf = io.BytesIO()
    plt.savefig(buf, format="jpeg")
    buf.seek(0)
    image = PIL.Image.open(buf)
    image = ToTensor()(image)
    plt.close("all")

    return image


def smooth_spectrum(H):
    # apply Savgol filter for smoothed target curve
    return scipy.signal.savgol_filter(H, 1025, 2)


def get_average_spectrum(x, n_fft):
    X = torch.stft(x, n_fft, return_complex=True, normalized=True)
    X = X.abs()  # convert to magnitude
    X = X.mean(dim=-1).view(-1)  # average across frames
    return X
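A minimal sketch of exercising plot_multi_spectrum on synthetic signals; the signal length, sample rate, and n_fft are placeholders chosen only so the 1025-point Savitzky-Golay window in smooth_spectrum fits the number of frequency bins.

# Minimal sketch with synthetic mono signals (placeholders for real audio).
import torch
from deepafx_st.callbacks.plotting import plot_multi_spectrum

a = torch.randn(120000) * 0.1  # placeholder "input"
b = torch.randn(120000) * 0.1  # placeholder "target"
image = plot_multi_spectrum(
    ys=[a, b],
    legend=["Input", "Target"],
    sample_rate=24000,
    n_fft=4096,  # 2049 bins, larger than the 1025-point smoothing window
)
print(image.shape)  # (C, H, W) tensor, ready for logger.experiment.add_image
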
deepafx_st/data/audio.py
ADDED
@@ -0,0 +1,177 @@
import os
import glob
import torch
import warnings
import torchaudio
import pyloudnorm as pyln


class AudioFile(object):
    def __init__(self, filepath, preload=False, half=False, target_loudness=None):
        """Base class for audio files to handle metadata and loading.

        Args:
            filepath (str): Path to audio file to load from disk.
            preload (bool, optional): If set, load audio data into RAM. Default: False
            half (bool, optional): If set, store audio data as float16 to save space. Default: False
            target_loudness (float, optional): Loudness normalize to dB LUFS value. Default: None
        """
        super().__init__()

        self.filepath = filepath
        self.half = half
        self.target_loudness = target_loudness
        self.loaded = False

        if preload:
            self.load()
            num_frames = self.audio.shape[-1]
            num_channels = self.audio.shape[0]
        else:
            metadata = torchaudio.info(filepath)
            audio = None
            self.sample_rate = metadata.sample_rate
            num_frames = metadata.num_frames
            num_channels = metadata.num_channels

        self.num_frames = num_frames
        self.num_channels = num_channels

    def load(self):
        audio, sr = torchaudio.load(self.filepath, normalize=True)
        self.audio = audio
        self.sample_rate = sr

        if self.target_loudness is not None:
            self.loudness_normalize()

        if self.half:
            self.audio = audio.half()

        self.loaded = True

    def loudness_normalize(self):
        meter = pyln.Meter(self.sample_rate)

        # convert mono to stereo
        if self.audio.shape[0] == 1:
            tmp_audio = self.audio.repeat(2, 1)
        else:
            tmp_audio = self.audio

        # measure integrated loudness
        input_loudness = meter.integrated_loudness(tmp_audio.numpy().T)

        # compute and apply gain
        gain_dB = self.target_loudness - input_loudness
        gain_ln = 10 ** (gain_dB / 20.0)
        self.audio *= gain_ln

        # check for potentially clipped samples
        if self.audio.abs().max() >= 1.0:
            warnings.warn("Possible clipped samples in output.")


class AudioFileDataset(torch.utils.data.Dataset):
    """Base class for audio file datasets loaded from disk.

    Datasets can be either paired or unpaired. A paired dataset requires passing the `target_dirs` paths.

    Args:
        input_dirs (List[str]): List of paths to the directories containing input audio files.
        target_dirs (List[str], optional): List of paths to the directories containing corresponding audio files. Default: []
        subset (str, optional): Dataset subset. One of ["train", "val", "test"]. Default: "train"
        length (int, optional): Number of samples to load for each example. Default: 65536
        normalize (bool, optional): Normalize audio amplitude to -1 to 1. Default: True
        train_per (float, optional): Fraction of the files to use for training subset. Default: 0.8
        val_per (float, optional): Fraction of the files to use for validation subset. Default: 0.1
        preload (bool, optional): Read audio files into RAM at the start of training. Default: False
        num_examples_per_epoch (int, optional): Define an epoch as certain number of audio examples. Default: 10000
        ext (str, optional): Expected audio file extension. Default: "wav"
    """

    def __init__(
        self,
        input_dirs,
        target_dirs=[],
        subset="train",
        length=65536,
        normalize=True,
        train_per=0.8,
        val_per=0.1,
        preload=False,
        num_examples_per_epoch=10000,
        ext="wav",
    ):
        super().__init__()
        self.input_dirs = input_dirs
        self.target_dirs = target_dirs
        self.subset = subset
        self.length = length
        self.normalize = normalize
        self.train_per = train_per
        self.val_per = val_per
        self.preload = preload
        self.num_examples_per_epoch = num_examples_per_epoch
        self.ext = ext

        self.input_filepaths = []
        for input_dir in input_dirs:
            search_path = os.path.join(input_dir, f"*.{ext}")
            self.input_filepaths += glob.glob(search_path)
        self.input_filepaths = sorted(self.input_filepaths)

        self.target_filepaths = []
        for target_dir in target_dirs:
            search_path = os.path.join(target_dir, f"*.{ext}")
            self.target_filepaths += glob.glob(search_path)
        self.target_filepaths = sorted(self.target_filepaths)

        # both sets must have same number of files in paired dataset
        assert len(self.target_filepaths) == len(self.input_filepaths)

        # get details about audio files
        self.input_files = []
        for input_filepath in self.input_filepaths:
            self.input_files.append(
                AudioFile(input_filepath, preload=preload)
            )

        self.target_files = []
        if len(target_dirs) > 0:
            for target_filepath in self.target_filepaths:
                self.target_files.append(
                    AudioFile(target_filepath, preload=preload)
                )

    def __len__(self):
        return self.num_examples_per_epoch

    def __getitem__(self, idx):
        """ """

        # index the current audio file
        input_file = self.input_files[idx]

        # load the audio data if needed
        if not input_file.loaded:
            input_file.load()

        # get a random patch of size `self.length`
        start_idx = int(torch.rand(1) * (input_file.num_frames - self.length))
        stop_idx = start_idx + self.length
        input_audio = input_file.audio[:, start_idx:stop_idx]

        # if there is a target file, get it (and load)
        if len(self.target_files) > 0:
            target_file = self.target_files[idx]

            if not target_file.loaded:
                target_file.load()

            # use the same cropping indices
            target_audio = target_file.audio[:, start_idx:stop_idx]

            return input_audio, target_audio
        else:
            return input_audio
deepafx_st/data/augmentations.py
ADDED
@@ -0,0 +1,235 @@
import torch
import torchaudio
import numpy as np


def gain(xs, min_dB=-12, max_dB=12):

    gain_dB = (torch.rand(1) * (max_dB - min_dB)) + min_dB
    gain_ln = 10 ** (gain_dB / 20)

    for idx, x in enumerate(xs):
        xs[idx] = x * gain_ln

    return xs


def peaking_filter(xs, sr=44100, frequency=1000, width_q=0.707, gain_db=12):

    # gain_db = ((torch.rand(1) * 6) + 6).numpy().squeeze()
    # width_q = (torch.rand(1) * 4).numpy().squeeze()
    # frequency = ((torch.rand(1) * 9960) + 40).numpy().squeeze()

    # if torch.rand(1) > 0.5:
    #     gain_db = -gain_db

    effects = [["equalizer", f"{frequency}", f"{width_q}", f"{gain_db}"]]

    for idx, x in enumerate(xs):
        y, sr = torchaudio.sox_effects.apply_effects_tensor(
            x, sr, effects, channels_first=True
        )
        xs[idx] = y

    return xs


def pitch_shift(xs, min_shift=-200, max_shift=200, sr=44100):

    shift = min_shift + (torch.rand(1)).numpy().squeeze() * (max_shift - min_shift)

    effects = [["pitch", f"{shift}"]]

    for idx, x in enumerate(xs):
        y, sr = torchaudio.sox_effects.apply_effects_tensor(
            x, sr, effects, channels_first=True
        )
        xs[idx] = y

    return xs


def time_stretch(xs, min_stretch=0.8, max_stretch=1.2, sr=44100):

    stretch = min_stretch + (torch.rand(1)).numpy().squeeze() * (
        max_stretch - min_stretch
    )

    effects = [["tempo", f"{stretch}"]]
    for idx, x in enumerate(xs):
        y, sr = torchaudio.sox_effects.apply_effects_tensor(
            x, sr, effects, channels_first=True
        )
        xs[idx] = y

    return xs


def frequency_corruption(xs, sr=44100):

    effects = []

    # apply a random number of peaking bands from 0 to 4
    bands = [[200, 2000], [800, 4000], [2000, 8000], [4000, int((sr // 2) * 0.9)]]
    total_gain_db = 0.0
    for band in bands:
        if torch.rand(1).sum() > 0.2:
            frequency = (torch.randint(band[0], band[1], [1])).numpy().squeeze()
            width_q = ((torch.rand(1) * 10) + 0.1).numpy().squeeze()
            gain_db = ((torch.rand(1) * 48)).numpy().squeeze()

            if torch.rand(1).sum() > 0.5:
                gain_db = -gain_db

            total_gain_db += gain_db

            if np.abs(total_gain_db) >= 24:
                continue

            cmd = ["equalizer", f"{frequency}", f"{width_q}", f"{gain_db}"]
            effects.append(cmd)

    # low shelf (bass)
    if torch.rand(1).sum() > 0.2:
        gain_db = ((torch.rand(1) * 24)).numpy().squeeze()
        frequency = (torch.randint(20, 200, [1])).numpy().squeeze()
        if torch.rand(1).sum() > 0.5:
            gain_db = -gain_db
        effects.append(["bass", f"{gain_db}", f"{frequency}"])

    # high shelf (treble)
    if torch.rand(1).sum() > 0.2:
        gain_db = ((torch.rand(1) * 24)).numpy().squeeze()
        frequency = (torch.randint(4000, int((sr // 2) * 0.9), [1])).numpy().squeeze()
        if torch.rand(1).sum() > 0.5:
            gain_db = -gain_db
        effects.append(["treble", f"{gain_db}", f"{frequency}"])

    for idx, x in enumerate(xs):
        y, sr = torchaudio.sox_effects.apply_effects_tensor(
            x.view(1, -1) * 10 ** (-48 / 20), sr, effects, channels_first=True
        )
        # apply gain back
        y *= 10 ** (48 / 20)

        xs[idx] = y

    return xs


def dynamic_range_corruption(xs, sr=44100):
    """Apply an expander."""

    attack = (torch.rand([1]).numpy()[0] * 0.05) + 0.001
    release = (torch.rand([1]).numpy()[0] * 0.2) + attack
    knee = (torch.rand([1]).numpy()[0] * 12) + 0.0

    # design the compressor transfer function
    start = -100.0
    threshold = -(
        (torch.rand([1]).numpy()[0] * 20) + 10
    )  # threshold from -30 to -10 dB
    ratio = (torch.rand([1]).numpy()[0] * 4.0) + 1  # ratio from 1:1 to 5:1

    # compute the transfer curve
    point = -((-threshold / -ratio) + (-start / ratio) + -threshold)

    # apply some makeup gain
    makeup = torch.rand([1]).numpy()[0] * 6

    effects = [
        [
            "compand",
            f"{attack},{release}",
            f"{knee}:{point},{start},{threshold},{threshold}",
            f"{makeup}",
            f"{start}",
        ]
    ]

    for idx, x in enumerate(xs):
        # if the input is clipping normalize it
        if x.abs().max() >= 1.0:
            x /= x.abs().max()
            gain_db = -((torch.rand(1) * 24)).numpy().squeeze()
            x *= 10 ** (gain_db / 20.0)

        y, sr = torchaudio.sox_effects.apply_effects_tensor(
            x.view(1, -1), sr, effects, channels_first=True
        )
        xs[idx] = y

    return xs


def dynamic_range_compression(xs, sr=44100):
    """Apply a compressor."""

    attack = (torch.rand([1]).numpy()[0] * 0.05) + 0.0005
    release = (torch.rand([1]).numpy()[0] * 0.2) + attack
    knee = (torch.rand([1]).numpy()[0] * 12) + 0.0

    # design the compressor transfer function
    start = -100.0
    threshold = -((torch.rand([1]).numpy()[0] * 52) + 12)
    # threshold from -64 to -12 dB
    ratio = (torch.rand([1]).numpy()[0] * 10.0) + 1  # ratio from 1:1 to 10:1

    # compute the transfer curve
    point = threshold * (1 - (1 / ratio))

    # apply some makeup gain
    makeup = torch.rand([1]).numpy()[0] * 6

    effects = [
        [
            "compand",
            f"{attack},{release}",
            f"{knee}:{start},{threshold},{threshold},0,{point}",
            f"{makeup}",
            f"{start}",
            f"{attack}",
        ]
    ]

    for idx, x in enumerate(xs):
        y, sr = torchaudio.sox_effects.apply_effects_tensor(
            x.view(1, -1), sr, effects, channels_first=True
        )
        xs[idx] = y

    return xs


def lowpass_filter(xs, sr=44100, frequency=4000):
    effects = [["lowpass", f"{frequency}"]]

    for idx, x in enumerate(xs):
        y, sr = torchaudio.sox_effects.apply_effects_tensor(
            x, sr, effects, channels_first=True
        )
        xs[idx] = y

    return xs


def apply(xs, sr, augmentations):

    # iterate over augmentation dict
    for aug, params in augmentations.items():
        if aug == "gain":
            xs = gain(xs, **params)
        elif aug == "peak":
            xs = peaking_filter(xs, **params)
        elif aug == "lowpass":
            xs = lowpass_filter(xs, **params)
        elif aug == "pitch":
            xs = pitch_shift(xs, **params)
        elif aug == "tempo":
            xs = time_stretch(xs, **params)
        elif aug == "freq_corrupt":
            xs = frequency_corruption(xs, **params)
        else:
            raise RuntimeError(f"Invalid augmentation: {aug}")

    return xs
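A minimal sketch of driving the dispatcher in apply() with an augmentation dict; the parameter values and the 24 kHz synthetic clip are placeholders, not the project's training configuration.

# Minimal sketch: apply a random gain and a lowpass filter to one clip.
import torch
from deepafx_st.data import augmentations

clip = torch.randn(1, 24000 * 2) * 0.1  # placeholder 2 s mono clip at 24 kHz
out = augmentations.apply(
    [clip],
    24000,
    {
        "gain": {"min_dB": -6, "max_dB": 6},
        "lowpass": {"sr": 24000, "frequency": 8000},
    },
)[0]
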
deepafx_st/data/dataset.py
ADDED
@@ -0,0 +1,344 @@
import os
import sys
import csv
import glob
import torch
import random
from tqdm import tqdm
from typing import List, Any

from deepafx_st.data.audio import AudioFile
import deepafx_st.utils as utils
import deepafx_st.data.augmentations as augmentations


class AudioDataset(torch.utils.data.Dataset):
    """Audio dataset which returns an input and target file.

    Args:
        audio_dir (str): Path to the top level of the audio dataset.
        input_dirs (List[str], optional): List of paths to the directories containing input audio files. Default: ["cleanraw"]
        subset (str, optional): Dataset subset. One of ["train", "val", "test"]. Default: "train"
        length (int, optional): Number of samples to load for each example. Default: 65536
        train_frac (float, optional): Fraction of the files to use for training subset. Default: 0.8
        val_per (float, optional): Fraction of the files to use for validation subset. Default: 0.1
        buffer_size_gb (float, optional): Size of audio to read into RAM in GB at any given time. Default: 10.0
            Note: This is the buffer size PER DataLoader worker. So total RAM = buffer_size_gb * num_workers
        buffer_reload_rate (int, optional): Number of items to generate before loading next chunk of dataset. Default: 10000
        half (bool, optional): Store audio samples as float16. Default: False
        num_examples_per_epoch (int, optional): Define an epoch as certain number of audio examples. Default: 10000
        random_scale_input (bool, optional): Apply random gain scaling to input utterances. Default: False
        random_scale_target (bool, optional): Apply same random gain scaling to target utterances. Default: False
        augmentations (dict, optional): List of augmentation types to apply to inputs. Default: {}
        freq_corrupt (bool, optional): Apply bad EQ filters. Default: False
        drc_corrupt (bool, optional): Apply an expander to corrupt dynamic range. Default: False
        ext (str, optional): Expected audio file extension. Default: "wav"
    """

    def __init__(
        self,
        audio_dir,
        input_dirs: List[str] = ["cleanraw"],
        subset: str = "train",
        length: int = 65536,
        train_frac: float = 0.8,
        val_per: float = 0.1,
        buffer_size_gb: float = 1.0,
        buffer_reload_rate: float = 1000,
        half: bool = False,
        num_examples_per_epoch: int = 10000,
        random_scale_input: bool = False,
        random_scale_target: bool = False,
        augmentations: dict = {},
        freq_corrupt: bool = False,
        drc_corrupt: bool = False,
        ext: str = "wav",
    ):
        super().__init__()
        self.audio_dir = audio_dir
        self.dataset_name = os.path.basename(audio_dir)
        self.input_dirs = input_dirs
        self.subset = subset
        self.length = length
        self.train_frac = train_frac
        self.val_per = val_per
        self.buffer_size_gb = buffer_size_gb
        self.buffer_reload_rate = buffer_reload_rate
        self.half = half
        self.num_examples_per_epoch = num_examples_per_epoch
        self.random_scale_input = random_scale_input
        self.random_scale_target = random_scale_target
        self.augmentations = augmentations
        self.freq_corrupt = freq_corrupt
        self.drc_corrupt = drc_corrupt
        self.ext = ext

        self.input_filepaths = []
        for input_dir in input_dirs:
            search_path = os.path.join(audio_dir, input_dir, f"*.{ext}")
            self.input_filepaths += glob.glob(search_path)
        self.input_filepaths = sorted(self.input_filepaths)

        # create dataset split based on subset
        self.input_filepaths = utils.split_dataset(
            self.input_filepaths,
            subset,
            train_frac,
        )

        # get details about input audio files
        input_files = {}
        input_dur_frames = 0
        for input_filepath in tqdm(self.input_filepaths, ncols=80):
            file_id = os.path.basename(input_filepath)
            audio_file = AudioFile(
                input_filepath,
                preload=False,
                half=half,
            )
            if audio_file.num_frames < (self.length * 2):
                continue
            input_files[file_id] = audio_file
            input_dur_frames += input_files[file_id].num_frames

        if len(list(input_files.items())) < 1:
            raise RuntimeError(f"No files found in {search_path}.")

        input_dur_hr = (input_dur_frames / input_files[file_id].sample_rate) / 3600
        print(
            f"\nLoaded {len(input_files)} files for {subset} = {input_dur_hr:0.2f} hours."
        )

        self.sample_rate = input_files[file_id].sample_rate

        # save a csv file with details about the train and test split
        splits_dir = os.path.join("configs", "splits")
        if not os.path.isdir(splits_dir):
            os.makedirs(splits_dir)
        csv_filepath = os.path.join(splits_dir, f"{self.dataset_name}_{self.subset}_set.csv")

        with open(csv_filepath, "w") as fp:
            dw = csv.DictWriter(fp, ["file_id", "filepath", "type", "subset"])
            dw.writeheader()
            for input_filepath in self.input_filepaths:
                dw.writerow(
                    {
                        "file_id": self.get_file_id(input_filepath),
                        "filepath": input_filepath,
                        "type": "input",
                        "subset": self.subset,
                    }
                )

        # some setup for iterable loading of the dataset into RAM
        self.items_since_load = self.buffer_reload_rate

    def __len__(self):
        return self.num_examples_per_epoch

    def load_audio_buffer(self):
        self.input_files_loaded = {}  # clear audio buffer
        self.items_since_load = 0  # reset iteration counter
        nbytes_loaded = 0  # counter for data in RAM

        # different subset in each
        random.shuffle(self.input_filepaths)

        # load files into RAM
        for input_filepath in self.input_filepaths:
            file_id = os.path.basename(input_filepath)
            audio_file = AudioFile(
                input_filepath,
                preload=True,
                half=self.half,
            )

            if audio_file.num_frames < (self.length * 2):
                continue

            self.input_files_loaded[file_id] = audio_file

            nbytes = audio_file.audio.element_size() * audio_file.audio.nelement()
            nbytes_loaded += nbytes

            # check the size of loaded data
            if nbytes_loaded > self.buffer_size_gb * 1e9:
                break

    def generate_pair(self):
        # ------------------------ Input audio ----------------------
        rand_input_file_id = None
        input_file = None
        start_idx = None
        stop_idx = None
        while True:
            rand_input_file_id = self.get_random_file_id(self.input_files_loaded.keys())

            # use this random key to retrieve an input file
            input_file = self.input_files_loaded[rand_input_file_id]

            # load the audio data if needed
            if not input_file.loaded:
                raise RuntimeError("Audio not loaded.")

            # get a random patch of size `self.length` x 2
            start_idx, stop_idx = self.get_random_patch(
                input_file, int(self.length * 2)
            )
            if start_idx >= 0:
                break

        input_audio = input_file.audio[:, start_idx:stop_idx].clone().detach()
        input_audio = input_audio.view(1, -1)

        if self.half:
            input_audio = input_audio.float()

        # peak normalize to -12 dBFS
        input_audio /= input_audio.abs().max()
        input_audio *= 10 ** (-12.0 / 20)  # with min 3 dBFS headroom

        if len(list(self.augmentations.items())) > 0:
            if torch.rand(1).sum() < 0.5:
                input_audio_aug = augmentations.apply(
                    [input_audio],
                    self.sample_rate,
                    self.augmentations,
                )[0]
            else:
                input_audio_aug = input_audio.clone()
        else:
            input_audio_aug = input_audio.clone()

        input_audio_corrupt = input_audio_aug.clone()
        # apply frequency and dynamic range corruption (expander)
        if self.freq_corrupt and torch.rand(1).sum() < 0.75:
            input_audio_corrupt = augmentations.frequency_corruption(
                [input_audio_corrupt], self.sample_rate
            )[0]

        # peak normalize again before passing through dynamic range expander
        input_audio_corrupt /= input_audio_corrupt.abs().max()
        input_audio_corrupt *= 10 ** (-12.0 / 20)  # with min 3 dBFS headroom

        if self.drc_corrupt and torch.rand(1).sum() < 0.10:
            input_audio_corrupt = augmentations.dynamic_range_corruption(
                [input_audio_corrupt], self.sample_rate
            )[0]

        # ------------------------ Target audio ----------------------
        # use the same augmented audio clip, add different random EQ and compressor

        target_audio_corrupt = input_audio_aug.clone()
        # apply frequency and dynamic range corruption (compressor)
        if self.freq_corrupt and torch.rand(1).sum() < 0.75:
            target_audio_corrupt = augmentations.frequency_corruption(
                [target_audio_corrupt], self.sample_rate
            )[0]

        # peak normalize again before passing through dynamic range compressor
        target_audio_corrupt /= target_audio_corrupt.abs().max()
        target_audio_corrupt *= 10 ** (-12.0 / 20)  # with min 3 dBFS headroom

        if self.drc_corrupt and torch.rand(1).sum() < 0.75:
            target_audio_corrupt = augmentations.dynamic_range_compression(
                [target_audio_corrupt], self.sample_rate
            )[0]

        return input_audio_corrupt, target_audio_corrupt

    def __getitem__(self, _):
        """ """

        # increment counter
        self.items_since_load += 1

        # load next chunk into buffer if needed
        if self.items_since_load > self.buffer_reload_rate:
            self.load_audio_buffer()

        # generate pairs for style training
        input_audio, target_audio = self.generate_pair()

        # ------------------------ Conform length of files -------------------
        input_audio = utils.conform_length(input_audio, int(self.length * 2))
        target_audio = utils.conform_length(target_audio, int(self.length * 2))

        # ------------------------ Apply fade in and fade out -------------------
        input_audio = utils.linear_fade(input_audio, sample_rate=self.sample_rate)
        target_audio = utils.linear_fade(target_audio, sample_rate=self.sample_rate)

        # ------------------------ Final normalization ----------------------
        # always peak normalize final input to -12 dBFS
        input_audio /= input_audio.abs().max()
        input_audio *= 10 ** (-12.0 / 20.0)

        # always peak normalize the target to -12 dBFS
        target_audio /= target_audio.abs().max()
        target_audio *= 10 ** (-12.0 / 20.0)

        return input_audio, target_audio

    @staticmethod
    def get_random_file_id(keys):
        # generate a random index into the keys of the input files
        rand_input_idx = torch.randint(0, len(keys) - 1, [1])[0]
        # find the key (file_id) corresponding to the random index
        rand_input_file_id = list(keys)[rand_input_idx]

        return rand_input_file_id

    @staticmethod
    def get_random_patch(audio_file, length, check_silence=True):
        silent = True
        count = 0
        while silent:
            count += 1
            start_idx = torch.randint(0, audio_file.num_frames - length - 1, [1])[0]
            # int(torch.rand(1) * (audio_file.num_frames - length))
            stop_idx = start_idx + length
            patch = audio_file.audio[:, start_idx:stop_idx].clone().detach()

            length = patch.shape[-1]
            first_patch = patch[..., : length // 2]
            second_patch = patch[..., length // 2 :]

            if (
                (first_patch**2).mean() > 1e-5 and (second_patch**2).mean() > 1e-5
            ) or not check_silence:
                silent = False

            if count > 100:
                print("get_random_patch count", count)
                return -1, -1
                # break

        return start_idx, stop_idx

    def get_file_id(self, filepath):
        """Given a filepath extract the DAPS file id.

        Args:
            filepath (str): Path to an audio file in the DAPS dataset.

        Returns:
            file_id (str): DAPS file id of the form <participant_id>_<script_id>
        """
        file_id = os.path.basename(filepath).split("_")[:2]
        file_id = "_".join(file_id)
        return file_id

    def get_file_set(self, filepath):
        """Given a filepath extract the DAPS file set name.

        Args:
            filepath (str): Path to an audio file in the DAPS dataset.

        Returns:
            file_set (str): The DAPS set to which the file belongs.
        """
        file_set = os.path.basename(filepath).split("_")[2:]
        file_set = "_".join(file_set)
        file_set = file_set.replace(f".{self.ext}", "")
        return file_set
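A minimal sketch of wrapping AudioDataset in a DataLoader; the dataset root path is a placeholder and the remaining arguments simply echo the defaults above.

# Minimal sketch: build corrupted input/target pairs for style training.
import torch
from deepafx_st.data.dataset import AudioDataset

dataset = AudioDataset(
    "/path/to/daps",            # placeholder dataset root
    input_dirs=["cleanraw"],
    subset="train",
    length=65536,
    freq_corrupt=True,
    drc_corrupt=True,
)
loader = torch.utils.data.DataLoader(dataset, batch_size=4, num_workers=0)
x, y = next(iter(loader))  # shapes: (4, 1, 2 * length)
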
deepafx_st/data/proxy.py
ADDED
@@ -0,0 +1,181 @@
import os
import json
import glob
import torch
import random
from tqdm import tqdm

# from deepafx_st.plugins.channel import Channel
from deepafx_st.processors.processor import Processor
from deepafx_st.data.audio import AudioFile
import deepafx_st.utils as utils


class DSPProxyDataset(torch.utils.data.Dataset):
    """Class for generating input-output audio from Python DSP effects.

    Args:
        input_dir (List[str]): List of paths to the directories containing input audio files.
        processor (Processor): Processor object to create proxy of.
        processor_type (str): Processor name.
        subset (str, optional): Dataset subset. One of ["train", "val", "test"]. Default: "train"
        buffer_size_gb (float, optional): Size of audio to read into RAM in GB at any given time. Default: 10.0
            Note: This is the buffer size PER DataLoader worker. So total RAM = buffer_size_gb * num_workers
        buffer_reload_rate (int, optional): Number of items to generate before loading next chunk of dataset. Default: 10000
        length (int, optional): Number of samples to load for each example. Default: 65536
        num_examples_per_epoch (int, optional): Define an epoch as certain number of audio examples. Default: 10000
        ext (str, optional): Expected audio file extension. Default: "wav"
        soft_clip (bool, optional): Soft clip outputs between -1 and 1. Default: True
    """

    def __init__(
        self,
        input_dir: str,
        processor: Processor,
        processor_type: str,
        subset="train",
        length=65536,
        buffer_size_gb=1.0,
        buffer_reload_rate=1000,
        half=False,
        num_examples_per_epoch=10000,
        ext="wav",
        soft_clip=True,
    ):
        super().__init__()
        self.input_dir = input_dir
        self.processor = processor
        self.processor_type = processor_type
        self.subset = subset
        self.length = length
        self.buffer_size_gb = buffer_size_gb
        self.buffer_reload_rate = buffer_reload_rate
        self.half = half
        self.num_examples_per_epoch = num_examples_per_epoch
        self.ext = ext
        self.soft_clip = soft_clip

        search_path = os.path.join(input_dir, f"*.{ext}")
        self.input_filepaths = glob.glob(search_path)
        self.input_filepaths = sorted(self.input_filepaths)

        if len(self.input_filepaths) < 1:
            raise RuntimeError(f"No files found in {input_dir}.")

        # get training split
        self.input_filepaths = utils.split_dataset(
            self.input_filepaths, self.subset, 0.9
        )

        # get details about audio files
        cnt = 0
        self.input_files = {}
        for input_filepath in tqdm(self.input_filepaths, ncols=80):
            file_id = os.path.basename(input_filepath)
            audio_file = AudioFile(
                input_filepath,
                preload=False,
                half=half,
            )
            if audio_file.num_frames < self.length:
                continue
            self.input_files[file_id] = audio_file
            self.sample_rate = self.input_files[file_id].sample_rate
            cnt += 1
            if cnt > 1000:
                break

        # some setup for iterable loading of the dataset into RAM
        self.items_since_load = self.buffer_reload_rate

    def __len__(self):
        return self.num_examples_per_epoch

    def load_audio_buffer(self):
        self.input_files_loaded = {}  # clear audio buffer
        self.items_since_load = 0  # reset iteration counter
        nbytes_loaded = 0  # counter for data in RAM

        # different subset in each
        random.shuffle(self.input_filepaths)

        # load files into RAM
        for input_filepath in self.input_filepaths:
            file_id = os.path.basename(input_filepath)
            audio_file = AudioFile(
                input_filepath,
                preload=True,
                half=self.half,
            )

            if audio_file.num_frames < self.length:
                continue

            self.input_files_loaded[file_id] = audio_file

            nbytes = audio_file.audio.element_size() * audio_file.audio.nelement()
            nbytes_loaded += nbytes

            if nbytes_loaded > self.buffer_size_gb * 1e9:
                break

    def __getitem__(self, _):
        """ """

        # increment counter
        self.items_since_load += 1

        # load next chunk into buffer if needed
        if self.items_since_load > self.buffer_reload_rate:
            self.load_audio_buffer()

        rand_input_file_id = utils.get_random_file_id(self.input_files_loaded.keys())
        # use this random key to retrieve an input file
        input_file = self.input_files_loaded[rand_input_file_id]

        # load the audio data if needed
        if not input_file.loaded:
            input_file.load()

        # get a random patch of size `self.length`
        # start_idx, stop_idx = utils.get_random_patch(input_file, self.sample_rate, self.length)
        start_idx, stop_idx = utils.get_random_patch(input_file, self.length)
        input_audio = input_file.audio[:, start_idx:stop_idx].clone().detach()

        # random scaling
        input_audio /= input_audio.abs().max()
        scale_dB = (torch.rand(1).squeeze().numpy() * 12) + 12
        input_audio *= 10 ** (-scale_dB / 20.0)

        # generate random parameters (uniform) over 0 to 1
        params = torch.rand(self.processor.num_control_params)

        # expects batch dim
        # apply plugins with random parameters
        if self.processor_type == "channel":
            params[-1] = 0.5  # set makeup gain to 0dB
            target_audio = self.processor(
                input_audio.view(1, 1, -1),
                params.view(1, -1),
            )
            target_audio = target_audio.view(1, -1)
        elif self.processor_type == "peq":
            target_audio = self.processor(
                input_audio.view(1, 1, -1).numpy(),
                params.view(1, -1).numpy(),
            )
            target_audio = torch.tensor(target_audio).view(1, -1)
        elif self.processor_type == "comp":
            params[-1] = 0.5  # set makeup gain to 0dB
            target_audio = self.processor(
                input_audio.view(1, 1, -1).numpy(),
                params.view(1, -1).numpy(),
            )
            target_audio = torch.tensor(target_audio).view(1, -1)

        # clip
        if self.soft_clip:
            # target_audio = target_audio.clamp(-2.0, 2.0)
            target_audio = torch.tanh(target_audio / 2.0) * 2.0

        return input_audio, target_audio, params
deepafx_st/data/style.py
ADDED
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import glob
|
3 |
+
import torch
|
4 |
+
import torchaudio
|
5 |
+
from tqdm import tqdm
|
6 |
+
|
7 |
+
|
8 |
+
class StyleDataset(torch.utils.data.Dataset):
|
9 |
+
def __init__(
|
10 |
+
self,
|
11 |
+
audio_dir: str,
|
12 |
+
subset: str = "train",
|
13 |
+
sample_rate: int = 24000,
|
14 |
+
length: int = 131072,
|
15 |
+
) -> None:
|
16 |
+
super().__init__()
|
17 |
+
self.audio_dir = audio_dir
|
18 |
+
self.subset = subset
|
19 |
+
self.sample_rate = sample_rate
|
20 |
+
self.length = length
|
21 |
+
|
22 |
+
self.style_dirs = glob.glob(os.path.join(audio_dir, subset, "*"))
|
23 |
+
self.style_dirs = [sd for sd in self.style_dirs if os.path.isdir(sd)]
|
24 |
+
self.num_classes = len(self.style_dirs)
|
25 |
+
self.class_labels = {"broadcast" : 0, "telephone": 1, "neutral": 2, "bright": 3, "warm": 4}
|
26 |
+
|
27 |
+
self.examples = []
|
28 |
+
for n, style_dir in enumerate(self.style_dirs):
|
29 |
+
|
30 |
+
# get all files in style dir
|
31 |
+
style_filepaths = glob.glob(os.path.join(style_dir, "*.wav"))
|
32 |
+
style_name = os.path.basename(style_dir)
|
33 |
+
for style_filepath in tqdm(style_filepaths, ncols=120):
|
34 |
+
# load audio file
|
35 |
+
x, sr = torchaudio.load(style_filepath)
|
36 |
+
|
37 |
+
# sum to mono if needed
|
38 |
+
if x.shape[0] > 1:
|
39 |
+
x = x.mean(dim=0, keepdim=True)
|
40 |
+
|
41 |
+
# resample
|
42 |
+
if sr != self.sample_rate:
|
43 |
+
x = torchaudio.transforms.Resample(sr, self.sample_rate)(x)
|
44 |
+
|
45 |
+
# crop length after resample
|
46 |
+
if x.shape[-1] >= self.length:
|
47 |
+
x = x[...,:self.length]
|
48 |
+
|
49 |
+
# store example
|
50 |
+
example = (x, self.class_labels[style_name])
|
51 |
+
self.examples.append(example)
|
52 |
+
|
53 |
+
print(f"Loaded {len(self.examples)} examples for {subset} subset.")
|
54 |
+
|
55 |
+
def __len__(self):
|
56 |
+
return len(self.examples)
|
57 |
+
|
58 |
+
def __getitem__(self, idx):
|
59 |
+
example = self.examples[idx]
|
60 |
+
x = example[0]
|
61 |
+
y = example[1]
|
62 |
+
return x, y
|
deepafx_st/metrics.py
ADDED
@@ -0,0 +1,157 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import auraloss
|
3 |
+
import resampy
|
4 |
+
import torchaudio
|
5 |
+
from pesq import pesq
|
6 |
+
import pyloudnorm as pyln
|
7 |
+
|
8 |
+
|
9 |
+
def crest_factor(x):
|
10 |
+
"""Compute the crest factor of waveform."""
|
11 |
+
|
12 |
+
peak, _ = x.abs().max(dim=-1)
|
13 |
+
rms = torch.sqrt((x ** 2).mean(dim=-1))
|
14 |
+
|
15 |
+
return 20 * torch.log(peak / rms.clamp(1e-8))
|
16 |
+
|
17 |
+
|
18 |
+
def rms_energy(x):
|
19 |
+
|
20 |
+
rms = torch.sqrt((x ** 2).mean(dim=-1))
|
21 |
+
|
22 |
+
return 20 * torch.log(rms.clamp(1e-8))
|
23 |
+
|
24 |
+
|
25 |
+
def spectral_centroid(x):
|
26 |
+
"""Compute the crest factor of waveform.
|
27 |
+
|
28 |
+
See: https://gist.github.com/endolith/359724
|
29 |
+
|
30 |
+
"""
|
31 |
+
|
32 |
+
spectrum = torch.fft.rfft(x).abs()
|
33 |
+
normalized_spectrum = spectrum / spectrum.sum()
|
34 |
+
normalized_frequencies = torch.linspace(0, 1, spectrum.shape[-1])
|
35 |
+
spectral_centroid = torch.sum(normalized_frequencies * normalized_spectrum)
|
36 |
+
|
37 |
+
return spectral_centroid
|
38 |
+
|
39 |
+
|
40 |
+
def loudness(x, sample_rate):
|
41 |
+
"""Compute the loudness in dB LUFS of waveform."""
|
42 |
+
meter = pyln.Meter(sample_rate)
|
43 |
+
|
44 |
+
# add stereo dim if needed
|
45 |
+
if x.shape[0] < 2:
|
46 |
+
x = x.repeat(2, 1)
|
47 |
+
|
48 |
+
return torch.tensor(meter.integrated_loudness(x.permute(1, 0).numpy()))
|
49 |
+
|
50 |
+
|
51 |
+
class MelSpectralDistance(torch.nn.Module):
|
52 |
+
def __init__(self, sample_rate, length=65536):
|
53 |
+
super().__init__()
|
54 |
+
self.error = auraloss.freq.MelSTFTLoss(
|
55 |
+
sample_rate,
|
56 |
+
fft_size=length,
|
57 |
+
hop_size=length,
|
58 |
+
win_length=length,
|
59 |
+
w_sc=0,
|
60 |
+
w_log_mag=1,
|
61 |
+
w_lin_mag=1,
|
62 |
+
n_mels=128,
|
63 |
+
scale_invariance=False,
|
64 |
+
)
|
65 |
+
|
66 |
+
# I think scale invariance may not work well,
|
67 |
+
# since aspects of the phase may be considered?
|
68 |
+
|
69 |
+
def forward(self, input, target):
|
70 |
+
return self.error(input, target)
|
71 |
+
|
72 |
+
|
73 |
+
class PESQ(torch.nn.Module):
|
74 |
+
def __init__(self, sample_rate):
|
75 |
+
super().__init__()
|
76 |
+
self.sample_rate = sample_rate
|
77 |
+
|
78 |
+
def forward(self, input, target):
|
79 |
+
if self.sample_rate != 16000:
|
80 |
+
target = resampy.resample(
|
81 |
+
target.view(-1).numpy(),
|
82 |
+
self.sample_rate,
|
83 |
+
16000,
|
84 |
+
)
|
85 |
+
input = resampy.resample(
|
86 |
+
input.view(-1).numpy(),
|
87 |
+
self.sample_rate,
|
88 |
+
16000,
|
89 |
+
)
|
90 |
+
|
91 |
+
return pesq(
|
92 |
+
16000,
|
93 |
+
target,
|
94 |
+
input,
|
95 |
+
"wb",
|
96 |
+
)
|
97 |
+
|
98 |
+
|
99 |
+
class CrestFactorError(torch.nn.Module):
|
100 |
+
def __init__(self):
|
101 |
+
super().__init__()
|
102 |
+
|
103 |
+
def forward(self, input, target):
|
104 |
+
return torch.nn.functional.l1_loss(
|
105 |
+
crest_factor(input),
|
106 |
+
crest_factor(target),
|
107 |
+
).item()
|
108 |
+
|
109 |
+
|
110 |
+
class RMSEnergyError(torch.nn.Module):
|
111 |
+
def __init__(self):
|
112 |
+
super().__init__()
|
113 |
+
|
114 |
+
def forward(self, input, target):
|
115 |
+
return torch.nn.functional.l1_loss(
|
116 |
+
rms_energy(input),
|
117 |
+
rms_energy(target),
|
118 |
+
).item()
|
119 |
+
|
120 |
+
|
121 |
+
class SpectralCentroidError(torch.nn.Module):
|
122 |
+
def __init__(self, sample_rate, n_fft=2048, hop_length=512):
|
123 |
+
super().__init__()
|
124 |
+
|
125 |
+
self.spectral_centroid = torchaudio.transforms.SpectralCentroid(
|
126 |
+
sample_rate,
|
127 |
+
n_fft=n_fft,
|
128 |
+
hop_length=hop_length,
|
129 |
+
)
|
130 |
+
|
131 |
+
def forward(self, input, target):
|
132 |
+
return torch.nn.functional.l1_loss(
|
133 |
+
self.spectral_centroid(input + 1e-16).mean(),
|
134 |
+
self.spectral_centroid(target + 1e-16).mean(),
|
135 |
+
).item()
|
136 |
+
|
137 |
+
|
138 |
+
class LoudnessError(torch.nn.Module):
|
139 |
+
def __init__(self, sample_rate: int, peak_normalize: bool = False):
|
140 |
+
super().__init__()
|
141 |
+
self.sample_rate = sample_rate
|
142 |
+
self.peak_normalize = peak_normalize
|
143 |
+
|
144 |
+
def forward(self, input, target):
|
145 |
+
|
146 |
+
if self.peak_normalize:
|
147 |
+
# peak normalize
|
148 |
+
x = input / input.abs().max()
|
149 |
+
y = target / target.abs().max()
|
150 |
+
else:
|
151 |
+
x = input
|
152 |
+
y = target
|
153 |
+
|
154 |
+
return torch.nn.functional.l1_loss(
|
155 |
+
loudness(x.view(1, -1), self.sample_rate),
|
156 |
+
loudness(y.view(1, -1), self.sample_rate),
|
157 |
+
).item()
|
deepafx_st/models/baselines.py
ADDED
@@ -0,0 +1,280 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import torchaudio
|
3 |
+
import scipy.signal
|
4 |
+
import numpy as np
|
5 |
+
import pyloudnorm as pyln
|
6 |
+
import matplotlib.pyplot as plt
|
7 |
+
from deepafx_st.processors.dsp.compressor import compressor
|
8 |
+
|
9 |
+
from tqdm import tqdm
|
10 |
+
|
11 |
+
|
12 |
+
class BaselineEQ(torch.nn.Module):
|
13 |
+
def __init__(
|
14 |
+
self,
|
15 |
+
ntaps: int = 63,
|
16 |
+
n_fft: int = 65536,
|
17 |
+
sample_rate: float = 44100,
|
18 |
+
):
|
19 |
+
super().__init__()
|
20 |
+
self.ntaps = ntaps
|
21 |
+
self.n_fft = n_fft
|
22 |
+
self.sample_rate = sample_rate
|
23 |
+
|
24 |
+
# compute the target spectrum
|
25 |
+
# print("Computing target spectrum...")
|
26 |
+
# self.target_spec, self.sm_target_spec = self.analyze_speech_dataset(filepaths)
|
27 |
+
# self.plot_spectrum(self.target_spec, filename="targetEQ")
|
28 |
+
# self.plot_spectrum(self.sm_target_spec, filename="targetEQsm")
|
29 |
+
|
30 |
+
def forward(self, x, y):
|
31 |
+
|
32 |
+
bs, ch, s = x.size()
|
33 |
+
|
34 |
+
x = x.view(bs * ch, -1)
|
35 |
+
y = y.view(bs * ch, -1)
|
36 |
+
|
37 |
+
in_spec = self.get_average_spectrum(x)
|
38 |
+
ref_spec = self.get_average_spectrum(y)
|
39 |
+
|
40 |
+
sm_in_spec = self.smooth_spectrum(in_spec)
|
41 |
+
sm_ref_spec = self.smooth_spectrum(ref_spec)
|
42 |
+
|
43 |
+
# self.plot_spectrum(in_spec, filename="inSpec")
|
44 |
+
# self.plot_spectrum(sm_in_spec, filename="inSpecsm")
|
45 |
+
|
46 |
+
# design inverse FIR filter to match target EQ
|
47 |
+
freqs = np.linspace(0, 1.0, num=(self.n_fft // 2) + 1)
|
48 |
+
response = sm_ref_spec / sm_in_spec
|
49 |
+
response[-1] = 0.0 # zero gain at nyquist
|
50 |
+
|
51 |
+
b = scipy.signal.firwin2(
|
52 |
+
self.ntaps,
|
53 |
+
freqs * (self.sample_rate / 2),
|
54 |
+
response,
|
55 |
+
fs=self.sample_rate,
|
56 |
+
)
|
57 |
+
|
58 |
+
# scale the coefficients for less intense filter
|
59 |
+
# clearb *= 0.5
|
60 |
+
|
61 |
+
# apply the filter
|
62 |
+
x_filt = scipy.signal.lfilter(b, [1.0], x.numpy())
|
63 |
+
x_filt = torch.tensor(x_filt.astype("float32"))
|
64 |
+
|
65 |
+
if False:
|
66 |
+
# plot the filter response
|
67 |
+
w, h = scipy.signal.freqz(b, fs=self.sample_rate, worN=response.shape[-1])
|
68 |
+
|
69 |
+
fig, ax1 = plt.subplots()
|
70 |
+
ax1.set_title("Digital filter frequency response")
|
71 |
+
ax1.plot(w, 20 * np.log10(abs(h + 1e-8)))
|
72 |
+
ax1.plot(w, 20 * np.log10(abs(response + 1e-8)))
|
73 |
+
|
74 |
+
ax1.set_xscale("log")
|
75 |
+
ax1.set_ylim([-12, 12])
|
76 |
+
plt.grid(c="lightgray")
|
77 |
+
plt.savefig(f"inverse.png")
|
78 |
+
|
79 |
+
x_filt_avg_spec = self.get_average_spectrum(x_filt)
|
80 |
+
sm_x_filt_avg_spec = self.smooth_spectrum(x_filt_avg_spec)
|
81 |
+
y_avg_spec = self.get_average_spectrum(y)
|
82 |
+
sm_y_avg_spec = self.smooth_spectrum(y_avg_spec)
|
83 |
+
compare = torch.stack(
|
84 |
+
[
|
85 |
+
torch.tensor(sm_in_spec),
|
86 |
+
torch.tensor(sm_x_filt_avg_spec),
|
87 |
+
torch.tensor(sm_ref_spec),
|
88 |
+
torch.tensor(sm_y_avg_spec),
|
89 |
+
]
|
90 |
+
)
|
91 |
+
self.plot_multi_spectrum(
|
92 |
+
compare,
|
93 |
+
legend=["in", "out", "target curve", "actual target"],
|
94 |
+
filename="outSpec",
|
95 |
+
)
|
96 |
+
|
97 |
+
return x_filt
|
98 |
+
|
99 |
+
def analyze_speech_dataset(self, filepaths, peak=-3.0):
|
100 |
+
avg_spec = []
|
101 |
+
for filepath in tqdm(filepaths, ncols=80):
|
102 |
+
x, sr = torchaudio.load(filepath)
|
103 |
+
x /= x.abs().max()
|
104 |
+
x *= 10 ** (peak / 20.0)
|
105 |
+
avg_spec.append(self.get_average_spectrum(x))
|
106 |
+
avg_specs = torch.stack(avg_spec)
|
107 |
+
|
108 |
+
avg_spec = avg_specs.mean(dim=0).numpy()
|
109 |
+
avg_spec_std = avg_specs.std(dim=0).numpy()
|
110 |
+
|
111 |
+
# self.plot_multi_spectrum(avg_specs, filename="allTargetEQs")
|
112 |
+
# self.plot_spectrum_stats(avg_spec, avg_spec_std, filename="targetEQstats")
|
113 |
+
|
114 |
+
sm_avg_spec = self.smooth_spectrum(avg_spec)
|
115 |
+
|
116 |
+
return avg_spec, sm_avg_spec
|
117 |
+
|
118 |
+
def smooth_spectrum(self, H):
|
119 |
+
# apply Savgol filter for smoothed target curve
|
120 |
+
return scipy.signal.savgol_filter(H, 1025, 2)
|
121 |
+
|
122 |
+
def get_average_spectrum(self, x):
|
123 |
+
|
124 |
+
# x = x[:, : self.n_fft]
|
125 |
+
X = torch.stft(x, self.n_fft, return_complex=True, normalized=True)
|
126 |
+
# fft_size = self.next_power_of_2(x.shape[-1])
|
127 |
+
# X = torch.fft.rfft(x, n=fft_size)
|
128 |
+
|
129 |
+
X = X.abs() # convert to magnitude
|
130 |
+
X = X.mean(dim=-1).view(-1) # average across frames
|
131 |
+
|
132 |
+
return X
|
133 |
+
|
134 |
+
@staticmethod
|
135 |
+
def next_power_of_2(x):
|
136 |
+
return 1 if x == 0 else int(2 ** np.ceil(np.log2(x)))
|
137 |
+
|
138 |
+
def plot_multi_spectrum(self, Hs, legend=[], filename=None):
|
139 |
+
|
140 |
+
bin_width = (self.sample_rate / 2) / (self.n_fft // 2)
|
141 |
+
freqs = np.arange(0, (self.sample_rate / 2) + bin_width, step=bin_width)
|
142 |
+
|
143 |
+
fig, ax1 = plt.subplots()
|
144 |
+
|
145 |
+
for H in Hs:
|
146 |
+
ax1.plot(
|
147 |
+
freqs,
|
148 |
+
20 * np.log10(abs(H) + 1e-8),
|
149 |
+
)
|
150 |
+
|
151 |
+
plt.legend(legend)
|
152 |
+
|
153 |
+
# avg_spec = Hs.mean(dim=0).numpy()
|
154 |
+
# ax1.plot(freqs, 20 * np.log10(avg_spec), color="k", linewidth=2)
|
155 |
+
|
156 |
+
ax1.set_xscale("log")
|
157 |
+
ax1.set_ylim([-80, 0])
|
158 |
+
plt.grid(c="lightgray")
|
159 |
+
|
160 |
+
if filename is not None:
|
161 |
+
plt.savefig(f"{filename}.png")
|
162 |
+
|
163 |
+
def plot_spectrum_stats(self, H_mean, H_std, filename=None):
|
164 |
+
bin_width = (self.sample_rate / 2) / (self.n_fft // 2)
|
165 |
+
freqs = np.arange(0, (self.sample_rate / 2) + bin_width, step=bin_width)
|
166 |
+
|
167 |
+
fig, ax1 = plt.subplots()
|
168 |
+
ax1.plot(freqs, 20 * np.log10(H_mean))
|
169 |
+
ax1.plot(
|
170 |
+
freqs,
|
171 |
+
(20 * np.log10(H_mean)) + (20 * np.log10(H_std)),
|
172 |
+
linestyle="--",
|
173 |
+
color="k",
|
174 |
+
)
|
175 |
+
ax1.plot(
|
176 |
+
freqs,
|
177 |
+
(20 * np.log10(H_mean)) - (20 * np.log10(H_std)),
|
178 |
+
linestyle="--",
|
179 |
+
color="k",
|
180 |
+
)
|
181 |
+
|
182 |
+
ax1.set_xscale("log")
|
183 |
+
ax1.set_ylim([-80, 0])
|
184 |
+
plt.grid(c="lightgray")
|
185 |
+
|
186 |
+
if filename is not None:
|
187 |
+
plt.savefig(f"{filename}.png")
|
188 |
+
|
189 |
+
def plot_spectrum(self, H, legend=[], filename=None):
|
190 |
+
|
191 |
+
bin_width = (self.sample_rate / 2) / (self.n_fft // 2)
|
192 |
+
freqs = np.arange(0, (self.sample_rate / 2) + bin_width, step=bin_width)
|
193 |
+
|
194 |
+
fig, ax1 = plt.subplots()
|
195 |
+
ax1.plot(freqs, 20 * np.log10(H))
|
196 |
+
ax1.set_xscale("log")
|
197 |
+
ax1.set_ylim([-80, 0])
|
198 |
+
plt.grid(c="lightgray")
|
199 |
+
|
200 |
+
plt.legend(legend)
|
201 |
+
|
202 |
+
if filename is not None:
|
203 |
+
plt.savefig(f"{filename}.png")
|
204 |
+
|
205 |
+
|
206 |
+
class BaslineComp(torch.nn.Module):
|
207 |
+
def __init__(
|
208 |
+
self,
|
209 |
+
sample_rate: float = 44100,
|
210 |
+
):
|
211 |
+
super().__init__()
|
212 |
+
self.sample_rate = sample_rate
|
213 |
+
self.meter = pyln.Meter(sample_rate)
|
214 |
+
|
215 |
+
def forward(self, x, y):
|
216 |
+
|
217 |
+
x_lufs = self.meter.integrated_loudness(x.view(-1).numpy())
|
218 |
+
y_lufs = self.meter.integrated_loudness(y.view(-1).numpy())
|
219 |
+
|
220 |
+
delta_lufs = y_lufs - x_lufs
|
221 |
+
|
222 |
+
threshold = 0.0
|
223 |
+
x_comp = x
|
224 |
+
x_comp_new = x
|
225 |
+
while delta_lufs > 0.5 and threshold > -80.0:
|
226 |
+
x_comp = x_comp_new # use the last setting
|
227 |
+
x_comp_new = compressor(
|
228 |
+
x.view(-1).numpy(),
|
229 |
+
self.sample_rate,
|
230 |
+
threshold=threshold,
|
231 |
+
ratio=3,
|
232 |
+
attack_time=0.001,
|
233 |
+
release_time=0.05,
|
234 |
+
knee_dB=6.0,
|
235 |
+
makeup_gain_dB=0.0,
|
236 |
+
)
|
237 |
+
x_comp_new = torch.tensor(x_comp_new)
|
238 |
+
x_comp_new /= x_comp_new.abs().max()
|
239 |
+
x_comp_new *= 10 ** (-12.0 / 20)
|
240 |
+
x_lufs = self.meter.integrated_loudness(x_comp_new.view(-1).numpy())
|
241 |
+
delta_lufs = y_lufs - x_lufs
|
242 |
+
threshold -= 0.5
|
243 |
+
|
244 |
+
return x_comp.view(1, 1, -1)
|
245 |
+
|
246 |
+
|
247 |
+
class BaselineEQAndComp(torch.nn.Module):
|
248 |
+
def __init__(
|
249 |
+
self,
|
250 |
+
ntaps=63,
|
251 |
+
n_fft=65536,
|
252 |
+
sample_rate=44100,
|
253 |
+
block_size=1024,
|
254 |
+
plugin_config=None,
|
255 |
+
):
|
256 |
+
super().__init__()
|
257 |
+
self.eq = BaselineEQ(ntaps, n_fft, sample_rate)
|
258 |
+
self.comp = BaslineComp(sample_rate)
|
259 |
+
|
260 |
+
def forward(self, x, y):
|
261 |
+
|
262 |
+
with torch.inference_mode():
|
263 |
+
x /= x.abs().max()
|
264 |
+
y /= y.abs().max()
|
265 |
+
x *= 10 ** (-12.0 / 20)
|
266 |
+
y *= 10 ** (-12.0 / 20)
|
267 |
+
|
268 |
+
x = self.eq(x, y)
|
269 |
+
|
270 |
+
x /= x.abs().max()
|
271 |
+
y /= y.abs().max()
|
272 |
+
x *= 10 ** (-12.0 / 20)
|
273 |
+
y *= 10 ** (-12.0 / 20)
|
274 |
+
|
275 |
+
x = self.comp(x, y)
|
276 |
+
|
277 |
+
x /= x.abs().max()
|
278 |
+
x *= 10 ** (-12.0 / 20)
|
279 |
+
|
280 |
+
return x
|
deepafx_st/models/controller.py
ADDED
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
|
3 |
+
class StyleTransferController(torch.nn.Module):
|
4 |
+
def __init__(
|
5 |
+
self,
|
6 |
+
num_control_params,
|
7 |
+
edim,
|
8 |
+
hidden_dim=256,
|
9 |
+
agg_method="mlp",
|
10 |
+
):
|
11 |
+
"""Plugin parameter controller module to map from input to target style.
|
12 |
+
|
13 |
+
Args:
|
14 |
+
num_control_params (int): Number of plugin parameters to predicted.
|
15 |
+
edim (int): Size of the encoder representations.
|
16 |
+
hidden_dim (int, optional): Hidden size of the 3-layer parameter predictor MLP. Default: 256
|
17 |
+
agg_method (str, optional): Input/reference embed aggregation method ["conv" or "linear", "mlp"]. Default: "mlp"
|
18 |
+
"""
|
19 |
+
super().__init__()
|
20 |
+
self.num_control_params = num_control_params
|
21 |
+
self.edim = edim
|
22 |
+
self.hidden_dim = hidden_dim
|
23 |
+
self.agg_method = agg_method
|
24 |
+
|
25 |
+
if agg_method == "conv":
|
26 |
+
self.agg = torch.nn.Conv1d(
|
27 |
+
2,
|
28 |
+
1,
|
29 |
+
kernel_size=129,
|
30 |
+
stride=1,
|
31 |
+
padding="same",
|
32 |
+
bias=False,
|
33 |
+
)
|
34 |
+
mlp_in_dim = edim
|
35 |
+
elif agg_method == "linear":
|
36 |
+
self.agg = torch.nn.Linear(edim * 2, edim)
|
37 |
+
elif agg_method == "mlp":
|
38 |
+
self.agg = None
|
39 |
+
mlp_in_dim = edim * 2
|
40 |
+
else:
|
41 |
+
raise ValueError(f"Invalid agg_method = {self.agg_method}.")
|
42 |
+
|
43 |
+
self.mlp = torch.nn.Sequential(
|
44 |
+
torch.nn.Linear(mlp_in_dim, hidden_dim),
|
45 |
+
torch.nn.LeakyReLU(0.01),
|
46 |
+
torch.nn.Linear(hidden_dim, hidden_dim),
|
47 |
+
torch.nn.LeakyReLU(0.01),
|
48 |
+
torch.nn.Linear(hidden_dim, num_control_params),
|
49 |
+
torch.nn.Sigmoid(), # normalize between 0 and 1
|
50 |
+
)
|
51 |
+
|
52 |
+
def forward(self, e_x, e_y, z=None):
|
53 |
+
"""Forward pass to generate plugin parameters.
|
54 |
+
|
55 |
+
Args:
|
56 |
+
e_x (tensor): Input signal embedding of shape (batch, edim)
|
57 |
+
e_y (tensor): Target signal embedding of shape (batch, edim)
|
58 |
+
Returns:
|
59 |
+
p (tensor): Estimated control parameters of shape (batch, num_control_params)
|
60 |
+
"""
|
61 |
+
|
62 |
+
# use learnable projection
|
63 |
+
if self.agg_method == "conv":
|
64 |
+
e_xy = torch.stack((e_x, e_y), dim=1) # concat on channel dim
|
65 |
+
e_xy = self.agg(e_xy)
|
66 |
+
elif self.agg_method == "linear":
|
67 |
+
e_xy = torch.cat((e_x, e_y), dim=-1) # concat on embed dim
|
68 |
+
e_xy = self.agg(e_xy)
|
69 |
+
else:
|
70 |
+
e_xy = torch.cat((e_x, e_y), dim=-1) # concat on embed dim
|
71 |
+
|
72 |
+
# pass through MLP to project to control parametesr
|
73 |
+
p = self.mlp(e_xy.squeeze(1))
|
74 |
+
|
75 |
+
return p
|
deepafx_st/models/efficient_net/LICENSE
ADDED
@@ -0,0 +1,202 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
Apache License
|
3 |
+
Version 2.0, January 2004
|
4 |
+
http://www.apache.org/licenses/
|
5 |
+
|
6 |
+
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
7 |
+
|
8 |
+
1. Definitions.
|
9 |
+
|
10 |
+
"License" shall mean the terms and conditions for use, reproduction,
|
11 |
+
and distribution as defined by Sections 1 through 9 of this document.
|
12 |
+
|
13 |
+
"Licensor" shall mean the copyright owner or entity authorized by
|
14 |
+
the copyright owner that is granting the License.
|
15 |
+
|
16 |
+
"Legal Entity" shall mean the union of the acting entity and all
|
17 |
+
other entities that control, are controlled by, or are under common
|
18 |
+
control with that entity. For the purposes of this definition,
|
19 |
+
"control" means (i) the power, direct or indirect, to cause the
|
20 |
+
direction or management of such entity, whether by contract or
|
21 |
+
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
22 |
+
outstanding shares, or (iii) beneficial ownership of such entity.
|
23 |
+
|
24 |
+
"You" (or "Your") shall mean an individual or Legal Entity
|
25 |
+
exercising permissions granted by this License.
|
26 |
+
|
27 |
+
"Source" form shall mean the preferred form for making modifications,
|
28 |
+
including but not limited to software source code, documentation
|
29 |
+
source, and configuration files.
|
30 |
+
|
31 |
+
"Object" form shall mean any form resulting from mechanical
|
32 |
+
transformation or translation of a Source form, including but
|
33 |
+
not limited to compiled object code, generated documentation,
|
34 |
+
and conversions to other media types.
|
35 |
+
|
36 |
+
"Work" shall mean the work of authorship, whether in Source or
|
37 |
+
Object form, made available under the License, as indicated by a
|
38 |
+
copyright notice that is included in or attached to the work
|
39 |
+
(an example is provided in the Appendix below).
|
40 |
+
|
41 |
+
"Derivative Works" shall mean any work, whether in Source or Object
|
42 |
+
form, that is based on (or derived from) the Work and for which the
|
43 |
+
editorial revisions, annotations, elaborations, or other modifications
|
44 |
+
represent, as a whole, an original work of authorship. For the purposes
|
45 |
+
of this License, Derivative Works shall not include works that remain
|
46 |
+
separable from, or merely link (or bind by name) to the interfaces of,
|
47 |
+
the Work and Derivative Works thereof.
|
48 |
+
|
49 |
+
"Contribution" shall mean any work of authorship, including
|
50 |
+
the original version of the Work and any modifications or additions
|
51 |
+
to that Work or Derivative Works thereof, that is intentionally
|
52 |
+
submitted to Licensor for inclusion in the Work by the copyright owner
|
53 |
+
or by an individual or Legal Entity authorized to submit on behalf of
|
54 |
+
the copyright owner. For the purposes of this definition, "submitted"
|
55 |
+
means any form of electronic, verbal, or written communication sent
|
56 |
+
to the Licensor or its representatives, including but not limited to
|
57 |
+
communication on electronic mailing lists, source code control systems,
|
58 |
+
and issue tracking systems that are managed by, or on behalf of, the
|
59 |
+
Licensor for the purpose of discussing and improving the Work, but
|
60 |
+
excluding communication that is conspicuously marked or otherwise
|
61 |
+
designated in writing by the copyright owner as "Not a Contribution."
|
62 |
+
|
63 |
+
"Contributor" shall mean Licensor and any individual or Legal Entity
|
64 |
+
on behalf of whom a Contribution has been received by Licensor and
|
65 |
+
subsequently incorporated within the Work.
|
66 |
+
|
67 |
+
2. Grant of Copyright License. Subject to the terms and conditions of
|
68 |
+
this License, each Contributor hereby grants to You a perpetual,
|
69 |
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
70 |
+
copyright license to reproduce, prepare Derivative Works of,
|
71 |
+
publicly display, publicly perform, sublicense, and distribute the
|
72 |
+
Work and such Derivative Works in Source or Object form.
|
73 |
+
|
74 |
+
3. Grant of Patent License. Subject to the terms and conditions of
|
75 |
+
this License, each Contributor hereby grants to You a perpetual,
|
76 |
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
77 |
+
(except as stated in this section) patent license to make, have made,
|
78 |
+
use, offer to sell, sell, import, and otherwise transfer the Work,
|
79 |
+
where such license applies only to those patent claims licensable
|
80 |
+
by such Contributor that are necessarily infringed by their
|
81 |
+
Contribution(s) alone or by combination of their Contribution(s)
|
82 |
+
with the Work to which such Contribution(s) was submitted. If You
|
83 |
+
institute patent litigation against any entity (including a
|
84 |
+
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
85 |
+
or a Contribution incorporated within the Work constitutes direct
|
86 |
+
or contributory patent infringement, then any patent licenses
|
87 |
+
granted to You under this License for that Work shall terminate
|
88 |
+
as of the date such litigation is filed.
|
89 |
+
|
90 |
+
4. Redistribution. You may reproduce and distribute copies of the
|
91 |
+
Work or Derivative Works thereof in any medium, with or without
|
92 |
+
modifications, and in Source or Object form, provided that You
|
93 |
+
meet the following conditions:
|
94 |
+
|
95 |
+
(a) You must give any other recipients of the Work or
|
96 |
+
Derivative Works a copy of this License; and
|
97 |
+
|
98 |
+
(b) You must cause any modified files to carry prominent notices
|
99 |
+
stating that You changed the files; and
|
100 |
+
|
101 |
+
(c) You must retain, in the Source form of any Derivative Works
|
102 |
+
that You distribute, all copyright, patent, trademark, and
|
103 |
+
attribution notices from the Source form of the Work,
|
104 |
+
excluding those notices that do not pertain to any part of
|
105 |
+
the Derivative Works; and
|
106 |
+
|
107 |
+
(d) If the Work includes a "NOTICE" text file as part of its
|
108 |
+
distribution, then any Derivative Works that You distribute must
|
109 |
+
include a readable copy of the attribution notices contained
|
110 |
+
within such NOTICE file, excluding those notices that do not
|
111 |
+
pertain to any part of the Derivative Works, in at least one
|
112 |
+
of the following places: within a NOTICE text file distributed
|
113 |
+
as part of the Derivative Works; within the Source form or
|
114 |
+
documentation, if provided along with the Derivative Works; or,
|
115 |
+
within a display generated by the Derivative Works, if and
|
116 |
+
wherever such third-party notices normally appear. The contents
|
117 |
+
of the NOTICE file are for informational purposes only and
|
118 |
+
do not modify the License. You may add Your own attribution
|
119 |
+
notices within Derivative Works that You distribute, alongside
|
120 |
+
or as an addendum to the NOTICE text from the Work, provided
|
121 |
+
that such additional attribution notices cannot be construed
|
122 |
+
as modifying the License.
|
123 |
+
|
124 |
+
You may add Your own copyright statement to Your modifications and
|
125 |
+
may provide additional or different license terms and conditions
|
126 |
+
for use, reproduction, or distribution of Your modifications, or
|
127 |
+
for any such Derivative Works as a whole, provided Your use,
|
128 |
+
reproduction, and distribution of the Work otherwise complies with
|
129 |
+
the conditions stated in this License.
|
130 |
+
|
131 |
+
5. Submission of Contributions. Unless You explicitly state otherwise,
|
132 |
+
any Contribution intentionally submitted for inclusion in the Work
|
133 |
+
by You to the Licensor shall be under the terms and conditions of
|
134 |
+
this License, without any additional terms or conditions.
|
135 |
+
Notwithstanding the above, nothing herein shall supersede or modify
|
136 |
+
the terms of any separate license agreement you may have executed
|
137 |
+
with Licensor regarding such Contributions.
|
138 |
+
|
139 |
+
6. Trademarks. This License does not grant permission to use the trade
|
140 |
+
names, trademarks, service marks, or product names of the Licensor,
|
141 |
+
except as required for reasonable and customary use in describing the
|
142 |
+
origin of the Work and reproducing the content of the NOTICE file.
|
143 |
+
|
144 |
+
7. Disclaimer of Warranty. Unless required by applicable law or
|
145 |
+
agreed to in writing, Licensor provides the Work (and each
|
146 |
+
Contributor provides its Contributions) on an "AS IS" BASIS,
|
147 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
148 |
+
implied, including, without limitation, any warranties or conditions
|
149 |
+
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
150 |
+
PARTICULAR PURPOSE. You are solely responsible for determining the
|
151 |
+
appropriateness of using or redistributing the Work and assume any
|
152 |
+
risks associated with Your exercise of permissions under this License.
|
153 |
+
|
154 |
+
8. Limitation of Liability. In no event and under no legal theory,
|
155 |
+
whether in tort (including negligence), contract, or otherwise,
|
156 |
+
unless required by applicable law (such as deliberate and grossly
|
157 |
+
negligent acts) or agreed to in writing, shall any Contributor be
|
158 |
+
liable to You for damages, including any direct, indirect, special,
|
159 |
+
incidental, or consequential damages of any character arising as a
|
160 |
+
result of this License or out of the use or inability to use the
|
161 |
+
Work (including but not limited to damages for loss of goodwill,
|
162 |
+
work stoppage, computer failure or malfunction, or any and all
|
163 |
+
other commercial damages or losses), even if such Contributor
|
164 |
+
has been advised of the possibility of such damages.
|
165 |
+
|
166 |
+
9. Accepting Warranty or Additional Liability. While redistributing
|
167 |
+
the Work or Derivative Works thereof, You may choose to offer,
|
168 |
+
and charge a fee for, acceptance of support, warranty, indemnity,
|
169 |
+
or other liability obligations and/or rights consistent with this
|
170 |
+
License. However, in accepting such obligations, You may act only
|
171 |
+
on Your own behalf and on Your sole responsibility, not on behalf
|
172 |
+
of any other Contributor, and only if You agree to indemnify,
|
173 |
+
defend, and hold each Contributor harmless for any liability
|
174 |
+
incurred by, or claims asserted against, such Contributor by reason
|
175 |
+
of your accepting any such warranty or additional liability.
|
176 |
+
|
177 |
+
END OF TERMS AND CONDITIONS
|
178 |
+
|
179 |
+
APPENDIX: How to apply the Apache License to your work.
|
180 |
+
|
181 |
+
To apply the Apache License to your work, attach the following
|
182 |
+
boilerplate notice, with the fields enclosed by brackets "[]"
|
183 |
+
replaced with your own identifying information. (Don't include
|
184 |
+
the brackets!) The text should be enclosed in the appropriate
|
185 |
+
comment syntax for the file format. We also recommend that a
|
186 |
+
file or class name and description of purpose be included on the
|
187 |
+
same "printed page" as the copyright notice for easier
|
188 |
+
identification within third-party archives.
|
189 |
+
|
190 |
+
Copyright [yyyy] [name of copyright owner]
|
191 |
+
|
192 |
+
Licensed under the Apache License, Version 2.0 (the "License");
|
193 |
+
you may not use this file except in compliance with the License.
|
194 |
+
You may obtain a copy of the License at
|
195 |
+
|
196 |
+
http://www.apache.org/licenses/LICENSE-2.0
|
197 |
+
|
198 |
+
Unless required by applicable law or agreed to in writing, software
|
199 |
+
distributed under the License is distributed on an "AS IS" BASIS,
|
200 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
201 |
+
See the License for the specific language governing permissions and
|
202 |
+
limitations under the License.
|
deepafx_st/models/efficient_net/__init__.py
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
__version__ = "0.7.1"
|
2 |
+
from .model import EfficientNet, VALID_MODELS
|
3 |
+
from .utils import (
|
4 |
+
GlobalParams,
|
5 |
+
BlockArgs,
|
6 |
+
BlockDecoder,
|
7 |
+
efficientnet,
|
8 |
+
get_model_params,
|
9 |
+
)
|
deepafx_st/models/efficient_net/model.py
ADDED
@@ -0,0 +1,419 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""model.py - Model and module class for EfficientNet.
|
2 |
+
They are built to mirror those in the official TensorFlow implementation.
|
3 |
+
"""
|
4 |
+
|
5 |
+
# Author: lukemelas (github username)
|
6 |
+
# Github repo: https://github.com/lukemelas/EfficientNet-PyTorch
|
7 |
+
# With adjustments and added comments by workingcoder (github username).
|
8 |
+
|
9 |
+
import torch
|
10 |
+
from torch import nn
|
11 |
+
from torch.nn import functional as F
|
12 |
+
from .utils import (
|
13 |
+
round_filters,
|
14 |
+
round_repeats,
|
15 |
+
drop_connect,
|
16 |
+
get_same_padding_conv2d,
|
17 |
+
get_model_params,
|
18 |
+
efficientnet_params,
|
19 |
+
load_pretrained_weights,
|
20 |
+
Swish,
|
21 |
+
MemoryEfficientSwish,
|
22 |
+
calculate_output_image_size
|
23 |
+
)
|
24 |
+
|
25 |
+
|
26 |
+
VALID_MODELS = (
|
27 |
+
'efficientnet-b0', 'efficientnet-b1', 'efficientnet-b2', 'efficientnet-b3',
|
28 |
+
'efficientnet-b4', 'efficientnet-b5', 'efficientnet-b6', 'efficientnet-b7',
|
29 |
+
'efficientnet-b8',
|
30 |
+
|
31 |
+
# Support the construction of 'efficientnet-l2' without pretrained weights
|
32 |
+
'efficientnet-l2'
|
33 |
+
)
|
34 |
+
|
35 |
+
|
36 |
+
class MBConvBlock(nn.Module):
|
37 |
+
"""Mobile Inverted Residual Bottleneck Block.
|
38 |
+
|
39 |
+
Args:
|
40 |
+
block_args (namedtuple): BlockArgs, defined in utils.py.
|
41 |
+
global_params (namedtuple): GlobalParam, defined in utils.py.
|
42 |
+
image_size (tuple or list): [image_height, image_width].
|
43 |
+
|
44 |
+
References:
|
45 |
+
[1] https://arxiv.org/abs/1704.04861 (MobileNet v1)
|
46 |
+
[2] https://arxiv.org/abs/1801.04381 (MobileNet v2)
|
47 |
+
[3] https://arxiv.org/abs/1905.02244 (MobileNet v3)
|
48 |
+
"""
|
49 |
+
|
50 |
+
def __init__(self, block_args, global_params, image_size=None):
|
51 |
+
super().__init__()
|
52 |
+
self._block_args = block_args
|
53 |
+
self._bn_mom = 1 - global_params.batch_norm_momentum # pytorch's difference from tensorflow
|
54 |
+
self._bn_eps = global_params.batch_norm_epsilon
|
55 |
+
self.has_se = (self._block_args.se_ratio is not None) and (0 < self._block_args.se_ratio <= 1)
|
56 |
+
self.id_skip = block_args.id_skip # whether to use skip connection and drop connect
|
57 |
+
|
58 |
+
# Expansion phase (Inverted Bottleneck)
|
59 |
+
inp = self._block_args.input_filters # number of input channels
|
60 |
+
oup = self._block_args.input_filters * self._block_args.expand_ratio # number of output channels
|
61 |
+
if self._block_args.expand_ratio != 1:
|
62 |
+
Conv2d = get_same_padding_conv2d(image_size=image_size)
|
63 |
+
self._expand_conv = Conv2d(in_channels=inp, out_channels=oup, kernel_size=1, bias=False)
|
64 |
+
self._bn0 = nn.BatchNorm2d(num_features=oup, momentum=self._bn_mom, eps=self._bn_eps)
|
65 |
+
# image_size = calculate_output_image_size(image_size, 1) <-- this wouldn't modify image_size
|
66 |
+
|
67 |
+
# Depthwise convolution phase
|
68 |
+
k = self._block_args.kernel_size
|
69 |
+
s = self._block_args.stride
|
70 |
+
Conv2d = get_same_padding_conv2d(image_size=image_size)
|
71 |
+
self._depthwise_conv = Conv2d(
|
72 |
+
in_channels=oup, out_channels=oup, groups=oup, # groups makes it depthwise
|
73 |
+
kernel_size=k, stride=s, bias=False)
|
74 |
+
self._bn1 = nn.BatchNorm2d(num_features=oup, momentum=self._bn_mom, eps=self._bn_eps)
|
75 |
+
image_size = calculate_output_image_size(image_size, s)
|
76 |
+
|
77 |
+
# Squeeze and Excitation layer, if desired
|
78 |
+
if self.has_se:
|
79 |
+
Conv2d = get_same_padding_conv2d(image_size=(1, 1))
|
80 |
+
num_squeezed_channels = max(1, int(self._block_args.input_filters * self._block_args.se_ratio))
|
81 |
+
self._se_reduce = Conv2d(in_channels=oup, out_channels=num_squeezed_channels, kernel_size=1)
|
82 |
+
self._se_expand = Conv2d(in_channels=num_squeezed_channels, out_channels=oup, kernel_size=1)
|
83 |
+
|
84 |
+
# Pointwise convolution phase
|
85 |
+
final_oup = self._block_args.output_filters
|
86 |
+
Conv2d = get_same_padding_conv2d(image_size=image_size)
|
87 |
+
self._project_conv = Conv2d(in_channels=oup, out_channels=final_oup, kernel_size=1, bias=False)
|
88 |
+
self._bn2 = nn.BatchNorm2d(num_features=final_oup, momentum=self._bn_mom, eps=self._bn_eps)
|
89 |
+
self._swish = MemoryEfficientSwish()
|
90 |
+
|
91 |
+
def forward(self, inputs, drop_connect_rate=None):
|
92 |
+
"""MBConvBlock's forward function.
|
93 |
+
|
94 |
+
Args:
|
95 |
+
inputs (tensor): Input tensor.
|
96 |
+
drop_connect_rate (bool): Drop connect rate (float, between 0 and 1).
|
97 |
+
|
98 |
+
Returns:
|
99 |
+
Output of this block after processing.
|
100 |
+
"""
|
101 |
+
|
102 |
+
# Expansion and Depthwise Convolution
|
103 |
+
x = inputs
|
104 |
+
if self._block_args.expand_ratio != 1:
|
105 |
+
x = self._expand_conv(inputs)
|
106 |
+
x = self._bn0(x)
|
107 |
+
x = self._swish(x)
|
108 |
+
|
109 |
+
x = self._depthwise_conv(x)
|
110 |
+
x = self._bn1(x)
|
111 |
+
x = self._swish(x)
|
112 |
+
|
113 |
+
# Squeeze and Excitation
|
114 |
+
if self.has_se:
|
115 |
+
x_squeezed = F.adaptive_avg_pool2d(x, 1)
|
116 |
+
x_squeezed = self._se_reduce(x_squeezed)
|
117 |
+
x_squeezed = self._swish(x_squeezed)
|
118 |
+
x_squeezed = self._se_expand(x_squeezed)
|
119 |
+
x = torch.sigmoid(x_squeezed) * x
|
120 |
+
|
121 |
+
# Pointwise Convolution
|
122 |
+
x = self._project_conv(x)
|
123 |
+
x = self._bn2(x)
|
124 |
+
|
125 |
+
# Skip connection and drop connect
|
126 |
+
input_filters, output_filters = self._block_args.input_filters, self._block_args.output_filters
|
127 |
+
if self.id_skip and self._block_args.stride == 1 and input_filters == output_filters:
|
128 |
+
# The combination of skip connection and drop connect brings about stochastic depth.
|
129 |
+
if drop_connect_rate:
|
130 |
+
x = drop_connect(x, p=drop_connect_rate, training=self.training)
|
131 |
+
x = x + inputs # skip connection
|
132 |
+
return x
|
133 |
+
|
134 |
+
def set_swish(self, memory_efficient=True):
|
135 |
+
"""Sets swish function as memory efficient (for training) or standard (for export).
|
136 |
+
|
137 |
+
Args:
|
138 |
+
memory_efficient (bool): Whether to use memory-efficient version of swish.
|
139 |
+
"""
|
140 |
+
self._swish = MemoryEfficientSwish() if memory_efficient else Swish()
|
141 |
+
|
142 |
+
|
143 |
+
class EfficientNet(nn.Module):
|
144 |
+
"""EfficientNet model.
|
145 |
+
Most easily loaded with the .from_name or .from_pretrained methods.
|
146 |
+
|
147 |
+
Args:
|
148 |
+
blocks_args (list[namedtuple]): A list of BlockArgs to construct blocks.
|
149 |
+
global_params (namedtuple): A set of GlobalParams shared between blocks.
|
150 |
+
|
151 |
+
References:
|
152 |
+
[1] https://arxiv.org/abs/1905.11946 (EfficientNet)
|
153 |
+
|
154 |
+
Example:
|
155 |
+
>>> import torch
|
156 |
+
>>> from efficientnet.model import EfficientNet
|
157 |
+
>>> inputs = torch.rand(1, 3, 224, 224)
|
158 |
+
>>> model = EfficientNet.from_pretrained('efficientnet-b0')
|
159 |
+
>>> model.eval()
|
160 |
+
>>> outputs = model(inputs)
|
161 |
+
"""
|
162 |
+
|
163 |
+
def __init__(self, blocks_args=None, global_params=None):
|
164 |
+
super().__init__()
|
165 |
+
assert isinstance(blocks_args, list), 'blocks_args should be a list'
|
166 |
+
assert len(blocks_args) > 0, 'block args must be greater than 0'
|
167 |
+
self._global_params = global_params
|
168 |
+
self._blocks_args = blocks_args
|
169 |
+
|
170 |
+
# Batch norm parameters
|
171 |
+
bn_mom = 1 - self._global_params.batch_norm_momentum
|
172 |
+
bn_eps = self._global_params.batch_norm_epsilon
|
173 |
+
|
174 |
+
# Get stem static or dynamic convolution depending on image size
|
175 |
+
image_size = global_params.image_size
|
176 |
+
Conv2d = get_same_padding_conv2d(image_size=image_size)
|
177 |
+
|
178 |
+
# Stem
|
179 |
+
in_channels = 3 # rgb
|
180 |
+
out_channels = round_filters(32, self._global_params) # number of output channels
|
181 |
+
self._conv_stem = Conv2d(in_channels, out_channels, kernel_size=3, stride=2, bias=False)
|
182 |
+
self._bn0 = nn.BatchNorm2d(num_features=out_channels, momentum=bn_mom, eps=bn_eps)
|
183 |
+
image_size = calculate_output_image_size(image_size, 2)
|
184 |
+
|
185 |
+
# Build blocks
|
186 |
+
self._blocks = nn.ModuleList([])
|
187 |
+
for block_args in self._blocks_args:
|
188 |
+
|
189 |
+
# Update block input and output filters based on depth multiplier.
|
190 |
+
block_args = block_args._replace(
|
191 |
+
input_filters=round_filters(block_args.input_filters, self._global_params),
|
192 |
+
output_filters=round_filters(block_args.output_filters, self._global_params),
|
193 |
+
num_repeat=round_repeats(block_args.num_repeat, self._global_params)
|
194 |
+
)
|
195 |
+
|
196 |
+
# The first block needs to take care of stride and filter size increase.
|
197 |
+
self._blocks.append(MBConvBlock(block_args, self._global_params, image_size=image_size))
|
198 |
+
image_size = calculate_output_image_size(image_size, block_args.stride)
|
199 |
+
if block_args.num_repeat > 1: # modify block_args to keep same output size
|
200 |
+
block_args = block_args._replace(input_filters=block_args.output_filters, stride=1)
|
201 |
+
for _ in range(block_args.num_repeat - 1):
|
202 |
+
self._blocks.append(MBConvBlock(block_args, self._global_params, image_size=image_size))
|
203 |
+
# image_size = calculate_output_image_size(image_size, block_args.stride) # stride = 1
|
204 |
+
|
205 |
+
# Head
|
206 |
+
in_channels = block_args.output_filters # output of final block
|
207 |
+
out_channels = round_filters(1280, self._global_params)
|
208 |
+
Conv2d = get_same_padding_conv2d(image_size=image_size)
|
209 |
+
self._conv_head = Conv2d(in_channels, out_channels, kernel_size=1, bias=False)
|
210 |
+
self._bn1 = nn.BatchNorm2d(num_features=out_channels, momentum=bn_mom, eps=bn_eps)
|
211 |
+
|
212 |
+
# Final linear layer
|
213 |
+
self._avg_pooling = nn.AdaptiveAvgPool2d(1)
|
214 |
+
if self._global_params.include_top:
|
215 |
+
self._dropout = nn.Dropout(self._global_params.dropout_rate)
|
216 |
+
self._fc = nn.Linear(out_channels, self._global_params.num_classes)
|
217 |
+
|
218 |
+
# set activation to memory efficient swish by default
|
219 |
+
self._swish = MemoryEfficientSwish()
|
220 |
+
|
221 |
+
def set_swish(self, memory_efficient=True):
|
222 |
+
"""Sets swish function as memory efficient (for training) or standard (for export).
|
223 |
+
|
224 |
+
Args:
|
225 |
+
memory_efficient (bool): Whether to use memory-efficient version of swish.
|
226 |
+
"""
|
227 |
+
self._swish = MemoryEfficientSwish() if memory_efficient else Swish()
|
228 |
+
for block in self._blocks:
|
229 |
+
block.set_swish(memory_efficient)
|
230 |
+
|
231 |
+
def extract_endpoints(self, inputs):
|
232 |
+
"""Use convolution layer to extract features
|
233 |
+
from reduction levels i in [1, 2, 3, 4, 5].
|
234 |
+
|
235 |
+
Args:
|
236 |
+
inputs (tensor): Input tensor.
|
237 |
+
|
238 |
+
Returns:
|
239 |
+
Dictionary of last intermediate features
|
240 |
+
with reduction levels i in [1, 2, 3, 4, 5].
|
241 |
+
Example:
|
242 |
+
>>> import torch
|
243 |
+
>>> from efficientnet.model import EfficientNet
|
244 |
+
>>> inputs = torch.rand(1, 3, 224, 224)
|
245 |
+
>>> model = EfficientNet.from_pretrained('efficientnet-b0')
|
246 |
+
>>> endpoints = model.extract_endpoints(inputs)
|
247 |
+
>>> print(endpoints['reduction_1'].shape) # torch.Size([1, 16, 112, 112])
|
248 |
+
>>> print(endpoints['reduction_2'].shape) # torch.Size([1, 24, 56, 56])
|
249 |
+
>>> print(endpoints['reduction_3'].shape) # torch.Size([1, 40, 28, 28])
|
250 |
+
>>> print(endpoints['reduction_4'].shape) # torch.Size([1, 112, 14, 14])
|
251 |
+
>>> print(endpoints['reduction_5'].shape) # torch.Size([1, 320, 7, 7])
|
252 |
+
>>> print(endpoints['reduction_6'].shape) # torch.Size([1, 1280, 7, 7])
|
253 |
+
"""
|
254 |
+
endpoints = dict()
|
255 |
+
|
256 |
+
# Stem
|
257 |
+
x = self._swish(self._bn0(self._conv_stem(inputs)))
|
258 |
+
prev_x = x
|
259 |
+
|
260 |
+
# Blocks
|
261 |
+
for idx, block in enumerate(self._blocks):
|
262 |
+
drop_connect_rate = self._global_params.drop_connect_rate
|
263 |
+
if drop_connect_rate:
|
264 |
+
drop_connect_rate *= float(idx) / len(self._blocks) # scale drop connect_rate
|
265 |
+
x = block(x, drop_connect_rate=drop_connect_rate)
|
266 |
+
if prev_x.size(2) > x.size(2):
|
267 |
+
endpoints['reduction_{}'.format(len(endpoints) + 1)] = prev_x
|
268 |
+
elif idx == len(self._blocks) - 1:
|
269 |
+
endpoints['reduction_{}'.format(len(endpoints) + 1)] = x
|
270 |
+
prev_x = x
|
271 |
+
|
272 |
+
# Head
|
273 |
+
x = self._swish(self._bn1(self._conv_head(x)))
|
274 |
+
endpoints['reduction_{}'.format(len(endpoints) + 1)] = x
|
275 |
+
|
276 |
+
return endpoints
|
277 |
+
|
278 |
+
def extract_features(self, inputs):
|
279 |
+
"""use convolution layer to extract feature .
|
280 |
+
|
281 |
+
Args:
|
282 |
+
inputs (tensor): Input tensor.
|
283 |
+
|
284 |
+
Returns:
|
285 |
+
Output of the final convolution
|
286 |
+
layer in the efficientnet model.
|
287 |
+
"""
|
288 |
+
# Stem
|
289 |
+
x = self._swish(self._bn0(self._conv_stem(inputs)))
|
290 |
+
|
291 |
+
# Blocks
|
292 |
+
for idx, block in enumerate(self._blocks):
|
293 |
+
drop_connect_rate = self._global_params.drop_connect_rate
|
294 |
+
if drop_connect_rate:
|
295 |
+
drop_connect_rate *= float(idx) / len(self._blocks) # scale drop connect_rate
|
296 |
+
x = block(x, drop_connect_rate=drop_connect_rate)
|
297 |
+
|
298 |
+
# Head
|
299 |
+
x = self._swish(self._bn1(self._conv_head(x)))
|
300 |
+
|
301 |
+
return x
|
302 |
+
|
303 |
+
def forward(self, inputs):
|
304 |
+
"""EfficientNet's forward function.
|
305 |
+
Calls extract_features to extract features, applies final linear layer, and returns logits.
|
306 |
+
|
307 |
+
Args:
|
308 |
+
inputs (tensor): Input tensor.
|
309 |
+
|
310 |
+
Returns:
|
311 |
+
Output of this model after processing.
|
312 |
+
"""
|
313 |
+
# Convolution layers
|
314 |
+
x = self.extract_features(inputs)
|
315 |
+
# Pooling and final linear layer
|
316 |
+
x = self._avg_pooling(x)
|
317 |
+
if self._global_params.include_top:
|
318 |
+
x = x.flatten(start_dim=1)
|
319 |
+
x = self._dropout(x)
|
320 |
+
x = self._fc(x)
|
321 |
+
return x
|
322 |
+
|
323 |
+
@classmethod
|
324 |
+
def from_name(cls, model_name, in_channels=3, **override_params):
|
325 |
+
"""Create an efficientnet model according to name.
|
326 |
+
|
327 |
+
Args:
|
328 |
+
model_name (str): Name for efficientnet.
|
329 |
+
in_channels (int): Input data's channel number.
|
330 |
+
override_params (other key word params):
|
331 |
+
Params to override model's global_params.
|
332 |
+
Optional key:
|
333 |
+
'width_coefficient', 'depth_coefficient',
|
334 |
+
'image_size', 'dropout_rate',
|
335 |
+
'num_classes', 'batch_norm_momentum',
|
336 |
+
'batch_norm_epsilon', 'drop_connect_rate',
|
337 |
+
'depth_divisor', 'min_depth'
|
338 |
+
|
339 |
+
Returns:
|
340 |
+
An efficientnet model.
|
341 |
+
"""
|
342 |
+
cls._check_model_name_is_valid(model_name)
|
343 |
+
blocks_args, global_params = get_model_params(model_name, override_params)
|
344 |
+
model = cls(blocks_args, global_params)
|
345 |
+
model._change_in_channels(in_channels)
|
346 |
+
return model
|
347 |
+
|
348 |
+
@classmethod
|
349 |
+
def from_pretrained(cls, model_name, weights_path=None, advprop=False,
|
350 |
+
in_channels=3, num_classes=1000, **override_params):
|
351 |
+
"""Create an efficientnet model according to name.
|
352 |
+
|
353 |
+
Args:
|
354 |
+
model_name (str): Name for efficientnet.
|
355 |
+
weights_path (None or str):
|
356 |
+
str: path to pretrained weights file on the local disk.
|
357 |
+
None: use pretrained weights downloaded from the Internet.
|
358 |
+
advprop (bool):
|
359 |
+
Whether to load pretrained weights
|
360 |
+
trained with advprop (valid when weights_path is None).
|
361 |
+
in_channels (int): Input data's channel number.
|
362 |
+
num_classes (int):
|
363 |
+
Number of categories for classification.
|
364 |
+
It controls the output size for final linear layer.
|
365 |
+
override_params (other key word params):
|
366 |
+
Params to override model's global_params.
|
367 |
+
Optional key:
|
368 |
+
'width_coefficient', 'depth_coefficient',
|
369 |
+
'image_size', 'dropout_rate',
|
370 |
+
'batch_norm_momentum',
|
371 |
+
'batch_norm_epsilon', 'drop_connect_rate',
|
372 |
+
'depth_divisor', 'min_depth'
|
373 |
+
|
374 |
+
Returns:
|
375 |
+
A pretrained efficientnet model.
|
376 |
+
"""
|
377 |
+
model = cls.from_name(model_name, num_classes=num_classes, **override_params)
|
378 |
+
load_pretrained_weights(model, model_name, weights_path=weights_path,
|
379 |
+
load_fc=(num_classes == 1000), advprop=advprop)
|
380 |
+
model._change_in_channels(in_channels)
|
381 |
+
return model
|
382 |
+
|
383 |
+
@classmethod
|
384 |
+
def get_image_size(cls, model_name):
|
385 |
+
"""Get the input image size for a given efficientnet model.
|
386 |
+
|
387 |
+
Args:
|
388 |
+
model_name (str): Name for efficientnet.
|
389 |
+
|
390 |
+
Returns:
|
391 |
+
Input image size (resolution).
|
392 |
+
"""
|
393 |
+
cls._check_model_name_is_valid(model_name)
|
394 |
+
_, _, res, _ = efficientnet_params(model_name)
|
395 |
+
return res
|
396 |
+
|
397 |
+
@classmethod
|
398 |
+
def _check_model_name_is_valid(cls, model_name):
|
399 |
+
"""Validates model name.
|
400 |
+
|
401 |
+
Args:
|
402 |
+
model_name (str): Name for efficientnet.
|
403 |
+
|
404 |
+
Returns:
|
405 |
+
bool: Is a valid name or not.
|
406 |
+
"""
|
407 |
+
if model_name not in VALID_MODELS:
|
408 |
+
raise ValueError('model_name should be one of: ' + ', '.join(VALID_MODELS))
|
409 |
+
|
410 |
+
def _change_in_channels(self, in_channels):
|
411 |
+
"""Adjust model's first convolution layer to in_channels, if in_channels not equals 3.
|
412 |
+
|
413 |
+
Args:
|
414 |
+
in_channels (int): Input data's channel number.
|
415 |
+
"""
|
416 |
+
if in_channels != 3:
|
417 |
+
Conv2d = get_same_padding_conv2d(image_size=self._global_params.image_size)
|
418 |
+
out_channels = round_filters(32, self._global_params)
|
419 |
+
self._conv_stem = Conv2d(in_channels, out_channels, kernel_size=3, stride=2, bias=False)
|
deepafx_st/models/efficient_net/utils.py
ADDED
@@ -0,0 +1,616 @@
1 |
+
"""utils.py - Helper functions for building the model and for loading model parameters.
|
2 |
+
These helper functions are built to mirror those in the official TensorFlow implementation.
|
3 |
+
"""
|
4 |
+
|
5 |
+
# Author: lukemelas (github username)
|
6 |
+
# Github repo: https://github.com/lukemelas/EfficientNet-PyTorch
|
7 |
+
# With adjustments and added comments by workingcoder (github username).
|
8 |
+
|
9 |
+
import re
|
10 |
+
import math
|
11 |
+
import collections
|
12 |
+
from functools import partial
|
13 |
+
import torch
|
14 |
+
from torch import nn
|
15 |
+
from torch.nn import functional as F
|
16 |
+
from torch.utils import model_zoo
|
17 |
+
|
18 |
+
|
19 |
+
################################################################################
|
20 |
+
# Helper functions for model architecture
|
21 |
+
################################################################################
|
22 |
+
|
23 |
+
# GlobalParams and BlockArgs: Two namedtuples
|
24 |
+
# Swish and MemoryEfficientSwish: Two implementations of the method
|
25 |
+
# round_filters and round_repeats:
|
26 |
+
# Functions to calculate params for scaling model width and depth ! ! !
|
27 |
+
# get_width_and_height_from_size and calculate_output_image_size
|
28 |
+
# drop_connect: A structural design
|
29 |
+
# get_same_padding_conv2d:
|
30 |
+
# Conv2dDynamicSamePadding
|
31 |
+
# Conv2dStaticSamePadding
|
32 |
+
# get_same_padding_maxPool2d:
|
33 |
+
# MaxPool2dDynamicSamePadding
|
34 |
+
# MaxPool2dStaticSamePadding
|
35 |
+
# It's an additional function, not used in EfficientNet,
|
36 |
+
# but can be used in other models (such as EfficientDet).
|
37 |
+
|
38 |
+
# Parameters for the entire model (stem, all blocks, and head)
|
39 |
+
GlobalParams = collections.namedtuple('GlobalParams', [
|
40 |
+
'width_coefficient', 'depth_coefficient', 'image_size', 'dropout_rate',
|
41 |
+
'num_classes', 'batch_norm_momentum', 'batch_norm_epsilon',
|
42 |
+
'drop_connect_rate', 'depth_divisor', 'min_depth', 'include_top'])
|
43 |
+
|
44 |
+
# Parameters for an individual model block
|
45 |
+
BlockArgs = collections.namedtuple('BlockArgs', [
|
46 |
+
'num_repeat', 'kernel_size', 'stride', 'expand_ratio',
|
47 |
+
'input_filters', 'output_filters', 'se_ratio', 'id_skip'])
|
48 |
+
|
49 |
+
# Set GlobalParams and BlockArgs's defaults
|
50 |
+
GlobalParams.__new__.__defaults__ = (None,) * len(GlobalParams._fields)
|
51 |
+
BlockArgs.__new__.__defaults__ = (None,) * len(BlockArgs._fields)
|
52 |
+
|
53 |
+
# Swish activation function
|
54 |
+
if hasattr(nn, 'SiLU'):
|
55 |
+
Swish = nn.SiLU
|
56 |
+
else:
|
57 |
+
# For compatibility with old PyTorch versions
|
58 |
+
class Swish(nn.Module):
|
59 |
+
def forward(self, x):
|
60 |
+
return x * torch.sigmoid(x)
|
61 |
+
|
62 |
+
|
63 |
+
# A memory-efficient implementation of Swish function
|
64 |
+
class SwishImplementation(torch.autograd.Function):
|
65 |
+
@staticmethod
|
66 |
+
def forward(ctx, i):
|
67 |
+
result = i * torch.sigmoid(i)
|
68 |
+
ctx.save_for_backward(i)
|
69 |
+
return result
|
70 |
+
|
71 |
+
@staticmethod
|
72 |
+
def backward(ctx, grad_output):
|
73 |
+
i = ctx.saved_tensors[0]
|
74 |
+
sigmoid_i = torch.sigmoid(i)
|
75 |
+
return grad_output * (sigmoid_i * (1 + i * (1 - sigmoid_i)))
|
76 |
+
|
77 |
+
|
78 |
+
class MemoryEfficientSwish(nn.Module):
|
79 |
+
def forward(self, x):
|
80 |
+
return SwishImplementation.apply(x)
|
81 |
+
|
82 |
+
|
83 |
+
def round_filters(filters, global_params):
|
84 |
+
"""Calculate and round number of filters based on width multiplier.
|
85 |
+
Use width_coefficient, depth_divisor and min_depth of global_params.
|
86 |
+
|
87 |
+
Args:
|
88 |
+
filters (int): Filters number to be calculated.
|
89 |
+
global_params (namedtuple): Global params of the model.
|
90 |
+
|
91 |
+
Returns:
|
92 |
+
new_filters: New number of filters after scaling and rounding.
|
93 |
+
"""
|
94 |
+
multiplier = global_params.width_coefficient
|
95 |
+
if not multiplier:
|
96 |
+
return filters
|
97 |
+
# TODO: modify the params names.
|
98 |
+
# maybe the names (width_divisor,min_width)
|
99 |
+
# are more suitable than (depth_divisor,min_depth).
|
100 |
+
divisor = global_params.depth_divisor
|
101 |
+
min_depth = global_params.min_depth
|
102 |
+
filters *= multiplier
|
103 |
+
min_depth = min_depth or divisor # pay attention to this line when using min_depth
|
104 |
+
# follow the formula transferred from official TensorFlow implementation
|
105 |
+
new_filters = max(min_depth, int(filters + divisor / 2) // divisor * divisor)
|
106 |
+
if new_filters < 0.9 * filters: # prevent rounding by more than 10%
|
107 |
+
new_filters += divisor
|
108 |
+
return int(new_filters)
|
109 |
+
|
110 |
+
|
111 |
+
def round_repeats(repeats, global_params):
|
112 |
+
"""Calculate module's repeat number of a block based on depth multiplier.
|
113 |
+
Use depth_coefficient of global_params.
|
114 |
+
|
115 |
+
Args:
|
116 |
+
repeats (int): num_repeat to be calculated.
|
117 |
+
global_params (namedtuple): Global params of the model.
|
118 |
+
|
119 |
+
Returns:
|
120 |
+
new repeat: New number of repeats after scaling.
|
121 |
+
"""
|
122 |
+
multiplier = global_params.depth_coefficient
|
123 |
+
if not multiplier:
|
124 |
+
return repeats
|
125 |
+
# follow the formula transferred from official TensorFlow implementation
|
126 |
+
return int(math.ceil(multiplier * repeats))
|
127 |
+
|
128 |
+
|
129 |
+
def drop_connect(inputs, p, training):
|
130 |
+
"""Drop connect.
|
131 |
+
|
132 |
+
Args:
|
133 |
+
input (tensor: BCWH): Input of this structure.
|
134 |
+
p (float: 0.0~1.0): Probability of drop connection.
|
135 |
+
training (bool): The running mode.
|
136 |
+
|
137 |
+
Returns:
|
138 |
+
output: Output after drop connection.
|
139 |
+
"""
|
140 |
+
assert 0 <= p <= 1, 'p must be in range of [0,1]'
|
141 |
+
|
142 |
+
if not training:
|
143 |
+
return inputs
|
144 |
+
|
145 |
+
batch_size = inputs.shape[0]
|
146 |
+
keep_prob = 1 - p
|
147 |
+
|
148 |
+
# generate binary_tensor mask according to probability (p for 0, 1-p for 1)
|
149 |
+
random_tensor = keep_prob
|
150 |
+
random_tensor += torch.rand([batch_size, 1, 1, 1], dtype=inputs.dtype, device=inputs.device)
|
151 |
+
binary_tensor = torch.floor(random_tensor)
|
152 |
+
|
153 |
+
output = inputs / keep_prob * binary_tensor
|
154 |
+
return output
|
155 |
+
|
156 |
+
|
157 |
+
def get_width_and_height_from_size(x):
|
158 |
+
"""Obtain height and width from x.
|
159 |
+
|
160 |
+
Args:
|
161 |
+
x (int, tuple or list): Data size.
|
162 |
+
|
163 |
+
Returns:
|
164 |
+
size: A tuple or list (H,W).
|
165 |
+
"""
|
166 |
+
if isinstance(x, int):
|
167 |
+
return x, x
|
168 |
+
if isinstance(x, list) or isinstance(x, tuple):
|
169 |
+
return x
|
170 |
+
else:
|
171 |
+
raise TypeError()
|
172 |
+
|
173 |
+
|
174 |
+
def calculate_output_image_size(input_image_size, stride):
|
175 |
+
"""Calculates the output image size when using Conv2dSamePadding with a stride.
|
176 |
+
Necessary for static padding. Thanks to mannatsingh for pointing this out.
|
177 |
+
|
178 |
+
Args:
|
179 |
+
input_image_size (int, tuple or list): Size of input image.
|
180 |
+
stride (int, tuple or list): Conv2d operation's stride.
|
181 |
+
|
182 |
+
Returns:
|
183 |
+
output_image_size: A list [H,W].
|
184 |
+
"""
|
185 |
+
if input_image_size is None:
|
186 |
+
return None
|
187 |
+
image_height, image_width = get_width_and_height_from_size(input_image_size)
|
188 |
+
stride = stride if isinstance(stride, int) else stride[0]
|
189 |
+
image_height = int(math.ceil(image_height / stride))
|
190 |
+
image_width = int(math.ceil(image_width / stride))
|
191 |
+
return [image_height, image_width]
|
192 |
+
|
193 |
+
|
194 |
+
# Note:
|
195 |
+
# The following 'SamePadding' functions make output size equal ceil(input size/stride).
|
196 |
+
# Only when stride equals 1, can the output size be the same as input size.
|
197 |
+
# Don't be confused by their function names ! ! !
|
198 |
+
|
199 |
+
def get_same_padding_conv2d(image_size=None):
|
200 |
+
"""Chooses static padding if you have specified an image size, and dynamic padding otherwise.
|
201 |
+
Static padding is necessary for ONNX exporting of models.
|
202 |
+
|
203 |
+
Args:
|
204 |
+
image_size (int or tuple): Size of the image.
|
205 |
+
|
206 |
+
Returns:
|
207 |
+
Conv2dDynamicSamePadding or Conv2dStaticSamePadding.
|
208 |
+
"""
|
209 |
+
if image_size is None:
|
210 |
+
return Conv2dDynamicSamePadding
|
211 |
+
else:
|
212 |
+
return partial(Conv2dStaticSamePadding, image_size=image_size)
|
213 |
+
|
214 |
+
|
215 |
+
class Conv2dDynamicSamePadding(nn.Conv2d):
|
216 |
+
"""2D Convolutions like TensorFlow, for a dynamic image size.
|
217 |
+
The padding is operated in forward function by calculating dynamically.
|
218 |
+
"""
|
219 |
+
|
220 |
+
# Tips for 'SAME' mode padding.
|
221 |
+
# Given the following:
|
222 |
+
# i: width or height
|
223 |
+
# s: stride
|
224 |
+
# k: kernel size
|
225 |
+
# d: dilation
|
226 |
+
# p: padding
|
227 |
+
# Output after Conv2d:
|
228 |
+
# o = floor((i+p-((k-1)*d+1))/s+1)
|
229 |
+
# If o equals i, i = floor((i+p-((k-1)*d+1))/s+1),
|
230 |
+
# => p = (i-1)*s+((k-1)*d+1)-i
|
231 |
+
|
232 |
+
def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, groups=1, bias=True):
|
233 |
+
super().__init__(in_channels, out_channels, kernel_size, stride, 0, dilation, groups, bias)
|
234 |
+
self.stride = self.stride if len(self.stride) == 2 else [self.stride[0]] * 2
|
235 |
+
|
236 |
+
def forward(self, x):
|
237 |
+
ih, iw = x.size()[-2:]
|
238 |
+
kh, kw = self.weight.size()[-2:]
|
239 |
+
sh, sw = self.stride
|
240 |
+
oh, ow = math.ceil(ih / sh), math.ceil(iw / sw) # change the output size according to stride ! ! !
|
241 |
+
pad_h = max((oh - 1) * self.stride[0] + (kh - 1) * self.dilation[0] + 1 - ih, 0)
|
242 |
+
pad_w = max((ow - 1) * self.stride[1] + (kw - 1) * self.dilation[1] + 1 - iw, 0)
|
243 |
+
if pad_h > 0 or pad_w > 0:
|
244 |
+
x = F.pad(x, [pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2])
|
245 |
+
return F.conv2d(x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups)
|
246 |
+
|
247 |
+
|
248 |
+
class Conv2dStaticSamePadding(nn.Conv2d):
|
249 |
+
"""2D Convolutions like TensorFlow's 'SAME' mode, with the given input image size.
|
250 |
+
The padding module is calculated in the constructor, then used in forward.
|
251 |
+
"""
|
252 |
+
|
253 |
+
# With the same calculation as Conv2dDynamicSamePadding
|
254 |
+
|
255 |
+
def __init__(self, in_channels, out_channels, kernel_size, stride=1, image_size=None, **kwargs):
|
256 |
+
super().__init__(in_channels, out_channels, kernel_size, stride, **kwargs)
|
257 |
+
self.stride = self.stride if len(self.stride) == 2 else [self.stride[0]] * 2
|
258 |
+
|
259 |
+
# Calculate padding based on image size and save it
|
260 |
+
assert image_size is not None
|
261 |
+
ih, iw = (image_size, image_size) if isinstance(image_size, int) else image_size
|
262 |
+
kh, kw = self.weight.size()[-2:]
|
263 |
+
sh, sw = self.stride
|
264 |
+
oh, ow = math.ceil(ih / sh), math.ceil(iw / sw)
|
265 |
+
pad_h = max((oh - 1) * self.stride[0] + (kh - 1) * self.dilation[0] + 1 - ih, 0)
|
266 |
+
pad_w = max((ow - 1) * self.stride[1] + (kw - 1) * self.dilation[1] + 1 - iw, 0)
|
267 |
+
if pad_h > 0 or pad_w > 0:
|
268 |
+
self.static_padding = nn.ZeroPad2d((pad_w // 2, pad_w - pad_w // 2,
|
269 |
+
pad_h // 2, pad_h - pad_h // 2))
|
270 |
+
else:
|
271 |
+
self.static_padding = nn.Identity()
|
272 |
+
|
273 |
+
def forward(self, x):
|
274 |
+
x = self.static_padding(x)
|
275 |
+
x = F.conv2d(x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups)
|
276 |
+
return x
|
277 |
+
|
278 |
+
|
279 |
+
def get_same_padding_maxPool2d(image_size=None):
|
280 |
+
"""Chooses static padding if you have specified an image size, and dynamic padding otherwise.
|
281 |
+
Static padding is necessary for ONNX exporting of models.
|
282 |
+
|
283 |
+
Args:
|
284 |
+
image_size (int or tuple): Size of the image.
|
285 |
+
|
286 |
+
Returns:
|
287 |
+
MaxPool2dDynamicSamePadding or MaxPool2dStaticSamePadding.
|
288 |
+
"""
|
289 |
+
if image_size is None:
|
290 |
+
return MaxPool2dDynamicSamePadding
|
291 |
+
else:
|
292 |
+
return partial(MaxPool2dStaticSamePadding, image_size=image_size)
|
293 |
+
|
294 |
+
|
295 |
+
class MaxPool2dDynamicSamePadding(nn.MaxPool2d):
|
296 |
+
"""2D MaxPooling like TensorFlow's 'SAME' mode, with a dynamic image size.
|
297 |
+
The padding is operated in forward function by calculating dynamically.
|
298 |
+
"""
|
299 |
+
|
300 |
+
def __init__(self, kernel_size, stride, padding=0, dilation=1, return_indices=False, ceil_mode=False):
|
301 |
+
super().__init__(kernel_size, stride, padding, dilation, return_indices, ceil_mode)
|
302 |
+
self.stride = [self.stride] * 2 if isinstance(self.stride, int) else self.stride
|
303 |
+
self.kernel_size = [self.kernel_size] * 2 if isinstance(self.kernel_size, int) else self.kernel_size
|
304 |
+
self.dilation = [self.dilation] * 2 if isinstance(self.dilation, int) else self.dilation
|
305 |
+
|
306 |
+
def forward(self, x):
|
307 |
+
ih, iw = x.size()[-2:]
|
308 |
+
kh, kw = self.kernel_size
|
309 |
+
sh, sw = self.stride
|
310 |
+
oh, ow = math.ceil(ih / sh), math.ceil(iw / sw)
|
311 |
+
pad_h = max((oh - 1) * self.stride[0] + (kh - 1) * self.dilation[0] + 1 - ih, 0)
|
312 |
+
pad_w = max((ow - 1) * self.stride[1] + (kw - 1) * self.dilation[1] + 1 - iw, 0)
|
313 |
+
if pad_h > 0 or pad_w > 0:
|
314 |
+
x = F.pad(x, [pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2])
|
315 |
+
return F.max_pool2d(x, self.kernel_size, self.stride, self.padding,
|
316 |
+
self.dilation, self.ceil_mode, self.return_indices)
|
317 |
+
|
318 |
+
|
319 |
+
class MaxPool2dStaticSamePadding(nn.MaxPool2d):
|
320 |
+
"""2D MaxPooling like TensorFlow's 'SAME' mode, with the given input image size.
|
321 |
+
The padding module is calculated in the constructor, then used in forward.
|
322 |
+
"""
|
323 |
+
|
324 |
+
def __init__(self, kernel_size, stride, image_size=None, **kwargs):
|
325 |
+
super().__init__(kernel_size, stride, **kwargs)
|
326 |
+
self.stride = [self.stride] * 2 if isinstance(self.stride, int) else self.stride
|
327 |
+
self.kernel_size = [self.kernel_size] * 2 if isinstance(self.kernel_size, int) else self.kernel_size
|
328 |
+
self.dilation = [self.dilation] * 2 if isinstance(self.dilation, int) else self.dilation
|
329 |
+
|
330 |
+
# Calculate padding based on image size and save it
|
331 |
+
assert image_size is not None
|
332 |
+
ih, iw = (image_size, image_size) if isinstance(image_size, int) else image_size
|
333 |
+
kh, kw = self.kernel_size
|
334 |
+
sh, sw = self.stride
|
335 |
+
oh, ow = math.ceil(ih / sh), math.ceil(iw / sw)
|
336 |
+
pad_h = max((oh - 1) * self.stride[0] + (kh - 1) * self.dilation[0] + 1 - ih, 0)
|
337 |
+
pad_w = max((ow - 1) * self.stride[1] + (kw - 1) * self.dilation[1] + 1 - iw, 0)
|
338 |
+
if pad_h > 0 or pad_w > 0:
|
339 |
+
self.static_padding = nn.ZeroPad2d((pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2))
|
340 |
+
else:
|
341 |
+
self.static_padding = nn.Identity()
|
342 |
+
|
343 |
+
def forward(self, x):
|
344 |
+
x = self.static_padding(x)
|
345 |
+
x = F.max_pool2d(x, self.kernel_size, self.stride, self.padding,
|
346 |
+
self.dilation, self.ceil_mode, self.return_indices)
|
347 |
+
return x
|
348 |
+
|
349 |
+
|
350 |
+
################################################################################
|
351 |
+
# Helper functions for loading model params
|
352 |
+
################################################################################
|
353 |
+
|
354 |
+
# BlockDecoder: A Class for encoding and decoding BlockArgs
|
355 |
+
# efficientnet_params: A function to query compound coefficient
|
356 |
+
# get_model_params and efficientnet:
|
357 |
+
# Functions to get BlockArgs and GlobalParams for efficientnet
|
358 |
+
# url_map and url_map_advprop: Dicts of url_map for pretrained weights
|
359 |
+
# load_pretrained_weights: A function to load pretrained weights
|
360 |
+
|
361 |
+
class BlockDecoder(object):
|
362 |
+
"""Block Decoder for readability,
|
363 |
+
straight from the official TensorFlow repository.
|
364 |
+
"""
|
365 |
+
|
366 |
+
@staticmethod
|
367 |
+
def _decode_block_string(block_string):
|
368 |
+
"""Get a block through a string notation of arguments.
|
369 |
+
|
370 |
+
Args:
|
371 |
+
block_string (str): A string notation of arguments.
|
372 |
+
Examples: 'r1_k3_s11_e1_i32_o16_se0.25_noskip'.
|
373 |
+
|
374 |
+
Returns:
|
375 |
+
BlockArgs: The namedtuple defined at the top of this file.
|
376 |
+
"""
|
377 |
+
assert isinstance(block_string, str)
|
378 |
+
|
379 |
+
ops = block_string.split('_')
|
380 |
+
options = {}
|
381 |
+
for op in ops:
|
382 |
+
splits = re.split(r'(\d.*)', op)
|
383 |
+
if len(splits) >= 2:
|
384 |
+
key, value = splits[:2]
|
385 |
+
options[key] = value
|
386 |
+
|
387 |
+
# Check stride
|
388 |
+
assert (('s' in options and len(options['s']) == 1) or
|
389 |
+
(len(options['s']) == 2 and options['s'][0] == options['s'][1]))
|
390 |
+
|
391 |
+
return BlockArgs(
|
392 |
+
num_repeat=int(options['r']),
|
393 |
+
kernel_size=int(options['k']),
|
394 |
+
stride=[int(options['s'][0])],
|
395 |
+
expand_ratio=int(options['e']),
|
396 |
+
input_filters=int(options['i']),
|
397 |
+
output_filters=int(options['o']),
|
398 |
+
se_ratio=float(options['se']) if 'se' in options else None,
|
399 |
+
id_skip=('noskip' not in block_string))
|
400 |
+
|
401 |
+
@staticmethod
|
402 |
+
def _encode_block_string(block):
|
403 |
+
"""Encode a block to a string.
|
404 |
+
|
405 |
+
Args:
|
406 |
+
block (namedtuple): A BlockArgs type argument.
|
407 |
+
|
408 |
+
Returns:
|
409 |
+
block_string: A String form of BlockArgs.
|
410 |
+
"""
|
411 |
+
args = [
|
412 |
+
'r%d' % block.num_repeat,
|
413 |
+
'k%d' % block.kernel_size,
|
414 |
+
's%d%d' % (block.stride[0], block.stride[-1]),  # BlockArgs stores 'stride' (a 1- or 2-element list)
|
415 |
+
'e%s' % block.expand_ratio,
|
416 |
+
'i%d' % block.input_filters,
|
417 |
+
'o%d' % block.output_filters
|
418 |
+
]
|
419 |
+
if 0 < block.se_ratio <= 1:
|
420 |
+
args.append('se%s' % block.se_ratio)
|
421 |
+
if block.id_skip is False:
|
422 |
+
args.append('noskip')
|
423 |
+
return '_'.join(args)
|
424 |
+
|
425 |
+
@staticmethod
|
426 |
+
def decode(string_list):
|
427 |
+
"""Decode a list of string notations to specify blocks inside the network.
|
428 |
+
|
429 |
+
Args:
|
430 |
+
string_list (list[str]): A list of strings, each string is a notation of block.
|
431 |
+
|
432 |
+
Returns:
|
433 |
+
blocks_args: A list of BlockArgs namedtuples of block args.
|
434 |
+
"""
|
435 |
+
assert isinstance(string_list, list)
|
436 |
+
blocks_args = []
|
437 |
+
for block_string in string_list:
|
438 |
+
blocks_args.append(BlockDecoder._decode_block_string(block_string))
|
439 |
+
return blocks_args
|
440 |
+
|
441 |
+
@staticmethod
|
442 |
+
def encode(blocks_args):
|
443 |
+
"""Encode a list of BlockArgs to a list of strings.
|
444 |
+
|
445 |
+
Args:
|
446 |
+
blocks_args (list[namedtuples]): A list of BlockArgs namedtuples of block args.
|
447 |
+
|
448 |
+
Returns:
|
449 |
+
block_strings: A list of strings, each string is a notation of block.
|
450 |
+
"""
|
451 |
+
block_strings = []
|
452 |
+
for block in blocks_args:
|
453 |
+
block_strings.append(BlockDecoder._encode_block_string(block))
|
454 |
+
return block_strings
|
455 |
+
|
456 |
+
|
457 |
+
def efficientnet_params(model_name):
|
458 |
+
"""Map EfficientNet model name to parameter coefficients.
|
459 |
+
|
460 |
+
Args:
|
461 |
+
model_name (str): Model name to be queried.
|
462 |
+
|
463 |
+
Returns:
|
464 |
+
params_dict[model_name]: A (width,depth,res,dropout) tuple.
|
465 |
+
"""
|
466 |
+
params_dict = {
|
467 |
+
# Coefficients: width,depth,res,dropout
|
468 |
+
'efficientnet-b0': (1.0, 1.0, 224, 0.2),
|
469 |
+
'efficientnet-b1': (1.0, 1.1, 240, 0.2),
|
470 |
+
'efficientnet-b2': (1.1, 1.2, 260, 0.3),
|
471 |
+
'efficientnet-b3': (1.2, 1.4, 300, 0.3),
|
472 |
+
'efficientnet-b4': (1.4, 1.8, 380, 0.4),
|
473 |
+
'efficientnet-b5': (1.6, 2.2, 456, 0.4),
|
474 |
+
'efficientnet-b6': (1.8, 2.6, 528, 0.5),
|
475 |
+
'efficientnet-b7': (2.0, 3.1, 600, 0.5),
|
476 |
+
'efficientnet-b8': (2.2, 3.6, 672, 0.5),
|
477 |
+
'efficientnet-l2': (4.3, 5.3, 800, 0.5),
|
478 |
+
}
|
479 |
+
return params_dict[model_name]
|
480 |
+
|
481 |
+
|
482 |
+
def efficientnet(width_coefficient=None, depth_coefficient=None, image_size=None,
|
483 |
+
dropout_rate=0.2, drop_connect_rate=0.2, num_classes=1000, include_top=True):
|
484 |
+
"""Create BlockArgs and GlobalParams for efficientnet model.
|
485 |
+
|
486 |
+
Args:
|
487 |
+
width_coefficient (float)
|
488 |
+
depth_coefficient (float)
|
489 |
+
image_size (int)
|
490 |
+
dropout_rate (float)
|
491 |
+
drop_connect_rate (float)
|
492 |
+
num_classes (int)
|
493 |
+
|
494 |
+
Meaning as the name suggests.
|
495 |
+
|
496 |
+
Returns:
|
497 |
+
blocks_args, global_params.
|
498 |
+
"""
|
499 |
+
|
500 |
+
# Blocks args for the whole model(efficientnet-b0 by default)
|
501 |
+
# It will be modified in the construction of EfficientNet Class according to model
|
502 |
+
blocks_args = [
|
503 |
+
'r1_k3_s11_e1_i32_o16_se0.25',
|
504 |
+
'r2_k3_s22_e6_i16_o24_se0.25',
|
505 |
+
'r2_k5_s22_e6_i24_o40_se0.25',
|
506 |
+
'r3_k3_s22_e6_i40_o80_se0.25',
|
507 |
+
'r3_k5_s11_e6_i80_o112_se0.25',
|
508 |
+
'r4_k5_s22_e6_i112_o192_se0.25',
|
509 |
+
'r1_k3_s11_e6_i192_o320_se0.25',
|
510 |
+
]
|
511 |
+
blocks_args = BlockDecoder.decode(blocks_args)
|
512 |
+
|
513 |
+
global_params = GlobalParams(
|
514 |
+
width_coefficient=width_coefficient,
|
515 |
+
depth_coefficient=depth_coefficient,
|
516 |
+
image_size=image_size,
|
517 |
+
dropout_rate=dropout_rate,
|
518 |
+
|
519 |
+
num_classes=num_classes,
|
520 |
+
batch_norm_momentum=0.99,
|
521 |
+
batch_norm_epsilon=1e-3,
|
522 |
+
drop_connect_rate=drop_connect_rate,
|
523 |
+
depth_divisor=8,
|
524 |
+
min_depth=None,
|
525 |
+
include_top=include_top,
|
526 |
+
)
|
527 |
+
|
528 |
+
return blocks_args, global_params
|
529 |
+
|
530 |
+
|
531 |
+
def get_model_params(model_name, override_params):
|
532 |
+
"""Get the block args and global params for a given model name.
|
533 |
+
|
534 |
+
Args:
|
535 |
+
model_name (str): Model's name.
|
536 |
+
override_params (dict): A dict to modify global_params.
|
537 |
+
|
538 |
+
Returns:
|
539 |
+
blocks_args, global_params
|
540 |
+
"""
|
541 |
+
if model_name.startswith('efficientnet'):
|
542 |
+
w, d, s, p = efficientnet_params(model_name)
|
543 |
+
# note: all models have drop connect rate = 0.2
|
544 |
+
blocks_args, global_params = efficientnet(
|
545 |
+
width_coefficient=w, depth_coefficient=d, dropout_rate=p, image_size=s)
|
546 |
+
else:
|
547 |
+
raise NotImplementedError('model name is not pre-defined: {}'.format(model_name))
|
548 |
+
if override_params:
|
549 |
+
# ValueError will be raised here if override_params has fields not included in global_params.
|
550 |
+
global_params = global_params._replace(**override_params)
|
551 |
+
return blocks_args, global_params
|
552 |
+
|
553 |
+
|
554 |
+
# train with Standard methods
|
555 |
+
# check more details in paper(EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks)
|
556 |
+
url_map = {
|
557 |
+
'efficientnet-b0': 'https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b0-355c32eb.pth',
|
558 |
+
'efficientnet-b1': 'https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b1-f1951068.pth',
|
559 |
+
'efficientnet-b2': 'https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b2-8bb594d6.pth',
|
560 |
+
'efficientnet-b3': 'https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b3-5fb5a3c3.pth',
|
561 |
+
'efficientnet-b4': 'https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b4-6ed6700e.pth',
|
562 |
+
'efficientnet-b5': 'https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b5-b6417697.pth',
|
563 |
+
'efficientnet-b6': 'https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b6-c76e70fd.pth',
|
564 |
+
'efficientnet-b7': 'https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b7-dcc49843.pth',
|
565 |
+
}
|
566 |
+
|
567 |
+
# train with Adversarial Examples(AdvProp)
|
568 |
+
# check more details in paper(Adversarial Examples Improve Image Recognition)
|
569 |
+
url_map_advprop = {
|
570 |
+
'efficientnet-b0': 'https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/adv-efficientnet-b0-b64d5a18.pth',
|
571 |
+
'efficientnet-b1': 'https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/adv-efficientnet-b1-0f3ce85a.pth',
|
572 |
+
'efficientnet-b2': 'https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/adv-efficientnet-b2-6e9d97e5.pth',
|
573 |
+
'efficientnet-b3': 'https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/adv-efficientnet-b3-cdd7c0f4.pth',
|
574 |
+
'efficientnet-b4': 'https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/adv-efficientnet-b4-44fb3a87.pth',
|
575 |
+
'efficientnet-b5': 'https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/adv-efficientnet-b5-86493f6b.pth',
|
576 |
+
'efficientnet-b6': 'https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/adv-efficientnet-b6-ac80338e.pth',
|
577 |
+
'efficientnet-b7': 'https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/adv-efficientnet-b7-4652b6dd.pth',
|
578 |
+
'efficientnet-b8': 'https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/adv-efficientnet-b8-22a8fe65.pth',
|
579 |
+
}
|
580 |
+
|
581 |
+
# TODO: add the pretrained weights url map of 'efficientnet-l2'
|
582 |
+
|
583 |
+
|
584 |
+
def load_pretrained_weights(model, model_name, weights_path=None, load_fc=True, advprop=False, verbose=True):
|
585 |
+
"""Loads pretrained weights from weights path or download using url.
|
586 |
+
|
587 |
+
Args:
|
588 |
+
model (Module): The whole model of efficientnet.
|
589 |
+
model_name (str): Model name of efficientnet.
|
590 |
+
weights_path (None or str):
|
591 |
+
str: path to pretrained weights file on the local disk.
|
592 |
+
None: use pretrained weights downloaded from the Internet.
|
593 |
+
load_fc (bool): Whether to load pretrained weights for fc layer at the end of the model.
|
594 |
+
advprop (bool): Whether to load pretrained weights
|
595 |
+
trained with advprop (valid when weights_path is None).
|
596 |
+
"""
|
597 |
+
if isinstance(weights_path, str):
|
598 |
+
state_dict = torch.load(weights_path)
|
599 |
+
else:
|
600 |
+
# AutoAugment or Advprop (different preprocessing)
|
601 |
+
url_map_ = url_map_advprop if advprop else url_map
|
602 |
+
state_dict = model_zoo.load_url(url_map_[model_name])
|
603 |
+
|
604 |
+
if load_fc:
|
605 |
+
ret = model.load_state_dict(state_dict, strict=False)
|
606 |
+
assert not ret.missing_keys, 'Missing keys when loading pretrained weights: {}'.format(ret.missing_keys)
|
607 |
+
else:
|
608 |
+
state_dict.pop('_fc.weight')
|
609 |
+
state_dict.pop('_fc.bias')
|
610 |
+
ret = model.load_state_dict(state_dict, strict=False)
|
611 |
+
assert set(ret.missing_keys) == set(
|
612 |
+
['_fc.weight', '_fc.bias']), 'Missing keys when loading pretrained weights: {}'.format(ret.missing_keys)
|
613 |
+
assert not ret.unexpected_keys, 'Unexpected keys when loading pretrained weights: {}'.format(ret.unexpected_keys)
|
614 |
+
|
615 |
+
if verbose:
|
616 |
+
print('Loaded pretrained weights for {}'.format(model_name))
|
deepafx_st/models/encoder.py
ADDED
@@ -0,0 +1,113 @@
1 |
+
import torch
|
2 |
+
|
3 |
+
from deepafx_st.models.mobilenetv2 import MobileNetV2
|
4 |
+
from deepafx_st.models.efficient_net import EfficientNet
|
5 |
+
|
6 |
+
|
7 |
+
class SpectralEncoder(torch.nn.Module):
|
8 |
+
def __init__(
|
9 |
+
self,
|
10 |
+
num_params,
|
11 |
+
sample_rate,
|
12 |
+
encoder_model="mobilenet_v2",
|
13 |
+
embed_dim=1028,
|
14 |
+
width_mult=1,
|
15 |
+
min_level_db=-80,
|
16 |
+
):
|
17 |
+
"""Encoder operating on spectrograms.
|
18 |
+
|
19 |
+
Args:
|
20 |
+
num_params (int): Number of processor parameters to generate.
|
21 |
+
sample_rate (float): Audio sample rate for computing melspectrogram.
|
22 |
+
encoder_model (str, optional): Encoder model architecture. Default: "mobilenet_v2"
|
23 |
+
embed_dim (int, optional): Dimensionality of the encoder representations.
|
24 |
+
width_mult (int, optional): Encoder size. Default: 1
|
25 |
+
min_level_db (float, optional): Minimal dB value for the spectrogram. Default: -80
|
26 |
+
"""
|
27 |
+
super().__init__()
|
28 |
+
self.num_params = num_params
|
29 |
+
self.sample_rate = sample_rate
|
30 |
+
self.encoder_model = encoder_model
|
31 |
+
self.embed_dim = embed_dim
|
32 |
+
self.width_mult = width_mult
|
33 |
+
self.min_level_db = min_level_db
|
34 |
+
|
35 |
+
# construct the chosen encoder backbone
|
36 |
+
if encoder_model == "mobilenet_v2":
|
37 |
+
self.encoder = MobileNetV2(embed_dim=embed_dim, width_mult=width_mult)
|
38 |
+
elif encoder_model == "efficient_net":
|
39 |
+
self.encoder = EfficientNet.from_name(
|
40 |
+
"efficientnet-b2",
|
41 |
+
in_channels=1,
|
42 |
+
image_size=(128, 65),
|
43 |
+
include_top=False,
|
44 |
+
)
|
45 |
+
self.embedding_projection = torch.nn.Conv2d(
|
46 |
+
in_channels=1408,
|
47 |
+
out_channels=embed_dim,
|
48 |
+
kernel_size=(1, 1),
|
49 |
+
stride=(1, 1),
|
50 |
+
padding=(0, 0),
|
51 |
+
bias=True,
|
52 |
+
)
|
53 |
+
|
54 |
+
else:
|
55 |
+
raise ValueError(f"Invalid encoder_model: {encoder_model}.")
|
56 |
+
|
57 |
+
self.window = torch.nn.Parameter(torch.hann_window(4096))
|
58 |
+
|
59 |
+
def forward(self, x):
|
60 |
+
"""
|
61 |
+
Args:
|
62 |
+
x (Tensor): Input waveform of shape [batch x channels x samples]
|
63 |
+
|
64 |
+
Returns:
|
65 |
+
e (Tensor): Latent embedding produced by Encoder. [batch x embed_dim]
|
66 |
+
"""
|
67 |
+
bs, chs, samp = x.size()
|
68 |
+
|
69 |
+
# compute spectrogram of waveform
|
70 |
+
X = torch.stft(
|
71 |
+
x.view(bs, -1),
|
72 |
+
4096,
|
73 |
+
2048,
|
74 |
+
window=self.window,
|
75 |
+
return_complex=True,
|
76 |
+
)
|
77 |
+
X_db = torch.pow(X.abs() + 1e-8, 0.3)
|
78 |
+
X_db_norm = X_db
|
79 |
+
|
80 |
+
# standardize with precomputed dataset statistics (mean 0.322970, std 0.278452)
|
81 |
+
X_db_norm -= 0.322970
|
82 |
+
X_db_norm /= 0.278452
|
83 |
+
X_db_norm = X_db_norm.unsqueeze(1).permute(0, 1, 3, 2)
|
84 |
+
|
85 |
+
if self.encoder_model == "mobilenet_v2":
|
86 |
+
# repeat channels by 3 to fit vision model
|
87 |
+
X_db_norm = X_db_norm.repeat(1, 3, 1, 1)
|
88 |
+
|
89 |
+
# pass the compressed magnitude spectrogram through the encoder
|
90 |
+
e = self.encoder(X_db_norm)
|
91 |
+
|
92 |
+
# apply avg pooling across time for encoder embeddings
|
93 |
+
e = torch.nn.functional.adaptive_avg_pool2d(e, 1).reshape(e.shape[0], -1)
|
94 |
+
|
95 |
+
# normalize by L2 norm
|
96 |
+
norm = torch.norm(e, p=2, dim=-1, keepdim=True)
|
97 |
+
e_norm = e / norm
|
98 |
+
|
99 |
+
elif self.encoder_model == "efficient_net":
|
100 |
+
|
101 |
+
# EfficientNet internally downsamples by 32 on the time and frequency axes, then average-pools the rest
|
102 |
+
e = self.encoder(X_db_norm)
|
103 |
+
|
104 |
+
# Adding 1x1 conv to project down or up to the requested embedding size
|
105 |
+
e = self.embedding_projection(e)
|
106 |
+
e = torch.squeeze(e, dim=3)
|
107 |
+
e = torch.squeeze(e, dim=2)
|
108 |
+
|
109 |
+
# normalize by L2 norm
|
110 |
+
norm = torch.norm(e, p=2, dim=-1, keepdim=True)
|
111 |
+
e_norm = e / norm
|
112 |
+
|
113 |
+
return e_norm
|
deepafx_st/models/mobilenetv2.py
ADDED
@@ -0,0 +1,226 @@
1 |
+
# BSD 3-Clause License
|
2 |
+
|
3 |
+
# Copyright (c) Soumith Chintala 2016,
|
4 |
+
# All rights reserved.
|
5 |
+
|
6 |
+
# Redistribution and use in source and binary forms, with or without
|
7 |
+
# modification, are permitted provided that the following conditions are met:
|
8 |
+
|
9 |
+
# * Redistributions of source code must retain the above copyright notice, this
|
10 |
+
# list of conditions and the following disclaimer.
|
11 |
+
|
12 |
+
# * Redistributions in binary form must reproduce the above copyright notice,
|
13 |
+
# this list of conditions and the following disclaimer in the documentation
|
14 |
+
# and/or other materials provided with the distribution.
|
15 |
+
|
16 |
+
# * Neither the name of the copyright holder nor the names of its
|
17 |
+
# contributors may be used to endorse or promote products derived from
|
18 |
+
# this software without specific prior written permission.
|
19 |
+
|
20 |
+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
21 |
+
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
22 |
+
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
23 |
+
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
24 |
+
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
25 |
+
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
26 |
+
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
27 |
+
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
28 |
+
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
29 |
+
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
30 |
+
|
31 |
+
# Adaptation of the PyTorch torchvision MobileNetV2 without a classifier.
|
32 |
+
# See source here: https://pytorch.org/vision/0.8/_modules/torchvision/models/mobilenet.html#mobilenet_v2
|
33 |
+
from torch import nn
|
34 |
+
|
35 |
+
|
36 |
+
def _make_divisible(v, divisor, min_value=None):
|
37 |
+
"""
|
38 |
+
This function is taken from the original tf repo.
|
39 |
+
It ensures that all layers have a channel number that is divisible by 8
|
40 |
+
It can be seen here:
|
41 |
+
https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
|
42 |
+
:param v:
|
43 |
+
:param divisor:
|
44 |
+
:param min_value:
|
45 |
+
:return:
|
46 |
+
"""
|
47 |
+
if min_value is None:
|
48 |
+
min_value = divisor
|
49 |
+
new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
|
50 |
+
# Make sure that round down does not go down by more than 10%.
|
51 |
+
if new_v < 0.9 * v:
|
52 |
+
new_v += divisor
|
53 |
+
return new_v
|
54 |
+
|
55 |
+
|
56 |
+
class ConvBNReLU(nn.Sequential):
|
57 |
+
def __init__(
|
58 |
+
self, in_planes, out_planes, kernel_size=3, stride=1, groups=1, norm_layer=None
|
59 |
+
):
|
60 |
+
padding = (kernel_size - 1) // 2
|
61 |
+
if norm_layer is None:
|
62 |
+
norm_layer = nn.BatchNorm2d
|
63 |
+
super(ConvBNReLU, self).__init__(
|
64 |
+
nn.Conv2d(
|
65 |
+
in_planes,
|
66 |
+
out_planes,
|
67 |
+
kernel_size,
|
68 |
+
stride,
|
69 |
+
padding,
|
70 |
+
groups=groups,
|
71 |
+
bias=False,
|
72 |
+
),
|
73 |
+
norm_layer(out_planes),
|
74 |
+
nn.ReLU6(inplace=True),
|
75 |
+
)
|
76 |
+
|
77 |
+
|
78 |
+
class InvertedResidual(nn.Module):
|
79 |
+
def __init__(self, inp, oup, stride, expand_ratio, norm_layer=None):
|
80 |
+
super(InvertedResidual, self).__init__()
|
81 |
+
self.stride = stride
|
82 |
+
assert stride in [1, 2]
|
83 |
+
|
84 |
+
if norm_layer is None:
|
85 |
+
norm_layer = nn.BatchNorm2d
|
86 |
+
|
87 |
+
hidden_dim = int(round(inp * expand_ratio))
|
88 |
+
self.use_res_connect = self.stride == 1 and inp == oup
|
89 |
+
|
90 |
+
layers = []
|
91 |
+
if expand_ratio != 1:
|
92 |
+
# pw
|
93 |
+
layers.append(
|
94 |
+
ConvBNReLU(inp, hidden_dim, kernel_size=1, norm_layer=norm_layer)
|
95 |
+
)
|
96 |
+
layers.extend(
|
97 |
+
[
|
98 |
+
# dw
|
99 |
+
ConvBNReLU(
|
100 |
+
hidden_dim,
|
101 |
+
hidden_dim,
|
102 |
+
stride=stride,
|
103 |
+
groups=hidden_dim,
|
104 |
+
norm_layer=norm_layer,
|
105 |
+
),
|
106 |
+
# pw-linear
|
107 |
+
nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
|
108 |
+
norm_layer(oup),
|
109 |
+
]
|
110 |
+
)
|
111 |
+
self.conv = nn.Sequential(*layers)
|
112 |
+
|
113 |
+
def forward(self, x):
|
114 |
+
if self.use_res_connect:
|
115 |
+
return x + self.conv(x)
|
116 |
+
else:
|
117 |
+
return self.conv(x)
|
118 |
+
|
119 |
+
|
120 |
+
class MobileNetV2(nn.Module):
|
121 |
+
def __init__(
|
122 |
+
self,
|
123 |
+
embed_dim=1028,
|
124 |
+
width_mult=1.0,
|
125 |
+
inverted_residual_setting=None,
|
126 |
+
round_nearest=8,
|
127 |
+
block=None,
|
128 |
+
norm_layer=None,
|
129 |
+
):
|
130 |
+
"""
|
131 |
+
MobileNet V2 main class
|
132 |
+
|
133 |
+
Args:
|
134 |
+
embed_dim (int): Number of channels in the final output.
|
135 |
+
width_mult (float): Width multiplier - adjusts number of channels in each layer by this amount
|
136 |
+
inverted_residual_setting: Network structure
|
137 |
+
round_nearest (int): Round the number of channels in each layer to be a multiple of this number
|
138 |
+
Set to 1 to turn off rounding
|
139 |
+
block: Module specifying inverted residual building block for mobilenet
|
140 |
+
norm_layer: Module specifying the normalization layer to use
|
141 |
+
|
142 |
+
"""
|
143 |
+
super(MobileNetV2, self).__init__()
|
144 |
+
|
145 |
+
if block is None:
|
146 |
+
block = InvertedResidual
|
147 |
+
|
148 |
+
if norm_layer is None:
|
149 |
+
norm_layer = nn.BatchNorm2d
|
150 |
+
|
151 |
+
input_channel = 32
|
152 |
+
last_channel = embed_dim / width_mult
|
153 |
+
|
154 |
+
if inverted_residual_setting is None:
|
155 |
+
inverted_residual_setting = [
|
156 |
+
# t, c, n, s
|
157 |
+
[1, 16, 1, 1],
|
158 |
+
[6, 24, 2, 2],
|
159 |
+
[6, 32, 3, 2],
|
160 |
+
[6, 64, 4, 2],
|
161 |
+
[6, 96, 3, 1],
|
162 |
+
[6, 160, 3, 2],
|
163 |
+
[6, 320, 1, 1],
|
164 |
+
]
|
165 |
+
|
166 |
+
# only check the first element, assuming user knows t,c,n,s are required
|
167 |
+
if (
|
168 |
+
len(inverted_residual_setting) == 0
|
169 |
+
or len(inverted_residual_setting[0]) != 4
|
170 |
+
):
|
171 |
+
raise ValueError(
|
172 |
+
"inverted_residual_setting should be non-empty "
|
173 |
+
"or a 4-element list, got {}".format(inverted_residual_setting)
|
174 |
+
)
|
175 |
+
|
176 |
+
# building first layer
|
177 |
+
input_channel = _make_divisible(input_channel * width_mult, round_nearest)
|
178 |
+
self.last_channel = _make_divisible(
|
179 |
+
last_channel * max(1.0, width_mult), round_nearest
|
180 |
+
)
|
181 |
+
features = [ConvBNReLU(3, input_channel, stride=2, norm_layer=norm_layer)]
|
182 |
+
# building inverted residual blocks
|
183 |
+
for t, c, n, s in inverted_residual_setting:
|
184 |
+
output_channel = _make_divisible(c * width_mult, round_nearest)
|
185 |
+
for i in range(n):
|
186 |
+
stride = s if i == 0 else 1
|
187 |
+
features.append(
|
188 |
+
block(
|
189 |
+
input_channel,
|
190 |
+
output_channel,
|
191 |
+
stride,
|
192 |
+
expand_ratio=t,
|
193 |
+
norm_layer=norm_layer,
|
194 |
+
)
|
195 |
+
)
|
196 |
+
input_channel = output_channel
|
197 |
+
# building last several layers
|
198 |
+
features.append(
|
199 |
+
ConvBNReLU(
|
200 |
+
input_channel, self.last_channel, kernel_size=1, norm_layer=norm_layer
|
201 |
+
)
|
202 |
+
)
|
203 |
+
# make it nn.Sequential
|
204 |
+
self.features = nn.Sequential(*features)
|
205 |
+
|
206 |
+
# weight initialization
|
207 |
+
for m in self.modules():
|
208 |
+
if isinstance(m, nn.Conv2d):
|
209 |
+
nn.init.kaiming_normal_(m.weight, mode="fan_out")
|
210 |
+
if m.bias is not None:
|
211 |
+
nn.init.zeros_(m.bias)
|
212 |
+
elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
|
213 |
+
nn.init.ones_(m.weight)
|
214 |
+
nn.init.zeros_(m.bias)
|
215 |
+
elif isinstance(m, nn.Linear):
|
216 |
+
nn.init.normal_(m.weight, 0, 0.01)
|
217 |
+
nn.init.zeros_(m.bias)
|
218 |
+
|
219 |
+
def _forward_impl(self, x):
|
220 |
+
# This exists since TorchScript doesn't support inheritance, so the superclass method
|
221 |
+
# (this one) needs to have a name other than `forward` that can be accessed in a subclass
|
222 |
+
return self.features(x)
|
223 |
+
# return the features directly, no classifier or pooling
|
224 |
+
|
225 |
+
def forward(self, x):
|
226 |
+
return self._forward_impl(x)
|
deepafx_st/probes/cdpam_encoder.py
ADDED
@@ -0,0 +1,68 @@
1 |
+
# MIT License
|
2 |
+
|
3 |
+
# Copyright (c) 2021 Pranay Manocha
|
4 |
+
|
5 |
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
6 |
+
# of this software and associated documentation files (the "Software"), to deal
|
7 |
+
# in the Software without restriction, including without limitation the rights
|
8 |
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9 |
+
# copies of the Software, and to permit persons to whom the Software is
|
10 |
+
# furnished to do so, subject to the following conditions:
|
11 |
+
|
12 |
+
# The above copyright notice and this permission notice shall be included in all
|
13 |
+
# copies or substantial portions of the Software.
|
14 |
+
|
15 |
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16 |
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17 |
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18 |
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19 |
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20 |
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21 |
+
# SOFTWARE.
|
22 |
+
|
23 |
+
# code adapted from https://github.com/pranaymanocha/PerceptualAudio
|
24 |
+
|
25 |
+
import cdpam
|
26 |
+
import torch
|
27 |
+
|
28 |
+
|
29 |
+
class CDPAMEncoder(torch.nn.Module):
|
30 |
+
def __init__(self, cdpam_ckpt: str):
|
31 |
+
super().__init__()
|
32 |
+
|
33 |
+
# pre-trained model parameters
|
34 |
+
encoder_layers = 16
|
35 |
+
encoder_filters = 64
|
36 |
+
input_size = 512
|
37 |
+
proj_ndim = [512, 256]
|
38 |
+
ndim = [16, 6]
|
39 |
+
classif_BN = 0
|
40 |
+
classif_act = "no"
|
41 |
+
proj_dp = 0.1
|
42 |
+
proj_BN = 1
|
43 |
+
classif_dp = 0.05
|
44 |
+
|
45 |
+
model = cdpam.models.FINnet(
|
46 |
+
encoder_layers=encoder_layers,
|
47 |
+
encoder_filters=encoder_filters,
|
48 |
+
ndim=ndim,
|
49 |
+
classif_dp=classif_dp,
|
50 |
+
classif_BN=classif_BN,
|
51 |
+
classif_act=classif_act,
|
52 |
+
input_size=input_size,
|
53 |
+
)
|
54 |
+
|
55 |
+
state = torch.load(cdpam_ckpt, map_location="cpu")["state"]
|
56 |
+
model.load_state_dict(state)
|
57 |
+
model.eval()
|
58 |
+
|
59 |
+
self.model = model
|
60 |
+
self.embed_dim = 512
|
61 |
+
|
62 |
+
def forward(self, x):
|
63 |
+
|
64 |
+
with torch.no_grad():
|
65 |
+
_, a1, c1 = self.model.base_encoder.forward(x)
|
66 |
+
a1 = torch.nn.functional.normalize(a1, dim=1)
|
67 |
+
|
68 |
+
return a1
|
deepafx_st/probes/probe_system.py
ADDED
@@ -0,0 +1,307 @@
1 |
+
import torch
|
2 |
+
import julius
|
3 |
+
import torchopenl3
|
4 |
+
import torchmetrics
|
5 |
+
import pytorch_lightning as pl
|
6 |
+
from typing import Tuple, List, Dict
|
7 |
+
from argparse import ArgumentParser
|
8 |
+
|
9 |
+
from deepafx_st.probes.cdpam_encoder import CDPAMEncoder
|
10 |
+
from deepafx_st.probes.random_mel import RandomMelProjection
|
11 |
+
|
12 |
+
import deepafx_st.utils as utils
|
13 |
+
from deepafx_st.utils import DSPMode
|
14 |
+
from deepafx_st.system import System
|
15 |
+
from deepafx_st.data.style import StyleDataset
|
16 |
+
|
17 |
+
|
18 |
+
class ProbeSystem(pl.LightningModule):
|
19 |
+
def __init__(
|
20 |
+
self,
|
21 |
+
audio_dir=None,
|
22 |
+
num_classes=5,
|
23 |
+
task="style",
|
24 |
+
encoder_type="deepafx_st_autodiff",
|
25 |
+
deepafx_st_autodiff_ckpt=None,
|
26 |
+
deepafx_st_spsa_ckpt=None,
|
27 |
+
deepafx_st_proxy0_ckpt=None,
|
28 |
+
probe_type="linear",
|
29 |
+
batch_size=32,
|
30 |
+
lr=3e-4,
|
31 |
+
lr_patience=20,
|
32 |
+
patience=10,
|
33 |
+
preload=False,
|
34 |
+
sample_rate=24000,
|
35 |
+
shuffle=True,
|
36 |
+
num_workers=16,
|
37 |
+
**kwargs,
|
38 |
+
):
|
39 |
+
super().__init__()
|
40 |
+
self.save_hyperparameters()
|
41 |
+
|
42 |
+
if "deepafx_st" in self.hparams.encoder_type:
|
43 |
+
|
44 |
+
if "autodiff" in self.hparams.encoder_type:
|
45 |
+
self.hparams.deepafx_st_ckpt = self.hparams.deepafx_st_autodiff_ckpt
|
46 |
+
elif "spsa" in self.hparams.encoder_type:
|
47 |
+
self.hparams.deepafx_st_ckpt = self.hparams.deepafx_st_spsa_ckpt
|
48 |
+
elif "proxy0" in self.hparams.encoder_type:
|
49 |
+
self.hparams.deepafx_st_ckpt = self.hparams.deepafx_st_proxy0_ckpt
|
50 |
+
|
51 |
+
else:
|
52 |
+
raise RuntimeError(f"Invalid encoder_type: {self.hparams.encoder_type}")
|
53 |
+
|
54 |
+
if self.hparams.deepafx_st_ckpt is None:
|
55 |
+
raise RuntimeError(
|
56 |
+
f"Must supply {self.hparams.encoder_type}_ckpt checkpoint."
|
57 |
+
)
|
58 |
+
use_dsp = DSPMode.NONE
|
59 |
+
system = System.load_from_checkpoint(
|
60 |
+
self.hparams.deepafx_st_ckpt,
|
61 |
+
use_dsp=use_dsp,
|
62 |
+
batch_size=self.hparams.batch_size,
|
63 |
+
spsa_parallel=False,
|
64 |
+
proxy_ckpts=[],
|
65 |
+
strict=False,
|
66 |
+
)
|
67 |
+
system.eval()
|
68 |
+
self.encoder = system.encoder
|
69 |
+
self.hparams.embed_dim = self.encoder.embed_dim
|
70 |
+
|
71 |
+
# freeze weights
|
72 |
+
for name, param in self.encoder.named_parameters():
|
73 |
+
param.requires_grad = False
|
74 |
+
|
75 |
+
elif self.hparams.encoder_type == "openl3":
|
76 |
+
self.encoder = torchopenl3.models.load_audio_embedding_model(
|
77 |
+
input_repr=self.hparams.openl3_input_repr,
|
78 |
+
embedding_size=self.hparams.openl3_embedding_size,
|
79 |
+
content_type=self.hparams.openl3_content_type,
|
80 |
+
)
|
81 |
+
self.hparams.embed_dim = 6144
|
82 |
+
elif self.hparams.encoder_type == "random_mel":
|
83 |
+
self.encoder = RandomMelProjection(
|
84 |
+
self.hparams.sample_rate,
|
85 |
+
self.hparams.random_mel_embedding_size,
|
86 |
+
self.hparams.random_mel_n_mels,
|
87 |
+
self.hparams.random_mel_n_fft,
|
88 |
+
self.hparams.random_mel_hop_size,
|
89 |
+
)
|
90 |
+
self.hparams.embed_dim = self.hparams.random_mel_embedding_size
|
91 |
+
elif self.hparams.encoder_type == "cdpam":
|
92 |
+
self.encoder = CDPAMEncoder(self.hparams.cdpam_ckpt)
|
93 |
+
self.encoder.eval()
|
94 |
+
self.hparams.embed_dim = self.encoder.embed_dim
|
95 |
+
else:
|
96 |
+
raise ValueError(f"Invalid encoder_type: {self.hparams.encoder_type}")
|
97 |
+
|
98 |
+
if self.hparams.probe_type == "linear":
|
99 |
+
if self.hparams.task == "style":
|
100 |
+
self.probe = torch.nn.Sequential(
|
101 |
+
torch.nn.Linear(self.hparams.embed_dim, self.hparams.num_classes),
|
102 |
+
# torch.nn.Softmax(-1),
|
103 |
+
)
|
104 |
+
elif self.hparams.probe_type == "mlp":
|
105 |
+
if self.hparams.task == "style":
|
106 |
+
self.probe = torch.nn.Sequential(
|
107 |
+
torch.nn.Linear(self.hparams.embed_dim, 512),
|
108 |
+
torch.nn.ReLU(),
|
109 |
+
torch.nn.Linear(512, 512),
|
110 |
+
torch.nn.ReLU(),
|
111 |
+
torch.nn.Linear(512, self.hparams.num_classes),
|
112 |
+
)
|
113 |
+
self.accuracy = torchmetrics.Accuracy()
|
114 |
+
        self.f1_score = torchmetrics.F1Score(self.hparams.num_classes)

    def forward(self, x):
        bs, chs, samp = x.size()
        with torch.no_grad():
            if "deepafx_st" in self.hparams.encoder_type:
                x /= x.abs().max()
                x *= 10 ** (-12.0 / 20)  # with min 12 dBFS headroom
                e = self.encoder(x)
                norm = torch.norm(e, p=2, dim=-1, keepdim=True)
                e = e / norm
            elif self.hparams.encoder_type == "openl3":
                # x = julius.resample_frac(x, self.hparams.sample_rate, 48000)
                e, ts = torchopenl3.get_audio_embedding(
                    x,
                    48000,
                    model=self.encoder,
                    input_repr="mel128",
                    content_type="music",
                )
                e = e.permute(0, 2, 1)
                e = e.mean(dim=-1)
                # normalize by L2 norm
                norm = torch.norm(e, p=2, dim=-1, keepdim=True)
                e = e / norm
            elif self.hparams.encoder_type == "random_mel":
                e = self.encoder(x)
                norm = torch.norm(e, p=2, dim=-1, keepdim=True)
                e = e / norm
            elif self.hparams.encoder_type == "cdpam":
                # x = julius.resample_frac(x, self.hparams.sample_rate, 22050)
                x = torch.round(x * 32768)
                e = self.encoder(x)

        return self.probe(e)

    def common_step(
        self,
        batch: Tuple,
        batch_idx: int,
        optimizer_idx: int = 0,
        train: bool = True,
    ):
        loss = 0
        x, y = batch

        y_hat = self(x)

        # compute CE
        if self.hparams.task == "style":
            loss = torch.nn.functional.cross_entropy(y_hat, y)

        if not train:
            # store audio data
            data_dict = {"x": x.float().cpu()}
        else:
            data_dict = {}

        self.log(
            "train_loss" if train else "val_loss",
            loss,
            on_step=True,
            on_epoch=True,
            prog_bar=False,
            logger=True,
            sync_dist=True,
        )

        if not train and self.hparams.task == "style":
            self.log("val_acc_step", self.accuracy(y_hat, y))
            self.log("val_f1_step", self.f1_score(y_hat, y))

        return loss, data_dict

    def training_step(self, batch, batch_idx, optimizer_idx=0):
        loss, _ = self.common_step(batch, batch_idx)
        return loss

    def validation_step(self, batch, batch_idx):
        loss, data_dict = self.common_step(batch, batch_idx, train=False)

        if batch_idx == 0:
            return data_dict

    def validation_epoch_end(self, outputs) -> None:
        if self.hparams.task == "style":
            self.log("val_acc_epoch", self.accuracy.compute())
            self.log("val_f1_epoch", self.f1_score.compute())

        return super().validation_epoch_end(outputs)

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(
            self.probe.parameters(),
            lr=self.hparams.lr,
            betas=(0.9, 0.999),
        )

        ms1 = int(self.hparams.max_epochs * 0.8)
        ms2 = int(self.hparams.max_epochs * 0.95)
        print(
            "Learning rate schedule:",
            f"0 {self.hparams.lr:0.2e} -> ",
            f"{ms1} {self.hparams.lr*0.1:0.2e} -> ",
            f"{ms2} {self.hparams.lr*0.01:0.2e}",
        )
        scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer,
            milestones=[ms1, ms2],
            gamma=0.1,
        )

        return [optimizer], {"scheduler": scheduler, "monitor": "val_loss"}

    def train_dataloader(self):

        if self.hparams.task == "style":
            train_dataset = StyleDataset(
                self.hparams.audio_dir,
                "train",
                sample_rate=self.hparams.encoder_sample_rate,
            )

        g = torch.Generator()
        g.manual_seed(0)

        return torch.utils.data.DataLoader(
            train_dataset,
            num_workers=self.hparams.num_workers,
            batch_size=self.hparams.batch_size,
            shuffle=True,
            worker_init_fn=utils.seed_worker,
            generator=g,
            pin_memory=True,
        )

    def val_dataloader(self):

        if self.hparams.task == "style":
            val_dataset = StyleDataset(
                self.hparams.audio_dir,
                subset="val",
                sample_rate=self.hparams.encoder_sample_rate,
            )

        g = torch.Generator()
        g.manual_seed(0)

        return torch.utils.data.DataLoader(
            val_dataset,
            num_workers=self.hparams.num_workers,
            batch_size=self.hparams.batch_size,
            worker_init_fn=utils.seed_worker,
            generator=g,
            pin_memory=True,
        )

    # add any model hyperparameters here
    @staticmethod
    def add_model_specific_args(parent_parser):
        parser = ArgumentParser(parents=[parent_parser], add_help=False)
        # --- Model ---
        parser.add_argument("--encoder_type", type=str, default="deeapfx2")
        parser.add_argument("--probe_type", type=str, default="linear")
        parser.add_argument("--task", type=str, default="style")
        parser.add_argument("--encoder_sample_rate", type=int, default=24000)
        # --- deeapfx2 ---
        parser.add_argument("--deepafx_st_autodiff_ckpt", type=str)
        parser.add_argument("--deepafx_st_spsa_ckpt", type=str)
        parser.add_argument("--deepafx_st_proxy0_ckpt", type=str)

        # --- cdpam ---
        parser.add_argument("--cdpam_ckpt", type=str)
        # --- openl3 ---
        parser.add_argument("--openl3_input_repr", type=str, default="mel128")
        parser.add_argument("--openl3_content_type", type=str, default="env")
        parser.add_argument("--openl3_embedding_size", type=int, default=6144)
        # --- random_mel ---
        parser.add_argument("--random_mel_embedding_size", type=str, default=4096)
        parser.add_argument("--random_mel_n_fft", type=str, default=4096)
        parser.add_argument("--random_mel_hop_size", type=str, default=1024)
        parser.add_argument("--random_mel_n_mels", type=str, default=128)
        # --- Training ---
        parser.add_argument("--audio_dir", type=str)
        parser.add_argument("--num_classes", type=int, default=5)
        parser.add_argument("--batch_size", type=int, default=32)
        parser.add_argument("--lr", type=float, default=3e-4)
        parser.add_argument("--lr_patience", type=int, default=20)
        parser.add_argument("--patience", type=int, default=10)
        parser.add_argument("--preload", action="store_true")
        parser.add_argument("--sample_rate", type=int, default=24000)
        parser.add_argument("--num_workers", type=int, default=8)

        return parser
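The argparse hook above is the only configuration surface shown for this probe. A minimal sketch of how it would typically be wired into a training script follows; the class name ProbeSystem, the flag values, and the audio path are assumptions for illustration, not part of the commit.

# hypothetical training-script sketch (assumes the LightningModule in this file is named ProbeSystem)
from argparse import ArgumentParser
from deepafx_st.probes.probe_system import ProbeSystem

parser = ArgumentParser()
parser = ProbeSystem.add_model_specific_args(parser)
args = parser.parse_args(["--task", "style", "--audio_dir", "path/to/style_dataset"])
system = ProbeSystem(**vars(args))  # hparams then appear as self.hparams.* in the methods above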
deepafx_st/probes/random_mel.py
ADDED
@@ -0,0 +1,93 @@
import math
import torch
import librosa

# based on https://github.com/neuralaudio/hear-baseline/blob/main/hearbaseline/naive.py


class RandomMelProjection(torch.nn.Module):
    def __init__(
        self,
        sample_rate,
        embed_dim=4096,
        n_mels=128,
        n_fft=4096,
        hop_size=1024,
        seed=0,
        epsilon=1e-4,
    ):
        super().__init__()
        self.sample_rate = sample_rate
        self.embed_dim = embed_dim
        self.n_mels = n_mels
        self.n_fft = n_fft
        self.hop_size = hop_size
        self.seed = seed
        self.epsilon = epsilon

        # Set random seed
        torch.random.manual_seed(self.seed)

        # Create a Hann window buffer to apply to frames prior to FFT.
        self.register_buffer("window", torch.hann_window(self.n_fft))

        # Create a mel filter buffer.
        mel_scale = torch.tensor(
            librosa.filters.mel(
                self.sample_rate,
                n_fft=self.n_fft,
                n_mels=self.n_mels,
            )
        )
        self.register_buffer("mel_scale", mel_scale)

        # Projection matrices.
        normalization = math.sqrt(self.n_mels)
        self.projection = torch.nn.Parameter(
            torch.rand(self.n_mels, self.embed_dim) / normalization,
            requires_grad=False,
        )

    def forward(self, x):
        bs, chs, samp = x.size()

        x = torch.stft(
            x.view(bs, -1),
            self.n_fft,
            self.hop_size,
            window=self.window,
            return_complex=True,
        )
        x = x.unsqueeze(1).permute(0, 1, 3, 2)

        # Apply the mel-scale filter to the power spectrum.
        x = torch.matmul(x.abs(), self.mel_scale.transpose(0, 1))

        # power scale
        x = torch.pow(x + self.epsilon, 0.3)

        # apply random projection
        e = x.matmul(self.projection)

        # take mean across temporal dim
        e = e.mean(dim=2).view(bs, -1)

        return e

    def compute_frame_embedding(self, x):
        # Compute the real-valued Fourier transform on windowed input signal.
        x = torch.fft.rfft(x * self.window)

        # Convert to a power spectrum.
        x = torch.abs(x) ** 2.0

        # Apply the mel-scale filter to the power spectrum.
        x = torch.matmul(x, self.mel_scale.transpose(0, 1))

        # Convert to a log mel spectrum.
        x = torch.log(x + self.epsilon)

        # Apply projection to get a 4096 dimension embedding
        embedding = x.matmul(self.projection)

        return embedding
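As a baseline encoder this module only projects power-scaled mel frames through a fixed random matrix and averages over time. A short sketch of producing an embedding; the shapes and sample rate are illustrative assumptions, not part of the commit.

# hypothetical usage sketch (not part of the commit)
import torch
from deepafx_st.probes.random_mel import RandomMelProjection

proj = RandomMelProjection(sample_rate=24000)   # defaults: 128 mels -> 4096-dim embedding
audio = torch.randn(8, 1, 24000)                # (batch, channels, samples), ~1 s at 24 kHz
with torch.no_grad():
    emb = proj(audio)                           # shape (8, 4096)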
deepafx_st/processors/autodiff/__init__.py
ADDED
File without changes
|
deepafx_st/processors/autodiff/channel.py
ADDED
@@ -0,0 +1,28 @@
import torch

from deepafx_st.processors.autodiff.compressor import Compressor
from deepafx_st.processors.autodiff.peq import ParametricEQ
from deepafx_st.processors.autodiff.fir import FIRFilter


class AutodiffChannel(torch.nn.Module):
    def __init__(self, sample_rate):
        super().__init__()

        self.peq = ParametricEQ(sample_rate)
        self.comp = Compressor(sample_rate)
        self.ports = [self.peq.ports, self.comp.ports]
        self.num_control_params = (
            self.peq.num_control_params + self.comp.num_control_params
        )

    def forward(self, x, p, sample_rate=24000, **kwargs):

        # split params between EQ and Comp.
        p_peq = p[:, : self.peq.num_control_params]
        p_comp = p[:, self.peq.num_control_params :]

        y = self.peq(x, p_peq, sample_rate)
        y = self.comp(y, p_comp, sample_rate)

        return y
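The channel simply concatenates the two parameter blocks, EQ first and compressor second, so a caller passes one normalized vector. A minimal sketch with random parameters; the shapes are illustrative assumptions, not part of the commit.

# hypothetical usage sketch (not part of the commit)
import torch
from deepafx_st.processors.autodiff.channel import AutodiffChannel

channel = AutodiffChannel(sample_rate=24000)
x = torch.randn(2, 1, 65536)                      # (batch, 1, samples)
p = torch.rand(2, channel.num_control_params)     # PEQ params then compressor params, all in [0, 1]
y = channel(x, p, sample_rate=24000)              # same shape as x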
deepafx_st/processors/autodiff/compressor.py
ADDED
@@ -0,0 +1,169 @@
import math
import torch
import scipy.signal

import deepafx_st.processors.autodiff.signal
from deepafx_st.processors.processor import Processor


@torch.jit.script
def compressor(
    x: torch.Tensor,
    sample_rate: float,
    threshold: torch.Tensor,
    ratio: torch.Tensor,
    attack_time: torch.Tensor,
    release_time: torch.Tensor,
    knee_dB: torch.Tensor,
    makeup_gain_dB: torch.Tensor,
    eps: float = 1e-8,
):
    """Note the `release` parameter is not used."""
    # print(f"autodiff comp fs = {sample_rate}")

    s = x.size()  # should be one 1d

    threshold = threshold.squeeze()
    ratio = ratio.squeeze()
    attack_time = attack_time.squeeze()
    makeup_gain_dB = makeup_gain_dB.squeeze()

    # uni-polar dB signal
    # Turn the input signal into a uni-polar signal on the dB scale
    x_G = 20 * torch.log10(torch.abs(x) + 1e-8)  # x_uni casts type

    # Ensure there are no values of negative infinity
    x_G = torch.clamp(x_G, min=-96)

    # Static characteristics with knee
    y_G = torch.zeros(s).type_as(x)

    ratio = ratio.view(-1)
    threshold = threshold.view(-1)
    attack_time = attack_time.view(-1)
    release_time = release_time.view(-1)
    knee_dB = knee_dB.view(-1)
    makeup_gain_dB = makeup_gain_dB.view(-1)

    # Below knee
    idx = torch.where((2 * (x_G - threshold)) < -knee_dB)[0]
    y_G[idx] = x_G[idx]

    # At knee
    idx = torch.where((2 * torch.abs(x_G - threshold)) <= knee_dB)[0]
    y_G[idx] = x_G[idx] + (
        (1 / ratio) * (((x_G[idx] - threshold + knee_dB) / 2) ** 2)
    ) / (2 * knee_dB)

    # Above knee threshold
    idx = torch.where((2 * (x_G - threshold)) > knee_dB)[0]
    y_G[idx] = threshold + ((x_G[idx] - threshold) / ratio)

    x_L = x_G - y_G

    # design 1-pole butterworth lowpass
    fc = 1.0 / (attack_time * sample_rate)
    b, a = deepafx_st.processors.autodiff.signal.butter(fc)

    # apply FIR approx of IIR filter
    y_L = deepafx_st.processors.autodiff.signal.approx_iir_filter(b, a, x_L)

    lin_y_L = torch.pow(10.0, -y_L / 20.0)  # convert back to linear
    y = lin_y_L * x  # apply gain

    # apply makeup gain
    y *= torch.pow(10.0, makeup_gain_dB / 20.0)

    return y


class Compressor(Processor):
    def __init__(
        self,
        sample_rate,
        max_threshold=0.0,
        min_threshold=-80,
        max_ratio=20.0,
        min_ratio=1.0,
        max_attack=0.1,
        min_attack=0.0001,
        max_release=1.0,
        min_release=0.005,
        max_knee=12.0,
        min_knee=0.0,
        max_mkgain=48.0,
        min_mkgain=-48.0,
        eps=1e-8,
    ):
        """ """
        super().__init__()
        self.sample_rate = sample_rate
        self.eps = eps
        self.ports = [
            {"name": "Threshold", "min": min_threshold, "max": max_threshold, "default": -12.0, "units": "dB"},
            {"name": "Ratio", "min": min_ratio, "max": max_ratio, "default": 2.0, "units": ""},
            {"name": "Attack", "min": min_attack, "max": max_attack, "default": 0.001, "units": "s"},
            # this is a dummy parameter
            {"name": "Release (dummy)", "min": min_release, "max": max_release, "default": 0.045, "units": "s"},
            {"name": "Knee", "min": min_knee, "max": max_knee, "default": 6.0, "units": "dB"},
            {"name": "Makeup Gain", "min": min_mkgain, "max": max_mkgain, "default": 0.0, "units": "dB"},
        ]

        self.num_control_params = len(self.ports)

    def forward(self, x, p, sample_rate=24000, **kwargs):
        """
        Assume that parameters in p are normalized between 0 and 1.

        x (tensor): Shape batch x 1 x samples
        p (tensor): shape batch x params
        """
        bs, ch, s = x.size()

        inputs = torch.split(x, 1, 0)
        params = torch.split(p, 1, 0)

        y = []  # loop over batch dimension
        for input, param in zip(inputs, params):
            denorm_param = self.denormalize_params(param.view(-1))
            y.append(compressor(input.view(-1), sample_rate, *denorm_param))

        return torch.stack(y, dim=0).view(bs, 1, -1)
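The static curve in compressor() is the standard soft-knee gain computer: below the knee the level passes through, inside the knee it blends quadratically, and above it the level is pulled toward the threshold by the ratio. A quick numeric check of the above-knee branch under assumed settings:

# numeric check of the above-knee branch (illustrative, not part of the commit)
threshold, ratio = -12.0, 4.0
x_G = -3.0                                    # input level in dB, well above threshold
y_G = threshold + (x_G - threshold) / ratio   # -12 + 9/4 = -9.75 dB
gain_reduction_dB = x_G - y_G                 # 6.75 dB of gain reduction before makeup gain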
deepafx_st/processors/autodiff/fir.py
ADDED
@@ -0,0 +1,68 @@
import torch


class FIRFilter(torch.nn.Module):
    def __init__(self, num_control_params=63):
        super().__init__()
        self.num_control_params = num_control_params
        self.adaptor = torch.nn.Linear(num_control_params, num_control_params)
        # self.batched_lfilter = torch.vmap(self.lfilter)

    def forward(self, x, b, **kwargs):
        """Forward pass by applying an FIR filter to each batch element.

        Args:
            x (tensor): Input signals with shape (batch x 1 x samples)
            b (tensor): Matrix of FIR filter coefficients with shape (batch x ntaps)
        """
        bs, ch, s = x.size()
        b = self.adaptor(b)

        # pad input
        x = torch.nn.functional.pad(x, (b.shape[-1] // 2, b.shape[-1] // 2))

        # add extra dim for virtual batch dim
        x = x.view(bs, 1, ch, -1)
        b = b.view(bs, 1, 1, -1)

        # excluding vmap for now
        y = self.batched_lfilter(x, b).view(bs, ch, s)

        return y

    @staticmethod
    def lfilter(x, b):
        return torch.nn.functional.conv1d(x, b)


class FrequencyDomainFIRFilter(torch.nn.Module):
    def __init__(self, num_control_params=31):
        super().__init__()
        self.num_control_params = num_control_params
        self.adaptor = torch.nn.Linear(num_control_params, num_control_params)

    def forward(self, x, b, **kwargs):
        """Forward pass by applying an FIR filter to each batch element.

        Args:
            x (tensor): Input signals with shape (batch x 1 x samples)
            b (tensor): Matrix of FIR filter coefficients with shape (batch x ntaps)
        """
        bs, c, s = x.size()

        b = self.adaptor(b)

        # transform input to freq. domain
        X = torch.fft.rfft(x.view(bs, -1))

        # frequency response of filter
        H = torch.fft.rfft(b.view(bs, -1))

        # apply filter as multiplication in freq. domain
        Y = X * H

        # transform back to time domain
        y = torch.fft.ifft(Y).view(bs, 1, -1)

        return y
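One observation on FrequencyDomainFIRFilter: the element-wise product X * H assumes both rFFTs have the same length, so in practice the taps need to be transformed at the signal's length. A sketch of the padding that makes that hold, stated as an assumption rather than something this commit implements:

# assumed padding so X and H share an FFT length (illustrative, not part of the commit)
n = x.shape[-1]
X = torch.fft.rfft(x.view(bs, -1), n)
H = torch.fft.rfft(b.view(bs, -1), n)          # zero-pads the ntaps coefficients up to n
y = torch.fft.irfft(X * H, n).view(bs, 1, -1)  # real output, same length as the input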
deepafx_st/processors/autodiff/peq.py
ADDED
@@ -0,0 +1,274 @@
import torch

import deepafx_st.processors.autodiff.signal
from deepafx_st.processors.processor import Processor


@torch.jit.script
def parametric_eq(
    x: torch.Tensor,
    sample_rate: float,
    low_shelf_gain_dB: torch.Tensor, low_shelf_cutoff_freq: torch.Tensor, low_shelf_q_factor: torch.Tensor,
    first_band_gain_dB: torch.Tensor, first_band_cutoff_freq: torch.Tensor, first_band_q_factor: torch.Tensor,
    second_band_gain_dB: torch.Tensor, second_band_cutoff_freq: torch.Tensor, second_band_q_factor: torch.Tensor,
    third_band_gain_dB: torch.Tensor, third_band_cutoff_freq: torch.Tensor, third_band_q_factor: torch.Tensor,
    fourth_band_gain_dB: torch.Tensor, fourth_band_cutoff_freq: torch.Tensor, fourth_band_q_factor: torch.Tensor,
    high_shelf_gain_dB: torch.Tensor, high_shelf_cutoff_freq: torch.Tensor, high_shelf_q_factor: torch.Tensor,
):
    """Six-band parametric EQ.

    Low-shelf -> Band 1 -> Band 2 -> Band 3 -> Band 4 -> High-shelf

    Args:
        x (torch.Tensor): 1d signal.
    """
    a_s, b_s = [], []
    # print(f"autodiff peq fs = {sample_rate}")

    # -------- apply low-shelf filter --------
    b, a = deepafx_st.processors.autodiff.signal.biqaud(
        low_shelf_gain_dB, low_shelf_cutoff_freq, low_shelf_q_factor, sample_rate, "low_shelf"
    )
    b_s.append(b)
    a_s.append(a)

    # -------- apply first-band peaking filter --------
    b, a = deepafx_st.processors.autodiff.signal.biqaud(
        first_band_gain_dB, first_band_cutoff_freq, first_band_q_factor, sample_rate, "peaking"
    )
    b_s.append(b)
    a_s.append(a)

    # -------- apply second-band peaking filter --------
    b, a = deepafx_st.processors.autodiff.signal.biqaud(
        second_band_gain_dB, second_band_cutoff_freq, second_band_q_factor, sample_rate, "peaking"
    )
    b_s.append(b)
    a_s.append(a)

    # -------- apply third-band peaking filter --------
    b, a = deepafx_st.processors.autodiff.signal.biqaud(
        third_band_gain_dB, third_band_cutoff_freq, third_band_q_factor, sample_rate, "peaking"
    )
    b_s.append(b)
    a_s.append(a)

    # -------- apply fourth-band peaking filter --------
    b, a = deepafx_st.processors.autodiff.signal.biqaud(
        fourth_band_gain_dB, fourth_band_cutoff_freq, fourth_band_q_factor, sample_rate, "peaking"
    )
    b_s.append(b)
    a_s.append(a)

    # -------- apply high-shelf filter --------
    b, a = deepafx_st.processors.autodiff.signal.biqaud(
        high_shelf_gain_dB, high_shelf_cutoff_freq, high_shelf_q_factor, sample_rate, "high_shelf"
    )
    b_s.append(b)
    a_s.append(a)

    x = deepafx_st.processors.autodiff.signal.approx_iir_filter_cascade(
        b_s, a_s, x.view(-1)
    )

    return x


class ParametricEQ(Processor):
    def __init__(
        self,
        sample_rate,
        min_gain_dB=-24.0,
        default_gain_dB=0.0,
        max_gain_dB=24.0,
        min_q_factor=0.1,
        default_q_factor=0.707,
        max_q_factor=10,
        eps=1e-8,
    ):
        """ """
        super().__init__()
        self.sample_rate = sample_rate
        self.eps = eps
        self.ports = [
            {"name": "Lowshelf gain", "min": min_gain_dB, "max": max_gain_dB, "default": default_gain_dB, "units": "dB"},
            {"name": "Lowshelf cutoff", "min": 20.0, "max": 200.0, "default": 100.0, "units": "Hz"},
            {"name": "Lowshelf Q", "min": min_q_factor, "max": max_q_factor, "default": default_q_factor, "units": ""},
            {"name": "First band gain", "min": min_gain_dB, "max": max_gain_dB, "default": default_gain_dB, "units": "dB"},
            {"name": "First band cutoff", "min": 200.0, "max": 2000.0, "default": 400.0, "units": "Hz"},
            {"name": "First band Q", "min": min_q_factor, "max": max_q_factor, "default": 0.707, "units": ""},
            {"name": "Second band gain", "min": min_gain_dB, "max": max_gain_dB, "default": default_gain_dB, "units": "dB"},
            {"name": "Second band cutoff", "min": 200.0, "max": 4000.0, "default": 1000.0, "units": "Hz"},
            {"name": "Second band Q", "min": min_q_factor, "max": max_q_factor, "default": default_q_factor, "units": ""},
            {"name": "Third band gain", "min": min_gain_dB, "max": max_gain_dB, "default": default_gain_dB, "units": "dB"},
            {"name": "Third band cutoff", "min": 2000.0, "max": 8000.0, "default": 4000.0, "units": "Hz"},
            {"name": "Third band Q", "min": min_q_factor, "max": max_q_factor, "default": default_q_factor, "units": ""},
            {"name": "Fourth band gain", "min": min_gain_dB, "max": max_gain_dB, "default": default_gain_dB, "units": "dB"},
            {"name": "Fourth band cutoff", "min": 4000.0, "max": (24000 // 2) * 0.9, "default": 8000.0, "units": "Hz"},
            {"name": "Fourth band Q", "min": min_q_factor, "max": max_q_factor, "default": default_q_factor, "units": ""},
            {"name": "Highshelf gain", "min": min_gain_dB, "max": max_gain_dB, "default": default_gain_dB, "units": "dB"},
            {"name": "Highshelf cutoff", "min": 4000.0, "max": (24000 // 2) * 0.9, "default": 8000.0, "units": "Hz"},
            {"name": "Highshelf Q", "min": min_q_factor, "max": max_q_factor, "default": default_q_factor, "units": ""},
        ]

        self.num_control_params = len(self.ports)

    def forward(self, x, p, sample_rate=24000, **kwargs):

        bs, chs, s = x.size()

        inputs = torch.split(x, 1, 0)
        params = torch.split(p, 1, 0)

        y = []  # loop over batch dimension
        for input, param in zip(inputs, params):
            denorm_param = self.denormalize_params(param.view(-1))
            y.append(parametric_eq(input.view(-1), sample_rate, *denorm_param))

        return torch.stack(y, dim=0).view(bs, 1, -1)
deepafx_st/processors/autodiff/signal.py
ADDED
@@ -0,0 +1,194 @@
import math
import torch
from typing import List


def butter(fc, fs: float = 2.0):
    """
    Recall Butterworth polynomials
    N = 1   s + 1
    N = 2   s^2 + sqrt(2s) + 1
    N = 3   (s^2 + s + 1)(s + 1)
    N = 4   (s^2 + 0.76536s + 1)(s^2 + 1.84776s + 1)

    Scaling
    LP to LP: s -> s/w_c
    LP to HP: s -> w_c/s

    Bilinear transform:
    s = 2/T_d * (1 - z^-1)/(1 + z^-1)

    For 1-pole butterworth lowpass

    1 / (s + 1)                                     1-pole prototype
    1 / (s/w_c + 1)                                 LP to LP
    1 / (2/T_d * (1 - z^-1)/(1 + z^-1))/w_c + 1)    Bilinear transform
    """

    # apply pre-warping to the cutoff
    T_d = 1 / fs
    w_d = (2 * math.pi * fc) / fs
    # sys.exit()
    w_c = (2 / T_d) * torch.tan(w_d / 2)

    a0 = 2 + (T_d * w_c)
    a1 = (T_d * w_c) - 2
    b0 = T_d * w_c
    b1 = T_d * w_c

    b = torch.stack([b0, b1], dim=0).view(-1)
    a = torch.stack([a0, a1], dim=0).view(-1)

    # normalize
    b = b.type_as(fc) / a0
    a = a.type_as(fc) / a0

    return b, a


def biqaud(
    gain_dB: torch.Tensor,
    cutoff_freq: torch.Tensor,
    q_factor: torch.Tensor,
    sample_rate: float,
    filter_type: str = "peaking",
):

    # convert inputs to Tensors if needed
    # gain_dB = torch.tensor([gain_dB])
    # cutoff_freq = torch.tensor([cutoff_freq])
    # q_factor = torch.tensor([q_factor])

    A = 10 ** (gain_dB / 40.0)
    w0 = 2 * math.pi * (cutoff_freq / sample_rate)
    alpha = torch.sin(w0) / (2 * q_factor)
    cos_w0 = torch.cos(w0)
    sqrt_A = torch.sqrt(A)

    if filter_type == "high_shelf":
        b0 = A * ((A + 1) + (A - 1) * cos_w0 + 2 * sqrt_A * alpha)
        b1 = -2 * A * ((A - 1) + (A + 1) * cos_w0)
        b2 = A * ((A + 1) + (A - 1) * cos_w0 - 2 * sqrt_A * alpha)
        a0 = (A + 1) - (A - 1) * cos_w0 + 2 * sqrt_A * alpha
        a1 = 2 * ((A - 1) - (A + 1) * cos_w0)
        a2 = (A + 1) - (A - 1) * cos_w0 - 2 * sqrt_A * alpha
    elif filter_type == "low_shelf":
        b0 = A * ((A + 1) - (A - 1) * cos_w0 + 2 * sqrt_A * alpha)
        b1 = 2 * A * ((A - 1) - (A + 1) * cos_w0)
        b2 = A * ((A + 1) - (A - 1) * cos_w0 - 2 * sqrt_A * alpha)
        a0 = (A + 1) + (A - 1) * cos_w0 + 2 * sqrt_A * alpha
        a1 = -2 * ((A - 1) + (A + 1) * cos_w0)
        a2 = (A + 1) + (A - 1) * cos_w0 - 2 * sqrt_A * alpha
    elif filter_type == "peaking":
        b0 = 1 + alpha * A
        b1 = -2 * cos_w0
        b2 = 1 - alpha * A
        a0 = 1 + (alpha / A)
        a1 = -2 * cos_w0
        a2 = 1 - (alpha / A)
    else:
        raise ValueError(f"Invalid filter_type: {filter_type}.")

    b = torch.stack([b0, b1, b2], dim=0).view(-1)
    a = torch.stack([a0, a1, a2], dim=0).view(-1)

    # normalize
    b = b.type_as(gain_dB) / a0
    a = a.type_as(gain_dB) / a0

    return b, a


def freqz(b, a, n_fft: int = 512):

    B = torch.fft.rfft(b, n_fft)
    A = torch.fft.rfft(a, n_fft)

    H = B / A

    return H


def freq_domain_filter(x, H, n_fft):

    X = torch.fft.rfft(x, n_fft)

    # move H to same device as input x
    H = H.type_as(X)

    Y = X * H

    y = torch.fft.irfft(Y, n_fft)

    return y


def approx_iir_filter(b, a, x):
    """Approximate the application of an IIR filter.

    Args:
        b (Tensor): The numerator coefficients.
    """

    # round up to nearest power of 2 for FFT
    # n_fft = 2 ** math.ceil(math.log2(x.shape[-1] + x.shape[-1] - 1))

    n_fft = 2 ** torch.ceil(torch.log2(torch.tensor(x.shape[-1] + x.shape[-1] - 1)))
    n_fft = n_fft.int()

    # move coefficients to same device as x
    b = b.type_as(x).view(-1)
    a = a.type_as(x).view(-1)

    # compute complex response
    H = freqz(b, a, n_fft=n_fft).view(-1)

    # apply filter
    y = freq_domain_filter(x, H, n_fft)

    # crop
    y = y[: x.shape[-1]]

    return y


def approx_iir_filter_cascade(
    b_s: List[torch.Tensor],
    a_s: List[torch.Tensor],
    x: torch.Tensor,
):
    """Apply a cascade of IIR filters.

    Args:
        b (list[Tensor]): List of tensors of shape (3)
        a (list[Tensor]): List of tensors of (3)
        x (torch.Tensor): 1d Tensor.
    """

    if len(b_s) != len(a_s):
        raise RuntimeError(
            f"Must have same number of coefficients. Got b: {len(b_s)} and a: {len(a_s)}."
        )

    # round up to nearest power of 2 for FFT
    # n_fft = 2 ** math.ceil(math.log2(x.shape[-1] + x.shape[-1] - 1))
    n_fft = 2 ** torch.ceil(torch.log2(torch.tensor(x.shape[-1] + x.shape[-1] - 1)))
    n_fft = n_fft.int()

    # this could be done in parallel
    b = torch.stack(b_s, dim=0).type_as(x)
    a = torch.stack(a_s, dim=0).type_as(x)

    H = freqz(b, a, n_fft=n_fft)
    H = torch.prod(H, dim=0).view(-1)

    # apply filter
    y = freq_domain_filter(x, H, n_fft)

    # crop
    y = y[: x.shape[-1]]

    return y
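A short sketch of how these helpers compose: design a single peaking biquad with biqaud() and apply it through the FFT-domain approximation. The signal length, gain, frequency, and Q below are arbitrary illustrations, not values used by the commit.

# hypothetical usage sketch (not part of the commit)
import torch
from deepafx_st.processors.autodiff import signal

x = torch.randn(24000)   # ~1 s of audio at 24 kHz
b, a = signal.biqaud(
    torch.tensor(6.0), torch.tensor(1000.0), torch.tensor(0.707), 24000.0, "peaking"
)
y = signal.approx_iir_filter(b, a, x)   # frequency-sampled approximation of the IIR response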
deepafx_st/processors/dsp/compressor.py
ADDED
@@ -0,0 +1,177 @@
import sys
import torch
import numpy as np
import scipy.signal
from numba import jit

from deepafx_st.processors.processor import Processor


# Adapted from: https://github.com/drscotthawley/signaltrain/blob/master/signaltrain/audio.py
@jit(nopython=True)
def my_clip_min(
    x: np.ndarray,
    clip_min: float,
):  # does the work of np.clip(), which numba doesn't support yet
    # TODO: keep an eye on Numba PR https://github.com/numba/numba/pull/3468 that fixes this
    inds = np.where(x < clip_min)
    x[inds] = clip_min
    return x


@jit(nopython=True)
def compressor(
    x: np.ndarray,
    sample_rate: float,
    threshold: float = -24.0,
    ratio: float = 2.0,
    attack_time: float = 0.01,
    release_time: float = 0.01,
    knee_dB: float = 0.0,
    makeup_gain_dB: float = 0.0,
    dtype=np.float32,
):
    """
    Args:
        x (np.ndarray): Input signal.
        sample_rate (float): Sample rate in Hz.
        threshold (float): Threshold in dB.
        ratio (float): Ratio (should be >=1 , i.e. ratio:1).
        attack_time (float): Attack time in seconds.
        release_time (float): Release time in seconds.
        knee_dB (float): Knee.
        makeup_gain_dB (float): Makeup Gain.
        dtype (type): Output type. Default: np.float32

    Returns:
        y (np.ndarray): Output signal.
    """
    # print(f"dsp comp fs = {sample_rate}")

    N = len(x)
    dtype = x.dtype
    y = np.zeros(N, dtype=dtype)

    # Initialize separate attack and release times
    # Where do these numbers come from
    alpha_A = np.exp(-np.log(9) / (sample_rate * attack_time))
    alpha_R = np.exp(-np.log(9) / (sample_rate * release_time))

    # Turn the input signal into a uni-polar signal on the dB scale
    x_G = 20 * np.log10(np.abs(x) + 1e-8)  # x_uni casts type

    # Ensure there are no values of negative infinity
    x_G = my_clip_min(x_G, -96)

    # Static characteristics with knee
    y_G = np.zeros(N, dtype=dtype)

    # Below knee
    idx = np.where((2 * (x_G - threshold)) < -knee_dB)
    y_G[idx] = x_G[idx]

    # At knee
    idx = np.where((2 * np.abs(x_G - threshold)) <= knee_dB)
    y_G[idx] = x_G[idx] + (
        (1 / ratio) * (((x_G[idx] - threshold + knee_dB) / 2) ** 2)
    ) / (2 * knee_dB)

    # Above knee threshold
    idx = np.where((2 * (x_G - threshold)) > knee_dB)
    y_G[idx] = threshold + ((x_G[idx] - threshold) / ratio)

    x_L = x_G - y_G

    # this loop is slow but not vectorizable due to its cumulative, sequential nature. @autojit makes it fast(er).
    y_L = np.zeros(N, dtype=dtype)
    for n in range(1, N):
        # smooth over the gainChange
        if x_L[n] > y_L[n - 1]:  # attack mode
            y_L[n] = (alpha_A * y_L[n - 1]) + ((1 - alpha_A) * x_L[n])
        else:  # release
            y_L[n] = (alpha_R * y_L[n - 1]) + ((1 - alpha_R) * x_L[n])

    # Convert to linear amplitude scalar; i.e. map from dB to amplitude
    lin_y_L = np.power(10.0, (-y_L / 20.0))
    y = lin_y_L * x  # Apply linear amplitude to input sample

    y *= np.power(10.0, makeup_gain_dB / 20.0)  # apply makeup gain

    return y.astype(dtype)


class Compressor(Processor):
    def __init__(
        self,
        sample_rate,
        max_threshold=0.0,
        min_threshold=-80,
        max_ratio=20.0,
        min_ratio=1.0,
        max_attack=0.1,
        min_attack=0.0001,
        max_release=1.0,
        min_release=0.005,
        max_knee=12.0,
        min_knee=0.0,
        max_mkgain=48.0,
        min_mkgain=-48.0,
        eps=1e-8,
    ):
        """ """
        super().__init__()
        self.sample_rate = sample_rate
        self.eps = eps
        self.ports = [
            {"name": "Threshold", "min": min_threshold, "max": max_threshold, "default": -12.0, "units": ""},
            {"name": "Ratio", "min": min_ratio, "max": max_ratio, "default": 2.0, "units": ""},
            {"name": "Attack Time", "min": min_attack, "max": max_attack, "default": 0.001, "units": "s"},
            {"name": "Release Time", "min": min_release, "max": max_release, "default": 0.045, "units": "s"},
            {"name": "Knee", "min": min_knee, "max": max_knee, "default": 6.0, "units": "dB"},
            {"name": "Makeup Gain", "min": min_mkgain, "max": max_mkgain, "default": 0.0, "units": "dB"},
        ]

        self.num_control_params = len(self.ports)
        self.process_fn = compressor

    def forward(self, x, p, sample_rate=24000, **kwargs):
        "All processing in the forward is in numpy."
        return self.run_series(x, p, sample_rate)
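A minimal numpy-side sketch of calling the jitted compressor directly; the input and all settings are illustrative, not values used by the commit.

# hypothetical usage sketch (not part of the commit)
import numpy as np
from deepafx_st.processors.dsp.compressor import compressor

x = 0.1 * np.random.randn(24000).astype(np.float32)   # ~1 s of low-level noise at 24 kHz
y = compressor(x, 24000.0, threshold=-24.0, ratio=4.0, attack_time=0.005,
               release_time=0.05, knee_dB=6.0, makeup_gain_dB=3.0)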
deepafx_st/processors/dsp/peq.py
ADDED
@@ -0,0 +1,323 @@
import torch
import numpy as np
import scipy.signal
from numba import jit

from deepafx_st.processors.processor import Processor


@jit(nopython=True)
def biqaud(
    gain_dB: float,
    cutoff_freq: float,
    q_factor: float,
    sample_rate: float,
    filter_type: str,
):
    """Use design parameters to generate coefficients for a specific filter type.

    Args:
        gain_dB (float): Shelving filter gain in dB.
        cutoff_freq (float): Cutoff frequency in Hz.
        q_factor (float): Q factor.
        sample_rate (float): Sample rate in Hz.
        filter_type (str): Filter type.
            One of "low_shelf", "high_shelf", or "peaking"

    Returns:
        b (np.ndarray): Numerator filter coefficients stored as [b0, b1, b2]
        a (np.ndarray): Denominator filter coefficients stored as [a0, a1, a2]
    """

    A = 10 ** (gain_dB / 40.0)
    w0 = 2.0 * np.pi * (cutoff_freq / sample_rate)
    alpha = np.sin(w0) / (2.0 * q_factor)

    cos_w0 = np.cos(w0)
    sqrt_A = np.sqrt(A)

    if filter_type == "high_shelf":
        b0 = A * ((A + 1) + (A - 1) * cos_w0 + 2 * sqrt_A * alpha)
        b1 = -2 * A * ((A - 1) + (A + 1) * cos_w0)
        b2 = A * ((A + 1) + (A - 1) * cos_w0 - 2 * sqrt_A * alpha)
        a0 = (A + 1) - (A - 1) * cos_w0 + 2 * sqrt_A * alpha
        a1 = 2 * ((A - 1) - (A + 1) * cos_w0)
        a2 = (A + 1) - (A - 1) * cos_w0 - 2 * sqrt_A * alpha
    elif filter_type == "low_shelf":
        b0 = A * ((A + 1) - (A - 1) * cos_w0 + 2 * sqrt_A * alpha)
        b1 = 2 * A * ((A - 1) - (A + 1) * cos_w0)
        b2 = A * ((A + 1) - (A - 1) * cos_w0 - 2 * sqrt_A * alpha)
        a0 = (A + 1) + (A - 1) * cos_w0 + 2 * sqrt_A * alpha
        a1 = -2 * ((A - 1) + (A + 1) * cos_w0)
        a2 = (A + 1) + (A - 1) * cos_w0 - 2 * sqrt_A * alpha
    elif filter_type == "peaking":
        b0 = 1 + alpha * A
        b1 = -2 * cos_w0
        b2 = 1 - alpha * A
        a0 = 1 + alpha / A
        a1 = -2 * cos_w0
        a2 = 1 - alpha / A
    else:
        pass
        # raise ValueError(f"Invalid filter_type: {filter_type}.")

    b = np.array([b0, b1, b2]) / a0
    a = np.array([a0, a1, a2]) / a0

    return b, a


# Adapted from https://github.com/csteinmetz1/pyloudnorm/blob/master/pyloudnorm/iirfilter.py
def parametric_eq(
    x: np.ndarray,
    sample_rate: float,
    low_shelf_gain_dB: float = 0.0, low_shelf_cutoff_freq: float = 80.0, low_shelf_q_factor: float = 0.707,
    first_band_gain_dB: float = 0.0, first_band_cutoff_freq: float = 300.0, first_band_q_factor: float = 0.707,
    second_band_gain_dB: float = 0.0, second_band_cutoff_freq: float = 1000.0, second_band_q_factor: float = 0.707,
    third_band_gain_dB: float = 0.0, third_band_cutoff_freq: float = 4000.0, third_band_q_factor: float = 0.707,
    fourth_band_gain_dB: float = 0.0, fourth_band_cutoff_freq: float = 8000.0, fourth_band_q_factor: float = 0.707,
    high_shelf_gain_dB: float = 0.0, high_shelf_cutoff_freq: float = 1000.0, high_shelf_q_factor: float = 0.707,
    dtype=np.float32,
):
    """Six-band parametric EQ.

    Low-shelf -> Band 1 -> Band 2 -> Band 3 -> Band 4 -> High-shelf

    Args:

    """
    # print(f"autodiff peq fs = {sample_rate}")

    # -------- apply low-shelf filter --------
    b, a = biqaud(low_shelf_gain_dB, low_shelf_cutoff_freq, low_shelf_q_factor, sample_rate, "low_shelf")
    sos0 = np.concatenate((b, a))
    x = scipy.signal.lfilter(b, a, x)

    # -------- apply first-band peaking filter --------
    b, a = biqaud(first_band_gain_dB, first_band_cutoff_freq, first_band_q_factor, sample_rate, "peaking")
    sos1 = np.concatenate((b, a))
    x = scipy.signal.lfilter(b, a, x)

    # -------- apply second-band peaking filter --------
    b, a = biqaud(second_band_gain_dB, second_band_cutoff_freq, second_band_q_factor, sample_rate, "peaking")
    sos2 = np.concatenate((b, a))
    x = scipy.signal.lfilter(b, a, x)

    # -------- apply third-band peaking filter --------
    b, a = biqaud(third_band_gain_dB, third_band_cutoff_freq, third_band_q_factor, sample_rate, "peaking")
    sos3 = np.concatenate((b, a))
    x = scipy.signal.lfilter(b, a, x)

    # -------- apply fourth-band peaking filter --------
    b, a = biqaud(fourth_band_gain_dB, fourth_band_cutoff_freq, fourth_band_q_factor, sample_rate, "peaking")
    sos4 = np.concatenate((b, a))
    x = scipy.signal.lfilter(b, a, x)

    # -------- apply high-shelf filter --------
    b, a = biqaud(high_shelf_gain_dB, high_shelf_cutoff_freq, high_shelf_q_factor, sample_rate, "high_shelf")
    sos5 = np.concatenate((b, a))
    x = scipy.signal.lfilter(b, a, x)

    return x.astype(dtype)


class ParametricEQ(Processor):
    def __init__(
        self,
        sample_rate,
        min_gain_dB=-24.0,
        default_gain_dB=0.0,
        max_gain_dB=24.0,
        min_q_factor=0.1,
        default_q_factor=0.707,
        max_q_factor=10,
        eps=1e-8,
    ):
        """ """
        super().__init__()
        self.sample_rate = sample_rate
        self.eps = eps
        self.ports = [
            {"name": "Lowshelf gain", "min": min_gain_dB, "max": max_gain_dB, "default": default_gain_dB, "units": "dB"},
            {"name": "Lowshelf cutoff", "min": 20.0, "max": 200.0, "default": 100.0, "units": "Hz"},
            {"name": "Lowshelf Q", "min": min_q_factor, "max": max_q_factor, "default": default_q_factor, "units": ""},
            {"name": "First band gain", "min": min_gain_dB, "max": max_gain_dB, "default": default_gain_dB, "units": "dB"},
            {"name": "First band cutoff", "min": 200.0, "max": 2000.0, "default": 400.0, "units": "Hz"},
            {"name": "First band Q", "min": min_q_factor, "max": max_q_factor, "default": 0.707, "units": ""},
            {"name": "Second band gain", "min": min_gain_dB, "max": max_gain_dB, "default": default_gain_dB, "units": "dB"},
            {"name": "Second band cutoff", "min": 800.0, "max": 4000.0, "default": 1000.0, "units": "Hz"},
            {"name": "Second band Q", "min": min_q_factor, "max": max_q_factor, "default": default_q_factor, "units": ""},
            {"name": "Third band gain", "min": min_gain_dB, "max": max_gain_dB, "default": default_gain_dB, "units": "dB"},
            {"name": "Third band cutoff", "min": 2000.0, "max": 8000.0, "default": 4000.0, "units": "Hz"},
            {"name": "Third band Q", "min": min_q_factor, "max": max_q_factor, "default": default_q_factor, "units": ""},
            {"name": "Fourth band gain", "min": min_gain_dB, "max": max_gain_dB, "default": default_gain_dB, "units": "dB"},
            {"name": "Fourth band cutoff", "min": 4000.0, "max": (24000 // 2) * 0.9, "default": 8000.0, "units": "Hz"},
            {"name": "Fourth band Q", "min": min_q_factor, "max": max_q_factor, "default": default_q_factor, "units": ""},
            {"name": "Highshelf gain", "min": min_gain_dB, "max": max_gain_dB, "default": default_gain_dB, "units": "dB"},
            {"name": "Highshelf cutoff", "min": 4000.0, "max": (24000 // 2) * 0.9, "default": 8000.0, "units": "Hz"},
            {"name": "Highshelf Q", "min": min_q_factor, "max": max_q_factor, "default": default_q_factor, "units": ""},
        ]

        self.num_control_params = len(self.ports)
        self.process_fn = parametric_eq

    def forward(self, x, p, sample_rate=24000, **kwargs):
        "All processing in the forward is in numpy."
        return self.run_series(x, p, sample_rate)
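And the matching numpy-side sketch for the EQ, gently boosting the lows and cutting the presence region; the values are illustrative only, not part of the commit.

# hypothetical usage sketch (not part of the commit)
import numpy as np
from deepafx_st.processors.dsp.peq import parametric_eq

x = np.random.randn(24000).astype(np.float32)
y = parametric_eq(x, 24000.0, low_shelf_gain_dB=3.0, third_band_gain_dB=-4.0, high_shelf_gain_dB=2.0)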
deepafx_st/processors/processor.py
ADDED
@@ -0,0 +1,87 @@
import torch
import multiprocessing
from abc import ABC, abstractmethod
import deepafx_st.utils as utils
import numpy as np


class Processor(torch.nn.Module, ABC):
    """Processor base class."""

    def __init__(
        self,
    ):
        super().__init__()

    def denormalize_params(self, p):
        """This method takes a tensor of parameters scaled from 0-1 and
        restores them back to the original parameter range."""

        # check if the number of parameters is correct
        params = p  # torch.split(p, 1, -1)
        if len(params) != self.num_control_params:
            raise RuntimeError(
                f"Invalid number of parameters. ",
                f"Expected {self.num_control_params} but found {len(params)} {params.shape}.",
            )

        # iterate over the parameters and expand from 0-1 to full range
        denorm_params = []
        for param, port in zip(params, self.ports):
            # check if parameter exceeds range
            if param > 1.0 or param < 0.0:
                raise RuntimeError(
                    f"""Parameter '{port["name"]}' exceeds range: {param}"""
                )

            # denormalize and store result
            denorm_params.append(utils.denormalize(param, port["max"], port["min"]))

        return denorm_params

    def normalize_params(self, *params):
        """This method creates a vector of parameters normalized from 0-1."""

        # check if the number of parameters is correct
        if len(params) != self.num_control_params:
            raise RuntimeError(
                f"Invalid number of parameters. ",
                f"Expected {self.num_control_params} but found {len(params)}.",
            )

        norm_params = []
        for param, port in zip(params, self.ports):
            norm_params.append(utils.normalize(param, port["max"], port["min"]))

        p = torch.tensor(norm_params).view(1, -1)

        return p

    # def run_series(self, inputs, params):
    #     """Run the process function in a loop given a list of inputs and parameters"""
    #     p_b_denorm = [p for p in self.denormalize_params(params)]
    #     y = self.process_fn(inputs, self.sample_rate, *p_b_denorm)
    #     return y

    def run_series(self, inputs, params, sample_rate=24000):
        """Run the process function in a loop given a list of inputs and parameters"""
        if params.ndim == 1:
            params = np.reshape(params, (1, -1))
            inputs = np.reshape(inputs, (1, -1))
        bs = inputs.shape[0]
        ys = []
        params = np.clip(params, 0, 1)
        for bidx in range(bs):
            p_b_denorm = [p for p in self.denormalize_params(params[bidx, :])]
            y = self.process_fn(
                inputs[bidx, ...].reshape(-1),
                sample_rate,
                *p_b_denorm,
            )
            ys.append(y)
        y = np.stack(ys, axis=0)
        return y

    @abstractmethod
    def forward(self, x, p):
        pass
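Since every concrete processor shares these helpers, a quick round trip shows the contract: port defaults are mapped into [0, 1] by normalize_params and recovered by denormalize_params. The ParametricEQ below is just a convenient concrete subclass for the sketch; none of this is part of the commit.

# hypothetical round-trip sketch (not part of the commit)
from deepafx_st.processors.dsp.peq import ParametricEQ

peq = ParametricEQ(24000)
p = peq.normalize_params(*[port["default"] for port in peq.ports])  # shape (1, num_control_params), all in [0, 1]
denorm = peq.denormalize_params(p.view(-1))                         # back to dB / Hz / Q values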
deepafx_st/processors/proxy/channel.py
ADDED
@@ -0,0 +1,130 @@
import torch

from deepafx_st.processors.proxy.proxy_system import ProxySystem
from deepafx_st.utils import DSPMode


class ProxyChannel(torch.nn.Module):
    def __init__(
        self,
        proxy_system_ckpts: list,
        freeze_proxies: bool = True,
        dsp_mode: DSPMode = DSPMode.NONE,
        num_tcns: int = 2,
        tcn_nblocks: int = 4,
        tcn_dilation_growth: int = 8,
        tcn_channel_width: int = 64,
        tcn_kernel_size: int = 13,
        sample_rate: int = 24000,
    ):
        super().__init__()
        self.freeze_proxies = freeze_proxies
        self.dsp_mode = dsp_mode
        self.num_tcns = num_tcns

        # load the proxies
        self.proxies = torch.nn.ModuleList()
        self.num_control_params = 0
        self.ports = []
        for proxy_system_ckpt in proxy_system_ckpts:
            proxy = ProxySystem.load_from_checkpoint(proxy_system_ckpt)
            # freeze model parameters
            if freeze_proxies:
                for param in proxy.parameters():
                    param.requires_grad = False
            self.proxies.append(proxy)
            if proxy.hparams.processor == "channel":
                self.ports = proxy.processor.ports
            else:
                self.ports.append(proxy.processor.ports)
            self.num_control_params += proxy.processor.num_control_params

        if len(proxy_system_ckpts) == 0:
            if self.num_tcns == 2:
                peq_proxy = ProxySystem(
                    processor="peq",
                    output_gain=False,
                    nblocks=tcn_nblocks,
                    dilation_growth=tcn_dilation_growth,
                    kernel_size=tcn_kernel_size,
                    channel_width=tcn_channel_width,
                    sample_rate=sample_rate,
                )
                self.proxies.append(peq_proxy)
                self.ports.append(peq_proxy.processor.ports)
                self.num_control_params += peq_proxy.processor.num_control_params
                comp_proxy = ProxySystem(
                    processor="comp",
                    output_gain=True,
                    nblocks=tcn_nblocks,
                    dilation_growth=tcn_dilation_growth,
                    kernel_size=tcn_kernel_size,
                    channel_width=tcn_channel_width,
                    sample_rate=sample_rate,
                )
                self.proxies.append(comp_proxy)
                self.ports.append(comp_proxy.processor.ports)
                self.num_control_params += comp_proxy.processor.num_control_params
            elif self.num_tcns == 1:
                channel_proxy = ProxySystem(
                    processor="channel",
                    output_gain=True,
                    nblocks=tcn_nblocks,
                    dilation_growth=tcn_dilation_growth,
                    kernel_size=tcn_kernel_size,
                    channel_width=tcn_channel_width,
                    sample_rate=sample_rate,
                )
                self.proxies.append(channel_proxy)
                for port_list in channel_proxy.processor.ports:
                    self.ports.append(port_list)
                self.num_control_params += channel_proxy.processor.num_control_params
            else:
                raise ValueError(f"num_tcns must be <= 2. Asked for {self.num_tcns}.")

    def forward(
        self,
        x: torch.Tensor,
        p: torch.Tensor,
        dsp_mode: DSPMode = DSPMode.NONE,
        sample_rate: int = 24000,
        **kwargs,
    ):
        # loop over the proxies and pass parameters
        stop_idx = 0
        for proxy in self.proxies:
            start_idx = stop_idx
            stop_idx += proxy.processor.num_control_params
            p_subset = p[:, start_idx:stop_idx]
            if dsp_mode.name == DSPMode.NONE.name:
                x = proxy(
                    x,
                    p_subset,
                    use_dsp=False,
                )
            elif dsp_mode.name == DSPMode.INFER.name:
                x = proxy(
                    x,
                    p_subset,
                    use_dsp=True,
                    sample_rate=sample_rate,
                )
            elif dsp_mode.name == DSPMode.TRAIN_INFER.name:
                # Mimic gumbel softmax implementation to replace grads similar to
                # https://gist.github.com/yzh119/fd2146d2aeb329d067568a493b20172f
                x_hard = proxy(
                    x,
                    p_subset,
                    use_dsp=True,
                    sample_rate=sample_rate,
                )
                x = proxy(
                    x,
                    p_subset,
                    use_dsp=False,
                    sample_rate=sample_rate,
                )
                x = (x_hard - x).detach() + x
            else:
                assert 0, "invalid dsp model for proxy"

        return x

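The DSPMode.TRAIN_INFER branch above is a straight-through estimator: the true DSP output is used in the forward pass while gradients flow through the differentiable proxy. A minimal standalone sketch of the same trick, using toy tensors that are not part of this repo:

import torch

# forward with the non-differentiable path, backprop through the differentiable one
x = torch.randn(4, requires_grad=True)
y_soft = torch.tanh(x)   # differentiable surrogate (stand-in for the proxy network)
y_hard = torch.sign(x)   # hard/non-differentiable path (stand-in for the true DSP)
y = (y_hard - y_soft).detach() + y_soft  # value of y_hard, gradient of y_soft
y.sum().backward()
print(x.grad)            # equals d(tanh)/dx, as if y were y_soft
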
deepafx_st/processors/proxy/proxy_system.py
ADDED
@@ -0,0 +1,289 @@
from re import X
import torch
import auraloss
import pytorch_lightning as pl
from typing import Tuple, List, Dict
from argparse import ArgumentParser


import deepafx_st.utils as utils
from deepafx_st.data.proxy import DSPProxyDataset
from deepafx_st.processors.proxy.tcn import ConditionalTCN
from deepafx_st.processors.spsa.channel import SPSAChannel
from deepafx_st.processors.dsp.peq import ParametricEQ
from deepafx_st.processors.dsp.compressor import Compressor


class ProxySystem(pl.LightningModule):
    def __init__(
        self,
        causal=True,
        nblocks=4,
        dilation_growth=8,
        kernel_size=13,
        channel_width=64,
        input_dir=None,
        processor="channel",
        batch_size=32,
        lr=3e-4,
        lr_patience=20,
        patience=10,
        preload=False,
        sample_rate=24000,
        shuffle=True,
        train_length=65536,
        train_examples_per_epoch=10000,
        val_length=131072,
        val_examples_per_epoch=1000,
        num_workers=16,
        output_gain=False,
        **kwargs,
    ):
        super().__init__()
        self.save_hyperparameters()
        # print(f"Proxy Processor: {processor} @ fs={sample_rate} Hz")

        # construct both the true DSP...
        if self.hparams.processor == "peq":
            self.processor = ParametricEQ(self.hparams.sample_rate)
        elif self.hparams.processor == "comp":
            self.processor = Compressor(self.hparams.sample_rate)
        elif self.hparams.processor == "channel":
            self.processor = SPSAChannel(self.hparams.sample_rate)

        # and the neural network proxy
        self.proxy = ConditionalTCN(
            self.hparams.sample_rate,
            num_control_params=self.processor.num_control_params,
            causal=self.hparams.causal,
            nblocks=self.hparams.nblocks,
            channel_width=self.hparams.channel_width,
            kernel_size=self.hparams.kernel_size,
            dilation_growth=self.hparams.dilation_growth,
        )

        self.receptive_field = self.proxy.compute_receptive_field()

        self.recon_losses = {}
        self.recon_loss_weights = {}

        self.recon_losses["mrstft"] = auraloss.freq.MultiResolutionSTFTLoss(
            fft_sizes=[32, 128, 512, 2048, 8192, 32768],
            hop_sizes=[16, 64, 256, 1024, 4096, 16384],
            win_lengths=[32, 128, 512, 2048, 8192, 32768],
            w_sc=0.0,
            w_phs=0.0,
            w_lin_mag=1.0,
            w_log_mag=1.0,
        )
        self.recon_loss_weights["mrstft"] = 1.0

        self.recon_losses["l1"] = torch.nn.L1Loss()
        self.recon_loss_weights["l1"] = 100.0

    def forward(self, x, p, use_dsp=False, sample_rate=24000, **kwargs):
        """Use the pre-trained neural network proxy effect."""
        bs, chs, samp = x.size()
        if not use_dsp:
            y = self.proxy(x, p)
            # manually apply the makeup gain parameter
            if self.hparams.output_gain and not self.hparams.processor == "peq":
                gain_db = (p[..., -1] * 96) - 48
                gain_ln = 10 ** (gain_db / 20.0)
                y *= gain_ln.view(bs, chs, 1)
        else:
            with torch.no_grad():
                bs, chs, s = x.shape

                if self.hparams.output_gain and not self.hparams.processor == "peq":
                    # override makeup gain
                    gain_db = (p[..., -1] * 96) - 48
                    gain_ln = 10 ** (gain_db / 20.0)
                    p[..., -1] = 0.5

                if self.hparams.processor == "channel":
                    y_temp = self.processor(x.cpu(), p.cpu())
                    y_temp = y_temp.view(bs, chs, s).type_as(x)
                else:
                    y_temp = self.processor(
                        x.cpu().numpy(),
                        p.cpu().numpy(),
                        sample_rate,
                    )
                    y_temp = torch.tensor(y_temp).view(bs, chs, s).type_as(x)

                y = y_temp.type_as(x).view(bs, 1, -1)

                if self.hparams.output_gain and not self.hparams.processor == "peq":
                    y *= gain_ln.view(bs, chs, 1)

        return y

    def common_step(
        self,
        batch: Tuple,
        batch_idx: int,
        optimizer_idx: int = 0,
        train: bool = True,
    ):
        loss = 0
        x, y, p = batch

        y_hat = self(x, p)

        # compute loss
        for loss_idx, (loss_name, loss_fn) in enumerate(self.recon_losses.items()):
            tmp_loss = loss_fn(y_hat.float(), y.float())
            loss += self.recon_loss_weights[loss_name] * tmp_loss

            self.log(
                f"train_loss/{loss_name}" if train else f"val_loss/{loss_name}",
                tmp_loss,
                on_step=True,
                on_epoch=True,
                prog_bar=False,
                logger=True,
                sync_dist=True,
            )

        if not train:
            # store audio data
            data_dict = {
                "x": x.float().cpu(),
                "y": y.float().cpu(),
                "p": p.float().cpu(),
                "y_hat": y_hat.float().cpu(),
            }
        else:
            data_dict = {}

        self.log(
            "train_loss" if train else "val_loss",
            loss,
            on_step=True,
            on_epoch=True,
            prog_bar=False,
            logger=True,
            sync_dist=True,
        )

        return loss, data_dict

    def training_step(self, batch, batch_idx, optimizer_idx=0):
        loss, _ = self.common_step(batch, batch_idx)
        return loss

    def validation_step(self, batch, batch_idx):
        loss, data_dict = self.common_step(batch, batch_idx, train=False)

        if batch_idx == 0:
            return data_dict

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(
            self.proxy.parameters(),
            lr=self.hparams.lr,
            betas=(0.9, 0.999),
        )

        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer,
            patience=self.hparams.lr_patience,
            verbose=True,
        )

        return [optimizer], {"scheduler": scheduler, "monitor": "val_loss"}

    def train_dataloader(self):

        train_dataset = DSPProxyDataset(
            self.hparams.input_dir,
            self.processor,
            self.hparams.processor,  # name
            subset="train",
            length=self.hparams.train_length,
            num_examples_per_epoch=self.hparams.train_examples_per_epoch,
            half=True if self.hparams.precision == 16 else False,
            buffer_size_gb=self.hparams.buffer_size_gb,
            buffer_reload_rate=self.hparams.buffer_reload_rate,
        )

        g = torch.Generator()
        g.manual_seed(0)

        return torch.utils.data.DataLoader(
            train_dataset,
            num_workers=self.hparams.num_workers,
            batch_size=self.hparams.batch_size,
            worker_init_fn=utils.seed_worker,
            generator=g,
            pin_memory=True,
        )

    def val_dataloader(self):

        val_dataset = DSPProxyDataset(
            self.hparams.input_dir,
            self.processor,
            self.hparams.processor,  # name
            subset="val",
            length=self.hparams.val_length,
            num_examples_per_epoch=self.hparams.val_examples_per_epoch,
            half=True if self.hparams.precision == 16 else False,
            buffer_size_gb=self.hparams.buffer_size_gb,
            buffer_reload_rate=self.hparams.buffer_reload_rate,
        )

        g = torch.Generator()
        g.manual_seed(0)

        return torch.utils.data.DataLoader(
            val_dataset,
            num_workers=self.hparams.num_workers,
            batch_size=self.hparams.batch_size,
            worker_init_fn=utils.seed_worker,
            generator=g,
            pin_memory=True,
        )

    @staticmethod
    def count_control_params(plugin_config):
        num_control_params = 0

        for plugin in plugin_config["plugins"]:
            for port in plugin["ports"]:
                if port["optim"]:
                    num_control_params += 1

        return num_control_params

    # add any model hyperparameters here
    @staticmethod
    def add_model_specific_args(parent_parser):
        parser = ArgumentParser(parents=[parent_parser], add_help=False)
        # --- Model ---
        parser.add_argument("--causal", action="store_true")
        parser.add_argument("--output_gain", action="store_true")
        parser.add_argument("--dilation_growth", type=int, default=8)
        parser.add_argument("--nblocks", type=int, default=4)
        parser.add_argument("--kernel_size", type=int, default=13)
        parser.add_argument("--channel_width", type=int, default=13)
        # --- Training ---
        parser.add_argument("--input_dir", type=str)
        parser.add_argument("--processor", type=str)
        parser.add_argument("--batch_size", type=int, default=32)
        parser.add_argument("--lr", type=float, default=3e-4)
        parser.add_argument("--lr_patience", type=int, default=20)
        parser.add_argument("--patience", type=int, default=10)
        parser.add_argument("--preload", action="store_true")
        parser.add_argument("--sample_rate", type=int, default=24000)
        parser.add_argument("--shuffle", type=bool, default=True)
        parser.add_argument("--train_length", type=int, default=65536)
        parser.add_argument("--train_examples_per_epoch", type=int, default=10000)
        parser.add_argument("--val_length", type=int, default=131072)
        parser.add_argument("--val_examples_per_epoch", type=int, default=1000)
        parser.add_argument("--num_workers", type=int, default=8)
        parser.add_argument("--buffer_reload_rate", type=int, default=1000)
        parser.add_argument("--buffer_size_gb", type=float, default=1.0)

        return parser

deepafx_st/processors/proxy/tcn.py
ADDED
@@ -0,0 +1,199 @@
# Copyright 2022 Christian J. Steinmetz

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

#     http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# TCN implementation adapted from:
# https://github.com/csteinmetz1/micro-tcn/blob/main/microtcn/tcn.py

import torch
from argparse import ArgumentParser

from deepafx_st.utils import center_crop, causal_crop


class FiLM(torch.nn.Module):
    def __init__(self, num_features, cond_dim):
        super().__init__()
        self.num_features = num_features
        self.bn = torch.nn.BatchNorm1d(num_features, affine=False)
        self.adaptor = torch.nn.Linear(cond_dim, num_features * 2)

    def forward(self, x, cond):

        # project conditioning to 2 x num. conv channels
        cond = self.adaptor(cond)

        # split the projection into gain and bias
        g, b = torch.chunk(cond, 2, dim=-1)

        # add virtual channel dim if needed
        if g.ndim == 2:
            g = g.unsqueeze(1)
            b = b.unsqueeze(1)

        # reshape for application
        g = g.permute(0, 2, 1)
        b = b.permute(0, 2, 1)

        x = self.bn(x)  # apply BatchNorm without affine
        x = (x * g) + b  # then apply conditional affine

        return x


class ConditionalTCNBlock(torch.nn.Module):
    def __init__(
        self, in_ch, out_ch, cond_dim, kernel_size=3, dilation=1, causal=False, **kwargs
    ):
        super().__init__()

        self.in_ch = in_ch
        self.out_ch = out_ch
        self.kernel_size = kernel_size
        self.dilation = dilation
        self.causal = causal

        self.conv1 = torch.nn.Conv1d(
            in_ch,
            out_ch,
            kernel_size=kernel_size,
            padding=0,
            dilation=dilation,
            bias=True,
        )
        self.film = FiLM(out_ch, cond_dim)
        self.relu = torch.nn.PReLU(out_ch)
        self.res = torch.nn.Conv1d(
            in_ch, out_ch, kernel_size=1, groups=in_ch, bias=False
        )

    def forward(self, x, p):
        x_in = x

        x = self.conv1(x)
        x = self.film(x, p)  # apply FiLM conditioning
        x = self.relu(x)
        x_res = self.res(x_in)

        if self.causal:
            x = x + causal_crop(x_res, x.shape[-1])
        else:
            x = x + center_crop(x_res, x.shape[-1])

        return x


class ConditionalTCN(torch.nn.Module):
    """Temporal convolutional network with conditioning module.

    Args:
        sample_rate (float): Audio sample rate.
        num_control_params (int, optional): Dimensionality of the conditioning signal. Default: 24
        ninputs (int, optional): Number of input channels (mono = 1, stereo = 2). Default: 1
        noutputs (int, optional): Number of output channels (mono = 1, stereo = 2). Default: 1
        nblocks (int, optional): Number of total TCN blocks. Default: 10
        kernel_size (int, optional): Width of the convolutional kernels. Default: 3
        dilation_growth (int, optional): Compute the dilation factor at each block as dilation_growth ** (n % stack_size). Default: 1
        channel_growth (int, optional): Compute the output channels at each block as in_ch * channel_growth. Default: 2
        channel_width (int, optional): When channel_growth = 1 all blocks use convolutions with this many channels. Default: 64
        stack_size (int, optional): Number of blocks that constitute a single stack of blocks. Default: 10
        causal (bool, optional): Causal TCN configuration does not consider future input values. Default: False
    """

    def __init__(
        self,
        sample_rate,
        num_control_params=24,
        ninputs=1,
        noutputs=1,
        nblocks=10,
        kernel_size=15,
        dilation_growth=2,
        channel_growth=1,
        channel_width=64,
        stack_size=10,
        causal=False,
        skip_connections=False,
        **kwargs,
    ):
        super().__init__()
        self.num_control_params = num_control_params
        self.ninputs = ninputs
        self.noutputs = noutputs
        self.nblocks = nblocks
        self.kernel_size = kernel_size
        self.dilation_growth = dilation_growth
        self.channel_growth = channel_growth
        self.channel_width = channel_width
        self.stack_size = stack_size
        self.causal = causal
        self.skip_connections = skip_connections
        self.sample_rate = sample_rate

        self.blocks = torch.nn.ModuleList()
        for n in range(nblocks):
            in_ch = out_ch if n > 0 else ninputs

            if self.channel_growth > 1:
                out_ch = in_ch * self.channel_growth
            else:
                out_ch = self.channel_width

            dilation = self.dilation_growth ** (n % self.stack_size)

            self.blocks.append(
                ConditionalTCNBlock(
                    in_ch,
                    out_ch,
                    self.num_control_params,
                    kernel_size=self.kernel_size,
                    dilation=dilation,
                    padding="same" if self.causal else "valid",
                    causal=self.causal,
                )
            )

        self.output = torch.nn.Conv1d(out_ch, noutputs, kernel_size=1)
        self.receptive_field = self.compute_receptive_field()
        # print(
        #     f"TCN receptive field: {self.receptive_field} samples",
        #     f" or {(self.receptive_field/self.sample_rate)*1e3:0.3f} ms",
        # )

    def forward(self, x, p, **kwargs):

        # causally pad input signal
        x = torch.nn.functional.pad(x, (self.receptive_field - 1, 0))

        # iterate over blocks passing conditioning
        for idx, block in enumerate(self.blocks):
            x = block(x, p)
            if self.skip_connections:
                if idx == 0:
                    skips = x
                else:
                    skips = center_crop(skips, x[-1]) + x
            else:
                skips = 0

        # final 1x1 convolution to collapse channels
        out = self.output(x + skips)

        return out

    def compute_receptive_field(self):
        """Compute the receptive field in samples."""
        rf = self.kernel_size
        for n in range(1, self.nblocks):
            dilation = self.dilation_growth ** (n % self.stack_size)
            rf = rf + ((self.kernel_size - 1) * dilation)
        return rf

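compute_receptive_field() above accumulates (kernel_size - 1) * dilation per block on top of the first kernel. A quick standalone check of the same formula with the TCN settings used elsewhere in this commit (nblocks=4, dilation_growth=8, kernel_size=13); the helper below is illustrative only, not part of the repo:

# standalone check of the TCN receptive-field formula
def receptive_field(kernel_size=13, nblocks=4, dilation_growth=8, stack_size=10):
    rf = kernel_size
    for n in range(1, nblocks):
        rf += (kernel_size - 1) * dilation_growth ** (n % stack_size)
    return rf

print(receptive_field())  # 7021 samples, roughly 293 ms at 24 kHz
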
deepafx_st/processors/spsa/channel.py
ADDED
@@ -0,0 +1,179 @@
import torch
import numpy as np
import torch.multiprocessing as mp

from deepafx_st.processors.dsp.peq import ParametricEQ
from deepafx_st.processors.dsp.compressor import Compressor
from deepafx_st.processors.spsa.spsa_func import SPSAFunction
from deepafx_st.utils import rademacher


def dsp_func(x, p, dsp, sample_rate=24000):

    (peq, comp), meta = dsp

    p_peq = p[:meta]
    p_comp = p[meta:]

    y = peq(x, p_peq, sample_rate)
    y = comp(y, p_comp, sample_rate)

    return y


class SPSAChannel(torch.nn.Module):
    """

    Args:
        sample_rate (float): Sample rate of the plugin instance
        parallel (bool, optional): Use parallel workers for DSP.

    By default, this utilizes parallelized instances of the plugin channel,
    where the number of workers is equal to the batch size.
    """

    def __init__(
        self,
        sample_rate: int,
        parallel: bool = False,
        batch_size: int = 8,
    ):
        super().__init__()

        self.batch_size = batch_size
        self.parallel = parallel

        if self.parallel:
            self.apply_func = SPSAFunction.apply

            procs = {}
            for b in range(self.batch_size):

                peq = ParametricEQ(sample_rate)
                comp = Compressor(sample_rate)
                dsp = ((peq, comp), peq.num_control_params)

                parent_conn, child_conn = mp.Pipe()
                p = mp.Process(target=SPSAChannel.worker_pipe, args=(child_conn, dsp))
                p.start()
                procs[b] = [p, parent_conn, child_conn]
                # print(b, p)

            # Update stuff for external public members TODO: fix
            self.ports = [peq.ports, comp.ports]
            self.num_control_params = (
                comp.num_control_params + peq.num_control_params
            )

            self.procs = procs
            # print(self.procs)

        else:
            self.peq = ParametricEQ(sample_rate)
            self.comp = Compressor(sample_rate)
            self.apply_func = SPSAFunction.apply
            self.ports = [self.peq.ports, self.comp.ports]
            self.num_control_params = (
                self.comp.num_control_params + self.peq.num_control_params
            )
            self.dsp = ((self.peq, self.comp), self.peq.num_control_params)

        # add one param for wet/dry mix
        # self.num_control_params += 1

    def __del__(self):
        if hasattr(self, "procs"):
            for proc_idx, proc in self.procs.items():
                # print(f"Closing {proc_idx}...")
                proc[0].terminate()

    def forward(self, x, p, epsilon=0.001, sample_rate=24000, **kwargs):
        """
        Args:
            x (Tensor): Input signal with shape: [batch x channels x samples]
            p (Tensor): Audio effect control parameters with shape: [batch x parameters]
            epsilon (float, optional): Twiddle parameter range for SPSA gradient estimation.

        Returns:
            y (Tensor): Processed audio signal.

        """
        if self.parallel:
            y = self.apply_func(x, p, None, epsilon, self, sample_rate)

        else:
            # this will process on CPU in NumPy
            y = self.apply_func(x, p, None, epsilon, self, sample_rate)

        return y.type_as(x)

    @staticmethod
    def static_backward(dsp, value):

        (
            batch_index,
            x,
            params,
            needs_input_grad,
            needs_param_grad,
            grad_output,
            epsilon,
        ) = value

        grads_input = None
        grads_params = None
        ps = params.shape[-1]
        factors = [1.0]

        # estimate gradient w.r.t input
        if needs_input_grad:
            delta_k = rademacher(x.shape).numpy()
            J_plus = dsp_func(x + epsilon * delta_k, params, dsp)
            J_minus = dsp_func(x - epsilon * delta_k, params, dsp)
            grads_input = (J_plus - J_minus) / (2.0 * epsilon)

        # estimate gradient w.r.t params
        grads_params_runs = []
        if needs_param_grad:
            for factor in factors:
                params_sublist = []
                delta_k = rademacher(params.shape).numpy()

                # compute output in two random directions of the parameter space
                params_plus = np.clip(params + (factor * epsilon * delta_k), 0, 1)
                J_plus = dsp_func(x, params_plus, dsp)

                params_minus = np.clip(params - (factor * epsilon * delta_k), 0, 1)
                J_minus = dsp_func(x, params_minus, dsp)
                grad_param = J_plus - J_minus

                # compute gradient for each parameter as a function of epsilon and random direction
                for sub_p_idx in range(ps):
                    grad_p = grad_param / (2 * epsilon * delta_k[sub_p_idx])
                    params_sublist.append(np.sum(grad_output * grad_p))

                grads_params = np.array(params_sublist)
                grads_params_runs.append(grads_params)

            # average gradients
            grads_params = np.mean(grads_params_runs, axis=0)

        return grads_input, grads_params

    @staticmethod
    def static_forward(dsp, value):
        batch_index, x, p, sample_rate = value
        y = dsp_func(x, p, dsp, sample_rate)
        return y

    @staticmethod
    def worker_pipe(child_conn, dsp):

        while True:
            msg, value = child_conn.recv()
            if msg == "forward":
                child_conn.send(SPSAChannel.static_forward(dsp, value))
            elif msg == "backward":
                child_conn.send(SPSAChannel.static_backward(dsp, value))
            elif msg == "shutdown":
                break

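The parallel path above runs one DSP worker per batch item over a multiprocessing pipe with a small message protocol ("forward", "backward", "shutdown"). A toy version of that pipe pattern, with a hypothetical worker that only squares numbers (not part of the repo):

import torch.multiprocessing as mp

def worker(conn):
    # minimal request/response loop, mirroring worker_pipe above
    while True:
        msg, value = conn.recv()
        if msg == "square":
            conn.send(value * value)
        elif msg == "shutdown":
            break

if __name__ == "__main__":
    parent, child = mp.Pipe()
    p = mp.Process(target=worker, args=(child,))
    p.start()
    parent.send(("square", 3.0))
    print(parent.recv())  # 9.0
    parent.send(("shutdown", None))
    p.join()
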
deepafx_st/processors/spsa/eps_scheduler.py
ADDED
@@ -0,0 +1,32 @@
import torch


class EpsilonScheduler:
    def __init__(
        self,
        epsilon: float = 0.001,
        patience: int = 10,
        factor: float = 0.5,
        verbose: bool = False,
    ):
        self.epsilon = epsilon
        self.patience = patience
        self.factor = factor
        self.best = 1e16
        self.count = 0
        self.verbose = verbose

    def step(self, metric: float):

        if metric < self.best:
            self.best = metric
            self.count = 0
        else:
            self.count += 1
            if self.verbose:
                print(f"Train loss has not improved for {self.count} epochs.")
            if self.count >= self.patience:
                self.count = 0
                self.epsilon *= self.factor
                if self.verbose:
                    print(f"Reducing epsilon to {self.epsilon:0.2e}...")

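A small hypothetical usage of EpsilonScheduler, mirroring how a training loop might call step() once per epoch with the monitored loss (the loss values are made up):

scheduler = EpsilonScheduler(epsilon=0.001, patience=10, factor=0.5, verbose=True)
for epoch_loss in [1.0, 0.9, 0.9, 0.9]:  # toy loss trajectory
    scheduler.step(epoch_loss)
print(scheduler.epsilon)  # still 0.001; it halves only after `patience` epochs without improvement
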
deepafx_st/processors/spsa/spsa_func.py
ADDED
@@ -0,0 +1,131 @@
import torch


def spsa_func(input, params, process, i, sample_rate=24000):
    return process(input.cpu(), params.cpu(), i, sample_rate).type_as(input)


class SPSAFunction(torch.autograd.Function):
    @staticmethod
    def forward(
        ctx,
        input,
        params,
        process,
        epsilon,
        thread_context,
        sample_rate=24000,
    ):
        """Apply processor to a batch of tensors using given parameters.

        Args:
            input (Tensor): Audio with shape: batch x 2 x samples
            params (Tensor): Processor parameters with shape: batch x params
            process (function): Function that will apply processing.
            epsilon (float): Perturbation strength for SPSA computation.

        Returns:
            output (Tensor): Processed audio with same shape as input.
        """
        ctx.save_for_backward(input, params)
        ctx.epsilon = epsilon
        ctx.process = process
        ctx.thread_context = thread_context

        if thread_context.parallel:

            for i in range(input.shape[0]):
                msg = (
                    "forward",
                    (
                        i,
                        input[i].view(-1).detach().cpu().numpy(),
                        params[i].view(-1).detach().cpu().numpy(),
                        sample_rate,
                    ),
                )
                thread_context.procs[i][1].send(msg)

            z = torch.empty_like(input)
            for i in range(input.shape[0]):
                z[i] = torch.from_numpy(thread_context.procs[i][1].recv())
        else:
            z = torch.empty_like(input)
            for i in range(input.shape[0]):
                value = (
                    i,
                    input[i].view(-1).detach().cpu().numpy(),
                    params[i].view(-1).detach().cpu().numpy(),
                    sample_rate,
                )
                z[i] = torch.from_numpy(
                    thread_context.static_forward(thread_context.dsp, value)
                )

        return z

    @staticmethod
    def backward(ctx, grad_output):
        """Estimate gradients using SPSA."""

        input, params = ctx.saved_tensors
        epsilon = ctx.epsilon
        needs_input_grad = ctx.needs_input_grad[0]
        needs_param_grad = ctx.needs_input_grad[1]
        thread_context = ctx.thread_context

        grads_input = None
        grads_params = None

        # Receive grads
        if needs_input_grad:
            grads_input = torch.empty_like(input)
        if needs_param_grad:
            grads_params = torch.empty_like(params)

        if thread_context.parallel:

            for i in range(input.shape[0]):
                msg = (
                    "backward",
                    (
                        i,
                        input[i].view(-1).detach().cpu().numpy(),
                        params[i].view(-1).detach().cpu().numpy(),
                        needs_input_grad,
                        needs_param_grad,
                        grad_output[i].view(-1).detach().cpu().numpy(),
                        epsilon,
                    ),
                )
                thread_context.procs[i][1].send(msg)

            # Wait for output
            for i in range(input.shape[0]):
                temp1, temp2 = thread_context.procs[i][1].recv()

                if temp1 is not None:
                    grads_input[i] = torch.from_numpy(temp1)

                if temp2 is not None:
                    grads_params[i] = torch.from_numpy(temp2)

            return grads_input, grads_params, None, None, None, None
        else:
            for i in range(input.shape[0]):
                value = (
                    i,
                    input[i].view(-1).detach().cpu().numpy(),
                    params[i].view(-1).detach().cpu().numpy(),
                    needs_input_grad,
                    needs_param_grad,
                    grad_output[i].view(-1).detach().cpu().numpy(),
                    epsilon,
                )
                temp1, temp2 = thread_context.static_backward(thread_context.dsp, value)
                if temp1 is not None:
                    grads_input[i] = torch.from_numpy(temp1)

                if temp2 is not None:
                    grads_params[i] = torch.from_numpy(temp2)
            return grads_input, grads_params, None, None, None, None

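backward() above builds the SPSA estimate by perturbing all parameters at once with a random +/-1 (Rademacher) vector and taking a central difference through the black-box DSP. An illustrative standalone version with a made-up scalar loss, independent of this repo:

import numpy as np

def f(p):  # stand-in for the non-differentiable DSP/loss
    return np.sum((p - 0.3) ** 2)

p = np.array([0.1, 0.5, 0.9])
eps = 1e-3
delta = np.random.choice([-1.0, 1.0], size=p.shape)  # Rademacher perturbation
g_hat = (f(p + eps * delta) - f(p - eps * delta)) / (2 * eps * delta)
print(g_hat)  # stochastic estimate of the true gradient 2 * (p - 0.3)
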
deepafx_st/system.py
ADDED
@@ -0,0 +1,563 @@
1 |
+
import torch
|
2 |
+
import auraloss
|
3 |
+
import torchaudio
|
4 |
+
from itertools import chain
|
5 |
+
import pytorch_lightning as pl
|
6 |
+
from argparse import ArgumentParser
|
7 |
+
from typing import Tuple, List, Dict
|
8 |
+
|
9 |
+
import deepafx_st.utils as utils
|
10 |
+
from deepafx_st.utils import DSPMode
|
11 |
+
from deepafx_st.data.dataset import AudioDataset
|
12 |
+
from deepafx_st.models.encoder import SpectralEncoder
|
13 |
+
from deepafx_st.models.controller import StyleTransferController
|
14 |
+
from deepafx_st.processors.spsa.channel import SPSAChannel
|
15 |
+
from deepafx_st.processors.spsa.eps_scheduler import EpsilonScheduler
|
16 |
+
from deepafx_st.processors.proxy.channel import ProxyChannel
|
17 |
+
from deepafx_st.processors.autodiff.channel import AutodiffChannel
|
18 |
+
|
19 |
+
|
20 |
+
class System(pl.LightningModule):
|
21 |
+
def __init__(
|
22 |
+
self,
|
23 |
+
ext="wav",
|
24 |
+
dsp_sample_rate=24000,
|
25 |
+
**kwargs,
|
26 |
+
):
|
27 |
+
super().__init__()
|
28 |
+
self.save_hyperparameters()
|
29 |
+
|
30 |
+
self.eps_scheduler = EpsilonScheduler(
|
31 |
+
self.hparams.spsa_epsilon,
|
32 |
+
self.hparams.spsa_patience,
|
33 |
+
self.hparams.spsa_factor,
|
34 |
+
self.hparams.spsa_verbose,
|
35 |
+
)
|
36 |
+
|
37 |
+
self.hparams.dsp_mode = DSPMode.NONE
|
38 |
+
|
39 |
+
# first construct the processor, since this will dictate encoder
|
40 |
+
if self.hparams.processor_model == "spsa":
|
41 |
+
self.processor = SPSAChannel(
|
42 |
+
self.hparams.dsp_sample_rate,
|
43 |
+
self.hparams.spsa_parallel,
|
44 |
+
self.hparams.batch_size,
|
45 |
+
)
|
46 |
+
elif self.hparams.processor_model == "autodiff":
|
47 |
+
self.processor = AutodiffChannel(self.hparams.dsp_sample_rate)
|
48 |
+
elif self.hparams.processor_model == "proxy0":
|
49 |
+
# print('self.hparams.proxy_ckpts,',self.hparams.proxy_ckpts)
|
50 |
+
self.hparams.dsp_mode = DSPMode.NONE
|
51 |
+
self.processor = ProxyChannel(
|
52 |
+
self.hparams.proxy_ckpts,
|
53 |
+
self.hparams.freeze_proxies,
|
54 |
+
self.hparams.dsp_mode,
|
55 |
+
sample_rate=self.hparams.dsp_sample_rate,
|
56 |
+
)
|
57 |
+
elif self.hparams.processor_model == "proxy1":
|
58 |
+
# print('self.hparams.proxy_ckpts,',self.hparams.proxy_ckpts)
|
59 |
+
self.hparams.dsp_mode = DSPMode.INFER
|
60 |
+
self.processor = ProxyChannel(
|
61 |
+
self.hparams.proxy_ckpts,
|
62 |
+
self.hparams.freeze_proxies,
|
63 |
+
self.hparams.dsp_mode,
|
64 |
+
sample_rate=self.hparams.dsp_sample_rate,
|
65 |
+
)
|
66 |
+
elif self.hparams.processor_model == "proxy2":
|
67 |
+
# print('self.hparams.proxy_ckpts,',self.hparams.proxy_ckpts)
|
68 |
+
self.hparams.dsp_mode = DSPMode.TRAIN_INFER
|
69 |
+
self.processor = ProxyChannel(
|
70 |
+
self.hparams.proxy_ckpts,
|
71 |
+
self.hparams.freeze_proxies,
|
72 |
+
self.hparams.dsp_mode,
|
73 |
+
sample_rate=self.hparams.dsp_sample_rate,
|
74 |
+
)
|
75 |
+
elif self.hparams.processor_model == "tcn1":
|
76 |
+
# self.processor = ConditionalTCN(self.hparams.sample_rate)
|
77 |
+
self.hparams.dsp_mode = DSPMode.NONE
|
78 |
+
self.processor = ProxyChannel(
|
79 |
+
[],
|
80 |
+
freeze_proxies=False,
|
81 |
+
dsp_mode=self.hparams.dsp_mode,
|
82 |
+
tcn_nblocks=self.hparams.tcn_nblocks,
|
83 |
+
tcn_dilation_growth=self.hparams.tcn_dilation_growth,
|
84 |
+
tcn_channel_width=self.hparams.tcn_channel_width,
|
85 |
+
tcn_kernel_size=self.hparams.tcn_kernel_size,
|
86 |
+
num_tcns=1,
|
87 |
+
sample_rate=self.hparams.sample_rate,
|
88 |
+
)
|
89 |
+
elif self.hparams.processor_model == "tcn2":
|
90 |
+
self.hparams.dsp_mode = DSPMode.NONE
|
91 |
+
self.processor = ProxyChannel(
|
92 |
+
[],
|
93 |
+
freeze_proxies=False,
|
94 |
+
dsp_mode=self.hparams.dsp_mode,
|
95 |
+
tcn_nblocks=self.hparams.tcn_nblocks,
|
96 |
+
tcn_dilation_growth=self.hparams.tcn_dilation_growth,
|
97 |
+
tcn_channel_width=self.hparams.tcn_channel_width,
|
98 |
+
tcn_kernel_size=self.hparams.tcn_kernel_size,
|
99 |
+
num_tcns=2,
|
100 |
+
sample_rate=self.hparams.sample_rate,
|
101 |
+
)
|
102 |
+
else:
|
103 |
+
raise ValueError(f"Invalid processor_model: {self.hparams.processor_model}")
|
104 |
+
|
105 |
+
if self.hparams.encoder_ckpt is not None:
|
106 |
+
# load encoder weights from a pre-trained system
|
107 |
+
system = System.load_from_checkpoint(self.hparams.encoder_ckpt)
|
108 |
+
self.encoder = system.encoder
|
109 |
+
self.hparams.encoder_embed_dim = system.encoder.embed_dim
|
110 |
+
else:
|
111 |
+
self.encoder = SpectralEncoder(
|
112 |
+
self.processor.num_control_params,
|
113 |
+
self.hparams.sample_rate,
|
114 |
+
encoder_model=self.hparams.encoder_model,
|
115 |
+
embed_dim=self.hparams.encoder_embed_dim,
|
116 |
+
width_mult=self.hparams.encoder_width_mult,
|
117 |
+
)
|
118 |
+
|
119 |
+
if self.hparams.encoder_freeze:
|
120 |
+
for param in self.encoder.parameters():
|
121 |
+
param.requires_grad = False
|
122 |
+
|
123 |
+
self.controller = StyleTransferController(
|
124 |
+
self.processor.num_control_params,
|
125 |
+
self.hparams.encoder_embed_dim,
|
126 |
+
)
|
127 |
+
|
128 |
+
if len(self.hparams.recon_losses) != len(self.hparams.recon_loss_weights):
|
129 |
+
raise ValueError("Must supply same number of weights as losses.")
|
130 |
+
|
131 |
+
self.recon_losses = torch.nn.ModuleDict()
|
132 |
+
for recon_loss in self.hparams.recon_losses:
|
133 |
+
if recon_loss == "mrstft":
|
134 |
+
self.recon_losses[recon_loss] = auraloss.freq.MultiResolutionSTFTLoss(
|
135 |
+
fft_sizes=[32, 128, 512, 2048, 8192, 32768],
|
136 |
+
hop_sizes=[16, 64, 256, 1024, 4096, 16384],
|
137 |
+
win_lengths=[32, 128, 512, 2048, 8192, 32768],
|
138 |
+
w_sc=0.0,
|
139 |
+
w_phs=0.0,
|
140 |
+
w_lin_mag=1.0,
|
141 |
+
w_log_mag=1.0,
|
142 |
+
)
|
143 |
+
elif recon_loss == "mrstft-md":
|
144 |
+
self.recon_losses[recon_loss] = auraloss.freq.MultiResolutionSTFTLoss(
|
145 |
+
fft_sizes=[128, 512, 2048, 8192],
|
146 |
+
hop_sizes=[32, 128, 512, 2048], # 1 / 4
|
147 |
+
win_lengths=[128, 512, 2048, 8192],
|
148 |
+
w_sc=0.0,
|
149 |
+
w_phs=0.0,
|
150 |
+
w_lin_mag=1.0,
|
151 |
+
w_log_mag=1.0,
|
152 |
+
)
|
153 |
+
elif recon_loss == "mrstft-sm":
|
154 |
+
self.recon_losses[recon_loss] = auraloss.freq.MultiResolutionSTFTLoss(
|
155 |
+
fft_sizes=[512, 2048, 8192],
|
156 |
+
hop_sizes=[256, 1024, 4096], # 1 / 4
|
157 |
+
win_lengths=[512, 2048, 8192],
|
158 |
+
w_sc=0.0,
|
159 |
+
w_phs=0.0,
|
160 |
+
w_lin_mag=1.0,
|
161 |
+
w_log_mag=1.0,
|
162 |
+
)
|
163 |
+
elif recon_loss == "melfft":
|
164 |
+
self.recon_losses[recon_loss] = auraloss.freq.MelSTFTLoss(
|
165 |
+
self.hparams.sample_rate,
|
166 |
+
fft_size=self.hparams.train_length,
|
167 |
+
hop_size=self.hparams.train_length // 2,
|
168 |
+
win_length=self.hparams.train_length,
|
169 |
+
n_mels=128,
|
170 |
+
w_sc=0.0,
|
171 |
+
device="cuda" if self.hparams.gpus > 0 else "cpu",
|
172 |
+
)
|
173 |
+
elif recon_loss == "melstft":
|
174 |
+
self.recon_losses[recon_loss] = auraloss.freq.MelSTFTLoss(
|
175 |
+
self.hparams.sample_rate,
|
176 |
+
device="cuda" if self.hparams.gpus > 0 else "cpu",
|
177 |
+
)
|
178 |
+
elif recon_loss == "l1":
|
179 |
+
self.recon_losses[recon_loss] = torch.nn.L1Loss()
|
180 |
+
elif recon_loss == "sisdr":
|
181 |
+
self.recon_losses[recon_loss] = auraloss.time.SISDRLoss()
|
182 |
+
else:
|
183 |
+
raise ValueError(
|
184 |
+
f"Invalid reconstruction loss: {self.hparams.recon_losses}"
|
185 |
+
)
|
186 |
+
|
187 |
+
def forward(
|
188 |
+
self,
|
189 |
+
x: torch.Tensor,
|
190 |
+
y: torch.Tensor = None,
|
191 |
+
e_y: torch.Tensor = None,
|
192 |
+
z: torch.Tensor = None,
|
193 |
+
dsp_mode: DSPMode = DSPMode.NONE,
|
194 |
+
analysis_length: int = 0,
|
195 |
+
sample_rate: int = 24000,
|
196 |
+
):
|
197 |
+
"""Forward pass through the system subnetworks.
|
198 |
+
|
199 |
+
Args:
|
200 |
+
x (tensor): Input audio tensor with shape (batch x 1 x samples)
|
201 |
+
y (tensor): Target audio tensor with shape (batch x 1 x samples)
|
202 |
+
e_y (tensor): Target embedding with shape (batch x edim)
|
203 |
+
z (tensor): Bottleneck latent.
|
204 |
+
dsp_mode (DSPMode): Mode of operation for the DSP blocks.
|
205 |
+
analysis_length (optional, int): Only analyze the first N samples.
|
206 |
+
sample_rate (optional, int): Desired sampling rate for the DSP blocks.
|
207 |
+
|
208 |
+
You must supply target audio `y`, `z`, or an embedding for the target `e_y`.
|
209 |
+
|
210 |
+
Returns:
|
211 |
+
y_hat (tensor): Output audio.
|
212 |
+
p (tensor):
|
213 |
+
e (tensor):
|
214 |
+
|
215 |
+
"""
|
216 |
+
bs, chs, samp = x.size()
|
217 |
+
|
218 |
+
if sample_rate != self.hparams.sample_rate:
|
219 |
+
x_enc = torchaudio.transforms.Resample(
|
220 |
+
sample_rate, self.hparams.sample_rate
|
221 |
+
).to(x.device)(x)
|
222 |
+
if y is not None:
|
223 |
+
y_enc = torchaudio.transforms.Resample(
|
224 |
+
sample_rate, self.hparams.sample_rate
|
225 |
+
).to(x.device)(y)
|
226 |
+
else:
|
227 |
+
x_enc = x
|
228 |
+
y_enc = y
|
229 |
+
|
230 |
+
if analysis_length > 0:
|
231 |
+
x_enc = x_enc[..., :analysis_length]
|
232 |
+
if y is not None:
|
233 |
+
y_enc = y_enc[..., :analysis_length]
|
234 |
+
|
235 |
+
e_x = self.encoder(x_enc) # generate latent embedding for input
|
236 |
+
|
237 |
+
if y is not None:
|
238 |
+
e_y = self.encoder(y_enc) # generate latent embedding for target
|
239 |
+
elif e_y is None:
|
240 |
+
raise RuntimeError("Must supply y, z, or e_y. None supplied.")
|
241 |
+
|
242 |
+
# learnable comparision
|
243 |
+
p = self.controller(e_x, e_y, z=z)
|
244 |
+
|
245 |
+
# process audio conditioned on parameters
|
246 |
+
# if there are multiple channels process them using same parameters
|
247 |
+
y_hat = torch.zeros(x.shape).type_as(x)
|
248 |
+
for ch_idx in range(chs):
|
249 |
+
y_hat_ch = self.processor(
|
250 |
+
x[:, ch_idx : ch_idx + 1, :],
|
251 |
+
p,
|
252 |
+
epsilon=self.eps_scheduler.epsilon,
|
253 |
+
dsp_mode=dsp_mode,
|
254 |
+
sample_rate=sample_rate,
|
255 |
+
)
|
256 |
+
y_hat[:, ch_idx : ch_idx + 1, :] = y_hat_ch
|
257 |
+
|
258 |
+
return y_hat, p, e_x
|
259 |
+
|
260 |
+
def common_paired_step(
|
261 |
+
self,
|
262 |
+
batch: Tuple,
|
263 |
+
batch_idx: int,
|
264 |
+
optimizer_idx: int = 0,
|
265 |
+
train: bool = False,
|
266 |
+
):
|
267 |
+
"""Model step used for validation and training.
|
268 |
+
|
269 |
+
Args:
|
270 |
+
batch (Tuple[Tensor, Tensor]): Batch items containing input audio (x) and target audio (y).
|
271 |
+
batch_idx (int): Index of the batch within the current epoch.
|
272 |
+
optimizer_idx (int): Index of the optimizer, this step is called once for each optimizer.
|
273 |
+
The firs optimizer corresponds to the generator and the second optimizer,
|
274 |
+
corresponds to the adversarial loss (when in use).
|
275 |
+
train (bool): Whether step is called during training (True) or validation (False).
|
276 |
+
"""
|
277 |
+
x, y = batch
|
278 |
+
loss = 0
|
279 |
+
dsp_mode = self.hparams.dsp_mode
|
280 |
+
|
281 |
+
if train and dsp_mode.INFER.name == DSPMode.INFER.name:
|
282 |
+
dsp_mode = DSPMode.NONE
|
283 |
+
|
284 |
+
# proces input audio through model
|
285 |
+
if self.hparams.style_transfer:
|
286 |
+
length = x.shape[-1]
|
287 |
+
|
288 |
+
x_A = x[..., : length // 2]
|
289 |
+
x_B = x[..., length // 2 :]
|
290 |
+
|
291 |
+
y_A = y[..., : length // 2]
|
292 |
+
y_B = y[..., length // 2 :]
|
293 |
+
|
294 |
+
if torch.rand(1).sum() > 0.5:
|
295 |
+
y_ref = y_B
|
296 |
+
y = y_A
|
297 |
+
x = x_A
|
298 |
+
else:
|
299 |
+
y_ref = y_A
|
300 |
+
y = y_B
|
301 |
+
x = x_B
|
302 |
+
|
303 |
+
y_hat, p, e = self(x, y=y_ref, dsp_mode=dsp_mode)
|
304 |
+
else:
|
305 |
+
y_ref = None
|
306 |
+
y_hat, p, e = self(x, dsp_mode=dsp_mode)
|
307 |
+
|
308 |
+
# compute reconstruction loss terms
|
309 |
+
for loss_idx, (loss_name, recon_loss_fn) in enumerate(
|
310 |
+
self.recon_losses.items()
|
311 |
+
):
|
312 |
+
temp_loss = recon_loss_fn(y_hat, y) # reconstruction loss
|
313 |
+
loss += float(self.hparams.recon_loss_weights[loss_idx]) * temp_loss
|
314 |
+
|
315 |
+
self.log(
|
316 |
+
("train" if train else "val") + f"_loss/{loss_name}",
|
317 |
+
temp_loss,
|
318 |
+
on_step=True,
|
319 |
+
on_epoch=True,
|
320 |
+
prog_bar=False,
|
321 |
+
logger=True,
|
322 |
+
sync_dist=True,
|
323 |
+
)
|
324 |
+
|
325 |
+
# log the overall aggregate loss
|
326 |
+
self.log(
|
327 |
+
("train" if train else "val") + "_loss/loss",
|
328 |
+
loss,
|
329 |
+
on_step=True,
|
330 |
+
on_epoch=True,
|
331 |
+
prog_bar=False,
|
332 |
+
logger=True,
|
333 |
+
sync_dist=True,
|
334 |
+
)
|
335 |
+
|
336 |
+
# store audio data
|
337 |
+
data_dict = {
|
338 |
+
"x": x.cpu(),
|
339 |
+
"y": y.cpu(),
|
340 |
+
"p": p.cpu(),
|
341 |
+
"e": e.cpu(),
|
342 |
+
"y_hat": y_hat.cpu(),
|
343 |
+
}
|
344 |
+
|
345 |
+
if y_ref is not None:
|
346 |
+
data_dict["y_ref"] = y_ref.cpu()
|
347 |
+
|
348 |
+
return loss, data_dict
|
349 |
+
|
350 |
+
def training_step(self, batch, batch_idx, optimizer_idx=0):
|
351 |
+
loss, _ = self.common_paired_step(
|
352 |
+
batch,
|
353 |
+
batch_idx,
|
354 |
+
optimizer_idx,
|
355 |
+
train=True,
|
356 |
+
)
|
357 |
+
|
358 |
+
return loss
|
359 |
+
|
360 |
+
def training_epoch_end(self, training_step_outputs):
|
361 |
+
if self.hparams.spsa_schedule and self.hparams.processor_model == "spsa":
|
362 |
+
self.eps_scheduler.step(
|
363 |
+
self.trainer.callback_metrics[self.hparams.train_monitor],
|
364 |
+
)
|
365 |
+
|
366 |
+
def validation_step(self, batch, batch_idx):
|
367 |
+
loss, data_dict = self.common_paired_step(batch, batch_idx)
|
368 |
+
|
369 |
+
return data_dict
|
370 |
+
|
371 |
+
def optimizer_step(
|
372 |
+
self,
|
373 |
+
epoch,
|
374 |
+
batch_idx,
|
375 |
+
optimizer,
|
376 |
+
optimizer_idx,
|
377 |
+
optimizer_closure,
|
378 |
+
on_tpu=False,
|
379 |
+
using_native_amp=False,
|
380 |
+
using_lbfgs=False,
|
381 |
+
):
|
382 |
+
if optimizer_idx == 0:
|
383 |
+
optimizer.step(closure=optimizer_closure)
|
384 |
+
|
385 |
+
def configure_optimizers(self):
|
386 |
+
# we need additional optimizer for the discriminator
|
387 |
+
optimizers = []
|
388 |
+
g_optimizer = torch.optim.Adam(
|
389 |
+
chain(
|
390 |
+
self.encoder.parameters(),
|
391 |
+
self.processor.parameters(),
|
392 |
+
self.controller.parameters(),
|
393 |
+
),
|
394 |
+
lr=self.hparams.lr,
|
395 |
+
betas=(0.9, 0.999),
|
396 |
+
)
|
397 |
+
optimizers.append(g_optimizer)
|
398 |
+
|
399 |
+
g_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
|
400 |
+
g_optimizer,
|
401 |
+
patience=self.hparams.lr_patience,
|
402 |
+
verbose=True,
|
403 |
+
)
|
404 |
+
ms1 = int(self.hparams.max_epochs * 0.8)
|
405 |
+
ms2 = int(self.hparams.max_epochs * 0.95)
|
406 |
+
print(
|
407 |
+
"Learning rate schedule:",
|
408 |
+
f"0 {self.hparams.lr:0.2e} -> ",
|
409 |
+
f"{ms1} {self.hparams.lr*0.1:0.2e} -> ",
|
410 |
+
f"{ms2} {self.hparams.lr*0.01:0.2e}",
|
411 |
+
)
|
412 |
+
g_scheduler = torch.optim.lr_scheduler.MultiStepLR(
|
413 |
+
g_optimizer,
|
414 |
+
milestones=[ms1, ms2],
|
415 |
+
gamma=0.1,
|
416 |
+
)
|
417 |
+
|
418 |
+
lr_schedulers = {
|
419 |
+
"scheduler": g_scheduler,
|
420 |
+
}
|
421 |
+
|
422 |
+
return optimizers, lr_schedulers
|
423 |
+
|
424 |
+
def train_dataloader(self):
|
425 |
+
|
426 |
+
train_dataset = AudioDataset(
|
427 |
+
self.hparams.audio_dir,
|
428 |
+
subset="train",
|
429 |
+
train_frac=self.hparams.train_frac,
|
430 |
+
half=self.hparams.half,
|
431 |
+
length=self.hparams.train_length,
|
432 |
+
input_dirs=self.hparams.input_dirs,
|
433 |
+
random_scale_input=self.hparams.random_scale_input,
|
434 |
+
random_scale_target=self.hparams.random_scale_target,
|
435 |
+
buffer_size_gb=self.hparams.buffer_size_gb,
|
436 |
+
buffer_reload_rate=self.hparams.buffer_reload_rate,
|
437 |
+
num_examples_per_epoch=self.hparams.train_examples_per_epoch,
|
438 |
+
augmentations={
|
439 |
+
"pitch": {"sr": self.hparams.sample_rate},
|
440 |
+
"tempo": {"sr": self.hparams.sample_rate},
|
441 |
+
},
|
442 |
+
freq_corrupt=self.hparams.freq_corrupt,
|
443 |
+
drc_corrupt=self.hparams.drc_corrupt,
|
444 |
+
ext=self.hparams.ext,
|
445 |
+
)
|
446 |
+
|
447 |
+
g = torch.Generator()
|
448 |
+
g.manual_seed(0)
|
449 |
+
|
450 |
+
return torch.utils.data.DataLoader(
|
451 |
+
train_dataset,
|
452 |
+
num_workers=self.hparams.num_workers,
|
453 |
+
batch_size=self.hparams.batch_size,
|
454 |
+
worker_init_fn=utils.seed_worker,
|
455 |
+
generator=g,
|
456 |
+
pin_memory=True,
|
457 |
+
persistent_workers=True,
|
458 |
+
timeout=60,
|
459 |
+
)
|
460 |
+
|
461 |
+
def val_dataloader(self):
|
462 |
+
|
463 |
+
val_dataset = AudioDataset(
|
464 |
+
self.hparams.audio_dir,
|
465 |
+
subset="val",
|
466 |
+
half=self.hparams.half,
|
467 |
+
train_frac=self.hparams.train_frac,
|
468 |
+
length=self.hparams.val_length,
|
469 |
+
input_dirs=self.hparams.input_dirs,
|
470 |
+
buffer_size_gb=self.hparams.buffer_size_gb,
|
471 |
+
buffer_reload_rate=self.hparams.buffer_reload_rate,
|
472 |
+
random_scale_input=self.hparams.random_scale_input,
|
473 |
+
random_scale_target=self.hparams.random_scale_target,
|
474 |
+
num_examples_per_epoch=self.hparams.val_examples_per_epoch,
|
475 |
+
augmentations={},
|
476 |
+
freq_corrupt=self.hparams.freq_corrupt,
|
477 |
+
drc_corrupt=self.hparams.drc_corrupt,
|
478 |
+
ext=self.hparams.ext,
|
479 |
+
)
|
480 |
+
|
481 |
+
self.val_dataset = val_dataset
|
482 |
+
|
483 |
+
g = torch.Generator()
|
484 |
+
g.manual_seed(0)
|
485 |
+
|
486 |
+
return torch.utils.data.DataLoader(
|
487 |
+
val_dataset,
|
488 |
+
num_workers=1,
|
489 |
+
batch_size=self.hparams.batch_size,
|
490 |
+
worker_init_fn=utils.seed_worker,
|
491 |
+
generator=g,
|
492 |
+
pin_memory=True,
|
493 |
+
            persistent_workers=True,
            timeout=60,
        )

    def shutdown(self):
        del self.processor

    # add any model hyperparameters here
    @staticmethod
    def add_model_specific_args(parent_parser):
        parser = ArgumentParser(parents=[parent_parser], add_help=False)
        # --- Training ---
        parser.add_argument("--batch_size", type=int, default=32)
        parser.add_argument("--lr", type=float, default=3e-4)
        parser.add_argument("--lr_patience", type=int, default=20)
        parser.add_argument("--recon_losses", nargs="+", default=["l1"])
        parser.add_argument("--recon_loss_weights", nargs="+", default=[1.0])
        # --- Controller ---
        parser.add_argument(
            "--processor_model",
            type=str,
            help="autodiff, spsa, tcn1, tcn2, proxy0, proxy1, proxy2",
        )
        parser.add_argument("--controller_hidden_dim", type=int, default=256)
        parser.add_argument("--style_transfer", action="store_true")
        # --- Encoder ---
        parser.add_argument("--encoder_model", type=str, default="mobilenet_v2")
        parser.add_argument("--encoder_embed_dim", type=int, default=128)
        parser.add_argument("--encoder_width_mult", type=int, default=2)
        parser.add_argument("--encoder_ckpt", type=str, default=None)
        parser.add_argument("--encoder_freeze", action="store_true", default=False)
        # --- TCN ---
        parser.add_argument("--tcn_causal", action="store_true")
        parser.add_argument("--tcn_nblocks", type=int, default=4)
        parser.add_argument("--tcn_dilation_growth", type=int, default=8)
        parser.add_argument("--tcn_channel_width", type=int, default=32)
        parser.add_argument("--tcn_kernel_size", type=int, default=13)
        # --- SPSA ---
        parser.add_argument("--plugin_config_file", type=str, default=None)
        parser.add_argument("--spsa_epsilon", type=float, default=0.001)
        parser.add_argument("--spsa_schedule", action="store_true")
        parser.add_argument("--spsa_patience", type=int, default=10)
        parser.add_argument("--spsa_verbose", action="store_true")
        parser.add_argument("--spsa_factor", type=float, default=0.5)
        parser.add_argument("--spsa_parallel", action="store_true")
        # --- Proxy ----
        parser.add_argument("--proxy_ckpts", nargs="+")
        parser.add_argument("--freeze_proxies", action="store_true", default=False)
        parser.add_argument("--use_dsp", action="store_true", default=False)
        parser.add_argument("--dsp_mode", choices=DSPMode, type=DSPMode)
        # --- Dataset ---
        parser.add_argument("--audio_dir", type=str)
        parser.add_argument("--ext", type=str, default="wav")
        parser.add_argument("--input_dirs", nargs="+")
        parser.add_argument("--buffer_reload_rate", type=int, default=1000)
        parser.add_argument("--buffer_size_gb", type=float, default=1.0)
        parser.add_argument("--sample_rate", type=int, default=24000)
        parser.add_argument("--dsp_sample_rate", type=int, default=24000)
        parser.add_argument("--shuffle", type=bool, default=True)
        parser.add_argument("--random_scale_input", action="store_true")
        parser.add_argument("--random_scale_target", action="store_true")
        parser.add_argument("--freq_corrupt", action="store_true")
        parser.add_argument("--drc_corrupt", action="store_true")
        parser.add_argument("--train_length", type=int, default=65536)
        parser.add_argument("--train_frac", type=float, default=0.8)
        parser.add_argument("--half", action="store_true")
        parser.add_argument("--train_examples_per_epoch", type=int, default=10000)
        parser.add_argument("--val_length", type=int, default=131072)
        parser.add_argument("--val_examples_per_epoch", type=int, default=1000)
        parser.add_argument("--num_workers", type=int, default=16)

        return parser
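Not part of this commit: a minimal sketch of how the `add_model_specific_args` helper above is typically consumed by a command-line training script. The argument values below are placeholders; the only assumption is that `deepafx_st.system.System` exposes the static method shown above.

# Hypothetical usage sketch (placeholder argument values, not from this repo).
from argparse import ArgumentParser

from deepafx_st.system import System

parser = ArgumentParser()
parser = System.add_model_specific_args(parser)  # appends the flags defined above
args = parser.parse_args(["--processor_model", "autodiff", "--style_transfer"])
print(args.processor_model, args.batch_size, args.style_transfer)  # autodiff 32 True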
deepafx_st/utils.py
ADDED
@@ -0,0 +1,277 @@
# Adapted from:
# https://github.com/csteinmetz1/micro-tcn/blob/main/microtcn/utils.py
import os
import csv
import torch
import fnmatch
import numpy as np
import random
from enum import Enum
import pyloudnorm as pyln


class DSPMode(Enum):
    NONE = "none"
    TRAIN_INFER = "train_infer"
    INFER = "infer"

    def __str__(self):
        return self.value


def loudness_normalize(x, sample_rate, target_loudness=-24.0):
    x = x.view(1, -1)
    stereo_audio = x.repeat(2, 1).permute(1, 0).numpy()
    meter = pyln.Meter(sample_rate)
    loudness = meter.integrated_loudness(stereo_audio)
    norm_x = pyln.normalize.loudness(
        stereo_audio,
        loudness,
        target_loudness,
    )
    x = torch.tensor(norm_x).permute(1, 0)
    x = x[0, :].view(1, -1)

    return x


def get_random_file_id(keys):
    # generate a random index into the keys of the input files
    rand_input_idx = torch.randint(0, len(keys) - 1, [1])[0]
    # find the key (file_id) corresponding to the random index
    rand_input_file_id = list(keys)[rand_input_idx]

    return rand_input_file_id


# NOTE: this definition is shadowed by the documented version further below.
def get_random_patch(audio_file, length, check_silence=True):
    silent = True
    while silent:
        start_idx = int(torch.rand(1) * (audio_file.num_frames - length))
        stop_idx = start_idx + length
        patch = audio_file.audio[:, start_idx:stop_idx].clone().detach()
        if (patch ** 2).mean() > 1e-4 or not check_silence:
            silent = False

    return start_idx, stop_idx


def seed_worker(worker_id):
    worker_seed = torch.initial_seed() % 2 ** 32
    np.random.seed(worker_seed)
    random.seed(worker_seed)


def getFilesPath(directory, extension):

    n_path = []
    for path, subdirs, files in os.walk(directory):
        for name in files:
            if fnmatch.fnmatch(name, extension):
                n_path.append(os.path.join(path, name))
    n_path.sort()

    return n_path


def count_parameters(model, trainable_only=True):

    if trainable_only:
        if len(list(model.parameters())) > 0:
            params = sum(p.numel() for p in model.parameters() if p.requires_grad)
        else:
            params = 0
    else:
        if len(list(model.parameters())) > 0:
            params = sum(p.numel() for p in model.parameters())
        else:
            params = 0

    return params


def system_summary(system):
    print(f"Encoder: {count_parameters(system.encoder)/1e6:0.2f} M")
    print(f"Processor: {count_parameters(system.processor)/1e6:0.2f} M")

    if hasattr(system, "adv_loss_fn"):
        for idx, disc in enumerate(system.adv_loss_fn.discriminators):
            print(f"Discriminator {idx+1}: {count_parameters(disc)/1e6:0.2f} M")


def center_crop(x, length: int):
    if x.shape[-1] != length:
        start = (x.shape[-1] - length) // 2
        stop = start + length
        x = x[..., start:stop]
    return x


def causal_crop(x, length: int):
    if x.shape[-1] != length:
        stop = x.shape[-1] - 1
        start = stop - length
        x = x[..., start:stop]
    return x


def denormalize(norm_val, max_val, min_val):
    return (norm_val * (max_val - min_val)) + min_val


def normalize(denorm_val, max_val, min_val):
    return (denorm_val - min_val) / (max_val - min_val)


def get_random_patch(audio_file, length, energy_treshold=1e-4):
    """Produce sample indices for a random patch of size `length`.

    This function will check the energy of the selected patch to
    ensure that it is not complete silence. If silence is found,
    it will continue searching for a non-silent patch.

    Args:
        audio_file (AudioFile): Audio file object.
        length (int): Number of samples in random patch.
        energy_treshold (float): Minimum mean energy for a patch to be non-silent.

    Returns:
        start_idx (int): Starting sample index
        stop_idx (int): Stop sample index
    """

    silent = True
    while silent:
        start_idx = int(torch.rand(1) * (audio_file.num_frames - length))
        stop_idx = start_idx + length
        patch = audio_file.audio[:, start_idx:stop_idx]
        if (patch ** 2).mean() > energy_treshold:
            silent = False

    return start_idx, stop_idx


def split_dataset(file_list, subset, train_frac):
    """Given a list of files, split into train/val/test sets.

    Args:
        file_list (list): List of audio files.
        subset (str): One of "train", "val", or "test".
        train_frac (float): Fraction of the dataset to use for training.

    Returns:
        file_list (list): List of audio files corresponding to subset.
    """
    assert train_frac > 0.1 and train_frac < 1.0

    total_num_examples = len(file_list)

    train_num_examples = int(total_num_examples * train_frac)
    val_num_examples = int(total_num_examples * (1 - train_frac) / 2)
    test_num_examples = total_num_examples - (train_num_examples + val_num_examples)

    if train_num_examples < 0:
        raise ValueError(
            f"No examples in training set. Try increasing train_frac: {train_frac}."
        )
    elif val_num_examples < 0:
        raise ValueError(
            f"No examples in validation set. Try decreasing train_frac: {train_frac}."
        )
    elif test_num_examples < 0:
        raise ValueError(
            f"No examples in test set. Try decreasing train_frac: {train_frac}."
        )

    if subset == "train":
        start_idx = 0
        stop_idx = train_num_examples
    elif subset == "val":
        start_idx = train_num_examples
        stop_idx = start_idx + val_num_examples
    elif subset == "test":
        start_idx = train_num_examples + val_num_examples
        stop_idx = start_idx + test_num_examples + 1
    else:
        raise ValueError(f"Invalid subset: {subset}.")

    return file_list[start_idx:stop_idx]


def rademacher(size):
    """Generate random samples from a Rademacher distribution (+1 or -1).

    Args:
        size: Shape of the sample to draw.
    """
    m = torch.distributions.binomial.Binomial(1, 0.5)
    x = m.sample(size)
    x[x == 0] = -1
    return x


def get_subset(csv_file):
    subset_files = []
    with open(csv_file) as fp:
        reader = csv.DictReader(fp)
        for row in reader:
            subset_files.append(row["filepath"])

    return list(set(subset_files))


def conform_length(x: torch.Tensor, length: int):
    """Crop or pad input on last dim to match `length`."""
    if x.shape[-1] < length:
        padsize = length - x.shape[-1]
        x = torch.nn.functional.pad(x, (0, padsize))
    elif x.shape[-1] > length:
        x = x[..., :length]

    return x


def linear_fade(
    x: torch.Tensor,
    fade_ms: float = 50.0,
    sample_rate: float = 22050,
):
    """Apply fade in and fade out to last dim."""
    fade_samples = int(fade_ms * 1e-3 * sample_rate)

    fade_in = torch.linspace(0.0, 1.0, steps=fade_samples)
    fade_out = torch.linspace(1.0, 0.0, steps=fade_samples)

    # fade in
    x[..., :fade_samples] *= fade_in

    # fade out
    x[..., -fade_samples:] *= fade_out

    return x


# def get_random_patch(x, sample_rate, length_samples):
#     length = length_samples
#     silent = True
#     while silent:
#         start_idx = np.random.randint(0, x.shape[-1] - length - 1)
#         stop_idx = start_idx + length
#         x_crop = x[0:1, start_idx:stop_idx]
#
#         # check for silence
#         frames = length // sample_rate
#         silent_frames = []
#         for n in range(frames):
#             start_idx = n * sample_rate
#             stop_idx = start_idx + sample_rate
#             x_frame = x_crop[0:1, start_idx:stop_idx]
#             if (x_frame ** 2).mean() > 3e-4:
#                 silent_frames.append(False)
#             else:
#                 silent_frames.append(True)
#         silent = True if any(silent_frames) else False
#
#     x_crop /= x_crop.abs().max()
#
#     return x_crop
deepafx_st/version.py
ADDED
@@ -0,0 +1,6 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''Version info'''

short_version = '0.0'
version = '0.0.1'
packages.txt
ADDED
@@ -0,0 +1,3 @@
libsndfile1
sox
ffmpeg
requirements.txt
ADDED
@@ -0,0 +1,3 @@
git+https://github.com/adobe-research/DeepAFx-ST.git
gradio
huggingface_hub