cwitkowitz committed
Commit 883013e
1 Parent(s): 94fc053

Working standalone.

Files changed (7)
  1. .gitignore +3 -0
  2. app.py +83 -0
  3. model-8750.pt +3 -0
  4. models/__init__.py +0 -0
  5. models/cqt_module.py +281 -0
  6. models/transcriber.py +626 -0
  7. requirements.txt +6 -0
.gitignore ADDED
@@ -0,0 +1,3 @@
+ *__pycache__
+ _outputs
+ .idea
app.py ADDED
@@ -0,0 +1,83 @@
+ from pyharp import ModelCard, build_endpoint
+
+ import gradio as gr
+ import torchaudio
+ import torch
+ import os
+
+ timbre_trap = torch.load('model-8750.pt', map_location='cpu')
+
+ card = ModelCard(
+     name='Timbre-Trap',
+     description='De-timbre your audio!',
+     author='Frank Cwitkowitz',
+     tags=['example', 'music transcription', 'multi-pitch estimation', 'timbre filtering']
+ )
+
+
+ def process_fn(audio_path, de_timbre):
+     # Load the audio with torchaudio
+     audio, fs = torchaudio.load(audio_path)
+     # Average channels to obtain mono-channel
+     audio = torch.mean(audio, dim=0, keepdim=True)
+     # Resample audio to the specified sampling rate
+     audio = torchaudio.functional.resample(audio, fs, 22050)
+     # Add a batch dimension
+     audio = audio.unsqueeze(0)
+     # Determine original number of samples
+     n_samples = audio.size(-1)
+     # Pad audio to next multiple of block length
+     audio = timbre_trap.sliCQ.pad_to_block_length(audio)
+
+     # Encode raw audio into latent vectors
+     latents, embeddings, _ = timbre_trap.encode(audio)
+     # Apply skip connections if they are turned on
+     embeddings = timbre_trap.apply_skip_connections(embeddings)
+     # Obtain transcription or reconstructed spectral coefficients
+     coefficients = timbre_trap.decode(latents, embeddings, de_timbre)
+
+     # Invert reconstructed spectral coefficients
+     audio = timbre_trap.sliCQ.decode(coefficients)
+     # Trim to original number of samples
+     audio = audio[..., :n_samples]
+     # Remove batch dimension
+     audio = audio.squeeze(0)
+
+     if de_timbre and audio.abs().max():
+         # Normalize audio to [-1, 1]
+         audio /= audio.abs().max()
+
+     # Create a temporary directory for output
+     os.makedirs('_outputs', exist_ok=True)
+     # Create a path for saving the audio
+     save_path = os.path.join('_outputs', 'output.wav')
+     # Save the audio
+     torchaudio.save(save_path, audio, 22050)
+
+     return save_path
+
+
+ with gr.Blocks() as demo:
+     inputs = [
+         gr.Audio(
+             label='Audio Input',
+             type='filepath'
+         ),
+         #gr.Checkbox(
+         #    value=False,
+         #    label='De-Timbre'
+         #)
+         gr.Slider(
+             minimum=0,
+             maximum=1,
+             step=1,
+             value=0,
+             label='De-Timbre'
+         )
+     ]
+
+     output = gr.Audio(label='Audio Output', type='filepath')
+
+     ctrls_data, ctrls_button, process_button = build_endpoint(inputs, output, process_fn, card)
+
+     demo.launch(share=True)
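The same pipeline can also be exercised on the pickled checkpoint without the Gradio UI. A minimal sketch, assuming the checkpoint has already been pulled via git-lfs and that a test file named input.wav exists (both the file name and the de-timbre switch value below are illustrative assumptions):

import torch
import torchaudio

# Load the pickled Timbre-Trap module (the models/ package must be importable)
model = torch.load('model-8750.pt', map_location='cpu')

# 'input.wav' is an assumed test file
audio, fs = torchaudio.load('input.wav')
audio = torch.mean(audio, dim=0, keepdim=True).unsqueeze(0)
audio = torchaudio.functional.resample(audio, fs, 22050)
audio = model.sliCQ.pad_to_block_length(audio)

latents, embeddings, _ = model.encode(audio)
embeddings = model.apply_skip_connections(embeddings)

# True selects the transcription (de-timbre) branch; False reconstructs the input
coefficients = model.decode(latents, embeddings, True)
de_timbred = model.sliCQ.decode(coefficients)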
model-8750.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e1eb515001ebb871a934379bbd44a22e00a2f41b20c34cd862274aa04c0ca900
+ size 11401913
models/__init__.py ADDED
File without changes
models/cqt_module.py ADDED
@@ -0,0 +1,281 @@
+ from torchaudio.transforms import AmplitudeToDB
+ from cqt_pytorch import CQT as _CQT
+
+ import numpy as np
+ import librosa
+ import torch
+ import math
+
+
+ class CQT(_CQT):
+     """
+     Wrapper which adds some basic functionality to the sliCQ module.
+     """
+
+     def __init__(self, n_octaves, bins_per_octave, sample_rate, secs_per_block):
+         """
+         Instantiate the sliCQ module and wrapper.
+
+         Parameters
+         ----------
+         n_octaves : int
+             Number of octaves below Nyquist to span
+         bins_per_octave : int
+             Number of bins allocated to each octave
+         sample_rate : int or float
+             Number of samples per second of audio
+         secs_per_block : float
+             Number of seconds to process at a time
+         """
+
+         super().__init__(num_octaves=n_octaves,
+                          num_bins_per_octave=bins_per_octave,
+                          sample_rate=sample_rate,
+                          block_length=int(secs_per_block * sample_rate),
+                          power_of_2_length=True)
+
+         self.sample_rate = sample_rate
+
+         # Compute hop length corresponding to transform coefficients
+         self.hop_length = (self.block_length / self.max_window_length)
+
+         # Compute total number of bins
+         self.n_bins = n_octaves * bins_per_octave
+         # Determine frequency (MIDI) below Nyquist by specified octaves
+         fmin = librosa.hz_to_midi((sample_rate / 2) / (2 ** n_octaves))
+
+         # Determine center frequency (MIDI) associated with each bin of module
+         self.midi_freqs = fmin + np.arange(self.n_bins) / (bins_per_octave / 12)
+
+     def forward(self, audio):
+         """
+         Encode a batch of audio into CQT spectral coefficients.
+
+         Parameters
+         ----------
+         audio : Tensor (B x 1 x T)
+             Batch of input audio
+
+         Returns
+         ----------
+         coefficients : Tensor (B x 2 x F x T)
+             Batch of real/imaginary CQT coefficients
+         """
+
+         with torch.no_grad():
+             # Obtain complex CQT coefficients
+             coefficients = self.encode(audio)
+
+             # Convert complex coefficients to real representation
+             coefficients = self.to_real(coefficients)
+
+         return coefficients
+
+     @staticmethod
+     def to_real(coefficients):
+         """
+         Convert a set of complex coefficients to equivalent real representation.
+
+         Parameters
+         ----------
+         coefficients : Tensor (B x 1 x F x T)
+             Batch of complex CQT coefficients
+
+         Returns
+         ----------
+         coefficients : Tensor (B x 2 x F x T)
+             Batch of real/imaginary CQT coefficients
+         """
+
+         # Collapse channel dimension (mono assumed)
+         coefficients = coefficients.squeeze(-3)
+         # Convert complex coefficients to real and imaginary
+         coefficients = torch.view_as_real(coefficients)
+         # Place real and imaginary coefficients under channel dimension
+         coefficients = coefficients.transpose(-1, -2).transpose(-2, -3)
+
+         return coefficients
+
+     @staticmethod
+     def to_complex(coefficients):
+         """
+         Convert a set of real coefficients to their equivalent complex representation.
+
+         Parameters
+         ----------
+         coefficients : Tensor (B x 2 x F x T)
+             Batch of real/imaginary CQT coefficients
+
+         Returns
+         ----------
+         coefficients : Tensor (B x F x T)
+             Batch of complex CQT coefficients
+         """
+
+         # Move real and imaginary coefficients to last dimension
+         coefficients = coefficients.transpose(-3, -2).transpose(-2, -1)
+         # Convert real and imaginary coefficients to complex
+         coefficients = torch.view_as_complex(coefficients.contiguous())
+
+         return coefficients
+
+     @staticmethod
+     def to_magnitude(coefficients):
+         """
+         Compute the magnitude for a set of real coefficients.
+
+         Parameters
+         ----------
+         coefficients : Tensor (B x 2 x F x T)
+             Batch of real/imaginary CQT coefficients
+
+         Returns
+         ----------
+         magnitude : Tensor (B x F x T)
+             Batch of magnitude coefficients
+         """
+
+         # Compute L2-norm of coefficients to compute magnitude
+         magnitude = coefficients.norm(p=2, dim=-3)
+
+         return magnitude
+
+     @staticmethod
+     def to_decibels(magnitude, rescale=True):
+         """
+         Convert a set of magnitude coefficients to decibels.
+
+         TODO - move 0 dB only if maximum is higher?
+              - currently it's consistent with previous dB scaling
+              - currently it's only used for visualization
+
+         Parameters
+         ----------
+         magnitude : Tensor (B x F x T)
+             Batch of magnitude coefficients (amplitude)
+         rescale : bool
+             Rescale decibels to the range [0, 1]
+
+         Returns
+         ----------
+         decibels : Tensor (B x F x T)
+             Batch of magnitude coefficients (dB)
+         """
+
+         # Initialize a differentiable conversion to decibels
+         decibels = AmplitudeToDB(stype='amplitude', top_db=80)(magnitude)
+
+         if rescale:
+             # Make 0 dB ceiling
+             decibels -= decibels.max()
+             # Rescale decibels to range [0, 1]
+             decibels = 1 + decibels / 80
+
+         return decibels
+
+     def decode(self, coefficients):
+         """
+         Invert CQT spectral coefficients to synthesize audio.
+
+         Parameters
+         ----------
+         coefficients : Tensor (B x 2 OR 1 x F x T)
+             Batch of real/imaginary OR complex CQT coefficients
+
+         Returns
+         ----------
+         output : Tensor (B x 1 x T)
+             Batch of reconstructed audio
+         """
+
+         with torch.no_grad():
+             if not coefficients.is_complex():
+                 # Convert real coefficients to complex representation
+                 coefficients = self.to_complex(coefficients)
+                 # Add a channel dimension to coefficients
+                 coefficients = coefficients.unsqueeze(-3)
+
+             # Decode the complex CQT coefficients
+             audio = super().decode(coefficients)
+
+         return audio
+
+     def pad_to_block_length(self, audio):
+         """
+         Pad audio to the next multiple of block length such that it can be processed in full.
+
+         Parameters
+         ----------
+         audio : Tensor (B x 1 x T)
+             Batch of audio
+
+         Returns
+         ----------
+         audio : Tensor (B x 1 x T + p)
+             Batch of padded audio
+         """
+
+         # Pad the audio with zeros to fill up the remainder of the final block
+         audio = torch.nn.functional.pad(audio, (0, -audio.size(-1) % self.block_length))
+
+         return audio
+
+     def get_expected_samples(self, t):
+         """
+         Determine the number of samples corresponding to a specified amount of time.
+
+         Parameters
+         ----------
+         t : float
+             Amount of time
+
+         Returns
+         ----------
+         num_samples : int
+             Number of audio samples expected
+         """
+
+         # Compute number of samples and round down
+         num_samples = int(max(0, t) * self.sample_rate)
+
+         return num_samples
+
+     def get_expected_frames(self, num_samples):
+         """
+         Determine the number of frames the module will return for a given number of samples.
+
+         Parameters
+         ----------
+         num_samples : int
+             Number of audio samples available
+
+         Returns
+         ----------
+         num_frames : int
+             Number of frames expected
+         """
+
+         # Number of frames of coefficients per chunk times the number of chunks
+         num_frames = math.ceil((num_samples / self.block_length) * self.max_window_length)
+
+         return num_frames
+
+     def get_times(self, n_frames):
+         """
+         Determine the time associated with each frame of coefficients.
+
+         Parameters
+         ----------
+         n_frames : int
+             Number of frames available
+
+         Returns
+         ----------
+         times : ndarray (T)
+             Time (seconds) associated with each frame
+         """
+
+         # Compute times as cumulative hops in seconds
+         times = np.arange(n_frames) * self.hop_length / self.sample_rate
+
+         return times
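A rough round-trip sketch of the wrapper above; the parameter values are illustrative assumptions, not necessarily the settings used for the released checkpoint:

import torch
from models.cqt_module import CQT

# Illustrative settings (assumptions, not the released configuration)
cqt = CQT(n_octaves=9, bins_per_octave=60, sample_rate=22050, secs_per_block=3)

audio = torch.randn(1, 1, 4 * 22050)          # (B x 1 x T) batch of audio
audio = cqt.pad_to_block_length(audio)        # pad to a multiple of the block length

coefficients = cqt(audio)                     # (B x 2 x F x T) real/imaginary coefficients
reconstruction = cqt.decode(coefficients)     # (B x 1 x T) resynthesized audio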
models/transcriber.py ADDED
@@ -0,0 +1,626 @@
+ from .cqt_module import CQT
+
+ import torch.nn as nn
+ import torch
+
+
+ class Transcriber(nn.Module):
+     """
+     Implements a 2D convolutional U-Net architecture based loosely on SoundStream.
+     """
+
+     def __init__(self, sample_rate, n_octaves, bins_per_octave, secs_per_block=3, latent_size=None, model_complexity=1, skip_connections=False):
+         """
+         Initialize the full autoencoder.
+
+         Parameters
+         ----------
+         sample_rate : int
+             Expected sample rate of input
+         n_octaves : int
+             Number of octaves below Nyquist frequency to represent
+         bins_per_octave : int
+             Number of frequency bins within each octave
+         secs_per_block : float
+             Number of seconds to process at once with sliCQ
+         latent_size : int or None (Optional)
+             Dimensionality of latent space
+         model_complexity : int
+             Scaling factor for number of filters and embedding sizes
+         skip_connections : bool
+             Whether to include skip connections between encoder and decoder
+         """
+
+         nn.Module.__init__(self)
+
+         self.sliCQ = CQT(n_octaves=n_octaves,
+                          bins_per_octave=bins_per_octave,
+                          sample_rate=sample_rate,
+                          secs_per_block=secs_per_block)
+
+         self.encoder = Encoder(feature_size=self.sliCQ.n_bins, latent_size=latent_size, model_complexity=model_complexity)
+         self.decoder = Decoder(feature_size=self.sliCQ.n_bins, latent_size=latent_size, model_complexity=model_complexity)
+
+         if skip_connections:
+             # Start by adding encoder features with identity weighting
+             self.skip_weights = torch.nn.Parameter(torch.ones(5))
+         else:
+             # No skip connections
+             self.skip_weights = None
+
+     def encode(self, audio):
+         """
+         Encode a batch of raw audio into latent codes.
+
+         Parameters
+         ----------
+         audio : Tensor (B x 1 x T)
+             Batch of input raw audio
+
+         Returns
+         ----------
+         latents : Tensor (B x D_lat x T)
+             Batch of latent codes
+         embeddings : list of [Tensor (B x C x H x T)]
+             Embeddings produced by encoder at each level
+         losses : dict containing
+             ...
+         """
+
+         # Compute CQT spectral features
+         coefficients = self.sliCQ(audio)
+
+         # Encode features into latent vectors
+         latents, embeddings, losses = self.encoder(coefficients)
+
+         return latents, embeddings, losses
+
+     def apply_skip_connections(self, embeddings):
+         """
+         Apply skip connections to encoder embeddings, or discard the embeddings if skip connections do not exist.
+
+         Parameters
+         ----------
+         embeddings : list of [Tensor (B x C x H x T)]
+             Embeddings produced by encoder at each level
+
+         Returns
+         ----------
+         embeddings : list of [Tensor (B x C x H x T)]
+             Encoder embeddings scaled with learnable weight
+         """
+
+         if self.skip_weights is not None:
+             # Apply a learnable weight to the embeddings for the skip connection
+             embeddings = [self.skip_weights[i] * e for i, e in enumerate(embeddings)]
+         else:
+             # Discard embeddings from encoder
+             embeddings = None
+
+         return embeddings
+
+     def decode(self, latents, embeddings=None, transcribe=False):
+         """
+         Decode a batch of latent codes into logits representing real/imaginary coefficients.
+
+         Parameters
+         ----------
+         latents : Tensor (B x D_lat x T)
+             Batch of latent codes
+         embeddings : list of [Tensor (B x C x H x T)] or None (no skip connections)
+             Embeddings produced by encoder at each level
+         transcribe : bool
+             Switch for performing transcription vs. reconstruction
+
+         Returns
+         ----------
+         coefficients : Tensor (B x 2 x F x T)
+             Batch of output logits [-∞, ∞]
+         """
+
+         # Create binary values to indicate function decoder should perform
+         indicator = (not transcribe) * torch.ones_like(latents[..., :1, :])
+
+         # Concatenate indicator to final dimension of latents
+         latents = torch.cat((latents, indicator), dim=-2)
+
+         # Decode latent vectors into real/imaginary coefficients
+         coefficients = self.decoder(latents, embeddings)
+
+         return coefficients
+
+     def transcribe(self, audio):
+         """
+         Obtain transcriptions for a batch of raw audio.
+
+         Parameters
+         ----------
+         audio : Tensor (B x 1 x T)
+             Batch of input raw audio
+
+         Returns
+         ----------
+         activations : Tensor (B x F x T)
+             Batch of multi-pitch activations [0, 1]
+         """
+
+         # Encode raw audio into latent vectors
+         latents, embeddings, _ = self.encode(audio)
+
+         # Apply skip connections if they are turned on
+         embeddings = self.apply_skip_connections(embeddings)
+
+         # Estimate pitch using transcription switch
+         coefficients = self.decode(latents, embeddings, True)
+
+         # Extract magnitude of decoded coefficients and convert to activations
+         activations = torch.nn.functional.tanh(self.sliCQ.to_magnitude(coefficients))
+
+         return activations
+
+     def reconstruct(self, audio):
+         """
+         Obtain reconstructed coefficients for a batch of raw audio.
+
+         Parameters
+         ----------
+         audio : Tensor (B x 1 x T)
+             Batch of input raw audio
+
+         Returns
+         ----------
+         reconstruction : Tensor (B x 2 x F x T)
+             Batch of reconstructed spectral coefficients
+         """
+
+         # Encode raw audio into latent vectors
+         latents, embeddings, losses = self.encode(audio)
+
+         # Apply skip connections if they are turned on
+         embeddings = self.apply_skip_connections(embeddings)
+
+         # Decode latent vectors into spectral coefficients
+         reconstruction = self.decode(latents, embeddings)
+
+         return reconstruction
+
+     def forward(self, audio, consistency=False):
+         """
+         Perform all model functions efficiently (for training/evaluation).
+
+         Parameters
+         ----------
+         audio : Tensor (B x 1 x T)
+             Batch of input raw audio
+         consistency : bool
+             Whether to perform computations for consistency loss
+
+         Returns
+         ----------
+         reconstruction : Tensor (B x 2 x F x T)
+             Batch of reconstructed spectral coefficients
+         latents : Tensor (B x D_lat x T)
+             Batch of latent codes
+         transcription : Tensor (B x 2 x F x T)
+             Batch of transcription spectral coefficients
+         transcription_rec : Tensor (B x 2 x F x T)
+             Batch of reconstructed spectral coefficients for transcription coefficients input
+         transcription_scr : Tensor (B x 2 x F x T)
+             Batch of transcription spectral coefficients for transcription coefficients input
+         losses : dict containing
+             ...
+         """
+
+         # Encode raw audio into latent vectors
+         latents, embeddings, losses = self.encode(audio)
+
+         # Apply skip connections if they are turned on
+         embeddings = self.apply_skip_connections(embeddings)
+
+         # Decode latent vectors into spectral coefficients
+         reconstruction = self.decode(latents, embeddings)
+
+         # Estimate pitch using transcription switch
+         transcription = self.decode(latents, embeddings, True)
+
+         if consistency:
+             # Encode transcription coefficients for samples with ground-truth
+             latents_trn, embeddings_trn, _ = self.encoder(transcription)
+
+             # Apply skip connections if they are turned on
+             embeddings_trn = self.apply_skip_connections(embeddings_trn)
+
+             # Attempt to reconstruct transcription spectral coefficients
+             transcription_rec = self.decode(latents_trn, embeddings_trn)
+
+             # Attempt to transcribe audio pertaining to transcription coefficients
+             transcription_scr = self.decode(latents_trn, embeddings_trn, True)
+         else:
+             # Return null for both sets of coefficients
+             transcription_rec, transcription_scr = None, None
+
+         return reconstruction, latents, transcription, transcription_rec, transcription_scr, losses
+
+
+ class Encoder(nn.Module):
+     """
+     Implements a 2D convolutional encoder.
+     """
+
+     def __init__(self, feature_size, latent_size=None, model_complexity=1):
+         """
+         Initialize the encoder.
+
+         Parameters
+         ----------
+         feature_size : int
+             Dimensionality of input features
+         latent_size : int or None (Optional)
+             Dimensionality of latent space
+         model_complexity : int
+             Scaling factor for number of filters
+         """
+
+         nn.Module.__init__(self)
+
+         channels = (2 * 2 ** (model_complexity - 1),
+                     4 * 2 ** (model_complexity - 1),
+                     8 * 2 ** (model_complexity - 1),
+                     16 * 2 ** (model_complexity - 1),
+                     32 * 2 ** (model_complexity - 1))
+
+         # Make sure all channel sizes are integers
+         channels = tuple([round(c) for c in channels])
+
+         if latent_size is None:
+             # Set default dimensionality
+             latent_size = 32 * 2 ** (model_complexity - 1)
+
+         self.convin = nn.Sequential(
+             nn.Conv2d(2, channels[0], kernel_size=3, padding='same'),
+             nn.ELU(inplace=True)
+         )
+
+         self.block1 = EncoderBlock(channels[0], channels[1], stride=2)
+         self.block2 = EncoderBlock(channels[1], channels[2], stride=2)
+         self.block3 = EncoderBlock(channels[2], channels[3], stride=2)
+         self.block4 = EncoderBlock(channels[3], channels[4], stride=2)
+
+         embedding_size = feature_size
+
+         for i in range(4):
+             # Dimensionality after strided convolutions
+             embedding_size = embedding_size // 2 - 1
+
+         self.convlat = nn.Conv2d(channels[4], latent_size, kernel_size=(embedding_size, 1))
+
+     def forward(self, coefficients):
+         """
+         Encode a batch of input spectral features.
+
+         Parameters
+         ----------
+         coefficients : Tensor (B x 2 x F x T)
+             Batch of input spectral features
+
+         Returns
+         ----------
+         latents : Tensor (B x D_lat x T)
+             Batch of latent codes
+         embeddings : list of [Tensor (B x C x H x T)]
+             Embeddings produced by encoder at each level
+         losses : dict containing
+             ...
+         """
+
+         # Initialize a list to hold features for skip connections
+         embeddings = list()
+
+         # Encode features into embeddings
+         embeddings.append(self.convin(coefficients))
+         embeddings.append(self.block1(embeddings[-1]))
+         embeddings.append(self.block2(embeddings[-1]))
+         embeddings.append(self.block3(embeddings[-1]))
+         embeddings.append(self.block4(embeddings[-1]))
+
+         # Compute latent vectors from embeddings
+         latents = self.convlat(embeddings[-1]).squeeze(-2)
+
+         # No encoder losses
+         loss = dict()
+
+         return latents, embeddings, loss
+
+
+ class Decoder(nn.Module):
+     """
+     Implements a 2D convolutional decoder.
+     """
+
+     def __init__(self, feature_size, latent_size=None, model_complexity=1):
+         """
+         Initialize the decoder.
+
+         Parameters
+         ----------
+         feature_size : int
+             Dimensionality of input features
+         latent_size : int or None (Optional)
+             Dimensionality of latent space
+         model_complexity : int
+             Scaling factor for number of filters
+         """
+
+         nn.Module.__init__(self)
+
+         channels = (32 * 2 ** (model_complexity - 1),
+                     16 * 2 ** (model_complexity - 1),
+                     8 * 2 ** (model_complexity - 1),
+                     4 * 2 ** (model_complexity - 1),
+                     2 * 2 ** (model_complexity - 1))
+
+         # Make sure all channel sizes are integers
+         channels = tuple([round(c) for c in channels])
+
+         if latent_size is None:
+             # Set default dimensionality
+             latent_size = 32 * 2 ** (model_complexity - 1)
+
+         padding = list()
+
+         embedding_size = feature_size
+
+         for i in range(4):
+             # Padding required for expected output size
+             padding.append(embedding_size % 2)
+             # Dimensionality after strided convolutions
+             embedding_size = embedding_size // 2 - 1
+
+         # Reverse order
+         padding.reverse()
+
+         self.convin = nn.Sequential(
+             nn.ConvTranspose2d(latent_size + 1, channels[0], kernel_size=(embedding_size, 1)),
+             nn.ELU(inplace=True)
+         )
+
+         self.block1 = DecoderBlock(channels[0], channels[1], stride=2, padding=padding[0])
+         self.block2 = DecoderBlock(channels[1], channels[2], stride=2, padding=padding[1])
+         self.block3 = DecoderBlock(channels[2], channels[3], stride=2, padding=padding[2])
+         self.block4 = DecoderBlock(channels[3], channels[4], stride=2, padding=padding[3])
+
+         self.convout = nn.Conv2d(channels[4], 2, kernel_size=3, padding='same')
+
+     def forward(self, latents, encoder_embeddings=None):
+         """
+         Decode a batch of input latent codes.
+
+         Parameters
+         ----------
+         latents : Tensor (B x D_lat x T)
+             Batch of latent codes
+         encoder_embeddings : list of [Tensor (B x C x H x T)] or None (no skip connections)
+             Embeddings produced by encoder at each level
+
+         Returns
+         ----------
+         output : Tensor (B x 2 x F x T)
+             Batch of output logits [-∞, ∞]
+         """
+
+         # Restore feature dimension
+         latents = latents.unsqueeze(-2)
+
+         # Process latents with decoder blocks
+         embeddings = self.convin(latents)
+
+         if encoder_embeddings is not None:
+             embeddings = embeddings + encoder_embeddings[-1]
+
+         embeddings = self.block1(embeddings)
+
+         if encoder_embeddings is not None:
+             embeddings = embeddings + encoder_embeddings[-2]
+
+         embeddings = self.block2(embeddings)
+
+         if encoder_embeddings is not None:
+             embeddings = embeddings + encoder_embeddings[-3]
+
+         embeddings = self.block3(embeddings)
+
+         if encoder_embeddings is not None:
+             embeddings = embeddings + encoder_embeddings[-4]
+
+         embeddings = self.block4(embeddings)
+
+         if encoder_embeddings is not None:
+             embeddings = embeddings + encoder_embeddings[-5]
+
+         # Decode embeddings into spectral logits
+         output = self.convout(embeddings)
+
+         return output
+
+
+ class EncoderBlock(nn.Module):
+     """
+     Implements a chain of residual convolutional blocks with progressively
+     increased dilation, followed by down-sampling via strided convolution.
+     """
+
+     def __init__(self, in_channels, out_channels, stride=2):
+         """
+         Initialize the encoder block.
+
+         Parameters
+         ----------
+         in_channels : int
+             Number of input feature channels
+         out_channels : int
+             Number of output feature channels
+         stride : int
+             Stride for the final convolutional layer
+         """
+
+         nn.Module.__init__(self)
+
+         self.block1 = ResidualConv2dBlock(in_channels, in_channels, kernel_size=3, dilation=1)
+         self.block2 = ResidualConv2dBlock(in_channels, in_channels, kernel_size=3, dilation=2)
+         self.block3 = ResidualConv2dBlock(in_channels, in_channels, kernel_size=3, dilation=3)
+
+         self.hop = stride
+         self.win = 2 * stride
+
+         self.sconv = nn.Sequential(
+             # Down-sample along frequency (height) dimension via strided convolution
+             nn.Conv2d(in_channels, out_channels, kernel_size=(self.win, 1), stride=(self.hop, 1)),
+             nn.ELU(inplace=True)
+         )
+
+     def forward(self, x):
+         """
+         Feed features through the encoder block.
+
+         Parameters
+         ----------
+         x : Tensor (B x C_in x H x W)
+             Batch of input features
+
+         Returns
+         ----------
+         y : Tensor (B x C_out x H x W)
+             Batch of corresponding output features
+         """
+
+         # Process features
+         y = self.block1(x)
+         y = self.block2(y)
+         y = self.block3(y)
+
+         # Down-sample
+         y = self.sconv(y)
+
+         return y
+
+
+ class DecoderBlock(nn.Module):
+     """
+     Implements up-sampling via transposed convolution, followed by a chain
+     of residual convolutional blocks with progressively increased dilation.
+     """
+
+     def __init__(self, in_channels, out_channels, stride=2, padding=0):
+         """
+         Initialize the decoder block.
+
+         Parameters
+         ----------
+         in_channels : int
+             Number of input feature channels
+         out_channels : int
+             Number of output feature channels
+         stride : int
+             Stride for the transposed convolution
+         padding : int
+             Number of features to pad after up-sampling
+         """
+
+         nn.Module.__init__(self)
+
+         self.hop = stride
+         self.win = 2 * stride
+
+         self.tconv = nn.Sequential(
+             # Up-sample along frequency (height) dimension via transposed convolution
+             nn.ConvTranspose2d(in_channels, out_channels, kernel_size=(self.win, 1), stride=(self.hop, 1), output_padding=(padding, 0)),
+             nn.ELU(inplace=True)
+         )
+
+         self.block1 = ResidualConv2dBlock(out_channels, out_channels, kernel_size=3, dilation=1)
+         self.block2 = ResidualConv2dBlock(out_channels, out_channels, kernel_size=3, dilation=2)
+         self.block3 = ResidualConv2dBlock(out_channels, out_channels, kernel_size=3, dilation=3)
+
+     def forward(self, x):
+         """
+         Feed features through the decoder block.
+
+         Parameters
+         ----------
+         x : Tensor (B x C_in x H x W)
+             Batch of input features
+
+         Returns
+         ----------
+         y : Tensor (B x C_out x H x W)
+             Batch of corresponding output features
+         """
+
+         # Up-sample
+         y = self.tconv(x)
+
+         # Process features
+         y = self.block1(y)
+         y = self.block2(y)
+         y = self.block3(y)
+
+         return y
+
+
+ class ResidualConv2dBlock(nn.Module):
+     """
+     Implements a 2D convolutional block with dilation, no down-sampling, and a residual connection.
+     """
+
+     def __init__(self, in_channels, out_channels, kernel_size=3, dilation=1):
+         """
+         Initialize the convolutional block.
+
+         Parameters
+         ----------
+         in_channels : int
+             Number of input feature channels
+         out_channels : int
+             Number of output feature channels
+         kernel_size : int
+             Kernel size for convolutions
+         dilation : int
+             Amount of dilation for first convolution
+         """
+
+         nn.Module.__init__(self)
+
+         self.conv1 = nn.Sequential(
+             # TODO - only dilate across frequency?
+             nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, padding='same', dilation=dilation),
+             nn.ELU(inplace=True)
+         )
+
+         self.conv2 = nn.Sequential(
+             nn.Conv2d(out_channels, out_channels, kernel_size=1),
+             nn.ELU(inplace=True)
+         )
+
+     def forward(self, x):
+         """
+         Feed features through the convolutional block.
+
+         Parameters
+         ----------
+         x : Tensor (B x C_in x H x W)
+             Batch of input features
+
+         Returns
+         ----------
+         y : Tensor (B x C_out x H x W)
+             Batch of corresponding output features
+         """
+
+         # Process features
+         y = self.conv1(x)
+         y = self.conv2(y)
+
+         # Residual connection
+         y = y + x
+
+         return y
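A short sketch of standalone use of the Transcriber; the hyperparameters below are illustrative assumptions and are not necessarily those of model-8750.pt, which app.py loads directly as a pickled module:

import torch
from models.transcriber import Transcriber

# Illustrative hyperparameters (assumptions, not the released configuration)
model = Transcriber(sample_rate=22050, n_octaves=9, bins_per_octave=60,
                    model_complexity=2, skip_connections=True)
model.eval()

audio = torch.randn(1, 1, 2 * 22050)               # (B x 1 x T) batch of raw audio
audio = model.sliCQ.pad_to_block_length(audio)

with torch.no_grad():
    activations = model.transcribe(audio)          # (B x F x T) multi-pitch activations in [0, 1]
    reconstruction = model.reconstruct(audio)      # (B x 2 x F x T) spectral coefficients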
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ git+https://github.com/audacitorch/pyharp.git#egg=pyharp
+ #git+https://github.com/sony/timbre-trap@main
+ torchaudio
+ torch
+ cqt_pytorch
+ librosa