Spaces:

OpenSound
/

Fast-GeCo

Running on Zero

App Files Files Community

anonymous9a7b committed on Nov 13, 2024

Commit

d4c980e

1 Parent(s): 8207a5c

1

Browse files

Files changed (37) hide show

app.py +214 -4
demo/item0_mix.wav +0 -0
demo/item1_mix.wav +0 -0
demo/item2_mix.wav +0 -0
demo/item3_mix.wav +0 -0
demo/item4_mix.wav +0 -0
fastgeco/.DS_Store +0 -0
fastgeco/backbones/.DS_Store +0 -0
fastgeco/backbones/__init__.py +4 -0
fastgeco/backbones/ncsnpp.py +406 -0
fastgeco/backbones/ncsnpp_utils/layers.py +662 -0
fastgeco/backbones/ncsnpp_utils/layerspp.py +274 -0
fastgeco/backbones/ncsnpp_utils/normalization.py +215 -0
fastgeco/backbones/ncsnpp_utils/utils.py +189 -0
fastgeco/backbones/shared.py +123 -0
fastgeco/model.py +258 -0
geco/.DS_Store +0 -0
geco/backbones/.DS_Store +0 -0
geco/backbones/__init__.py +4 -0
geco/backbones/ncsnpp.py +405 -0
geco/backbones/ncsnpp_utils/.DS_Store +0 -0
geco/backbones/ncsnpp_utils/layers.py +662 -0
geco/backbones/ncsnpp_utils/layerspp.py +202 -0
geco/backbones/ncsnpp_utils/normalization.py +215 -0
geco/backbones/ncsnpp_utils/utils.py +189 -0
geco/backbones/shared.py +123 -0
geco/data_module.py +258 -0
geco/model.py +255 -0
geco/sampling/__init__.py +90 -0
geco/sampling/correctors.py +60 -0
geco/sampling/predictors.py +55 -0
geco/sdes.py +205 -0
geco/util/inference.py +211 -0
geco/util/other.py +125 -0
geco/util/registry.py +34 -0
geco/util/tensors.py +16 -0
requirements.txt +20 -0

app.py CHANGED Viewed

@@ -1,7 +1,217 @@
 import gradio as gr
-def greet(name):
-    return "Hello " + name + "!!"
-demo = gr.Interface(fn=greet, inputs="text", outputs="text")
-demo.launch()

 import gradio as gr
+import spaces
+import numpy as np
+import torch
+from fastgeco.model import ScoreModel
+from geco.util.other import pad_spec
+import os
+import torchaudio
+from speechbrain.lobes.models.dual_path import Encoder, SBTransformerBlock, SBTransformerBlock, Dual_Path_Model, Decoder
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
+def load_sepformer(ckpt_path, num_spks=2):
+    """Build the SepFormer encoder / mask network / decoder and load their weights.
+
+    Args:
+        ckpt_path: Directory containing ``encoder.ckpt``, ``masknet.ckpt`` and
+            ``decoder.ckpt`` state dicts.
+        num_spks: Number of sources the mask network predicts. Defaults to 2,
+            matching the two-speaker demo. (The previous code read this from an
+            undefined ``args`` object and raised ``NameError``.)
+
+    Returns:
+        Tuple ``(encoder, masknet, decoder)``, each in eval mode on ``device``.
+    """
+    encoder = Encoder(
+        kernel_size=160,
+        out_channels=256,
+        in_channels=1
+    )
+    SBtfintra = SBTransformerBlock(
+        num_layers=8,
+        d_model=256,
+        nhead=8,
+        d_ffn=1024,
+        dropout=0,
+        use_positional_encoding=True,
+        norm_before=True,
+    )
+    SBtfinter = SBTransformerBlock(
+        num_layers=8,
+        d_model=256,
+        nhead=8,
+        d_ffn=1024,
+        dropout=0,
+        use_positional_encoding=True,
+        norm_before=True,
+    )
+    masknet = Dual_Path_Model(
+        num_spks=num_spks,
+        in_channels=256,
+        out_channels=256,
+        num_layers=2,
+        K=250,
+        intra_model=SBtfintra,
+        inter_model=SBtfinter,
+        norm='ln',
+        linear_layer_after_inter_intra=False,
+        skip_around_intra=True,
+    )
+    decoder = Decoder(
+        in_channels=256,
+        out_channels=1,
+        kernel_size=160,
+        stride=80,
+        bias=False,
+    )
+    # Load on CPU first so checkpoints saved on a GPU can be restored on any
+    # host; each module is moved to `device` right after its weights are set.
+    for name, module in (('encoder', encoder), ('masknet', masknet), ('decoder', decoder)):
+        weights = torch.load(os.path.join(ckpt_path, name + '.ckpt'), map_location='cpu')
+        module.load_state_dict(weights)
+        module.eval().to(device)
+    return encoder, masknet, decoder
+def load_fastgeco(ckpt_path):
+    """Restore the Fast-GeCo ``ScoreModel`` stored as ``fastgeco.ckpt`` under ``ckpt_path``.
+
+    The model is switched to evaluation mode via ``eval(no_ema=False)`` and
+    moved to the module-level ``device`` before being returned.
+    """
+    fastgeco = ScoreModel.load_from_checkpoint(
+        os.path.join(ckpt_path, 'fastgeco.ckpt'),
+        batch_size=1,
+        num_workers=0,
+        kwargs={'gpu': False},
+    )
+    fastgeco.eval(no_ema=False)
+    fastgeco.to(device)
+    return fastgeco
+# Directory holding encoder.ckpt, masknet.ckpt, decoder.ckpt and fastgeco.ckpt.
+ckpt_path = 'ckpts/'
+# Models are built once at import time and reused by every request.
+encoder, masknet, decoder = load_sepformer(ckpt_path)
+fastgeco_model = load_fastgeco(ckpt_path)
+# Sampling rate (Hz) that `separate` resamples its input to.
+sample_rate = 8000
+# Number of separated sources handled by `separate` and `correct`.
+num_spks = 2
+@spaces.GPU
+def separate(test_file, encoder, masknet, decoder):
+    """Run SepFormer on an audio file and return the separated sources.
+
+    Args:
+        test_file: Path of the mixture audio file, read with ``torchaudio.load``.
+        encoder, masknet, decoder: The SepFormer modules from ``load_sepformer``.
+
+    Returns:
+        Tuple ``(est_sources, mix)``: the peak-normalised source estimates
+        (squeezed; presumably ``(time, num_spks)`` -- confirm against
+        ``correct``) and the mono mixture at ``sample_rate`` with shape
+        ``(1, time)``.
+    """
+    with torch.no_grad():
+        print('Process SepFormer...')
+        mix, fs_file = torchaudio.load(test_file)
+        # Always fold the channels into one. Previously this only happened
+        # when resampling, so a multi-channel file already at the model rate
+        # was passed on with one row per channel.
+        mix = mix.to(device).mean(dim=0, keepdim=True)
+        fs_model = sample_rate
+        # resample the data if needed
+        if fs_file != fs_model:
+            print(
+                "Resampling the audio from {} Hz to {} Hz".format(
+                    fs_file, fs_model
+                )
+            )
+            tf = torchaudio.transforms.Resample(
+                orig_freq=fs_file, new_freq=fs_model
+            ).to(device)
+            mix = tf(mix)
+        # Separation: the mask network predicts one mask per speaker, each
+        # applied to its own copy of the encoded mixture.
+        mix_w = encoder(mix)
+        est_mask = masknet(mix_w)
+        mix_w = torch.stack([mix_w] * num_spks)
+        sep_h = mix_w * est_mask
+        # Decoding: one waveform per speaker, stacked on a new last axis.
+        est_sources = torch.cat(
+            [
+                decoder(sep_h[i]).unsqueeze(-1)
+                for i in range(num_spks)
+            ],
+            dim=-1,
+        )
+        # Scale every source by its own maximum absolute value along dim 1.
+        est_sources = (
+            est_sources / est_sources.abs().max(dim=1, keepdim=True)[0]
+        ).squeeze()
+        return est_sources, mix
+@spaces.GPU
+def correct(model, est_sources, mix):
+    """Refine each separated source with a single Fast-GeCo reverse step.
+
+    Args:
+        model: The Fast-GeCo score model (provides ``_stft``, ``_forward_transform``,
+            ``sde``, ``forward`` and ``to_audio``).
+        est_sources: Separated sources, indexed as ``est_sources[:, idx]`` per speaker.
+        mix: The mixture waveform that was separated.
+
+    Returns:
+        Tuple of two peak-normalised NumPy waveforms (speaker 0, speaker 1).
+    """
+    with torch.no_grad():
+        print('Process Fast-Geco...')
+        # Number of reverse steps; with N = 1 `timesteps` holds a single value.
+        N = 1
+        reverse_starting_point = 0.5
+        output = []
+        for idx in range(num_spks):
+            y = est_sources[:, idx].unsqueeze(0) # separator estimate for this speaker (the "noisy" input)
+            m = mix
+            # Crop both signals to the same number of samples.
+            min_leng = min(y.shape[-1],m.shape[-1])
+            y = y[...,:min_leng]
+            m = m[...,:min_leng]
+            T_orig = y.size(1)
+            # Scale estimate and mixture by the estimate's peak; undone after synthesis.
+            norm_factor = y.abs().max()
+            y = y / norm_factor
+            m = m / norm_factor
+            # Transformed spectrograms of the estimate (Y) and the mixture (M), with a batch axis added.
+            Y = torch.unsqueeze(model._forward_transform(model._stft(y.to(device))), 0)
+            Y = pad_spec(Y)
+            M = torch.unsqueeze(model._forward_transform(model._stft(m.to(device))), 0)
+            M = pad_spec(M)
+            timesteps = torch.linspace(reverse_starting_point, 0.03, N, device=Y.device)
+            std = model.sde._std(reverse_starting_point*torch.ones((Y.shape[0],), device=Y.device))
+            # Start the reverse process from the estimate perturbed with Gaussian noise of scale `std`.
+            z = torch.randn_like(Y)
+            X_t = Y + z * std[:, None, None, None]
+            t = timesteps[0]
+            # NOTE(review): with N == 1, timesteps[-1] is the same element as timesteps[0], so dt == t -- confirm intended.
+            dt = timesteps[-1]
+            f, g = model.sde.sde(X_t, t, Y)
+            vec_t = torch.ones(Y.shape[0], device=Y.device) * t
+            mean_x_tm1 = X_t - (f - g**2*model.forward(X_t, vec_t, Y, M, vec_t[:,None,None,None]))*dt # mean of x at step t-1, i.e. mu(x_{t-1})
+            # No noise is added after the step: the mean itself is used as the sample.
+            sample = mean_x_tm1
+            sample = sample.squeeze()
+            x_hat = model.to_audio(sample.squeeze(), T_orig)
+            # Undo the input scaling, then re-normalise the output to unit peak.
+            x_hat = x_hat * norm_factor
+            new_norm_factor = x_hat.abs().max()
+            x_hat = x_hat / new_norm_factor
+            x_hat = x_hat.squeeze().cpu().numpy()
+            output.append(x_hat)
+    return output[0], output[1]
+@spaces.GPU
+def process_audio(test_file):
+    """Gradio callback: separate the uploaded mixture into two speakers.
+
+    Args:
+        test_file: Path of the uploaded audio file.
+
+    Returns:
+        Two ``(sample_rate, waveform)`` tuples, one per separated speaker.
+    """
+    result, mix = separate(test_file, encoder, masknet, decoder)
+    audio1, audio2 = correct(fastgeco_model, result, mix)
+    # gr.Audio outputs accept (sample_rate, ndarray) tuples, not bare arrays.
+    return (sample_rate, audio1), (sample_rate, audio2)
+# CSS styling (optional): centre the main column and cap its width.
+css = """
+#col-container {
+    margin: 0 auto;
+    max-width: 1280px;
+}
+"""
+# Gradio Blocks layout
+with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
+    with gr.Column(elem_id="col-container"):
+        gr.Markdown("""
+            # Fast-GeCo: Noise-robust Speech Separation with Fast Generative Correction
+            Separate the noisy mixture speech with a generative correction method, only support 2 speakers now.
+            Learn more about 🟣**Fast-GeCo** on the [Fast-GeCo Repo](https://github.com/WangHelin1997/Fast-GeCo/).
+        """)
+        with gr.Tab("Speech Separation"):
+            # Input: upload an audio file (pre-filled with a bundled demo mixture)
+            with gr.Row():
+                gt_file_input = gr.Audio(label="Upload Audio to Separate", type="filepath", value="demo/item0_mix.wav")
+                button = gr.Button("Generate", scale=1)
+            # Outputs: one audio player per separated speaker
+            with gr.Row():
+                result1 = gr.Audio(label="Separated Audio 1", type="numpy")
+                result2 = gr.Audio(label="Separated Audio 2", type="numpy")
+            # Define the trigger and input-output linking
+            button.click(
+                fn=process_audio,
+                inputs=[
+                    gt_file_input,
+                ],
+                outputs=[result1, result2]
+            )
+    # Launch the Gradio demo (note: called while still inside the Blocks context)
+    demo.launch()

demo/item0_mix.wav ADDED Viewed

Binary file (173 kB). View file

demo/item1_mix.wav ADDED Viewed

Binary file (164 kB). View file

demo/item2_mix.wav ADDED Viewed

Binary file (103 kB). View file

demo/item3_mix.wav ADDED Viewed

Binary file (105 kB). View file

demo/item4_mix.wav ADDED Viewed

Binary file (104 kB). View file

fastgeco/.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

fastgeco/backbones/.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

fastgeco/backbones/__init__.py ADDED Viewed

	@@ -0,0 +1,4 @@

+from .shared import BackboneRegistry
+from .ncsnpp import NCSNpp
+__all__ = ['BackboneRegistry', 'NCSNpp']

fastgeco/backbones/ncsnpp.py ADDED Viewed

	@@ -0,0 +1,406 @@

+# coding=utf-8
+# Copyright 2020 The Google Research Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# pylint: skip-file
+from score_models.layers import UpsampleLayer, DownsampleLayer
+from .ncsnpp_utils import layers, layerspp, normalization
+import torch.nn as nn
+import functools
+import torch
+import numpy as np
+from .shared import BackboneRegistry
+ResnetBlockDDPM = layerspp.ResnetBlockDDPMpp
+ResnetBlockBigGAN = layerspp.ResnetBlockBigGANpp
+Combine = layerspp.Combine
+conv3x3 = layerspp.conv3x3
+conv1x1 = layerspp.conv1x1
+get_act = layers.get_act
+get_normalization = normalization.get_normalization
+default_initializer = layers.default_init
+@BackboneRegistry.register("ncsnpp")
+class NCSNpp(nn.Module):
+    """NCSN++ model, adapted from https://github.com/yang-song/score_sde repository
+
+    This variant takes three complex input channels (six real channels after
+    splitting real/imaginary parts) and emits one complex output channel.
+    All sub-modules are stored in one flat ``nn.ModuleList`` (``all_modules``);
+    ``forward`` walks that list with a running index, so the construction
+    order in ``__init__`` and the consumption order in ``forward`` must match.
+    """
+    @staticmethod
+    def add_argparse_args(parser):
+        # TODO: add additional arguments of constructor, if you wish to modify them.
+        return parser
+    def __init__(self,
+        scale_by_sigma = True,
+        nonlinearity = 'swish',
+        nf = 128,
+        ch_mult = (1, 1, 2, 2, 2, 2, 2),
+        num_res_blocks = 2,
+        attn_resolutions = (16,),
+        resamp_with_conv = True,
+        conditional = True,
+        fir = True,
+        fir_kernel = 'song',
+        skip_rescale = True,
+        resblock_type = 'biggan',
+        progressive = 'output_skip',
+        progressive_input = 'input_skip',
+        progressive_combine = 'sum',
+        init_scale = 0.,
+        fourier_scale = 16,
+        image_size = 256,
+        embedding_type = 'fourier',
+        dropout = .0,
+        **unused_kwargs
+    ):
+        super().__init__()
+        self.act = act = get_act(nonlinearity)
+        self.nf = nf = nf
+        ch_mult = ch_mult
+        self.num_res_blocks = num_res_blocks = num_res_blocks
+        self.attn_resolutions = attn_resolutions = attn_resolutions
+        dropout = dropout
+        resamp_with_conv = resamp_with_conv
+        self.num_resolutions = num_resolutions = len(ch_mult)
+        self.all_resolutions = all_resolutions = [image_size // (2 ** i) for i in range(num_resolutions)]
+        self.conditional = conditional = conditional  # noise-conditional
+        self.scale_by_sigma = scale_by_sigma
+        fir = fir
+        # NOTE(review): the `fir_kernel` argument is overwritten here, so the value passed in is ignored.
+        fir_kernel = [1, 3, 3, 1]
+        self.skip_rescale = skip_rescale = skip_rescale
+        self.resblock_type = resblock_type = resblock_type.lower()
+        self.progressive = progressive = progressive.lower()
+        self.progressive_input = progressive_input = progressive_input.lower()
+        self.embedding_type = embedding_type = embedding_type.lower()
+        init_scale = init_scale
+        assert progressive in ['none', 'output_skip', 'residual']
+        assert progressive_input in ['none', 'input_skip', 'residual']
+        assert embedding_type in ['fourier', 'positional']
+        combine_method = progressive_combine.lower()
+        combiner = functools.partial(Combine, method=combine_method)
+        num_channels = 6  # real and imaginary parts of the three complex input channels (see forward)
+        # 1x1 convolution mapping the network output to 2 channels (real, imag) of the result.
+        self.output_layer = nn.Conv2d(num_channels, 2, 1)
+        modules = []
+        # timestep/noise_level embedding
+        if embedding_type == 'fourier':
+            # Gaussian Fourier features embeddings.
+            modules.append(layerspp.GaussianFourierProjection(
+                embedding_size=nf, scale=fourier_scale
+            ))
+            embed_dim = 2 * nf
+        elif embedding_type == 'positional':
+            embed_dim = nf
+        else:
+            raise ValueError(f'embedding type {embedding_type} unknown.')
+        if conditional:
+            # Two linear layers applied to the time embedding in forward().
+            modules.append(nn.Linear(embed_dim, nf * 4))
+            modules[-1].weight.data = default_initializer()(modules[-1].weight.shape)
+            nn.init.zeros_(modules[-1].bias)
+            modules.append(nn.Linear(nf * 4, nf * 4))
+            modules[-1].weight.data = default_initializer()(modules[-1].weight.shape)
+            nn.init.zeros_(modules[-1].bias)
+        AttnBlock = functools.partial(layerspp.AttnBlockpp,
+            init_scale=init_scale, skip_rescale=skip_rescale)
+        Upsample = functools.partial(UpsampleLayer,
+            with_conv=resamp_with_conv, fir=fir, fir_kernel=fir_kernel)
+        if progressive == 'output_skip':
+            self.pyramid_upsample = UpsampleLayer(fir=fir, fir_kernel=fir_kernel, with_conv=False)
+        elif progressive == 'residual':
+            pyramid_upsample = functools.partial(UpsampleLayer, fir=fir,
+                fir_kernel=fir_kernel, with_conv=True)
+        Downsample = functools.partial(DownsampleLayer, with_conv=resamp_with_conv, fir=fir, fir_kernel=fir_kernel)
+        if progressive_input == 'input_skip':
+            self.pyramid_downsample = DownsampleLayer(fir=fir, fir_kernel=fir_kernel, with_conv=False)
+        elif progressive_input == 'residual':
+            pyramid_downsample = functools.partial(DownsampleLayer,
+                fir=fir, fir_kernel=fir_kernel, with_conv=True)
+        if resblock_type == 'ddpm':
+            ResnetBlock = functools.partial(ResnetBlockDDPM, act=act,
+                dropout=dropout, init_scale=init_scale,
+                skip_rescale=skip_rescale, temb_dim=nf * 4)
+        elif resblock_type == 'biggan':
+            ResnetBlock = functools.partial(ResnetBlockBigGAN, act=act,
+                dropout=dropout, fir=fir, fir_kernel=fir_kernel,
+                init_scale=init_scale, skip_rescale=skip_rescale, temb_dim=nf * 4)
+        else:
+            raise ValueError(f'resblock type {resblock_type} unrecognized.')
+        # Downsampling block
+        channels = num_channels
+        if progressive_input != 'none':
+            input_pyramid_ch = channels
+        modules.append(conv3x3(channels, nf))
+        # hs_c tracks the channel count of every skip connection pushed on the way down.
+        hs_c = [nf]
+        in_ch = nf
+        for i_level in range(num_resolutions):
+            # Residual blocks for this resolution
+            for i_block in range(num_res_blocks):
+                out_ch = nf * ch_mult[i_level]
+                modules.append(ResnetBlock(in_ch=in_ch, out_ch=out_ch))
+                in_ch = out_ch
+                # NOTE(review): decided from the nominal resolution here, but from the
+                # runtime tensor height in forward(); the two must agree.
+                if all_resolutions[i_level] in attn_resolutions:
+                    modules.append(AttnBlock(channels=in_ch))
+                hs_c.append(in_ch)
+            if i_level != num_resolutions - 1:
+                if resblock_type == 'ddpm':
+                    modules.append(Downsample(in_ch=in_ch))
+                else:
+                    modules.append(ResnetBlock(down=True, in_ch=in_ch))
+                if progressive_input == 'input_skip':
+                    modules.append(combiner(dim1=input_pyramid_ch, dim2=in_ch))
+                    if combine_method == 'cat':
+                        in_ch *= 2
+                elif progressive_input == 'residual':
+                    modules.append(pyramid_downsample(in_ch=input_pyramid_ch, out_ch=in_ch))
+                    input_pyramid_ch = in_ch
+                hs_c.append(in_ch)
+        # Bottleneck: ResNet block, attention block, ResNet block.
+        in_ch = hs_c[-1]
+        modules.append(ResnetBlock(in_ch=in_ch))
+        modules.append(AttnBlock(channels=in_ch))
+        modules.append(ResnetBlock(in_ch=in_ch))
+        pyramid_ch = 0
+        # Upsampling block
+        for i_level in reversed(range(num_resolutions)):
+            for i_block in range(num_res_blocks + 1):  # +1 blocks in upsampling because of skip connection from combiner (after downsampling)
+                out_ch = nf * ch_mult[i_level]
+                modules.append(ResnetBlock(in_ch=in_ch + hs_c.pop(), out_ch=out_ch))
+                in_ch = out_ch
+            if all_resolutions[i_level] in attn_resolutions:
+                modules.append(AttnBlock(channels=in_ch))
+            if progressive != 'none':
+                if i_level == num_resolutions - 1:
+                    if progressive == 'output_skip':
+                        modules.append(nn.GroupNorm(num_groups=min(in_ch // 4, 32),
+                            num_channels=in_ch, eps=1e-6))
+                        modules.append(conv3x3(in_ch, channels, init_scale=init_scale))
+                        pyramid_ch = channels
+                    elif progressive == 'residual':
+                        modules.append(nn.GroupNorm(num_groups=min(in_ch // 4, 32), num_channels=in_ch, eps=1e-6))
+                        modules.append(conv3x3(in_ch, in_ch, bias=True))
+                        pyramid_ch = in_ch
+                    else:
+                        raise ValueError(f'{progressive} is not a valid name.')
+                else:
+                    if progressive == 'output_skip':
+                        modules.append(nn.GroupNorm(num_groups=min(in_ch // 4, 32),
+                            num_channels=in_ch, eps=1e-6))
+                        modules.append(conv3x3(in_ch, channels, bias=True, init_scale=init_scale))
+                        pyramid_ch = channels
+                    elif progressive == 'residual':
+                        modules.append(pyramid_upsample(in_ch=pyramid_ch, out_ch=in_ch))
+                        pyramid_ch = in_ch
+                    else:
+                        raise ValueError(f'{progressive} is not a valid name')
+            if i_level != 0:
+                if resblock_type == 'ddpm':
+                    modules.append(Upsample(in_ch=in_ch))
+                else:
+                    modules.append(ResnetBlock(in_ch=in_ch, up=True))
+        # Every skip connection recorded on the way down must have been consumed.
+        assert not hs_c
+        if progressive != 'output_skip':
+            modules.append(nn.GroupNorm(num_groups=min(in_ch // 4, 32),
+                                                                    num_channels=in_ch, eps=1e-6))
+            modules.append(conv3x3(in_ch, channels, init_scale=init_scale))
+        self.all_modules = nn.ModuleList(modules)
+    def forward(self, x, time_cond, scale_divide):
+        """Run the U-Net.
+
+        Args:
+            x: 4-D complex tensor whose dim 1 holds (at least) three channels;
+                channels 0, 1 and 2 are split into real/imaginary parts.
+            time_cond: Conditioning value fed to the time embedding (its log is
+                taken in the 'fourier' case, so it is presumably positive).
+            scale_divide: Divisor applied to the network output before the
+                final 1x1 output convolution.
+
+        Returns:
+            Complex tensor with a singleton dim 1 added in front of the two
+            trailing spatial dims.
+        """
+        # timestep/noise_level embedding; only for continuous training
+        modules = self.all_modules
+        # Running index into the flat module list; must end at len(modules).
+        m_idx = 0
+        # Convert real and imaginary parts of the three complex input channels into six real channels
+        x = torch.cat((x[:,[0],:,:].real, x[:,[0],:,:].imag,
+                       x[:,[1],:,:].real, x[:,[1],:,:].imag,
+                       x[:,[2],:,:].real, x[:,[2],:,:].imag), dim=1)
+        if self.embedding_type == 'fourier':
+            # Gaussian Fourier features embeddings.
+            used_sigmas = time_cond
+            temb = modules[m_idx](torch.log(used_sigmas))
+            m_idx += 1
+        elif self.embedding_type == 'positional':
+            # Sinusoidal positional embeddings.
+            # NOTE(review): `self.sigmas` is never assigned in this class, so this branch
+            # would raise AttributeError as written -- confirm whether it is reachable.
+            timesteps = time_cond
+            used_sigmas = self.sigmas[time_cond.long()]
+            temb = layers.get_timestep_embedding(timesteps, self.nf)
+        else:
+            raise ValueError(f'embedding type {self.embedding_type} unknown.')
+        if self.conditional:
+            temb = modules[m_idx](temb)
+            m_idx += 1
+            temb = modules[m_idx](self.act(temb))
+            m_idx += 1
+        else:
+            temb = None
+        # Downsampling block
+        input_pyramid = None
+        if self.progressive_input != 'none':
+            input_pyramid = x
+        # Input layer: 3x3 conv from the six input channels to nf channels
+        hs = [modules[m_idx](x)]
+        m_idx += 1
+        # Down path in U-Net
+        for i_level in range(self.num_resolutions):
+            # Residual blocks for this resolution
+            for i_block in range(self.num_res_blocks):
+                h = modules[m_idx](hs[-1], temb)
+                m_idx += 1
+                # Attention layer (optional)
+                if h.shape[-2] in self.attn_resolutions: # edit: check H dim (-2) not W dim (-1)
+                    h = modules[m_idx](h)
+                    m_idx += 1
+                hs.append(h)
+            # Downsampling
+            if i_level != self.num_resolutions - 1:
+                if self.resblock_type == 'ddpm':
+                    h = modules[m_idx](hs[-1])
+                    m_idx += 1
+                else:
+                    h = modules[m_idx](hs[-1], temb)
+                    m_idx += 1
+                if self.progressive_input == 'input_skip':   # Combine h with x
+                    input_pyramid = self.pyramid_downsample(input_pyramid)
+                    h = modules[m_idx](input_pyramid, h)
+                    m_idx += 1
+                elif self.progressive_input == 'residual':
+                    input_pyramid = modules[m_idx](input_pyramid)
+                    m_idx += 1
+                    if self.skip_rescale:
+                        input_pyramid = (input_pyramid + h) / np.sqrt(2.)
+                    else:
+                        input_pyramid = input_pyramid + h
+                    h = input_pyramid
+                hs.append(h)
+        h = hs[-1] # actually equal to: h = h
+        h = modules[m_idx](h, temb)  # ResNet block
+        m_idx += 1
+        h = modules[m_idx](h)  # Attention block
+        m_idx += 1
+        h = modules[m_idx](h, temb)  # ResNet block
+        m_idx += 1
+        pyramid = None
+        # Upsampling block
+        for i_level in reversed(range(self.num_resolutions)):
+            for i_block in range(self.num_res_blocks + 1):
+                # Concatenate the matching skip connection along the channel axis.
+                h = modules[m_idx](torch.cat([h, hs.pop()], dim=1), temb)
+                m_idx += 1
+            # edit: from -1 to -2
+            if h.shape[-2] in self.attn_resolutions:
+                h = modules[m_idx](h)
+                m_idx += 1
+            if self.progressive != 'none':
+                if i_level == self.num_resolutions - 1:
+                    if self.progressive == 'output_skip':
+                        pyramid = self.act(modules[m_idx](h))  # GroupNorm
+                        m_idx += 1
+                        pyramid = modules[m_idx](pyramid)  # 3x3 conv down to the input channel count
+                        m_idx += 1
+                    elif self.progressive == 'residual':
+                        pyramid = self.act(modules[m_idx](h))
+                        m_idx += 1
+                        pyramid = modules[m_idx](pyramid)
+                        m_idx += 1
+                    else:
+                        raise ValueError(f'{self.progressive} is not a valid name.')
+                else:
+                    if self.progressive == 'output_skip':
+                        pyramid = self.pyramid_upsample(pyramid)  # Upsample
+                        pyramid_h = self.act(modules[m_idx](h))  # GroupNorm
+                        m_idx += 1
+                        pyramid_h = modules[m_idx](pyramid_h)
+                        m_idx += 1
+                        pyramid = pyramid + pyramid_h
+                    elif self.progressive == 'residual':
+                        pyramid = modules[m_idx](pyramid)
+                        m_idx += 1
+                        if self.skip_rescale:
+                            pyramid = (pyramid + h) / np.sqrt(2.)
+                        else:
+                            pyramid = pyramid + h
+                        h = pyramid
+                    else:
+                        raise ValueError(f'{self.progressive} is not a valid name')
+            # Upsampling Layer
+            if i_level != 0:
+                if self.resblock_type == 'ddpm':
+                    h = modules[m_idx](h)
+                    m_idx += 1
+                else:
+                    h = modules[m_idx](h, temb)  # Upsampling
+                    m_idx += 1
+        assert not hs
+        if self.progressive == 'output_skip':
+            h = pyramid
+        else:
+            h = self.act(modules[m_idx](h))
+            m_idx += 1
+            h = modules[m_idx](h)
+            m_idx += 1
+        # Every module built in __init__ must have been used exactly once.
+        assert m_idx == len(modules), "Implementation error"
+        h = h / scale_divide
+        # h = h / used_sigmas[:, None, None, None]
+        # Convert back to complex number: reduce to 2 channels, move them last, reinterpret as (real, imag)
+        h = self.output_layer(h)
+        h = torch.permute(h, (0, 2, 3, 1)).contiguous()
+        h = torch.view_as_complex(h)[:,None, :, :]
+        return h

fastgeco/backbones/ncsnpp_utils/layers.py ADDED Viewed

	@@ -0,0 +1,662 @@

+# coding=utf-8
+# Copyright 2020 The Google Research Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# pylint: skip-file
+"""Common layers for defining score networks.
+"""
+import math
+import string
+from functools import partial
+import torch.nn as nn
+import torch
+import torch.nn.functional as F
+import numpy as np
+from .normalization import ConditionalInstanceNorm2dPlus
+def get_act(config):
+  """Return a new activation module for the name given in `config`.
+
+  Recognised names are 'elu', 'relu', 'lrelu' (LeakyReLU with slope 0.2) and
+  'swish' (SiLU); any other value raises NotImplementedError.
+  """
+  factories = (
+    ('elu', nn.ELU),
+    ('relu', nn.ReLU),
+    ('lrelu', lambda: nn.LeakyReLU(negative_slope=0.2)),
+    ('swish', nn.SiLU),
+  )
+  for name, make_act in factories:
+    if config == name:
+      return make_act()
+  raise NotImplementedError('activation function does not exist!')
+def ncsn_conv1x1(in_planes, out_planes, stride=1, bias=True, dilation=1, init_scale=1., padding=0):
+  """1x1 convolution. Same as NCSNv1/v2.
+
+  The layer keeps PyTorch's default initialisation, scaled by `init_scale`
+  (an `init_scale` of 0 is replaced by 1e-10). Returns the `nn.Conv2d`.
+  """
+  conv = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=bias, dilation=dilation,
+                   padding=padding)
+  init_scale = 1e-10 if init_scale == 0 else init_scale
+  conv.weight.data *= init_scale
+  # With bias=False the layer has no bias tensor (conv.bias is None).
+  if conv.bias is not None:
+    conv.bias.data *= init_scale
+  return conv
+def variance_scaling(scale, mode, distribution,
+                     in_axis=1, out_axis=0,
+                     dtype=torch.float32,
+                     device='cpu'):
+  """Build a variance-scaling initializer (ported from JAX).
+
+  Returns a function `init(shape, dtype, device)` that draws a tensor whose
+  variance is `scale / fan`, where `fan` is selected by `mode` ('fan_in',
+  'fan_out' or 'fan_avg') and the samples follow `distribution` ('normal'
+  or 'uniform'). Invalid `mode` / `distribution` values raise ValueError
+  when `init` is called.
+  """
+  def init(shape, dtype=dtype, device=device):
+    # Size of everything that is neither the input nor the output axis.
+    receptive_field_size = np.prod(shape) / shape[in_axis] / shape[out_axis]
+    fan_in = shape[in_axis] * receptive_field_size
+    fan_out = shape[out_axis] * receptive_field_size
+    if mode == "fan_in":
+      variance = scale / fan_in
+    elif mode == "fan_out":
+      variance = scale / fan_out
+    elif mode == "fan_avg":
+      variance = scale / ((fan_in + fan_out) / 2)
+    else:
+      raise ValueError(
+        "invalid mode for variance scaling initializer: {}".format(mode))
+    if distribution == "normal":
+      gaussian = torch.randn(*shape, dtype=dtype, device=device)
+      return gaussian * np.sqrt(variance)
+    if distribution == "uniform":
+      # Uniform on [-sqrt(3 * variance), sqrt(3 * variance)].
+      centered = torch.rand(*shape, dtype=dtype, device=device) * 2. - 1.
+      return centered * np.sqrt(3 * variance)
+    raise ValueError("invalid distribution for variance scaling initializer")
+  return init
+def default_init(scale=1.):
+  """Return the DDPM-style initializer: uniform variance scaling over 'fan_avg'.
+
+  A `scale` of 0 is replaced by 1e-10.
+  """
+  if scale == 0:
+    scale = 1e-10
+  return variance_scaling(scale, 'fan_avg', 'uniform')
+class Dense(nn.Module):
+  """Placeholder for a linear layer with `default_init`.
+
+  As written the class defines no parameters and no `forward`; it only
+  runs the `nn.Module` constructor.
+  """
+  def __init__(self):
+    super().__init__()
+def ddpm_conv1x1(in_planes, out_planes, stride=1, bias=True, init_scale=1., padding=0):
+  """1x1 convolution with DDPM initialization.
+
+  Weights are drawn with `default_init(init_scale)`; the bias, when the
+  layer has one, is zeroed. Returns the `nn.Conv2d`.
+  """
+  conv = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, padding=padding, bias=bias)
+  conv.weight.data = default_init(init_scale)(conv.weight.data.shape)
+  # With bias=False the layer has no bias tensor (conv.bias is None).
+  if conv.bias is not None:
+    nn.init.zeros_(conv.bias)
+  return conv
+def ncsn_conv3x3(in_planes, out_planes, stride=1, bias=True, dilation=1, init_scale=1., padding=1):
+  """3x3 convolution with PyTorch initialization. Same as NCSNv1/NCSNv2.
+
+  The default initialisation is scaled by `init_scale` (an `init_scale` of
+  0 is replaced by 1e-10). Returns the `nn.Conv2d`.
+  """
+  init_scale = 1e-10 if init_scale == 0 else init_scale
+  conv = nn.Conv2d(in_planes, out_planes, stride=stride, bias=bias,
+                   dilation=dilation, padding=padding, kernel_size=3)
+  conv.weight.data *= init_scale
+  # Callers in this file pass bias=False, in which case conv.bias is None.
+  if conv.bias is not None:
+    conv.bias.data *= init_scale
+  return conv
+def ddpm_conv3x3(in_planes, out_planes, stride=1, bias=True, dilation=1, init_scale=1., padding=1):
+  """3x3 convolution with DDPM initialization.
+
+  Weights are drawn with `default_init(init_scale)`; the bias, when the
+  layer has one, is zeroed. Returns the `nn.Conv2d`.
+  """
+  conv = nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, padding=padding,
+                   dilation=dilation, bias=bias)
+  conv.weight.data = default_init(init_scale)(conv.weight.data.shape)
+  # With bias=False the layer has no bias tensor (conv.bias is None).
+  if conv.bias is not None:
+    nn.init.zeros_(conv.bias)
+  return conv
+  ###########################################################################
+  # Functions below are ported over from the NCSNv1/NCSNv2 codebase:
+  # https://github.com/ermongroup/ncsn
+  # https://github.com/ermongroup/ncsnv2
+  ###########################################################################
+class CRPBlock(nn.Module):
+  """Stack of pool -> 3x3 conv stages whose outputs are accumulated onto the input.
+
+  `maxpool` selects 5x5 max pooling (default) or 5x5 average pooling; both use
+  stride 1 and padding 2.
+  NOTE(review): the convs are built with bias=False -- verify `ncsn_conv3x3`
+  accepts that.
+  """
+  def __init__(self, features, n_stages, act=nn.ReLU(), maxpool=True):
+    super().__init__()
+    self.convs = nn.ModuleList(
+      [ncsn_conv3x3(features, features, stride=1, bias=False) for _ in range(n_stages)]
+    )
+    self.n_stages = n_stages
+    pool_cls = nn.MaxPool2d if maxpool else nn.AvgPool2d
+    self.pool = pool_cls(kernel_size=5, stride=1, padding=2)
+    self.act = act
+  def forward(self, x):
+    x = self.act(x)
+    branch = x
+    for stage in range(self.n_stages):
+      # Each stage feeds the next and is also added to the running output.
+      branch = self.convs[stage](self.pool(branch))
+      x = branch + x
+    return x
+class CondCRPBlock(nn.Module):
+  """Conditional variant of the pool/conv residual chain.
+
+  Every stage applies a conditional normalizer (called with the extra input
+  `y`), 5x5 average pooling and a 3x3 conv, and adds the result to the output.
+  """
+  def __init__(self, features, n_stages, num_classes, normalizer, act=nn.ReLU()):
+    super().__init__()
+    self.convs = nn.ModuleList()
+    self.norms = nn.ModuleList()
+    self.normalizer = normalizer
+    # Norm and conv are created alternately, one pair per stage.
+    for _ in range(n_stages):
+      stage_norm = normalizer(features, num_classes, bias=True)
+      self.norms.append(stage_norm)
+      stage_conv = ncsn_conv3x3(features, features, stride=1, bias=False)
+      self.convs.append(stage_conv)
+    self.n_stages = n_stages
+    self.pool = nn.AvgPool2d(kernel_size=5, stride=1, padding=2)
+    self.act = act
+  def forward(self, x, y):
+    x = self.act(x)
+    branch = x
+    for stage in range(self.n_stages):
+      branch = self.convs[stage](self.pool(self.norms[stage](branch, y)))
+      x = branch + x
+    return x
+class RCUBlock(nn.Module):
+  def __init__(self, features, n_blocks, n_stages, act=nn.ReLU()):
+    super().__init__()
+    for i in range(n_blocks):
+      for j in range(n_stages):
+        setattr(self, '{}_{}_conv'.format(i + 1, j + 1), ncsn_conv3x3(features, features, stride=1, bias=False))
+    self.stride = 1
+    self.n_blocks = n_blocks
+    self.n_stages = n_stages
+    self.act = act
+  def forward(self, x):
+    for i in range(self.n_blocks):
+      residual = x
+      for j in range(self.n_stages):
+        x = self.act(x)
+        x = getattr(self, '{}_{}_conv'.format(i + 1, j + 1))(x)
+      x += residual
+    return x
+class CondRCUBlock(nn.Module):
+  def __init__(self, features, n_blocks, n_stages, num_classes, normalizer, act=nn.ReLU()):
+    super().__init__()
+    for i in range(n_blocks):
+      for j in range(n_stages):
+        setattr(self, '{}_{}_norm'.format(i + 1, j + 1), normalizer(features, num_classes, bias=True))
+        setattr(self, '{}_{}_conv'.format(i + 1, j + 1), ncsn_conv3x3(features, features, stride=1, bias=False))
+    self.stride = 1
+    self.n_blocks = n_blocks
+    self.n_stages = n_stages
+    self.act = act
+    self.normalizer = normalizer
+  def forward(self, x, y):
+    for i in range(self.n_blocks):
+      residual = x
+      for j in range(self.n_stages):
+        x = getattr(self, '{}_{}_norm'.format(i + 1, j + 1))(x, y)
+        x = self.act(x)
+        x = getattr(self, '{}_{}_conv'.format(i + 1, j + 1))(x)
+      x += residual
+    return x
+class MSFBlock(nn.Module):
+  def __init__(self, in_planes, features):
+    super().__init__()
+    assert isinstance(in_planes, list) or isinstance(in_planes, tuple)
+    self.convs = nn.ModuleList()
+    self.features = features
+    for i in range(len(in_planes)):
+      self.convs.append(ncsn_conv3x3(in_planes[i], features, stride=1, bias=True))
+  def forward(self, xs, shape):
+    sums = torch.zeros(xs[0].shape[0], self.features, *shape, device=xs[0].device)
+    for i in range(len(self.convs)):
+      h = self.convs[i](xs[i])
+      h = F.interpolate(h, size=shape, mode='bilinear', align_corners=True)
+      sums += h
+    return sums
+class CondMSFBlock(nn.Module):
+  def __init__(self, in_planes, features, num_classes, normalizer):
+    super().__init__()
+    assert isinstance(in_planes, list) or isinstance(in_planes, tuple)
+    self.convs = nn.ModuleList()
+    self.norms = nn.ModuleList()
+    self.features = features
+    self.normalizer = normalizer
+    for i in range(len(in_planes)):
+      self.convs.append(ncsn_conv3x3(in_planes[i], features, stride=1, bias=True))
+      self.norms.append(normalizer(in_planes[i], num_classes, bias=True))
+  def forward(self, xs, y, shape):
+    sums = torch.zeros(xs[0].shape[0], self.features, *shape, device=xs[0].device)
+    for i in range(len(self.convs)):
+      h = self.norms[i](xs[i], y)
+      h = self.convs[i](h)
+      h = F.interpolate(h, size=shape, mode='bilinear', align_corners=True)
+      sums += h
+    return sums
+class RefineBlock(nn.Module):
+  def __init__(self, in_planes, features, act=nn.ReLU(), start=False, end=False, maxpool=True):
+    super().__init__()
+    assert isinstance(in_planes, tuple) or isinstance(in_planes, list)
+    self.n_blocks = n_blocks = len(in_planes)
+    self.adapt_convs = nn.ModuleList()
+    for i in range(n_blocks):
+      self.adapt_convs.append(RCUBlock(in_planes[i], 2, 2, act))
+    self.output_convs = RCUBlock(features, 3 if end else 1, 2, act)
+    if not start:
+      self.msf = MSFBlock(in_planes, features)
+    self.crp = CRPBlock(features, 2, act, maxpool=maxpool)
+  def forward(self, xs, output_shape):
+    assert isinstance(xs, tuple) or isinstance(xs, list)
+    hs = []
+    for i in range(len(xs)):
+      h = self.adapt_convs[i](xs[i])
+      hs.append(h)
+    if self.n_blocks > 1:
+      h = self.msf(hs, output_shape)
+    else:
+      h = hs[0]
+    h = self.crp(h)
+    h = self.output_convs(h)
+    return h
+class CondRefineBlock(nn.Module):
+  def __init__(self, in_planes, features, num_classes, normalizer, act=nn.ReLU(), start=False, end=False):
+    super().__init__()
+    assert isinstance(in_planes, tuple) or isinstance(in_planes, list)
+    self.n_blocks = n_blocks = len(in_planes)
+    self.adapt_convs = nn.ModuleList()
+    for i in range(n_blocks):
+      self.adapt_convs.append(
+        CondRCUBlock(in_planes[i], 2, 2, num_classes, normalizer, act)
+      )
+    self.output_convs = CondRCUBlock(features, 3 if end else 1, 2, num_classes, normalizer, act)
+    if not start:
+      self.msf = CondMSFBlock(in_planes, features, num_classes, normalizer)
+    self.crp = CondCRPBlock(features, 2, num_classes, normalizer, act)
+  def forward(self, xs, y, output_shape):
+    assert isinstance(xs, tuple) or isinstance(xs, list)
+    hs = []
+    for i in range(len(xs)):
+      h = self.adapt_convs[i](xs[i], y)
+      hs.append(h)
+    if self.n_blocks > 1:
+      h = self.msf(hs, y, output_shape)
+    else:
+      h = hs[0]
+    h = self.crp(h, y)
+    h = self.output_convs(h, y)
+    return h
+class ConvMeanPool(nn.Module):
+  def __init__(self, input_dim, output_dim, kernel_size=3, biases=True, adjust_padding=False):
+    super().__init__()
+    if not adjust_padding:
+      conv = nn.Conv2d(input_dim, output_dim, kernel_size, stride=1, padding=kernel_size // 2, bias=biases)
+      self.conv = conv
+    else:
+      conv = nn.Conv2d(input_dim, output_dim, kernel_size, stride=1, padding=kernel_size // 2, bias=biases)
+      self.conv = nn.Sequential(
+        nn.ZeroPad2d((1, 0, 1, 0)),
+        conv
+      )
+  def forward(self, inputs):
+    output = self.conv(inputs)
+    output = sum([output[:, :, ::2, ::2], output[:, :, 1::2, ::2],
+                  output[:, :, ::2, 1::2], output[:, :, 1::2, 1::2]]) / 4.
+    return output
+class MeanPoolConv(nn.Module):
+  def __init__(self, input_dim, output_dim, kernel_size=3, biases=True):
+    super().__init__()
+    self.conv = nn.Conv2d(input_dim, output_dim, kernel_size, stride=1, padding=kernel_size // 2, bias=biases)
+  def forward(self, inputs):
+    output = inputs
+    output = sum([output[:, :, ::2, ::2], output[:, :, 1::2, ::2],
+                  output[:, :, ::2, 1::2], output[:, :, 1::2, 1::2]]) / 4.
+    return self.conv(output)
+class UpsampleConv(nn.Module):
+  def __init__(self, input_dim, output_dim, kernel_size=3, biases=True):
+    super().__init__()
+    self.conv = nn.Conv2d(input_dim, output_dim, kernel_size, stride=1, padding=kernel_size // 2, bias=biases)
+    self.pixelshuffle = nn.PixelShuffle(upscale_factor=2)
+  def forward(self, inputs):
+    output = inputs
+    output = torch.cat([output, output, output, output], dim=1)
+    output = self.pixelshuffle(output)
+    return self.conv(output)
+class ConditionalResidualBlock(nn.Module):
+  def __init__(self, input_dim, output_dim, num_classes, resample=1, act=nn.ELU(),
+               normalization=ConditionalInstanceNorm2dPlus, adjust_padding=False, dilation=None):
+    super().__init__()
+    self.non_linearity = act
+    self.input_dim = input_dim
+    self.output_dim = output_dim
+    self.resample = resample
+    self.normalization = normalization
+    if resample == 'down':
+      if dilation > 1:
+        self.conv1 = ncsn_conv3x3(input_dim, input_dim, dilation=dilation)
+        self.normalize2 = normalization(input_dim, num_classes)
+        self.conv2 = ncsn_conv3x3(input_dim, output_dim, dilation=dilation)
+        conv_shortcut = partial(ncsn_conv3x3, dilation=dilation)
+      else:
+        self.conv1 = ncsn_conv3x3(input_dim, input_dim)
+        self.normalize2 = normalization(input_dim, num_classes)
+        self.conv2 = ConvMeanPool(input_dim, output_dim, 3, adjust_padding=adjust_padding)
+        conv_shortcut = partial(ConvMeanPool, kernel_size=1, adjust_padding=adjust_padding)
+    elif resample is None:
+      if dilation > 1:
+        conv_shortcut = partial(ncsn_conv3x3, dilation=dilation)
+        self.conv1 = ncsn_conv3x3(input_dim, output_dim, dilation=dilation)
+        self.normalize2 = normalization(output_dim, num_classes)
+        self.conv2 = ncsn_conv3x3(output_dim, output_dim, dilation=dilation)
+      else:
+        conv_shortcut = nn.Conv2d
+        self.conv1 = ncsn_conv3x3(input_dim, output_dim)
+        self.normalize2 = normalization(output_dim, num_classes)
+        self.conv2 = ncsn_conv3x3(output_dim, output_dim)
+    else:
+      raise Exception('invalid resample value')
+    if output_dim != input_dim or resample is not None:
+      self.shortcut = conv_shortcut(input_dim, output_dim)
+    self.normalize1 = normalization(input_dim, num_classes)
+  def forward(self, x, y):
+    output = self.normalize1(x, y)
+    output = self.non_linearity(output)
+    output = self.conv1(output)
+    output = self.normalize2(output, y)
+    output = self.non_linearity(output)
+    output = self.conv2(output)
+    if self.output_dim == self.input_dim and self.resample is None:
+      shortcut = x
+    else:
+      shortcut = self.shortcut(x)
+    return shortcut + output
+class ResidualBlock(nn.Module):
+  def __init__(self, input_dim, output_dim, resample=None, act=nn.ELU(),
+               normalization=nn.InstanceNorm2d, adjust_padding=False, dilation=1):
+    super().__init__()
+    self.non_linearity = act
+    self.input_dim = input_dim
+    self.output_dim = output_dim
+    self.resample = resample
+    self.normalization = normalization
+    if resample == 'down':
+      if dilation > 1:
+        self.conv1 = ncsn_conv3x3(input_dim, input_dim, dilation=dilation)
+        self.normalize2 = normalization(input_dim)
+        self.conv2 = ncsn_conv3x3(input_dim, output_dim, dilation=dilation)
+        conv_shortcut = partial(ncsn_conv3x3, dilation=dilation)
+      else:
+        self.conv1 = ncsn_conv3x3(input_dim, input_dim)
+        self.normalize2 = normalization(input_dim)
+        self.conv2 = ConvMeanPool(input_dim, output_dim, 3, adjust_padding=adjust_padding)
+        conv_shortcut = partial(ConvMeanPool, kernel_size=1, adjust_padding=adjust_padding)
+    elif resample is None:
+      if dilation > 1:
+        conv_shortcut = partial(ncsn_conv3x3, dilation=dilation)
+        self.conv1 = ncsn_conv3x3(input_dim, output_dim, dilation=dilation)
+        self.normalize2 = normalization(output_dim)
+        self.conv2 = ncsn_conv3x3(output_dim, output_dim, dilation=dilation)
+      else:
+        # conv_shortcut = nn.Conv2d ### Something wierd here.
+        conv_shortcut = partial(ncsn_conv1x1)
+        self.conv1 = ncsn_conv3x3(input_dim, output_dim)
+        self.normalize2 = normalization(output_dim)
+        self.conv2 = ncsn_conv3x3(output_dim, output_dim)
+    else:
+      raise Exception('invalid resample value')
+    if output_dim != input_dim or resample is not None:
+      self.shortcut = conv_shortcut(input_dim, output_dim)
+    self.normalize1 = normalization(input_dim)
+  def forward(self, x):
+    output = self.normalize1(x)
+    output = self.non_linearity(output)
+    output = self.conv1(output)
+    output = self.normalize2(output)
+    output = self.non_linearity(output)
+    output = self.conv2(output)
+    if self.output_dim == self.input_dim and self.resample is None:
+      shortcut = x
+    else:
+      shortcut = self.shortcut(x)
+    return shortcut + output
+###########################################################################
+# Functions below are ported over from the DDPM codebase:
+#  https://github.com/hojonathanho/diffusion/blob/master/diffusion_tf/nn.py
+###########################################################################
+def get_timestep_embedding(timesteps, embedding_dim, max_positions=10000):
+  assert len(timesteps.shape) == 1  # and timesteps.dtype == tf.int32
+  half_dim = embedding_dim // 2
+  # magic number 10000 is from transformers
+  emb = math.log(max_positions) / (half_dim - 1)
+  # emb = math.log(2.) / (half_dim - 1)
+  emb = torch.exp(torch.arange(half_dim, dtype=torch.float32, device=timesteps.device) * -emb)
+  # emb = tf.range(num_embeddings, dtype=jnp.float32)[:, None] * emb[None, :]
+  # emb = tf.cast(timesteps, dtype=jnp.float32)[:, None] * emb[None, :]
+  emb = timesteps.float()[:, None] * emb[None, :]
+  emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
+  if embedding_dim % 2 == 1:  # zero pad
+    emb = F.pad(emb, (0, 1), mode='constant')
+  assert emb.shape == (timesteps.shape[0], embedding_dim)
+  return emb
+def _einsum(a, b, c, x, y):
+  einsum_str = '{},{}->{}'.format(''.join(a), ''.join(b), ''.join(c))
+  return torch.einsum(einsum_str, x, y)
+def contract_inner(x, y):
+  """tensordot(x, y, 1)."""
+  x_chars = list(string.ascii_lowercase[:len(x.shape)])
+  y_chars = list(string.ascii_lowercase[len(x.shape):len(y.shape) + len(x.shape)])
+  y_chars[0] = x_chars[-1]  # first axis of y and last of x get summed
+  out_chars = x_chars[:-1] + y_chars[1:]
+  return _einsum(x_chars, y_chars, out_chars, x, y)
+class NIN(nn.Module):
+  def __init__(self, in_dim, num_units, init_scale=0.1):
+    super().__init__()
+    self.W = nn.Parameter(default_init(scale=init_scale)((in_dim, num_units)), requires_grad=True)
+    self.b = nn.Parameter(torch.zeros(num_units), requires_grad=True)
+  def forward(self, x):
+    x = x.permute(0, 2, 3, 1)
+    y = contract_inner(x, self.W) + self.b
+    return y.permute(0, 3, 1, 2)
+class AttnBlock(nn.Module):
+  """Channel-wise self-attention block."""
+  def __init__(self, channels):
+    super().__init__()
+    self.GroupNorm_0 = nn.GroupNorm(num_groups=32, num_channels=channels, eps=1e-6)
+    self.NIN_0 = NIN(channels, channels)
+    self.NIN_1 = NIN(channels, channels)
+    self.NIN_2 = NIN(channels, channels)
+    self.NIN_3 = NIN(channels, channels, init_scale=0.)
+  def forward(self, x):
+    B, C, H, W = x.shape
+    h = self.GroupNorm_0(x)
+    q = self.NIN_0(h)
+    k = self.NIN_1(h)
+    v = self.NIN_2(h)
+    w = torch.einsum('bchw,bcij->bhwij', q, k) * (int(C) ** (-0.5))
+    w = torch.reshape(w, (B, H, W, H * W))
+    w = F.softmax(w, dim=-1)
+    w = torch.reshape(w, (B, H, W, H, W))
+    h = torch.einsum('bhwij,bcij->bchw', w, v)
+    h = self.NIN_3(h)
+    return x + h
+class Upsample(nn.Module):
+  def __init__(self, channels, with_conv=False):
+    super().__init__()
+    if with_conv:
+      self.Conv_0 = ddpm_conv3x3(channels, channels)
+    self.with_conv = with_conv
+  def forward(self, x):
+    B, C, H, W = x.shape
+    h = F.interpolate(x, (H * 2, W * 2), mode='nearest')
+    if self.with_conv:
+      h = self.Conv_0(h)
+    return h
+class Downsample(nn.Module):
+  def __init__(self, channels, with_conv=False):
+    super().__init__()
+    if with_conv:
+      self.Conv_0 = ddpm_conv3x3(channels, channels, stride=2, padding=0)
+    self.with_conv = with_conv
+  def forward(self, x):
+    B, C, H, W = x.shape
+    # Emulate 'SAME' padding
+    if self.with_conv:
+      x = F.pad(x, (0, 1, 0, 1))
+      x = self.Conv_0(x)
+    else:
+      x = F.avg_pool2d(x, kernel_size=2, stride=2, padding=0)
+    assert x.shape == (B, C, H // 2, W // 2)
+    return x
+class ResnetBlockDDPM(nn.Module):
+  """The ResNet Blocks used in DDPM."""
+  def __init__(self, act, in_ch, out_ch=None, temb_dim=None, conv_shortcut=False, dropout=0.1):
+    super().__init__()
+    if out_ch is None:
+      out_ch = in_ch
+    self.GroupNorm_0 = nn.GroupNorm(num_groups=32, num_channels=in_ch, eps=1e-6)
+    self.act = act
+    self.Conv_0 = ddpm_conv3x3(in_ch, out_ch)
+    if temb_dim is not None:
+      self.Dense_0 = nn.Linear(temb_dim, out_ch)
+      self.Dense_0.weight.data = default_init()(self.Dense_0.weight.data.shape)
+      nn.init.zeros_(self.Dense_0.bias)
+    self.GroupNorm_1 = nn.GroupNorm(num_groups=32, num_channels=out_ch, eps=1e-6)
+    self.Dropout_0 = nn.Dropout(dropout)
+    self.Conv_1 = ddpm_conv3x3(out_ch, out_ch, init_scale=0.)
+    if in_ch != out_ch:
+      if conv_shortcut:
+        self.Conv_2 = ddpm_conv3x3(in_ch, out_ch)
+      else:
+        self.NIN_0 = NIN(in_ch, out_ch)
+    self.out_ch = out_ch
+    self.in_ch = in_ch
+    self.conv_shortcut = conv_shortcut
+  def forward(self, x, temb=None):
+    B, C, H, W = x.shape
+    assert C == self.in_ch
+    out_ch = self.out_ch if self.out_ch else self.in_ch
+    h = self.act(self.GroupNorm_0(x))
+    h = self.Conv_0(h)
+    # Add bias to each feature map conditioned on the time embedding
+    if temb is not None:
+      h += self.Dense_0(self.act(temb))[:, :, None, None]
+    h = self.act(self.GroupNorm_1(h))
+    h = self.Dropout_0(h)
+    h = self.Conv_1(h)
+    if C != out_ch:
+      if self.conv_shortcut:
+        x = self.Conv_2(x)
+      else:
+        x = self.NIN_0(x)
+    return x + h

fastgeco/backbones/ncsnpp_utils/layerspp.py ADDED Viewed

	@@ -0,0 +1,274 @@

+# coding=utf-8
+# Copyright 2020 The Google Research Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# pylint: skip-file
+"""Layers for defining NCSN++.
+"""
+from . import layers
+import score_models.layers.up_or_downsampling2d as up_or_down_sampling
+import torch.nn as nn
+import torch
+import torch.nn.functional as F
+import numpy as np
+# Module-level aliases for helpers from the sibling `layers` module; the classes
+# below refer to them by these shorter names.
+conv1x1 = layers.ddpm_conv1x1
+conv3x3 = layers.ddpm_conv3x3
+NIN = layers.NIN
+default_init = layers.default_init
+class GaussianFourierProjection(nn.Module):
+  """Gaussian Fourier embeddings for noise levels."""
+  def __init__(self, embedding_size=256, scale=1.0):
+    super().__init__()
+    self.W = nn.Parameter(torch.randn(embedding_size) * scale, requires_grad=False)
+  def forward(self, x):
+    x_proj = x[:, None] * self.W[None, :] * 2 * np.pi
+    return torch.cat([torch.sin(x_proj), torch.cos(x_proj)], dim=-1)
+class Combine(nn.Module):
+  """Combine information from skip connections."""
+  def __init__(self, dim1, dim2, method='cat'):
+    super().__init__()
+    self.Conv_0 = conv1x1(dim1, dim2)
+    self.method = method
+  def forward(self, x, y):
+    h = self.Conv_0(x)
+    if self.method == 'cat':
+      return torch.cat([h, y], dim=1)
+    elif self.method == 'sum':
+      return h + y
+    else:
+      raise ValueError(f'Method {self.method} not recognized.')
+class AttnBlockpp(nn.Module):
+  """Channel-wise self-attention block. Modified from DDPM."""
+  def __init__(self, channels, skip_rescale=False, init_scale=0.):
+    super().__init__()
+    self.GroupNorm_0 = nn.GroupNorm(num_groups=min(channels // 4, 32), num_channels=channels,
+                                  eps=1e-6)
+    self.NIN_0 = NIN(channels, channels)
+    self.NIN_1 = NIN(channels, channels)
+    self.NIN_2 = NIN(channels, channels)
+    self.NIN_3 = NIN(channels, channels, init_scale=init_scale)
+    self.skip_rescale = skip_rescale
+  def forward(self, x):
+    B, C, H, W = x.shape
+    h = self.GroupNorm_0(x)
+    q = self.NIN_0(h)
+    k = self.NIN_1(h)
+    v = self.NIN_2(h)
+    w = torch.einsum('bchw,bcij->bhwij', q, k) * (int(C) ** (-0.5))
+    w = torch.reshape(w, (B, H, W, H * W))
+    w = F.softmax(w, dim=-1)
+    w = torch.reshape(w, (B, H, W, H, W))
+    h = torch.einsum('bhwij,bcij->bchw', w, v)
+    h = self.NIN_3(h)
+    if not self.skip_rescale:
+      return x + h
+    else:
+      return (x + h) / np.sqrt(2.)
+# class Upsample(nn.Module):
+#   def __init__(self, in_ch=None, out_ch=None, with_conv=False, fir=False,
+#                fir_kernel=(1, 3, 3, 1)):
+#     super().__init__()
+#     out_ch = out_ch if out_ch else in_ch
+#     if not fir:
+#       if with_conv:
+#         self.Conv_0 = conv3x3(in_ch, out_ch)
+#     else:
+#       if with_conv:
+#         self.Conv2d_0 = up_or_down_sampling.Conv2d(in_ch, out_ch,
+#                                                  kernel=3, up=True,
+#                                                  resample_kernel=fir_kernel,
+#                                                  use_bias=True,
+#                                                  kernel_init=default_init())
+#     self.fir = fir
+#     self.with_conv = with_conv
+#     self.fir_kernel = fir_kernel
+#     self.out_ch = out_ch
+#   def forward(self, x):
+#     B, C, H, W = x.shape
+#     if not self.fir:
+#       h = F.interpolate(x, (H * 2, W * 2), 'nearest')
+#       if self.with_conv:
+#         h = self.Conv_0(h)
+#     else:
+#       if not self.with_conv:
+#         h = up_or_down_sampling.upsample_2d(x, self.fir_kernel, factor=2)
+#       else:
+#         h = self.Conv2d_0(x)
+#     return h
+# class Downsample(nn.Module):
+#   def __init__(self, in_ch=None, out_ch=None, with_conv=False, fir=False,
+#                fir_kernel=(1, 3, 3, 1)):
+#     super().__init__()
+#     out_ch = out_ch if out_ch else in_ch
+#     if not fir:
+#       if with_conv:
+#         self.Conv_0 = conv3x3(in_ch, out_ch, stride=2, padding=0)
+#     else:
+#       if with_conv:
+#         self.Conv2d_0 = up_or_down_sampling.Conv2d(in_ch, out_ch,
+#                                                  kernel=3, down=True,
+#                                                  resample_kernel=fir_kernel,
+#                                                  use_bias=True,
+#                                                  kernel_init=default_init())
+#     self.fir = fir
+#     self.fir_kernel = fir_kernel
+#     self.with_conv = with_conv
+#     self.out_ch = out_ch
+#   def forward(self, x):
+#     B, C, H, W = x.shape
+#     if not self.fir:
+#       if self.with_conv:
+#         x = F.pad(x, (0, 1, 0, 1))
+#         x = self.Conv_0(x)
+#       else:
+#         x = F.avg_pool2d(x, 2, stride=2)
+#     else:
+#       if not self.with_conv:
+#         x = up_or_down_sampling.downsample_2d(x, self.fir_kernel, factor=2)
+#       else:
+#         x = self.Conv2d_0(x)
+#     return x
+class ResnetBlockDDPMpp(nn.Module):
+  """ResBlock adapted from DDPM."""
+  def __init__(self, act, in_ch, out_ch=None, temb_dim=None, conv_shortcut=False,
+               dropout=0.1, skip_rescale=False, init_scale=0.):
+    super().__init__()
+    out_ch = out_ch if out_ch else in_ch
+    self.GroupNorm_0 = nn.GroupNorm(num_groups=min(in_ch // 4, 32), num_channels=in_ch, eps=1e-6)
+    self.Conv_0 = conv3x3(in_ch, out_ch)
+    if temb_dim is not None:
+      self.Dense_0 = nn.Linear(temb_dim, out_ch)
+      self.Dense_0.weight.data = default_init()(self.Dense_0.weight.data.shape)
+      nn.init.zeros_(self.Dense_0.bias)
+    self.GroupNorm_1 = nn.GroupNorm(num_groups=min(out_ch // 4, 32), num_channels=out_ch, eps=1e-6)
+    self.Dropout_0 = nn.Dropout(dropout)
+    self.Conv_1 = conv3x3(out_ch, out_ch, init_scale=init_scale)
+    if in_ch != out_ch:
+      if conv_shortcut:
+        self.Conv_2 = conv3x3(in_ch, out_ch)
+      else:
+        self.NIN_0 = NIN(in_ch, out_ch)
+    self.skip_rescale = skip_rescale
+    self.act = act
+    self.out_ch = out_ch
+    self.conv_shortcut = conv_shortcut
+  def forward(self, x, temb=None):
+    h = self.act(self.GroupNorm_0(x))
+    h = self.Conv_0(h)
+    if temb is not None:
+      h += self.Dense_0(self.act(temb))[:, :, None, None]
+    h = self.act(self.GroupNorm_1(h))
+    h = self.Dropout_0(h)
+    h = self.Conv_1(h)
+    if x.shape[1] != self.out_ch:
+      if self.conv_shortcut:
+        x = self.Conv_2(x)
+      else:
+        x = self.NIN_0(x)
+    if not self.skip_rescale:
+      return x + h
+    else:
+      return (x + h) / np.sqrt(2.)
+class ResnetBlockBigGANpp(nn.Module):
+  def __init__(self, act, in_ch, out_ch=None, temb_dim=None, up=False, down=False,
+               dropout=0.1, fir=False, fir_kernel=(1, 3, 3, 1),
+               skip_rescale=True, init_scale=0.):
+    super().__init__()
+    out_ch = out_ch if out_ch else in_ch
+    self.GroupNorm_0 = nn.GroupNorm(num_groups=min(in_ch // 4, 32), num_channels=in_ch, eps=1e-6)
+    self.up = up
+    self.down = down
+    self.fir = fir
+    self.fir_kernel = fir_kernel
+    self.Conv_0 = conv3x3(in_ch, out_ch)
+    if temb_dim is not None:
+      self.Dense_0 = nn.Linear(temb_dim, out_ch)
+      self.Dense_0.weight.data = default_init()(self.Dense_0.weight.shape)
+      nn.init.zeros_(self.Dense_0.bias)
+    self.GroupNorm_1 = nn.GroupNorm(num_groups=min(out_ch // 4, 32), num_channels=out_ch, eps=1e-6)
+    self.Dropout_0 = nn.Dropout(dropout)
+    self.Conv_1 = conv3x3(out_ch, out_ch, init_scale=init_scale)
+    if in_ch != out_ch or up or down:
+      self.Conv_2 = conv1x1(in_ch, out_ch)
+    self.skip_rescale = skip_rescale
+    self.act = act
+    self.in_ch = in_ch
+    self.out_ch = out_ch
+  def forward(self, x, temb=None):
+    h = self.act(self.GroupNorm_0(x))
+    if self.up:
+      if self.fir:
+        h = up_or_down_sampling.upsample_2d(h, self.fir_kernel, factor=2)
+        x = up_or_down_sampling.upsample_2d(x, self.fir_kernel, factor=2)
+      else:
+        h = up_or_down_sampling.naive_upsample_2d(h, factor=2)
+        x = up_or_down_sampling.naive_upsample_2d(x, factor=2)
+    elif self.down:
+      if self.fir:
+        h = up_or_down_sampling.downsample_2d(h, self.fir_kernel, factor=2)
+        x = up_or_down_sampling.downsample_2d(x, self.fir_kernel, factor=2)
+      else:
+        h = up_or_down_sampling.naive_downsample_2d(h, factor=2)
+        x = up_or_down_sampling.naive_downsample_2d(x, factor=2)
+    h = self.Conv_0(h)
+    # Add bias to each feature map conditioned on the time embedding
+    if temb is not None:
+      h += self.Dense_0(self.act(temb))[:, :, None, None]
+    h = self.act(self.GroupNorm_1(h))
+    h = self.Dropout_0(h)
+    h = self.Conv_1(h)
+    if self.in_ch != self.out_ch or self.up or self.down:
+      x = self.Conv_2(x)
+    if not self.skip_rescale:
+      return x + h
+    else:
+      return (x + h) / np.sqrt(2.)

fastgeco/backbones/ncsnpp_utils/normalization.py ADDED Viewed

	@@ -0,0 +1,215 @@

+# coding=utf-8
+# Copyright 2020 The Google Research Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Normalization layers."""
+import torch.nn as nn
+import torch
+import functools
+def get_normalization(config, conditional=False):
+  """Obtain normalization modules from the config file."""
+  norm = config.model.normalization
+  if conditional:
+    if norm == 'InstanceNorm++':
+      return functools.partial(ConditionalInstanceNorm2dPlus, num_classes=config.model.num_classes)
+    else:
+      raise NotImplementedError(f'{norm} not implemented yet.')
+  else:
+    if norm == 'InstanceNorm':
+      return nn.InstanceNorm2d
+    elif norm == 'InstanceNorm++':
+      return InstanceNorm2dPlus
+    elif norm == 'VarianceNorm':
+      return VarianceNorm2d
+    elif norm == 'GroupNorm':
+      return nn.GroupNorm
+    else:
+      raise ValueError('Unknown normalization: %s' % norm)
+class ConditionalBatchNorm2d(nn.Module):
+  def __init__(self, num_features, num_classes, bias=True):
+    super().__init__()
+    self.num_features = num_features
+    self.bias = bias
+    self.bn = nn.BatchNorm2d(num_features, affine=False)
+    if self.bias:
+      self.embed = nn.Embedding(num_classes, num_features * 2)
+      self.embed.weight.data[:, :num_features].uniform_()  # Initialise scale at N(1, 0.02)
+      self.embed.weight.data[:, num_features:].zero_()  # Initialise bias at 0
+    else:
+      self.embed = nn.Embedding(num_classes, num_features)
+      self.embed.weight.data.uniform_()
+  def forward(self, x, y):
+    out = self.bn(x)
+    if self.bias:
+      gamma, beta = self.embed(y).chunk(2, dim=1)
+      out = gamma.view(-1, self.num_features, 1, 1) * out + beta.view(-1, self.num_features, 1, 1)
+    else:
+      gamma = self.embed(y)
+      out = gamma.view(-1, self.num_features, 1, 1) * out
+    return out
+class ConditionalInstanceNorm2d(nn.Module):
+  def __init__(self, num_features, num_classes, bias=True):
+    super().__init__()
+    self.num_features = num_features
+    self.bias = bias
+    self.instance_norm = nn.InstanceNorm2d(num_features, affine=False, track_running_stats=False)
+    if bias:
+      self.embed = nn.Embedding(num_classes, num_features * 2)
+      self.embed.weight.data[:, :num_features].uniform_()  # Initialise scale at N(1, 0.02)
+      self.embed.weight.data[:, num_features:].zero_()  # Initialise bias at 0
+    else:
+      self.embed = nn.Embedding(num_classes, num_features)
+      self.embed.weight.data.uniform_()
+  def forward(self, x, y):
+    h = self.instance_norm(x)
+    if self.bias:
+      gamma, beta = self.embed(y).chunk(2, dim=-1)
+      out = gamma.view(-1, self.num_features, 1, 1) * h + beta.view(-1, self.num_features, 1, 1)
+    else:
+      gamma = self.embed(y)
+      out = gamma.view(-1, self.num_features, 1, 1) * h
+    return out
+class ConditionalVarianceNorm2d(nn.Module):
+  def __init__(self, num_features, num_classes, bias=False):
+    super().__init__()
+    self.num_features = num_features
+    self.bias = bias
+    self.embed = nn.Embedding(num_classes, num_features)
+    self.embed.weight.data.normal_(1, 0.02)
+  def forward(self, x, y):
+    vars = torch.var(x, dim=(2, 3), keepdim=True)
+    h = x / torch.sqrt(vars + 1e-5)
+    gamma = self.embed(y)
+    out = gamma.view(-1, self.num_features, 1, 1) * h
+    return out
+class VarianceNorm2d(nn.Module):
+  def __init__(self, num_features, bias=False):
+    super().__init__()
+    self.num_features = num_features
+    self.bias = bias
+    self.alpha = nn.Parameter(torch.zeros(num_features))
+    self.alpha.data.normal_(1, 0.02)
+  def forward(self, x):
+    vars = torch.var(x, dim=(2, 3), keepdim=True)
+    h = x / torch.sqrt(vars + 1e-5)
+    out = self.alpha.view(-1, self.num_features, 1, 1) * h
+    return out
+class ConditionalNoneNorm2d(nn.Module):
+  def __init__(self, num_features, num_classes, bias=True):
+    super().__init__()
+    self.num_features = num_features
+    self.bias = bias
+    if bias:
+      self.embed = nn.Embedding(num_classes, num_features * 2)
+      self.embed.weight.data[:, :num_features].uniform_()  # Initialise scale at N(1, 0.02)
+      self.embed.weight.data[:, num_features:].zero_()  # Initialise bias at 0
+    else:
+      self.embed = nn.Embedding(num_classes, num_features)
+      self.embed.weight.data.uniform_()
+  def forward(self, x, y):
+    if self.bias:
+      gamma, beta = self.embed(y).chunk(2, dim=-1)
+      out = gamma.view(-1, self.num_features, 1, 1) * x + beta.view(-1, self.num_features, 1, 1)
+    else:
+      gamma = self.embed(y)
+      out = gamma.view(-1, self.num_features, 1, 1) * x
+    return out
+class NoneNorm2d(nn.Module):
+  """Identity layer with the constructor signature of the other norm layers."""
+  def __init__(self, num_features, bias=True):
+    # Both arguments are accepted but unused.
+    super().__init__()
+  def forward(self, x):
+    """Return `x` unchanged."""
+    return x
+class InstanceNorm2dPlus(nn.Module):
+  def __init__(self, num_features, bias=True):
+    super().__init__()
+    self.num_features = num_features
+    self.bias = bias
+    self.instance_norm = nn.InstanceNorm2d(num_features, affine=False, track_running_stats=False)
+    self.alpha = nn.Parameter(torch.zeros(num_features))
+    self.gamma = nn.Parameter(torch.zeros(num_features))
+    self.alpha.data.normal_(1, 0.02)
+    self.gamma.data.normal_(1, 0.02)
+    if bias:
+      self.beta = nn.Parameter(torch.zeros(num_features))
+  def forward(self, x):
+    means = torch.mean(x, dim=(2, 3))
+    m = torch.mean(means, dim=-1, keepdim=True)
+    v = torch.var(means, dim=-1, keepdim=True)
+    means = (means - m) / (torch.sqrt(v + 1e-5))
+    h = self.instance_norm(x)
+    if self.bias:
+      h = h + means[..., None, None] * self.alpha[..., None, None]
+      out = self.gamma.view(-1, self.num_features, 1, 1) * h + self.beta.view(-1, self.num_features, 1, 1)
+    else:
+      h = h + means[..., None, None] * self.alpha[..., None, None]
+      out = self.gamma.view(-1, self.num_features, 1, 1) * h
+    return out
+class ConditionalInstanceNorm2dPlus(nn.Module):
+  def __init__(self, num_features, num_classes, bias=True):
+    super().__init__()
+    self.num_features = num_features
+    self.bias = bias
+    self.instance_norm = nn.InstanceNorm2d(num_features, affine=False, track_running_stats=False)
+    if bias:
+      self.embed = nn.Embedding(num_classes, num_features * 3)
+      self.embed.weight.data[:, :2 * num_features].normal_(1, 0.02)  # Initialise scale at N(1, 0.02)
+      self.embed.weight.data[:, 2 * num_features:].zero_()  # Initialise bias at 0
+    else:
+      self.embed = nn.Embedding(num_classes, 2 * num_features)
+      self.embed.weight.data.normal_(1, 0.02)
+  def forward(self, x, y):
+    means = torch.mean(x, dim=(2, 3))
+    m = torch.mean(means, dim=-1, keepdim=True)
+    v = torch.var(means, dim=-1, keepdim=True)
+    means = (means - m) / (torch.sqrt(v + 1e-5))
+    h = self.instance_norm(x)
+    if self.bias:
+      gamma, alpha, beta = self.embed(y).chunk(3, dim=-1)
+      h = h + means[..., None, None] * alpha[..., None, None]
+      out = gamma.view(-1, self.num_features, 1, 1) * h + beta.view(-1, self.num_features, 1, 1)
+    else:
+      gamma, alpha = self.embed(y).chunk(2, dim=-1)
+      h = h + means[..., None, None] * alpha[..., None, None]
+      out = gamma.view(-1, self.num_features, 1, 1) * h
+    return out

fastgeco/backbones/ncsnpp_utils/utils.py ADDED Viewed

	@@ -0,0 +1,189 @@

+# coding=utf-8
+# Copyright 2020 The Google Research Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""All functions and modules related to model definition.
+"""
+import torch
+import numpy as np
+from ...sdes import OUVESDE, OUVPSDE
+_MODELS = {}
+def register_model(cls=None, *, name=None):
+  """A decorator for registering model classes."""
+  def _register(cls):
+    if name is None:
+      local_name = cls.__name__
+    else:
+      local_name = name
+    if local_name in _MODELS:
+      raise ValueError(f'Already registered model with name: {local_name}')
+    _MODELS[local_name] = cls
+    return cls
+  if cls is None:
+    return _register
+  else:
+    return _register(cls)
+def get_model(name):
+  return _MODELS[name]
+def get_sigmas(sigma_min, sigma_max, num_scales):
+  """Get sigmas --- the set of noise levels for SMLD from config files.
+  Args:
+    config: A ConfigDict object parsed from the config file
+  Returns:
+    sigmas: a jax numpy arrary of noise levels
+  """
+  sigmas = np.exp(
+    np.linspace(np.log(sigma_max), np.log(sigma_min), num_scales))
+  return sigmas
+def get_ddpm_params(config):
+  """Get betas and alphas --- parameters used in the original DDPM paper."""
+  num_diffusion_timesteps = 1000
+  # parameters need to be adapted if number of time steps differs from 1000
+  beta_start = config.model.beta_min / config.model.num_scales
+  beta_end = config.model.beta_max / config.model.num_scales
+  betas = np.linspace(beta_start, beta_end, num_diffusion_timesteps, dtype=np.float64)
+  alphas = 1. - betas
+  alphas_cumprod = np.cumprod(alphas, axis=0)
+  sqrt_alphas_cumprod = np.sqrt(alphas_cumprod)
+  sqrt_1m_alphas_cumprod = np.sqrt(1. - alphas_cumprod)
+  return {
+    'betas': betas,
+    'alphas': alphas,
+    'alphas_cumprod': alphas_cumprod,
+    'sqrt_alphas_cumprod': sqrt_alphas_cumprod,
+    'sqrt_1m_alphas_cumprod': sqrt_1m_alphas_cumprod,
+    'beta_min': beta_start * (num_diffusion_timesteps - 1),
+    'beta_max': beta_end * (num_diffusion_timesteps - 1),
+    'num_diffusion_timesteps': num_diffusion_timesteps
+  }
+def create_model(config):
+  """Create the score model."""
+  model_name = config.model.name
+  score_model = get_model(model_name)(config)
+  score_model = score_model.to(config.device)
+  score_model = torch.nn.DataParallel(score_model)
+  return score_model
+def get_model_fn(model, train=False):
+  """Create a function to give the output of the score-based model.
+  Args:
+    model: The score model.
+    train: `True` for training and `False` for evaluation.
+  Returns:
+    A model function.
+  """
+  def model_fn(x, labels):
+    """Compute the output of the score-based model.
+    Args:
+      x: A mini-batch of input data.
+      labels: A mini-batch of conditioning variables for time steps. Should be interpreted differently
+        for different models.
+    Returns:
+      A tuple of (model output, new mutable states)
+    """
+    if not train:
+      model.eval()
+      return model(x, labels)
+    else:
+      model.train()
+      return model(x, labels)
+  return model_fn
+def get_score_fn(sde, model, train=False, continuous=False):
+  """Wraps `score_fn` so that the model output corresponds to a real time-dependent score function.
+  Args:
+    sde: An `sde_lib.SDE` object that represents the forward SDE.
+    model: A score model.
+    train: `True` for training and `False` for evaluation.
+    continuous: If `True`, the score-based model is expected to directly take continuous time steps.
+  Returns:
+    A score function.
+  """
+  model_fn = get_model_fn(model, train=train)
+  if isinstance(sde, OUVPSDE):
+    def score_fn(x, t):
+      # Scale neural network output by standard deviation and flip sign
+      if continuous:
+        # For VP-trained models, t=0 corresponds to the lowest noise level
+        # The maximum value of time embedding is assumed to 999 for
+        # continuously-trained models.
+        labels = t * 999
+        score = model_fn(x, labels)
+        std = sde.marginal_prob(torch.zeros_like(x), t)[1]
+      else:
+        # For VP-trained models, t=0 corresponds to the lowest noise level
+        labels = t * (sde.N - 1)
+        score = model_fn(x, labels)
+        std = sde.sqrt_1m_alphas_cumprod.to(labels.device)[labels.long()]
+      score = -score / std[:, None, None, None]
+      return score
+  elif isinstance(sde, OUVESDE):
+    def score_fn(x, t):
+      if continuous:
+        labels = sde.marginal_prob(torch.zeros_like(x), t)[1]
+      else:
+        # For VE-trained models, t=0 corresponds to the highest noise level
+        labels = sde.T - t
+        labels *= sde.N - 1
+        labels = torch.round(labels).long()
+      score = model_fn(x, labels)
+      return score
+  else:
+    raise NotImplementedError(f"SDE class {sde.__class__.__name__} not yet supported.")
+  return score_fn
+def to_flattened_numpy(x):
+  """Flatten a torch tensor `x` and convert it to numpy."""
+  return x.detach().cpu().numpy().reshape((-1,))
+def from_flattened_numpy(x, shape):
+  """Form a torch tensor with the given `shape` from a flattened numpy array `x`."""
+  return torch.from_numpy(x.reshape(shape))

fastgeco/backbones/shared.py ADDED Viewed

	@@ -0,0 +1,123 @@

+import functools
+import numpy as np
+import torch
+import torch.nn as nn
+from geco.util.registry import Registry
+BackboneRegistry = Registry("Backbone")
+class GaussianFourierProjection(nn.Module):
+    """Gaussian random features for encoding time steps."""
+    def __init__(self, embed_dim, scale=16, complex_valued=False):
+        super().__init__()
+        self.complex_valued = complex_valued
+        if not complex_valued:
+            # If the output is real-valued, we concatenate sin+cos of the features to avoid ambiguities.
+            # Therefore, in this case the effective embed_dim is cut in half. For the complex-valued case,
+            # we use complex numbers which each represent sin+cos directly, so the ambiguity is avoided directly,
+            # and this halving is not necessary.
+            embed_dim = embed_dim // 2
+        # Randomly sample weights during initialization. These weights are fixed
+        # during optimization and are not trainable.
+        self.W = nn.Parameter(torch.randn(embed_dim) * scale, requires_grad=False)
+    def forward(self, t):
+        t_proj = t[:, None] * self.W[None, :] * 2*np.pi
+        if self.complex_valued:
+            return torch.exp(1j * t_proj)
+        else:
+            return torch.cat([torch.sin(t_proj), torch.cos(t_proj)], dim=-1)
+class DiffusionStepEmbedding(nn.Module):
+    """Diffusion-Step embedding as in DiffWave / Vaswani et al. 2017."""
+    def __init__(self, embed_dim, complex_valued=False):
+        super().__init__()
+        self.complex_valued = complex_valued
+        if not complex_valued:
+            # If the output is real-valued, we concatenate sin+cos of the features to avoid ambiguities.
+            # Therefore, in this case the effective embed_dim is cut in half. For the complex-valued case,
+            # we use complex numbers which each represent sin+cos directly, so the ambiguity is avoided directly,
+            # and this halving is not necessary.
+            embed_dim = embed_dim // 2
+        self.embed_dim = embed_dim
+    def forward(self, t):
+        fac = 10**(4*torch.arange(self.embed_dim, device=t.device) / (self.embed_dim-1))
+        inner = t[:, None] * fac[None, :]
+        if self.complex_valued:
+            return torch.exp(1j * inner)
+        else:
+            return torch.cat([torch.sin(inner), torch.cos(inner)], dim=-1)
+class ComplexLinear(nn.Module):
+    """A potentially complex-valued linear layer. Reduces to a regular linear layer if `complex_valued=False`."""
+    def __init__(self, input_dim, output_dim, complex_valued):
+        super().__init__()
+        self.complex_valued = complex_valued
+        if self.complex_valued:
+            self.re = nn.Linear(input_dim, output_dim)
+            self.im = nn.Linear(input_dim, output_dim)
+        else:
+            self.lin = nn.Linear(input_dim, output_dim)
+    def forward(self, x):
+        if self.complex_valued:
+            return (self.re(x.real) - self.im(x.imag)) + 1j*(self.re(x.imag) + self.im(x.real))
+        else:
+            return self.lin(x)
+class FeatureMapDense(nn.Module):
+    """A fully connected layer that reshapes outputs to feature maps."""
+    def __init__(self, input_dim, output_dim, complex_valued=False):
+        super().__init__()
+        self.complex_valued = complex_valued
+        self.dense = ComplexLinear(input_dim, output_dim, complex_valued=complex_valued)
+    def forward(self, x):
+        return self.dense(x)[..., None, None]
+def torch_complex_from_reim(re, im):
+    return torch.view_as_complex(torch.stack([re, im], dim=-1))
+class ArgsComplexMultiplicationWrapper(nn.Module):
+    """Adapted from `asteroid`'s `complex_nn.py`, allowing args/kwargs to be passed through forward().
+    Make a complex-valued module `F` from a real-valued module `f` by applying
+    complex multiplication rules:
+    F(a + i b) = f1(a) - f1(b) + i (f2(b) + f2(a))
+    where `f1`, `f2` are instances of `f` that do *not* share weights.
+    Args:
+        module_cls (callable): A class or function that returns a Torch module/functional.
+            Constructor of `f` in the formula above.  Called 2x with `*args`, `**kwargs`,
+            to construct the real and imaginary component modules.
+    """
+    def __init__(self, module_cls, *args, **kwargs):
+        super().__init__()
+        self.re_module = module_cls(*args, **kwargs)
+        self.im_module = module_cls(*args, **kwargs)
+    def forward(self, x, *args, **kwargs):
+        return torch_complex_from_reim(
+            self.re_module(x.real, *args, **kwargs) - self.im_module(x.imag, *args, **kwargs),
+            self.re_module(x.imag, *args, **kwargs) + self.im_module(x.real, *args, **kwargs),
+        )
+# Complex-valued 2-D convolution factories: each call builds an ArgsComplexMultiplicationWrapper
+# around two instances of the given real-valued torch layer.
+ComplexConv2d = functools.partial(ArgsComplexMultiplicationWrapper, nn.Conv2d)
+ComplexConvTranspose2d = functools.partial(ArgsComplexMultiplicationWrapper, nn.ConvTranspose2d)

fastgeco/model.py ADDED Viewed

	@@ -0,0 +1,258 @@

+import random
+import time
+from math import ceil
+import warnings
+import numpy as np
+# from asteroid.losses.sdr import SingleSrcNegSDR
+import torch
+import pytorch_lightning as pl
+from torch_ema import ExponentialMovingAverage
+import torch.nn.functional as F
+from geco import sampling
+from geco.sdes import SDERegistry
+from fastgeco.backbones import BackboneRegistry
+from geco.util.inference import evaluate_model2
+from geco.util.other import pad_spec
+import numpy as np
+import matplotlib.pyplot as plt
+class ScoreModel(pl.LightningModule):
+    @staticmethod
+    def add_argparse_args(parser):
+        parser.add_argument("--lr", type=float, default=1e-5, help="The learning rate (1e-4 by default)")
+        parser.add_argument("--ema_decay", type=float, default=0.999, help="The parameter EMA decay constant (0.999 by default)")
+        parser.add_argument("--t_eps", type=float, default=0.03, help="The minimum time (3e-2 by default)")
+        parser.add_argument("--num_eval_files", type=int, default=20, help="Number of files for speech enhancement performance evaluation during training. Pass 0 to turn off (no checkpoints based on evaluation metrics will be generated).")
+        parser.add_argument("--loss_type", type=str, default="mse", help="The type of loss function to use.")
+        parser.add_argument("--loss_abs_exponent", type=float, default=0.5,  help="magnitude transformation in the loss term")
+        parser.add_argument("--output_scale", type=str, choices=('sigma', 'time'), default= 'time',  help="backbone model scale before last output layer")
+        return parser
+    def __init__(
+        self, backbone, sde, lr=1e-4, ema_decay=0.999, t_eps=3e-2, loss_abs_exponent=0.5,
+        num_eval_files=20, loss_type='mse', data_module_cls=None, output_scale='time', inference_N=1,
+        inference_start=0.5, **kwargs
+    ):
+        """
+        Create a new ScoreModel.
+        Args:
+            backbone: Backbone DNN that serves as a score-based model.
+            sde: The SDE that defines the diffusion process.
+            lr: The learning rate of the optimizer. (1e-4 by default).
+            ema_decay: The decay constant of the parameter EMA (0.999 by default).
+            t_eps: The minimum time to practically run for to avoid issues very close to zero (1e-5 by default).
+            loss_type: The type of loss to use (wrt. noise z/std). Options are 'mse' (default), 'mae'
+        """
+        super().__init__()
+        # Initialize Backbone DNN
+        dnn_cls = BackboneRegistry.get_by_name(backbone)
+        self.dnn = dnn_cls(**kwargs)
+        # Initialize SDE
+        sde_cls = SDERegistry.get_by_name(sde)
+        self.sde = sde_cls(**kwargs)
+        # Store hyperparams and save them
+        self.lr = lr
+        self.ema_decay = ema_decay
+        self.ema = ExponentialMovingAverage(self.parameters(), decay=self.ema_decay)
+        self._error_loading_ema = False
+        self.t_eps = t_eps
+        self.loss_type = loss_type
+        self.num_eval_files = num_eval_files
+        self.loss_abs_exponent = loss_abs_exponent
+        self.output_scale = output_scale
+        self.save_hyperparameters(ignore=['no_wandb'])
+        self.data_module = data_module_cls(**kwargs, gpu=kwargs.get('gpus', 0) > 0)
+        self.inference_N = inference_N
+        self.inference_start = inference_start
+        # self.si_snr = SingleSrcNegSDR("sisdr", reduction='mean', zero_mean=False)
+    def configure_optimizers(self):
+        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
+        return optimizer
+    def optimizer_step(self, *args, **kwargs):
+        # Method overridden so that the EMA params are updated after each optimizer step
+        super().optimizer_step(*args, **kwargs)
+        self.ema.update(self.parameters())
+    # on_load_checkpoint / on_save_checkpoint needed for EMA storing/loading
+    def on_load_checkpoint(self, checkpoint):
+        ema = checkpoint.get('ema', None)
+        if ema is not None:
+            self.ema.load_state_dict(checkpoint['ema'])
+        else:
+            self._error_loading_ema = True
+            warnings.warn("EMA state_dict not found in checkpoint!")
+    def on_save_checkpoint(self, checkpoint):
+        checkpoint['ema'] = self.ema.state_dict()
+    def train(self, mode, no_ema=False):
+        res = super().train(mode)  # call the standard `train` method with the given mode
+        if not self._error_loading_ema:
+            if mode == False and not no_ema:
+                # eval
+                self.ema.store(self.parameters())        # store current params in EMA
+                self.ema.copy_to(self.parameters())      # copy EMA parameters over current params for evaluation
+            else:
+                # train
+                if self.ema.collected_params is not None:
+                    self.ema.restore(self.parameters())  # restore the EMA weights (if stored)
+        return res
+    def eval(self, no_ema=False):
+        return self.train(False, no_ema=no_ema)
+    def sisnr(self, est, ref, eps = 1e-8):
+        est = est - torch.mean(est, dim = -1, keepdim = True)
+        ref = ref - torch.mean(ref, dim = -1, keepdim = True)
+        est_p = (torch.sum(est * ref, dim = -1, keepdim = True) * ref) / torch.sum(ref * ref, dim = -1, keepdim = True)
+        est_v = est - est_p
+        est_sisnr = 10 * torch.log10((torch.sum(est_p * est_p, dim = -1, keepdim = True) + eps) / (torch.sum(est_v * est_v, dim = -1, keepdim = True) + eps))
+        return -est_sisnr
+    def _loss(self, wav_x_tm1, wav_gt):
+        if self.loss_type == 'default':
+            min_leng = min(wav_x_tm1.shape[-1], wav_gt.shape[-1])
+            wav_x_tm1 = wav_x_tm1.squeeze(1)[:,:min_leng]
+            wav_gt = wav_gt.squeeze(1)[:,:min_leng]
+            loss = torch.mean(self.sisnr(wav_x_tm1, wav_gt))
+        else:
+            raise RuntimeError(f'{self.loss_type} loss not defined')
+        return loss
+    def euler_step(self, X, X_t, Y, M, t, dt):
+        f, g = self.sde.sde(X_t, t, Y)
+        vec_t = torch.ones(Y.shape[0], device=Y.device) * t
+        mean_x_tm1 = X_t - (f - g**2*self.forward(X_t, vec_t, Y, M, vec_t[:,None,None,None]))*dt
+        z = torch.randn_like(X)
+        X_t = mean_x_tm1 + z*g*torch.sqrt(dt)
+        return X_t
+    def training_step(self, batch, batch_idx):
+        X, Y, M = batch
+        reverse_start_time = random.uniform(self.t_rsp_min, self.t_rsp_max)
+        N_reverse = random.randint(self.N_min, self.N_max)
+        if self.stop_iteration_random == "random":
+            stop_iteration = random.randint(0, N_reverse-1)
+        elif self.stop_iteration_random == "last":
+            #Used in publication. This means that only the last step is used for updating weights.
+            stop_iteration = N_reverse-1
+        else:
+            raise RuntimeError(f'{self.stop_iteration_random} not defined')
+        timesteps = torch.linspace(reverse_start_time, self.t_eps, N_reverse, device=Y.device)
+        #prior sampling starting from reverse_start_time
+        std = self.sde._std(reverse_start_time*torch.ones((Y.shape[0],), device=Y.device))
+        z = torch.randn_like(Y)
+        X_t = Y + z * std[:, None, None, None]
+        #reverse steps by Euler Maruyama
+        for i in range(len(timesteps)):
+            t = timesteps[i]
+            if i != len(timesteps) - 1:
+                dt = t - timesteps[i+1]
+            else:
+                dt = timesteps[-1]
+            if i != stop_iteration:
+                with torch.no_grad():
+                    #take Euler step here
+                    X_t = self.euler_step(X, X_t, Y, M, t, dt)
+            else:
+                #take a Euler step and compute loss
+                f, g = self.sde.sde(X_t, t, Y)
+                vec_t = torch.ones(Y.shape[0], device=Y.device) * t
+                score = self.forward(X_t, vec_t, Y, M, vec_t[:,None,None,None])
+                mean_x_tm1 = X_t - (f - g**2*score)*dt #mean of x t minus 1 = mu(x_{t-1})
+                mean_gt, _ = self.sde.marginal_prob(X, torch.ones(Y.shape[0], device=Y.device) * (t-dt), Y)
+                wav_gt = self.to_audio(mean_gt.squeeze())
+                wav_x_tm1 = self.to_audio(mean_x_tm1.squeeze())
+                loss = self._loss(wav_x_tm1, wav_gt)
+                break
+        self.log('train_loss', loss, on_step=True, on_epoch=True)
+        return loss
+    def validation_step(self, batch, batch_idx):
+        # Evaluate speech enhancement performance, compute loss only for a few val data
+        if batch_idx == 0 and self.num_eval_files != 0:
+            pesq, si_sdr, estoi, loss = evaluate_model2(self, self.num_eval_files, self.inference_N, inference_start=self.inference_start)
+            self.log('pesq', pesq, on_step=False, on_epoch=True)
+            self.log('si_sdr', si_sdr, on_step=False, on_epoch=True)
+            self.log('estoi', estoi, on_step=False, on_epoch=True)
+            self.log('valid_loss', loss, on_step=False, on_epoch=True)
+            return loss
+    def forward(self, x, t, y, m, divide_scale):
+        # Concatenate y as an extra channel
+        dnn_input = torch.cat([x, y, m], dim=1)
+        # the minus is most likely unimportant here - taken from Song's repo
+        score = -self.dnn(dnn_input, t, divide_scale)
+        return score
+    def to(self, *args, **kwargs):
+        """Override PyTorch .to() to also transfer the EMA of the model weights"""
+        self.ema.to(*args, **kwargs)
+        return super().to(*args, **kwargs)
+    def train_dataloader(self):
+        return self.data_module.train_dataloader()
+    def val_dataloader(self):
+        return self.data_module.val_dataloader()
+    def test_dataloader(self):
+        return self.data_module.test_dataloader()
+    def setup(self, stage=None):
+        return self.data_module.setup(stage=stage)
+    def to_audio(self, spec, length=None):
+        return self._istft(self._backward_transform(spec), length)
+    def _forward_transform(self, spec):
+        return self.data_module.spec_fwd(spec)
+    def _backward_transform(self, spec):
+        return self.data_module.spec_back(spec)
+    def _stft(self, sig):
+        return self.data_module.stft(sig)
+    def _istft(self, spec, length=None):
+        return self.data_module.istft(spec, length)
+    def add_para(self, N_min=1, N_max=1, t_rsp_min=0.5, t_rsp_max=0.5, batch_size=64, loss_type='default', lr=5e-5, stop_iteration_random='last', inference_N=1, inference_start=0.5):
+        self.t_rsp_min = t_rsp_min
+        self.t_rsp_max = t_rsp_max
+        self.N_min = N_min
+        self.N_max = N_max
+        self.data_module.batch_size = batch_size
+        self.data_module.num_workers = 4
+        self.data_module.gpu = True
+        self.loss_type = loss_type
+        self.lr = lr
+        self.stop_iteration_random = stop_iteration_random
+        self.inference_N = inference_N
+        self.inference_start = inference_start

geco/.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

geco/backbones/.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

geco/backbones/__init__.py ADDED Viewed

	@@ -0,0 +1,4 @@

+from .shared import BackboneRegistry
+from .ncsnpp import NCSNpp
+__all__ = ['BackboneRegistry', 'NCSNpp']

geco/backbones/ncsnpp.py ADDED Viewed

	@@ -0,0 +1,405 @@

+# coding=utf-8
+# Copyright 2020 The Google Research Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# pylint: skip-file
+from score_models.layers import UpsampleLayer, DownsampleLayer
+from .ncsnpp_utils import layers, layerspp, normalization
+import torch.nn as nn
+import functools
+import torch
+import numpy as np
+from .shared import BackboneRegistry
+# Short module-level aliases for building blocks defined in the ncsnpp_utils submodules.
+ResnetBlockDDPM = layerspp.ResnetBlockDDPMpp
+ResnetBlockBigGAN = layerspp.ResnetBlockBigGANpp
+Combine = layerspp.Combine
+conv3x3 = layerspp.conv3x3
+conv1x1 = layerspp.conv1x1
+get_act = layers.get_act
+get_normalization = normalization.get_normalization
+default_initializer = layers.default_init
+@BackboneRegistry.register("ncsnpp")
+class NCSNpp(nn.Module):
+    """NCSN++ model, adapted from https://github.com/yang-song/score_sde repository"""
+    @staticmethod
+    def add_argparse_args(parser):
+        """Return `parser` unchanged; no backbone-specific arguments are registered yet."""
+        # TODO: add additional arguments of constructor, if you wish to modify them.
+        return parser
+    def __init__(self,
+        scale_by_sigma = True,
+        nonlinearity = 'swish',
+        nf = 128,
+        ch_mult = (1, 1, 2, 2, 2, 2, 2),
+        num_res_blocks = 2,
+        attn_resolutions = (16,),
+        resamp_with_conv = True,
+        conditional = True,
+        fir = True,
+        fir_kernel = 'song',
+        skip_rescale = True,
+        resblock_type = 'biggan',
+        progressive = 'output_skip',
+        progressive_input = 'input_skip',
+        progressive_combine = 'sum',
+        init_scale = 0.,
+        fourier_scale = 16,
+        image_size = 256,
+        embedding_type = 'fourier',
+        dropout = .0,
+        **unused_kwargs
+    ):
+        super().__init__()
+        self.act = act = get_act(nonlinearity)
+        self.nf = nf = nf
+        ch_mult = ch_mult
+        self.num_res_blocks = num_res_blocks = num_res_blocks
+        self.attn_resolutions = attn_resolutions = attn_resolutions
+        dropout = dropout
+        resamp_with_conv = resamp_with_conv
+        self.num_resolutions = num_resolutions = len(ch_mult)
+        self.all_resolutions = all_resolutions = [image_size // (2 ** i) for i in range(num_resolutions)]
+        self.conditional = conditional = conditional  # noise-conditional
+        self.scale_by_sigma = scale_by_sigma
+        fir = fir
+        fir_kernel = [1, 3, 3, 1]
+        self.skip_rescale = skip_rescale = skip_rescale
+        self.resblock_type = resblock_type = resblock_type.lower()
+        self.progressive = progressive = progressive.lower()
+        self.progressive_input = progressive_input = progressive_input.lower()
+        self.embedding_type = embedding_type = embedding_type.lower()
+        init_scale = init_scale
+        assert progressive in ['none', 'output_skip', 'residual']
+        assert progressive_input in ['none', 'input_skip', 'residual']
+        assert embedding_type in ['fourier', 'positional']
+        combine_method = progressive_combine.lower()
+        combiner = functools.partial(Combine, method=combine_method)
+        num_channels = 6  # x.real, x.imag, y.real, y.imag
+        self.output_layer = nn.Conv2d(num_channels, 2, 1)
+        modules = []
+        # timestep/noise_level embedding
+        if embedding_type == 'fourier':
+            # Gaussian Fourier features embeddings.
+            modules.append(layerspp.GaussianFourierProjection(
+                embedding_size=nf, scale=fourier_scale
+            ))
+            embed_dim = 2 * nf
+        elif embedding_type == 'positional':
+            embed_dim = nf
+        else:
+            raise ValueError(f'embedding type {embedding_type} unknown.')
+        if conditional:
+            modules.append(nn.Linear(embed_dim, nf * 4))
+            modules[-1].weight.data = default_initializer()(modules[-1].weight.shape)
+            nn.init.zeros_(modules[-1].bias)
+            modules.append(nn.Linear(nf * 4, nf * 4))
+            modules[-1].weight.data = default_initializer()(modules[-1].weight.shape)
+            nn.init.zeros_(modules[-1].bias)
+        AttnBlock = functools.partial(layerspp.AttnBlockpp,
+            init_scale=init_scale, skip_rescale=skip_rescale)
+        Upsample = functools.partial(UpsampleLayer,
+            with_conv=resamp_with_conv, fir=fir, fir_kernel=fir_kernel)
+        if progressive == 'output_skip':
+            self.pyramid_upsample = UpsampleLayer(fir=fir, fir_kernel=fir_kernel, with_conv=False)
+        elif progressive == 'residual':
+            pyramid_upsample = functools.partial(UpsampleLayer, fir=fir,
+                fir_kernel=fir_kernel, with_conv=True)
+        Downsample = functools.partial(DownsampleLayer, with_conv=resamp_with_conv, fir=fir, fir_kernel=fir_kernel)
+        if progressive_input == 'input_skip':
+            self.pyramid_downsample = DownsampleLayer(fir=fir, fir_kernel=fir_kernel, with_conv=False)
+        elif progressive_input == 'residual':
+            pyramid_downsample = functools.partial(DownsampleLayer,
+                fir=fir, fir_kernel=fir_kernel, with_conv=True)
+        if resblock_type == 'ddpm':
+            ResnetBlock = functools.partial(ResnetBlockDDPM, act=act,
+                dropout=dropout, init_scale=init_scale,
+                skip_rescale=skip_rescale, temb_dim=nf * 4)
+        elif resblock_type == 'biggan':
+            ResnetBlock = functools.partial(ResnetBlockBigGAN, act=act,
+                dropout=dropout, fir=fir, fir_kernel=fir_kernel,
+                init_scale=init_scale, skip_rescale=skip_rescale, temb_dim=nf * 4)
+        else:
+            raise ValueError(f'resblock type {resblock_type} unrecognized.')
+        # Downsampling block
+        channels = num_channels
+        if progressive_input != 'none':
+            input_pyramid_ch = channels
+        modules.append(conv3x3(channels, nf))
+        hs_c = [nf]
+        in_ch = nf
+        for i_level in range(num_resolutions):
+            # Residual blocks for this resolution
+            for i_block in range(num_res_blocks):
+                out_ch = nf * ch_mult[i_level]
+                modules.append(ResnetBlock(in_ch=in_ch, out_ch=out_ch))
+                in_ch = out_ch
+                if all_resolutions[i_level] in attn_resolutions:
+                    modules.append(AttnBlock(channels=in_ch))
+                hs_c.append(in_ch)
+            if i_level != num_resolutions - 1:
+                if resblock_type == 'ddpm':
+                    modules.append(Downsample(in_ch=in_ch))
+                else:
+                    modules.append(ResnetBlock(down=True, in_ch=in_ch))
+                if progressive_input == 'input_skip':
+                    modules.append(combiner(dim1=input_pyramid_ch, dim2=in_ch))
+                    if combine_method == 'cat':
+                        in_ch *= 2
+                elif progressive_input == 'residual':
+                    modules.append(pyramid_downsample(in_ch=input_pyramid_ch, out_ch=in_ch))
+                    input_pyramid_ch = in_ch
+                hs_c.append(in_ch)
+        in_ch = hs_c[-1]
+        modules.append(ResnetBlock(in_ch=in_ch))
+        modules.append(AttnBlock(channels=in_ch))
+        modules.append(ResnetBlock(in_ch=in_ch))
+        pyramid_ch = 0
+        # Upsampling block
+        for i_level in reversed(range(num_resolutions)):
+            for i_block in range(num_res_blocks + 1):  # +1 blocks in upsampling because of skip connection from combiner (after downsampling)
+                out_ch = nf * ch_mult[i_level]
+                modules.append(ResnetBlock(in_ch=in_ch + hs_c.pop(), out_ch=out_ch))
+                in_ch = out_ch
+            if all_resolutions[i_level] in attn_resolutions:
+                modules.append(AttnBlock(channels=in_ch))
+            if progressive != 'none':
+                if i_level == num_resolutions - 1:
+                    if progressive == 'output_skip':
+                        modules.append(nn.GroupNorm(num_groups=min(in_ch // 4, 32),
+                            num_channels=in_ch, eps=1e-6))
+                        modules.append(conv3x3(in_ch, channels, init_scale=init_scale))
+                        pyramid_ch = channels
+                    elif progressive == 'residual':
+                        modules.append(nn.GroupNorm(num_groups=min(in_ch // 4, 32), num_channels=in_ch, eps=1e-6))
+                        modules.append(conv3x3(in_ch, in_ch, bias=True))
+                        pyramid_ch = in_ch
+                    else:
+                        raise ValueError(f'{progressive} is not a valid name.')
+                else:
+                    if progressive == 'output_skip':
+                        modules.append(nn.GroupNorm(num_groups=min(in_ch // 4, 32),
+                            num_channels=in_ch, eps=1e-6))
+                        modules.append(conv3x3(in_ch, channels, bias=True, init_scale=init_scale))
+                        pyramid_ch = channels
+                    elif progressive == 'residual':
+                        modules.append(pyramid_upsample(in_ch=pyramid_ch, out_ch=in_ch))
+                        pyramid_ch = in_ch
+                    else:
+                        raise ValueError(f'{progressive} is not a valid name')
+            if i_level != 0:
+                if resblock_type == 'ddpm':
+                    modules.append(Upsample(in_ch=in_ch))
+                else:
+                    modules.append(ResnetBlock(in_ch=in_ch, up=True))
+        assert not hs_c
+        if progressive != 'output_skip':
+            modules.append(nn.GroupNorm(num_groups=min(in_ch // 4, 32),
+                                                                    num_channels=in_ch, eps=1e-6))
+            modules.append(conv3x3(in_ch, channels, init_scale=init_scale))
+        self.all_modules = nn.ModuleList(modules)
+    def forward(self, x, time_cond):
+        """Run the U-Net over a complex input and return one complex channel.
+
+        Args:
+            x: complex tensor. Channels 0, 1 and 2 along dim 1 are read and
+                split into real and imaginary parts (assumes a 4-D
+                (batch, channel, H, W) layout -- TODO confirm against callers).
+            time_cond: 1-D conditioning values. With `embedding_type ==
+                'fourier'` they are used directly as noise levels; with
+                `'positional'` they are cast to long and used to index
+                `self.sigmas`.
+
+        Returns:
+            Complex tensor with a singleton dimension inserted at dim 1.
+
+        Raises:
+            ValueError: if `self.embedding_type` or `self.progressive` holds
+                an unsupported value.
+        """
+        # `all_modules` is the flat ModuleList assembled in __init__; `m_idx`
+        # walks it in exactly the order the layers were appended there.
+        modules = self.all_modules
+        m_idx = 0
+        # Split complex channels 0, 1 and 2 into real and imaginary parts, giving
+        # six real-valued channels ordered (re0, im0, re1, im1, re2, im2).
+        x = torch.cat((x[:,[0],:,:].real, x[:,[0],:,:].imag,
+                       x[:,[1],:,:].real, x[:,[1],:,:].imag,
+                       x[:,[2],:,:].real, x[:,[2],:,:].imag), dim=1)
+        if self.embedding_type == 'fourier':
+            # Gaussian Fourier features embeddings of the log noise level.
+            used_sigmas = time_cond
+            temb = modules[m_idx](torch.log(used_sigmas))
+            m_idx += 1
+        elif self.embedding_type == 'positional':
+            # Sinusoidal positional embeddings (no module is consumed here).
+            timesteps = time_cond
+            used_sigmas = self.sigmas[time_cond.long()]
+            temb = layers.get_timestep_embedding(timesteps, self.nf)
+        else:
+            raise ValueError(f'embedding type {self.embedding_type} unknown.')
+        if self.conditional:
+            # Two conditioning layers with the activation in between
+            # (presumably linear layers; they are built outside this view).
+            temb = modules[m_idx](temb)
+            m_idx += 1
+            temb = modules[m_idx](self.act(temb))
+            m_idx += 1
+        else:
+            temb = None
+        # Downsampling block
+        input_pyramid = None
+        if self.progressive_input != 'none':
+            input_pyramid = x
+        # Input layer: 3x3 conv from the stacked real channels to `nf` features.
+        hs = [modules[m_idx](x)]
+        m_idx += 1
+        # Down path in U-Net; every intermediate activation is pushed onto `hs`
+        # so that the up path can pop it as a skip connection.
+        for i_level in range(self.num_resolutions):
+            # Residual blocks for this resolution
+            for i_block in range(self.num_res_blocks):
+                h = modules[m_idx](hs[-1], temb)
+                m_idx += 1
+                # Attention layer (optional)
+                if h.shape[-2] in self.attn_resolutions: # edit: check H dim (-2) not W dim (-1)
+                    h = modules[m_idx](h)
+                    m_idx += 1
+                hs.append(h)
+            # Downsampling (skipped at the coarsest level)
+            if i_level != self.num_resolutions - 1:
+                if self.resblock_type == 'ddpm':
+                    h = modules[m_idx](hs[-1])
+                    m_idx += 1
+                else:
+                    h = modules[m_idx](hs[-1], temb)
+                    m_idx += 1
+                if self.progressive_input == 'input_skip':   # Combine h with the downsampled input
+                    input_pyramid = self.pyramid_downsample(input_pyramid)
+                    h = modules[m_idx](input_pyramid, h)
+                    m_idx += 1
+                elif self.progressive_input == 'residual':
+                    input_pyramid = modules[m_idx](input_pyramid)
+                    m_idx += 1
+                    # Optionally rescale the sum of the two branches by 1/sqrt(2).
+                    if self.skip_rescale:
+                        input_pyramid = (input_pyramid + h) / np.sqrt(2.)
+                    else:
+                        input_pyramid = input_pyramid + h
+                    h = input_pyramid
+                hs.append(h)
+        h = hs[-1] # the most recently pushed activation
+        # Bottleneck: ResNet block -> attention -> ResNet block
+        h = modules[m_idx](h, temb)  # ResNet block
+        m_idx += 1
+        h = modules[m_idx](h)  # Attention block
+        m_idx += 1
+        h = modules[m_idx](h, temb)  # ResNet block
+        m_idx += 1
+        pyramid = None
+        # Upsampling block
+        for i_level in reversed(range(self.num_resolutions)):
+            # One more block than on the way down, matching the extra entries
+            # pushed onto `hs` (input conv and per-level downsampling outputs).
+            for i_block in range(self.num_res_blocks + 1):
+                h = modules[m_idx](torch.cat([h, hs.pop()], dim=1), temb)
+                m_idx += 1
+            # edit: from -1 to -2 (check the H dimension, as in the down path)
+            if h.shape[-2] in self.attn_resolutions:
+                h = modules[m_idx](h)
+                m_idx += 1
+            if self.progressive != 'none':
+                if i_level == self.num_resolutions - 1:
+                    # Coarsest level: start the output pyramid.
+                    if self.progressive == 'output_skip':
+                        pyramid = self.act(modules[m_idx](h))  # GroupNorm
+                        m_idx += 1
+                        pyramid = modules[m_idx](pyramid)  # 3x3 conv back to the input channel count
+                        m_idx += 1
+                    elif self.progressive == 'residual':
+                        pyramid = self.act(modules[m_idx](h))  # GroupNorm
+                        m_idx += 1
+                        pyramid = modules[m_idx](pyramid)  # 3x3 conv, channel count unchanged
+                        m_idx += 1
+                    else:
+                        raise ValueError(f'{self.progressive} is not a valid name.')
+                else:
+                    if self.progressive == 'output_skip':
+                        pyramid = self.pyramid_upsample(pyramid)  # Upsample
+                        pyramid_h = self.act(modules[m_idx](h))  # GroupNorm
+                        m_idx += 1
+                        pyramid_h = modules[m_idx](pyramid_h)
+                        m_idx += 1
+                        pyramid = pyramid + pyramid_h
+                    elif self.progressive == 'residual':
+                        pyramid = modules[m_idx](pyramid)
+                        m_idx += 1
+                        if self.skip_rescale:
+                            pyramid = (pyramid + h) / np.sqrt(2.)
+                        else:
+                            pyramid = pyramid + h
+                        h = pyramid
+                    else:
+                        raise ValueError(f'{self.progressive} is not a valid name')
+            # Upsampling Layer (skipped at the finest level)
+            if i_level != 0:
+                if self.resblock_type == 'ddpm':
+                    h = modules[m_idx](h)
+                    m_idx += 1
+                else:
+                    h = modules[m_idx](h, temb)  # Upsampling
+                    m_idx += 1
+        # Every skip connection must have been consumed by the up path.
+        assert not hs
+        if self.progressive == 'output_skip':
+            h = pyramid
+        else:
+            h = self.act(modules[m_idx](h))  # GroupNorm
+            m_idx += 1
+            h = modules[m_idx](h)  # final 3x3 conv
+            m_idx += 1
+        # Guards against __init__ and forward drifting out of sync.
+        assert m_idx == len(modules), "Implementation error"
+        # Divide each batch element by its noise level.
+        h = h / used_sigmas[:, None, None, None]
+        # Convert back to complex number. `output_layer` is defined outside this
+        # view; view_as_complex requires the permuted last dimension to be 2.
+        h = self.output_layer(h)
+        h = torch.permute(h, (0, 2, 3, 1)).contiguous()
+        h = torch.view_as_complex(h)[:,None, :, :]
+        return h

geco/backbones/ncsnpp_utils/.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

geco/backbones/ncsnpp_utils/layers.py ADDED Viewed

	@@ -0,0 +1,662 @@

+# coding=utf-8
+# Copyright 2020 The Google Research Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# pylint: skip-file
+"""Common layers for defining score networks.
+"""
+import math
+import string
+from functools import partial
+import torch.nn as nn
+import torch
+import torch.nn.functional as F
+import numpy as np
+from .normalization import ConditionalInstanceNorm2dPlus
+def get_act(config):
+  """Get activation functions from the config file."""
+  if config == 'elu':
+    return nn.ELU()
+  elif config == 'relu':
+    return nn.ReLU()
+  elif config == 'lrelu':
+    return nn.LeakyReLU(negative_slope=0.2)
+  elif config == 'swish':
+    return nn.SiLU()
+  else:
+    raise NotImplementedError('activation function does not exist!')
+def ncsn_conv1x1(in_planes, out_planes, stride=1, bias=True, dilation=1, init_scale=1., padding=0):
+  """1x1 convolution. Same as NCSNv1/v2."""
+  conv = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=bias, dilation=dilation,
+                   padding=padding)
+  init_scale = 1e-10 if init_scale == 0 else init_scale
+  conv.weight.data *= init_scale
+  conv.bias.data *= init_scale
+  return conv
+def variance_scaling(scale, mode, distribution,
+                     in_axis=1, out_axis=0,
+                     dtype=torch.float32,
+                     device='cpu'):
+  """Ported from JAX. """
+  def _compute_fans(shape, in_axis=1, out_axis=0):
+    receptive_field_size = np.prod(shape) / shape[in_axis] / shape[out_axis]
+    fan_in = shape[in_axis] * receptive_field_size
+    fan_out = shape[out_axis] * receptive_field_size
+    return fan_in, fan_out
+  def init(shape, dtype=dtype, device=device):
+    fan_in, fan_out = _compute_fans(shape, in_axis, out_axis)
+    if mode == "fan_in":
+      denominator = fan_in
+    elif mode == "fan_out":
+      denominator = fan_out
+    elif mode == "fan_avg":
+      denominator = (fan_in + fan_out) / 2
+    else:
+      raise ValueError(
+        "invalid mode for variance scaling initializer: {}".format(mode))
+    variance = scale / denominator
+    if distribution == "normal":
+      return torch.randn(*shape, dtype=dtype, device=device) * np.sqrt(variance)
+    elif distribution == "uniform":
+      return (torch.rand(*shape, dtype=dtype, device=device) * 2. - 1.) * np.sqrt(3 * variance)
+    else:
+      raise ValueError("invalid distribution for variance scaling initializer")
+  return init
+def default_init(scale=1.):
+  """The same initialization used in DDPM."""
+  scale = 1e-10 if scale == 0 else scale
+  return variance_scaling(scale, 'fan_avg', 'uniform')
+class Dense(nn.Module):
+  """Linear layer with `default_init`."""
+  def __init__(self):
+    super().__init__()
+def ddpm_conv1x1(in_planes, out_planes, stride=1, bias=True, init_scale=1., padding=0):
+  """1x1 convolution with DDPM initialization."""
+  conv = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, padding=padding, bias=bias)
+  conv.weight.data = default_init(init_scale)(conv.weight.data.shape)
+  nn.init.zeros_(conv.bias)
+  return conv
+def ncsn_conv3x3(in_planes, out_planes, stride=1, bias=True, dilation=1, init_scale=1., padding=1):
+  """3x3 convolution with PyTorch initialization. Same as NCSNv1/NCSNv2."""
+  init_scale = 1e-10 if init_scale == 0 else init_scale
+  conv = nn.Conv2d(in_planes, out_planes, stride=stride, bias=bias,
+                   dilation=dilation, padding=padding, kernel_size=3)
+  conv.weight.data *= init_scale
+  conv.bias.data *= init_scale
+  return conv
+def ddpm_conv3x3(in_planes, out_planes, stride=1, bias=True, dilation=1, init_scale=1., padding=1):
+  """3x3 convolution with DDPM initialization."""
+  conv = nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, padding=padding,
+                   dilation=dilation, bias=bias)
+  conv.weight.data = default_init(init_scale)(conv.weight.data.shape)
+  nn.init.zeros_(conv.bias)
+  return conv
+  ###########################################################################
+  # Functions below are ported over from the NCSNv1/NCSNv2 codebase:
+  # https://github.com/ermongroup/ncsn
+  # https://github.com/ermongroup/ncsnv2
+  ###########################################################################
+class CRPBlock(nn.Module):
+  def __init__(self, features, n_stages, act=nn.ReLU(), maxpool=True):
+    super().__init__()
+    self.convs = nn.ModuleList()
+    for i in range(n_stages):
+      self.convs.append(ncsn_conv3x3(features, features, stride=1, bias=False))
+    self.n_stages = n_stages
+    if maxpool:
+      self.pool = nn.MaxPool2d(kernel_size=5, stride=1, padding=2)
+    else:
+      self.pool = nn.AvgPool2d(kernel_size=5, stride=1, padding=2)
+    self.act = act
+  def forward(self, x):
+    x = self.act(x)
+    path = x
+    for i in range(self.n_stages):
+      path = self.pool(path)
+      path = self.convs[i](path)
+      x = path + x
+    return x
+class CondCRPBlock(nn.Module):
+  def __init__(self, features, n_stages, num_classes, normalizer, act=nn.ReLU()):
+    super().__init__()
+    self.convs = nn.ModuleList()
+    self.norms = nn.ModuleList()
+    self.normalizer = normalizer
+    for i in range(n_stages):
+      self.norms.append(normalizer(features, num_classes, bias=True))
+      self.convs.append(ncsn_conv3x3(features, features, stride=1, bias=False))
+    self.n_stages = n_stages
+    self.pool = nn.AvgPool2d(kernel_size=5, stride=1, padding=2)
+    self.act = act
+  def forward(self, x, y):
+    x = self.act(x)
+    path = x
+    for i in range(self.n_stages):
+      path = self.norms[i](path, y)
+      path = self.pool(path)
+      path = self.convs[i](path)
+      x = path + x
+    return x
+class RCUBlock(nn.Module):
+  def __init__(self, features, n_blocks, n_stages, act=nn.ReLU()):
+    super().__init__()
+    for i in range(n_blocks):
+      for j in range(n_stages):
+        setattr(self, '{}_{}_conv'.format(i + 1, j + 1), ncsn_conv3x3(features, features, stride=1, bias=False))
+    self.stride = 1
+    self.n_blocks = n_blocks
+    self.n_stages = n_stages
+    self.act = act
+  def forward(self, x):
+    for i in range(self.n_blocks):
+      residual = x
+      for j in range(self.n_stages):
+        x = self.act(x)
+        x = getattr(self, '{}_{}_conv'.format(i + 1, j + 1))(x)
+      x += residual
+    return x
+class CondRCUBlock(nn.Module):
+  def __init__(self, features, n_blocks, n_stages, num_classes, normalizer, act=nn.ReLU()):
+    super().__init__()
+    for i in range(n_blocks):
+      for j in range(n_stages):
+        setattr(self, '{}_{}_norm'.format(i + 1, j + 1), normalizer(features, num_classes, bias=True))
+        setattr(self, '{}_{}_conv'.format(i + 1, j + 1), ncsn_conv3x3(features, features, stride=1, bias=False))
+    self.stride = 1
+    self.n_blocks = n_blocks
+    self.n_stages = n_stages
+    self.act = act
+    self.normalizer = normalizer
+  def forward(self, x, y):
+    for i in range(self.n_blocks):
+      residual = x
+      for j in range(self.n_stages):
+        x = getattr(self, '{}_{}_norm'.format(i + 1, j + 1))(x, y)
+        x = self.act(x)
+        x = getattr(self, '{}_{}_conv'.format(i + 1, j + 1))(x)
+      x += residual
+    return x
+class MSFBlock(nn.Module):
+  def __init__(self, in_planes, features):
+    super().__init__()
+    assert isinstance(in_planes, list) or isinstance(in_planes, tuple)
+    self.convs = nn.ModuleList()
+    self.features = features
+    for i in range(len(in_planes)):
+      self.convs.append(ncsn_conv3x3(in_planes[i], features, stride=1, bias=True))
+  def forward(self, xs, shape):
+    sums = torch.zeros(xs[0].shape[0], self.features, *shape, device=xs[0].device)
+    for i in range(len(self.convs)):
+      h = self.convs[i](xs[i])
+      h = F.interpolate(h, size=shape, mode='bilinear', align_corners=True)
+      sums += h
+    return sums
+class CondMSFBlock(nn.Module):
+  def __init__(self, in_planes, features, num_classes, normalizer):
+    super().__init__()
+    assert isinstance(in_planes, list) or isinstance(in_planes, tuple)
+    self.convs = nn.ModuleList()
+    self.norms = nn.ModuleList()
+    self.features = features
+    self.normalizer = normalizer
+    for i in range(len(in_planes)):
+      self.convs.append(ncsn_conv3x3(in_planes[i], features, stride=1, bias=True))
+      self.norms.append(normalizer(in_planes[i], num_classes, bias=True))
+  def forward(self, xs, y, shape):
+    sums = torch.zeros(xs[0].shape[0], self.features, *shape, device=xs[0].device)
+    for i in range(len(self.convs)):
+      h = self.norms[i](xs[i], y)
+      h = self.convs[i](h)
+      h = F.interpolate(h, size=shape, mode='bilinear', align_corners=True)
+      sums += h
+    return sums
+class RefineBlock(nn.Module):
+  def __init__(self, in_planes, features, act=nn.ReLU(), start=False, end=False, maxpool=True):
+    super().__init__()
+    assert isinstance(in_planes, tuple) or isinstance(in_planes, list)
+    self.n_blocks = n_blocks = len(in_planes)
+    self.adapt_convs = nn.ModuleList()
+    for i in range(n_blocks):
+      self.adapt_convs.append(RCUBlock(in_planes[i], 2, 2, act))
+    self.output_convs = RCUBlock(features, 3 if end else 1, 2, act)
+    if not start:
+      self.msf = MSFBlock(in_planes, features)
+    self.crp = CRPBlock(features, 2, act, maxpool=maxpool)
+  def forward(self, xs, output_shape):
+    assert isinstance(xs, tuple) or isinstance(xs, list)
+    hs = []
+    for i in range(len(xs)):
+      h = self.adapt_convs[i](xs[i])
+      hs.append(h)
+    if self.n_blocks > 1:
+      h = self.msf(hs, output_shape)
+    else:
+      h = hs[0]
+    h = self.crp(h)
+    h = self.output_convs(h)
+    return h
+class CondRefineBlock(nn.Module):
+  def __init__(self, in_planes, features, num_classes, normalizer, act=nn.ReLU(), start=False, end=False):
+    super().__init__()
+    assert isinstance(in_planes, tuple) or isinstance(in_planes, list)
+    self.n_blocks = n_blocks = len(in_planes)
+    self.adapt_convs = nn.ModuleList()
+    for i in range(n_blocks):
+      self.adapt_convs.append(
+        CondRCUBlock(in_planes[i], 2, 2, num_classes, normalizer, act)
+      )
+    self.output_convs = CondRCUBlock(features, 3 if end else 1, 2, num_classes, normalizer, act)
+    if not start:
+      self.msf = CondMSFBlock(in_planes, features, num_classes, normalizer)
+    self.crp = CondCRPBlock(features, 2, num_classes, normalizer, act)
+  def forward(self, xs, y, output_shape):
+    assert isinstance(xs, tuple) or isinstance(xs, list)
+    hs = []
+    for i in range(len(xs)):
+      h = self.adapt_convs[i](xs[i], y)
+      hs.append(h)
+    if self.n_blocks > 1:
+      h = self.msf(hs, y, output_shape)
+    else:
+      h = hs[0]
+    h = self.crp(h, y)
+    h = self.output_convs(h, y)
+    return h
+class ConvMeanPool(nn.Module):
+  def __init__(self, input_dim, output_dim, kernel_size=3, biases=True, adjust_padding=False):
+    super().__init__()
+    if not adjust_padding:
+      conv = nn.Conv2d(input_dim, output_dim, kernel_size, stride=1, padding=kernel_size // 2, bias=biases)
+      self.conv = conv
+    else:
+      conv = nn.Conv2d(input_dim, output_dim, kernel_size, stride=1, padding=kernel_size // 2, bias=biases)
+      self.conv = nn.Sequential(
+        nn.ZeroPad2d((1, 0, 1, 0)),
+        conv
+      )
+  def forward(self, inputs):
+    output = self.conv(inputs)
+    output = sum([output[:, :, ::2, ::2], output[:, :, 1::2, ::2],
+                  output[:, :, ::2, 1::2], output[:, :, 1::2, 1::2]]) / 4.
+    return output
+class MeanPoolConv(nn.Module):
+  def __init__(self, input_dim, output_dim, kernel_size=3, biases=True):
+    super().__init__()
+    self.conv = nn.Conv2d(input_dim, output_dim, kernel_size, stride=1, padding=kernel_size // 2, bias=biases)
+  def forward(self, inputs):
+    output = inputs
+    output = sum([output[:, :, ::2, ::2], output[:, :, 1::2, ::2],
+                  output[:, :, ::2, 1::2], output[:, :, 1::2, 1::2]]) / 4.
+    return self.conv(output)
+class UpsampleConv(nn.Module):
+  def __init__(self, input_dim, output_dim, kernel_size=3, biases=True):
+    super().__init__()
+    self.conv = nn.Conv2d(input_dim, output_dim, kernel_size, stride=1, padding=kernel_size // 2, bias=biases)
+    self.pixelshuffle = nn.PixelShuffle(upscale_factor=2)
+  def forward(self, inputs):
+    output = inputs
+    output = torch.cat([output, output, output, output], dim=1)
+    output = self.pixelshuffle(output)
+    return self.conv(output)
+class ConditionalResidualBlock(nn.Module):
+  def __init__(self, input_dim, output_dim, num_classes, resample=1, act=nn.ELU(),
+               normalization=ConditionalInstanceNorm2dPlus, adjust_padding=False, dilation=None):
+    super().__init__()
+    self.non_linearity = act
+    self.input_dim = input_dim
+    self.output_dim = output_dim
+    self.resample = resample
+    self.normalization = normalization
+    if resample == 'down':
+      if dilation > 1:
+        self.conv1 = ncsn_conv3x3(input_dim, input_dim, dilation=dilation)
+        self.normalize2 = normalization(input_dim, num_classes)
+        self.conv2 = ncsn_conv3x3(input_dim, output_dim, dilation=dilation)
+        conv_shortcut = partial(ncsn_conv3x3, dilation=dilation)
+      else:
+        self.conv1 = ncsn_conv3x3(input_dim, input_dim)
+        self.normalize2 = normalization(input_dim, num_classes)
+        self.conv2 = ConvMeanPool(input_dim, output_dim, 3, adjust_padding=adjust_padding)
+        conv_shortcut = partial(ConvMeanPool, kernel_size=1, adjust_padding=adjust_padding)
+    elif resample is None:
+      if dilation > 1:
+        conv_shortcut = partial(ncsn_conv3x3, dilation=dilation)
+        self.conv1 = ncsn_conv3x3(input_dim, output_dim, dilation=dilation)
+        self.normalize2 = normalization(output_dim, num_classes)
+        self.conv2 = ncsn_conv3x3(output_dim, output_dim, dilation=dilation)
+      else:
+        conv_shortcut = nn.Conv2d
+        self.conv1 = ncsn_conv3x3(input_dim, output_dim)
+        self.normalize2 = normalization(output_dim, num_classes)
+        self.conv2 = ncsn_conv3x3(output_dim, output_dim)
+    else:
+      raise Exception('invalid resample value')
+    if output_dim != input_dim or resample is not None:
+      self.shortcut = conv_shortcut(input_dim, output_dim)
+    self.normalize1 = normalization(input_dim, num_classes)
+  def forward(self, x, y):
+    output = self.normalize1(x, y)
+    output = self.non_linearity(output)
+    output = self.conv1(output)
+    output = self.normalize2(output, y)
+    output = self.non_linearity(output)
+    output = self.conv2(output)
+    if self.output_dim == self.input_dim and self.resample is None:
+      shortcut = x
+    else:
+      shortcut = self.shortcut(x)
+    return shortcut + output
+class ResidualBlock(nn.Module):
+  def __init__(self, input_dim, output_dim, resample=None, act=nn.ELU(),
+               normalization=nn.InstanceNorm2d, adjust_padding=False, dilation=1):
+    super().__init__()
+    self.non_linearity = act
+    self.input_dim = input_dim
+    self.output_dim = output_dim
+    self.resample = resample
+    self.normalization = normalization
+    if resample == 'down':
+      if dilation > 1:
+        self.conv1 = ncsn_conv3x3(input_dim, input_dim, dilation=dilation)
+        self.normalize2 = normalization(input_dim)
+        self.conv2 = ncsn_conv3x3(input_dim, output_dim, dilation=dilation)
+        conv_shortcut = partial(ncsn_conv3x3, dilation=dilation)
+      else:
+        self.conv1 = ncsn_conv3x3(input_dim, input_dim)
+        self.normalize2 = normalization(input_dim)
+        self.conv2 = ConvMeanPool(input_dim, output_dim, 3, adjust_padding=adjust_padding)
+        conv_shortcut = partial(ConvMeanPool, kernel_size=1, adjust_padding=adjust_padding)
+    elif resample is None:
+      if dilation > 1:
+        conv_shortcut = partial(ncsn_conv3x3, dilation=dilation)
+        self.conv1 = ncsn_conv3x3(input_dim, output_dim, dilation=dilation)
+        self.normalize2 = normalization(output_dim)
+        self.conv2 = ncsn_conv3x3(output_dim, output_dim, dilation=dilation)
+      else:
+        # conv_shortcut = nn.Conv2d ### Something wierd here.
+        conv_shortcut = partial(ncsn_conv1x1)
+        self.conv1 = ncsn_conv3x3(input_dim, output_dim)
+        self.normalize2 = normalization(output_dim)
+        self.conv2 = ncsn_conv3x3(output_dim, output_dim)
+    else:
+      raise Exception('invalid resample value')
+    if output_dim != input_dim or resample is not None:
+      self.shortcut = conv_shortcut(input_dim, output_dim)
+    self.normalize1 = normalization(input_dim)
+  def forward(self, x):
+    output = self.normalize1(x)
+    output = self.non_linearity(output)
+    output = self.conv1(output)
+    output = self.normalize2(output)
+    output = self.non_linearity(output)
+    output = self.conv2(output)
+    if self.output_dim == self.input_dim and self.resample is None:
+      shortcut = x
+    else:
+      shortcut = self.shortcut(x)
+    return shortcut + output
+###########################################################################
+# Functions below are ported over from the DDPM codebase:
+#  https://github.com/hojonathanho/diffusion/blob/master/diffusion_tf/nn.py
+###########################################################################
+def get_timestep_embedding(timesteps, embedding_dim, max_positions=10000):
+  assert len(timesteps.shape) == 1  # and timesteps.dtype == tf.int32
+  half_dim = embedding_dim // 2
+  # magic number 10000 is from transformers
+  emb = math.log(max_positions) / (half_dim - 1)
+  # emb = math.log(2.) / (half_dim - 1)
+  emb = torch.exp(torch.arange(half_dim, dtype=torch.float32, device=timesteps.device) * -emb)
+  # emb = tf.range(num_embeddings, dtype=jnp.float32)[:, None] * emb[None, :]
+  # emb = tf.cast(timesteps, dtype=jnp.float32)[:, None] * emb[None, :]
+  emb = timesteps.float()[:, None] * emb[None, :]
+  emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
+  if embedding_dim % 2 == 1:  # zero pad
+    emb = F.pad(emb, (0, 1), mode='constant')
+  assert emb.shape == (timesteps.shape[0], embedding_dim)
+  return emb
+def _einsum(a, b, c, x, y):
+  einsum_str = '{},{}->{}'.format(''.join(a), ''.join(b), ''.join(c))
+  return torch.einsum(einsum_str, x, y)
+def contract_inner(x, y):
+  """tensordot(x, y, 1)."""
+  x_chars = list(string.ascii_lowercase[:len(x.shape)])
+  y_chars = list(string.ascii_lowercase[len(x.shape):len(y.shape) + len(x.shape)])
+  y_chars[0] = x_chars[-1]  # first axis of y and last of x get summed
+  out_chars = x_chars[:-1] + y_chars[1:]
+  return _einsum(x_chars, y_chars, out_chars, x, y)
+class NIN(nn.Module):
+  def __init__(self, in_dim, num_units, init_scale=0.1):
+    super().__init__()
+    self.W = nn.Parameter(default_init(scale=init_scale)((in_dim, num_units)), requires_grad=True)
+    self.b = nn.Parameter(torch.zeros(num_units), requires_grad=True)
+  def forward(self, x):
+    x = x.permute(0, 2, 3, 1)
+    y = contract_inner(x, self.W) + self.b
+    return y.permute(0, 3, 1, 2)
+class AttnBlock(nn.Module):
+  """Channel-wise self-attention block."""
+  def __init__(self, channels):
+    super().__init__()
+    self.GroupNorm_0 = nn.GroupNorm(num_groups=32, num_channels=channels, eps=1e-6)
+    self.NIN_0 = NIN(channels, channels)
+    self.NIN_1 = NIN(channels, channels)
+    self.NIN_2 = NIN(channels, channels)
+    self.NIN_3 = NIN(channels, channels, init_scale=0.)
+  def forward(self, x):
+    B, C, H, W = x.shape
+    h = self.GroupNorm_0(x)
+    q = self.NIN_0(h)
+    k = self.NIN_1(h)
+    v = self.NIN_2(h)
+    w = torch.einsum('bchw,bcij->bhwij', q, k) * (int(C) ** (-0.5))
+    w = torch.reshape(w, (B, H, W, H * W))
+    w = F.softmax(w, dim=-1)
+    w = torch.reshape(w, (B, H, W, H, W))
+    h = torch.einsum('bhwij,bcij->bchw', w, v)
+    h = self.NIN_3(h)
+    return x + h
+class Upsample(nn.Module):
+  def __init__(self, channels, with_conv=False):
+    super().__init__()
+    if with_conv:
+      self.Conv_0 = ddpm_conv3x3(channels, channels)
+    self.with_conv = with_conv
+  def forward(self, x):
+    B, C, H, W = x.shape
+    h = F.interpolate(x, (H * 2, W * 2), mode='nearest')
+    if self.with_conv:
+      h = self.Conv_0(h)
+    return h
+class Downsample(nn.Module):
+  def __init__(self, channels, with_conv=False):
+    super().__init__()
+    if with_conv:
+      self.Conv_0 = ddpm_conv3x3(channels, channels, stride=2, padding=0)
+    self.with_conv = with_conv
+  def forward(self, x):
+    B, C, H, W = x.shape
+    # Emulate 'SAME' padding
+    if self.with_conv:
+      x = F.pad(x, (0, 1, 0, 1))
+      x = self.Conv_0(x)
+    else:
+      x = F.avg_pool2d(x, kernel_size=2, stride=2, padding=0)
+    assert x.shape == (B, C, H // 2, W // 2)
+    return x
+class ResnetBlockDDPM(nn.Module):
+  """The ResNet Blocks used in DDPM.
+
+  Two GroupNorm -> activation -> `ddpm_conv3x3` stages, with an optional
+  additive time-embedding bias after the first conv and dropout before the
+  second. When `in_ch != out_ch` the input is projected (by `Conv_2` or
+  `NIN_0`) before being added to the residual branch.
+  """
+  def __init__(self, act, in_ch, out_ch=None, temb_dim=None, conv_shortcut=False, dropout=0.1):
+    """
+    Args:
+      act: activation callable applied after each GroupNorm and to `temb`.
+      in_ch: number of input channels.
+      out_ch: number of output channels; defaults to `in_ch`.
+      temb_dim: size of the time embedding; `Dense_0` is only created when given.
+      conv_shortcut: if True the shortcut projection is `ddpm_conv3x3`, otherwise `NIN`.
+      dropout: dropout probability.
+    """
+    super().__init__()
+    if out_ch is None:
+      out_ch = in_ch
+    # Fixed 32 groups: nn.GroupNorm requires the channel count to be divisible by it.
+    self.GroupNorm_0 = nn.GroupNorm(num_groups=32, num_channels=in_ch, eps=1e-6)
+    self.act = act
+    self.Conv_0 = ddpm_conv3x3(in_ch, out_ch)
+    if temb_dim is not None:
+      # Linear map from the time embedding to one bias value per output channel.
+      self.Dense_0 = nn.Linear(temb_dim, out_ch)
+      self.Dense_0.weight.data = default_init()(self.Dense_0.weight.data.shape)
+      nn.init.zeros_(self.Dense_0.bias)
+    self.GroupNorm_1 = nn.GroupNorm(num_groups=32, num_channels=out_ch, eps=1e-6)
+    self.Dropout_0 = nn.Dropout(dropout)
+    # init_scale=0. is passed for the last conv of the residual branch.
+    self.Conv_1 = ddpm_conv3x3(out_ch, out_ch, init_scale=0.)
+    if in_ch != out_ch:
+      # The shortcut needs a projection only when the channel count changes.
+      if conv_shortcut:
+        self.Conv_2 = ddpm_conv3x3(in_ch, out_ch)
+      else:
+        self.NIN_0 = NIN(in_ch, out_ch)
+    self.out_ch = out_ch
+    self.in_ch = in_ch
+    self.conv_shortcut = conv_shortcut
+  def forward(self, x, temb=None):
+    """Return `x` (projected if needed) plus the residual branch output.
+
+    Args:
+      x: 4-D input whose second axis must equal `in_ch` (asserted).
+      temb: optional time embedding; when given, `Dense_0` must exist
+        (i.e. `temb_dim` was passed to the constructor).
+    """
+    B, C, H, W = x.shape
+    assert C == self.in_ch
+    out_ch = self.out_ch if self.out_ch else self.in_ch
+    h = self.act(self.GroupNorm_0(x))
+    h = self.Conv_0(h)
+    # Add bias to each feature map conditioned on the time embedding
+    if temb is not None:
+      h += self.Dense_0(self.act(temb))[:, :, None, None]
+    h = self.act(self.GroupNorm_1(h))
+    h = self.Dropout_0(h)
+    h = self.Conv_1(h)
+    if C != out_ch:
+      if self.conv_shortcut:
+        x = self.Conv_2(x)
+      else:
+        x = self.NIN_0(x)
+    return x + h

geco/backbones/ncsnpp_utils/layerspp.py ADDED Viewed

	@@ -0,0 +1,202 @@

+# coding=utf-8
+# Copyright 2020 The Google Research Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# pylint: skip-file
+"""Layers for defining NCSN++.
+"""
+from . import layers
+import score_models.layers.up_or_downsampling2d as up_or_down_sampling
+import torch.nn as nn
+import torch
+import torch.nn.functional as F
+import numpy as np
+conv1x1 = layers.ddpm_conv1x1
+conv3x3 = layers.ddpm_conv3x3
+NIN = layers.NIN
+default_init = layers.default_init
+class GaussianFourierProjection(nn.Module):
+  """Gaussian Fourier embeddings for noise levels."""
+  def __init__(self, embedding_size=256, scale=1.0):
+    super().__init__()
+    self.W = nn.Parameter(torch.randn(embedding_size) * scale, requires_grad=False)
+  def forward(self, x):
+    x_proj = x[:, None] * self.W[None, :] * 2 * np.pi
+    return torch.cat([torch.sin(x_proj), torch.cos(x_proj)], dim=-1)
+class Combine(nn.Module):
+  """Combine information from skip connections."""
+  def __init__(self, dim1, dim2, method='cat'):
+    super().__init__()
+    self.Conv_0 = conv1x1(dim1, dim2)
+    self.method = method
+  def forward(self, x, y):
+    h = self.Conv_0(x)
+    if self.method == 'cat':
+      return torch.cat([h, y], dim=1)
+    elif self.method == 'sum':
+      return h + y
+    else:
+      raise ValueError(f'Method {self.method} not recognized.')
+class AttnBlockpp(nn.Module):
+  """Channel-wise self-attention block. Modified from DDPM."""
+  def __init__(self, channels, skip_rescale=False, init_scale=0.):
+    super().__init__()
+    self.GroupNorm_0 = nn.GroupNorm(num_groups=min(channels // 4, 32), num_channels=channels,
+                                  eps=1e-6)
+    self.NIN_0 = NIN(channels, channels)
+    self.NIN_1 = NIN(channels, channels)
+    self.NIN_2 = NIN(channels, channels)
+    self.NIN_3 = NIN(channels, channels, init_scale=init_scale)
+    self.skip_rescale = skip_rescale
+  def forward(self, x):
+    B, C, H, W = x.shape
+    h = self.GroupNorm_0(x)
+    q = self.NIN_0(h)
+    k = self.NIN_1(h)
+    v = self.NIN_2(h)
+    w = torch.einsum('bchw,bcij->bhwij', q, k) * (int(C) ** (-0.5))
+    w = torch.reshape(w, (B, H, W, H * W))
+    w = F.softmax(w, dim=-1)
+    w = torch.reshape(w, (B, H, W, H, W))
+    h = torch.einsum('bhwij,bcij->bchw', w, v)
+    h = self.NIN_3(h)
+    if not self.skip_rescale:
+      return x + h
+    else:
+      return (x + h) / np.sqrt(2.)
+class ResnetBlockDDPMpp(nn.Module):
+  """ResBlock adapted from DDPM.
+
+  GroupNorm -> act -> conv3x3 -> (+ time-embedding bias) -> GroupNorm -> act
+  -> dropout -> conv3x3, added to a shortcut that is projected when the
+  channel count changes. With `skip_rescale` the sum is divided by sqrt(2).
+  """
+  def __init__(self, act, in_ch, out_ch=None, temb_dim=None, conv_shortcut=False,
+               dropout=0.1, skip_rescale=False, init_scale=0.):
+    """
+    Args:
+      act: activation callable.
+      in_ch: number of input channels.
+      out_ch: number of output channels; falls back to `in_ch` when falsy.
+      temb_dim: size of the time embedding; `Dense_0` is only created when given.
+      conv_shortcut: project the shortcut with `conv3x3` instead of `NIN`.
+      dropout: dropout probability.
+      skip_rescale: divide the residual sum by sqrt(2).
+      init_scale: forwarded to the second conv (`Conv_1`).
+    """
+    super().__init__()
+    out_ch = out_ch if out_ch else in_ch
+    # Group count scales with the channel count and is capped at 32.
+    self.GroupNorm_0 = nn.GroupNorm(num_groups=min(in_ch // 4, 32), num_channels=in_ch, eps=1e-6)
+    self.Conv_0 = conv3x3(in_ch, out_ch)
+    if temb_dim is not None:
+      self.Dense_0 = nn.Linear(temb_dim, out_ch)
+      self.Dense_0.weight.data = default_init()(self.Dense_0.weight.data.shape)
+      nn.init.zeros_(self.Dense_0.bias)
+    self.GroupNorm_1 = nn.GroupNorm(num_groups=min(out_ch // 4, 32), num_channels=out_ch, eps=1e-6)
+    self.Dropout_0 = nn.Dropout(dropout)
+    self.Conv_1 = conv3x3(out_ch, out_ch, init_scale=init_scale)
+    if in_ch != out_ch:
+      # Shortcut projection is only needed when the channel count changes.
+      if conv_shortcut:
+        self.Conv_2 = conv3x3(in_ch, out_ch)
+      else:
+        self.NIN_0 = NIN(in_ch, out_ch)
+    self.skip_rescale = skip_rescale
+    self.act = act
+    self.out_ch = out_ch
+    self.conv_shortcut = conv_shortcut
+  def forward(self, x, temb=None):
+    """Apply the block to `x`; `temb`, if given, adds a per-channel bias after the first conv."""
+    h = self.act(self.GroupNorm_0(x))
+    h = self.Conv_0(h)
+    if temb is not None:
+      # Broadcast the projected embedding over the two trailing (spatial) axes.
+      h += self.Dense_0(self.act(temb))[:, :, None, None]
+    h = self.act(self.GroupNorm_1(h))
+    h = self.Dropout_0(h)
+    h = self.Conv_1(h)
+    if x.shape[1] != self.out_ch:
+      if self.conv_shortcut:
+        x = self.Conv_2(x)
+      else:
+        x = self.NIN_0(x)
+    if not self.skip_rescale:
+      return x + h
+    else:
+      return (x + h) / np.sqrt(2.)
+class ResnetBlockBigGANpp(nn.Module):
+  """Residual block with optional 2x up- or down-sampling inside the block.
+
+  The resampling is applied to both the residual branch and the shortcut
+  before the first conv. The shortcut is passed through a 1x1 conv whenever
+  the channel count changes or resampling is enabled.
+  """
+  def __init__(self, act, in_ch, out_ch=None, temb_dim=None, up=False, down=False,
+               dropout=0.1, fir=False, fir_kernel=(1, 3, 3, 1),
+               skip_rescale=True, init_scale=0.):
+    """
+    Args:
+      act: activation callable.
+      in_ch: number of input channels.
+      out_ch: number of output channels; falls back to `in_ch` when falsy.
+      temb_dim: size of the time embedding; `Dense_0` is only created when given.
+      up: upsample by 2 (takes precedence over `down` in `forward`).
+      down: downsample by 2.
+      dropout: dropout probability.
+      fir: use the `fir_kernel`-based resampling functions instead of the naive ones.
+      fir_kernel: kernel passed to the FIR resampling functions.
+      skip_rescale: divide the residual sum by sqrt(2).
+      init_scale: forwarded to the second conv (`Conv_1`).
+    """
+    super().__init__()
+    out_ch = out_ch if out_ch else in_ch
+    # Group count scales with the channel count and is capped at 32.
+    self.GroupNorm_0 = nn.GroupNorm(num_groups=min(in_ch // 4, 32), num_channels=in_ch, eps=1e-6)
+    self.up = up
+    self.down = down
+    self.fir = fir
+    self.fir_kernel = fir_kernel
+    self.Conv_0 = conv3x3(in_ch, out_ch)
+    if temb_dim is not None:
+      self.Dense_0 = nn.Linear(temb_dim, out_ch)
+      self.Dense_0.weight.data = default_init()(self.Dense_0.weight.shape)
+      nn.init.zeros_(self.Dense_0.bias)
+    self.GroupNorm_1 = nn.GroupNorm(num_groups=min(out_ch // 4, 32), num_channels=out_ch, eps=1e-6)
+    self.Dropout_0 = nn.Dropout(dropout)
+    self.Conv_1 = conv3x3(out_ch, out_ch, init_scale=init_scale)
+    if in_ch != out_ch or up or down:
+      # 1x1 projection on the shortcut path.
+      self.Conv_2 = conv1x1(in_ch, out_ch)
+    self.skip_rescale = skip_rescale
+    self.act = act
+    self.in_ch = in_ch
+    self.out_ch = out_ch
+  def forward(self, x, temb=None):
+    """Apply the block to `x`; `temb`, if given, adds a per-channel bias after the first conv."""
+    h = self.act(self.GroupNorm_0(x))
+    # Resample the branch and the shortcut identically so they can be summed later.
+    if self.up:
+      if self.fir:
+        h = up_or_down_sampling.upsample_2d(h, self.fir_kernel, factor=2)
+        x = up_or_down_sampling.upsample_2d(x, self.fir_kernel, factor=2)
+      else:
+        h = up_or_down_sampling.naive_upsample_2d(h, factor=2)
+        x = up_or_down_sampling.naive_upsample_2d(x, factor=2)
+    elif self.down:
+      if self.fir:
+        h = up_or_down_sampling.downsample_2d(h, self.fir_kernel, factor=2)
+        x = up_or_down_sampling.downsample_2d(x, self.fir_kernel, factor=2)
+      else:
+        h = up_or_down_sampling.naive_downsample_2d(h, factor=2)
+        x = up_or_down_sampling.naive_downsample_2d(x, factor=2)
+    h = self.Conv_0(h)
+    # Add bias to each feature map conditioned on the time embedding
+    if temb is not None:
+      h += self.Dense_0(self.act(temb))[:, :, None, None]
+    h = self.act(self.GroupNorm_1(h))
+    h = self.Dropout_0(h)
+    h = self.Conv_1(h)
+    # Same condition as in __init__, so Conv_2 exists whenever it is used here.
+    if self.in_ch != self.out_ch or self.up or self.down:
+      x = self.Conv_2(x)
+    if not self.skip_rescale:
+      return x + h
+    else:
+      return (x + h) / np.sqrt(2.)

geco/backbones/ncsnpp_utils/normalization.py ADDED Viewed

	@@ -0,0 +1,215 @@

+# coding=utf-8
+# Copyright 2020 The Google Research Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Normalization layers."""
+import torch.nn as nn
+import torch
+import functools
+def get_normalization(config, conditional=False):
+  """Obtain normalization modules from the config file."""
+  norm = config.model.normalization
+  if conditional:
+    if norm == 'InstanceNorm++':
+      return functools.partial(ConditionalInstanceNorm2dPlus, num_classes=config.model.num_classes)
+    else:
+      raise NotImplementedError(f'{norm} not implemented yet.')
+  else:
+    if norm == 'InstanceNorm':
+      return nn.InstanceNorm2d
+    elif norm == 'InstanceNorm++':
+      return InstanceNorm2dPlus
+    elif norm == 'VarianceNorm':
+      return VarianceNorm2d
+    elif norm == 'GroupNorm':
+      return nn.GroupNorm
+    else:
+      raise ValueError('Unknown normalization: %s' % norm)
+class ConditionalBatchNorm2d(nn.Module):
+  def __init__(self, num_features, num_classes, bias=True):
+    super().__init__()
+    self.num_features = num_features
+    self.bias = bias
+    self.bn = nn.BatchNorm2d(num_features, affine=False)
+    if self.bias:
+      self.embed = nn.Embedding(num_classes, num_features * 2)
+      self.embed.weight.data[:, :num_features].uniform_()  # Initialise scale at N(1, 0.02)
+      self.embed.weight.data[:, num_features:].zero_()  # Initialise bias at 0
+    else:
+      self.embed = nn.Embedding(num_classes, num_features)
+      self.embed.weight.data.uniform_()
+  def forward(self, x, y):
+    out = self.bn(x)
+    if self.bias:
+      gamma, beta = self.embed(y).chunk(2, dim=1)
+      out = gamma.view(-1, self.num_features, 1, 1) * out + beta.view(-1, self.num_features, 1, 1)
+    else:
+      gamma = self.embed(y)
+      out = gamma.view(-1, self.num_features, 1, 1) * out
+    return out
+class ConditionalInstanceNorm2d(nn.Module):
+  def __init__(self, num_features, num_classes, bias=True):
+    super().__init__()
+    self.num_features = num_features
+    self.bias = bias
+    self.instance_norm = nn.InstanceNorm2d(num_features, affine=False, track_running_stats=False)
+    if bias:
+      self.embed = nn.Embedding(num_classes, num_features * 2)
+      self.embed.weight.data[:, :num_features].uniform_()  # Initialise scale at N(1, 0.02)
+      self.embed.weight.data[:, num_features:].zero_()  # Initialise bias at 0
+    else:
+      self.embed = nn.Embedding(num_classes, num_features)
+      self.embed.weight.data.uniform_()
+  def forward(self, x, y):
+    h = self.instance_norm(x)
+    if self.bias:
+      gamma, beta = self.embed(y).chunk(2, dim=-1)
+      out = gamma.view(-1, self.num_features, 1, 1) * h + beta.view(-1, self.num_features, 1, 1)
+    else:
+      gamma = self.embed(y)
+      out = gamma.view(-1, self.num_features, 1, 1) * h
+    return out
+class ConditionalVarianceNorm2d(nn.Module):
+  def __init__(self, num_features, num_classes, bias=False):
+    super().__init__()
+    self.num_features = num_features
+    self.bias = bias
+    self.embed = nn.Embedding(num_classes, num_features)
+    self.embed.weight.data.normal_(1, 0.02)
+  def forward(self, x, y):
+    vars = torch.var(x, dim=(2, 3), keepdim=True)
+    h = x / torch.sqrt(vars + 1e-5)
+    gamma = self.embed(y)
+    out = gamma.view(-1, self.num_features, 1, 1) * h
+    return out
+class VarianceNorm2d(nn.Module):
+  def __init__(self, num_features, bias=False):
+    super().__init__()
+    self.num_features = num_features
+    self.bias = bias
+    self.alpha = nn.Parameter(torch.zeros(num_features))
+    self.alpha.data.normal_(1, 0.02)
+  def forward(self, x):
+    vars = torch.var(x, dim=(2, 3), keepdim=True)
+    h = x / torch.sqrt(vars + 1e-5)
+    out = self.alpha.view(-1, self.num_features, 1, 1) * h
+    return out
+class ConditionalNoneNorm2d(nn.Module):
+  def __init__(self, num_features, num_classes, bias=True):
+    super().__init__()
+    self.num_features = num_features
+    self.bias = bias
+    if bias:
+      self.embed = nn.Embedding(num_classes, num_features * 2)
+      self.embed.weight.data[:, :num_features].uniform_()  # Initialise scale at N(1, 0.02)
+      self.embed.weight.data[:, num_features:].zero_()  # Initialise bias at 0
+    else:
+      self.embed = nn.Embedding(num_classes, num_features)
+      self.embed.weight.data.uniform_()
+  def forward(self, x, y):
+    if self.bias:
+      gamma, beta = self.embed(y).chunk(2, dim=-1)
+      out = gamma.view(-1, self.num_features, 1, 1) * x + beta.view(-1, self.num_features, 1, 1)
+    else:
+      gamma = self.embed(y)
+      out = gamma.view(-1, self.num_features, 1, 1) * x
+    return out
+class NoneNorm2d(nn.Module):
+  """Identity layer with a normalization-layer constructor signature."""
+  def __init__(self, num_features, bias=True):
+    # Both arguments are accepted and ignored.
+    super().__init__()
+  def forward(self, x):
+    """Return `x` unchanged."""
+    return x
+class InstanceNorm2dPlus(nn.Module):
+  def __init__(self, num_features, bias=True):
+    super().__init__()
+    self.num_features = num_features
+    self.bias = bias
+    self.instance_norm = nn.InstanceNorm2d(num_features, affine=False, track_running_stats=False)
+    self.alpha = nn.Parameter(torch.zeros(num_features))
+    self.gamma = nn.Parameter(torch.zeros(num_features))
+    self.alpha.data.normal_(1, 0.02)
+    self.gamma.data.normal_(1, 0.02)
+    if bias:
+      self.beta = nn.Parameter(torch.zeros(num_features))
+  def forward(self, x):
+    means = torch.mean(x, dim=(2, 3))
+    m = torch.mean(means, dim=-1, keepdim=True)
+    v = torch.var(means, dim=-1, keepdim=True)
+    means = (means - m) / (torch.sqrt(v + 1e-5))
+    h = self.instance_norm(x)
+    if self.bias:
+      h = h + means[..., None, None] * self.alpha[..., None, None]
+      out = self.gamma.view(-1, self.num_features, 1, 1) * h + self.beta.view(-1, self.num_features, 1, 1)
+    else:
+      h = h + means[..., None, None] * self.alpha[..., None, None]
+      out = self.gamma.view(-1, self.num_features, 1, 1) * h
+    return out
+class ConditionalInstanceNorm2dPlus(nn.Module):
+  def __init__(self, num_features, num_classes, bias=True):
+    super().__init__()
+    self.num_features = num_features
+    self.bias = bias
+    self.instance_norm = nn.InstanceNorm2d(num_features, affine=False, track_running_stats=False)
+    if bias:
+      self.embed = nn.Embedding(num_classes, num_features * 3)
+      self.embed.weight.data[:, :2 * num_features].normal_(1, 0.02)  # Initialise scale at N(1, 0.02)
+      self.embed.weight.data[:, 2 * num_features:].zero_()  # Initialise bias at 0
+    else:
+      self.embed = nn.Embedding(num_classes, 2 * num_features)
+      self.embed.weight.data.normal_(1, 0.02)
+  def forward(self, x, y):
+    means = torch.mean(x, dim=(2, 3))
+    m = torch.mean(means, dim=-1, keepdim=True)
+    v = torch.var(means, dim=-1, keepdim=True)
+    means = (means - m) / (torch.sqrt(v + 1e-5))
+    h = self.instance_norm(x)
+    if self.bias:
+      gamma, alpha, beta = self.embed(y).chunk(3, dim=-1)
+      h = h + means[..., None, None] * alpha[..., None, None]
+      out = gamma.view(-1, self.num_features, 1, 1) * h + beta.view(-1, self.num_features, 1, 1)
+    else:
+      gamma, alpha = self.embed(y).chunk(2, dim=-1)
+      h = h + means[..., None, None] * alpha[..., None, None]
+      out = gamma.view(-1, self.num_features, 1, 1) * h
+    return out

geco/backbones/ncsnpp_utils/utils.py ADDED Viewed

	@@ -0,0 +1,189 @@

+# coding=utf-8
+# Copyright 2020 The Google Research Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""All functions and modules related to model definition.
+"""
+import torch
+import numpy as np
+from ...sdes import OUVESDE, OUVPSDE
+_MODELS = {}
+def register_model(cls=None, *, name=None):
+  """A decorator for registering model classes."""
+  def _register(cls):
+    if name is None:
+      local_name = cls.__name__
+    else:
+      local_name = name
+    if local_name in _MODELS:
+      raise ValueError(f'Already registered model with name: {local_name}')
+    _MODELS[local_name] = cls
+    return cls
+  if cls is None:
+    return _register
+  else:
+    return _register(cls)
+def get_model(name):
+  """Return the model class registered under `name`; raises KeyError if there is none."""
+  return _MODELS[name]
+def get_sigmas(sigma_min, sigma_max, num_scales):
+  """Get sigmas --- the set of noise levels for SMLD from config files.
+  Args:
+    config: A ConfigDict object parsed from the config file
+  Returns:
+    sigmas: a jax numpy arrary of noise levels
+  """
+  sigmas = np.exp(
+    np.linspace(np.log(sigma_max), np.log(sigma_min), num_scales))
+  return sigmas
+def get_ddpm_params(config):
+  """Get betas and alphas --- parameters used in the original DDPM paper."""
+  num_diffusion_timesteps = 1000
+  # parameters need to be adapted if number of time steps differs from 1000
+  beta_start = config.model.beta_min / config.model.num_scales
+  beta_end = config.model.beta_max / config.model.num_scales
+  betas = np.linspace(beta_start, beta_end, num_diffusion_timesteps, dtype=np.float64)
+  alphas = 1. - betas
+  alphas_cumprod = np.cumprod(alphas, axis=0)
+  sqrt_alphas_cumprod = np.sqrt(alphas_cumprod)
+  sqrt_1m_alphas_cumprod = np.sqrt(1. - alphas_cumprod)
+  return {
+    'betas': betas,
+    'alphas': alphas,
+    'alphas_cumprod': alphas_cumprod,
+    'sqrt_alphas_cumprod': sqrt_alphas_cumprod,
+    'sqrt_1m_alphas_cumprod': sqrt_1m_alphas_cumprod,
+    'beta_min': beta_start * (num_diffusion_timesteps - 1),
+    'beta_max': beta_end * (num_diffusion_timesteps - 1),
+    'num_diffusion_timesteps': num_diffusion_timesteps
+  }
+def create_model(config):
+  """Create the score model."""
+  model_name = config.model.name
+  score_model = get_model(model_name)(config)
+  score_model = score_model.to(config.device)
+  score_model = torch.nn.DataParallel(score_model)
+  return score_model
+def get_model_fn(model, train=False):
+  """Create a function to give the output of the score-based model.
+  Args:
+    model: The score model.
+    train: `True` for training and `False` for evaluation.
+  Returns:
+    A model function.
+  """
+  def model_fn(x, labels):
+    """Compute the output of the score-based model.
+    Args:
+      x: A mini-batch of input data.
+      labels: A mini-batch of conditioning variables for time steps. Should be interpreted differently
+        for different models.
+    Returns:
+      A tuple of (model output, new mutable states)
+    """
+    if not train:
+      model.eval()
+      return model(x, labels)
+    else:
+      model.train()
+      return model(x, labels)
+  return model_fn
+def get_score_fn(sde, model, train=False, continuous=False):
+  """Wraps `score_fn` so that the model output corresponds to a real time-dependent score function.
+  Args:
+    sde: The forward SDE; must be an `OUVPSDE` or `OUVESDE` instance.
+    model: A score model.
+    train: `True` for training and `False` for evaluation.
+    continuous: If `True`, the score-based model is expected to directly take continuous time steps.
+  Returns:
+    A score function taking `(x, t)`.
+  Raises:
+    NotImplementedError: if `sde` is neither an `OUVPSDE` nor an `OUVESDE`.
+  """
+  model_fn = get_model_fn(model, train=train)
+  if isinstance(sde, OUVPSDE):
+    def score_fn(x, t):
+      # Scale neural network output by standard deviation and flip sign
+      if continuous:
+        # For VP-trained models, t=0 corresponds to the lowest noise level
+        # The maximum value of time embedding is assumed to 999 for
+        # continuously-trained models.
+        labels = t * 999
+        score = model_fn(x, labels)
+        # Second element of marginal_prob is used as the std; presumably one value per batch item — confirm.
+        std = sde.marginal_prob(torch.zeros_like(x), t)[1]
+      else:
+        # For VP-trained models, t=0 corresponds to the lowest noise level
+        labels = t * (sde.N - 1)
+        score = model_fn(x, labels)
+        # Look up the std in the SDE's precomputed table, indexed by the truncated label.
+        std = sde.sqrt_1m_alphas_cumprod.to(labels.device)[labels.long()]
+      # std is indexed as 1-D here and broadcast against a 4-D model output.
+      score = -score / std[:, None, None, None]
+      return score
+  elif isinstance(sde, OUVESDE):
+    def score_fn(x, t):
+      if continuous:
+        # The model is conditioned on the second element of marginal_prob rather than on t.
+        labels = sde.marginal_prob(torch.zeros_like(x), t)[1]
+      else:
+        # For VE-trained models, t=0 corresponds to the highest noise level
+        labels = sde.T - t
+        labels *= sde.N - 1
+        labels = torch.round(labels).long()
+      # The model output is returned as-is (no rescaling in this branch).
+      score = model_fn(x, labels)
+      return score
+  else:
+    raise NotImplementedError(f"SDE class {sde.__class__.__name__} not yet supported.")
+  return score_fn
+def to_flattened_numpy(x):
+  """Flatten a torch tensor `x` and convert it to numpy."""
+  return x.detach().cpu().numpy().reshape((-1,))
+def from_flattened_numpy(x, shape):
+  """Form a torch tensor with the given `shape` from a flattened numpy array `x`."""
+  return torch.from_numpy(x.reshape(shape))

geco/backbones/shared.py ADDED Viewed

	@@ -0,0 +1,123 @@

+import functools
+import numpy as np
+import torch
+import torch.nn as nn
+from geco.util.registry import Registry
+# Registry instance labelled "Backbone"; presumably backbone classes are registered in it by name — verify against geco.util.registry.
+BackboneRegistry = Registry("Backbone")
+class GaussianFourierProjection(nn.Module):
+    """Gaussian random features for encoding time steps."""
+    def __init__(self, embed_dim, scale=16, complex_valued=False):
+        super().__init__()
+        self.complex_valued = complex_valued
+        if not complex_valued:
+            # If the output is real-valued, we concatenate sin+cos of the features to avoid ambiguities.
+            # Therefore, in this case the effective embed_dim is cut in half. For the complex-valued case,
+            # we use complex numbers which each represent sin+cos directly, so the ambiguity is avoided directly,
+            # and this halving is not necessary.
+            embed_dim = embed_dim // 2
+        # Randomly sample weights during initialization. These weights are fixed
+        # during optimization and are not trainable.
+        self.W = nn.Parameter(torch.randn(embed_dim) * scale, requires_grad=False)
+    def forward(self, t):
+        t_proj = t[:, None] * self.W[None, :] * 2*np.pi
+        if self.complex_valued:
+            return torch.exp(1j * t_proj)
+        else:
+            return torch.cat([torch.sin(t_proj), torch.cos(t_proj)], dim=-1)
+class DiffusionStepEmbedding(nn.Module):
+    """Diffusion-Step embedding as in DiffWave / Vaswani et al. 2017."""
+    def __init__(self, embed_dim, complex_valued=False):
+        super().__init__()
+        self.complex_valued = complex_valued
+        if not complex_valued:
+            # If the output is real-valued, we concatenate sin+cos of the features to avoid ambiguities.
+            # Therefore, in this case the effective embed_dim is cut in half. For the complex-valued case,
+            # we use complex numbers which each represent sin+cos directly, so the ambiguity is avoided directly,
+            # and this halving is not necessary.
+            embed_dim = embed_dim // 2
+        self.embed_dim = embed_dim
+    def forward(self, t):
+        fac = 10**(4*torch.arange(self.embed_dim, device=t.device) / (self.embed_dim-1))
+        inner = t[:, None] * fac[None, :]
+        if self.complex_valued:
+            return torch.exp(1j * inner)
+        else:
+            return torch.cat([torch.sin(inner), torch.cos(inner)], dim=-1)
+class ComplexLinear(nn.Module):
+    """A potentially complex-valued linear layer. Reduces to a regular linear layer if `complex_valued=False`."""
+    def __init__(self, input_dim, output_dim, complex_valued):
+        super().__init__()
+        self.complex_valued = complex_valued
+        if self.complex_valued:
+            self.re = nn.Linear(input_dim, output_dim)
+            self.im = nn.Linear(input_dim, output_dim)
+        else:
+            self.lin = nn.Linear(input_dim, output_dim)
+    def forward(self, x):
+        if self.complex_valued:
+            return (self.re(x.real) - self.im(x.imag)) + 1j*(self.re(x.imag) + self.im(x.real))
+        else:
+            return self.lin(x)
+class FeatureMapDense(nn.Module):
+    """A fully connected layer that reshapes outputs to feature maps."""
+    def __init__(self, input_dim, output_dim, complex_valued=False):
+        super().__init__()
+        self.complex_valued = complex_valued
+        self.dense = ComplexLinear(input_dim, output_dim, complex_valued=complex_valued)
+    def forward(self, x):
+        return self.dense(x)[..., None, None]
+def torch_complex_from_reim(re, im):
+    return torch.view_as_complex(torch.stack([re, im], dim=-1))
+class ArgsComplexMultiplicationWrapper(nn.Module):
+    """Adapted from `asteroid`'s `complex_nn.py`, allowing args/kwargs to be passed through forward().
+    Make a complex-valued module `F` from a real-valued module `f` by applying
+    complex multiplication rules:
+    F(a + i b) = f1(a) - f2(b) + i (f1(b) + f2(a))
+    where `f1` (`re_module`) and `f2` (`im_module`) are instances of `f` that do *not* share weights.
+    Args:
+        module_cls (callable): A class or function that returns a Torch module/functional.
+            Constructor of `f` in the formula above.  Called 2x with `*args`, `**kwargs`,
+            to construct the real and imaginary component modules.
+    """
+    def __init__(self, module_cls, *args, **kwargs):
+        super().__init__()
+        # Two independent instances built from the same constructor arguments.
+        self.re_module = module_cls(*args, **kwargs)
+        self.im_module = module_cls(*args, **kwargs)
+    def forward(self, x, *args, **kwargs):
+        """Apply the wrapped modules to complex `x`; extra args/kwargs go to every sub-call."""
+        return torch_complex_from_reim(
+            # real part: f1(a) - f2(b)
+            self.re_module(x.real, *args, **kwargs) - self.im_module(x.imag, *args, **kwargs),
+            # imaginary part: f1(b) + f2(a)
+            self.re_module(x.imag, *args, **kwargs) + self.im_module(x.real, *args, **kwargs),
+        )
+# Complex-valued convolution constructors: each call builds an
+# ArgsComplexMultiplicationWrapper around nn.Conv2d / nn.ConvTranspose2d, forwarding
+# the remaining constructor arguments to both the real and the imaginary module.
+ComplexConv2d = functools.partial(ArgsComplexMultiplicationWrapper, nn.Conv2d)
+ComplexConvTranspose2d = functools.partial(ArgsComplexMultiplicationWrapper, nn.ConvTranspose2d)

geco/data_module.py ADDED Viewed

	@@ -0,0 +1,258 @@

+import os
+from os.path import join
+import torch
+import pytorch_lightning as pl
+from torch.utils.data import Dataset
+from torch.utils.data import DataLoader
+from glob import glob
+import numpy as np
+import torch.nn.functional as F
+import torchaudio
+def get_window(window_type, window_length):
+    if window_type == 'sqrthann':
+        return torch.sqrt(torch.hann_window(window_length, periodic=True))
+    elif window_type == 'hann':
+        return torch.hann_window(window_length, periodic=True)
+    else:
+        raise NotImplementedError(f"Window type {window_type} not implemented!")
+class Specs(Dataset):
+    def __init__(self, data_dir, dummy, shuffle_spec, num_frames, sampling_rate=8000,
+            format='default', normalize="noisy", spec_transform=None,
+            stft_kwargs=None, **ignored_kwargs):
+        # Read file paths according to file naming format.
+        if format == "default":
+            noisy_files1 = sorted(glob(os.path.join(data_dir, '*_source1hatP.wav')))
+            clean_files1 = [item.replace('_source1hatP.wav', '_source1.wav') for item in noisy_files1]
+            mixture_files1 = [item.replace('_source1hatP.wav', '_mix.wav') for item in noisy_files1]
+            noisy_files2 = sorted(glob(os.path.join(data_dir, '*_source2hatP.wav')))
+            clean_files2 = [item.replace('_source2hatP.wav', '_source2.wav') for item in noisy_files2]
+            mixture_files2 = [item.replace('_source2hatP.wav', '_mix.wav') for item in noisy_files2]
+            self.mixture_files = [*mixture_files1,*mixture_files2]
+            self.noisy_files = [*noisy_files1,*noisy_files2]
+            self.clean_files = [*clean_files1,*clean_files2]
+        else:
+            # Feel free to add your own directory format
+            raise NotImplementedError(f"Directory format {format} unknown!")
+        self.dummy = dummy
+        self.num_frames = num_frames
+        self.shuffle_spec = shuffle_spec
+        self.normalize = normalize
+        self.spec_transform = spec_transform
+        self.sampling_rate = sampling_rate
+        assert all(k in stft_kwargs.keys() for k in ["n_fft", "hop_length", "center", "window"]), "misconfigured STFT kwargs"
+        self.stft_kwargs = stft_kwargs
+        self.hop_length = self.stft_kwargs["hop_length"]
+        assert self.stft_kwargs.get("center", None) == True, "'center' must be True for current implementation"
+    def __getitem__(self, i):
+        x, sr = torchaudio.load(self.clean_files[i])
+        if sr != self.sampling_rate:
+            x = torchaudio.transforms.Resample(sr, self.sampling_rate)(x)
+        y, sr = torchaudio.load(self.noisy_files[i])
+        if sr != self.sampling_rate:
+            y = torchaudio.transforms.Resample(sr, self.sampling_rate)(y)
+        m, sr = torchaudio.load(self.mixture_files[i])
+        if sr != self.sampling_rate:
+            m = torchaudio.transforms.Resample(sr, self.sampling_rate)(m)
+        min_leng = min(x.shape[-1],y.shape[-1],m.shape[-1])
+        x = x[...,:min_leng]
+        y = y[...,:min_leng]
+        m = m[...,:min_leng]
+        # formula applies for center=True
+        target_len = (self.num_frames - 1) * self.hop_length
+        current_len = x.size(-1)
+        pad = max(target_len - current_len, 0)
+        if pad == 0:
+            # extract random part of the audio file
+            if self.shuffle_spec:
+                start = int(np.random.uniform(0, current_len-target_len))
+            else:
+                start = int((current_len-target_len)/2)
+            if y[..., start:start+target_len].abs().max() < 0.05:
+                start = 0
+            x = x[..., start:start+target_len]
+            y = y[..., start:start+target_len]
+            m = m[..., start:start+target_len]
+        else:
+            # pad audio if the length T is smaller than num_frames
+            x = F.pad(x, (pad//2, pad//2+(pad%2)), mode='constant')
+            y = F.pad(y, (pad//2, pad//2+(pad%2)), mode='constant')
+            m = F.pad(m, (pad//2, pad//2+(pad%2)), mode='constant')
+        # normalize w.r.t to the noisy or the clean signal or not at all
+        # to ensure same clean signal power in x and y.
+        if self.normalize == "noisy":
+            normfac = y.abs().max()
+        elif self.normalize == "clean":
+            normfac = x.abs().max()
+        elif self.normalize == "not":
+            normfac = 1.0
+        x = x / normfac
+        y = y / normfac
+        m = m / normfac
+        X = torch.stft(x, **self.stft_kwargs)
+        Y = torch.stft(y, **self.stft_kwargs)
+        M = torch.stft(m, **self.stft_kwargs)
+        X, Y, M = self.spec_transform(X), self.spec_transform(Y), self.spec_transform(M)
+        return X, Y, M
+    def __len__(self):
+        if self.dummy:
+            # for debugging shrink the data set size
+            return int(len(self.clean_files)/200)
+        else:
+            return len(self.clean_files)
+class SpecsDataModule(pl.LightningDataModule):
+    @staticmethod
+    def add_argparse_args(parser):
+        parser.add_argument("--train_dir", type=str, default='/export/corpora7/HW/speechbrain/recipes/LibriMix/separation/2025/save/libri2mix-train100')
+        parser.add_argument("--val_dir", type=str, default='/export/corpora7/HW/speechbrain/recipes/LibriMix/separation/2025/save/libri2mix-dev')
+        parser.add_argument("--test_dir", type=str, default='/export/corpora7/HW/speechbrain/recipes/LibriMix/separation/2025/save/libri2mix-test')
+        parser.add_argument("--format", type=str, default="default", help="Read file paths according to file naming format.")
+        parser.add_argument("--sampling_rate", type=int, default=8000, help="The sampling rate.")
+        parser.add_argument("--batch_size", type=int, default=16, help="The batch size. 8 by default.")
+        parser.add_argument("--n_fft", type=int, default=510, help="Number of FFT bins. 510 by default.")   # to assure 128 freq bins
+        parser.add_argument("--hop_length", type=int, default=64, help="Window hop length. 128 by default.")
+        parser.add_argument("--num_frames", type=int, default=256, help="Number of frames for the dataset. 256 by default.")
+        parser.add_argument("--window", type=str, choices=("sqrthann", "hann"), default="hann", help="The window function to use for the STFT. 'hann' by default.")
+        parser.add_argument("--num_workers", type=int, default=8, help="Number of workers to use for DataLoaders. 4 by default.")
+        parser.add_argument("--dummy", action="store_true", help="Use reduced dummy dataset for prototyping.")
+        parser.add_argument("--spec_factor", type=float, default=0.15, help="Factor to multiply complex STFT coefficients by. 0.15 by default.")
+        parser.add_argument("--spec_abs_exponent", type=float, default=0.5, help="Exponent e for the transformation abs(z)**e * exp(1j*angle(z)). 0.5 by default.")
+        parser.add_argument("--normalize", type=str, choices=("clean", "noisy", "not"), default="noisy", help="Normalize the input waveforms by the clean signal, the noisy signal, or not at all.")
+        parser.add_argument("--transform_type", type=str, choices=("exponent", "log", "none"), default="exponent", help="Spectogram transformation for input representation.")
+        return parser
+    def __init__(
+        self, train_dir, val_dir, test_dir, format='default', sampling_rate=8000, batch_size=8,
+        n_fft=510, hop_length=64, num_frames=256, window='hann',
+        num_workers=4, dummy=False, spec_factor=0.15, spec_abs_exponent=0.5,
+        gpu=True, normalize='noisy', transform_type="exponent", **kwargs
+    ):
+        super().__init__()
+        self.train_dir = train_dir
+        self.val_dir = val_dir
+        self.test_dir = test_dir
+        self.format = format
+        self.sampling_rate = sampling_rate
+        self.batch_size = batch_size
+        self.n_fft = n_fft
+        self.hop_length = hop_length
+        self.num_frames = num_frames
+        self.window = get_window(window, self.n_fft)
+        self.windows = {}
+        self.num_workers = num_workers
+        self.dummy = dummy
+        self.spec_factor = spec_factor
+        self.spec_abs_exponent = spec_abs_exponent
+        self.gpu = gpu
+        self.normalize = normalize
+        self.transform_type = transform_type
+        self.kwargs = kwargs
+    def setup(self, stage=None):
+        specs_kwargs = dict(
+            stft_kwargs=self.stft_kwargs, num_frames=self.num_frames,
+            spec_transform=self.spec_fwd, **self.kwargs
+        )
+        if stage == 'fit' or stage is None:
+            self.train_set = Specs(data_dir=self.train_dir,
+                dummy=self.dummy, shuffle_spec=True, format=self.format,
+                normalize=self.normalize, sampling_rate=self.sampling_rate, **specs_kwargs)
+            self.valid_set = Specs(data_dir=self.val_dir,
+                dummy=self.dummy, shuffle_spec=False, format=self.format,
+                normalize=self.normalize, sampling_rate=self.sampling_rate, **specs_kwargs)
+        if stage == 'test' or stage is None:
+            self.test_set = Specs(data_dir=self.test_dir,
+                dummy=self.dummy, shuffle_spec=False, format=self.format,
+                normalize=self.normalize, sampling_rate=self.sampling_rate, **specs_kwargs)
+    def spec_fwd(self, spec):
+        if self.transform_type == "exponent":
+            if self.spec_abs_exponent != 1:
+                # only do this calculation if spec_exponent != 1, otherwise it's quite a bit of wasted computation
+                # and introduced numerical error
+                e = self.spec_abs_exponent
+                spec = spec.abs()**e * torch.exp(1j * spec.angle())
+            spec = spec * self.spec_factor
+        elif self.transform_type == "log":
+            spec = torch.log(1 + spec.abs()) * torch.exp(1j * spec.angle())
+            spec = spec * self.spec_factor
+        elif self.transform_type == "none":
+            spec = spec
+        return spec
+    def spec_back(self, spec):
+        if self.transform_type == "exponent":
+            spec = spec / self.spec_factor
+            if self.spec_abs_exponent != 1:
+                e = self.spec_abs_exponent
+                spec = spec.abs()**(1/e) * torch.exp(1j * spec.angle())
+        elif self.transform_type == "log":
+            spec = spec / self.spec_factor
+            spec = (torch.exp(spec.abs()) - 1) * torch.exp(1j * spec.angle())
+        elif self.transform_type == "none":
+            spec = spec
+        return spec
+    @property
+    def stft_kwargs(self):
+        return {**self.istft_kwargs, "return_complex": True}
+    @property
+    def istft_kwargs(self):
+        return dict(
+            n_fft=self.n_fft, hop_length=self.hop_length,
+            window=self.window, center=True
+        )
+    def _get_window(self, x):
+        """
+        Retrieve an appropriate window for the given tensor x, matching the device.
+        Caches the retrieved windows so that only one window tensor will be allocated per device.
+        """
+        window = self.windows.get(x.device, None)
+        if window is None:
+            window = self.window.to(x.device)
+            self.windows[x.device] = window
+        return window
+    def stft(self, sig):
+        window = self._get_window(sig)
+        return torch.stft(sig, **{**self.stft_kwargs, "window": window})
+    def istft(self, spec, length=None):
+        window = self._get_window(spec)
+        return torch.istft(spec, **{**self.istft_kwargs, "window": window, "length": length})
+    def train_dataloader(self):
+        return DataLoader(
+            self.train_set, batch_size=self.batch_size,
+            num_workers=self.num_workers, pin_memory=self.gpu, shuffle=True
+        )
+    def val_dataloader(self):
+        return DataLoader(
+            self.valid_set, batch_size=self.batch_size,
+            num_workers=self.num_workers, pin_memory=self.gpu, shuffle=False
+        )
+    def test_dataloader(self):
+        return DataLoader(
+            self.test_set, batch_size=self.batch_size,
+            num_workers=self.num_workers, pin_memory=self.gpu, shuffle=False
+        )

geco/model.py ADDED Viewed

	@@ -0,0 +1,255 @@

+import time
+from math import ceil
+import warnings
+import numpy as np
+import torch
+import pytorch_lightning as pl
+from torch_ema import ExponentialMovingAverage
+import torch.nn.functional as F
+from geco import sampling
+from geco.sdes import SDERegistry
+from geco.backbones import BackboneRegistry
+from geco.util.inference import evaluate_model
+from geco.util.other import pad_spec
+import numpy as np
+import matplotlib.pyplot as plt
+class ScoreModel(pl.LightningModule):
+    @staticmethod
+    def add_argparse_args(parser):
+        parser.add_argument("--lr", type=float, default=1e-4, help="The learning rate (1e-4 by default)")
+        parser.add_argument("--ema_decay", type=float, default=0.999, help="The parameter EMA decay constant (0.999 by default)")
+        parser.add_argument("--t_eps", type=float, default=0.03, help="The minimum time (3e-2 by default)")
+        parser.add_argument("--num_eval_files", type=int, default=20, help="Number of files for speech enhancement performance evaluation during training. Pass 0 to turn off (no checkpoints based on evaluation metrics will be generated).")
+        parser.add_argument("--loss_type", type=str, default="mse", help="The type of loss function to use.")
+        parser.add_argument("--loss_abs_exponent", type=float, default=0.5,  help="magnitude transformation in the loss term")
+        return parser
+    def __init__(
+        self, backbone, sde, lr=1e-4, ema_decay=0.999, t_eps=3e-2, loss_abs_exponent=0.5,
+        num_eval_files=20, loss_type='mse', data_module_cls=None, **kwargs
+    ):
+        """
+        Create a new ScoreModel.
+        Args:
+            backbone: Backbone DNN that serves as a score-based model.
+            sde: The SDE that defines the diffusion process.
+            lr: The learning rate of the optimizer. (1e-4 by default).
+            ema_decay: The decay constant of the parameter EMA (0.999 by default).
+            t_eps: The minimum time to practically run for to avoid issues very close to zero (1e-5 by default).
+            loss_type: The type of loss to use (wrt. noise z/std). Options are 'mse' (default), 'mae'
+        """
+        super().__init__()
+        # Initialize Backbone DNN
+        dnn_cls = BackboneRegistry.get_by_name(backbone)
+        self.dnn = dnn_cls(**kwargs)
+        # Initialize SDE
+        if sde == 'bbve':
+            #change parameters, if the old class bbve is used. Needed for loading the provided checkpoint
+            #as that checkpoint was trained with the old class.
+            sde = 'bbed'
+            kwargs['k'] = kwargs['sigma_max']
+            del kwargs['sigma_max']
+            del kwargs['sigma_min']
+        sde_cls = SDERegistry.get_by_name(sde)
+        self.sde = sde_cls(**kwargs)
+        # Store hyperparams and save them
+        self.lr = lr
+        self.ema_decay = ema_decay
+        self.ema = ExponentialMovingAverage(self.parameters(), decay=self.ema_decay)
+        self._error_loading_ema = False
+        self.t_eps = t_eps
+        self.loss_type = loss_type
+        self.num_eval_files = num_eval_files
+        self.loss_abs_exponent = loss_abs_exponent
+        self.save_hyperparameters(ignore=['no_wandb'])
+        self.data_module = data_module_cls(**kwargs, gpu=kwargs.get('gpus', 0) > 0)
+    def configure_optimizers(self):
+        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
+        return optimizer
+    def optimizer_step(self, *args, **kwargs):
+        # Method overridden so that the EMA params are updated after each optimizer step
+        super().optimizer_step(*args, **kwargs)
+        self.ema.update(self.parameters())
+    # on_load_checkpoint / on_save_checkpoint needed for EMA storing/loading
+    def on_load_checkpoint(self, checkpoint):
+        ema = checkpoint.get('ema', None)
+        if ema is not None:
+            self.ema.load_state_dict(checkpoint['ema'])
+        else:
+            self._error_loading_ema = True
+            warnings.warn("EMA state_dict not found in checkpoint!")
+    def on_save_checkpoint(self, checkpoint):
+        checkpoint['ema'] = self.ema.state_dict()
+    def train(self, mode, no_ema=False):
+        res = super().train(mode)  # call the standard `train` method with the given mode
+        if not self._error_loading_ema:
+            if mode == False and not no_ema:
+                # eval
+                self.ema.store(self.parameters())        # store current params in EMA
+                self.ema.copy_to(self.parameters())      # copy EMA parameters over current params for evaluation
+            else:
+                # train
+                if self.ema.collected_params is not None:
+                    self.ema.restore(self.parameters())  # restore the EMA weights (if stored)
+        return res
+    def eval(self, no_ema=False):
+        return self.train(False, no_ema=no_ema)
+    def _loss(self, score, sigmas, z):
+        if self.loss_type == 'mse':
+            err = sigmas*score + z
+            losses = torch.square(err.abs())
+        elif self.loss_type == 'mae':
+            losses = err.abs()
+        # taken from reduce_op function: sum over channels and position and mean over batch dim
+        # presumably only important for absolute loss number, not for gradients
+        loss = torch.mean(0.5*torch.sum(losses.reshape(losses.shape[0], -1), dim=-1))
+        return loss
+    def _step(self, batch, batch_idx):
+        x, y, m = batch
+        rdm = torch.rand(x.shape[0], device=x.device) * (self.sde.T - self.t_eps) + self.t_eps
+        t = torch.min(rdm, torch.tensor(self.sde.T))
+        mean, std = self.sde.marginal_prob(x, t, y)
+        z = torch.randn_like(x)  #
+        sigmas = std[:, None, None, None]
+        perturbed_data = mean + sigmas * z
+        score = self(perturbed_data, t, y, m)
+        loss = self._loss(score, sigmas, z)
+        return loss
+    def training_step(self, batch, batch_idx):
+        loss = self._step(batch, batch_idx)
+        self.log('train_loss', loss, on_step=True, on_epoch=True)
+        return loss
+    def validation_step(self, batch, batch_idx):
+        loss = self._step(batch, batch_idx)
+        self.log('valid_loss', loss, on_step=False, on_epoch=True)
+        # Evaluate speech enhancement performance
+        if batch_idx == 0 and self.num_eval_files != 0:
+            pesq, si_sdr, estoi = evaluate_model(self, self.num_eval_files)
+            self.log('pesq', pesq, on_step=False, on_epoch=True)
+            self.log('si_sdr', si_sdr, on_step=False, on_epoch=True)
+            self.log('estoi', estoi, on_step=False, on_epoch=True)
+        return loss
+    def forward(self, x, t, y, m):
+        # Concatenate y as an extra channel
+        dnn_input = torch.cat([x, y, m], dim=1)
+        # print(dnn_input.shape)
+        # the minus is most likely unimportant here - taken from Song's repo
+        score = -self.dnn(dnn_input, t)
+        return score
+    def to(self, *args, **kwargs):
+        """Override PyTorch .to() to also transfer the EMA of the model weights"""
+        self.ema.to(*args, **kwargs)
+        return super().to(*args, **kwargs)
+    def get_pc_sampler(self, predictor_name, corrector_name, y, m, Y_prior=None, N=None, minibatch=None, timestep_type=None, **kwargs):
+        N = self.sde.N if N is None else N
+        sde = self.sde.copy()
+        sde.N = N
+        kwargs = {"eps": self.t_eps, **kwargs}
+        if minibatch is None:
+            return sampling.get_pc_sampler(predictor_name, corrector_name, sde=sde, score_fn=self, Y=y, M=m, Y_prior=Y_prior, timestep_type=timestep_type, **kwargs)
+        else:
+            M = y.shape[0]
+            def batched_sampling_fn():
+                samples, ns = [], []
+                for i in range(int(ceil(M / minibatch))):
+                    y_mini = y[i*minibatch:(i+1)*minibatch]
+                    y_prior_mini = Y_prior[i*minibatch:(i+1)*minibatch]
+                    sampler = sampling.get_pc_sampler(predictor_name, corrector_name, sde=sde, score_fn=self, Y=y_mini, M=m, y_prior=y_prior_mini, **kwargs)
+                    sample, n = sampler()
+                    samples.append(sample)
+                    ns.append(n)
+                samples = torch.cat(samples, dim=0)
+                return samples, ns
+            return batched_sampling_fn
+    def train_dataloader(self):
+        return self.data_module.train_dataloader()
+    def val_dataloader(self):
+        return self.data_module.val_dataloader()
+    def test_dataloader(self):
+        return self.data_module.test_dataloader()
+    def setup(self, stage=None):
+        return self.data_module.setup(stage=stage)
+    def to_audio(self, spec, length=None):
+        return self._istft(self._backward_transform(spec), length)
+    def _forward_transform(self, spec):
+        return self.data_module.spec_fwd(spec)
+    def _backward_transform(self, spec):
+        return self.data_module.spec_back(spec)
+    def _stft(self, sig):
+        return self.data_module.stft(sig)
+    def _istft(self, spec, length=None):
+        return self.data_module.istft(spec, length)
+    def enhance(self, y, m, sampler_type="pc", predictor="reverse_diffusion",
+        corrector="ald", N=30, corrector_steps=1, snr=0.5, timeit=False,
+        **kwargs
+    ):
+        """
+        One-call speech enhancement of noisy speech `y`, for convenience.
+        """
+        sr=8000
+        start = time.time()
+        T_orig = y.size(1)
+        norm_factor = y.abs().max().item()
+        y = y / norm_factor
+        m = m / norm_factor
+        Y = torch.unsqueeze(self._forward_transform(self._stft(y.cuda())), 0)
+        Y = pad_spec(Y)
+        M = torch.unsqueeze(self._forward_transform(self._stft(m.cuda())), 0)
+        M = pad_spec(M)
+        if sampler_type == "pc":
+            sampler = self.get_pc_sampler(predictor, corrector, Y.cuda(), M.cuda(), N=N,
+                corrector_steps=corrector_steps, snr=snr, intermediate=False,
+                **kwargs)
+        else:
+            print("{} is not a valid sampler type!".format(sampler_type))
+        sample, nfe = sampler()
+        sample = sample.squeeze()
+        x_hat = self.to_audio(sample)
+        x_hat = x_hat * norm_factor
+        x_hat = x_hat.squeeze().cpu().numpy()
+        end = time.time()
+        if timeit:
+            rtf = (end-start)/(len(x_hat)/sr)
+            return x_hat, nfe, rtf
+        else:
+            return x_hat

geco/sampling/__init__.py ADDED Viewed

	@@ -0,0 +1,90 @@

+"""Various sampling methods."""
+from scipy import integrate
+import torch
+from .predictors import Predictor, PredictorRegistry, ReverseDiffusionPredictor
+from .correctors import Corrector, CorrectorRegistry
+import numpy as np
+import matplotlib.pyplot as plt
+__all__ = [
+    'PredictorRegistry', 'CorrectorRegistry', 'Predictor', 'Corrector',
+    'get_sampler'
+]
+def to_flattened_numpy(x):
+    """Flatten a torch tensor `x` and convert it to numpy."""
+    return x.detach().cpu().numpy().reshape((-1,))
+def from_flattened_numpy(x, shape):
+    """Form a torch tensor with the given `shape` from a flattened numpy array `x`."""
+    return torch.from_numpy(x.reshape(shape))
+def get_pc_sampler(
+    predictor_name, corrector_name, sde, score_fn, Y, M, Y_prior=None,
+    denoise=True, eps=3e-2, snr=0.1, corrector_steps=1, probability_flow: bool = False,
+    intermediate=False, timestep_type=None, **kwargs
+):
+    """Create a Predictor-Corrector (PC) sampler.
+    Args:
+        predictor_name: The name of a registered `sampling.Predictor`.
+        corrector_name: The name of a registered `sampling.Corrector`.
+        sde: An `sdes.SDE` object representing the forward SDE.
+        score_fn: A function (typically learned model) that predicts the score.
+        y: A `torch.Tensor`, representing the (non-white-)noisy starting point(s) to condition the prior on.
+        denoise: If `True`, add one-step denoising to the final samples.
+        eps: A `float` number. The reverse-time SDE and ODE are integrated to `epsilon` to avoid numerical issues.
+        snr: The SNR to use for the corrector. 0.1 by default, and ignored for `NoneCorrector`.
+        N: The number of reverse sampling steps. If `None`, uses the SDE's `N` property by default.
+    Returns:
+        A sampling function that returns samples and the number of function evaluations during sampling.
+    """
+    predictor_cls = PredictorRegistry.get_by_name(predictor_name)
+    corrector_cls = CorrectorRegistry.get_by_name(corrector_name)
+    predictor = predictor_cls(sde, score_fn, probability_flow=probability_flow)
+    corrector = corrector_cls(sde, score_fn, snr=snr, n_steps=corrector_steps)
+    def pc_sampler(Y_prior=Y_prior, timestep_type=timestep_type):
+        """The PC sampler function."""
+        with torch.no_grad():
+            if Y_prior == None:
+                Y_prior = Y
+            xt, _ = sde.prior_sampling(Y_prior.shape, Y_prior)
+            timesteps = timesteps_space(sde.T, sde.N,eps, Y.device, type=timestep_type)
+            xt = xt.to(Y_prior.device)
+            for i in range(len(timesteps)):
+                t = timesteps[i]
+                if i != len(timesteps) - 1:
+                    stepsize = t - timesteps[i+1]
+                else:
+                    stepsize = timesteps[-1]
+                vec_t = torch.ones(Y.shape[0], device=Y.device) * t
+                xt, xt_mean = corrector.update_fn(xt, vec_t, Y, M)
+                xt, xt_mean = predictor.update_fn(xt, vec_t, Y, M, stepsize)
+            x_result = xt_mean if denoise else xt
+            ns = len(timesteps) * (corrector.n_steps + 1)
+            return x_result, ns
+    if intermediate:
+        return pc_sampler_intermediate
+    else:
+        return pc_sampler
+def timesteps_space(sdeT, sdeN,  eps, device, type='linear'):
+    timesteps = torch.linspace(sdeT, eps, sdeN, device=device)
+    if type == 'linear':
+        return timesteps
+    else:
+        pass #not used, can be used to implement different sampling schedules
+    return timesteps

geco/sampling/correctors.py ADDED Viewed

	@@ -0,0 +1,60 @@

+import abc
+import torch
+from geco import sdes
+from geco.util.registry import Registry
+# Registry of corrector classes: classes are added with `@CorrectorRegistry.register(...)`.
+CorrectorRegistry = Registry("Corrector")
+class Corrector(abc.ABC):
+    """The abstract class for a corrector algorithm."""
+    def __init__(self, sde, score_fn, snr, n_steps):
+        super().__init__()
+        self.rsde = sde.reverse(score_fn)
+        self.score_fn = score_fn
+        self.snr = snr
+        self.n_steps = n_steps
+    @abc.abstractmethod
+    def update_fn(self, x, t, *args):
+        """One update of the corrector.
+        Args:
+            x: A PyTorch tensor representing the current state
+            t: A PyTorch tensor representing the current time step.
+            *args: Possibly additional arguments, in particular `y` for OU processes
+        Returns:
+            x: A PyTorch tensor of the next state.
+            x_mean: A PyTorch tensor. The next state without random noise. Useful for denoising.
+        """
+        pass
+@CorrectorRegistry.register(name='ald')
+class AnnealedLangevinDynamics(Corrector):
+    """The original annealed Langevin dynamics predictor in NCSN/NCSNv2."""
+    def __init__(self, sde, score_fn, snr, n_steps):
+        super().__init__(sde, score_fn, snr, n_steps)
+        self.sde = sde
+        self.score_fn = score_fn
+        self.snr = snr
+        self.n_steps = n_steps
+    def update_fn(self, x, t, y, m):
+        x_mean = 0
+        n_steps = self.n_steps
+        target_snr = self.snr
+        std = self.sde.marginal_prob(x, t, y)[1]
+        for _ in range(n_steps):
+            # print(x.shape, y.shape,m.shape)
+            grad = self.score_fn(x, t, y, m)
+            noise = torch.randn_like(x)
+            step_size = (target_snr * std) ** 2 * 2
+            x_mean = x + step_size[:, None, None, None] * grad
+            x = x_mean + noise * torch.sqrt(step_size * 2)[:, None, None, None]
+        return x, x_mean

geco/sampling/predictors.py ADDED Viewed

	@@ -0,0 +1,55 @@

+import abc
+import torch
+import numpy as np
+from geco.util.registry import Registry
+# Registry of predictor classes: classes are added with `@PredictorRegistry.register(...)`.
+PredictorRegistry = Registry("Predictor")
+class Predictor(abc.ABC):
+    """The abstract class for a predictor algorithm."""
+    def __init__(self, sde, score_fn, probability_flow=False):
+        super().__init__()
+        self.sde = sde
+        self.rsde = sde.reverse(score_fn)
+        self.score_fn = score_fn
+        self.probability_flow = probability_flow
+    @abc.abstractmethod
+    def update_fn(self, x, t, *args):
+        """One update of the predictor.
+        Args:
+            x: A PyTorch tensor representing the current state
+            t: A Pytorch tensor representing the current time step.
+            *args: Possibly additional arguments, in particular `y` for OU processes
+        Returns:
+            x: A PyTorch tensor of the next state.
+            x_mean: A PyTorch tensor. The next state without random noise. Useful for denoising.
+        """
+        pass
+    def debug_update_fn(self, x, t, *args):
+        raise NotImplementedError(f"Debug update function not implemented for predictor {self}.")
+@PredictorRegistry.register('reverse_diffusion')
+class ReverseDiffusionPredictor(Predictor):
+    def __init__(self, sde, score_fn, probability_flow=False):
+        super().__init__(sde, score_fn, probability_flow=probability_flow)
+    def update_fn(self, x, t, y, m, stepsize):
+        f, g = self.rsde.discretize(x, t, y, m, stepsize)
+        z = torch.randn_like(x)
+        x_mean = x - f
+        x = x_mean + g[:, None, None, None] * z
+        return x, x_mean
+    def update_fn_analyze(self, x, t, *args):
+        raise NotImplementedError("update_fn_analyze() has not been implemented yet for the ReverseDiffusionPredictor")

geco/sdes.py ADDED Viewed

	@@ -0,0 +1,205 @@

+"""
+Abstract SDE classes, Reverse SDE, and VE/VP SDEs.
+Taken and adapted from https://github.com/yang-song/score_sde_pytorch/blob/1618ddea340f3e4a2ed7852a0694a809775cf8d0/sde_lib.py
+"""
+import abc
+import warnings
+import math
+import scipy.special as sc
+import numpy as np
+from geco.util.tensors import batch_broadcast
+import torch
+from geco.util.registry import Registry
+# Registry of SDE classes: classes are added with `@SDERegistry.register(...)`.
+SDERegistry = Registry("SDE")
+class SDE(abc.ABC):
+    """SDE abstract class. Functions are designed for a mini-batch of inputs."""
+    def __init__(self, N):
+        """Construct an SDE.
+        Args:
+            N: number of discretization time steps.
+        """
+        super().__init__()
+        self.N = N
+    @property
+    @abc.abstractmethod
+    def T(self):
+        """End time of the SDE."""
+        pass
+    @abc.abstractmethod
+    def sde(self, x, t, *args):
+        pass
+    @abc.abstractmethod
+    def marginal_prob(self, x, t, *args):
+        """Parameters to determine the marginal distribution of the SDE, $p_t(x|args)$."""
+        pass
+    @abc.abstractmethod
+    def prior_sampling(self, shape, *args):
+        """Generate one sample from the prior distribution, $p_T(x|args)$ with shape `shape`."""
+        pass
+    @abc.abstractmethod
+    def prior_logp(self, z):
+        """Compute log-density of the prior distribution.
+        Useful for computing the log-likelihood via probability flow ODE.
+        Args:
+            z: latent code
+        Returns:
+            log probability density
+        """
+        pass
+    @staticmethod
+    @abc.abstractmethod
+    def add_argparse_args(parent_parser):
+        """
+        Add the necessary arguments for instantiation of this SDE class to an argparse ArgumentParser.
+        """
+        pass
+    def discretize(self, x, t, y, stepsize):
+        """Discretize the SDE in the form: x_{i+1} = x_i + f_i(x_i) + G_i z_i.
+        Useful for reverse diffusion sampling and probabiliy flow sampling.
+        Defaults to Euler-Maruyama discretization.
+        Args:
+            x: a torch tensor
+            t: a torch float representing the time step (from 0 to `self.T`)
+        Returns:
+            f, G
+        """
+        dt = stepsize
+        #dt = 1 /self.N
+        drift, diffusion = self.sde(x, t, y)
+        f = drift * dt
+        G = diffusion * torch.sqrt(torch.tensor(dt, device=t.device))
+        return f, G
+    def reverse(oself, score_model, probability_flow=False):
+        """Create the reverse-time SDE/ODE.
+        Args:
+            score_model: A function that takes x, t and y and returns the score.
+            probability_flow: If `True`, create the reverse-time ODE used for probability flow sampling.
+        """
+        N = oself.N
+        T = oself.T
+        sde_fn = oself.sde
+        discretize_fn = oself.discretize
+        # Build the class for reverse-time SDE.
+        class RSDE(oself.__class__):
+            def __init__(self):
+                self.N = N
+                self.probability_flow = probability_flow
+            @property
+            def T(self):
+                return T
+            def sde(self, x, t, *args):
+                """Create the drift and diffusion functions for the reverse SDE/ODE."""
+                rsde_parts = self.rsde_parts(x, t, *args)
+                total_drift, diffusion = rsde_parts["total_drift"], rsde_parts["diffusion"]
+                return total_drift, diffusion
+            def discretize(self, x, t, y, m, stepsize):
+                """Create discretized iteration rules for the reverse diffusion sampler."""
+                f, G = discretize_fn(x, t, y, stepsize)
+                if torch.is_complex(G):
+                    G = G.imag
+                rev_f = f - G[:, None, None, None] ** 2 * score_model(x, t, y, m) * (0.5 if self.probability_flow else 1.)
+                rev_G = torch.zeros_like(G) if self.probability_flow else G
+                return rev_f, rev_G
+        return RSDE()
+    @abc.abstractmethod
+    def copy(self):
+        pass
+@SDERegistry.register("bbed")
+class BBED(SDE):
+    @staticmethod
+    def add_argparse_args(parser):
+        parser.add_argument("--sde-n", type=int, default=30, help="The number of timesteps in the SDE discretization. 30 by default")
+        parser.add_argument("--T_sampling", type=float, default=0.999, help="The T so that t < T during sampling in the train step.")
+        parser.add_argument("--k", type=float, default = 2.6, help="base factor for diffusion term")
+        parser.add_argument("--theta", type=float, default = 0.52, help="root scale factor for diffusion term.")
+        return parser
+    def __init__(self, T_sampling, k, theta, N=1000, **kwargs):
+        """Construct an Brownian Bridge with Exploding Diffusion Coefficient SDE with parameterization as in the paper.
+        dx = (y-x)/(Tc-t) dt + sqrt(theta)*k^t dw
+        """
+        super().__init__(N)
+        self.k = k
+        self.logk = np.log(self.k)
+        self.theta = theta
+        self.N = N
+        self.Eilog = sc.expi(-2*self.logk)
+        self.T = T_sampling #for sampling in train step and inference
+        self.Tc = 1 #for constructing the SDE, dont change this
+    def copy(self):
+        return BBED(self.T, self.k, self.theta, N=self.N)
+    def T(self):
+        return self.T
+    def Tc(self):
+        return self.Tc
+    def sde(self, x, t, y):
+        drift = (y - x)/(self.Tc - t)
+        sigma = (self.k) ** t
+        diffusion = sigma * np.sqrt(self.theta)
+        return drift, diffusion
+    def _mean(self, x0, t, y):
+        time = (t/self.Tc)[:, None, None, None]
+        mean = x0*(1-time) + y*time
+        return mean
+    def _std(self, t):
+        t_np = t.cpu().detach().numpy()
+        Eis = sc.expi(2*(t_np-1)*self.logk) - self.Eilog
+        h = 2*self.k**2*self.logk
+        var = (self.k**(2*t_np)-1+t_np) + h*(1-t_np)*Eis
+        var = torch.tensor(var).to(device=t.device)*(1-t)*self.theta
+        return torch.sqrt(var)
+    def marginal_prob(self, x0, t, y):
+        return self._mean(x0, t, y), self._std(t)
+    def prior_sampling(self, shape, y):
+        if shape != y.shape:
+            warnings.warn(f"Target shape {shape} does not match shape of y {y.shape}! Ignoring target shape.")
+        std = self._std(self.T*torch.ones((y.shape[0],), device=y.device))
+        z = torch.randn_like(y)
+        x_T = y + z * std[:, None, None, None]
+        return x_T, z
+    def prior_logp(self, z):
+        raise NotImplementedError("prior_logp for BBED not yet implemented!")

geco/util/inference.py ADDED Viewed

	@@ -0,0 +1,211 @@

+import torch
+import torchaudio
+import torch.nn.functional as F
+from pesq import pesq
+from pystoi import stoi
+from .other import si_sdr, pad_spec
+# Settings
+# Module-level defaults read by `evaluate_model` below.
+sr = 8000  # sampling rate that loaded audio is resampled to and that is passed to the metrics
+snr = 0.5  # `snr` argument handed to the PC sampler
+N = 30  # number of reverse sampling steps handed to the PC sampler
+corrector_steps = 1  # corrector steps handed to the PC sampler
+def evaluate_model(model, num_eval_files):
+    clean_files = model.data_module.valid_set.clean_files
+    noisy_files = model.data_module.valid_set.noisy_files
+    mixture_files = model.data_module.valid_set.mixture_files
+    # Select test files uniformly accros validation files
+    total_num_files = len(clean_files)
+    indices = torch.linspace(0, total_num_files-1, num_eval_files, dtype=torch.int)
+    clean_files = list(clean_files[i] for i in indices)
+    noisy_files = list(noisy_files[i] for i in indices)
+    mixture_files = list(mixture_files[i] for i in indices)
+    _pesq = 0
+    _si_sdr = 0
+    _estoi = 0
+    # iterate over files
+    for (clean_file, noisy_file, mixture_file) in zip(clean_files, noisy_files, mixture_files):
+        # Load wavs
+        x, sr_ = torchaudio.load(clean_file)
+        if sr_ != sr:
+            x = torchaudio.transforms.Resample(sr_, sr)(x)
+        y, sr_ = torchaudio.load(noisy_file)
+        if sr_ != sr:
+            y = torchaudio.transforms.Resample(sr_, sr)(y)
+        m, sr_ = torchaudio.load(mixture_file)
+        if sr_ != sr:
+            m = torchaudio.transforms.Resample(sr_, sr)(m)
+        min_leng = min(x.shape[-1],y.shape[-1],m.shape[-1])
+        x = x[...,:min_leng]
+        y = y[...,:min_leng]
+        m = m[...,:min_leng]
+        T_orig = x.size(1)
+        # Normalize per utterance
+        norm_factor = y.abs().max()
+        y = y / norm_factor
+        m = m / norm_factor
+        # Prepare DNN input
+        Y = torch.unsqueeze(model._forward_transform(model._stft(y.cuda())), 0)
+        Y = pad_spec(Y)
+        M = torch.unsqueeze(model._forward_transform(model._stft(m.cuda())), 0)
+        M = pad_spec(M)
+        y = y * norm_factor
+        # print(x.shape,y.shape,m.shape,Y.shape,M.shape)
+        # Reverse sampling
+        sampler = model.get_pc_sampler(
+            'reverse_diffusion', 'ald', Y.cuda(), M.cuda(), N=N,
+            corrector_steps=corrector_steps, snr=snr)
+        sample, _ = sampler()
+        sample = sample.squeeze()
+        x_hat = model.to_audio(sample.squeeze(), T_orig)
+        x_hat = x_hat * norm_factor
+        x_hat = x_hat.squeeze().cpu().numpy()
+        x = x.squeeze().cpu().numpy()
+        y = y.squeeze().cpu().numpy()
+        _si_sdr += si_sdr(x, x_hat)
+        _pesq += pesq(sr, x, x_hat, 'nb')
+        _estoi += stoi(x, x_hat, sr, extended=True)
+    return _pesq/num_eval_files, _si_sdr/num_eval_files, _estoi/num_eval_files
+def evaluate_model2(model, num_eval_files, inference_N, inference_start=0.5):
+    N = inference_N
+    reverse_start_time = inference_start
+    clean_files = model.data_module.valid_set.clean_files
+    noisy_files = model.data_module.valid_set.noisy_files
+    mixture_files = model.data_module.valid_set.mixture_files
+    # Select test files uniformly accros validation files
+    total_num_files = len(clean_files)
+    indices = torch.linspace(0, total_num_files-1, num_eval_files, dtype=torch.int)
+    clean_files = list(clean_files[i] for i in indices)
+    noisy_files = list(noisy_files[i] for i in indices)
+    mixture_files = list(mixture_files[i] for i in indices)
+    _pesq = 0
+    _si_sdr = 0
+    _estoi = 0
+    # iterate over files
+    for (clean_file, noisy_file, mixture_file) in zip(clean_files, noisy_files, mixture_files):
+        # Load wavs
+        x, sr_ = torchaudio.load(clean_file)
+        if sr_ != sr:
+            x = torchaudio.transforms.Resample(sr_, sr)(x)
+        y, sr_ = torchaudio.load(noisy_file)
+        if sr_ != sr:
+            y = torchaudio.transforms.Resample(sr_, sr)(y)
+        m, sr_ = torchaudio.load(mixture_file)
+        if sr_ != sr:
+            m = torchaudio.transforms.Resample(sr_, sr)(m)
+        #requires only for BWE as the dataset has different length of clean and noisy files
+        min_leng = min(x.shape[-1],y.shape[-1],m.shape[-1])
+        x = x[...,:min_leng]
+        y = y[...,:min_leng]
+        m = m[...,:min_leng]
+        T_orig = x.size(1)
+        # Normalize per utterance
+        norm_factor = y.abs().max()
+        y = y / norm_factor
+        x = x / norm_factor
+        m = m / norm_factor
+        # Prepare DNN input
+        Y = torch.unsqueeze(model._forward_transform(model._stft(y.cuda())), 0)
+        Y = pad_spec(Y)
+        X = torch.unsqueeze(model._forward_transform(model._stft(x.cuda())), 0)
+        X = pad_spec(X)
+        M = torch.unsqueeze(model._forward_transform(model._stft(m.cuda())), 0)
+        M = pad_spec(M)
+        y = y * norm_factor
+        x = x * norm_factor
+        x = x.squeeze().cpu().numpy()
+        y = y.squeeze().cpu().numpy()
+        total_loss = 0
+        timesteps = torch.linspace(reverse_start_time, 0.03, N, device=Y.device)
+        #prior sampling starting from reverse_start_time
+        std = model.sde._std(reverse_start_time*torch.ones((Y.shape[0],), device=Y.device))
+        z = torch.randn_like(Y)
+        X_t = Y + z * std[:, None, None, None]
+        #reverse steps by Euler Maruyama
+        for i in range(len(timesteps)):
+            t = timesteps[i]
+            if i != len(timesteps) - 1:
+                dt = t - timesteps[i+1]
+            else:
+                dt = timesteps[-1]
+            with torch.no_grad():
+                #take Euler step here
+                f, g = model.sde.sde(X_t, t, Y)
+                vec_t = torch.ones(Y.shape[0], device=Y.device) * t
+                score = model.forward(X_t, vec_t, Y, M, vec_t[:,None,None,None])
+                mean_x_tm1 = X_t - (f - g**2*score)*dt #mean of x t minus 1 = mu(x_{t-1})
+                if i == len(timesteps) - 1: #output
+                    X_t = mean_x_tm1
+                    break
+                z = torch.randn_like(X)
+                X_t = mean_x_tm1 + z*g*torch.sqrt(dt)
+        sample = X_t
+        sample = sample.squeeze()
+        x_hat = model.to_audio(sample.squeeze(), T_orig)
+        x_hat = x_hat * norm_factor
+        x_hat = x_hat.squeeze().cpu().numpy()
+        _si_sdr += si_sdr(x, x_hat)
+        _pesq += pesq(sr, x, x_hat, 'nb')
+        _estoi += stoi(x, x_hat, sr, extended=True)
+    return _pesq/num_eval_files, _si_sdr/num_eval_files, _estoi/num_eval_files, total_loss/num_eval_files
+def convert_to_audio(X, deemp, T_orig, model, norm_factor):
+    sample = X
+    sample = sample.squeeze()
+    if len(sample.shape)==4:
+        sample = sample*deemp[None, None, :, None].to(device=sample.device)
+    elif len(sample.shape)==3:
+        sample = sample*deemp[None, :, None].to(device=sample.device)
+    else:
+        sample = sample*deemp[:, None].to(device=sample.device)
+    x_hat = model.to_audio(sample.squeeze(), T_orig)
+    x_hat = x_hat * norm_factor
+    x_hat = x_hat.squeeze().cpu().numpy()
+    return x_hat

geco/util/other.py ADDED Viewed

	@@ -0,0 +1,125 @@

+import os
+import numpy as np
+import scipy.stats
+from scipy.signal import butter, sosfilt
+import torch
+from pesq import pesq
+from pystoi import stoi
+def si_sdr_components(s_hat, s, n):
+    """
+    """
+    # s_target
+    alpha_s = np.dot(s_hat, s) / np.linalg.norm(s)**2
+    s_target = alpha_s * s
+    # e_noise
+    alpha_n = np.dot(s_hat, n) / np.linalg.norm(n)**2
+    e_noise = alpha_n * n
+    # e_art
+    e_art = s_hat - s_target - e_noise
+    return s_target, e_noise, e_art
+def energy_ratios(s_hat, s, n):
+    """
+    """
+    s_target, e_noise, e_art = si_sdr_components(s_hat, s, n)
+    si_sdr = 10*np.log10(np.linalg.norm(s_target)**2 / np.linalg.norm(e_noise + e_art)**2)
+    si_sir = 10*np.log10(np.linalg.norm(s_target)**2 / np.linalg.norm(e_noise)**2)
+    si_sar = 10*np.log10(np.linalg.norm(s_target)**2 / np.linalg.norm(e_art)**2)
+    return si_sdr, si_sir, si_sar
+def mean_conf_int(data, confidence=0.95):
+    a = 1.0 * np.array(data)
+    n = len(a)
+    m, se = np.mean(a), scipy.stats.sem(a)
+    h = se * scipy.stats.t.ppf((1 + confidence) / 2., n-1)
+    return m, h
+class Method():
+    def __init__(self, name, base_dir, metrics):
+        self.name = name
+        self.base_dir = base_dir
+        self.metrics = {}
+        for i in range(len(metrics)):
+            metric = metrics[i]
+            value = []
+            self.metrics[metric] = value
+    def append(self, matric, value):
+        self.metrics[matric].append(value)
+    def get_mean_ci(self, metric):
+        return mean_conf_int(np.array(self.metrics[metric]))
+def hp_filter(signal, cut_off=80, order=10, sr=16000):
+    factor = cut_off /sr * 2
+    sos = butter(order, factor, 'hp', output='sos')
+    filtered = sosfilt(sos, signal)
+    return filtered
+def si_sdr(s, s_hat):
+    alpha = np.dot(s_hat, s)/np.linalg.norm(s)**2
+    sdr = 10*np.log10(np.linalg.norm(alpha*s)**2/np.linalg.norm(
+        alpha*s - s_hat)**2)
+    return sdr
+def snr_dB(s,n):
+    s_power = 1/len(s)*np.sum(s**2)
+    n_power = 1/len(n)*np.sum(n**2)
+    snr_dB = 10*np.log10(s_power/n_power)
+    return snr_dB
+def pad_spec(Y):
+    T = Y.size(3)
+    if T%64 !=0:
+        num_pad = 64-T%64
+    else:
+        num_pad = 0
+    pad2d = torch.nn.ZeroPad2d((0, num_pad, 0,0))
+    return pad2d(Y)
+def ensure_dir(file_path):
+    directory = file_path
+    if not os.path.exists(directory):
+        os.makedirs(directory)
+def print_metrics(x, y, x_hat_list, labels, sr=16000):
+    _si_sdr_mix = si_sdr(x, y)
+    _pesq_mix = pesq(sr, x, y, 'wb')
+    _estoi_mix = stoi(x, y, sr, extended=True)
+    print(f'Mixture:  PESQ: {_pesq_mix:.2f}, ESTOI: {_estoi_mix:.2f}, SI-SDR: {_si_sdr_mix:.2f}')
+    for i, x_hat in enumerate(x_hat_list):
+        _si_sdr = si_sdr(x, x_hat)
+        _pesq = pesq(sr, x, x_hat, 'wb')
+        _estoi = stoi(x, x_hat, sr, extended=True)
+        print(f'{labels[i]}: {_pesq:.2f}, ESTOI: {_estoi:.2f}, SI-SDR: {_si_sdr:.2f}')
+def mean_std(data):
+    data = data[~np.isnan(data)]
+    mean = np.mean(data)
+    std = np.std(data)
+    return mean, std
+def print_mean_std(data, decimal=2):
+    data = np.array(data)
+    data = data[~np.isnan(data)]
+    mean = np.mean(data)
+    std = np.std(data)
+    if decimal == 2:
+        string = f'{mean:.2f} ± {std:.2f}'
+    elif decimal == 1:
+        string = f'{mean:.1f} ± {std:.1f}'
+    return string

geco/util/registry.py ADDED Viewed

	@@ -0,0 +1,34 @@

+import warnings
+from typing import Callable
+class Registry:
+    def __init__(self, managed_thing: str):
+        """
+        Create a new registry.
+        Args:
+            managed_thing: A string describing what type of thing is managed by this registry. Will be used for
+                warnings and errors, so it's a good idea to keep this string globally unique and easily understood.
+        """
+        self.managed_thing = managed_thing
+        self._registry = {}
+    def register(self, name: str) -> Callable:
+        def inner_wrapper(wrapped_class) -> Callable:
+            if name in self._registry:
+                warnings.warn(f"{self.managed_thing} with name '{name}' doubly registered, old class will be replaced.")
+            self._registry[name] = wrapped_class
+            return wrapped_class
+        return inner_wrapper
+    def get_by_name(self, name: str):
+        """Get a managed thing by name."""
+        if name in self._registry:
+            return self._registry[name]
+        else:
+            raise ValueError(f"{self.managed_thing} with name '{name}' unknown.")
+    def get_all_names(self):
+        """Get the list of things' names registered to this registry."""
+        return list(self._registry.keys())

geco/util/tensors.py ADDED Viewed

	@@ -0,0 +1,16 @@

+def batch_broadcast(a, x):
+    """Broadcasts a over all dimensions of x, except the batch dimension, which must match."""
+    if len(a.shape) != 1:
+        a = a.squeeze()
+        if len(a.shape) != 1:
+            raise ValueError(
+                f"Don't know how to batch-broadcast tensor `a` with more than one effective dimension (shape {a.shape})"
+            )
+    if a.shape[0] != x.shape[0] and a.shape[0] != 1:
+        raise ValueError(
+            f"Don't know how to batch-broadcast shape {a.shape} over {x.shape} as the batch dimension is not matching")
+    out = a.view((x.shape[0], *(1 for _ in range(len(x.shape)-1))))
+    return out

requirements.txt ADDED Viewed

	@@ -0,0 +1,20 @@

+pytorch-lightning==1.5.10
+torch==2.4.0
+torch-ema==0.3
+torch-optimizer==0.3.0
+torch-stoi==0.1.2
+torchaudio==2.4.0
+torchinfo==1.6.3
+torchmetrics==0.9.3
+torchsde==0.2.5
+torchvision==0.19.0
+tornado==6.2
+tqdm==4.63.0
+ninja
+matplotlib
+pesq
+wandb
+PySoundFile
+pandas
+git+https://github.com/WangHelin1997/Fast-GeCo.git#subdirectory=score_models
+speechbrain==1.0.0