File size: 7,087 Bytes
ccee439 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 |
from __gin__ import dynamic_registration
import cached_conv as cc
from cached_conv import convs
import rave
from rave import blocks
from rave import core
from rave import dataset
from rave import descript_discriminator
from rave import discriminator
from rave import model
from rave import pqmf
import torch
import torch.nn as nn
# Macros:
# ==============================================================================
ACTIVATION = @blocks.Snake
CAPACITY = 64
DILATIONS = [[1, 3, 9], [1, 3, 9], [1, 3, 9], [1, 3]]
KERNEL_SIZE = 3
LATENT_SIZE = 128
N_BAND = 8
NOISE_AUGMENTATION = 0
PHASE_1_DURATION = 0
RATIOS = [4, 4, 4, 1]
SAMPLING_RATE = 48000
# Parameters for blocks.AdaptiveInstanceNormalization:
# ==============================================================================
# None.
# Parameters for variational/blocks.AdaptiveInstanceNormalization:
# ==============================================================================
# None.
# Parameters for core.AudioDistanceV1:
# ==============================================================================
core.AudioDistanceV1.log_epsilon = 1e-07
core.AudioDistanceV1.multiscale_stft = @core.MultiScaleSTFT
# Parameters for model.BetaWarmupCallback:
# ==============================================================================
model.BetaWarmupCallback.initial_value = 1e-06
model.BetaWarmupCallback.target_value = 0.005
model.BetaWarmupCallback.warmup_len = 20000
# Parameters for pqmf.CachedPQMF:
# ==============================================================================
pqmf.CachedPQMF.attenuation = 100
pqmf.CachedPQMF.n_band = %N_BAND
# Parameters for cc.Conv1d:
# ==============================================================================
cc.Conv1d.bias = False
# Parameters for variational/cc.Conv1d:
# ==============================================================================
variational/cc.Conv1d.bias = False
# Parameters for cc.ConvTranspose1d:
# ==============================================================================
cc.ConvTranspose1d.bias = False
# Parameters for descript_discriminator.DescriptDiscriminator:
# ==============================================================================
descript_discriminator.DescriptDiscriminator.bands = \
[(0.0, 0.1), (0.1, 0.25), (0.25, 0.5), (0.5, 0.75), (0.75, 1.0)]
descript_discriminator.DescriptDiscriminator.fft_sizes = [2048, 1024, 512]
descript_discriminator.DescriptDiscriminator.periods = [2, 3, 5, 7, 11]
descript_discriminator.DescriptDiscriminator.rates = []
descript_discriminator.DescriptDiscriminator.sample_rate = 44100
# Parameters for variational/blocks.EncoderV2:
# ==============================================================================
variational/blocks.EncoderV2.activation = %ACTIVATION
variational/blocks.EncoderV2.adain = @blocks.AdaptiveInstanceNormalization
variational/blocks.EncoderV2.capacity = %CAPACITY
variational/blocks.EncoderV2.data_size = %N_BAND
variational/blocks.EncoderV2.dilations = %DILATIONS
variational/blocks.EncoderV2.keep_dim = False
variational/blocks.EncoderV2.kernel_size = %KERNEL_SIZE
variational/blocks.EncoderV2.latent_size = %LATENT_SIZE
variational/blocks.EncoderV2.n_out = 2
variational/blocks.EncoderV2.ratios = %RATIOS
variational/blocks.EncoderV2.recurrent_layer = None
variational/blocks.EncoderV2.spectrogram = None
# Parameters for blocks.GeneratorV2:
# ==============================================================================
blocks.GeneratorV2.activation = %ACTIVATION
blocks.GeneratorV2.adain = @blocks.AdaptiveInstanceNormalization
blocks.GeneratorV2.amplitude_modulation = True
blocks.GeneratorV2.capacity = %CAPACITY
blocks.GeneratorV2.causal_convtranspose = True
blocks.GeneratorV2.data_size = %N_BAND
blocks.GeneratorV2.dilations = %DILATIONS
blocks.GeneratorV2.keep_dim = False
blocks.GeneratorV2.kernel_size = %KERNEL_SIZE
blocks.GeneratorV2.latent_size = @core.get_augmented_latent_size()
blocks.GeneratorV2.noise_module = None
blocks.GeneratorV2.ratios = %RATIOS
blocks.GeneratorV2.recurrent_layer = None
# Parameters for core.get_augmented_latent_size:
# ==============================================================================
core.get_augmented_latent_size.latent_size = %LATENT_SIZE
core.get_augmented_latent_size.noise_augmentation = %NOISE_AUGMENTATION
# Parameters for convs.get_padding:
# ==============================================================================
convs.get_padding.dilation = 1
convs.get_padding.mode = 'causal'
convs.get_padding.stride = 1
# Parameters for variational/convs.get_padding:
# ==============================================================================
variational/convs.get_padding.dilation = 1
variational/convs.get_padding.mode = 'causal'
variational/convs.get_padding.stride = 1
# Parameters for core.MultiScaleSTFT:
# ==============================================================================
core.MultiScaleSTFT.magnitude = True
core.MultiScaleSTFT.normalized = False
core.MultiScaleSTFT.num_mels = None
core.MultiScaleSTFT.random_crop = True
core.MultiScaleSTFT.sample_rate = %SAMPLING_RATE
core.MultiScaleSTFT.scales = [2048, 1024, 512, 256, 128]
# Parameters for blocks.normalization:
# ==============================================================================
blocks.normalization.mode = 'weight_norm'
# Parameters for variational/blocks.normalization:
# ==============================================================================
variational/blocks.normalization.mode = 'weight_norm'
# Parameters for model.RAVE:
# ==============================================================================
model.RAVE.audio_distance = @core.AudioDistanceV1
model.RAVE.decoder = @blocks.GeneratorV2
model.RAVE.discriminator = @descript_discriminator.DescriptDiscriminator
model.RAVE.enable_pqmf_decode = True
model.RAVE.enable_pqmf_encode = True
model.RAVE.encoder = @blocks.VariationalEncoder
model.RAVE.feature_matching_fun = @feature_matching/core.mean_difference
model.RAVE.freeze_encoder = False
model.RAVE.gan_loss = @core.hinge_gan
model.RAVE.latent_size = %LATENT_SIZE
model.RAVE.multiband_audio_distance = @core.AudioDistanceV1
model.RAVE.num_skipped_features = 1
model.RAVE.phase_1_duration = %PHASE_1_DURATION
model.RAVE.pqmf = @pqmf.CachedPQMF
model.RAVE.sampling_rate = %SAMPLING_RATE
model.RAVE.update_discriminator_every = 4
model.RAVE.valid_signal_crop = True
model.RAVE.warmup_quantize = None
model.RAVE.weights = {'feature_matching': 20}
# Parameters for blocks.Snake:
# ==============================================================================
# None.
# Parameters for variational/blocks.Snake:
# ==============================================================================
# None.
# Parameters for dataset.split_dataset:
# ==============================================================================
dataset.split_dataset.max_residual = 1000
# Parameters for blocks.VariationalEncoder:
# ==============================================================================
blocks.VariationalEncoder.encoder = @variational/blocks.EncoderV2
|