---
# Component configuration with three sections: feature_extractor, backbone,
# and head. Each section gives a `class_path` and the `init_args` for it.

feature_extractor:
  class_path: vocos.feature_extractors.EncodecFeatures
  init_args:
    encodec_model: encodec_24khz
    bandwidths: [1.5, 3.0, 6.0, 12.0]
    train_codebooks: false

backbone:
  class_path: vocos.models.VocosBackbone
  init_args:
    input_channels: 128
    dim: 384
    intermediate_dim: 1152
    num_layers: 8
    # Keep in sync with feature_extractor.init_args.bandwidths (4 entries).
    adanorm_num_embeddings: 4  # len(bandwidths)

head:
  class_path: vocos.heads.ISTFTHead
  init_args:
    # Same value as backbone.init_args.dim.
    dim: 384
    n_fft: 1280
    hop_length: 320
    padding: same