{ "backbone_class": "decoder.models.VocosBackbone", "head_class": "decoder.heads.ISTFTHead", "backbone_config": { "input_channels": 512, "dim": 768, "intermediate_dim": 2304, "num_layers": 12, "adanorm_num_embeddings": 4 }, "head_config": { "dim": 768, "n_fft": 1280, "hop_length": 320, "padding": "same" } }