{
  "codec": {
    "ssl_adaptor": {
      "in_dim": 1280,
      "embed_dim": 768,
      "out_dim": 768,
      "num_layers": 4,
      "num_heads": 12,
      "ffn_dim": 3072,
      "attn_dropout": 0,
      "dropout": 0
    },
    "acoustic_encoder": {
      "num_mels": 128,
      "sampling_rate": 16000,
      "hop_length": 160,
      "n_fft": 400,
      "fmin": 0,
      "fmax": 8000,
      "embed_dim": 768,
      "num_layers": 12,
      "num_heads": 12,
      "ffn_dim": 3072,
      "attn_dropout": 0,
      "dropout": 0,
      "max_positions": 1500
    },
    "downsample": {
      "embed_dim": 1536,
      "avg_pooler": 4
    },
    "rvq": {
      "input_dim": 1536,
      "rvq_dim": 768,
      "output_dim": 768,
      "num_quantizers": 16,
      "codebook_size": 2048,
      "codebook_dim": 512
    },
    "upsample": {
      "embed_dim": 768,
      "stride": 4
    },
    "semantic_decoder": {
      "in_dim": 768,
      "embed_dim": 768,
      "out_dim": 1280,
      "num_layers": 4,
      "num_heads": 12,
      "ffn_dim": 3072,
      "attn_dropout": 0,
      "dropout": 0
    },
    "acoustic_decoder": {
      "embed_dim": 768,
      "num_layers": 12,
      "num_heads": 12,
      "dropout": 0,
      "hop_length": 240,
      "causal": true
    }
  }
}