| { |
| "model_type": "semantic_vocoder", |
| "auto_map": { |
| "AutoConfig": "model.SemanticVocoderConfig", |
| "AutoModel": "model.SemanticVocoder" |
| }, |
| "model": { |
| "autoencoder": { |
| "_target_": "models.autoencoder.waveform.semanticVocoder.semanticVocoder.SemanticVocoder", |
| "encoder_name": "dasheng_base", |
| "n_timesteps": 200, |
| "sample_rate": 24000, |
| "clamp_pred": true, |
| "downsampling_ratio": 960, |
| "encoder_sampling_rate": 16000, |
| "vocoder": { |
| "_target_": "models.autoencoder.waveform.semanticVocoder.flow2gan.models.generator.MaeAudioGenerator", |
| "latent_dim": 768, |
| "hop_length": 960, |
| "n_ffts": [ |
| 512, |
| 256, |
| 128 |
| ], |
| "hop_lengths": [ |
| 320, |
| 160, |
| 80 |
| ], |
| "channels": [ |
| 768, |
| 512, |
| 384 |
| ], |
| "time_embed_channels": 512, |
| "hidden_factor": 3, |
| "conv_kernel_sizes": [ |
| 7, |
| 7, |
| 7 |
| ], |
| "num_layers": [ |
| 8, |
| 8, |
| 8 |
| ], |
| "use_cond_encoder": true, |
| "cond_enc_channels": 512, |
| "cond_enc_hidden_factor": 3, |
| "cond_enc_conv_kernel_size": 7, |
| "cond_enc_num_layers": 4, |
| "residual_scale": 1.0, |
| "init_noise_scale": 0.1, |
| "pred_x1": true, |
| "branch_reduction": "mean", |
| "spec_scaling_loss": true, |
| "loss_n_filters": 256, |
| "loss_n_fft": 1024, |
| "loss_hop_length": 256, |
| "loss_power": 0.5, |
| "loss_eps": 1e-07, |
| "loss_scale_min": 0.01, |
| "loss_scale_max": 100.0, |
| "branch_dropout": 0.05, |
| "max_add_noise_scale": 0.0 |
| } |
| }, |
| "backbone": { |
| "_target_": "models.dit.mask_dit.UDiT", |
| "img_size": 250, |
| "patch_size": 1, |
| "in_chans": 768, |
| "out_chans": 768, |
| "input_type": "1d", |
| "embed_dim": 1024, |
| "depth": 24, |
| "num_heads": 16, |
| "mlp_ratio": 4.0, |
| "qkv_bias": false, |
| "qk_scale": null, |
| "qk_norm": "layernorm", |
| "norm_layer": "layernorm", |
| "act_layer": "geglu", |
| "context_norm": true, |
| "use_checkpoint": true, |
| "time_fusion": "ada_sola_bias", |
| "ada_sola_rank": 32, |
| "ada_sola_alpha": 32, |
| "cls_dim": null, |
| "context_dim": 1024, |
| "context_fusion": "cross", |
| "context_max_length": null, |
| "context_pe_method": "none", |
| "pe_method": "none", |
| "rope_mode": "shared", |
| "use_conv": true, |
| "skip": true, |
| "skip_norm": true |
| }, |
| "cfg_drop_ratio": 0.2, |
| "sample_strategy": "uniform", |
| "_target_": "models.flow_matching.SingleTaskCrossAttentionAudioFlowMatching", |
| "content_encoder": { |
| "_target_": "models.content_encoder.content_encoder.ContentEncoder", |
| "embed_dim": 1024, |
| "text_encoder": { |
| "_target_": "models.content_encoder.text_encoder.T5TextEncoder", |
| "model_name": "google/flan-t5-large", |
| "embed_dim": 1024 |
| } |
| } |
| } |
| } |