{
  "architectures": [
    "S3GenModel"
  ],
  "cfm_inference_cfg_rate": 0.7,
  "cfm_sigma_min": 1e-06,
  "cfm_solver": "euler",
  "cfm_t_scheduler": "cosine",
  "decoder_act_fn": "gelu",
  "decoder_attention_head_dim": 64,
  "decoder_channels": [
    256
  ],
  "decoder_in_channels": 320,
  "decoder_n_blocks": 4,
  "decoder_num_heads": 8,
  "decoder_num_mid_blocks": 12,
  "decoder_out_channels": 80,
  "dtype": "float32",
  "encoder_attention_heads": 8,
  "encoder_dropout_rate": 0.1,
  "encoder_linear_units": 2048,
  "encoder_num_blocks": 6,
  "encoder_output_size": 512,
  "fmax": 8000,
  "fmin": 0,
  "hop_length": 480,
  "input_frame_rate": 25,
  "mel_bins": 80,
  "model_type": "s3gen",
  "n_fft": 1920,
  "pre_lookahead_len": 3,
  "sampling_rate": 24000,
  "speaker_embed_dim": 192,
  "speaker_feat_dim": 80,
  "token_embed_dim": 512,
  "token_mel_ratio": 2,
  "transformers_version": "5.0.0.dev0",
  "vocab_size": 6561,
  "win_size": 1920
}