{
  "embed_dim": 2048,
  "audio_cfg": {
    "audio_length": 1024,
    "clip_samples": 480000,
    "mel_bins": 64,
    "sample_rate": 48000,
    "window_size": 1024,
    "hop_size": 480,
    "fmin": 50,
    "fmax": 14000,
    "class_num": 527,
    "model_type": "HTSAT",
    "model_name": "large"
  },
  "text_cfg": {
    "context_length": 77,
    "vocab_size": 49408,
    "width": 512,
    "heads": 8,
    "layers": 12
  }
}