{
  "architectures": [
    "GlapModel"
  ],
  "auto_map": {
    "AutoConfig": "configuration_glap.GlapConfig",
    "AutoModel": "modeling_glap.GlapModel"
  },
  "model_type": "glap",
  "audio_embed_dim": 768,
  "audio_depth": 12,
  "audio_num_heads": 12,
  "patch_size": [
    64,
    4
  ],
  "patch_stride": [
    64,
    4
  ],
  "target_length": 1008,
  "sample_rate": 16000,
  "text_vocab_size": 256206,
  "text_model_dim": 1024,
  "text_num_layers": 24,
  "text_num_heads": 16,
  "text_ffn_inner_dim": 8192,
  "text_max_seq_len": 514,
  "text_pad_idx": 0,
  "text_dropout_p": 0.1,
  "embed_size": 1024
}