---
# Tokenizer vocabulary sizes for the combined audio+text model.
num_audio_tokens: 626
num_text_tokens: 21178

# Transformer (GPT-style) backbone configuration.
gpt_config:
  hidden_size: 768
  intermediate_size: 3072
  num_attention_heads: 12
  num_hidden_layers: 20
  use_cache: false
  max_position_embeddings: 4096
  # attn_implementation: flash_attention_2
  # Speaker-embedding settings.
  spk_emb_dim: 192
  spk_KL: false
  num_audio_tokens: 626
  # Filled in at runtime from the top-level value — TODO confirm against loader.
  num_text_tokens: null
  # Number of vector-quantization codebooks per audio frame.
  num_vq: 4