GLAP / configuration_glap.py
Heinrich Dinkel
Updated GLAP, removed all dependencies to sonar.
a09cac7
"""GLAP (Generalized Language Audio Pretraining) configuration."""
from transformers import PretrainedConfig
class GlapConfig(PretrainedConfig):
    """Hyperparameter container for a GLAP model.

    Each constructor argument is stored as an instance attribute of the same
    name. The only values that are transformed are ``patch_size`` and
    ``patch_stride``: a falsy value (``None`` or an empty sequence) is
    replaced by ``[64, 4]``. Any additional keyword arguments are passed
    through to ``PretrainedConfig.__init__``.

    The arguments are organised in three groups, following the section
    comments in the signature: the audio encoder (Dasheng), the text encoder
    (SONAR) and the projection. How the values are consumed is decided by
    the model code, which is not part of this file.
    """

    model_type = "glap"

    def __init__(
        self,
        # Audio encoder (Dasheng)
        audio_embed_dim: int = 768,
        audio_depth: int = 12,
        audio_num_heads: int = 12,
        patch_size: list = None,
        patch_stride: list = None,
        target_length: int = 1008,
        sample_rate: int = 16000,
        # Text encoder (SONAR)
        text_vocab_size: int = 256206,
        text_model_dim: int = 1024,
        text_num_layers: int = 24,
        text_num_heads: int = 16,
        text_ffn_inner_dim: int = 8192,
        text_max_seq_len: int = 514,
        text_pad_idx: int = 0,
        text_dropout_p: float = 0.1,
        # Projection
        embed_size: int = 1024,
        **kwargs,
    ):
        """Store the GLAP hyperparameters on the configuration object.

        NOTE(review): the descriptions below are inferred from the parameter
        names only; confirm them against the model implementation.

        Args:
            audio_embed_dim: Presumably the audio encoder's embedding width.
            audio_depth: Presumably the number of audio encoder layers.
            audio_num_heads: Presumably the audio encoder's attention heads.
            patch_size: Two-element list; ``[64, 4]`` is used when falsy.
            patch_stride: Two-element list; ``[64, 4]`` is used when falsy.
            target_length: Presumably the audio input length in frames.
            sample_rate: Presumably the audio sample rate in Hz.
            text_vocab_size: Presumably the text vocabulary size.
            text_model_dim: Presumably the text encoder's model width.
            text_num_layers: Presumably the number of text encoder layers.
            text_num_heads: Presumably the text encoder's attention heads.
            text_ffn_inner_dim: Presumably the text feed-forward inner width.
            text_max_seq_len: Presumably the maximum text sequence length.
            text_pad_idx: Presumably the padding token index.
            text_dropout_p: Presumably the text encoder's dropout probability.
            embed_size: Presumably the size of the shared projection space.
            **kwargs: Forwarded unchanged to ``PretrainedConfig.__init__``.
        """
        # The base class consumes the generic options before the
        # GLAP-specific attributes are attached.
        super().__init__(**kwargs)

        # --- Audio encoder (Dasheng) ---
        self.audio_embed_dim = audio_embed_dim
        self.audio_depth = audio_depth
        self.audio_num_heads = audio_num_heads
        # Each fallback builds its own list so the two attributes never
        # share one default object.
        self.patch_size = patch_size if patch_size else [64, 4]
        self.patch_stride = patch_stride if patch_stride else [64, 4]
        self.target_length = target_length
        self.sample_rate = sample_rate

        # --- Text encoder (SONAR) ---
        self.text_vocab_size = text_vocab_size
        self.text_model_dim = text_model_dim
        self.text_num_layers = text_num_layers
        self.text_num_heads = text_num_heads
        self.text_ffn_inner_dim = text_ffn_inner_dim
        self.text_max_seq_len = text_max_seq_len
        self.text_pad_idx = text_pad_idx
        self.text_dropout_p = text_dropout_p

        # --- Projection ---
        self.embed_size = embed_size