---
# Model and training configuration: one flat mapping, grouped by the section
# headers below. Every setting lives on its own line; if the keys are placed
# on the same line as a `#` comment, YAML treats them as comment text and the
# document loads as empty.

# TEXT ENCODER CONFIG
text_model: 'bert-base-uncased'
text_len: 100
transformer_embed_dim: 768
freeze_text_encoder_weights: true

# AUDIO ENCODER CONFIG
audioenc_name: 'Cnn14'
out_emb: 2048
sampling_rate: 44100
duration: 5
fmin: 50
fmax: 14000
# NOTE(review): n_fft (1028) differs from window_size (1024) below; value kept
# as found -- confirm with the consuming code that this is intentional.
n_fft: 1028
hop_size: 320
mel_bins: 64
window_size: 1024

# PROJECTION SPACE CONFIG
d_proj: 1024
temperature: 0.003

# TRAINING AND EVALUATION CONFIG
num_classes: 527
batch_size: 1024
demo: false