framework: pretrain | |
data_dir: ../../dataset | |
train_data: all | |
caption_type: meta_tag_caption_sim | |
workers: 4 | |
total_steps: 32768 | |
start_steps: 0 | |
batch_size: 768 | |
world_size: 1 | |
lr: 5.0e-05 | |
min_lr: 1.0e-09 | |
seed: null | |
print_freq: 10 | |
cos: true | |
n_fft: 1024 | |
hop_size: 0.01 | |
sr: 22050 | |
duration: 10 | |
max_length: 128 | |
audio_loader: ffmpeg | |
audio_arch: resnet | |
text_arch: roberta-base | |
n_heads: 8 | |
width: 64 | |
n_mels: 128 | |
audio_dim: 768 | |
text_dim: 768 | |
mlp_dim: 128 | |
temperature: 0.1 | |
tid: base | |
gpu: 0 | |
epochs: 19 | |
start_epoch: 0 | |
warmup_steps: 5000 | |