---
# Hyperparameter settings: a single flat mapping of scalar values.
# Keys are grouped by apparent concern; a mapping carries no ordering
# semantics, so the grouping is for readers only.
# NOTE(review): the groupings are inferred from key names alone. Confirm
# them against the code that loads this file.

# Network shape (names suggest a transformer-style model; verify).
n_layers: 4
n_heads: 4
# 384 / n_heads (4) = 96, should the consumer split d_model across heads.
d_model: 384
dim_feedforward: 1024

# Data-dependent size.
num_speakers: 3

# Optimiser settings.
learning_rate: 0.001
weight_decay: 0.001

# NOTE(review): this file does not show whether num_steps counts training
# steps or steps along a sequence/time axis. Confirm with the consumer.
num_steps: 384