train:
  seed: 1234
  epochs: 300
  batch_size: 8
  gradient_accumulation: 4
  save_every_n_epoch: 1
  precision: 16
  gradient_clip: 1.0
optimizer:
  lr: 0.01
  lr_init: 0.00001
  lr_end: 0.0001
  warmup_steps: 2000
  decay_steps: 40000
data:
  max_eval_sample: 8
  max_sec: 54
  num_workers: 1
  pad_val: 1024  # same as EOS in model
model:
  vocab_size: 1025
  phoneme_vocab_size: 512
  embedding_dim: 512
  hidden_dim: 512
  head: 16
  linear_units: 2048
  n_layer: 12
  dropout: 0
  EOS: 1024
inference:
  top_k: 5
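
For orientation, below is a minimal sketch of how the optimizer section might be consumed. The path config.yaml is hypothetical, and the schedule shape (linear warmup from lr_init to lr over warmup_steps, then cosine decay toward lr_end by decay_steps) is an assumption for illustration only; the config names the endpoints and step counts but does not specify the decay curve. Note also that the effective batch size is batch_size x gradient_accumulation = 8 x 4 = 32.

```python
# Sketch only: one plausible reading of the optimizer section above.
# Assumed (not stated in the config): linear warmup lr_init -> lr,
# then cosine decay lr -> lr_end, clamped after decay_steps.
import math

import yaml  # PyYAML


def lr_at_step(step: int, opt: dict) -> float:
    """Learning rate at a global step under the assumed warmup+cosine schedule."""
    lr, lr_init, lr_end = opt["lr"], opt["lr_init"], opt["lr_end"]
    warmup, decay = opt["warmup_steps"], opt["decay_steps"]
    if step < warmup:
        # Linear warmup: lr_init at step 0, lr at step == warmup_steps.
        return lr_init + (lr - lr_init) * step / warmup
    # Cosine decay from lr down to lr_end; hold lr_end past decay_steps.
    progress = min((step - warmup) / max(decay - warmup, 1), 1.0)
    return lr_end + 0.5 * (lr - lr_end) * (1 + math.cos(math.pi * progress))


if __name__ == "__main__":
    with open("config.yaml") as f:  # hypothetical path to this file
        cfg = yaml.safe_load(f)
    for step in (0, 1000, 2000, 20000, 40000):
        print(step, f"{lr_at_step(step, cfg['optimizer']):.6f}")
```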