{
  "batch_size": 64,
  "mask_probability": 0.15,
  "num_layers": 6,
  "num_heads": 8,
  "d_model": 512,
  "d_ff": 1024,
  "p_dropout": 0.1,
  "max_seq_len": 256,
  "vocab_size": 20000,
  "learning_rate": 1e-4,
  "grad_clip_value": 1
}