{ "batch_size": 64, "mask_probability": 0.15, "num_layers": 6, "num_heads": 8, "d_model": 128, "d_ff": 256, "p_dropout": 0.1, "max_seq_len": 128, "vocab_size": 25000, "learning_rate": 3e-4, "grad_clip_value": 1, }