{
  "batch_size": 64,
  "mask_probability": 0.15,
  "num_layers": 6,
  "num_heads": 8,
  "d_model": 128,
  "d_ff": 256,
  "p_dropout": 0.1,
  "max_seq_len": 128,
  "vocab_size": 25000,
  "learning_rate": 3e-4,
  "grad_clip_value": 1
}
{
  "batch_size": 64,
  "mask_probability": 0.15,
  "num_layers": 6,
  "num_heads": 8,
  "d_model": 128,
  "d_ff": 256,
  "p_dropout": 0.1,
  "max_seq_len": 128,
  "vocab_size": 25000,
  "learning_rate": 3e-4,
  "grad_clip_value": 1
}