{
n_layers = 6,
d_model = 128,
d_head = 64,
n_heads = 8,
d_mlp = 512,
d_vocab = 61,
n_ctx = 59,
act_fn="gelu",
normalization_type="LNPre",
}
{
n_layers = 6,
d_model = 128,
d_head = 64,
n_heads = 8,
d_mlp = 512,
d_vocab = 61,
n_ctx = 59,
act_fn="gelu",
normalization_type="LNPre",
}