{ n_layers = 6, d_model = 128, d_head = 64, n_heads = 8, d_mlp = 512, d_vocab = 61, n_ctx = 59, act_fn="gelu", normalization_type="LNPre", }