# Model architecture specification.
#
# Every component is described by a `type` (component name) plus an `args`
# mapping holding that component's arguments; nested `*_config` entries follow
# the same `type`/`args` shape, and `null` marks a sub-component that is not
# configured.
#
# NOTE(review): this file was previously collapsed onto a single line, which
# is not parseable YAML. The nesting below was reconstructed from the
# `type`/`args` pairing and from the rule that a key cannot repeat within one
# mapping. Confirm the placement of trailing keys against the consuming
# config schema.
arch:
  type: TransformerLMHeadModel
  args:
    transformer_config:
      type: TransformerDecoderOnlyModel
      args:
        embed_config:
          type: TransformerEmbeddingBlock
          args:
            token_embed_config:
              type: TokenEmbedding
              args:
                n_embed: 768
                n_vocab: 64000
            pos_embed_config:
              type: PositionEmbedding
              args:
                n_embed: 768
                n_pos: 2048
            # NOTE(review): the four keys below are assumed to belong to the
            # embedding block rather than to pos_embed_config.args — confirm.
            type_embed_config: null
            ln_config: null
            p_drop_embed: 0.1
            concat_strategy: id_first

        decoder_config:
          type: TransformerDecoderBlock
          args:
            attn_config:
              type: MultiHeadKeyValueAttention
              args:
                n_embed: 768
                n_pos: 2048
                # n_head * head_size = 12 * 64 = 768, which equals n_embed.
                n_head: 12
                head_size: 64
                p_drop_attn: 0.1
                p_drop_resid: 0.1
                bias_attn: true
                bias_proj: true
                cross_attn: false
                scale_dot_product: true
                scale_layer_wise: false
                layer_idx: null
                perform_linear_bias: false
                perform_bloom_split_head: false
                perform_query_scaling: false
                attn_window_size: null
            mlp_config:
              type: TransformerMLP
              args:
                n_embed: 768
                # n_inner is 4 * n_embed (3072 = 4 * 768).
                n_inner: 3072
                act_fn_config:
                  type: NewGELUActivation
                  args: {}
                # NOTE(review): assumed to be an MLP argument (sibling of
                # act_fn_config) — confirm.
                p_drop_mlp: 0.1
            ln_config:
              type: LayerNorm
              args:
                n_embed: 768
                ln_eps: 1.0e-05
            n_embed: 768
            post_norm: false
            add_cross_attn: false

        n_embed: 768
        n_layer: 12
        n_head: 12
        ln_config:
          type: LayerNorm
          args:
            n_embed: 768
            ln_eps: 1.0e-05
        perform_linear_bias: false
        attn_window_size_loop_unit: null

    # NOTE(review): assumed to be a sibling of transformer_config under the
    # top-level model args — confirm.
    lm_head_config:
      type: TransformerLMHead
      args:
        # Same n_vocab / n_embed values as token_embed_config above.
        n_vocab: 64000
        n_embed: 768
        perform_transform: false
        act_fn_config: null
        ln_config: null