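# Architecture config for a small decoder-only transformer language model:
# 6 layers, n_embed 512, 8 attention heads, 2048-token context, vocab size 50304.
# Token embeddings only; there is no learned positional embedding (positional
# information comes from the rotary embeddings configured inside attention).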
arch:
  type: TransformerLMHeadModel
  args:
    transformer_config:
      type: TransformerDecoderOnlyModel
      args:
        embed_config:
          type: TransformerEmbeddingBlock
          args:
            token_embed_config:
              type: TokenEmbedding
              args:
                n_embed: 512
                n_vocab: 50304
            pos_embed_config: null
            type_embed_config: null
            ln_config: null
            p_drop_embed: 0.0
            concat_strategy: id_first
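        # Decoder layer template, presumably instantiated n_layer times. The
        # "parallel" block variant computes attention and MLP from the same
        # input and sums both into the residual stream (pre-norm, since
        # post_norm is false below).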
        decoder_config:
          type: ParallelTransformerDecoderBlock
          args:
            attn_config:
              type: GPTNeoXAttention
              args:
                n_embed: 512
                n_pos: 2048
                n_head: 8
                n_key_value_head: 8
                head_size: 64
                p_drop_attn: 0.0
                p_drop_resid: 0.0
                bias_attn: true
                bias_proj: true
                cross_attn: false
                scale_dot_product: true
                scale_layer_wise: false
                layer_idx: null
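                # Rotary position embeddings over 16 of the 64 head dimensions
                # (partial rotary), base 10000, no context-length scaling.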
                rope_config:
                  type: MistralRotaryEmbedding
                  args:
                    rotary_head_size: 16
                    n_pos: 2048
                    base: 10000
                    scaling_type: null
                    scaling_factor: null
                perform_bloom_split_head: true
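            # Feed-forward sub-block: 512 -> 2048 -> 512 with NewGELU activation.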
            mlp_config:
              type: TransformerMLP
              args:
                n_embed: 512
                n_inner: 2048
                act_fn_config:
                  type: NewGELUActivation
                  args: {}
                p_drop_mlp: 0.0
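            # Per-layer LayerNorm settings; applied pre-norm (post_norm: false)
            # and not shared between the attention and MLP paths
            # (share_layer_norm: false).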
            ln_config:
              type: LayerNorm
              args:
                n_embed: 512
                ln_eps: 1.0e-05
            n_embed: 512
            post_norm: false
            add_cross_attn: false
            share_layer_norm: false
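        # Stack-level settings: 6 decoder layers and a model-level LayerNorm
        # (likely the final norm); linear attention bias is disabled.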
        n_embed: 512
        n_layer: 6
        n_head: 8
        ln_config:
          type: LayerNorm
          args:
            n_embed: 512
            ln_eps: 1.0e-05
        perform_linear_bias: false
        attn_window_size_loop_unit: null
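    # Output head: plain linear projection from n_embed 512 to the 50304-token
    # vocabulary, with no bias, no intermediate transform, and no extra LayerNorm.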
    lm_head_config:
      type: TransformerLMHead
      args:
        n_vocab: 50304
        n_embed: 512
        bias_lm_head: false
        perform_transform: false
        act_fn_config: null
        ln_config: null