Variable decoder/decoder/encoder_decoder_attention/key/kernel size 576 shape (embed=8, layers=3, heads=4, kv=6) partition spec (None, None, 'model', None) Variable decoder/decoder/encoder_decoder_attention/out/kernel size 576 shape (heads=4, layers=3, kv=6, embed=8) partition spec ('model', None, None, None) Variable decoder/decoder/encoder_decoder_attention/query/kernel size 576 shape (embed=8, layers=3, heads=4, kv=6) partition spec (None, None, 'model', None) Variable decoder/decoder/encoder_decoder_attention/value/kernel size 576 shape (embed=8, layers=3, heads=4, kv=6) partition spec (None, None, 'model', None) Variable decoder/decoder/mlp/wi_0/kernel size 384 shape (embed=8, layers=3, mlp=16) partition spec (None, None, 'model') Variable decoder/decoder/mlp/wi_1/kernel size 384 shape (embed=8, layers=3, mlp=16) partition spec (None, None, 'model') Variable decoder/decoder/mlp/wo/kernel size 384 shape (mlp=16, layers=3, embed=8) partition spec ('model', None, None) Variable decoder/decoder/pre_cross_attention_layer_norm/scale size 24 shape (embed=8, layers=3) partition spec (None, None) Variable decoder/decoder/pre_mlp_layer_norm/scale size 24 shape (embed=8, layers=3) partition spec (None, None) Variable decoder/decoder/pre_self_attention_layer_norm/scale size 24 shape (embed=8, layers=3) partition spec (None, None) Variable decoder/decoder/relpos_bias/rel_embedding size 768 shape (heads=4, layers=3, relpos_buckets=64) partition spec ('model', None, None) Variable decoder/decoder/self_attention/key/kernel size 576 shape (embed=8, layers=3, heads=4, kv=6) partition spec (None, None, 'model', None) Variable decoder/decoder/self_attention/out/kernel size 576 shape (heads=4, layers=3, kv=6, embed=8) partition spec ('model', None, None, None) Variable decoder/decoder/self_attention/query/kernel size 576 shape (embed=8, layers=3, heads=4, kv=6) partition spec (None, None, 'model', None) Variable decoder/decoder/self_attention/value/kernel size 576 shape (embed=8, layers=3, heads=4, kv=6) partition spec (None, None, 'model', None) Variable decoder/decoder_norm/scale size 8 shape (embed=8) partition spec (None,) Variable decoder/logits_dense/kernel size 2048 shape (embed=8, vocab=256) partition spec (None, 'model') Variable encoder/encoder/attention/key/kernel size 576 shape (embed=8, layers=3, heads=4, kv=6) partition spec (None, None, 'model', None) Variable encoder/encoder/attention/out/kernel size 576 shape (heads=4, layers=3, kv=6, embed=8) partition spec ('model', None, None, None) Variable encoder/encoder/attention/query/kernel size 576 shape (embed=8, layers=3, heads=4, kv=6) partition spec (None, None, 'model', None) Variable encoder/encoder/attention/value/kernel size 576 shape (embed=8, layers=3, heads=4, kv=6) partition spec (None, None, 'model', None) Variable encoder/encoder/mlp/wi_0/kernel size 384 shape (embed=8, layers=3, mlp=16) partition spec (None, None, 'model') Variable encoder/encoder/mlp/wi_1/kernel size 384 shape (embed=8, layers=3, mlp=16) partition spec (None, None, 'model') Variable encoder/encoder/mlp/wo/kernel size 384 shape (mlp=16, layers=3, embed=8) partition spec ('model', None, None) Variable encoder/encoder/pre_attention_layer_norm/scale size 24 shape (embed=8, layers=3) partition spec (None, None) Variable encoder/encoder/pre_mlp_layer_norm/scale size 24 shape (embed=8, layers=3) partition spec (None, None) Variable encoder/encoder/relpos_bias/rel_embedding size 768 shape (heads=4, layers=3, relpos_buckets=64) partition spec ('model', None, None) Variable encoder/encoder_norm/scale size 8 shape (embed=8) partition spec (None,) Variable token_embedder/embedding size 2048 shape (vocab=256, embed=8) partition spec ('model', None) Total number of parameters: 14984 Variable param_states/decoder/decoder/encoder_decoder_attention/key/kernel/m size 1 shape (1,) partition spec None Variable param_states/decoder/decoder/encoder_decoder_attention/key/kernel/v size 1 shape (1,) partition spec None Variable param_states/decoder/decoder/encoder_decoder_attention/key/kernel/v_col size 72 shape (3, 4, 6) partition spec None Variable param_states/decoder/decoder/encoder_decoder_attention/key/kernel/v_row size 24 shape (8, 3) partition spec None Variable param_states/decoder/decoder/encoder_decoder_attention/out/kernel/m size 1 shape (1,) partition spec None Variable param_states/decoder/decoder/encoder_decoder_attention/out/kernel/v size 1 shape (1,) partition spec None Variable param_states/decoder/decoder/encoder_decoder_attention/out/kernel/v_col size 72 shape (4, 3, 6) partition spec None Variable param_states/decoder/decoder/encoder_decoder_attention/out/kernel/v_row size 24 shape (3, 8) partition spec None Variable param_states/decoder/decoder/encoder_decoder_attention/query/kernel/m size 1 shape (1,) partition spec None Variable param_states/decoder/decoder/encoder_decoder_attention/query/kernel/v size 1 shape (1,) partition spec None Variable param_states/decoder/decoder/encoder_decoder_attention/query/kernel/v_col size 72 shape (3, 4, 6) partition spec None Variable param_states/decoder/decoder/encoder_decoder_attention/query/kernel/v_row size 24 shape (8, 3) partition spec None Variable param_states/decoder/decoder/encoder_decoder_attention/value/kernel/m size 1 shape (1,) partition spec None Variable param_states/decoder/decoder/encoder_decoder_attention/value/kernel/v size 1 shape (1,) partition spec None Variable param_states/decoder/decoder/encoder_decoder_attention/value/kernel/v_col size 72 shape (3, 4, 6) partition spec None Variable param_states/decoder/decoder/encoder_decoder_attention/value/kernel/v_row size 24 shape (8, 3) partition spec None Variable param_states/decoder/decoder/mlp/wi_0/kernel/m size 1 shape (1,) partition spec None Variable param_states/decoder/decoder/mlp/wi_0/kernel/v size 1 shape (1,) partition spec None Variable param_states/decoder/decoder/mlp/wi_0/kernel/v_col size 48 shape (3, 16) partition spec None Variable param_states/decoder/decoder/mlp/wi_0/kernel/v_row size 24 shape (8, 3) partition spec None Variable param_states/decoder/decoder/mlp/wi_1/kernel/m size 1 shape (1,) partition spec None Variable param_states/decoder/decoder/mlp/wi_1/kernel/v size 1 shape (1,) partition spec None Variable param_states/decoder/decoder/mlp/wi_1/kernel/v_col size 48 shape (3, 16) partition spec None Variable param_states/decoder/decoder/mlp/wi_1/kernel/v_row size 24 shape (8, 3) partition spec None Variable param_states/decoder/decoder/mlp/wo/kernel/m size 1 shape (1,) partition spec None Variable param_states/decoder/decoder/mlp/wo/kernel/v size 1 shape (1,) partition spec None Variable param_states/decoder/decoder/mlp/wo/kernel/v_col size 48 shape (16, 3) partition spec None Variable param_states/decoder/decoder/mlp/wo/kernel/v_row size 24 shape (3, 8) partition spec None Variable param_states/decoder/decoder/pre_cross_attention_layer_norm/scale/m size 1 shape (1,) partition spec None Variable param_states/decoder/decoder/pre_cross_attention_layer_norm/scale/v size 24 shape (embed=8, layers=3) partition spec (None, None) Variable param_states/decoder/decoder/pre_cross_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None Variable param_states/decoder/decoder/pre_cross_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None Variable param_states/decoder/decoder/pre_mlp_layer_norm/scale/m size 1 shape (1,) partition spec None Variable param_states/decoder/decoder/pre_mlp_layer_norm/scale/v size 24 shape (embed=8, layers=3) partition spec (None, None) Variable param_states/decoder/decoder/pre_mlp_layer_norm/scale/v_col size 1 shape (1,) partition spec None Variable param_states/decoder/decoder/pre_mlp_layer_norm/scale/v_row size 1 shape (1,) partition spec None Variable param_states/decoder/decoder/pre_self_attention_layer_norm/scale/m size 1 shape (1,) partition spec None Variable param_states/decoder/decoder/pre_self_attention_layer_norm/scale/v size 24 shape (embed=8, layers=3) partition spec (None, None) Variable param_states/decoder/decoder/pre_self_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None Variable param_states/decoder/decoder/pre_self_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None Variable param_states/decoder/decoder/relpos_bias/rel_embedding/m size 1 shape (1,) partition spec None Variable param_states/decoder/decoder/relpos_bias/rel_embedding/v size 768 shape (heads=4, layers=3, relpos_buckets=64) partition spec ('model', None, None) Variable param_states/decoder/decoder/relpos_bias/rel_embedding/v_col size 1 shape (1,) partition spec None Variable param_states/decoder/decoder/relpos_bias/rel_embedding/v_row size 1 shape (1,) partition spec None Variable param_states/decoder/decoder/self_attention/key/kernel/m size 1 shape (1,) partition spec None Variable param_states/decoder/decoder/self_attention/key/kernel/v size 1 shape (1,) partition spec None Variable param_states/decoder/decoder/self_attention/key/kernel/v_col size 72 shape (3, 4, 6) partition spec None Variable param_states/decoder/decoder/self_attention/key/kernel/v_row size 24 shape (8, 3) partition spec None Variable param_states/decoder/decoder/self_attention/out/kernel/m size 1 shape (1,) partition spec None Variable param_states/decoder/decoder/self_attention/out/kernel/v size 1 shape (1,) partition spec None Variable param_states/decoder/decoder/self_attention/out/kernel/v_col size 72 shape (4, 3, 6) partition spec None Variable param_states/decoder/decoder/self_attention/out/kernel/v_row size 24 shape (3, 8) partition spec None Variable param_states/decoder/decoder/self_attention/query/kernel/m size 1 shape (1,) partition spec None Variable param_states/decoder/decoder/self_attention/query/kernel/v size 1 shape (1,) partition spec None Variable param_states/decoder/decoder/self_attention/query/kernel/v_col size 72 shape (3, 4, 6) partition spec None Variable param_states/decoder/decoder/self_attention/query/kernel/v_row size 24 shape (8, 3) partition spec None Variable param_states/decoder/decoder/self_attention/value/kernel/m size 1 shape (1,) partition spec None Variable param_states/decoder/decoder/self_attention/value/kernel/v size 1 shape (1,) partition spec None Variable param_states/decoder/decoder/self_attention/value/kernel/v_col size 72 shape (3, 4, 6) partition spec None Variable param_states/decoder/decoder/self_attention/value/kernel/v_row size 24 shape (8, 3) partition spec None Variable param_states/decoder/decoder_norm/scale/m size 1 shape (1,) partition spec None Variable param_states/decoder/decoder_norm/scale/v size 8 shape (embed=8) partition spec (None,) Variable param_states/decoder/decoder_norm/scale/v_col size 1 shape (1,) partition spec None Variable param_states/decoder/decoder_norm/scale/v_row size 1 shape (1,) partition spec None Variable param_states/decoder/logits_dense/kernel/m size 1 shape (1,) partition spec None Variable param_states/decoder/logits_dense/kernel/v size 2048 shape (embed=8, vocab=256) partition spec (None, 'model') Variable param_states/decoder/logits_dense/kernel/v_col size 1 shape (1,) partition spec None Variable param_states/decoder/logits_dense/kernel/v_row size 1 shape (1,) partition spec None Variable param_states/encoder/encoder/attention/key/kernel/m size 1 shape (1,) partition spec None Variable param_states/encoder/encoder/attention/key/kernel/v size 1 shape (1,) partition spec None Variable param_states/encoder/encoder/attention/key/kernel/v_col size 72 shape (3, 4, 6) partition spec None Variable param_states/encoder/encoder/attention/key/kernel/v_row size 24 shape (8, 3) partition spec None Variable param_states/encoder/encoder/attention/out/kernel/m size 1 shape (1,) partition spec None Variable param_states/encoder/encoder/attention/out/kernel/v size 1 shape (1,) partition spec None Variable param_states/encoder/encoder/attention/out/kernel/v_col size 72 shape (4, 3, 6) partition spec None Variable param_states/encoder/encoder/attention/out/kernel/v_row size 24 shape (3, 8) partition spec None Variable param_states/encoder/encoder/attention/query/kernel/m size 1 shape (1,) partition spec None Variable param_states/encoder/encoder/attention/query/kernel/v size 1 shape (1,) partition spec None Variable param_states/encoder/encoder/attention/query/kernel/v_col size 72 shape (3, 4, 6) partition spec None Variable param_states/encoder/encoder/attention/query/kernel/v_row size 24 shape (8, 3) partition spec None Variable param_states/encoder/encoder/attention/value/kernel/m size 1 shape (1,) partition spec None Variable param_states/encoder/encoder/attention/value/kernel/v size 1 shape (1,) partition spec None Variable param_states/encoder/encoder/attention/value/kernel/v_col size 72 shape (3, 4, 6) partition spec None Variable param_states/encoder/encoder/attention/value/kernel/v_row size 24 shape (8, 3) partition spec None Variable param_states/encoder/encoder/mlp/wi_0/kernel/m size 1 shape (1,) partition spec None Variable param_states/encoder/encoder/mlp/wi_0/kernel/v size 1 shape (1,) partition spec None Variable param_states/encoder/encoder/mlp/wi_0/kernel/v_col size 48 shape (3, 16) partition spec None Variable param_states/encoder/encoder/mlp/wi_0/kernel/v_row size 24 shape (8, 3) partition spec None Variable param_states/encoder/encoder/mlp/wi_1/kernel/m size 1 shape (1,) partition spec None Variable param_states/encoder/encoder/mlp/wi_1/kernel/v size 1 shape (1,) partition spec None Variable param_states/encoder/encoder/mlp/wi_1/kernel/v_col size 48 shape (3, 16) partition spec None Variable param_states/encoder/encoder/mlp/wi_1/kernel/v_row size 24 shape (8, 3) partition spec None Variable param_states/encoder/encoder/mlp/wo/kernel/m size 1 shape (1,) partition spec None Variable param_states/encoder/encoder/mlp/wo/kernel/v size 1 shape (1,) partition spec None Variable param_states/encoder/encoder/mlp/wo/kernel/v_col size 48 shape (16, 3) partition spec None Variable param_states/encoder/encoder/mlp/wo/kernel/v_row size 24 shape (3, 8) partition spec None Variable param_states/encoder/encoder/pre_attention_layer_norm/scale/m size 1 shape (1,) partition spec None Variable param_states/encoder/encoder/pre_attention_layer_norm/scale/v size 24 shape (embed=8, layers=3) partition spec (None, None) Variable param_states/encoder/encoder/pre_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None Variable param_states/encoder/encoder/pre_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None Variable param_states/encoder/encoder/pre_mlp_layer_norm/scale/m size 1 shape (1,) partition spec None Variable param_states/encoder/encoder/pre_mlp_layer_norm/scale/v size 24 shape (embed=8, layers=3) partition spec (None, None) Variable param_states/encoder/encoder/pre_mlp_layer_norm/scale/v_col size 1 shape (1,) partition spec None Variable param_states/encoder/encoder/pre_mlp_layer_norm/scale/v_row size 1 shape (1,) partition spec None Variable param_states/encoder/encoder/relpos_bias/rel_embedding/m size 1 shape (1,) partition spec None Variable param_states/encoder/encoder/relpos_bias/rel_embedding/v size 768 shape (heads=4, layers=3, relpos_buckets=64) partition spec ('model', None, None) Variable param_states/encoder/encoder/relpos_bias/rel_embedding/v_col size 1 shape (1,) partition spec None Variable param_states/encoder/encoder/relpos_bias/rel_embedding/v_row size 1 shape (1,) partition spec None Variable param_states/encoder/encoder_norm/scale/m size 1 shape (1,) partition spec None Variable param_states/encoder/encoder_norm/scale/v size 8 shape (embed=8) partition spec (None,) Variable param_states/encoder/encoder_norm/scale/v_col size 1 shape (1,) partition spec None Variable param_states/encoder/encoder_norm/scale/v_row size 1 shape (1,) partition spec None Variable param_states/token_embedder/embedding/m size 1 shape (1,) partition spec None Variable param_states/token_embedder/embedding/v size 2048 shape (vocab=256, embed=8) partition spec ('model', None) Variable param_states/token_embedder/embedding/v_col size 1 shape (1,) partition spec None Variable param_states/token_embedder/embedding/v_row size 1 shape (1,) partition spec None Variable step size 1 shape () partition spec None