thiomajid's picture
Training in progress, step 100
fd9bc05 verified
raw
history blame
2.47 kB
{
"architectures": [
"DistilxLSTM"
],
"model_type": "xlstm",
"pad_token_id": 151643,
"torch_dtype": "float32",
"transformers_version": "4.47.0",
"xlstm_cfg": {
"_block_map": "1,0,1,0,1,0",
"add_embedding_dropout": false,
"add_post_blocks_norm": true,
"bias": false,
"context_length": 512,
"dropout": 0.0,
"embedding_dim": 896,
"mlstm_block": {
"_block_idx": null,
"_num_blocks": 6,
"mlstm": {
"_inner_embedding_dim": 1792,
"_num_blocks": 6,
"_proj_up_dim": 1792,
"bias": false,
"context_length": 512,
"conv1d_kernel_size": 4,
"dropout": 0.0,
"embedding_dim": 896,
"num_heads": 16,
"proj_factor": 2.0,
"qkv_proj_blocksize": 32,
"round_proj_up_dim_up": true,
"round_proj_up_to_multiple_of": 64
}
},
"num_blocks": 6,
"slstm_at": [
0,
2,
4
],
"slstm_block": {
"_block_idx": null,
"_num_blocks": 6,
"feedforward": {
"_num_blocks": 1,
"_proj_up_dim": 0,
"act_fn": "gelu",
"bias": false,
"dropout": 0.0,
"embedding_dim": -1,
"ff_type": "ffn_gated",
"proj_factor": 1.7,
"round_proj_up_dim_up": true,
"round_proj_up_to_multiple_of": 64
},
"slstm": {
"_block_idx": null,
"_num_blocks": 6,
"backend": "cuda",
"batch_size": 8,
"bias_init": "powerlaw_blockdependent",
"constants": {},
"conv1d_kernel_size": 4,
"dropout": 0.0,
"dtype": "bfloat16",
"dtype_a": "float32",
"dtype_b": "float32",
"dtype_g": "bfloat16",
"dtype_r": "bfloat16",
"dtype_s": "bfloat16",
"dtype_w": "bfloat16",
"embedding_dim": 896,
"enable_automatic_mixed_precision": true,
"forward_clipval": null,
"function": "slstm",
"gradient_recurrent_clipval": null,
"gradient_recurrent_cut": false,
"group_norm_weight": true,
"hidden_size": 896,
"initial_val": 0.0,
"input_shape": "BSGNH",
"internal_input_shape": "SBNGH",
"num_gates": 4,
"num_heads": 16,
"num_states": 4,
"output_shape": "BNSH",
"recurrent_weight_init": "zeros"
}
},
"tie_weights": false,
"vocab_size": 151936,
"weight_decay_on_embedding": false
}
}