{
  "architectures": [
    "DistilxLSTM"
  ],
  "model_type": "xlstm",
  "pad_token_id": 151643,
  "torch_dtype": "float32",
  "transformers_version": "4.47.0",
  "xlstm_cfg": {
    "_block_map": "1,0,1,0,1,0",
    "add_embedding_dropout": false,
    "add_post_blocks_norm": true,
    "bias": false,
    "context_length": 512,
    "dropout": 0.0,
    "embedding_dim": 896,
    "mlstm_block": {
      "_block_idx": null,
      "_num_blocks": 6,
      "mlstm": {
        "_inner_embedding_dim": 1792,
        "_num_blocks": 6,
        "_proj_up_dim": 1792,
        "bias": false,
        "context_length": 512,
        "conv1d_kernel_size": 4,
        "dropout": 0.0,
        "embedding_dim": 896,
        "num_heads": 16,
        "proj_factor": 2.0,
        "qkv_proj_blocksize": 32,
        "round_proj_up_dim_up": true,
        "round_proj_up_to_multiple_of": 64
      }
    },
    "num_blocks": 6,
    "slstm_at": [
      0,
      2,
      4
    ],
    "slstm_block": {
      "_block_idx": null,
      "_num_blocks": 6,
      "feedforward": {
        "_num_blocks": 1,
        "_proj_up_dim": 0,
        "act_fn": "gelu",
        "bias": false,
        "dropout": 0.0,
        "embedding_dim": -1,
        "ff_type": "ffn_gated",
        "proj_factor": 1.7,
        "round_proj_up_dim_up": true,
        "round_proj_up_to_multiple_of": 64
      },
      "slstm": {
        "_block_idx": null,
        "_num_blocks": 6,
        "backend": "cuda",
        "batch_size": 8,
        "bias_init": "powerlaw_blockdependent",
        "constants": {},
        "conv1d_kernel_size": 4,
        "dropout": 0.0,
        "dtype": "bfloat16",
        "dtype_a": "float32",
        "dtype_b": "float32",
        "dtype_g": "bfloat16",
        "dtype_r": "bfloat16",
        "dtype_s": "bfloat16",
        "dtype_w": "bfloat16",
        "embedding_dim": 896,
        "enable_automatic_mixed_precision": true,
        "forward_clipval": null,
        "function": "slstm",
        "gradient_recurrent_clipval": null,
        "gradient_recurrent_cut": false,
        "group_norm_weight": true,
        "hidden_size": 896,
        "initial_val": 0.0,
        "input_shape": "BSGNH",
        "internal_input_shape": "SBNGH",
        "num_gates": 4,
        "num_heads": 16,
        "num_states": 4,
        "output_shape": "BNSH",
        "recurrent_weight_init": "zeros"
      }
    },
    "tie_weights": false,
    "vocab_size": 151936,
    "weight_decay_on_embedding": false
  }
}
|
|