walsh-1-7b / config.json
{
"_name_or_path": "/home/dinalt/ai_assets/models/walsh",
"activation_args": {},
"activation_cls": "torch.nn.GELU",
"architectures": [
"HFCausalModel"
],
"attention_args": {
"beta": 0.25,
"dropout": 0.1
},
"attention_cls": ".CausalSelfAttention",
"auto_map": {
"AutoConfig": "modelling_walsh.Config",
"AutoModelForCausalLM": "modelling_walsh.HFCausalModel"
},
"d_embed": 2048,
"dim_feedforward": 8192,
"dropout": 0.1,
"embdding_cls": "torch.nn.Embedding",
"embedding_args": {},
"feedforward_args": {
"beta": 0.25,
"bias": true
},
"feedforward_cls": ".FeedforwardLayer",
"head_args": {},
"head_cls": ".Transformer",
"init_gain": 1.0,
"layer_args": {
"alpha": 2.828427124746
},
"layer_cls": ".DeepnetLayer",
"layer_stack_args": {},
"layer_stack_cls": ".TransformerLayerStack",
"loss_function": ".causal_loss",
"max_sequence_length": 16384,
"model_type": "walsh-causal-v1",
"norm_args": {
"normalized_shape": 2084
  },
  "norm_cls": "torch.nn.LayerNorm",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "output_proj_args": {},
  "output_proj_cls": "torch.nn.Linear",
  "pad_index": null,
  "positional_encoder_args": {
    "d_embed": 2048,
    "gain": 0.3333,
    "max_seq": 16384
  },
  "positional_encoder_cls": ".RSWalshPositionalEncoder",
  "torch_dtype": "bfloat16",
  "transformer_args": {},
  "transformer_cls": ".Transformer",
  "transformers_version": "4.37.2",
  "vocab_size": 32000
}
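
The auto_map entries above route AutoConfig and AutoModelForCausalLM to custom code (modelling_walsh.py) shipped with the repository, so loading requires trust_remote_code=True. Below is a minimal loading sketch; the repo id "dinalt/walsh-1-7b" is assumed from the page title and is not part of the config itself.

# Minimal loading sketch, assuming the repo id "dinalt/walsh-1-7b"
# (taken from the page title, not from the config).
# trust_remote_code=True is required because auto_map points
# AutoConfig / AutoModelForCausalLM at modelling_walsh.py.
import torch
from transformers import AutoConfig, AutoModelForCausalLM

repo_id = "dinalt/walsh-1-7b"  # assumed repository id

config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
print(config.model_type)  # -> "walsh-causal-v1"

model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,  # matches "torch_dtype" in the config
)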