Text Generation
scaling
ArturBaranowskiAA's picture
Upload Pharia-1-LLM-7B-control.
7efcdb5
raw
history blame
1.4 kB
{
"transformer_architecture": {
"vocab_size": 128000,
"vocab_file": "vocab.json",
"hidden_size": 4608,
"num_layers": 27,
"num_attention_heads": 36,
"num_local_attention_heads": 0,
"local_attention_window_size": null,
"rotary_embedding_base": 1000000,
"rotary_percentage": 1.0,
"sequence_length": 8192,
"norm_type": "layernorm",
"relative_position_embedding_type": "rotary_complex",
"mlp_type": "default",
"mlp_factor": 4.0,
"attention_bias": true,
"attention_qkv_in_one": false,
"attention_num_kv_heads": 4,
"attention_use_matmul": false,
"mlp_bias": true,
"key_query_norm": false,
"weight_tying": false,
"masked_softmax": {
"kernel": "torch",
"softmax_in_fp32": true,
"scale": 1.0,
"deterministic_flash_attn_bwd": false
},
"layernorm": {
"optimization_type": "torch",
"layernorm_epsilon": 1e-05
},
"precision": "bfloat16",
"dropout_embedding": 0.0,
"dropout_attention_probs": 0.0,
"dropout_after_attention": 0.0,
"dropout_after_mlp": 0.0,
"finetunable_token_ids": [],
"image_encoder": false,
"dropout_image_encoder": 0.0,
"lora_config": null
}
}