{
"_name_or_path": "checkpoints/mtgv/MobileLLaMA-1.4B-Chat",
"anyprec": {
"arch_config": {
"layers_name": "layers",
"model_name": "model",
"module_names": [
"self_attn.q_proj",
"self_attn.k_proj",
"self_attn.v_proj",
"self_attn.o_proj",
"mlp.gate_proj",
"mlp.up_proj",
"mlp.down_proj"
]
},
"group_count": 1,
"parent_precision": 4,
"seed_precision": 2,
"sparse_numvals": {
"model.layers.0.mlp.down_proj": 103227,
"model.layers.0.mlp.gate_proj": 104099,
"model.layers.0.mlp.up_proj": 94464,
"model.layers.0.self_attn.k_proj": 123232,
"model.layers.0.self_attn.o_proj": 40694,
"model.layers.0.self_attn.q_proj": 107706,
"model.layers.0.self_attn.v_proj": 45328,
"model.layers.1.mlp.down_proj": 99550,
"model.layers.1.mlp.gate_proj": 105114,
"model.layers.1.mlp.up_proj": 94325,
"model.layers.1.self_attn.k_proj": 282518,
"model.layers.1.self_attn.o_proj": 79591,
"model.layers.1.self_attn.q_proj": 274280,
"model.layers.1.self_attn.v_proj": 54163,
"model.layers.10.mlp.down_proj": 98714,
"model.layers.10.mlp.gate_proj": 119347,
"model.layers.10.mlp.up_proj": 102192,
"model.layers.10.self_attn.k_proj": 135560,
"model.layers.10.self_attn.o_proj": 49120,
"model.layers.10.self_attn.q_proj": 122608,
"model.layers.10.self_attn.v_proj": 51203,
"model.layers.11.mlp.down_proj": 99624,
"model.layers.11.mlp.gate_proj": 121942,
"model.layers.11.mlp.up_proj": 103267,
"model.layers.11.self_attn.k_proj": 127872,
"model.layers.11.self_attn.o_proj": 45314,
"model.layers.11.self_attn.q_proj": 125543,
"model.layers.11.self_attn.v_proj": 49611,
"model.layers.12.mlp.down_proj": 103935,
"model.layers.12.mlp.gate_proj": 134318,
"model.layers.12.mlp.up_proj": 109313,
"model.layers.12.self_attn.k_proj": 115429,
"model.layers.12.self_attn.o_proj": 41620,
"model.layers.12.self_attn.q_proj": 109014,
"model.layers.12.self_attn.v_proj": 48602,
"model.layers.13.mlp.down_proj": 108353,
"model.layers.13.mlp.gate_proj": 158387,
"model.layers.13.mlp.up_proj": 112940,
"model.layers.13.self_attn.k_proj": 114487,
"model.layers.13.self_attn.o_proj": 45428,
"model.layers.13.self_attn.q_proj": 113406,
"model.layers.13.self_attn.v_proj": 57153,
"model.layers.14.mlp.down_proj": 113863,
"model.layers.14.mlp.gate_proj": 168356,
"model.layers.14.mlp.up_proj": 118034,
"model.layers.14.self_attn.k_proj": 120529,
"model.layers.14.self_attn.o_proj": 45075,
"model.layers.14.self_attn.q_proj": 112276,
"model.layers.14.self_attn.v_proj": 54165,
"model.layers.15.mlp.down_proj": 111026,
"model.layers.15.mlp.gate_proj": 162430,
"model.layers.15.mlp.up_proj": 119102,
"model.layers.15.self_attn.k_proj": 123737,
"model.layers.15.self_attn.o_proj": 47443,
"model.layers.15.self_attn.q_proj": 125065,
"model.layers.15.self_attn.v_proj": 56583,
"model.layers.16.mlp.down_proj": 110444,
"model.layers.16.mlp.gate_proj": 154396,
"model.layers.16.mlp.up_proj": 118334,
"model.layers.16.self_attn.k_proj": 108657,
"model.layers.16.self_attn.o_proj": 47679,
"model.layers.16.self_attn.q_proj": 107248,
"model.layers.16.self_attn.v_proj": 54589,
"model.layers.17.mlp.down_proj": 108084,
"model.layers.17.mlp.gate_proj": 143872,
"model.layers.17.mlp.up_proj": 116322,
"model.layers.17.self_attn.k_proj": 117888,
"model.layers.17.self_attn.o_proj": 48108,
"model.layers.17.self_attn.q_proj": 120507,
"model.layers.17.self_attn.v_proj": 52799,
"model.layers.18.mlp.down_proj": 103807,
"model.layers.18.mlp.gate_proj": 131358,
"model.layers.18.mlp.up_proj": 113634,
"model.layers.18.self_attn.k_proj": 105856,
"model.layers.18.self_attn.o_proj": 49346,
"model.layers.18.self_attn.q_proj": 122501,
"model.layers.18.self_attn.v_proj": 51576,
"model.layers.19.mlp.down_proj": 102430,
"model.layers.19.mlp.gate_proj": 123863,
"model.layers.19.mlp.up_proj": 110315,
"model.layers.19.self_attn.k_proj": 104299,
"model.layers.19.self_attn.o_proj": 61142,
"model.layers.19.self_attn.q_proj": 105187,
"model.layers.19.self_attn.v_proj": 62397,
"model.layers.2.mlp.down_proj": 95852,
"model.layers.2.mlp.gate_proj": 95205,
"model.layers.2.mlp.up_proj": 92548,
"model.layers.2.self_attn.k_proj": 189410,
"model.layers.2.self_attn.o_proj": 49748,
"model.layers.2.self_attn.q_proj": 149970,
"model.layers.2.self_attn.v_proj": 44302,
"model.layers.20.mlp.down_proj": 101061,
"model.layers.20.mlp.gate_proj": 118439,
"model.layers.20.mlp.up_proj": 109198,
"model.layers.20.self_attn.k_proj": 87935,
"model.layers.20.self_attn.o_proj": 54376,
"model.layers.20.self_attn.q_proj": 89672,
"model.layers.20.self_attn.v_proj": 50476,
"model.layers.21.mlp.down_proj": 100632,
"model.layers.21.mlp.gate_proj": 109938,
"model.layers.21.mlp.up_proj": 103567,
"model.layers.21.self_attn.k_proj": 91720,
"model.layers.21.self_attn.o_proj": 74802,
"model.layers.21.self_attn.q_proj": 88150,
"model.layers.21.self_attn.v_proj": 69573,
"model.layers.22.mlp.down_proj": 111233,
"model.layers.22.mlp.gate_proj": 121677,
"model.layers.22.mlp.up_proj": 108841,
"model.layers.22.self_attn.k_proj": 100375,
"model.layers.22.self_attn.o_proj": 61122,
"model.layers.22.self_attn.q_proj": 101996,
"model.layers.22.self_attn.v_proj": 59543,
"model.layers.23.mlp.down_proj": 165882,
"model.layers.23.mlp.gate_proj": 149269,
"model.layers.23.mlp.up_proj": 156710,
"model.layers.23.self_attn.k_proj": 74217,
"model.layers.23.self_attn.o_proj": 70134,
"model.layers.23.self_attn.q_proj": 71364,
"model.layers.23.self_attn.v_proj": 66215,
"model.layers.3.mlp.down_proj": 94469,
"model.layers.3.mlp.gate_proj": 93720,
"model.layers.3.mlp.up_proj": 93335,
"model.layers.3.self_attn.k_proj": 129559,
"model.layers.3.self_attn.o_proj": 39091,
"model.layers.3.self_attn.q_proj": 100716,
"model.layers.3.self_attn.v_proj": 39453,
"model.layers.4.mlp.down_proj": 97785,
"model.layers.4.mlp.gate_proj": 97248,
"model.layers.4.mlp.up_proj": 95089,
"model.layers.4.self_attn.k_proj": 128341,
"model.layers.4.self_attn.o_proj": 37298,
"model.layers.4.self_attn.q_proj": 92904,
"model.layers.4.self_attn.v_proj": 38505,
"model.layers.5.mlp.down_proj": 96379,
"model.layers.5.mlp.gate_proj": 95376,
"model.layers.5.mlp.up_proj": 94082,
"model.layers.5.self_attn.k_proj": 122556,
"model.layers.5.self_attn.o_proj": 43788,
"model.layers.5.self_attn.q_proj": 102967,
"model.layers.5.self_attn.v_proj": 47872,
"model.layers.6.mlp.down_proj": 94813,
"model.layers.6.mlp.gate_proj": 99275,
"model.layers.6.mlp.up_proj": 94511,
"model.layers.6.self_attn.k_proj": 123676,
"model.layers.6.self_attn.o_proj": 49440,
"model.layers.6.self_attn.q_proj": 103919,
"model.layers.6.self_attn.v_proj": 57010,
"model.layers.7.mlp.down_proj": 94883,
"model.layers.7.mlp.gate_proj": 102457,
"model.layers.7.mlp.up_proj": 97437,
"model.layers.7.self_attn.k_proj": 100675,
"model.layers.7.self_attn.o_proj": 42193,
"model.layers.7.self_attn.q_proj": 83228,
"model.layers.7.self_attn.v_proj": 45513,
"model.layers.8.mlp.down_proj": 97843,
"model.layers.8.mlp.gate_proj": 113784,
"model.layers.8.mlp.up_proj": 103097,
"model.layers.8.self_attn.k_proj": 113650,
"model.layers.8.self_attn.o_proj": 52513,
"model.layers.8.self_attn.q_proj": 92988,
"model.layers.8.self_attn.v_proj": 60755,
"model.layers.9.mlp.down_proj": 96823,
"model.layers.9.mlp.gate_proj": 112042,
"model.layers.9.mlp.up_proj": 102440,
"model.layers.9.self_attn.k_proj": 128169,
"model.layers.9.self_attn.o_proj": 54639,
"model.layers.9.self_attn.q_proj": 116292,
"model.layers.9.self_attn.v_proj": 60561
}
},
"architectures": [
"LlamaForCausalLM"
],
"attention_bias": false,
"attention_dropout": 0.0,
"bos_token_id": 1,
"eos_token_id": 2,
"hidden_act": "silu",
"hidden_size": 2048,
"initializer_range": 0.02,
"intermediate_size": 5632,
"max_position_embeddings": 2048,
"max_sequence_length": 2048,
"model_type": "llama",
"num_attention_heads": 16,
"num_hidden_layers": 24,
"num_key_value_heads": 16,
"pad_token_id": 0,
"pretraining_tp": 1,
"rms_norm_eps": 1e-06,
"rope_scaling": null,
"rope_theta": 10000.0,
"tie_word_embeddings": false,
"torch_dtype": "float16",
"transformers_version": "4.39.3",
"use_cache": true,
"vocab_size": 32000
}
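
For reference, the `anyprec` block above appears to configure an any-precision quantized checkpoint: each listed projection module is stored with a 2-bit seed and 4-bit parent bit-width pair (`seed_precision` / `parent_precision`), and `sparse_numvals` records how many weights per module are kept aside as sparse outlier values. The sketch below is a minimal, hypothetical way to turn those counts into per-module outlier fractions; it assumes the file is saved locally as `config.json` and uses the standard LLaMA projection shapes implied by `hidden_size` and `intermediate_size` (since `num_key_value_heads` equals `num_attention_heads` here, `k_proj` and `v_proj` are full 2048×2048 matrices).

```python
import json

# Load this checkpoint's config.json (local path is an assumption; adjust as needed).
with open("config.json") as f:
    cfg = json.load(f)

hidden = cfg["hidden_size"]        # 2048
inter = cfg["intermediate_size"]   # 5632

# Element counts for each quantized module, from the standard LLaMA
# projection shapes. num_key_value_heads == num_attention_heads in this
# config, so k_proj/v_proj are full hidden x hidden matrices.
shape_elems = {
    "self_attn.q_proj": hidden * hidden,
    "self_attn.k_proj": hidden * hidden,
    "self_attn.v_proj": hidden * hidden,
    "self_attn.o_proj": hidden * hidden,
    "mlp.gate_proj": inter * hidden,
    "mlp.up_proj": inter * hidden,
    "mlp.down_proj": hidden * inter,
}

# Fraction of each module's weights kept as sparse outlier values.
for name, numvals in sorted(cfg["anyprec"]["sparse_numvals"].items()):
    suffix = name.split(".", 3)[-1]  # e.g. "model.layers.0.mlp.down_proj" -> "mlp.down_proj"
    total = shape_elems[suffix]
    print(f"{name}: {numvals} / {total} = {numvals / total:.3%}")
```

Running this shows the outlier budget is far from uniform: early-layer attention projections carry the most (e.g. `model.layers.1.self_attn.k_proj` keeps 282518 of 4194304 weights, about 6.7%), while the MLP matrices mostly sit near 1%.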