|
{ |
|
"_name_or_path": "checkpoints/microsoft/phi-1_5", |
|
"anyprec": { |
|
"arch_config": { |
|
"layers_name": "layers", |
|
"model_name": "model", |
|
"module_names": [ |
|
"self_attn.q_proj", |
|
"self_attn.k_proj", |
|
"self_attn.v_proj", |
|
"self_attn.dense", |
|
"mlp.fc1", |
|
"mlp.fc2" |
|
] |
|
}, |
|
"group_count": 1, |
|
"parent_precision": 4, |
|
"seed_precision": 2, |
|
"sparse_numvals": { |
|
"model.layers.0.mlp.fc1": 675311, |
|
"model.layers.0.mlp.fc2": 624542, |
|
"model.layers.0.self_attn.dense": 82454, |
|
"model.layers.0.self_attn.k_proj": 180985, |
|
"model.layers.0.self_attn.q_proj": 156889, |
|
"model.layers.0.self_attn.v_proj": 94332, |
|
"model.layers.1.mlp.fc1": 201243, |
|
"model.layers.1.mlp.fc2": 325345, |
|
"model.layers.1.self_attn.dense": 73540, |
|
"model.layers.1.self_attn.k_proj": 103462, |
|
"model.layers.1.self_attn.q_proj": 99058, |
|
"model.layers.1.self_attn.v_proj": 86333, |
|
"model.layers.10.mlp.fc1": 295445, |
|
"model.layers.10.mlp.fc2": 323451, |
|
"model.layers.10.self_attn.dense": 72926, |
|
"model.layers.10.self_attn.k_proj": 100535, |
|
"model.layers.10.self_attn.q_proj": 94643, |
|
"model.layers.10.self_attn.v_proj": 85469, |
|
"model.layers.11.mlp.fc1": 291450, |
|
"model.layers.11.mlp.fc2": 321914, |
|
"model.layers.11.self_attn.dense": 71540, |
|
"model.layers.11.self_attn.k_proj": 97013, |
|
"model.layers.11.self_attn.q_proj": 89427, |
|
"model.layers.11.self_attn.v_proj": 82468, |
|
"model.layers.12.mlp.fc1": 285509, |
|
"model.layers.12.mlp.fc2": 328599, |
|
"model.layers.12.self_attn.dense": 69830, |
|
"model.layers.12.self_attn.k_proj": 101851, |
|
"model.layers.12.self_attn.q_proj": 94202, |
|
"model.layers.12.self_attn.v_proj": 84071, |
|
"model.layers.13.mlp.fc1": 277413, |
|
"model.layers.13.mlp.fc2": 308466, |
|
"model.layers.13.self_attn.dense": 74257, |
|
"model.layers.13.self_attn.k_proj": 101329, |
|
"model.layers.13.self_attn.q_proj": 94394, |
|
"model.layers.13.self_attn.v_proj": 83090, |
|
"model.layers.14.mlp.fc1": 272080, |
|
"model.layers.14.mlp.fc2": 347434, |
|
"model.layers.14.self_attn.dense": 77486, |
|
"model.layers.14.self_attn.k_proj": 99568, |
|
"model.layers.14.self_attn.q_proj": 97367, |
|
"model.layers.14.self_attn.v_proj": 85949, |
|
"model.layers.15.mlp.fc1": 262687, |
|
"model.layers.15.mlp.fc2": 326298, |
|
"model.layers.15.self_attn.dense": 71091, |
|
"model.layers.15.self_attn.k_proj": 99666, |
|
"model.layers.15.self_attn.q_proj": 114009, |
|
"model.layers.15.self_attn.v_proj": 79472, |
|
"model.layers.16.mlp.fc1": 255098, |
|
"model.layers.16.mlp.fc2": 365511, |
|
"model.layers.16.self_attn.dense": 69991, |
|
"model.layers.16.self_attn.k_proj": 97043, |
|
"model.layers.16.self_attn.q_proj": 101120, |
|
"model.layers.16.self_attn.v_proj": 75770, |
|
"model.layers.17.mlp.fc1": 245724, |
|
"model.layers.17.mlp.fc2": 337114, |
|
"model.layers.17.self_attn.dense": 70591, |
|
"model.layers.17.self_attn.k_proj": 92819, |
|
"model.layers.17.self_attn.q_proj": 91892, |
|
"model.layers.17.self_attn.v_proj": 74253, |
|
"model.layers.18.mlp.fc1": 240816, |
|
"model.layers.18.mlp.fc2": 335322, |
|
"model.layers.18.self_attn.dense": 79137, |
|
"model.layers.18.self_attn.k_proj": 98409, |
|
"model.layers.18.self_attn.q_proj": 123879, |
|
"model.layers.18.self_attn.v_proj": 85293, |
|
"model.layers.19.mlp.fc1": 234256, |
|
"model.layers.19.mlp.fc2": 317669, |
|
"model.layers.19.self_attn.dense": 80277, |
|
"model.layers.19.self_attn.k_proj": 97895, |
|
"model.layers.19.self_attn.q_proj": 121723, |
|
"model.layers.19.self_attn.v_proj": 82971, |
|
"model.layers.2.mlp.fc1": 233518, |
|
"model.layers.2.mlp.fc2": 315355, |
|
"model.layers.2.self_attn.dense": 68242, |
|
"model.layers.2.self_attn.k_proj": 102966, |
|
"model.layers.2.self_attn.q_proj": 98281, |
|
"model.layers.2.self_attn.v_proj": 83855, |
|
"model.layers.20.mlp.fc1": 230157, |
|
"model.layers.20.mlp.fc2": 317412, |
|
"model.layers.20.self_attn.dense": 70557, |
|
"model.layers.20.self_attn.k_proj": 96874, |
|
"model.layers.20.self_attn.q_proj": 117460, |
|
"model.layers.20.self_attn.v_proj": 75849, |
|
"model.layers.21.mlp.fc1": 227363, |
|
"model.layers.21.mlp.fc2": 323600, |
|
"model.layers.21.self_attn.dense": 73035, |
|
"model.layers.21.self_attn.k_proj": 93176, |
|
"model.layers.21.self_attn.q_proj": 124248, |
|
"model.layers.21.self_attn.v_proj": 75505, |
|
"model.layers.22.mlp.fc1": 233020, |
|
"model.layers.22.mlp.fc2": 395456, |
|
"model.layers.22.self_attn.dense": 71502, |
|
"model.layers.22.self_attn.k_proj": 88462, |
|
"model.layers.22.self_attn.q_proj": 162865, |
|
"model.layers.22.self_attn.v_proj": 73909, |
|
"model.layers.23.mlp.fc1": 285355, |
|
"model.layers.23.mlp.fc2": 631745, |
|
"model.layers.23.self_attn.dense": 101963, |
|
"model.layers.23.self_attn.k_proj": 107304, |
|
"model.layers.23.self_attn.q_proj": 260586, |
|
"model.layers.23.self_attn.v_proj": 107005, |
|
"model.layers.3.mlp.fc1": 269841, |
|
"model.layers.3.mlp.fc2": 330081, |
|
"model.layers.3.self_attn.dense": 76738, |
|
"model.layers.3.self_attn.k_proj": 114282, |
|
"model.layers.3.self_attn.q_proj": 110068, |
|
"model.layers.3.self_attn.v_proj": 96670, |
|
"model.layers.4.mlp.fc1": 305604, |
|
"model.layers.4.mlp.fc2": 333478, |
|
"model.layers.4.self_attn.dense": 73404, |
|
"model.layers.4.self_attn.k_proj": 105649, |
|
"model.layers.4.self_attn.q_proj": 102666, |
|
"model.layers.4.self_attn.v_proj": 92391, |
|
"model.layers.5.mlp.fc1": 293406, |
|
"model.layers.5.mlp.fc2": 337582, |
|
"model.layers.5.self_attn.dense": 71678, |
|
"model.layers.5.self_attn.k_proj": 120017, |
|
"model.layers.5.self_attn.q_proj": 121205, |
|
"model.layers.5.self_attn.v_proj": 92099, |
|
"model.layers.6.mlp.fc1": 291972, |
|
"model.layers.6.mlp.fc2": 329924, |
|
"model.layers.6.self_attn.dense": 81259, |
|
"model.layers.6.self_attn.k_proj": 104051, |
|
"model.layers.6.self_attn.q_proj": 100833, |
|
"model.layers.6.self_attn.v_proj": 93397, |
|
"model.layers.7.mlp.fc1": 293548, |
|
"model.layers.7.mlp.fc2": 331966, |
|
"model.layers.7.self_attn.dense": 68519, |
|
"model.layers.7.self_attn.k_proj": 108909, |
|
"model.layers.7.self_attn.q_proj": 103642, |
|
"model.layers.7.self_attn.v_proj": 84278, |
|
"model.layers.8.mlp.fc1": 304480, |
|
"model.layers.8.mlp.fc2": 318568, |
|
"model.layers.8.self_attn.dense": 76294, |
|
"model.layers.8.self_attn.k_proj": 110748, |
|
"model.layers.8.self_attn.q_proj": 103303, |
|
"model.layers.8.self_attn.v_proj": 91497, |
|
"model.layers.9.mlp.fc1": 298086, |
|
"model.layers.9.mlp.fc2": 319091, |
|
"model.layers.9.self_attn.dense": 68561, |
|
"model.layers.9.self_attn.k_proj": 109187, |
|
"model.layers.9.self_attn.q_proj": 103326, |
|
"model.layers.9.self_attn.v_proj": 83167 |
|
} |
|
}, |
|
"architectures": [ |
|
"PhiForCausalLM" |
|
], |
|
"attention_dropout": 0.0, |
|
"bos_token_id": null, |
|
"embd_pdrop": 0.0, |
|
"eos_token_id": null, |
|
"hidden_act": "gelu_new", |
|
"hidden_size": 2048, |
|
"initializer_range": 0.02, |
|
"intermediate_size": 8192, |
|
"layer_norm_eps": 1e-05, |
|
"max_position_embeddings": 2048, |
|
"model_type": "phi", |
|
"num_attention_heads": 32, |
|
"num_hidden_layers": 24, |
|
"num_key_value_heads": 32, |
|
"partial_rotary_factor": 0.5, |
|
"qk_layernorm": false, |
|
"resid_pdrop": 0.0, |
|
"rope_scaling": null, |
|
"rope_theta": 10000.0, |
|
"tie_word_embeddings": false, |
|
"torch_dtype": "float16", |
|
"transformers_version": "4.39.3", |
|
"use_cache": true, |
|
"vocab_size": 51200 |
|
} |
|
|