|
{ |
|
"measurement": { |
|
"model.layers.0": { |
|
"accuracy": 0.9238128662109375, |
|
"total_bits": 1018223424, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.1": { |
|
"accuracy": 0.9267945289611816, |
|
"total_bits": 1007903424, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.2": { |
|
"accuracy": 0.7579765319824219, |
|
"total_bits": 1018223424, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.3": { |
|
"accuracy": 0.9311037063598633, |
|
"total_bits": 1007903424, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.4": { |
|
"accuracy": 0.9208853244781494, |
|
"total_bits": 1007903424, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.5": { |
|
"accuracy": 0.917156457901001, |
|
"total_bits": 1007903424, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.6": { |
|
"accuracy": 0.910703182220459, |
|
"total_bits": 1007903424, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.7": { |
|
"accuracy": 0.9117922782897949, |
|
"total_bits": 1007903424, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.8": { |
|
"accuracy": 0.9101357460021973, |
|
"total_bits": 1007903424, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.9": { |
|
"accuracy": 0.7812089920043945, |
|
"total_bits": 1007903424, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.10": { |
|
"accuracy": 0.9760679006576538, |
|
"total_bits": 1007903424, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.11": { |
|
"accuracy": 0.9733130931854248, |
|
"total_bits": 1007903424, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.12": { |
|
"accuracy": 0.9691377878189087, |
|
"total_bits": 1007903424, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.13": { |
|
"accuracy": 0.9678272008895874, |
|
"total_bits": 1007903424, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.14": { |
|
"accuracy": 0.9669649600982666, |
|
"total_bits": 1007903424, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.15": { |
|
"accuracy": 0.9652617573738098, |
|
"total_bits": 1007903424, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.16": { |
|
"accuracy": 0.9648621082305908, |
|
"total_bits": 1018223424, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.17": { |
|
"accuracy": 0.9644956588745117, |
|
"total_bits": 1018223424, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.18": { |
|
"accuracy": 0.961793065071106, |
|
"total_bits": 1007903424, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.19": { |
|
"accuracy": 0.9628909826278687, |
|
"total_bits": 1018223424, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.20": { |
|
"accuracy": 0.9613349437713623, |
|
"total_bits": 1018223424, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.21": { |
|
"accuracy": 0.9600133895874023, |
|
"total_bits": 1007903424, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.22": { |
|
"accuracy": 0.9601926803588867, |
|
"total_bits": 1018223424, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.23": { |
|
"accuracy": 0.9591658115386963, |
|
"total_bits": 1018223424, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.24": { |
|
"accuracy": 0.9612970352172852, |
|
"total_bits": 1007903424, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.25": { |
|
"accuracy": 0.9634448289871216, |
|
"total_bits": 1007903424, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.26": { |
|
"accuracy": 0.964637279510498, |
|
"total_bits": 1007903424, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.27": { |
|
"accuracy": 0.9656151533126831, |
|
"total_bits": 1007903424, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.28": { |
|
"accuracy": 0.9678228497505188, |
|
"total_bits": 1007903424, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.29": { |
|
"accuracy": 0.9674580693244934, |
|
"total_bits": 1007903424, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.30": { |
|
"accuracy": 0.9668487310409546, |
|
"total_bits": 1007903424, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.31": { |
|
"accuracy": 0.9646785855293274, |
|
"total_bits": 1007903424, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.32": { |
|
"accuracy": 0.9609519243240356, |
|
"total_bits": 1007903424, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.33": { |
|
"accuracy": 0.9568789005279541, |
|
"total_bits": 1007903424, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.34": { |
|
"accuracy": 0.953727126121521, |
|
"total_bits": 1007903424, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.35": { |
|
"accuracy": 0.9515633583068848, |
|
"total_bits": 1007903424, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.36": { |
|
"accuracy": 0.9445004463195801, |
|
"total_bits": 1007903424, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.37": { |
|
"accuracy": 0.9457055330276489, |
|
"total_bits": 1007903424, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.38": { |
|
"accuracy": 0.9431143999099731, |
|
"total_bits": 1007903424, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.39": { |
|
"accuracy": 0.9441231489181519, |
|
"total_bits": 1018223424, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.40": { |
|
"accuracy": 0.9393521547317505, |
|
"total_bits": 1007903424, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.41": { |
|
"accuracy": 0.9391511678695679, |
|
"total_bits": 1007903424, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.42": { |
|
"accuracy": 0.936943769454956, |
|
"total_bits": 1007903424, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.43": { |
|
"accuracy": 0.9338333606719971, |
|
"total_bits": 1007903424, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.44": { |
|
"accuracy": 0.9306356906890869, |
|
"total_bits": 1007903424, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.45": { |
|
"accuracy": 0.925384521484375, |
|
"total_bits": 1007903424, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.46": { |
|
"accuracy": 0.9210290908813477, |
|
"total_bits": 1007903424, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.47": { |
|
"accuracy": 0.9177703857421875, |
|
"total_bits": 1007903424, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.48": { |
|
"accuracy": 0.9188454151153564, |
|
"total_bits": 1007903424, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.49": { |
|
"accuracy": 0.9213569164276123, |
|
"total_bits": 1018223424, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.50": { |
|
"accuracy": 0.9211857318878174, |
|
"total_bits": 1007903424, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.51": { |
|
"accuracy": 0.925978422164917, |
|
"total_bits": 1059511104, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.52": { |
|
"accuracy": 0.9274883270263672, |
|
"total_bits": 1059511104, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.53": { |
|
"accuracy": 0.9286580085754395, |
|
"total_bits": 1018223424, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.54": { |
|
"accuracy": 0.9474391937255859, |
|
"total_bits": 1284004512, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.55": { |
|
"accuracy": 0.9473202228546143, |
|
"total_bits": 1284004512, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.56": { |
|
"accuracy": 0.947467565536499, |
|
"total_bits": 1284004512, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.57": { |
|
"accuracy": 0.9502480030059814, |
|
"total_bits": 1345932192, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.58": { |
|
"accuracy": 0.9509830474853516, |
|
"total_bits": 1397539872, |
|
"q_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.59": { |
|
"accuracy": 0.9609636068344116, |
|
"total_bits": 1611721632, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.60": { |
|
"accuracy": 0.9647481441497803, |
|
"total_bits": 1611721632, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.61": { |
|
"accuracy": 0.9642865657806396, |
|
"total_bits": 1611721632, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.62": { |
|
"accuracy": 0.9850096702575684, |
|
"total_bits": 1007903424, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
}, |
|
"model.layers.63": { |
|
"accuracy": 0.9903278350830078, |
|
"total_bits": 1007903424, |
|
"q_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"k_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"v_proj": { |
|
"group_size": { |
|
"4": 128 |
|
}, |
|
"bits": [ |
|
4 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"o_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"up_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"gate_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
}, |
|
"down_proj": { |
|
"group_size": { |
|
"2": 64 |
|
}, |
|
"bits": [ |
|
2 |
|
], |
|
"bits_prop": [ |
|
1 |
|
], |
|
"scale_bits": 4 |
|
} |
|
} |
|
} |
|
} |