diff --git "a/measurement.json" "b/measurement.json" --- "a/measurement.json" +++ "b/measurement.json" @@ -2,7 +2,7 @@ "measurement": { "model.layers.0.self_attn": [ { - "accuracy": 0.846385340157308, + "accuracy": 0.8600787763532839, "total_bits": 89665536, "q_proj": { "group_size": { @@ -66,7 +66,7 @@ } }, { - "accuracy": 0.8670843541622162, + "accuracy": 0.8805500598330247, "total_bits": 92221440, "q_proj": { "group_size": { @@ -130,7 +130,7 @@ } }, { - "accuracy": 0.8880218887015393, + "accuracy": 0.8930538296699524, "total_bits": 95758848, "q_proj": { "group_size": { @@ -194,7 +194,7 @@ } }, { - "accuracy": 0.9219632529114422, + "accuracy": 0.9270089719640582, "total_bits": 112272384, "q_proj": { "group_size": { @@ -258,7 +258,7 @@ } }, { - "accuracy": 0.9248139297491625, + "accuracy": 0.9326198771595955, "total_bits": 132913152, "q_proj": { "group_size": { @@ -322,7 +322,7 @@ } }, { - "accuracy": 0.9298182245539992, + "accuracy": 0.9321126467303226, "total_bits": 132980224, "q_proj": { "group_size": { @@ -386,7 +386,7 @@ } }, { - "accuracy": 0.9417410312514556, + "accuracy": 0.9508777625466648, "total_bits": 169613312, "q_proj": { "group_size": { @@ -438,7 +438,7 @@ } }, { - "accuracy": 0.9482899961110792, + "accuracy": 0.9509126853786016, "total_bits": 169745920, "q_proj": { "group_size": { @@ -490,7 +490,7 @@ } }, { - "accuracy": 0.9536740066189515, + "accuracy": 0.9556551169800132, "total_bits": 171195392, "q_proj": { "group_size": { @@ -542,7 +542,7 @@ } }, { - "accuracy": 0.9559611572246802, + "accuracy": 0.9575732953258251, "total_bits": 173563904, "q_proj": { "group_size": { @@ -594,7 +594,7 @@ } }, { - "accuracy": 0.9637513543037992, + "accuracy": 0.9629762850113606, "total_bits": 174923264, "q_proj": { "group_size": { @@ -658,7 +658,7 @@ } }, { - "accuracy": 0.9664112066752032, + "accuracy": 0.9680987506320602, "total_bits": 175750144, "q_proj": { "group_size": { @@ -722,7 +722,7 @@ } }, { - "accuracy": 0.967982933611462, + "accuracy": 0.9672622473812417, "total_bits": 179253248, "q_proj": { "group_size": { @@ -783,7 +783,7 @@ } }, { - "accuracy": 0.9711626785758295, + "accuracy": 0.9729988840653708, "total_bits": 181592064, "q_proj": { "group_size": { @@ -844,7 +844,7 @@ } }, { - "accuracy": 0.981876133990131, + "accuracy": 0.9837960013139405, "total_bits": 220469248, "q_proj": { "group_size": { @@ -905,7 +905,7 @@ } }, { - "accuracy": 0.9840561123564839, + "accuracy": 0.9865582109870095, "total_bits": 223535104, "q_proj": { "group_size": { @@ -966,7 +966,7 @@ } }, { - "accuracy": 0.9849492262950853, + "accuracy": 0.9870722737457407, "total_bits": 253499392, "q_proj": { "group_size": { @@ -1018,7 +1018,7 @@ } }, { - "accuracy": 0.9922585192038432, + "accuracy": 0.9919990593028304, "total_bits": 265838592, "q_proj": { "group_size": { @@ -1070,7 +1070,7 @@ } }, { - "accuracy": 0.9959459496465953, + "accuracy": 0.9964774065087304, "total_bits": 337385472, "q_proj": { "group_size": { @@ -1122,11 +1122,11 @@ } } ], - "model.layers.0.block_sparse_moe": [ + "model.layers.0.mlp": [ { - "accuracy": 0.8324242069532997, - "total_bits": 1581846784, - "w1": { + "accuracy": 0.8070184431577984, + "total_bits": 395461696, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -1141,7 +1141,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -1156,7 +1156,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "6": 32, "3": 64, @@ -1176,9 +1176,9 @@ } }, { - "accuracy": 0.8388489963192689, - "total_bits": 1636897024, - "w1": { + "accuracy": 0.8132859295920322, + "total_bits": 409224256, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -1193,7 +1193,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -1208,7 +1208,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "6": 32, "3": 64, @@ -1228,9 +1228,9 @@ } }, { - "accuracy": 0.8565590291431076, - "total_bits": 1829089280, - "w1": { + "accuracy": 0.8438375007949377, + "total_bits": 457272320, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -1245,7 +1245,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -1260,7 +1260,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "5": 32, "3": 32 @@ -1277,9 +1277,9 @@ } }, { - "accuracy": 0.8607355258182475, - "total_bits": 2051911680, - "w1": { + "accuracy": 0.8535225414916089, + "total_bits": 512977920, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -1294,7 +1294,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -1309,7 +1309,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "5": 32, "4": 32 @@ -1326,9 +1326,9 @@ } }, { - "accuracy": 0.9228841970233541, - "total_bits": 2313589120, - "w1": { + "accuracy": 0.9045984341125739, + "total_bits": 578397280, + "gate_proj": { "group_size": { "4": 128, "3": 128 @@ -1343,7 +1343,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 128, "3": 128 @@ -1358,7 +1358,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128, @@ -1378,9 +1378,9 @@ } }, { - "accuracy": 0.9289927174778361, - "total_bits": 2371489792, - "w1": { + "accuracy": 0.9125326875793307, + "total_bits": 592872448, + "gate_proj": { "group_size": { "4": 32, "3": 32 @@ -1395,7 +1395,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32, "3": 32 @@ -1410,7 +1410,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 32, @@ -1430,9 +1430,9 @@ } }, { - "accuracy": 0.9353208004644042, - "total_bits": 2549817728, - "w1": { + "accuracy": 0.9256841510926422, + "total_bits": 637454432, + "gate_proj": { "group_size": { "4": 32, "3": 32 @@ -1447,7 +1447,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32, "3": 32 @@ -1462,7 +1462,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128 @@ -1479,9 +1479,9 @@ } }, { - "accuracy": 0.9612029226202714, - "total_bits": 2914965888, - "w1": { + "accuracy": 0.9499706767107311, + "total_bits": 728741472, + "gate_proj": { "group_size": { "4": 128 }, @@ -1493,7 +1493,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32 }, @@ -1505,7 +1505,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128 @@ -1522,9 +1522,9 @@ } }, { - "accuracy": 0.964529751064746, - "total_bits": 2957905920, - "w1": { + "accuracy": 0.9548157188845308, + "total_bits": 739476480, + "gate_proj": { "group_size": { "4": 32 }, @@ -1536,7 +1536,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32 }, @@ -1548,7 +1548,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 32 @@ -1565,9 +1565,9 @@ } }, { - "accuracy": 0.9617070203745052, - "total_bits": 3006173568, - "w1": { + "accuracy": 0.951466162934115, + "total_bits": 751543392, + "gate_proj": { "group_size": { "5": 128, "4": 128 @@ -1582,7 +1582,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "5": 128, "4": 128 @@ -1597,7 +1597,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "5": 128, @@ -1617,9 +1617,9 @@ } }, { - "accuracy": 0.9665895215186634, - "total_bits": 3064074240, - "w1": { + "accuracy": 0.9576070646902448, + "total_bits": 766018560, + "gate_proj": { "group_size": { "5": 32, "4": 32 @@ -1634,7 +1634,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "5": 32, "4": 32 @@ -1649,7 +1649,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "5": 32, @@ -1669,9 +1669,9 @@ } }, { - "accuracy": 0.9808745485191283, - "total_bits": 3698758016, - "w1": { + "accuracy": 0.9751689596787879, + "total_bits": 924689504, + "gate_proj": { "group_size": { "6": 128, "5": 128 @@ -1686,7 +1686,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 128, "5": 128 @@ -1701,7 +1701,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 128, @@ -1721,9 +1721,9 @@ } }, { - "accuracy": 0.9836898759791726, - "total_bits": 3756658688, - "w1": { + "accuracy": 0.9787953452354199, + "total_bits": 939164672, + "gate_proj": { "group_size": { "6": 32, "5": 32 @@ -1738,7 +1738,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 32, "5": 32 @@ -1753,7 +1753,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 32, @@ -1773,9 +1773,9 @@ } }, { - "accuracy": 0.9897603520535325, - "total_bits": 4278096256, - "w1": { + "accuracy": 0.9863979414987721, + "total_bits": 1069524064, + "gate_proj": { "group_size": { "6": 128 }, @@ -1787,7 +1787,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 128 }, @@ -1799,7 +1799,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 128 @@ -1816,9 +1816,9 @@ } }, { - "accuracy": 0.9903831579232294, - "total_bits": 4441539584, - "w1": { + "accuracy": 0.9873247195623422, + "total_bits": 1110384896, + "gate_proj": { "group_size": { "8": 128, "6": 128 @@ -1833,7 +1833,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128, "6": 128 @@ -1848,7 +1848,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128, "6": 128 @@ -1865,9 +1865,9 @@ } }, { - "accuracy": 0.991698085747071, - "total_bits": 4839998464, - "w1": { + "accuracy": 0.9902768012283272, + "total_bits": 1209999616, + "gate_proj": { "group_size": { "8": 128, "6": 128 @@ -1882,7 +1882,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128, "6": 128 @@ -1897,7 +1897,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128 }, @@ -1911,9 +1911,9 @@ } }, { - "accuracy": 0.997267104807849, - "total_bits": 5662082048, - "w1": { + "accuracy": 0.9962502605968008, + "total_bits": 1415520512, + "gate_proj": { "group_size": { "8": 128 }, @@ -1925,7 +1925,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128 }, @@ -1937,7 +1937,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128 }, @@ -1953,7 +1953,7 @@ ], "model.layers.1.self_attn": [ { - "accuracy": 0.8521327913591736, + "accuracy": 0.8812538334413579, "total_bits": 89665536, "q_proj": { "group_size": { @@ -2017,7 +2017,7 @@ } }, { - "accuracy": 0.8637357546310676, + "accuracy": 0.8883446760867771, "total_bits": 92221440, "q_proj": { "group_size": { @@ -2081,7 +2081,7 @@ } }, { - "accuracy": 0.8751458702118773, + "accuracy": 0.9010343202634862, "total_bits": 95758848, "q_proj": { "group_size": { @@ -2145,7 +2145,7 @@ } }, { - "accuracy": 0.9195249268883153, + "accuracy": 0.9310616699880675, "total_bits": 112272384, "q_proj": { "group_size": { @@ -2209,7 +2209,7 @@ } }, { - "accuracy": 0.9248873556130811, + "accuracy": 0.9378329791912907, "total_bits": 132913152, "q_proj": { "group_size": { @@ -2273,7 +2273,7 @@ } }, { - "accuracy": 0.9270870924780243, + "accuracy": 0.9397324896172473, "total_bits": 132980224, "q_proj": { "group_size": { @@ -2337,7 +2337,7 @@ } }, { - "accuracy": 0.9479517913178394, + "accuracy": 0.9532725897274519, "total_bits": 169613312, "q_proj": { "group_size": { @@ -2389,7 +2389,7 @@ } }, { - "accuracy": 0.9511575577290434, + "accuracy": 0.9551734610607749, "total_bits": 169745920, "q_proj": { "group_size": { @@ -2441,7 +2441,7 @@ } }, { - "accuracy": 0.9555047873995806, + "accuracy": 0.9587045101154792, "total_bits": 171195392, "q_proj": { "group_size": { @@ -2493,7 +2493,7 @@ } }, { - "accuracy": 0.9583136213845328, + "accuracy": 0.9606185079013047, "total_bits": 173563904, "q_proj": { "group_size": { @@ -2545,7 +2545,7 @@ } }, { - "accuracy": 0.9622990587039998, + "accuracy": 0.9697113254744756, "total_bits": 174923264, "q_proj": { "group_size": { @@ -2609,7 +2609,7 @@ } }, { - "accuracy": 0.9647127313441352, + "accuracy": 0.9718783742895252, "total_bits": 175750144, "q_proj": { "group_size": { @@ -2673,7 +2673,7 @@ } }, { - "accuracy": 0.9663331522361228, + "accuracy": 0.9725239418054882, "total_bits": 179253248, "q_proj": { "group_size": { @@ -2734,7 +2734,7 @@ } }, { - "accuracy": 0.9692848717891857, + "accuracy": 0.9750776415396678, "total_bits": 181592064, "q_proj": { "group_size": { @@ -2795,7 +2795,7 @@ } }, { - "accuracy": 0.9812996867848071, + "accuracy": 0.9841298388905431, "total_bits": 220469248, "q_proj": { "group_size": { @@ -2856,7 +2856,7 @@ } }, { - "accuracy": 0.9841903593195113, + "accuracy": 0.9871465894148538, "total_bits": 223535104, "q_proj": { "group_size": { @@ -2917,7 +2917,7 @@ } }, { - "accuracy": 0.9860342596412489, + "accuracy": 0.987190838265968, "total_bits": 253499392, "q_proj": { "group_size": { @@ -2969,7 +2969,7 @@ } }, { - "accuracy": 0.9913815130773735, + "accuracy": 0.9933492715568527, "total_bits": 265838592, "q_proj": { "group_size": { @@ -3021,7 +3021,7 @@ } }, { - "accuracy": 0.9963134127963138, + "accuracy": 0.996719001829134, "total_bits": 337385472, "q_proj": { "group_size": { @@ -3073,11 +3073,11 @@ } } ], - "model.layers.1.block_sparse_moe": [ + "model.layers.1.mlp": [ { - "accuracy": 0.9324584905254214, - "total_bits": 1581846784, - "w1": { + "accuracy": 0.9253737340240102, + "total_bits": 395461696, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -3092,7 +3092,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -3107,7 +3107,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "6": 32, "3": 64, @@ -3127,9 +3127,9 @@ } }, { - "accuracy": 0.9478668086230755, - "total_bits": 1636897024, - "w1": { + "accuracy": 0.9479098694497033, + "total_bits": 409224256, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -3144,7 +3144,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -3159,7 +3159,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "6": 32, "3": 64, @@ -3179,9 +3179,9 @@ } }, { - "accuracy": 0.9493800366395398, - "total_bits": 1829089280, - "w1": { + "accuracy": 0.9477320009548413, + "total_bits": 457272320, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -3196,7 +3196,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -3211,7 +3211,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "5": 32, "3": 32 @@ -3228,9 +3228,9 @@ } }, { - "accuracy": 0.9498187403537726, - "total_bits": 2051911680, - "w1": { + "accuracy": 0.9482870496025211, + "total_bits": 512977920, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -3245,7 +3245,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -3260,7 +3260,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "5": 32, "4": 32 @@ -3277,9 +3277,9 @@ } }, { - "accuracy": 0.9808891446966874, - "total_bits": 2313589120, - "w1": { + "accuracy": 0.9843674453808681, + "total_bits": 578397280, + "gate_proj": { "group_size": { "4": 128, "3": 128 @@ -3294,7 +3294,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 128, "3": 128 @@ -3309,7 +3309,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128, @@ -3329,9 +3329,9 @@ } }, { - "accuracy": 0.9875295693544965, - "total_bits": 2371489792, - "w1": { + "accuracy": 0.987620656868737, + "total_bits": 592872448, + "gate_proj": { "group_size": { "4": 32, "3": 32 @@ -3346,7 +3346,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32, "3": 32 @@ -3361,7 +3361,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 32, @@ -3381,9 +3381,9 @@ } }, { - "accuracy": 0.988905018097476, - "total_bits": 2549817728, - "w1": { + "accuracy": 0.9894055090109376, + "total_bits": 637454432, + "gate_proj": { "group_size": { "4": 32, "3": 32 @@ -3398,7 +3398,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32, "3": 32 @@ -3413,7 +3413,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128 @@ -3430,9 +3430,9 @@ } }, { - "accuracy": 0.9920513479676294, - "total_bits": 2914965888, - "w1": { + "accuracy": 0.9927752633383008, + "total_bits": 728741472, + "gate_proj": { "group_size": { "4": 128 }, @@ -3444,7 +3444,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32 }, @@ -3456,7 +3456,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128 @@ -3473,9 +3473,9 @@ } }, { - "accuracy": 0.9938272803433632, - "total_bits": 2957905920, - "w1": { + "accuracy": 0.9933086709290939, + "total_bits": 739476480, + "gate_proj": { "group_size": { "4": 32 }, @@ -3487,7 +3487,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32 }, @@ -3499,7 +3499,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 32 @@ -3516,9 +3516,9 @@ } }, { - "accuracy": 0.993752601360412, - "total_bits": 3006173568, - "w1": { + "accuracy": 0.993387115690367, + "total_bits": 751543392, + "gate_proj": { "group_size": { "5": 128, "4": 128 @@ -3533,7 +3533,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "5": 128, "4": 128 @@ -3548,7 +3548,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "5": 128, @@ -3568,9 +3568,9 @@ } }, { - "accuracy": 0.9948625652928298, - "total_bits": 3064074240, - "w1": { + "accuracy": 0.994028670590763, + "total_bits": 766018560, + "gate_proj": { "group_size": { "5": 32, "4": 32 @@ -3585,7 +3585,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "5": 32, "4": 32 @@ -3600,7 +3600,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "5": 32, @@ -3620,9 +3620,9 @@ } }, { - "accuracy": 0.9969651552627942, - "total_bits": 3698758016, - "w1": { + "accuracy": 0.9965054595928737, + "total_bits": 924689504, + "gate_proj": { "group_size": { "6": 128, "5": 128 @@ -3637,7 +3637,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 128, "5": 128 @@ -3652,7 +3652,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 128, @@ -3672,9 +3672,9 @@ } }, { - "accuracy": 0.996982053485944, - "total_bits": 3756658688, - "w1": { + "accuracy": 0.9968209691762336, + "total_bits": 939164672, + "gate_proj": { "group_size": { "6": 32, "5": 32 @@ -3689,7 +3689,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 32, "5": 32 @@ -3704,7 +3704,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 32, @@ -3724,9 +3724,9 @@ } }, { - "accuracy": 0.9981960501815927, - "total_bits": 4278096256, - "w1": { + "accuracy": 0.997641545947102, + "total_bits": 1069524064, + "gate_proj": { "group_size": { "6": 128 }, @@ -3738,7 +3738,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 128 }, @@ -3750,7 +3750,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 128 @@ -3767,9 +3767,9 @@ } }, { - "accuracy": 0.9982470140645379, - "total_bits": 4441539584, - "w1": { + "accuracy": 0.9980542714352181, + "total_bits": 1110384896, + "gate_proj": { "group_size": { "8": 128, "6": 128 @@ -3784,7 +3784,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128, "6": 128 @@ -3799,7 +3799,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128, "6": 128 @@ -3816,9 +3816,9 @@ } }, { - "accuracy": 0.9984907397087418, - "total_bits": 4839998464, - "w1": { + "accuracy": 0.9984126343685937, + "total_bits": 1209999616, + "gate_proj": { "group_size": { "8": 128, "6": 128 @@ -3833,7 +3833,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128, "6": 128 @@ -3848,7 +3848,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128 }, @@ -3862,9 +3862,9 @@ } }, { - "accuracy": 0.999117652193251, - "total_bits": 5662082048, - "w1": { + "accuracy": 0.9991089748708826, + "total_bits": 1415520512, + "gate_proj": { "group_size": { "8": 128 }, @@ -3876,7 +3876,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128 }, @@ -3888,7 +3888,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128 }, @@ -3904,7 +3904,7 @@ ], "model.layers.2.self_attn": [ { - "accuracy": 0.9795441449080643, + "accuracy": 0.9837872849679307, "total_bits": 89665536, "q_proj": { "group_size": { @@ -3968,7 +3968,7 @@ } }, { - "accuracy": 0.9813775877027135, + "accuracy": 0.9847270852984175, "total_bits": 92221440, "q_proj": { "group_size": { @@ -4032,7 +4032,7 @@ } }, { - "accuracy": 0.983051600228799, + "accuracy": 0.9860854080731147, "total_bits": 95758848, "q_proj": { "group_size": { @@ -4096,7 +4096,7 @@ } }, { - "accuracy": 0.986047137185539, + "accuracy": 0.9888253506567133, "total_bits": 112272384, "q_proj": { "group_size": { @@ -4160,7 +4160,7 @@ } }, { - "accuracy": 0.9889153237209508, + "accuracy": 0.9906600356886262, "total_bits": 132913152, "q_proj": { "group_size": { @@ -4224,7 +4224,7 @@ } }, { - "accuracy": 0.9888666628026649, + "accuracy": 0.9905554064745573, "total_bits": 132980224, "q_proj": { "group_size": { @@ -4288,7 +4288,7 @@ } }, { - "accuracy": 0.9924452196267483, + "accuracy": 0.9935185528409324, "total_bits": 169613312, "q_proj": { "group_size": { @@ -4340,7 +4340,7 @@ } }, { - "accuracy": 0.9928540734476164, + "accuracy": 0.9940227098566922, "total_bits": 169745920, "q_proj": { "group_size": { @@ -4392,7 +4392,7 @@ } }, { - "accuracy": 0.9930738379051419, + "accuracy": 0.9944108020587775, "total_bits": 171195392, "q_proj": { "group_size": { @@ -4444,7 +4444,7 @@ } }, { - "accuracy": 0.993209737107942, + "accuracy": 0.9945564572576826, "total_bits": 173563904, "q_proj": { "group_size": { @@ -4496,7 +4496,7 @@ } }, { - "accuracy": 0.9935248297835259, + "accuracy": 0.9945127430224889, "total_bits": 174923264, "q_proj": { "group_size": { @@ -4560,7 +4560,7 @@ } }, { - "accuracy": 0.9936493357251349, + "accuracy": 0.9947974341063711, "total_bits": 175750144, "q_proj": { "group_size": { @@ -4624,7 +4624,7 @@ } }, { - "accuracy": 0.9949891750063551, + "accuracy": 0.9959855163837538, "total_bits": 179253248, "q_proj": { "group_size": { @@ -4685,7 +4685,7 @@ } }, { - "accuracy": 0.9954101890570631, + "accuracy": 0.9964198896624638, "total_bits": 181592064, "q_proj": { "group_size": { @@ -4746,7 +4746,7 @@ } }, { - "accuracy": 0.9971753409328429, + "accuracy": 0.9978521741271068, "total_bits": 220469248, "q_proj": { "group_size": { @@ -4807,7 +4807,7 @@ } }, { - "accuracy": 0.9976039941079522, + "accuracy": 0.9981708209502118, "total_bits": 223535104, "q_proj": { "group_size": { @@ -4868,7 +4868,7 @@ } }, { - "accuracy": 0.9976401237378779, + "accuracy": 0.998259322178599, "total_bits": 253499392, "q_proj": { "group_size": { @@ -4920,7 +4920,7 @@ } }, { - "accuracy": 0.9989218185043061, + "accuracy": 0.9991632168944067, "total_bits": 265838592, "q_proj": { "group_size": { @@ -4972,7 +4972,7 @@ } }, { - "accuracy": 0.9993825332188097, + "accuracy": 0.9995428297245924, "total_bits": 337385472, "q_proj": { "group_size": { @@ -5024,11 +5024,11 @@ } } ], - "model.layers.2.block_sparse_moe": [ + "model.layers.2.mlp": [ { - "accuracy": 0.9670179018652753, - "total_bits": 1581846784, - "w1": { + "accuracy": 0.9672844327594105, + "total_bits": 395461696, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -5043,7 +5043,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -5058,7 +5058,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "6": 32, "3": 64, @@ -5078,9 +5078,9 @@ } }, { - "accuracy": 0.968241252691338, - "total_bits": 1636897024, - "w1": { + "accuracy": 0.9682485490645233, + "total_bits": 409224256, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -5095,7 +5095,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -5110,7 +5110,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "6": 32, "3": 64, @@ -5130,9 +5130,9 @@ } }, { - "accuracy": 0.971632618261011, - "total_bits": 1829089280, - "w1": { + "accuracy": 0.9727090873608464, + "total_bits": 457272320, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -5147,7 +5147,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -5162,7 +5162,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "5": 32, "3": 32 @@ -5179,9 +5179,9 @@ } }, { - "accuracy": 0.9724238111000312, - "total_bits": 2051911680, - "w1": { + "accuracy": 0.9740772233099529, + "total_bits": 512977920, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -5196,7 +5196,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -5211,7 +5211,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "5": 32, "4": 32 @@ -5228,9 +5228,9 @@ } }, { - "accuracy": 0.9843998258247187, - "total_bits": 2313589120, - "w1": { + "accuracy": 0.9833401534951439, + "total_bits": 578397280, + "gate_proj": { "group_size": { "4": 128, "3": 128 @@ -5245,7 +5245,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 128, "3": 128 @@ -5260,7 +5260,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128, @@ -5280,9 +5280,9 @@ } }, { - "accuracy": 0.9856258957695804, - "total_bits": 2371489792, - "w1": { + "accuracy": 0.9847067977595878, + "total_bits": 592872448, + "gate_proj": { "group_size": { "4": 32, "3": 32 @@ -5297,7 +5297,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32, "3": 32 @@ -5312,7 +5312,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 32, @@ -5332,9 +5332,9 @@ } }, { - "accuracy": 0.986865914475761, - "total_bits": 2549817728, - "w1": { + "accuracy": 0.986646582544046, + "total_bits": 637454432, + "gate_proj": { "group_size": { "4": 32, "3": 32 @@ -5349,7 +5349,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32, "3": 32 @@ -5364,7 +5364,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128 @@ -5381,9 +5381,9 @@ } }, { - "accuracy": 0.9921592393350837, - "total_bits": 2914965888, - "w1": { + "accuracy": 0.991328130390397, + "total_bits": 728741472, + "gate_proj": { "group_size": { "4": 128 }, @@ -5395,7 +5395,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32 }, @@ -5407,7 +5407,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128 @@ -5424,9 +5424,9 @@ } }, { - "accuracy": 0.9928100429181206, - "total_bits": 2957905920, - "w1": { + "accuracy": 0.9920847613094864, + "total_bits": 739476480, + "gate_proj": { "group_size": { "4": 32 }, @@ -5438,7 +5438,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32 }, @@ -5450,7 +5450,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 32 @@ -5467,9 +5467,9 @@ } }, { - "accuracy": 0.9922667818241998, - "total_bits": 3006173568, - "w1": { + "accuracy": 0.9914708084696414, + "total_bits": 751543392, + "gate_proj": { "group_size": { "5": 128, "4": 128 @@ -5484,7 +5484,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "5": 128, "4": 128 @@ -5499,7 +5499,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "5": 128, @@ -5519,9 +5519,9 @@ } }, { - "accuracy": 0.993231785214065, - "total_bits": 3064074240, - "w1": { + "accuracy": 0.9925549471701839, + "total_bits": 766018560, + "gate_proj": { "group_size": { "5": 32, "4": 32 @@ -5536,7 +5536,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "5": 32, "4": 32 @@ -5551,7 +5551,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "5": 32, @@ -5571,9 +5571,9 @@ } }, { - "accuracy": 0.996173335631427, - "total_bits": 3698758016, - "w1": { + "accuracy": 0.9956418546223033, + "total_bits": 924689504, + "gate_proj": { "group_size": { "6": 128, "5": 128 @@ -5588,7 +5588,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 128, "5": 128 @@ -5603,7 +5603,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 128, @@ -5623,9 +5623,9 @@ } }, { - "accuracy": 0.9967265648546776, - "total_bits": 3756658688, - "w1": { + "accuracy": 0.9962836312938874, + "total_bits": 939164672, + "gate_proj": { "group_size": { "6": 32, "5": 32 @@ -5640,7 +5640,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 32, "5": 32 @@ -5655,7 +5655,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 32, @@ -5675,9 +5675,9 @@ } }, { - "accuracy": 0.9979728608220619, - "total_bits": 4278096256, - "w1": { + "accuracy": 0.9976301224026094, + "total_bits": 1069524064, + "gate_proj": { "group_size": { "6": 128 }, @@ -5689,7 +5689,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 128 }, @@ -5701,7 +5701,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 128 @@ -5718,9 +5718,9 @@ } }, { - "accuracy": 0.99808885968952, - "total_bits": 4441539584, - "w1": { + "accuracy": 0.9977723961622503, + "total_bits": 1110384896, + "gate_proj": { "group_size": { "8": 128, "6": 128 @@ -5735,7 +5735,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128, "6": 128 @@ -5750,7 +5750,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128, "6": 128 @@ -5767,9 +5767,9 @@ } }, { - "accuracy": 0.9983284447807819, - "total_bits": 4839998464, - "w1": { + "accuracy": 0.998158193116787, + "total_bits": 1209999616, + "gate_proj": { "group_size": { "8": 128, "6": 128 @@ -5784,7 +5784,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128, "6": 128 @@ -5799,7 +5799,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128 }, @@ -5813,9 +5813,9 @@ } }, { - "accuracy": 0.9994732041209078, - "total_bits": 5662082048, - "w1": { + "accuracy": 0.9993683456590301, + "total_bits": 1415520512, + "gate_proj": { "group_size": { "8": 128 }, @@ -5827,7 +5827,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128 }, @@ -5839,7 +5839,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128 }, @@ -5855,7 +5855,7 @@ ], "model.layers.3.self_attn": [ { - "accuracy": 0.9551259400813203, + "accuracy": 0.9707197740575985, "total_bits": 89665536, "q_proj": { "group_size": { @@ -5919,7 +5919,7 @@ } }, { - "accuracy": 0.9575085420357554, + "accuracy": 0.9723640436208562, "total_bits": 92221440, "q_proj": { "group_size": { @@ -5983,7 +5983,7 @@ } }, { - "accuracy": 0.9661188743224269, + "accuracy": 0.9755948865786195, "total_bits": 95758848, "q_proj": { "group_size": { @@ -6047,7 +6047,7 @@ } }, { - "accuracy": 0.9699701259010717, + "accuracy": 0.978933476499821, "total_bits": 112272384, "q_proj": { "group_size": { @@ -6111,7 +6111,7 @@ } }, { - "accuracy": 0.9784623553094111, + "accuracy": 0.9847938547057933, "total_bits": 132913152, "q_proj": { "group_size": { @@ -6175,7 +6175,7 @@ } }, { - "accuracy": 0.9795324101455902, + "accuracy": 0.9851365875158655, "total_bits": 132980224, "q_proj": { "group_size": { @@ -6239,7 +6239,7 @@ } }, { - "accuracy": 0.9880765460333542, + "accuracy": 0.9913054740816158, "total_bits": 169613312, "q_proj": { "group_size": { @@ -6291,7 +6291,7 @@ } }, { - "accuracy": 0.9887280390156727, + "accuracy": 0.9915867910450814, "total_bits": 169745920, "q_proj": { "group_size": { @@ -6343,7 +6343,7 @@ } }, { - "accuracy": 0.9895580301649476, + "accuracy": 0.992432350599158, "total_bits": 171195392, "q_proj": { "group_size": { @@ -6395,7 +6395,7 @@ } }, { - "accuracy": 0.9897787445960077, + "accuracy": 0.9926906896657065, "total_bits": 173563904, "q_proj": { "group_size": { @@ -6447,7 +6447,7 @@ } }, { - "accuracy": 0.9895667863127432, + "accuracy": 0.992190949349223, "total_bits": 174923264, "q_proj": { "group_size": { @@ -6511,7 +6511,7 @@ } }, { - "accuracy": 0.9900656247903642, + "accuracy": 0.9929004733965389, "total_bits": 175750144, "q_proj": { "group_size": { @@ -6575,7 +6575,7 @@ } }, { - "accuracy": 0.9928545017824754, + "accuracy": 0.994474850423438, "total_bits": 179253248, "q_proj": { "group_size": { @@ -6636,7 +6636,7 @@ } }, { - "accuracy": 0.9932616465774021, + "accuracy": 0.9950638974306026, "total_bits": 181592064, "q_proj": { "group_size": { @@ -6697,7 +6697,7 @@ } }, { - "accuracy": 0.9960276977967863, + "accuracy": 0.9970709266827295, "total_bits": 220469248, "q_proj": { "group_size": { @@ -6758,7 +6758,7 @@ } }, { - "accuracy": 0.9966938559603142, + "accuracy": 0.9974997325273427, "total_bits": 223535104, "q_proj": { "group_size": { @@ -6819,7 +6819,7 @@ } }, { - "accuracy": 0.9967310187210771, + "accuracy": 0.9978008458856493, "total_bits": 253499392, "q_proj": { "group_size": { @@ -6871,7 +6871,7 @@ } }, { - "accuracy": 0.998578734573369, + "accuracy": 0.9988595945653383, "total_bits": 265838592, "q_proj": { "group_size": { @@ -6923,7 +6923,7 @@ } }, { - "accuracy": 0.9991499127538287, + "accuracy": 0.9994396824377442, "total_bits": 337385472, "q_proj": { "group_size": { @@ -6975,11 +6975,11 @@ } } ], - "model.layers.3.block_sparse_moe": [ + "model.layers.3.mlp": [ { - "accuracy": 0.9548824113842688, - "total_bits": 1581846784, - "w1": { + "accuracy": 0.9567624988328469, + "total_bits": 395461696, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -6994,7 +6994,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -7009,7 +7009,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "6": 32, "3": 64, @@ -7029,9 +7029,9 @@ } }, { - "accuracy": 0.9566254253058057, - "total_bits": 1636897024, - "w1": { + "accuracy": 0.9580053432207358, + "total_bits": 409224256, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -7046,7 +7046,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -7061,7 +7061,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "6": 32, "3": 64, @@ -7081,9 +7081,9 @@ } }, { - "accuracy": 0.9615888638715995, - "total_bits": 1829089280, - "w1": { + "accuracy": 0.9641852398942176, + "total_bits": 457272320, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -7098,7 +7098,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -7113,7 +7113,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "5": 32, "3": 32 @@ -7130,9 +7130,9 @@ } }, { - "accuracy": 0.9626979312222255, - "total_bits": 2051911680, - "w1": { + "accuracy": 0.9660298594420678, + "total_bits": 512977920, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -7147,7 +7147,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -7162,7 +7162,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "5": 32, "4": 32 @@ -7179,9 +7179,9 @@ } }, { - "accuracy": 0.9785523645971951, - "total_bits": 2313589120, - "w1": { + "accuracy": 0.9780524450011159, + "total_bits": 578397280, + "gate_proj": { "group_size": { "4": 128, "3": 128 @@ -7196,7 +7196,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 128, "3": 128 @@ -7211,7 +7211,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128, @@ -7231,9 +7231,9 @@ } }, { - "accuracy": 0.9803317003932438, - "total_bits": 2371489792, - "w1": { + "accuracy": 0.979764724199317, + "total_bits": 592872448, + "gate_proj": { "group_size": { "4": 32, "3": 32 @@ -7248,7 +7248,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32, "3": 32 @@ -7263,7 +7263,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 32, @@ -7283,9 +7283,9 @@ } }, { - "accuracy": 0.9820941828289315, - "total_bits": 2549817728, - "w1": { + "accuracy": 0.9823974333948603, + "total_bits": 637454432, + "gate_proj": { "group_size": { "4": 32, "3": 32 @@ -7300,7 +7300,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32, "3": 32 @@ -7315,7 +7315,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128 @@ -7332,9 +7332,9 @@ } }, { - "accuracy": 0.9892527189988055, - "total_bits": 2914965888, - "w1": { + "accuracy": 0.9885873382567967, + "total_bits": 728741472, + "gate_proj": { "group_size": { "4": 128 }, @@ -7346,7 +7346,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32 }, @@ -7358,7 +7358,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128 @@ -7375,9 +7375,9 @@ } }, { - "accuracy": 0.9901440804824233, - "total_bits": 2957905920, - "w1": { + "accuracy": 0.9895589285481133, + "total_bits": 739476480, + "gate_proj": { "group_size": { "4": 32 }, @@ -7389,7 +7389,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32 }, @@ -7401,7 +7401,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 32 @@ -7418,9 +7418,9 @@ } }, { - "accuracy": 0.9893892295658588, - "total_bits": 3006173568, - "w1": { + "accuracy": 0.9888214297886742, + "total_bits": 751543392, + "gate_proj": { "group_size": { "5": 128, "4": 128 @@ -7435,7 +7435,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "5": 128, "4": 128 @@ -7450,7 +7450,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "5": 128, @@ -7470,9 +7470,9 @@ } }, { - "accuracy": 0.9907122704592582, - "total_bits": 3064074240, - "w1": { + "accuracy": 0.9901749738001902, + "total_bits": 766018560, + "gate_proj": { "group_size": { "5": 32, "4": 32 @@ -7487,7 +7487,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "5": 32, "4": 32 @@ -7502,7 +7502,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "5": 32, @@ -7522,9 +7522,9 @@ } }, { - "accuracy": 0.9947478979776957, - "total_bits": 3698758016, - "w1": { + "accuracy": 0.9943097051429121, + "total_bits": 924689504, + "gate_proj": { "group_size": { "6": 128, "5": 128 @@ -7539,7 +7539,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 128, "5": 128 @@ -7554,7 +7554,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 128, @@ -7574,9 +7574,9 @@ } }, { - "accuracy": 0.9954981810633877, - "total_bits": 3756658688, - "w1": { + "accuracy": 0.9951048930984383, + "total_bits": 939164672, + "gate_proj": { "group_size": { "6": 32, "5": 32 @@ -7591,7 +7591,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 32, "5": 32 @@ -7606,7 +7606,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 32, @@ -7626,9 +7626,9 @@ } }, { - "accuracy": 0.9972210151486491, - "total_bits": 4278096256, - "w1": { + "accuracy": 0.9969083806990009, + "total_bits": 1069524064, + "gate_proj": { "group_size": { "6": 128 }, @@ -7640,7 +7640,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 128 }, @@ -7652,7 +7652,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 128 @@ -7669,9 +7669,9 @@ } }, { - "accuracy": 0.9973904589852808, - "total_bits": 4441539584, - "w1": { + "accuracy": 0.9971034851317343, + "total_bits": 1110384896, + "gate_proj": { "group_size": { "8": 128, "6": 128 @@ -7686,7 +7686,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128, "6": 128 @@ -7701,7 +7701,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128, "6": 128 @@ -7718,9 +7718,9 @@ } }, { - "accuracy": 0.9977305036797923, - "total_bits": 4839998464, - "w1": { + "accuracy": 0.9976346400860501, + "total_bits": 1209999616, + "gate_proj": { "group_size": { "8": 128, "6": 128 @@ -7735,7 +7735,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128, "6": 128 @@ -7750,7 +7750,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128 }, @@ -7764,9 +7764,9 @@ } }, { - "accuracy": 0.999288983342826, - "total_bits": 5662082048, - "w1": { + "accuracy": 0.9991817340196576, + "total_bits": 1415520512, + "gate_proj": { "group_size": { "8": 128 }, @@ -7778,7 +7778,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128 }, @@ -7790,7 +7790,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128 }, @@ -7806,7 +7806,7 @@ ], "model.layers.4.self_attn": [ { - "accuracy": 0.9501384955487753, + "accuracy": 0.9675148161814401, "total_bits": 89665536, "q_proj": { "group_size": { @@ -7870,7 +7870,7 @@ } }, { - "accuracy": 0.9532344313828569, + "accuracy": 0.9683779655514579, "total_bits": 92221440, "q_proj": { "group_size": { @@ -7934,7 +7934,7 @@ } }, { - "accuracy": 0.9606703547270674, + "accuracy": 0.9712841526458138, "total_bits": 95758848, "q_proj": { "group_size": { @@ -7998,7 +7998,7 @@ } }, { - "accuracy": 0.9668476238454643, + "accuracy": 0.9770930287869353, "total_bits": 112272384, "q_proj": { "group_size": { @@ -8062,7 +8062,7 @@ } }, { - "accuracy": 0.9747759504733902, + "accuracy": 0.9822262644669727, "total_bits": 132913152, "q_proj": { "group_size": { @@ -8126,7 +8126,7 @@ } }, { - "accuracy": 0.9754207195027879, + "accuracy": 0.9825665302525618, "total_bits": 132980224, "q_proj": { "group_size": { @@ -8190,7 +8190,7 @@ } }, { - "accuracy": 0.9852170429535603, + "accuracy": 0.989792095423725, "total_bits": 169613312, "q_proj": { "group_size": { @@ -8242,7 +8242,7 @@ } }, { - "accuracy": 0.9855685299262404, + "accuracy": 0.9901830459405717, "total_bits": 169745920, "q_proj": { "group_size": { @@ -8294,7 +8294,7 @@ } }, { - "accuracy": 0.9862806395088372, + "accuracy": 0.9906895863578508, "total_bits": 171195392, "q_proj": { "group_size": { @@ -8346,7 +8346,7 @@ } }, { - "accuracy": 0.9868911676304905, + "accuracy": 0.9911309187300503, "total_bits": 173563904, "q_proj": { "group_size": { @@ -8398,7 +8398,7 @@ } }, { - "accuracy": 0.9874015084810948, + "accuracy": 0.9906271434830207, "total_bits": 174923264, "q_proj": { "group_size": { @@ -8462,7 +8462,7 @@ } }, { - "accuracy": 0.9882506720329586, + "accuracy": 0.9915738738092937, "total_bits": 175750144, "q_proj": { "group_size": { @@ -8526,7 +8526,7 @@ } }, { - "accuracy": 0.9907431018195654, + "accuracy": 0.9931416222458019, "total_bits": 179253248, "q_proj": { "group_size": { @@ -8587,7 +8587,7 @@ } }, { - "accuracy": 0.9914250604465211, + "accuracy": 0.9937281439350428, "total_bits": 181592064, "q_proj": { "group_size": { @@ -8648,7 +8648,7 @@ } }, { - "accuracy": 0.994904503912518, + "accuracy": 0.9962962127204886, "total_bits": 220469248, "q_proj": { "group_size": { @@ -8709,7 +8709,7 @@ } }, { - "accuracy": 0.9956280516803657, + "accuracy": 0.9968787471175586, "total_bits": 223535104, "q_proj": { "group_size": { @@ -8770,7 +8770,7 @@ } }, { - "accuracy": 0.9959938074829743, + "accuracy": 0.9972852849107432, "total_bits": 253499392, "q_proj": { "group_size": { @@ -8822,7 +8822,7 @@ } }, { - "accuracy": 0.9979615601968315, + "accuracy": 0.9984632519374014, "total_bits": 265838592, "q_proj": { "group_size": { @@ -8874,7 +8874,7 @@ } }, { - "accuracy": 0.9989524221868793, + "accuracy": 0.9992826943144839, "total_bits": 337385472, "q_proj": { "group_size": { @@ -8926,11 +8926,11 @@ } } ], - "model.layers.4.block_sparse_moe": [ + "model.layers.4.mlp": [ { - "accuracy": 0.9430409505179054, - "total_bits": 1581846784, - "w1": { + "accuracy": 0.944639681495334, + "total_bits": 395461696, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -8945,7 +8945,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -8960,7 +8960,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "6": 32, "3": 64, @@ -8980,9 +8980,9 @@ } }, { - "accuracy": 0.9452259920929608, - "total_bits": 1636897024, - "w1": { + "accuracy": 0.9463916591515666, + "total_bits": 409224256, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -8997,7 +8997,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -9012,7 +9012,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "6": 32, "3": 64, @@ -9032,9 +9032,9 @@ } }, { - "accuracy": 0.952361611747428, - "total_bits": 1829089280, - "w1": { + "accuracy": 0.9551517799692719, + "total_bits": 457272320, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -9049,7 +9049,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -9064,7 +9064,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "5": 32, "3": 32 @@ -9081,9 +9081,9 @@ } }, { - "accuracy": 0.953863462335185, - "total_bits": 2051911680, - "w1": { + "accuracy": 0.9576579031387442, + "total_bits": 512977920, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -9098,7 +9098,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -9113,7 +9113,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "5": 32, "4": 32 @@ -9130,9 +9130,9 @@ } }, { - "accuracy": 0.9732333503075337, - "total_bits": 2313589120, - "w1": { + "accuracy": 0.972181461898512, + "total_bits": 578397280, + "gate_proj": { "group_size": { "4": 128, "3": 128 @@ -9147,7 +9147,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 128, "3": 128 @@ -9162,7 +9162,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128, @@ -9182,9 +9182,9 @@ } }, { - "accuracy": 0.9754221997174778, - "total_bits": 2371489792, - "w1": { + "accuracy": 0.9744004312118417, + "total_bits": 592872448, + "gate_proj": { "group_size": { "4": 32, "3": 32 @@ -9199,7 +9199,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32, "3": 32 @@ -9214,7 +9214,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 32, @@ -9234,9 +9234,9 @@ } }, { - "accuracy": 0.977813727369434, - "total_bits": 2549817728, - "w1": { + "accuracy": 0.9779725838942748, + "total_bits": 637454432, + "gate_proj": { "group_size": { "4": 32, "3": 32 @@ -9251,7 +9251,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32, "3": 32 @@ -9266,7 +9266,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128 @@ -9283,9 +9283,9 @@ } }, { - "accuracy": 0.9865550903701469, - "total_bits": 2914965888, - "w1": { + "accuracy": 0.9855439783760199, + "total_bits": 728741472, + "gate_proj": { "group_size": { "4": 128 }, @@ -9297,7 +9297,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32 }, @@ -9309,7 +9309,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128 @@ -9326,9 +9326,9 @@ } }, { - "accuracy": 0.9875913096199694, - "total_bits": 2957905920, - "w1": { + "accuracy": 0.9867568128791294, + "total_bits": 739476480, + "gate_proj": { "group_size": { "4": 32 }, @@ -9340,7 +9340,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32 }, @@ -9352,7 +9352,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 32 @@ -9369,9 +9369,9 @@ } }, { - "accuracy": 0.9867197757488803, - "total_bits": 3006173568, - "w1": { + "accuracy": 0.985830779873619, + "total_bits": 751543392, + "gate_proj": { "group_size": { "5": 128, "4": 128 @@ -9386,7 +9386,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "5": 128, "4": 128 @@ -9401,7 +9401,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "5": 128, @@ -9421,9 +9421,9 @@ } }, { - "accuracy": 0.9884021188083448, - "total_bits": 3064074240, - "w1": { + "accuracy": 0.9875741610548606, + "total_bits": 766018560, + "gate_proj": { "group_size": { "5": 32, "4": 32 @@ -9438,7 +9438,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "5": 32, "4": 32 @@ -9453,7 +9453,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "5": 32, @@ -9473,9 +9473,9 @@ } }, { - "accuracy": 0.9934098214695328, - "total_bits": 3698758016, - "w1": { + "accuracy": 0.99278425231674, + "total_bits": 924689504, + "gate_proj": { "group_size": { "6": 128, "5": 128 @@ -9490,7 +9490,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 128, "5": 128 @@ -9505,7 +9505,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 128, @@ -9525,9 +9525,9 @@ } }, { - "accuracy": 0.9943592815757974, - "total_bits": 3756658688, - "w1": { + "accuracy": 0.9938036358768219, + "total_bits": 939164672, + "gate_proj": { "group_size": { "6": 32, "5": 32 @@ -9542,7 +9542,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 32, "5": 32 @@ -9557,7 +9557,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 32, @@ -9577,9 +9577,9 @@ } }, { - "accuracy": 0.99648185185843, - "total_bits": 4278096256, - "w1": { + "accuracy": 0.9960670125376629, + "total_bits": 1069524064, + "gate_proj": { "group_size": { "6": 128 }, @@ -9591,7 +9591,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 128 }, @@ -9603,7 +9603,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 128 @@ -9620,9 +9620,9 @@ } }, { - "accuracy": 0.9967099708186364, - "total_bits": 4441539584, - "w1": { + "accuracy": 0.9963162359352665, + "total_bits": 1110384896, + "gate_proj": { "group_size": { "8": 128, "6": 128 @@ -9637,7 +9637,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128, "6": 128 @@ -9652,7 +9652,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128, "6": 128 @@ -9669,9 +9669,9 @@ } }, { - "accuracy": 0.997184813145156, - "total_bits": 4839998464, - "w1": { + "accuracy": 0.9970679238720453, + "total_bits": 1209999616, + "gate_proj": { "group_size": { "8": 128, "6": 128 @@ -9686,7 +9686,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128, "6": 128 @@ -9701,7 +9701,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128 }, @@ -9715,9 +9715,9 @@ } }, { - "accuracy": 0.9990889086191984, - "total_bits": 5662082048, - "w1": { + "accuracy": 0.9989544474872099, + "total_bits": 1415520512, + "gate_proj": { "group_size": { "8": 128 }, @@ -9729,7 +9729,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128 }, @@ -9741,7 +9741,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128 }, @@ -9757,7 +9757,7 @@ ], "model.layers.5.self_attn": [ { - "accuracy": 0.9475822389910096, + "accuracy": 0.9611736427208311, "total_bits": 89665536, "q_proj": { "group_size": { @@ -9821,7 +9821,7 @@ } }, { - "accuracy": 0.9508471961476301, + "accuracy": 0.9633335407431188, "total_bits": 92221440, "q_proj": { "group_size": { @@ -9885,7 +9885,7 @@ } }, { - "accuracy": 0.9593624929456335, + "accuracy": 0.9670639327776275, "total_bits": 95758848, "q_proj": { "group_size": { @@ -9949,7 +9949,7 @@ } }, { - "accuracy": 0.9663599807965129, + "accuracy": 0.9729697142090452, "total_bits": 112272384, "q_proj": { "group_size": { @@ -10013,7 +10013,7 @@ } }, { - "accuracy": 0.974280083453969, + "accuracy": 0.9799755058300338, "total_bits": 132913152, "q_proj": { "group_size": { @@ -10077,7 +10077,7 @@ } }, { - "accuracy": 0.9746846952720692, + "accuracy": 0.9807142771682457, "total_bits": 132980224, "q_proj": { "group_size": { @@ -10141,7 +10141,7 @@ } }, { - "accuracy": 0.9852304836539062, + "accuracy": 0.9892870190945503, "total_bits": 169613312, "q_proj": { "group_size": { @@ -10193,7 +10193,7 @@ } }, { - "accuracy": 0.9857582308744129, + "accuracy": 0.9895276091502685, "total_bits": 169745920, "q_proj": { "group_size": { @@ -10245,7 +10245,7 @@ } }, { - "accuracy": 0.9867243649634091, + "accuracy": 0.9903184270829355, "total_bits": 171195392, "q_proj": { "group_size": { @@ -10297,7 +10297,7 @@ } }, { - "accuracy": 0.9871439644576687, + "accuracy": 0.9907997123602974, "total_bits": 173563904, "q_proj": { "group_size": { @@ -10349,7 +10349,7 @@ } }, { - "accuracy": 0.9874821053327698, + "accuracy": 0.9905507350340486, "total_bits": 174923264, "q_proj": { "group_size": { @@ -10413,7 +10413,7 @@ } }, { - "accuracy": 0.9879955599868768, + "accuracy": 0.9911971691456672, "total_bits": 175750144, "q_proj": { "group_size": { @@ -10477,7 +10477,7 @@ } }, { - "accuracy": 0.9903234019630441, + "accuracy": 0.9925187124846209, "total_bits": 179253248, "q_proj": { "group_size": { @@ -10538,7 +10538,7 @@ } }, { - "accuracy": 0.991094174206649, + "accuracy": 0.9931644755293076, "total_bits": 181592064, "q_proj": { "group_size": { @@ -10599,7 +10599,7 @@ } }, { - "accuracy": 0.9948129127733409, + "accuracy": 0.9959969547528186, "total_bits": 220469248, "q_proj": { "group_size": { @@ -10660,7 +10660,7 @@ } }, { - "accuracy": 0.9955492213883094, + "accuracy": 0.9965934961678853, "total_bits": 223535104, "q_proj": { "group_size": { @@ -10721,7 +10721,7 @@ } }, { - "accuracy": 0.9961505988612771, + "accuracy": 0.9972743046543512, "total_bits": 253499392, "q_proj": { "group_size": { @@ -10773,7 +10773,7 @@ } }, { - "accuracy": 0.9977290626970658, + "accuracy": 0.9982007348689398, "total_bits": 265838592, "q_proj": { "group_size": { @@ -10825,7 +10825,7 @@ } }, { - "accuracy": 0.9989589033924092, + "accuracy": 0.999259817544187, "total_bits": 337385472, "q_proj": { "group_size": { @@ -10877,11 +10877,11 @@ } } ], - "model.layers.5.block_sparse_moe": [ + "model.layers.5.mlp": [ { - "accuracy": 0.9330970599855247, - "total_bits": 1581846784, - "w1": { + "accuracy": 0.9347764431057792, + "total_bits": 395461696, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -10896,7 +10896,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -10911,7 +10911,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "6": 32, "3": 64, @@ -10931,9 +10931,9 @@ } }, { - "accuracy": 0.9357169303846986, - "total_bits": 1636897024, - "w1": { + "accuracy": 0.936870607596479, + "total_bits": 409224256, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -10948,7 +10948,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -10963,7 +10963,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "6": 32, "3": 64, @@ -10983,9 +10983,9 @@ } }, { - "accuracy": 0.9442015262810808, - "total_bits": 1829089280, - "w1": { + "accuracy": 0.9470993180416132, + "total_bits": 457272320, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -11000,7 +11000,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -11015,7 +11015,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "5": 32, "3": 32 @@ -11032,9 +11032,9 @@ } }, { - "accuracy": 0.945981360187656, - "total_bits": 2051911680, - "w1": { + "accuracy": 0.9500016227952743, + "total_bits": 512977920, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -11049,7 +11049,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -11064,7 +11064,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "5": 32, "4": 32 @@ -11081,9 +11081,9 @@ } }, { - "accuracy": 0.9685394747280761, - "total_bits": 2313589120, - "w1": { + "accuracy": 0.9672537298971101, + "total_bits": 578397280, + "gate_proj": { "group_size": { "4": 128, "3": 128 @@ -11098,7 +11098,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 128, "3": 128 @@ -11113,7 +11113,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128, @@ -11133,9 +11133,9 @@ } }, { - "accuracy": 0.9711701207255062, - "total_bits": 2371489792, - "w1": { + "accuracy": 0.9698613169357965, + "total_bits": 592872448, + "gate_proj": { "group_size": { "4": 32, "3": 32 @@ -11150,7 +11150,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32, "3": 32 @@ -11165,7 +11165,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 32, @@ -11185,9 +11185,9 @@ } }, { - "accuracy": 0.9740049669421033, - "total_bits": 2549817728, - "w1": { + "accuracy": 0.9740173788251061, + "total_bits": 637454432, + "gate_proj": { "group_size": { "4": 32, "3": 32 @@ -11202,7 +11202,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32, "3": 32 @@ -11217,7 +11217,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128 @@ -11234,9 +11234,9 @@ } }, { - "accuracy": 0.9842126996893632, - "total_bits": 2914965888, - "w1": { + "accuracy": 0.9829962034954837, + "total_bits": 728741472, + "gate_proj": { "group_size": { "4": 128 }, @@ -11248,7 +11248,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32 }, @@ -11260,7 +11260,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128 @@ -11277,9 +11277,9 @@ } }, { - "accuracy": 0.9854353756496781, - "total_bits": 2957905920, - "w1": { + "accuracy": 0.9843941922544649, + "total_bits": 739476480, + "gate_proj": { "group_size": { "4": 32 }, @@ -11291,7 +11291,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32 }, @@ -11303,7 +11303,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 32 @@ -11320,9 +11320,9 @@ } }, { - "accuracy": 0.9844052731794747, - "total_bits": 3006173568, - "w1": { + "accuracy": 0.9833100400374908, + "total_bits": 751543392, + "gate_proj": { "group_size": { "5": 128, "4": 128 @@ -11337,7 +11337,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "5": 128, "4": 128 @@ -11352,7 +11352,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "5": 128, @@ -11372,9 +11372,9 @@ } }, { - "accuracy": 0.9863939782800643, - "total_bits": 3064074240, - "w1": { + "accuracy": 0.9853635710222941, + "total_bits": 766018560, + "gate_proj": { "group_size": { "5": 32, "4": 32 @@ -11389,7 +11389,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "5": 32, "4": 32 @@ -11404,7 +11404,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "5": 32, @@ -11424,9 +11424,9 @@ } }, { - "accuracy": 0.9922800195452414, - "total_bits": 3698758016, - "w1": { + "accuracy": 0.9914999013080409, + "total_bits": 924689504, + "gate_proj": { "group_size": { "6": 128, "5": 128 @@ -11441,7 +11441,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 128, "5": 128 @@ -11456,7 +11456,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 128, @@ -11476,9 +11476,9 @@ } }, { - "accuracy": 0.9933836153022161, - "total_bits": 3756658688, - "w1": { + "accuracy": 0.992704051522244, + "total_bits": 939164672, + "gate_proj": { "group_size": { "6": 32, "5": 32 @@ -11493,7 +11493,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 32, "5": 32 @@ -11508,7 +11508,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 32, @@ -11528,9 +11528,9 @@ } }, { - "accuracy": 0.9958625629446224, - "total_bits": 4278096256, - "w1": { + "accuracy": 0.9953574258834124, + "total_bits": 1069524064, + "gate_proj": { "group_size": { "6": 128 }, @@ -11542,7 +11542,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 128 }, @@ -11554,7 +11554,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 128 @@ -11571,9 +11571,9 @@ } }, { - "accuracy": 0.9961509568812815, - "total_bits": 4441539584, - "w1": { + "accuracy": 0.9956657087283307, + "total_bits": 1110384896, + "gate_proj": { "group_size": { "8": 128, "6": 128 @@ -11588,7 +11588,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128, "6": 128 @@ -11603,7 +11603,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128, "6": 128 @@ -11620,9 +11620,9 @@ } }, { - "accuracy": 0.9967106071080228, - "total_bits": 4839998464, - "w1": { + "accuracy": 0.9965291302344811, + "total_bits": 1209999616, + "gate_proj": { "group_size": { "8": 128, "6": 128 @@ -11637,7 +11637,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128, "6": 128 @@ -11652,7 +11652,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128 }, @@ -11666,9 +11666,9 @@ } }, { - "accuracy": 0.9989239935070815, - "total_bits": 5662082048, - "w1": { + "accuracy": 0.9987612839230631, + "total_bits": 1415520512, + "gate_proj": { "group_size": { "8": 128 }, @@ -11680,7 +11680,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128 }, @@ -11692,7 +11692,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128 }, @@ -11708,7 +11708,7 @@ ], "model.layers.6.self_attn": [ { - "accuracy": 0.9403435831007204, + "accuracy": 0.9563827413673464, "total_bits": 89665536, "q_proj": { "group_size": { @@ -11772,7 +11772,7 @@ } }, { - "accuracy": 0.943042871591292, + "accuracy": 0.9581675030487149, "total_bits": 92221440, "q_proj": { "group_size": { @@ -11836,7 +11836,7 @@ } }, { - "accuracy": 0.9522734310870108, + "accuracy": 0.9633465372025967, "total_bits": 95758848, "q_proj": { "group_size": { @@ -11900,7 +11900,7 @@ } }, { - "accuracy": 0.9632584452629089, + "accuracy": 0.9730664523024308, "total_bits": 112272384, "q_proj": { "group_size": { @@ -11964,7 +11964,7 @@ } }, { - "accuracy": 0.9709139095717355, + "accuracy": 0.9780485199666337, "total_bits": 132913152, "q_proj": { "group_size": { @@ -12028,7 +12028,7 @@ } }, { - "accuracy": 0.9716128685364598, + "accuracy": 0.9782849491426819, "total_bits": 132980224, "q_proj": { "group_size": { @@ -12092,7 +12092,7 @@ } }, { - "accuracy": 0.983950871435043, + "accuracy": 0.9881794764905384, "total_bits": 169613312, "q_proj": { "group_size": { @@ -12144,7 +12144,7 @@ } }, { - "accuracy": 0.9846181383258418, + "accuracy": 0.9882673912583605, "total_bits": 169745920, "q_proj": { "group_size": { @@ -12196,7 +12196,7 @@ } }, { - "accuracy": 0.9857672229899388, + "accuracy": 0.9890698604285717, "total_bits": 171195392, "q_proj": { "group_size": { @@ -12248,7 +12248,7 @@ } }, { - "accuracy": 0.9863739558740666, + "accuracy": 0.9896169859229734, "total_bits": 173563904, "q_proj": { "group_size": { @@ -12300,7 +12300,7 @@ } }, { - "accuracy": 0.9857180962633145, + "accuracy": 0.9891044872224724, "total_bits": 174923264, "q_proj": { "group_size": { @@ -12364,7 +12364,7 @@ } }, { - "accuracy": 0.9865654392756129, + "accuracy": 0.9898155320092643, "total_bits": 175750144, "q_proj": { "group_size": { @@ -12428,7 +12428,7 @@ } }, { - "accuracy": 0.988739458010777, + "accuracy": 0.9911232846298892, "total_bits": 179253248, "q_proj": { "group_size": { @@ -12489,7 +12489,7 @@ } }, { - "accuracy": 0.9896529544586021, + "accuracy": 0.9918209256016111, "total_bits": 181592064, "q_proj": { "group_size": { @@ -12550,7 +12550,7 @@ } }, { - "accuracy": 0.9939449530119371, + "accuracy": 0.9951916322185609, "total_bits": 220469248, "q_proj": { "group_size": { @@ -12611,7 +12611,7 @@ } }, { - "accuracy": 0.9948137817463201, + "accuracy": 0.9958824546345951, "total_bits": 223535104, "q_proj": { "group_size": { @@ -12672,7 +12672,7 @@ } }, { - "accuracy": 0.9959104807287651, + "accuracy": 0.996909341879042, "total_bits": 253499392, "q_proj": { "group_size": { @@ -12724,7 +12724,7 @@ } }, { - "accuracy": 0.9973848418724772, + "accuracy": 0.9978934777226593, "total_bits": 265838592, "q_proj": { "group_size": { @@ -12776,7 +12776,7 @@ } }, { - "accuracy": 0.9989016128697825, + "accuracy": 0.9991893594468773, "total_bits": 337385472, "q_proj": { "group_size": { @@ -12828,11 +12828,11 @@ } } ], - "model.layers.6.block_sparse_moe": [ + "model.layers.6.mlp": [ { - "accuracy": 0.926799679861257, - "total_bits": 1581846784, - "w1": { + "accuracy": 0.9286569243198947, + "total_bits": 395461696, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -12847,7 +12847,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -12862,7 +12862,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "6": 32, "3": 64, @@ -12882,9 +12882,9 @@ } }, { - "accuracy": 0.9295770284768782, - "total_bits": 1636897024, - "w1": { + "accuracy": 0.9308436009052553, + "total_bits": 409224256, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -12899,7 +12899,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -12914,7 +12914,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "6": 32, "3": 64, @@ -12934,9 +12934,9 @@ } }, { - "accuracy": 0.9390443865405886, - "total_bits": 1829089280, - "w1": { + "accuracy": 0.9422135768752349, + "total_bits": 457272320, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -12951,7 +12951,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -12966,7 +12966,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "5": 32, "3": 32 @@ -12983,9 +12983,9 @@ } }, { - "accuracy": 0.9410078419666541, - "total_bits": 2051911680, - "w1": { + "accuracy": 0.945391336750043, + "total_bits": 512977920, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -13000,7 +13000,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -13015,7 +13015,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "5": 32, "4": 32 @@ -13032,9 +13032,9 @@ } }, { - "accuracy": 0.9655850874750238, - "total_bits": 2313589120, - "w1": { + "accuracy": 0.9641639163815662, + "total_bits": 578397280, + "gate_proj": { "group_size": { "4": 128, "3": 128 @@ -13049,7 +13049,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 128, "3": 128 @@ -13064,7 +13064,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128, @@ -13084,9 +13084,9 @@ } }, { - "accuracy": 0.9684472279132981, - "total_bits": 2371489792, - "w1": { + "accuracy": 0.9670195780988586, + "total_bits": 592872448, + "gate_proj": { "group_size": { "4": 32, "3": 32 @@ -13101,7 +13101,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32, "3": 32 @@ -13116,7 +13116,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 32, @@ -13136,9 +13136,9 @@ } }, { - "accuracy": 0.971598464211351, - "total_bits": 2549817728, - "w1": { + "accuracy": 0.9715838383785204, + "total_bits": 637454432, + "gate_proj": { "group_size": { "4": 32, "3": 32 @@ -13153,7 +13153,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32, "3": 32 @@ -13168,7 +13168,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128 @@ -13185,9 +13185,9 @@ } }, { - "accuracy": 0.9828122296909753, - "total_bits": 2914965888, - "w1": { + "accuracy": 0.9814185249178033, + "total_bits": 728741472, + "gate_proj": { "group_size": { "4": 128 }, @@ -13199,7 +13199,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32 }, @@ -13211,7 +13211,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128 @@ -13228,9 +13228,9 @@ } }, { - "accuracy": 0.9841036467175734, - "total_bits": 2957905920, - "w1": { + "accuracy": 0.9829287872110543, + "total_bits": 739476480, + "gate_proj": { "group_size": { "4": 32 }, @@ -13242,7 +13242,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32 }, @@ -13254,7 +13254,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 32 @@ -13271,9 +13271,9 @@ } }, { - "accuracy": 0.9829470692202449, - "total_bits": 3006173568, - "w1": { + "accuracy": 0.9817119689266148, + "total_bits": 751543392, + "gate_proj": { "group_size": { "5": 128, "4": 128 @@ -13288,7 +13288,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "5": 128, "4": 128 @@ -13303,7 +13303,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "5": 128, @@ -13323,9 +13323,9 @@ } }, { - "accuracy": 0.9851054365403558, - "total_bits": 3064074240, - "w1": { + "accuracy": 0.9839528401529318, + "total_bits": 766018560, + "gate_proj": { "group_size": { "5": 32, "4": 32 @@ -13340,7 +13340,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "5": 32, "4": 32 @@ -13355,7 +13355,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "5": 32, @@ -13375,9 +13375,9 @@ } }, { - "accuracy": 0.9915641695261002, - "total_bits": 3698758016, - "w1": { + "accuracy": 0.9906677133893889, + "total_bits": 924689504, + "gate_proj": { "group_size": { "6": 128, "5": 128 @@ -13392,7 +13392,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 128, "5": 128 @@ -13407,7 +13407,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 128, @@ -13427,9 +13427,9 @@ } }, { - "accuracy": 0.9927779776779445, - "total_bits": 3756658688, - "w1": { + "accuracy": 0.9919949616673157, + "total_bits": 939164672, + "gate_proj": { "group_size": { "6": 32, "5": 32 @@ -13444,7 +13444,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 32, "5": 32 @@ -13459,7 +13459,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 32, @@ -13479,9 +13479,9 @@ } }, { - "accuracy": 0.9955281341619986, - "total_bits": 4278096256, - "w1": { + "accuracy": 0.9949198536158196, + "total_bits": 1069524064, + "gate_proj": { "group_size": { "6": 128 }, @@ -13493,7 +13493,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 128 }, @@ -13505,7 +13505,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 128 @@ -13522,9 +13522,9 @@ } }, { - "accuracy": 0.9957901725766102, - "total_bits": 4441539584, - "w1": { + "accuracy": 0.9952370149794182, + "total_bits": 1110384896, + "gate_proj": { "group_size": { "8": 128, "6": 128 @@ -13539,7 +13539,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128, "6": 128 @@ -13554,7 +13554,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128, "6": 128 @@ -13571,9 +13571,9 @@ } }, { - "accuracy": 0.9964275332290287, - "total_bits": 4839998464, - "w1": { + "accuracy": 0.9962068307826197, + "total_bits": 1209999616, + "gate_proj": { "group_size": { "8": 128, "6": 128 @@ -13588,7 +13588,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128, "6": 128 @@ -13603,7 +13603,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128 }, @@ -13617,9 +13617,9 @@ } }, { - "accuracy": 0.9988426360896004, - "total_bits": 5662082048, - "w1": { + "accuracy": 0.9986566038630706, + "total_bits": 1415520512, + "gate_proj": { "group_size": { "8": 128 }, @@ -13631,7 +13631,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128 }, @@ -13643,7 +13643,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128 }, @@ -13659,7 +13659,7 @@ ], "model.layers.7.self_attn": [ { - "accuracy": 0.9365530625769967, + "accuracy": 0.9499918338107436, "total_bits": 89665536, "q_proj": { "group_size": { @@ -13723,7 +13723,7 @@ } }, { - "accuracy": 0.9381882597349191, + "accuracy": 0.9513833263986989, "total_bits": 92221440, "q_proj": { "group_size": { @@ -13787,7 +13787,7 @@ } }, { - "accuracy": 0.9474649635192595, + "accuracy": 0.9575070885843352, "total_bits": 95758848, "q_proj": { "group_size": { @@ -13851,7 +13851,7 @@ } }, { - "accuracy": 0.9600888077953929, + "accuracy": 0.9689599325586307, "total_bits": 112272384, "q_proj": { "group_size": { @@ -13915,7 +13915,7 @@ } }, { - "accuracy": 0.9682890978690825, + "accuracy": 0.9750161753281167, "total_bits": 132913152, "q_proj": { "group_size": { @@ -13979,7 +13979,7 @@ } }, { - "accuracy": 0.9684009453967998, + "accuracy": 0.9752402876160646, "total_bits": 132980224, "q_proj": { "group_size": { @@ -14043,7 +14043,7 @@ } }, { - "accuracy": 0.9823253581692514, + "accuracy": 0.9863599498854264, "total_bits": 169613312, "q_proj": { "group_size": { @@ -14095,7 +14095,7 @@ } }, { - "accuracy": 0.9825815390305299, + "accuracy": 0.986572731972525, "total_bits": 169745920, "q_proj": { "group_size": { @@ -14147,7 +14147,7 @@ } }, { - "accuracy": 0.9839327674555151, + "accuracy": 0.98754257431842, "total_bits": 171195392, "q_proj": { "group_size": { @@ -14199,7 +14199,7 @@ } }, { - "accuracy": 0.9846781739278844, + "accuracy": 0.988264819729681, "total_bits": 173563904, "q_proj": { "group_size": { @@ -14251,7 +14251,7 @@ } }, { - "accuracy": 0.984113915872417, + "accuracy": 0.9875574346974885, "total_bits": 174923264, "q_proj": { "group_size": { @@ -14315,7 +14315,7 @@ } }, { - "accuracy": 0.9853785711487657, + "accuracy": 0.9882976354933098, "total_bits": 175750144, "q_proj": { "group_size": { @@ -14379,7 +14379,7 @@ } }, { - "accuracy": 0.9871541570970103, + "accuracy": 0.9895581300370395, "total_bits": 179253248, "q_proj": { "group_size": { @@ -14440,7 +14440,7 @@ } }, { - "accuracy": 0.9880625561712996, + "accuracy": 0.99036589130073, "total_bits": 181592064, "q_proj": { "group_size": { @@ -14501,7 +14501,7 @@ } }, { - "accuracy": 0.9929636530578136, + "accuracy": 0.9943453255477116, "total_bits": 220469248, "q_proj": { "group_size": { @@ -14562,7 +14562,7 @@ } }, { - "accuracy": 0.9940473428608751, + "accuracy": 0.9951718850306382, "total_bits": 223535104, "q_proj": { "group_size": { @@ -14623,7 +14623,7 @@ } }, { - "accuracy": 0.9953926458562675, + "accuracy": 0.9964656568205866, "total_bits": 253499392, "q_proj": { "group_size": { @@ -14675,7 +14675,7 @@ } }, { - "accuracy": 0.9969289508452149, + "accuracy": 0.9974921134508852, "total_bits": 265838592, "q_proj": { "group_size": { @@ -14727,7 +14727,7 @@ } }, { - "accuracy": 0.9987685527885333, + "accuracy": 0.9990539407835489, "total_bits": 337385472, "q_proj": { "group_size": { @@ -14779,11 +14779,11 @@ } } ], - "model.layers.7.block_sparse_moe": [ + "model.layers.7.mlp": [ { - "accuracy": 0.9228573219948694, - "total_bits": 1581846784, - "w1": { + "accuracy": 0.9259405167479264, + "total_bits": 395461696, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -14798,7 +14798,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -14813,7 +14813,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "6": 32, "3": 64, @@ -14833,9 +14833,9 @@ } }, { - "accuracy": 0.9256855231758795, - "total_bits": 1636897024, - "w1": { + "accuracy": 0.9282099312465442, + "total_bits": 409224256, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -14850,7 +14850,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -14865,7 +14865,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "6": 32, "3": 64, @@ -14885,9 +14885,9 @@ } }, { - "accuracy": 0.9358362961364419, - "total_bits": 1829089280, - "w1": { + "accuracy": 0.9401247408241034, + "total_bits": 457272320, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -14902,7 +14902,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -14917,7 +14917,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "5": 32, "3": 32 @@ -14934,9 +14934,9 @@ } }, { - "accuracy": 0.9379441061694371, - "total_bits": 2051911680, - "w1": { + "accuracy": 0.9434805277146792, + "total_bits": 512977920, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -14951,7 +14951,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -14966,7 +14966,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "5": 32, "4": 32 @@ -14983,9 +14983,9 @@ } }, { - "accuracy": 0.9635149213044267, - "total_bits": 2313589120, - "w1": { + "accuracy": 0.9626394660261116, + "total_bits": 578397280, + "gate_proj": { "group_size": { "4": 128, "3": 128 @@ -15000,7 +15000,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 128, "3": 128 @@ -15015,7 +15015,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128, @@ -15035,9 +15035,9 @@ } }, { - "accuracy": 0.966613655615794, - "total_bits": 2371489792, - "w1": { + "accuracy": 0.9656735034169335, + "total_bits": 592872448, + "gate_proj": { "group_size": { "4": 32, "3": 32 @@ -15052,7 +15052,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32, "3": 32 @@ -15067,7 +15067,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 32, @@ -15087,9 +15087,9 @@ } }, { - "accuracy": 0.9700325820595026, - "total_bits": 2549817728, - "w1": { + "accuracy": 0.970489716696504, + "total_bits": 637454432, + "gate_proj": { "group_size": { "4": 32, "3": 32 @@ -15104,7 +15104,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32, "3": 32 @@ -15119,7 +15119,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128 @@ -15136,9 +15136,9 @@ } }, { - "accuracy": 0.9816514553110066, - "total_bits": 2914965888, - "w1": { + "accuracy": 0.9805654691238153, + "total_bits": 728741472, + "gate_proj": { "group_size": { "4": 128 }, @@ -15150,7 +15150,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32 }, @@ -15162,7 +15162,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128 @@ -15179,9 +15179,9 @@ } }, { - "accuracy": 0.9831282199409447, - "total_bits": 2957905920, - "w1": { + "accuracy": 0.9821809555747008, + "total_bits": 739476480, + "gate_proj": { "group_size": { "4": 32 }, @@ -15193,7 +15193,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32 }, @@ -15205,7 +15205,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 32 @@ -15222,9 +15222,9 @@ } }, { - "accuracy": 0.9818561818744791, - "total_bits": 3006173568, - "w1": { + "accuracy": 0.9809141439434729, + "total_bits": 751543392, + "gate_proj": { "group_size": { "5": 128, "4": 128 @@ -15239,7 +15239,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "5": 128, "4": 128 @@ -15254,7 +15254,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "5": 128, @@ -15274,9 +15274,9 @@ } }, { - "accuracy": 0.9841741921968366, - "total_bits": 3064074240, - "w1": { + "accuracy": 0.9832683878607655, + "total_bits": 766018560, + "gate_proj": { "group_size": { "5": 32, "4": 32 @@ -15291,7 +15291,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "5": 32, "4": 32 @@ -15306,7 +15306,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "5": 32, @@ -15326,9 +15326,9 @@ } }, { - "accuracy": 0.9909754595964363, - "total_bits": 3698758016, - "w1": { + "accuracy": 0.9902537560923711, + "total_bits": 924689504, + "gate_proj": { "group_size": { "6": 128, "5": 128 @@ -15343,7 +15343,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 128, "5": 128 @@ -15358,7 +15358,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 128, @@ -15378,9 +15378,9 @@ } }, { - "accuracy": 0.9923101421690693, - "total_bits": 3756658688, - "w1": { + "accuracy": 0.9916401883146089, + "total_bits": 939164672, + "gate_proj": { "group_size": { "6": 32, "5": 32 @@ -15395,7 +15395,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 32, "5": 32 @@ -15410,7 +15410,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 32, @@ -15430,9 +15430,9 @@ } }, { - "accuracy": 0.9952036216732507, - "total_bits": 4278096256, - "w1": { + "accuracy": 0.9946859619874311, + "total_bits": 1069524064, + "gate_proj": { "group_size": { "6": 128 }, @@ -15444,7 +15444,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 128 }, @@ -15456,7 +15456,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 128 @@ -15473,9 +15473,9 @@ } }, { - "accuracy": 0.9954999021965226, - "total_bits": 4441539584, - "w1": { + "accuracy": 0.9950284306817737, + "total_bits": 1110384896, + "gate_proj": { "group_size": { "8": 128, "6": 128 @@ -15490,7 +15490,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128, "6": 128 @@ -15505,7 +15505,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128, "6": 128 @@ -15522,9 +15522,9 @@ } }, { - "accuracy": 0.9962162414541174, - "total_bits": 4839998464, - "w1": { + "accuracy": 0.9960819317003418, + "total_bits": 1209999616, + "gate_proj": { "group_size": { "8": 128, "6": 128 @@ -15539,7 +15539,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128, "6": 128 @@ -15554,7 +15554,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128 }, @@ -15568,9 +15568,9 @@ } }, { - "accuracy": 0.9987618498577687, - "total_bits": 5662082048, - "w1": { + "accuracy": 0.9985976114556635, + "total_bits": 1415520512, + "gate_proj": { "group_size": { "8": 128 }, @@ -15582,7 +15582,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128 }, @@ -15594,7 +15594,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128 }, @@ -15610,7 +15610,7 @@ ], "model.layers.8.self_attn": [ { - "accuracy": 0.9310825231828188, + "accuracy": 0.949622079632, "total_bits": 89665536, "q_proj": { "group_size": { @@ -15674,7 +15674,7 @@ } }, { - "accuracy": 0.9334916837121311, + "accuracy": 0.9516214221519859, "total_bits": 92221440, "q_proj": { "group_size": { @@ -15738,7 +15738,7 @@ } }, { - "accuracy": 0.9421536724426245, + "accuracy": 0.9564297805099111, "total_bits": 95758848, "q_proj": { "group_size": { @@ -15802,7 +15802,7 @@ } }, { - "accuracy": 0.9564096921760785, + "accuracy": 0.9675953928381205, "total_bits": 112272384, "q_proj": { "group_size": { @@ -15866,7 +15866,7 @@ } }, { - "accuracy": 0.9649832523182819, + "accuracy": 0.9742434676246423, "total_bits": 132913152, "q_proj": { "group_size": { @@ -15930,7 +15930,7 @@ } }, { - "accuracy": 0.9654168255235019, + "accuracy": 0.9746199527657345, "total_bits": 132980224, "q_proj": { "group_size": { @@ -15994,7 +15994,7 @@ } }, { - "accuracy": 0.9805737694137191, + "accuracy": 0.9855569024864388, "total_bits": 169613312, "q_proj": { "group_size": { @@ -16046,7 +16046,7 @@ } }, { - "accuracy": 0.9808163152713525, + "accuracy": 0.9858438473832059, "total_bits": 169745920, "q_proj": { "group_size": { @@ -16098,7 +16098,7 @@ } }, { - "accuracy": 0.9821695956940714, + "accuracy": 0.9868788475562867, "total_bits": 171195392, "q_proj": { "group_size": { @@ -16150,7 +16150,7 @@ } }, { - "accuracy": 0.9832292308442687, + "accuracy": 0.9876481426242543, "total_bits": 173563904, "q_proj": { "group_size": { @@ -16202,7 +16202,7 @@ } }, { - "accuracy": 0.9824752989470175, + "accuracy": 0.9870050916644303, "total_bits": 174923264, "q_proj": { "group_size": { @@ -16266,7 +16266,7 @@ } }, { - "accuracy": 0.9835325159916752, + "accuracy": 0.987788005437898, "total_bits": 175750144, "q_proj": { "group_size": { @@ -16330,7 +16330,7 @@ } }, { - "accuracy": 0.9857140314814291, + "accuracy": 0.9890447544228089, "total_bits": 179253248, "q_proj": { "group_size": { @@ -16391,7 +16391,7 @@ } }, { - "accuracy": 0.9867697178239101, + "accuracy": 0.9899648613492517, "total_bits": 181592064, "q_proj": { "group_size": { @@ -16452,7 +16452,7 @@ } }, { - "accuracy": 0.9920890752569234, + "accuracy": 0.9940567836042886, "total_bits": 220469248, "q_proj": { "group_size": { @@ -16513,7 +16513,7 @@ } }, { - "accuracy": 0.9932691788693008, + "accuracy": 0.9949417515706859, "total_bits": 223535104, "q_proj": { "group_size": { @@ -16574,7 +16574,7 @@ } }, { - "accuracy": 0.9947937725503978, + "accuracy": 0.996205939261831, "total_bits": 253499392, "q_proj": { "group_size": { @@ -16626,7 +16626,7 @@ } }, { - "accuracy": 0.9964956084854508, + "accuracy": 0.9973220073213604, "total_bits": 265838592, "q_proj": { "group_size": { @@ -16678,7 +16678,7 @@ } }, { - "accuracy": 0.9986297639736318, + "accuracy": 0.9989957615310375, "total_bits": 337385472, "q_proj": { "group_size": { @@ -16730,11 +16730,11 @@ } } ], - "model.layers.8.block_sparse_moe": [ + "model.layers.8.mlp": [ { - "accuracy": 0.9204807552067857, - "total_bits": 1581846784, - "w1": { + "accuracy": 0.9242972276712719, + "total_bits": 395461696, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -16749,7 +16749,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -16764,7 +16764,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "6": 32, "3": 64, @@ -16784,9 +16784,9 @@ } }, { - "accuracy": 0.9234070652409604, - "total_bits": 1636897024, - "w1": { + "accuracy": 0.9265914451526969, + "total_bits": 409224256, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -16801,7 +16801,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -16816,7 +16816,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "6": 32, "3": 64, @@ -16836,9 +16836,9 @@ } }, { - "accuracy": 0.9335175503633524, - "total_bits": 1829089280, - "w1": { + "accuracy": 0.9383951130470163, + "total_bits": 457272320, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -16853,7 +16853,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -16868,7 +16868,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "5": 32, "3": 32 @@ -16885,9 +16885,9 @@ } }, { - "accuracy": 0.9356210229820326, - "total_bits": 2051911680, - "w1": { + "accuracy": 0.9417343852355292, + "total_bits": 512977920, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -16902,7 +16902,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -16917,7 +16917,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "5": 32, "4": 32 @@ -16934,9 +16934,9 @@ } }, { - "accuracy": 0.9623162945438373, - "total_bits": 2313589120, - "w1": { + "accuracy": 0.9617170901282838, + "total_bits": 578397280, + "gate_proj": { "group_size": { "4": 128, "3": 128 @@ -16951,7 +16951,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 128, "3": 128 @@ -16966,7 +16966,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128, @@ -16986,9 +16986,9 @@ } }, { - "accuracy": 0.9654689245906315, - "total_bits": 2371489792, - "w1": { + "accuracy": 0.9648430781733048, + "total_bits": 592872448, + "gate_proj": { "group_size": { "4": 32, "3": 32 @@ -17003,7 +17003,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32, "3": 32 @@ -17018,7 +17018,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 32, @@ -17038,9 +17038,9 @@ } }, { - "accuracy": 0.968888534331008, - "total_bits": 2549817728, - "w1": { + "accuracy": 0.969658492917293, + "total_bits": 637454432, + "gate_proj": { "group_size": { "4": 32, "3": 32 @@ -17055,7 +17055,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32, "3": 32 @@ -17070,7 +17070,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128 @@ -17087,9 +17087,9 @@ } }, { - "accuracy": 0.9811484676442648, - "total_bits": 2914965888, - "w1": { + "accuracy": 0.9801137778221777, + "total_bits": 728741472, + "gate_proj": { "group_size": { "4": 128 }, @@ -17101,7 +17101,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32 }, @@ -17113,7 +17113,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128 @@ -17130,9 +17130,9 @@ } }, { - "accuracy": 0.982538180768882, - "total_bits": 2957905920, - "w1": { + "accuracy": 0.9817441582287613, + "total_bits": 739476480, + "gate_proj": { "group_size": { "4": 32 }, @@ -17144,7 +17144,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32 }, @@ -17156,7 +17156,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 32 @@ -17173,9 +17173,9 @@ } }, { - "accuracy": 0.9812206204882578, - "total_bits": 3006173568, - "w1": { + "accuracy": 0.9804237633943558, + "total_bits": 751543392, + "gate_proj": { "group_size": { "5": 128, "4": 128 @@ -17190,7 +17190,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "5": 128, "4": 128 @@ -17205,7 +17205,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "5": 128, @@ -17225,9 +17225,9 @@ } }, { - "accuracy": 0.9836560174528706, - "total_bits": 3064074240, - "w1": { + "accuracy": 0.98284867234332, + "total_bits": 766018560, + "gate_proj": { "group_size": { "5": 32, "4": 32 @@ -17242,7 +17242,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "5": 32, "4": 32 @@ -17257,7 +17257,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "5": 32, @@ -17277,9 +17277,9 @@ } }, { - "accuracy": 0.9906556778097231, - "total_bits": 3698758016, - "w1": { + "accuracy": 0.9899963410571218, + "total_bits": 924689504, + "gate_proj": { "group_size": { "6": 128, "5": 128 @@ -17294,7 +17294,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 128, "5": 128 @@ -17309,7 +17309,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 128, @@ -17329,9 +17329,9 @@ } }, { - "accuracy": 0.992043827268246, - "total_bits": 3756658688, - "w1": { + "accuracy": 0.9914326956740728, + "total_bits": 939164672, + "gate_proj": { "group_size": { "6": 32, "5": 32 @@ -17346,7 +17346,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 32, "5": 32 @@ -17361,7 +17361,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 32, @@ -17381,9 +17381,9 @@ } }, { - "accuracy": 0.9950413011063478, - "total_bits": 4278096256, - "w1": { + "accuracy": 0.994542267688207, + "total_bits": 1069524064, + "gate_proj": { "group_size": { "6": 128 }, @@ -17395,7 +17395,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 128 }, @@ -17407,7 +17407,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 128 @@ -17424,9 +17424,9 @@ } }, { - "accuracy": 0.9953316424017478, - "total_bits": 4441539584, - "w1": { + "accuracy": 0.9948894479312003, + "total_bits": 1110384896, + "gate_proj": { "group_size": { "8": 128, "6": 128 @@ -17441,7 +17441,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128, "6": 128 @@ -17456,7 +17456,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128, "6": 128 @@ -17473,9 +17473,9 @@ } }, { - "accuracy": 0.9960382048555306, - "total_bits": 4839998464, - "w1": { + "accuracy": 0.9959300729250046, + "total_bits": 1209999616, + "gate_proj": { "group_size": { "8": 128, "6": 128 @@ -17490,7 +17490,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128, "6": 128 @@ -17505,7 +17505,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128 }, @@ -17519,9 +17519,9 @@ } }, { - "accuracy": 0.9987212244672455, - "total_bits": 5662082048, - "w1": { + "accuracy": 0.9985541774290859, + "total_bits": 1415520512, + "gate_proj": { "group_size": { "8": 128 }, @@ -17533,7 +17533,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128 }, @@ -17545,7 +17545,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128 }, @@ -17561,7 +17561,7 @@ ], "model.layers.9.self_attn": [ { - "accuracy": 0.9227808321777143, + "accuracy": 0.9461901721201444, "total_bits": 89665536, "q_proj": { "group_size": { @@ -17625,7 +17625,7 @@ } }, { - "accuracy": 0.9245412763404219, + "accuracy": 0.9476211194537187, "total_bits": 92221440, "q_proj": { "group_size": { @@ -17689,7 +17689,7 @@ } }, { - "accuracy": 0.9349466119157641, + "accuracy": 0.9523624929746515, "total_bits": 95758848, "q_proj": { "group_size": { @@ -17753,7 +17753,7 @@ } }, { - "accuracy": 0.9482090630029377, + "accuracy": 0.9636443723973475, "total_bits": 112272384, "q_proj": { "group_size": { @@ -17817,7 +17817,7 @@ } }, { - "accuracy": 0.9598034725181366, + "accuracy": 0.9712372422218323, "total_bits": 132913152, "q_proj": { "group_size": { @@ -17881,7 +17881,7 @@ } }, { - "accuracy": 0.9609961889095997, + "accuracy": 0.9715482738163126, "total_bits": 132980224, "q_proj": { "group_size": { @@ -17945,7 +17945,7 @@ } }, { - "accuracy": 0.9779940328903889, + "accuracy": 0.9839093900824848, "total_bits": 169613312, "q_proj": { "group_size": { @@ -17997,7 +17997,7 @@ } }, { - "accuracy": 0.9779499097678223, + "accuracy": 0.9843661240547111, "total_bits": 169745920, "q_proj": { "group_size": { @@ -18049,7 +18049,7 @@ } }, { - "accuracy": 0.9802826994441842, + "accuracy": 0.9855494796739597, "total_bits": 171195392, "q_proj": { "group_size": { @@ -18101,7 +18101,7 @@ } }, { - "accuracy": 0.9814546593513928, + "accuracy": 0.9863956168441004, "total_bits": 173563904, "q_proj": { "group_size": { @@ -18153,7 +18153,7 @@ } }, { - "accuracy": 0.9799881231431898, + "accuracy": 0.9852579927660132, "total_bits": 174923264, "q_proj": { "group_size": { @@ -18217,7 +18217,7 @@ } }, { - "accuracy": 0.9816469519252056, + "accuracy": 0.986335435195973, "total_bits": 175750144, "q_proj": { "group_size": { @@ -18281,7 +18281,7 @@ } }, { - "accuracy": 0.9842005922508082, + "accuracy": 0.9880314246310216, "total_bits": 179253248, "q_proj": { "group_size": { @@ -18342,7 +18342,7 @@ } }, { - "accuracy": 0.9853926774506506, + "accuracy": 0.9890811989868158, "total_bits": 181592064, "q_proj": { "group_size": { @@ -18403,7 +18403,7 @@ } }, { - "accuracy": 0.9912662639919865, + "accuracy": 0.9934601496512953, "total_bits": 220469248, "q_proj": { "group_size": { @@ -18464,7 +18464,7 @@ } }, { - "accuracy": 0.992505823016951, + "accuracy": 0.994482040233714, "total_bits": 223535104, "q_proj": { "group_size": { @@ -18525,7 +18525,7 @@ } }, { - "accuracy": 0.994027387718425, + "accuracy": 0.9958087403547803, "total_bits": 253499392, "q_proj": { "group_size": { @@ -18577,7 +18577,7 @@ } }, { - "accuracy": 0.9963108478236551, + "accuracy": 0.9971875118361296, "total_bits": 265838592, "q_proj": { "group_size": { @@ -18629,7 +18629,7 @@ } }, { - "accuracy": 0.9984306024669326, + "accuracy": 0.9989026100895564, "total_bits": 337385472, "q_proj": { "group_size": { @@ -18681,11 +18681,11 @@ } } ], - "model.layers.9.block_sparse_moe": [ + "model.layers.9.mlp": [ { - "accuracy": 0.9186476008280328, - "total_bits": 1581846784, - "w1": { + "accuracy": 0.9216987433794298, + "total_bits": 395461696, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -18700,7 +18700,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -18715,7 +18715,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "6": 32, "3": 64, @@ -18735,9 +18735,9 @@ } }, { - "accuracy": 0.9215159339732245, - "total_bits": 1636897024, - "w1": { + "accuracy": 0.9240213144374521, + "total_bits": 409224256, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -18752,7 +18752,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -18767,7 +18767,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "6": 32, "3": 64, @@ -18787,9 +18787,9 @@ } }, { - "accuracy": 0.9319214016983384, - "total_bits": 1829089280, - "w1": { + "accuracy": 0.9361893550345772, + "total_bits": 457272320, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -18804,7 +18804,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -18819,7 +18819,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "5": 32, "3": 32 @@ -18836,9 +18836,9 @@ } }, { - "accuracy": 0.9340870174530306, - "total_bits": 2051911680, - "w1": { + "accuracy": 0.9396449457854033, + "total_bits": 512977920, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -18853,7 +18853,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -18868,7 +18868,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "5": 32, "4": 32 @@ -18885,9 +18885,9 @@ } }, { - "accuracy": 0.9614756679848621, - "total_bits": 2313589120, - "w1": { + "accuracy": 0.9604351151930659, + "total_bits": 578397280, + "gate_proj": { "group_size": { "4": 128, "3": 128 @@ -18902,7 +18902,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 128, "3": 128 @@ -18917,7 +18917,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128, @@ -18937,9 +18937,9 @@ } }, { - "accuracy": 0.9646684247020044, - "total_bits": 2371489792, - "w1": { + "accuracy": 0.9636437380196232, + "total_bits": 592872448, + "gate_proj": { "group_size": { "4": 32, "3": 32 @@ -18954,7 +18954,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32, "3": 32 @@ -18969,7 +18969,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 32, @@ -18989,9 +18989,9 @@ } }, { - "accuracy": 0.9681861304530972, - "total_bits": 2549817728, - "w1": { + "accuracy": 0.9686130972481087, + "total_bits": 637454432, + "gate_proj": { "group_size": { "4": 32, "3": 32 @@ -19006,7 +19006,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32, "3": 32 @@ -19021,7 +19021,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128 @@ -19038,9 +19038,9 @@ } }, { - "accuracy": 0.9807194060106811, - "total_bits": 2914965888, - "w1": { + "accuracy": 0.9794092378333995, + "total_bits": 728741472, + "gate_proj": { "group_size": { "4": 128 }, @@ -19052,7 +19052,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32 }, @@ -19064,7 +19064,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128 @@ -19081,9 +19081,9 @@ } }, { - "accuracy": 0.9821537621809464, - "total_bits": 2957905920, - "w1": { + "accuracy": 0.9811010842554664, + "total_bits": 739476480, + "gate_proj": { "group_size": { "4": 32 }, @@ -19095,7 +19095,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32 }, @@ -19107,7 +19107,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 32 @@ -19124,9 +19124,9 @@ } }, { - "accuracy": 0.9808666882546324, - "total_bits": 3006173568, - "w1": { + "accuracy": 0.9797771367685575, + "total_bits": 751543392, + "gate_proj": { "group_size": { "5": 128, "4": 128 @@ -19141,7 +19141,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "5": 128, "4": 128 @@ -19156,7 +19156,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "5": 128, @@ -19176,9 +19176,9 @@ } }, { - "accuracy": 0.9832957325699297, - "total_bits": 3064074240, - "w1": { + "accuracy": 0.9822643262971389, + "total_bits": 766018560, + "gate_proj": { "group_size": { "5": 32, "4": 32 @@ -19193,7 +19193,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "5": 32, "4": 32 @@ -19208,7 +19208,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "5": 32, @@ -19228,9 +19228,9 @@ } }, { - "accuracy": 0.9905135765132543, - "total_bits": 3698758016, - "w1": { + "accuracy": 0.9896702573781735, + "total_bits": 924689504, + "gate_proj": { "group_size": { "6": 128, "5": 128 @@ -19245,7 +19245,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 128, "5": 128 @@ -19260,7 +19260,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 128, @@ -19280,9 +19280,9 @@ } }, { - "accuracy": 0.9919045056755605, - "total_bits": 3756658688, - "w1": { + "accuracy": 0.991143028717488, + "total_bits": 939164672, + "gate_proj": { "group_size": { "6": 32, "5": 32 @@ -19297,7 +19297,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 32, "5": 32 @@ -19312,7 +19312,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 32, @@ -19332,9 +19332,9 @@ } }, { - "accuracy": 0.9949800188310052, - "total_bits": 4278096256, - "w1": { + "accuracy": 0.9943612839683498, + "total_bits": 1069524064, + "gate_proj": { "group_size": { "6": 128 }, @@ -19346,7 +19346,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 128 }, @@ -19358,7 +19358,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 128 @@ -19375,9 +19375,9 @@ } }, { - "accuracy": 0.995298075501954, - "total_bits": 4441539584, - "w1": { + "accuracy": 0.9947242641723469, + "total_bits": 1110384896, + "gate_proj": { "group_size": { "8": 128, "6": 128 @@ -19392,7 +19392,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128, "6": 128 @@ -19407,7 +19407,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128, "6": 128 @@ -19424,9 +19424,9 @@ } }, { - "accuracy": 0.9960413019442441, - "total_bits": 4839998464, - "w1": { + "accuracy": 0.9958141572699931, + "total_bits": 1209999616, + "gate_proj": { "group_size": { "8": 128, "6": 128 @@ -19441,7 +19441,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128, "6": 128 @@ -19456,7 +19456,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128 }, @@ -19470,9 +19470,9 @@ } }, { - "accuracy": 0.9987066149999584, - "total_bits": 5662082048, - "w1": { + "accuracy": 0.9984990562661551, + "total_bits": 1415520512, + "gate_proj": { "group_size": { "8": 128 }, @@ -19484,7 +19484,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128 }, @@ -19496,7 +19496,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128 }, @@ -19512,7 +19512,7 @@ ], "model.layers.10.self_attn": [ { - "accuracy": 0.9263995758404857, + "accuracy": 0.9451580886778078, "total_bits": 89665536, "q_proj": { "group_size": { @@ -19576,7 +19576,7 @@ } }, { - "accuracy": 0.9280166604409092, + "accuracy": 0.9467395417588322, "total_bits": 92221440, "q_proj": { "group_size": { @@ -19640,7 +19640,7 @@ } }, { - "accuracy": 0.9379807053820083, + "accuracy": 0.9526131990316667, "total_bits": 95758848, "q_proj": { "group_size": { @@ -19704,7 +19704,7 @@ } }, { - "accuracy": 0.9522572022519613, + "accuracy": 0.9640310901756349, "total_bits": 112272384, "q_proj": { "group_size": { @@ -19768,7 +19768,7 @@ } }, { - "accuracy": 0.9620968166150545, + "accuracy": 0.9714674281054422, "total_bits": 132913152, "q_proj": { "group_size": { @@ -19832,7 +19832,7 @@ } }, { - "accuracy": 0.9630412202524512, + "accuracy": 0.9717663153515834, "total_bits": 132980224, "q_proj": { "group_size": { @@ -19896,7 +19896,7 @@ } }, { - "accuracy": 0.9783801015858588, + "accuracy": 0.9842620977249584, "total_bits": 169613312, "q_proj": { "group_size": { @@ -19948,7 +19948,7 @@ } }, { - "accuracy": 0.9791110296194491, + "accuracy": 0.984502583055904, "total_bits": 169745920, "q_proj": { "group_size": { @@ -20000,7 +20000,7 @@ } }, { - "accuracy": 0.9810583774784678, + "accuracy": 0.9857162597442144, "total_bits": 171195392, "q_proj": { "group_size": { @@ -20052,7 +20052,7 @@ } }, { - "accuracy": 0.9818809954939705, + "accuracy": 0.9865676298198339, "total_bits": 173563904, "q_proj": { "group_size": { @@ -20104,7 +20104,7 @@ } }, { - "accuracy": 0.98102107889166, + "accuracy": 0.9857582061698562, "total_bits": 174923264, "q_proj": { "group_size": { @@ -20168,7 +20168,7 @@ } }, { - "accuracy": 0.9824635798209592, + "accuracy": 0.9867941707274631, "total_bits": 175750144, "q_proj": { "group_size": { @@ -20232,7 +20232,7 @@ } }, { - "accuracy": 0.984751472347661, + "accuracy": 0.9882237025977749, "total_bits": 179253248, "q_proj": { "group_size": { @@ -20293,7 +20293,7 @@ } }, { - "accuracy": 0.9858558184810375, + "accuracy": 0.989124229398409, "total_bits": 181592064, "q_proj": { "group_size": { @@ -20354,7 +20354,7 @@ } }, { - "accuracy": 0.9916773429385534, + "accuracy": 0.9935391340282207, "total_bits": 220469248, "q_proj": { "group_size": { @@ -20415,7 +20415,7 @@ } }, { - "accuracy": 0.9929503999699495, + "accuracy": 0.9945418046615822, "total_bits": 223535104, "q_proj": { "group_size": { @@ -20476,7 +20476,7 @@ } }, { - "accuracy": 0.9944743210768425, + "accuracy": 0.9958665798736834, "total_bits": 253499392, "q_proj": { "group_size": { @@ -20528,7 +20528,7 @@ } }, { - "accuracy": 0.9962296926931135, + "accuracy": 0.9970899069448933, "total_bits": 265838592, "q_proj": { "group_size": { @@ -20580,7 +20580,7 @@ } }, { - "accuracy": 0.9985284238355234, + "accuracy": 0.9988952024754039, "total_bits": 337385472, "q_proj": { "group_size": { @@ -20632,11 +20632,11 @@ } } ], - "model.layers.10.block_sparse_moe": [ + "model.layers.10.mlp": [ { - "accuracy": 0.9179219334925476, - "total_bits": 1581846784, - "w1": { + "accuracy": 0.920626043096969, + "total_bits": 395461696, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -20651,7 +20651,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -20666,7 +20666,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "6": 32, "3": 64, @@ -20686,9 +20686,9 @@ } }, { - "accuracy": 0.9208223566805062, - "total_bits": 1636897024, - "w1": { + "accuracy": 0.9229697863522329, + "total_bits": 409224256, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -20703,7 +20703,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -20718,7 +20718,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "6": 32, "3": 64, @@ -20738,9 +20738,9 @@ } }, { - "accuracy": 0.9312317383132482, - "total_bits": 1829089280, - "w1": { + "accuracy": 0.935167394186321, + "total_bits": 457272320, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -20755,7 +20755,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -20770,7 +20770,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "5": 32, "3": 32 @@ -20787,9 +20787,9 @@ } }, { - "accuracy": 0.9334287992433498, - "total_bits": 2051911680, - "w1": { + "accuracy": 0.9386483526935703, + "total_bits": 512977920, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -20804,7 +20804,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -20819,7 +20819,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "5": 32, "4": 32 @@ -20836,9 +20836,9 @@ } }, { - "accuracy": 0.9611955391929338, - "total_bits": 2313589120, - "w1": { + "accuracy": 0.9599458476234424, + "total_bits": 578397280, + "gate_proj": { "group_size": { "4": 128, "3": 128 @@ -20853,7 +20853,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 128, "3": 128 @@ -20868,7 +20868,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128, @@ -20888,9 +20888,9 @@ } }, { - "accuracy": 0.9643507914519623, - "total_bits": 2371489792, - "w1": { + "accuracy": 0.9631230077265125, + "total_bits": 592872448, + "gate_proj": { "group_size": { "4": 32, "3": 32 @@ -20905,7 +20905,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32, "3": 32 @@ -20920,7 +20920,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 32, @@ -20940,9 +20940,9 @@ } }, { - "accuracy": 0.967939004007923, - "total_bits": 2549817728, - "w1": { + "accuracy": 0.9681433376513029, + "total_bits": 637454432, + "gate_proj": { "group_size": { "4": 32, "3": 32 @@ -20957,7 +20957,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32, "3": 32 @@ -20972,7 +20972,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128 @@ -20989,9 +20989,9 @@ } }, { - "accuracy": 0.9807050981509843, - "total_bits": 2914965888, - "w1": { + "accuracy": 0.9792530484693615, + "total_bits": 728741472, + "gate_proj": { "group_size": { "4": 128 }, @@ -21003,7 +21003,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32 }, @@ -21015,7 +21015,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128 @@ -21032,9 +21032,9 @@ } }, { - "accuracy": 0.9821274705525291, - "total_bits": 2957905920, - "w1": { + "accuracy": 0.9809101347468401, + "total_bits": 739476480, + "gate_proj": { "group_size": { "4": 32 }, @@ -21046,7 +21046,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32 }, @@ -21058,7 +21058,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 32 @@ -21075,9 +21075,9 @@ } }, { - "accuracy": 0.9807698768808654, - "total_bits": 3006173568, - "w1": { + "accuracy": 0.9795381524355004, + "total_bits": 751543392, + "gate_proj": { "group_size": { "5": 128, "4": 128 @@ -21092,7 +21092,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "5": 128, "4": 128 @@ -21107,7 +21107,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "5": 128, @@ -21127,9 +21127,9 @@ } }, { - "accuracy": 0.9832176146538634, - "total_bits": 3064074240, - "w1": { + "accuracy": 0.9820511281294259, + "total_bits": 766018560, + "gate_proj": { "group_size": { "5": 32, "4": 32 @@ -21144,7 +21144,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "5": 32, "4": 32 @@ -21159,7 +21159,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "5": 32, @@ -21179,9 +21179,9 @@ } }, { - "accuracy": 0.9905040371231735, - "total_bits": 3698758016, - "w1": { + "accuracy": 0.9895674055197129, + "total_bits": 924689504, + "gate_proj": { "group_size": { "6": 128, "5": 128 @@ -21196,7 +21196,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 128, "5": 128 @@ -21211,7 +21211,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 128, @@ -21231,9 +21231,9 @@ } }, { - "accuracy": 0.9919005270655218, - "total_bits": 3756658688, - "w1": { + "accuracy": 0.9910417126344615, + "total_bits": 939164672, + "gate_proj": { "group_size": { "6": 32, "5": 32 @@ -21248,7 +21248,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 32, "5": 32 @@ -21263,7 +21263,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 32, @@ -21283,9 +21283,9 @@ } }, { - "accuracy": 0.9950200153122607, - "total_bits": 4278096256, - "w1": { + "accuracy": 0.9943332437450361, + "total_bits": 1069524064, + "gate_proj": { "group_size": { "6": 128 }, @@ -21297,7 +21297,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 128 }, @@ -21309,7 +21309,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 128 @@ -21326,9 +21326,9 @@ } }, { - "accuracy": 0.9953120934017199, - "total_bits": 4441539584, - "w1": { + "accuracy": 0.994671578751877, + "total_bits": 1110384896, + "gate_proj": { "group_size": { "8": 128, "6": 128 @@ -21343,7 +21343,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128, "6": 128 @@ -21358,7 +21358,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128, "6": 128 @@ -21375,9 +21375,9 @@ } }, { - "accuracy": 0.9960557815226677, - "total_bits": 4839998464, - "w1": { + "accuracy": 0.9957421741595394, + "total_bits": 1209999616, + "gate_proj": { "group_size": { "8": 128, "6": 128 @@ -21392,7 +21392,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128, "6": 128 @@ -21407,7 +21407,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128 }, @@ -21421,9 +21421,9 @@ } }, { - "accuracy": 0.9987227458531331, - "total_bits": 5662082048, - "w1": { + "accuracy": 0.998498952374671, + "total_bits": 1415520512, + "gate_proj": { "group_size": { "8": 128 }, @@ -21435,7 +21435,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128 }, @@ -21447,7 +21447,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128 }, @@ -21463,7 +21463,7 @@ ], "model.layers.11.self_attn": [ { - "accuracy": 0.9272899478673935, + "accuracy": 0.9469542790596422, "total_bits": 89665536, "q_proj": { "group_size": { @@ -21527,7 +21527,7 @@ } }, { - "accuracy": 0.9276470020413399, + "accuracy": 0.9484867501611772, "total_bits": 92221440, "q_proj": { "group_size": { @@ -21591,7 +21591,7 @@ } }, { - "accuracy": 0.9374686552113608, + "accuracy": 0.9535855151908963, "total_bits": 95758848, "q_proj": { "group_size": { @@ -21655,7 +21655,7 @@ } }, { - "accuracy": 0.9502892423617213, + "accuracy": 0.9649058034349429, "total_bits": 112272384, "q_proj": { "group_size": { @@ -21719,7 +21719,7 @@ } }, { - "accuracy": 0.9625186936831788, + "accuracy": 0.9728143688193277, "total_bits": 132913152, "q_proj": { "group_size": { @@ -21783,7 +21783,7 @@ } }, { - "accuracy": 0.9633390976018027, + "accuracy": 0.9731722203524489, "total_bits": 132980224, "q_proj": { "group_size": { @@ -21847,7 +21847,7 @@ } }, { - "accuracy": 0.9797175423683304, + "accuracy": 0.984941178981803, "total_bits": 169613312, "q_proj": { "group_size": { @@ -21899,7 +21899,7 @@ } }, { - "accuracy": 0.9800941193182218, + "accuracy": 0.9852205766610017, "total_bits": 169745920, "q_proj": { "group_size": { @@ -21951,7 +21951,7 @@ } }, { - "accuracy": 0.9812152203368513, + "accuracy": 0.9862103504618924, "total_bits": 171195392, "q_proj": { "group_size": { @@ -22003,7 +22003,7 @@ } }, { - "accuracy": 0.9823465504144367, + "accuracy": 0.9870128660511813, "total_bits": 173563904, "q_proj": { "group_size": { @@ -22055,7 +22055,7 @@ } }, { - "accuracy": 0.9816771441776502, + "accuracy": 0.986219716258347, "total_bits": 174923264, "q_proj": { "group_size": { @@ -22119,7 +22119,7 @@ } }, { - "accuracy": 0.9824309838249495, + "accuracy": 0.9871530125260746, "total_bits": 175750144, "q_proj": { "group_size": { @@ -22183,7 +22183,7 @@ } }, { - "accuracy": 0.9847063060457769, + "accuracy": 0.9884389408275878, "total_bits": 179253248, "q_proj": { "group_size": { @@ -22244,7 +22244,7 @@ } }, { - "accuracy": 0.9860868631420951, + "accuracy": 0.9893247609418866, "total_bits": 181592064, "q_proj": { "group_size": { @@ -22305,7 +22305,7 @@ } }, { - "accuracy": 0.991836195296951, + "accuracy": 0.993740423472206, "total_bits": 220469248, "q_proj": { "group_size": { @@ -22366,7 +22366,7 @@ } }, { - "accuracy": 0.9930776141118258, + "accuracy": 0.9946446177353593, "total_bits": 223535104, "q_proj": { "group_size": { @@ -22427,7 +22427,7 @@ } }, { - "accuracy": 0.9946522794688415, + "accuracy": 0.99608718261956, "total_bits": 253499392, "q_proj": { "group_size": { @@ -22479,7 +22479,7 @@ } }, { - "accuracy": 0.9963443699464398, + "accuracy": 0.9971813304398797, "total_bits": 265838592, "q_proj": { "group_size": { @@ -22531,7 +22531,7 @@ } }, { - "accuracy": 0.9985927237124231, + "accuracy": 0.9989617652639649, "total_bits": 337385472, "q_proj": { "group_size": { @@ -22583,11 +22583,11 @@ } } ], - "model.layers.11.block_sparse_moe": [ + "model.layers.11.mlp": [ { - "accuracy": 0.9163732097337121, - "total_bits": 1581846784, - "w1": { + "accuracy": 0.9192196109185093, + "total_bits": 395461696, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -22602,7 +22602,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -22617,7 +22617,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "6": 32, "3": 64, @@ -22637,9 +22637,9 @@ } }, { - "accuracy": 0.9193548003309652, - "total_bits": 1636897024, - "w1": { + "accuracy": 0.9216961715566485, + "total_bits": 409224256, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -22654,7 +22654,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -22669,7 +22669,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "6": 32, "3": 64, @@ -22689,9 +22689,9 @@ } }, { - "accuracy": 0.9301097585182441, - "total_bits": 1829089280, - "w1": { + "accuracy": 0.9340641935797114, + "total_bits": 457272320, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -22706,7 +22706,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -22721,7 +22721,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "5": 32, "3": 32 @@ -22738,9 +22738,9 @@ } }, { - "accuracy": 0.9323978831893519, - "total_bits": 2051911680, - "w1": { + "accuracy": 0.9376158208439225, + "total_bits": 512977920, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -22755,7 +22755,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -22770,7 +22770,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "5": 32, "4": 32 @@ -22787,9 +22787,9 @@ } }, { - "accuracy": 0.9602782044951853, - "total_bits": 2313589120, - "w1": { + "accuracy": 0.9590805503294656, + "total_bits": 578397280, + "gate_proj": { "group_size": { "4": 128, "3": 128 @@ -22804,7 +22804,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 128, "3": 128 @@ -22819,7 +22819,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128, @@ -22839,9 +22839,9 @@ } }, { - "accuracy": 0.9636333191669301, - "total_bits": 2371489792, - "w1": { + "accuracy": 0.962422707069077, + "total_bits": 592872448, + "gate_proj": { "group_size": { "4": 32, "3": 32 @@ -22856,7 +22856,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32, "3": 32 @@ -22871,7 +22871,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 32, @@ -22891,9 +22891,9 @@ } }, { - "accuracy": 0.9673173698155504, - "total_bits": 2549817728, - "w1": { + "accuracy": 0.96747996589463, + "total_bits": 637454432, + "gate_proj": { "group_size": { "4": 32, "3": 32 @@ -22908,7 +22908,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32, "3": 32 @@ -22923,7 +22923,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128 @@ -22940,9 +22940,9 @@ } }, { - "accuracy": 0.980115708159773, - "total_bits": 2914965888, - "w1": { + "accuracy": 0.9786642449662873, + "total_bits": 728741472, + "gate_proj": { "group_size": { "4": 128 }, @@ -22954,7 +22954,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32 }, @@ -22966,7 +22966,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128 @@ -22983,9 +22983,9 @@ } }, { - "accuracy": 0.9816270657373887, - "total_bits": 2957905920, - "w1": { + "accuracy": 0.9803751649236992, + "total_bits": 739476480, + "gate_proj": { "group_size": { "4": 32 }, @@ -22997,7 +22997,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32 }, @@ -23009,7 +23009,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 32 @@ -23026,9 +23026,9 @@ } }, { - "accuracy": 0.9802890293496219, - "total_bits": 3006173568, - "w1": { + "accuracy": 0.9790288335888794, + "total_bits": 751543392, + "gate_proj": { "group_size": { "5": 128, "4": 128 @@ -23043,7 +23043,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "5": 128, "4": 128 @@ -23058,7 +23058,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "5": 128, @@ -23078,9 +23078,9 @@ } }, { - "accuracy": 0.9827987267880848, - "total_bits": 3064074240, - "w1": { + "accuracy": 0.9816060256899187, + "total_bits": 766018560, + "gate_proj": { "group_size": { "5": 32, "4": 32 @@ -23095,7 +23095,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "5": 32, "4": 32 @@ -23110,7 +23110,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "5": 32, @@ -23130,9 +23130,9 @@ } }, { - "accuracy": 0.9902477094805554, - "total_bits": 3698758016, - "w1": { + "accuracy": 0.9892634294289899, + "total_bits": 924689504, + "gate_proj": { "group_size": { "6": 128, "5": 128 @@ -23147,7 +23147,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 128, "5": 128 @@ -23162,7 +23162,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 128, @@ -23182,9 +23182,9 @@ } }, { - "accuracy": 0.9917030130807114, - "total_bits": 3756658688, - "w1": { + "accuracy": 0.9908025186067742, + "total_bits": 939164672, + "gate_proj": { "group_size": { "6": 32, "5": 32 @@ -23199,7 +23199,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 32, "5": 32 @@ -23214,7 +23214,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 32, @@ -23234,9 +23234,9 @@ } }, { - "accuracy": 0.9948482973329527, - "total_bits": 4278096256, - "w1": { + "accuracy": 0.994129055804622, + "total_bits": 1069524064, + "gate_proj": { "group_size": { "6": 128 }, @@ -23248,7 +23248,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 128 }, @@ -23260,7 +23260,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 128 @@ -23277,9 +23277,9 @@ } }, { - "accuracy": 0.9951870612873647, - "total_bits": 4441539584, - "w1": { + "accuracy": 0.9945139340370109, + "total_bits": 1110384896, + "gate_proj": { "group_size": { "8": 128, "6": 128 @@ -23294,7 +23294,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128, "6": 128 @@ -23309,7 +23309,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128, "6": 128 @@ -23326,9 +23326,9 @@ } }, { - "accuracy": 0.9959776910928715, - "total_bits": 4839998464, - "w1": { + "accuracy": 0.9956296518395998, + "total_bits": 1209999616, + "gate_proj": { "group_size": { "8": 128, "6": 128 @@ -23343,7 +23343,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128, "6": 128 @@ -23358,7 +23358,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128 }, @@ -23372,9 +23372,9 @@ } }, { - "accuracy": 0.9986808355346224, - "total_bits": 5662082048, - "w1": { + "accuracy": 0.9984485936951578, + "total_bits": 1415520512, + "gate_proj": { "group_size": { "8": 128 }, @@ -23386,7 +23386,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128 }, @@ -23398,7 +23398,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128 }, @@ -23414,7 +23414,7 @@ ], "model.layers.12.self_attn": [ { - "accuracy": 0.9183806439763621, + "accuracy": 0.9407036465249563, "total_bits": 89665536, "q_proj": { "group_size": { @@ -23478,7 +23478,7 @@ } }, { - "accuracy": 0.9226338867294162, + "accuracy": 0.9431414157152176, "total_bits": 92221440, "q_proj": { "group_size": { @@ -23542,7 +23542,7 @@ } }, { - "accuracy": 0.9354290675960089, + "accuracy": 0.9505380636179134, "total_bits": 95758848, "q_proj": { "group_size": { @@ -23606,7 +23606,7 @@ } }, { - "accuracy": 0.951254987795102, + "accuracy": 0.9644050426585109, "total_bits": 112272384, "q_proj": { "group_size": { @@ -23670,7 +23670,7 @@ } }, { - "accuracy": 0.9600028915232733, + "accuracy": 0.970311891973803, "total_bits": 132913152, "q_proj": { "group_size": { @@ -23734,7 +23734,7 @@ } }, { - "accuracy": 0.9606332534826115, + "accuracy": 0.9706549811127939, "total_bits": 132980224, "q_proj": { "group_size": { @@ -23798,7 +23798,7 @@ } }, { - "accuracy": 0.9773664190188834, + "accuracy": 0.9834626000374556, "total_bits": 169613312, "q_proj": { "group_size": { @@ -23850,7 +23850,7 @@ } }, { - "accuracy": 0.9782024943514874, + "accuracy": 0.9838127691886926, "total_bits": 169745920, "q_proj": { "group_size": { @@ -23902,7 +23902,7 @@ } }, { - "accuracy": 0.9798377019500262, + "accuracy": 0.984814469212372, "total_bits": 171195392, "q_proj": { "group_size": { @@ -23954,7 +23954,7 @@ } }, { - "accuracy": 0.9806975354685595, + "accuracy": 0.9856005118375546, "total_bits": 173563904, "q_proj": { "group_size": { @@ -24006,7 +24006,7 @@ } }, { - "accuracy": 0.9801827449547617, + "accuracy": 0.9850995247987541, "total_bits": 174923264, "q_proj": { "group_size": { @@ -24070,7 +24070,7 @@ } }, { - "accuracy": 0.981384973678934, + "accuracy": 0.9860195296963579, "total_bits": 175750144, "q_proj": { "group_size": { @@ -24134,7 +24134,7 @@ } }, { - "accuracy": 0.984072589462525, + "accuracy": 0.9877668667927777, "total_bits": 179253248, "q_proj": { "group_size": { @@ -24195,7 +24195,7 @@ } }, { - "accuracy": 0.9854205264954975, + "accuracy": 0.9886748754713488, "total_bits": 181592064, "q_proj": { "group_size": { @@ -24256,7 +24256,7 @@ } }, { - "accuracy": 0.9914159666207668, + "accuracy": 0.9933994075553002, "total_bits": 220469248, "q_proj": { "group_size": { @@ -24317,7 +24317,7 @@ } }, { - "accuracy": 0.9927223878422458, + "accuracy": 0.9943339287818066, "total_bits": 223535104, "q_proj": { "group_size": { @@ -24378,7 +24378,7 @@ } }, { - "accuracy": 0.9941089358916017, + "accuracy": 0.9956968363472506, "total_bits": 253499392, "q_proj": { "group_size": { @@ -24430,7 +24430,7 @@ } }, { - "accuracy": 0.9961723701685274, + "accuracy": 0.9970048661833923, "total_bits": 265838592, "q_proj": { "group_size": { @@ -24482,7 +24482,7 @@ } }, { - "accuracy": 0.998416797968706, + "accuracy": 0.9988653909133159, "total_bits": 337385472, "q_proj": { "group_size": { @@ -24534,11 +24534,11 @@ } } ], - "model.layers.12.block_sparse_moe": [ + "model.layers.12.mlp": [ { - "accuracy": 0.9149962277396729, - "total_bits": 1581846784, - "w1": { + "accuracy": 0.9169626700642862, + "total_bits": 395461696, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -24553,7 +24553,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -24568,7 +24568,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "6": 32, "3": 64, @@ -24588,9 +24588,9 @@ } }, { - "accuracy": 0.9180999929575544, - "total_bits": 1636897024, - "w1": { + "accuracy": 0.9195618786309895, + "total_bits": 409224256, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -24605,7 +24605,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -24620,7 +24620,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "6": 32, "3": 64, @@ -24640,9 +24640,9 @@ } }, { - "accuracy": 0.9289694514713789, - "total_bits": 1829089280, - "w1": { + "accuracy": 0.9323753394970768, + "total_bits": 457272320, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -24657,7 +24657,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -24672,7 +24672,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "5": 32, "3": 32 @@ -24689,9 +24689,9 @@ } }, { - "accuracy": 0.9313249023337113, - "total_bits": 2051911680, - "w1": { + "accuracy": 0.9361059948017723, + "total_bits": 512977920, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -24706,7 +24706,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -24721,7 +24721,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "5": 32, "4": 32 @@ -24738,9 +24738,9 @@ } }, { - "accuracy": 0.9596142080661497, - "total_bits": 2313589120, - "w1": { + "accuracy": 0.9579335804048338, + "total_bits": 578397280, + "gate_proj": { "group_size": { "4": 128, "3": 128 @@ -24755,7 +24755,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 128, "3": 128 @@ -24770,7 +24770,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128, @@ -24790,9 +24790,9 @@ } }, { - "accuracy": 0.9630738560502466, - "total_bits": 2371489792, - "w1": { + "accuracy": 0.9614412857121543, + "total_bits": 592872448, + "gate_proj": { "group_size": { "4": 32, "3": 32 @@ -24807,7 +24807,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32, "3": 32 @@ -24822,7 +24822,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 32, @@ -24842,9 +24842,9 @@ } }, { - "accuracy": 0.9667932666642101, - "total_bits": 2549817728, - "w1": { + "accuracy": 0.9666366558521986, + "total_bits": 637454432, + "gate_proj": { "group_size": { "4": 32, "3": 32 @@ -24859,7 +24859,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32, "3": 32 @@ -24874,7 +24874,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128 @@ -24891,9 +24891,9 @@ } }, { - "accuracy": 0.9795101667803369, - "total_bits": 2914965888, - "w1": { + "accuracy": 0.9778119566801348, + "total_bits": 728741472, + "gate_proj": { "group_size": { "4": 128 }, @@ -24905,7 +24905,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32 }, @@ -24917,7 +24917,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128 @@ -24934,9 +24934,9 @@ } }, { - "accuracy": 0.9811005180603579, - "total_bits": 2957905920, - "w1": { + "accuracy": 0.9796785843607626, + "total_bits": 739476480, + "gate_proj": { "group_size": { "4": 32 }, @@ -24948,7 +24948,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32 }, @@ -24960,7 +24960,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 32 @@ -24977,9 +24977,9 @@ } }, { - "accuracy": 0.9799635790680584, - "total_bits": 3006173568, - "w1": { + "accuracy": 0.9784359903025784, + "total_bits": 751543392, + "gate_proj": { "group_size": { "5": 128, "4": 128 @@ -24994,7 +24994,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "5": 128, "4": 128 @@ -25009,7 +25009,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "5": 128, @@ -25029,9 +25029,9 @@ } }, { - "accuracy": 0.982545079418311, - "total_bits": 3064074240, - "w1": { + "accuracy": 0.9811468347907066, + "total_bits": 766018560, + "gate_proj": { "group_size": { "5": 32, "4": 32 @@ -25046,7 +25046,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "5": 32, "4": 32 @@ -25061,7 +25061,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "5": 32, @@ -25081,9 +25081,9 @@ } }, { - "accuracy": 0.9900827084815031, - "total_bits": 3698758016, - "w1": { + "accuracy": 0.9889545060792252, + "total_bits": 924689504, + "gate_proj": { "group_size": { "6": 128, "5": 128 @@ -25098,7 +25098,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 128, "5": 128 @@ -25113,7 +25113,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 128, @@ -25133,9 +25133,9 @@ } }, { - "accuracy": 0.9915601911856547, - "total_bits": 3756658688, - "w1": { + "accuracy": 0.9905664747041699, + "total_bits": 939164672, + "gate_proj": { "group_size": { "6": 32, "5": 32 @@ -25150,7 +25150,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 32, "5": 32 @@ -25165,7 +25165,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 32, @@ -25185,9 +25185,9 @@ } }, { - "accuracy": 0.9946493926874705, - "total_bits": 4278096256, - "w1": { + "accuracy": 0.9938707310755394, + "total_bits": 1069524064, + "gate_proj": { "group_size": { "6": 128 }, @@ -25199,7 +25199,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 128 }, @@ -25211,7 +25211,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 128 @@ -25228,9 +25228,9 @@ } }, { - "accuracy": 0.9951088999300018, - "total_bits": 4441539584, - "w1": { + "accuracy": 0.9943746531896881, + "total_bits": 1110384896, + "gate_proj": { "group_size": { "8": 128, "6": 128 @@ -25245,7 +25245,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128, "6": 128 @@ -25260,7 +25260,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128, "6": 128 @@ -25277,9 +25277,9 @@ } }, { - "accuracy": 0.9959307032220653, - "total_bits": 4839998464, - "w1": { + "accuracy": 0.9955550832918992, + "total_bits": 1209999616, + "gate_proj": { "group_size": { "8": 128, "6": 128 @@ -25294,7 +25294,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128, "6": 128 @@ -25309,7 +25309,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128 }, @@ -25323,9 +25323,9 @@ } }, { - "accuracy": 0.9986207535410742, - "total_bits": 5662082048, - "w1": { + "accuracy": 0.9983706212890531, + "total_bits": 1415520512, + "gate_proj": { "group_size": { "8": 128 }, @@ -25337,7 +25337,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128 }, @@ -25349,7 +25349,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128 }, @@ -25365,7 +25365,7 @@ ], "model.layers.13.self_attn": [ { - "accuracy": 0.9144012908402243, + "accuracy": 0.9373557000960174, "total_bits": 89665536, "q_proj": { "group_size": { @@ -25429,7 +25429,7 @@ } }, { - "accuracy": 0.9174986803217938, + "accuracy": 0.9397313634031698, "total_bits": 92221440, "q_proj": { "group_size": { @@ -25493,7 +25493,7 @@ } }, { - "accuracy": 0.9305551852050581, + "accuracy": 0.9468525999078625, "total_bits": 95758848, "q_proj": { "group_size": { @@ -25557,7 +25557,7 @@ } }, { - "accuracy": 0.945450622980532, + "accuracy": 0.9598459701396918, "total_bits": 112272384, "q_proj": { "group_size": { @@ -25621,7 +25621,7 @@ } }, { - "accuracy": 0.957477915737974, + "accuracy": 0.9686920179152175, "total_bits": 132913152, "q_proj": { "group_size": { @@ -25685,7 +25685,7 @@ } }, { - "accuracy": 0.9579711731915411, + "accuracy": 0.9690814191769612, "total_bits": 132980224, "q_proj": { "group_size": { @@ -25749,7 +25749,7 @@ } }, { - "accuracy": 0.9762551714911273, + "accuracy": 0.9826673616802222, "total_bits": 169613312, "q_proj": { "group_size": { @@ -25801,7 +25801,7 @@ } }, { - "accuracy": 0.9771409908211545, + "accuracy": 0.9830741789682131, "total_bits": 169745920, "q_proj": { "group_size": { @@ -25853,7 +25853,7 @@ } }, { - "accuracy": 0.9785458887682149, + "accuracy": 0.9842408546491673, "total_bits": 171195392, "q_proj": { "group_size": { @@ -25905,7 +25905,7 @@ } }, { - "accuracy": 0.9795245103243935, + "accuracy": 0.9850418758823684, "total_bits": 173563904, "q_proj": { "group_size": { @@ -25957,7 +25957,7 @@ } }, { - "accuracy": 0.9785206440444055, + "accuracy": 0.984390150510559, "total_bits": 174923264, "q_proj": { "group_size": { @@ -26021,7 +26021,7 @@ } }, { - "accuracy": 0.9802142043450945, + "accuracy": 0.9852917588953125, "total_bits": 175750144, "q_proj": { "group_size": { @@ -26085,7 +26085,7 @@ } }, { - "accuracy": 0.9828334983536288, + "accuracy": 0.987041928462292, "total_bits": 179253248, "q_proj": { "group_size": { @@ -26146,7 +26146,7 @@ } }, { - "accuracy": 0.9841697327792645, + "accuracy": 0.9880126629171795, "total_bits": 181592064, "q_proj": { "group_size": { @@ -26207,7 +26207,7 @@ } }, { - "accuracy": 0.9908477053732464, + "accuracy": 0.9930326923433888, "total_bits": 220469248, "q_proj": { "group_size": { @@ -26268,7 +26268,7 @@ } }, { - "accuracy": 0.9920735385170892, + "accuracy": 0.9940043881560039, "total_bits": 223535104, "q_proj": { "group_size": { @@ -26329,7 +26329,7 @@ } }, { - "accuracy": 0.9938684424728548, + "accuracy": 0.9955524501856416, "total_bits": 253499392, "q_proj": { "group_size": { @@ -26381,7 +26381,7 @@ } }, { - "accuracy": 0.9958915048597479, + "accuracy": 0.9968729533536947, "total_bits": 265838592, "q_proj": { "group_size": { @@ -26433,7 +26433,7 @@ } }, { - "accuracy": 0.9983690507330099, + "accuracy": 0.9987999525864756, "total_bits": 337385472, "q_proj": { "group_size": { @@ -26485,11 +26485,11 @@ } } ], - "model.layers.13.block_sparse_moe": [ + "model.layers.13.mlp": [ { - "accuracy": 0.9124777630755776, - "total_bits": 1581846784, - "w1": { + "accuracy": 0.9140017562006649, + "total_bits": 395461696, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -26504,7 +26504,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -26519,7 +26519,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "6": 32, "3": 64, @@ -26539,9 +26539,9 @@ } }, { - "accuracy": 0.91576033614968, - "total_bits": 1636897024, - "w1": { + "accuracy": 0.9167497140404425, + "total_bits": 409224256, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -26556,7 +26556,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -26571,7 +26571,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "6": 32, "3": 64, @@ -26591,9 +26591,9 @@ } }, { - "accuracy": 0.9270481390780524, - "total_bits": 1829089280, - "w1": { + "accuracy": 0.9299622797652295, + "total_bits": 457272320, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -26608,7 +26608,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -26623,7 +26623,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "5": 32, "3": 32 @@ -26640,9 +26640,9 @@ } }, { - "accuracy": 0.9294941776285046, - "total_bits": 2051911680, - "w1": { + "accuracy": 0.9338386684264007, + "total_bits": 512977920, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -26657,7 +26657,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -26672,7 +26672,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "5": 32, "4": 32 @@ -26689,9 +26689,9 @@ } }, { - "accuracy": 0.9585712462859719, - "total_bits": 2313589120, - "w1": { + "accuracy": 0.9565691607759187, + "total_bits": 578397280, + "gate_proj": { "group_size": { "4": 128, "3": 128 @@ -26706,7 +26706,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 128, "3": 128 @@ -26721,7 +26721,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128, @@ -26741,9 +26741,9 @@ } }, { - "accuracy": 0.9621162887074446, - "total_bits": 2371489792, - "w1": { + "accuracy": 0.9601395119373736, + "total_bits": 592872448, + "gate_proj": { "group_size": { "4": 32, "3": 32 @@ -26758,7 +26758,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32, "3": 32 @@ -26773,7 +26773,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 32, @@ -26793,9 +26793,9 @@ } }, { - "accuracy": 0.9659501760217705, - "total_bits": 2549817728, - "w1": { + "accuracy": 0.9655332125135159, + "total_bits": 637454432, + "gate_proj": { "group_size": { "4": 32, "3": 32 @@ -26810,7 +26810,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32, "3": 32 @@ -26825,7 +26825,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128 @@ -26842,9 +26842,9 @@ } }, { - "accuracy": 0.979063157188265, - "total_bits": 2914965888, - "w1": { + "accuracy": 0.9771511765000852, + "total_bits": 728741472, + "gate_proj": { "group_size": { "4": 128 }, @@ -26856,7 +26856,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32 }, @@ -26868,7 +26868,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128 @@ -26885,9 +26885,9 @@ } }, { - "accuracy": 0.9806926975418863, - "total_bits": 2957905920, - "w1": { + "accuracy": 0.9790309963649825, + "total_bits": 739476480, + "gate_proj": { "group_size": { "4": 32 }, @@ -26899,7 +26899,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32 }, @@ -26911,7 +26911,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 32 @@ -26928,9 +26928,9 @@ } }, { - "accuracy": 0.9795198769455677, - "total_bits": 3006173568, - "w1": { + "accuracy": 0.9778062307618951, + "total_bits": 751543392, + "gate_proj": { "group_size": { "5": 128, "4": 128 @@ -26945,7 +26945,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "5": 128, "4": 128 @@ -26960,7 +26960,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "5": 128, @@ -26980,9 +26980,9 @@ } }, { - "accuracy": 0.9821243185648009, - "total_bits": 3064074240, - "w1": { + "accuracy": 0.9805724001244495, + "total_bits": 766018560, + "gate_proj": { "group_size": { "5": 32, "4": 32 @@ -26997,7 +26997,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "5": 32, "4": 32 @@ -27012,7 +27012,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "5": 32, @@ -27032,9 +27032,9 @@ } }, { - "accuracy": 0.9898580933773988, - "total_bits": 3698758016, - "w1": { + "accuracy": 0.9886581419820064, + "total_bits": 924689504, + "gate_proj": { "group_size": { "6": 128, "5": 128 @@ -27049,7 +27049,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 128, "5": 128 @@ -27064,7 +27064,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 128, @@ -27084,9 +27084,9 @@ } }, { - "accuracy": 0.9913554838368375, - "total_bits": 3756658688, - "w1": { + "accuracy": 0.9902876732184699, + "total_bits": 939164672, + "gate_proj": { "group_size": { "6": 32, "5": 32 @@ -27101,7 +27101,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 32, "5": 32 @@ -27116,7 +27116,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 32, @@ -27136,9 +27136,9 @@ } }, { - "accuracy": 0.9945427862265589, - "total_bits": 4278096256, - "w1": { + "accuracy": 0.9937138643213793, + "total_bits": 1069524064, + "gate_proj": { "group_size": { "6": 128 }, @@ -27150,7 +27150,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 128 }, @@ -27162,7 +27162,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 128 @@ -27179,9 +27179,9 @@ } }, { - "accuracy": 0.994988271035254, - "total_bits": 4441539584, - "w1": { + "accuracy": 0.9942322439808202, + "total_bits": 1110384896, + "gate_proj": { "group_size": { "8": 128, "6": 128 @@ -27196,7 +27196,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128, "6": 128 @@ -27211,7 +27211,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128, "6": 128 @@ -27228,9 +27228,9 @@ } }, { - "accuracy": 0.995795306100167, - "total_bits": 4839998464, - "w1": { + "accuracy": 0.9954047784527862, + "total_bits": 1209999616, + "gate_proj": { "group_size": { "8": 128, "6": 128 @@ -27245,7 +27245,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128, "6": 128 @@ -27260,7 +27260,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128 }, @@ -27274,9 +27274,9 @@ } }, { - "accuracy": 0.9985821192263087, - "total_bits": 5662082048, - "w1": { + "accuracy": 0.9983155273060363, + "total_bits": 1415520512, + "gate_proj": { "group_size": { "8": 128 }, @@ -27288,7 +27288,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128 }, @@ -27300,7 +27300,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128 }, @@ -27316,7 +27316,7 @@ ], "model.layers.14.self_attn": [ { - "accuracy": 0.916202432623035, + "accuracy": 0.9366658794644632, "total_bits": 89665536, "q_proj": { "group_size": { @@ -27380,7 +27380,7 @@ } }, { - "accuracy": 0.9180160927537241, + "accuracy": 0.9388638801480594, "total_bits": 92221440, "q_proj": { "group_size": { @@ -27444,7 +27444,7 @@ } }, { - "accuracy": 0.92972700062551, + "accuracy": 0.9452546873178921, "total_bits": 95758848, "q_proj": { "group_size": { @@ -27508,7 +27508,7 @@ } }, { - "accuracy": 0.9459662723698115, + "accuracy": 0.9589772350890071, "total_bits": 112272384, "q_proj": { "group_size": { @@ -27572,7 +27572,7 @@ } }, { - "accuracy": 0.9582060490195689, + "accuracy": 0.9677323483322796, "total_bits": 132913152, "q_proj": { "group_size": { @@ -27636,7 +27636,7 @@ } }, { - "accuracy": 0.9586123284932814, + "accuracy": 0.96817981010597, "total_bits": 132980224, "q_proj": { "group_size": { @@ -27700,7 +27700,7 @@ } }, { - "accuracy": 0.9754843556959378, + "accuracy": 0.9812583496401969, "total_bits": 169613312, "q_proj": { "group_size": { @@ -27752,7 +27752,7 @@ } }, { - "accuracy": 0.9761458419655499, + "accuracy": 0.9816225999966264, "total_bits": 169745920, "q_proj": { "group_size": { @@ -27804,7 +27804,7 @@ } }, { - "accuracy": 0.9778668781448352, + "accuracy": 0.982910465056959, "total_bits": 171195392, "q_proj": { "group_size": { @@ -27856,7 +27856,7 @@ } }, { - "accuracy": 0.9793567784424675, + "accuracy": 0.9839204324801502, "total_bits": 173563904, "q_proj": { "group_size": { @@ -27908,7 +27908,7 @@ } }, { - "accuracy": 0.9788200702322157, + "accuracy": 0.9838033046965536, "total_bits": 174923264, "q_proj": { "group_size": { @@ -27972,7 +27972,7 @@ } }, { - "accuracy": 0.9802238134863345, + "accuracy": 0.984805405286974, "total_bits": 175750144, "q_proj": { "group_size": { @@ -28036,7 +28036,7 @@ } }, { - "accuracy": 0.9824789450748971, + "accuracy": 0.9862614001735652, "total_bits": 179253248, "q_proj": { "group_size": { @@ -28097,7 +28097,7 @@ } }, { - "accuracy": 0.9839427672620666, + "accuracy": 0.9873551047211023, "total_bits": 181592064, "q_proj": { "group_size": { @@ -28158,7 +28158,7 @@ } }, { - "accuracy": 0.9904928588749546, + "accuracy": 0.9925091162471003, "total_bits": 220469248, "q_proj": { "group_size": { @@ -28219,7 +28219,7 @@ } }, { - "accuracy": 0.9920184795842751, + "accuracy": 0.9936672189689585, "total_bits": 223535104, "q_proj": { "group_size": { @@ -28280,7 +28280,7 @@ } }, { - "accuracy": 0.9936674176715314, + "accuracy": 0.9951060084675095, "total_bits": 253499392, "q_proj": { "group_size": { @@ -28332,7 +28332,7 @@ } }, { - "accuracy": 0.9956683734260303, + "accuracy": 0.9964925019242066, "total_bits": 265838592, "q_proj": { "group_size": { @@ -28384,7 +28384,7 @@ } }, { - "accuracy": 0.9983236914880476, + "accuracy": 0.9987114007848217, "total_bits": 337385472, "q_proj": { "group_size": { @@ -28436,11 +28436,11 @@ } } ], - "model.layers.14.block_sparse_moe": [ + "model.layers.14.mlp": [ { - "accuracy": 0.9074882612024483, - "total_bits": 1581846784, - "w1": { + "accuracy": 0.9095255297265554, + "total_bits": 395461696, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -28455,7 +28455,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -28470,7 +28470,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "6": 32, "3": 64, @@ -28490,9 +28490,9 @@ } }, { - "accuracy": 0.9110314002946803, - "total_bits": 1636897024, - "w1": { + "accuracy": 0.9125240276518621, + "total_bits": 409224256, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -28507,7 +28507,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -28522,7 +28522,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "6": 32, "3": 64, @@ -28542,9 +28542,9 @@ } }, { - "accuracy": 0.9229472097205489, - "total_bits": 1829089280, - "w1": { + "accuracy": 0.9264011349725096, + "total_bits": 457272320, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -28559,7 +28559,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -28574,7 +28574,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "5": 32, "3": 32 @@ -28591,9 +28591,9 @@ } }, { - "accuracy": 0.9256350586288854, - "total_bits": 2051911680, - "w1": { + "accuracy": 0.9307416475525028, + "total_bits": 512977920, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -28608,7 +28608,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -28623,7 +28623,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "5": 32, "4": 32 @@ -28640,9 +28640,9 @@ } }, { - "accuracy": 0.9562946789358792, - "total_bits": 2313589120, - "w1": { + "accuracy": 0.9542000711356339, + "total_bits": 578397280, + "gate_proj": { "group_size": { "4": 128, "3": 128 @@ -28657,7 +28657,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 128, "3": 128 @@ -28672,7 +28672,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128, @@ -28692,9 +28692,9 @@ } }, { - "accuracy": 0.9599329980188295, - "total_bits": 2371489792, - "w1": { + "accuracy": 0.9580087925454503, + "total_bits": 592872448, + "gate_proj": { "group_size": { "4": 32, "3": 32 @@ -28709,7 +28709,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32, "3": 32 @@ -28724,7 +28724,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 32, @@ -28744,9 +28744,9 @@ } }, { - "accuracy": 0.9640237304725146, - "total_bits": 2549817728, - "w1": { + "accuracy": 0.9637619938309255, + "total_bits": 637454432, + "gate_proj": { "group_size": { "4": 32, "3": 32 @@ -28761,7 +28761,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32, "3": 32 @@ -28776,7 +28776,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128 @@ -28793,9 +28793,9 @@ } }, { - "accuracy": 0.9776939484535864, - "total_bits": 2914965888, - "w1": { + "accuracy": 0.975708099866384, + "total_bits": 728741472, + "gate_proj": { "group_size": { "4": 128 }, @@ -28807,7 +28807,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32 }, @@ -28819,7 +28819,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128 @@ -28836,9 +28836,9 @@ } }, { - "accuracy": 0.9793846583189932, - "total_bits": 2957905920, - "w1": { + "accuracy": 0.9777254358326134, + "total_bits": 739476480, + "gate_proj": { "group_size": { "4": 32 }, @@ -28850,7 +28850,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32 }, @@ -28862,7 +28862,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 32 @@ -28879,9 +28879,9 @@ } }, { - "accuracy": 0.978274950365487, - "total_bits": 3006173568, - "w1": { + "accuracy": 0.9765865155250618, + "total_bits": 751543392, + "gate_proj": { "group_size": { "5": 128, "4": 128 @@ -28896,7 +28896,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "5": 128, "4": 128 @@ -28911,7 +28911,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "5": 128, @@ -28931,9 +28931,9 @@ } }, { - "accuracy": 0.9810984523868874, - "total_bits": 3064074240, - "w1": { + "accuracy": 0.9795258572619212, + "total_bits": 766018560, + "gate_proj": { "group_size": { "5": 32, "4": 32 @@ -28948,7 +28948,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "5": 32, "4": 32 @@ -28963,7 +28963,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "5": 32, @@ -28983,9 +28983,9 @@ } }, { - "accuracy": 0.9892634971459445, - "total_bits": 3698758016, - "w1": { + "accuracy": 0.9880328797245104, + "total_bits": 924689504, + "gate_proj": { "group_size": { "6": 128, "5": 128 @@ -29000,7 +29000,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 128, "5": 128 @@ -29015,7 +29015,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 128, @@ -29035,9 +29035,9 @@ } }, { - "accuracy": 0.990830084210948, - "total_bits": 3756658688, - "w1": { + "accuracy": 0.9897575417876636, + "total_bits": 939164672, + "gate_proj": { "group_size": { "6": 32, "5": 32 @@ -29052,7 +29052,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 32, "5": 32 @@ -29067,7 +29067,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 32, @@ -29087,9 +29087,9 @@ } }, { - "accuracy": 0.994142733318241, - "total_bits": 4278096256, - "w1": { + "accuracy": 0.9932938322155295, + "total_bits": 1069524064, + "gate_proj": { "group_size": { "6": 128 }, @@ -29101,7 +29101,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 128 }, @@ -29113,7 +29113,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 128 @@ -29130,9 +29130,9 @@ } }, { - "accuracy": 0.9946599451359361, - "total_bits": 4441539584, - "w1": { + "accuracy": 0.9939226419542377, + "total_bits": 1110384896, + "gate_proj": { "group_size": { "8": 128, "6": 128 @@ -29147,7 +29147,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128, "6": 128 @@ -29162,7 +29162,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128, "6": 128 @@ -29179,9 +29179,9 @@ } }, { - "accuracy": 0.9955196939041152, - "total_bits": 4839998464, - "w1": { + "accuracy": 0.9951866719087488, + "total_bits": 1209999616, + "gate_proj": { "group_size": { "8": 128, "6": 128 @@ -29196,7 +29196,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128, "6": 128 @@ -29211,7 +29211,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128 }, @@ -29225,9 +29225,9 @@ } }, { - "accuracy": 0.9984644169147175, - "total_bits": 5662082048, - "w1": { + "accuracy": 0.9982010111041171, + "total_bits": 1415520512, + "gate_proj": { "group_size": { "8": 128 }, @@ -29239,7 +29239,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128 }, @@ -29251,7 +29251,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128 }, @@ -29267,7 +29267,7 @@ ], "model.layers.15.self_attn": [ { - "accuracy": 0.910214204733309, + "accuracy": 0.9323553715489412, "total_bits": 89665536, "q_proj": { "group_size": { @@ -29331,7 +29331,7 @@ } }, { - "accuracy": 0.9127265961938783, + "accuracy": 0.9339591564708635, "total_bits": 92221440, "q_proj": { "group_size": { @@ -29395,7 +29395,7 @@ } }, { - "accuracy": 0.9264898219783055, + "accuracy": 0.9427500846550653, "total_bits": 95758848, "q_proj": { "group_size": { @@ -29459,7 +29459,7 @@ } }, { - "accuracy": 0.9431474404899698, + "accuracy": 0.9570052352194723, "total_bits": 112272384, "q_proj": { "group_size": { @@ -29523,7 +29523,7 @@ } }, { - "accuracy": 0.9547215639368484, + "accuracy": 0.9652711602025911, "total_bits": 132913152, "q_proj": { "group_size": { @@ -29587,7 +29587,7 @@ } }, { - "accuracy": 0.9550610188590853, + "accuracy": 0.9656447923105014, "total_bits": 132980224, "q_proj": { "group_size": { @@ -29651,7 +29651,7 @@ } }, { - "accuracy": 0.9738762601229706, + "accuracy": 0.9805942362566528, "total_bits": 169613312, "q_proj": { "group_size": { @@ -29703,7 +29703,7 @@ } }, { - "accuracy": 0.9748681478790546, + "accuracy": 0.9811192372029549, "total_bits": 169745920, "q_proj": { "group_size": { @@ -29755,7 +29755,7 @@ } }, { - "accuracy": 0.9769428834613216, + "accuracy": 0.9828172509901618, "total_bits": 171195392, "q_proj": { "group_size": { @@ -29807,7 +29807,7 @@ } }, { - "accuracy": 0.9780246798243177, + "accuracy": 0.9837071365725837, "total_bits": 173563904, "q_proj": { "group_size": { @@ -29859,7 +29859,7 @@ } }, { - "accuracy": 0.9770336125634218, + "accuracy": 0.9822727887352046, "total_bits": 174923264, "q_proj": { "group_size": { @@ -29923,7 +29923,7 @@ } }, { - "accuracy": 0.9786481051460693, + "accuracy": 0.9837061848099294, "total_bits": 175750144, "q_proj": { "group_size": { @@ -29987,7 +29987,7 @@ } }, { - "accuracy": 0.9816839022954044, + "accuracy": 0.9856124072531728, "total_bits": 179253248, "q_proj": { "group_size": { @@ -30048,7 +30048,7 @@ } }, { - "accuracy": 0.9834264408012754, + "accuracy": 0.9869205776700064, "total_bits": 181592064, "q_proj": { "group_size": { @@ -30109,7 +30109,7 @@ } }, { - "accuracy": 0.9898895459543717, + "accuracy": 0.9920928015520698, "total_bits": 220469248, "q_proj": { "group_size": { @@ -30170,7 +30170,7 @@ } }, { - "accuracy": 0.9916064394392857, + "accuracy": 0.9934168596457886, "total_bits": 223535104, "q_proj": { "group_size": { @@ -30231,7 +30231,7 @@ } }, { - "accuracy": 0.993178044960491, + "accuracy": 0.9949097848056179, "total_bits": 253499392, "q_proj": { "group_size": { @@ -30283,7 +30283,7 @@ } }, { - "accuracy": 0.9957729683772317, + "accuracy": 0.9966491204073751, "total_bits": 265838592, "q_proj": { "group_size": { @@ -30335,7 +30335,7 @@ } }, { - "accuracy": 0.9981960479574474, + "accuracy": 0.9986730016136885, "total_bits": 337385472, "q_proj": { "group_size": { @@ -30387,11 +30387,11 @@ } } ], - "model.layers.15.block_sparse_moe": [ + "model.layers.15.mlp": [ { - "accuracy": 0.896523899545795, - "total_bits": 1581846784, - "w1": { + "accuracy": 0.9039521864370296, + "total_bits": 395461696, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -30406,7 +30406,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -30421,7 +30421,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "6": 32, "3": 64, @@ -30441,9 +30441,9 @@ } }, { - "accuracy": 0.9002315245176616, - "total_bits": 1636897024, - "w1": { + "accuracy": 0.9071239472219819, + "total_bits": 409224256, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -30458,7 +30458,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -30473,7 +30473,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "6": 32, "3": 64, @@ -30493,9 +30493,9 @@ } }, { - "accuracy": 0.9157736024966365, - "total_bits": 1829089280, - "w1": { + "accuracy": 0.9223991297185421, + "total_bits": 457272320, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -30510,7 +30510,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -30525,7 +30525,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "5": 32, "3": 32 @@ -30542,9 +30542,9 @@ } }, { - "accuracy": 0.9195202847844676, - "total_bits": 2051911680, - "w1": { + "accuracy": 0.926959704999861, + "total_bits": 512977920, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -30559,7 +30559,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -30574,7 +30574,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "5": 32, "4": 32 @@ -30591,9 +30591,9 @@ } }, { - "accuracy": 0.9502214640378952, - "total_bits": 2313589120, - "w1": { + "accuracy": 0.9517026157363465, + "total_bits": 578397280, + "gate_proj": { "group_size": { "4": 128, "3": 128 @@ -30608,7 +30608,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 128, "3": 128 @@ -30623,7 +30623,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128, @@ -30643,9 +30643,9 @@ } }, { - "accuracy": 0.9547426288849429, - "total_bits": 2371489792, - "w1": { + "accuracy": 0.9556571107945944, + "total_bits": 592872448, + "gate_proj": { "group_size": { "4": 32, "3": 32 @@ -30660,7 +30660,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32, "3": 32 @@ -30675,7 +30675,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 32, @@ -30695,9 +30695,9 @@ } }, { - "accuracy": 0.9600102658334532, - "total_bits": 2549817728, - "w1": { + "accuracy": 0.9618644555540461, + "total_bits": 637454432, + "gate_proj": { "group_size": { "4": 32, "3": 32 @@ -30712,7 +30712,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32, "3": 32 @@ -30727,7 +30727,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128 @@ -30744,9 +30744,9 @@ } }, { - "accuracy": 0.9741458232073408, - "total_bits": 2914965888, - "w1": { + "accuracy": 0.9745870660010137, + "total_bits": 728741472, + "gate_proj": { "group_size": { "4": 128 }, @@ -30758,7 +30758,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32 }, @@ -30770,7 +30770,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128 @@ -30787,9 +30787,9 @@ } }, { - "accuracy": 0.9763929899781942, - "total_bits": 2957905920, - "w1": { + "accuracy": 0.9766394538608821, + "total_bits": 739476480, + "gate_proj": { "group_size": { "4": 32 }, @@ -30801,7 +30801,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32 }, @@ -30813,7 +30813,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 32 @@ -30830,9 +30830,9 @@ } }, { - "accuracy": 0.9748979177522031, - "total_bits": 3006173568, - "w1": { + "accuracy": 0.975351951281099, + "total_bits": 751543392, + "gate_proj": { "group_size": { "5": 128, "4": 128 @@ -30847,7 +30847,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "5": 128, "4": 128 @@ -30862,7 +30862,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "5": 128, @@ -30882,9 +30882,9 @@ } }, { - "accuracy": 0.9782903827236671, - "total_bits": 3064074240, - "w1": { + "accuracy": 0.9783967442222332, + "total_bits": 766018560, + "gate_proj": { "group_size": { "5": 32, "4": 32 @@ -30899,7 +30899,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "5": 32, "4": 32 @@ -30914,7 +30914,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "5": 32, @@ -30934,9 +30934,9 @@ } }, { - "accuracy": 0.9873457340229499, - "total_bits": 3698758016, - "w1": { + "accuracy": 0.9874154011669912, + "total_bits": 924689504, + "gate_proj": { "group_size": { "6": 128, "5": 128 @@ -30951,7 +30951,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 128, "5": 128 @@ -30966,7 +30966,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 128, @@ -30986,9 +30986,9 @@ } }, { - "accuracy": 0.9892727465760943, - "total_bits": 3756658688, - "w1": { + "accuracy": 0.9892147927741078, + "total_bits": 939164672, + "gate_proj": { "group_size": { "6": 32, "5": 32 @@ -31003,7 +31003,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 32, "5": 32 @@ -31018,7 +31018,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 32, @@ -31038,9 +31038,9 @@ } }, { - "accuracy": 0.9930088063701987, - "total_bits": 4278096256, - "w1": { + "accuracy": 0.993021795783486, + "total_bits": 1069524064, + "gate_proj": { "group_size": { "6": 128 }, @@ -31052,7 +31052,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 128 }, @@ -31064,7 +31064,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 128 @@ -31081,9 +31081,9 @@ } }, { - "accuracy": 0.9935342693995488, - "total_bits": 4441539584, - "w1": { + "accuracy": 0.9935941275723866, + "total_bits": 1110384896, + "gate_proj": { "group_size": { "8": 128, "6": 128 @@ -31098,7 +31098,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128, "6": 128 @@ -31113,7 +31113,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128, "6": 128 @@ -31130,9 +31130,9 @@ } }, { - "accuracy": 0.9946590676462572, - "total_bits": 4839998464, - "w1": { + "accuracy": 0.9949383722299612, + "total_bits": 1209999616, + "gate_proj": { "group_size": { "8": 128, "6": 128 @@ -31147,7 +31147,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128, "6": 128 @@ -31162,7 +31162,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128 }, @@ -31176,9 +31176,9 @@ } }, { - "accuracy": 0.9981502383322406, - "total_bits": 5662082048, - "w1": { + "accuracy": 0.9981253623018825, + "total_bits": 1415520512, + "gate_proj": { "group_size": { "8": 128 }, @@ -31190,7 +31190,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128 }, @@ -31202,7 +31202,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128 }, @@ -31218,7 +31218,7 @@ ], "model.layers.16.self_attn": [ { - "accuracy": 0.9248751519541991, + "accuracy": 0.9404891268595269, "total_bits": 89665536, "q_proj": { "group_size": { @@ -31282,7 +31282,7 @@ } }, { - "accuracy": 0.9273689406874933, + "accuracy": 0.9430430176618853, "total_bits": 92221440, "q_proj": { "group_size": { @@ -31346,7 +31346,7 @@ } }, { - "accuracy": 0.9380726167245915, + "accuracy": 0.950142603465601, "total_bits": 95758848, "q_proj": { "group_size": { @@ -31410,7 +31410,7 @@ } }, { - "accuracy": 0.9524976234687001, + "accuracy": 0.9628151758132797, "total_bits": 112272384, "q_proj": { "group_size": { @@ -31474,7 +31474,7 @@ } }, { - "accuracy": 0.9621274505594843, + "accuracy": 0.9697161659010147, "total_bits": 132913152, "q_proj": { "group_size": { @@ -31538,7 +31538,7 @@ } }, { - "accuracy": 0.9629117920994759, + "accuracy": 0.9705624160797972, "total_bits": 132980224, "q_proj": { "group_size": { @@ -31602,7 +31602,7 @@ } }, { - "accuracy": 0.9766717172766987, + "accuracy": 0.9823217015027216, "total_bits": 169613312, "q_proj": { "group_size": { @@ -31654,7 +31654,7 @@ } }, { - "accuracy": 0.9781217413807386, + "accuracy": 0.9830186661136778, "total_bits": 169745920, "q_proj": { "group_size": { @@ -31706,7 +31706,7 @@ } }, { - "accuracy": 0.9796651428271281, + "accuracy": 0.9843173222616315, "total_bits": 171195392, "q_proj": { "group_size": { @@ -31758,7 +31758,7 @@ } }, { - "accuracy": 0.980605088321394, + "accuracy": 0.9851114478844561, "total_bits": 173563904, "q_proj": { "group_size": { @@ -31810,7 +31810,7 @@ } }, { - "accuracy": 0.98085281512651, + "accuracy": 0.9849206057720279, "total_bits": 174923264, "q_proj": { "group_size": { @@ -31874,7 +31874,7 @@ } }, { - "accuracy": 0.9822097716264819, + "accuracy": 0.9858800395342865, "total_bits": 175750144, "q_proj": { "group_size": { @@ -31938,7 +31938,7 @@ } }, { - "accuracy": 0.9844745430899294, + "accuracy": 0.9875051402486861, "total_bits": 179253248, "q_proj": { "group_size": { @@ -31999,7 +31999,7 @@ } }, { - "accuracy": 0.985576358035599, + "accuracy": 0.9884646587367905, "total_bits": 181592064, "q_proj": { "group_size": { @@ -32060,7 +32060,7 @@ } }, { - "accuracy": 0.9914003307931125, + "accuracy": 0.9931502686200762, "total_bits": 220469248, "q_proj": { "group_size": { @@ -32121,7 +32121,7 @@ } }, { - "accuracy": 0.9927930525553069, + "accuracy": 0.9942170133887741, "total_bits": 223535104, "q_proj": { "group_size": { @@ -32182,7 +32182,7 @@ } }, { - "accuracy": 0.993938328771803, + "accuracy": 0.9953975016986462, "total_bits": 253499392, "q_proj": { "group_size": { @@ -32234,7 +32234,7 @@ } }, { - "accuracy": 0.9961202110433461, + "accuracy": 0.9969012692301092, "total_bits": 265838592, "q_proj": { "group_size": { @@ -32286,7 +32286,7 @@ } }, { - "accuracy": 0.9983754179219863, + "accuracy": 0.9987853840299833, "total_bits": 337385472, "q_proj": { "group_size": { @@ -32338,11 +32338,11 @@ } } ], - "model.layers.16.block_sparse_moe": [ + "model.layers.16.mlp": [ { - "accuracy": 0.9019071718579844, - "total_bits": 1581846784, - "w1": { + "accuracy": 0.9045525161843551, + "total_bits": 395461696, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -32357,7 +32357,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -32372,7 +32372,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "6": 32, "3": 64, @@ -32392,9 +32392,9 @@ } }, { - "accuracy": 0.905277032601206, - "total_bits": 1636897024, - "w1": { + "accuracy": 0.9075992703437805, + "total_bits": 409224256, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -32409,7 +32409,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -32424,7 +32424,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "6": 32, "3": 64, @@ -32444,9 +32444,9 @@ } }, { - "accuracy": 0.9202075926096815, - "total_bits": 1829089280, - "w1": { + "accuracy": 0.9225928632444457, + "total_bits": 457272320, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -32461,7 +32461,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -32476,7 +32476,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "5": 32, "3": 32 @@ -32493,9 +32493,9 @@ } }, { - "accuracy": 0.9238606767826959, - "total_bits": 2051911680, - "w1": { + "accuracy": 0.927222442274031, + "total_bits": 512977920, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -32510,7 +32510,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -32525,7 +32525,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "5": 32, "4": 32 @@ -32542,9 +32542,9 @@ } }, { - "accuracy": 0.9534354803985671, - "total_bits": 2313589120, - "w1": { + "accuracy": 0.9519430462663111, + "total_bits": 578397280, + "gate_proj": { "group_size": { "4": 128, "3": 128 @@ -32559,7 +32559,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 128, "3": 128 @@ -32574,7 +32574,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128, @@ -32594,9 +32594,9 @@ } }, { - "accuracy": 0.9574646944866368, - "total_bits": 2371489792, - "w1": { + "accuracy": 0.9558149303652739, + "total_bits": 592872448, + "gate_proj": { "group_size": { "4": 32, "3": 32 @@ -32611,7 +32611,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32, "3": 32 @@ -32626,7 +32626,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 32, @@ -32646,9 +32646,9 @@ } }, { - "accuracy": 0.9627644543192888, - "total_bits": 2549817728, - "w1": { + "accuracy": 0.962022663927392, + "total_bits": 637454432, + "gate_proj": { "group_size": { "4": 32, "3": 32 @@ -32663,7 +32663,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32, "3": 32 @@ -32678,7 +32678,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128 @@ -32695,9 +32695,9 @@ } }, { - "accuracy": 0.976499177123371, - "total_bits": 2914965888, - "w1": { + "accuracy": 0.9747046179775345, + "total_bits": 728741472, + "gate_proj": { "group_size": { "4": 128 }, @@ -32709,7 +32709,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32 }, @@ -32721,7 +32721,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128 @@ -32738,9 +32738,9 @@ } }, { - "accuracy": 0.9785081886833435, - "total_bits": 2957905920, - "w1": { + "accuracy": 0.976777723451194, + "total_bits": 739476480, + "gate_proj": { "group_size": { "4": 32 }, @@ -32752,7 +32752,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32 }, @@ -32764,7 +32764,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 32 @@ -32781,9 +32781,9 @@ } }, { - "accuracy": 0.9770585900956863, - "total_bits": 3006173568, - "w1": { + "accuracy": 0.9754684797831272, + "total_bits": 751543392, + "gate_proj": { "group_size": { "5": 128, "4": 128 @@ -32798,7 +32798,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "5": 128, "4": 128 @@ -32813,7 +32813,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "5": 128, @@ -32833,9 +32833,9 @@ } }, { - "accuracy": 0.9801160721598488, - "total_bits": 3064074240, - "w1": { + "accuracy": 0.9784994658670927, + "total_bits": 766018560, + "gate_proj": { "group_size": { "5": 32, "4": 32 @@ -32850,7 +32850,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "5": 32, "4": 32 @@ -32865,7 +32865,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "5": 32, @@ -32885,9 +32885,9 @@ } }, { - "accuracy": 0.9887693278050345, - "total_bits": 3698758016, - "w1": { + "accuracy": 0.9874803159925106, + "total_bits": 924689504, + "gate_proj": { "group_size": { "6": 128, "5": 128 @@ -32902,7 +32902,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 128, "5": 128 @@ -32917,7 +32917,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 128, @@ -32937,9 +32937,9 @@ } }, { - "accuracy": 0.9904636533097609, - "total_bits": 3756658688, - "w1": { + "accuracy": 0.9892622225819842, + "total_bits": 939164672, + "gate_proj": { "group_size": { "6": 32, "5": 32 @@ -32954,7 +32954,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 32, "5": 32 @@ -32969,7 +32969,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 32, @@ -32989,9 +32989,9 @@ } }, { - "accuracy": 0.993983720894903, - "total_bits": 4278096256, - "w1": { + "accuracy": 0.9930526322170504, + "total_bits": 1069524064, + "gate_proj": { "group_size": { "6": 128 }, @@ -33003,7 +33003,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 128 }, @@ -33015,7 +33015,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 128 @@ -33032,9 +33032,9 @@ } }, { - "accuracy": 0.994502533116917, - "total_bits": 4441539584, - "w1": { + "accuracy": 0.9936390833831147, + "total_bits": 1110384896, + "gate_proj": { "group_size": { "8": 128, "6": 128 @@ -33049,7 +33049,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128, "6": 128 @@ -33064,7 +33064,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128, "6": 128 @@ -33081,9 +33081,9 @@ } }, { - "accuracy": 0.9955992466083875, - "total_bits": 4839998464, - "w1": { + "accuracy": 0.9949562736223206, + "total_bits": 1209999616, + "gate_proj": { "group_size": { "8": 128, "6": 128 @@ -33098,7 +33098,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128, "6": 128 @@ -33113,7 +33113,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128 }, @@ -33127,9 +33127,9 @@ } }, { - "accuracy": 0.9983886312938443, - "total_bits": 5662082048, - "w1": { + "accuracy": 0.9981364282583328, + "total_bits": 1415520512, + "gate_proj": { "group_size": { "8": 128 }, @@ -33141,7 +33141,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128 }, @@ -33153,7 +33153,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128 }, @@ -33169,7 +33169,7 @@ ], "model.layers.17.self_attn": [ { - "accuracy": 0.927353847967951, + "accuracy": 0.9442092272403994, "total_bits": 89665536, "q_proj": { "group_size": { @@ -33233,7 +33233,7 @@ } }, { - "accuracy": 0.9329249196146664, + "accuracy": 0.9473236686501064, "total_bits": 92221440, "q_proj": { "group_size": { @@ -33297,7 +33297,7 @@ } }, { - "accuracy": 0.943610639752526, + "accuracy": 0.9546649388380741, "total_bits": 95758848, "q_proj": { "group_size": { @@ -33361,7 +33361,7 @@ } }, { - "accuracy": 0.9554822593927383, + "accuracy": 0.9648497660497302, "total_bits": 112272384, "q_proj": { "group_size": { @@ -33425,7 +33425,7 @@ } }, { - "accuracy": 0.9631625336447829, + "accuracy": 0.9718471669444912, "total_bits": 132913152, "q_proj": { "group_size": { @@ -33489,7 +33489,7 @@ } }, { - "accuracy": 0.9658305678320558, + "accuracy": 0.9725092883760992, "total_bits": 132980224, "q_proj": { "group_size": { @@ -33553,7 +33553,7 @@ } }, { - "accuracy": 0.9789580759151202, + "accuracy": 0.9840643891672555, "total_bits": 169613312, "q_proj": { "group_size": { @@ -33605,7 +33605,7 @@ } }, { - "accuracy": 0.9800687248965627, + "accuracy": 0.9845438579489526, "total_bits": 169745920, "q_proj": { "group_size": { @@ -33657,7 +33657,7 @@ } }, { - "accuracy": 0.9813235665328408, + "accuracy": 0.9853999054451522, "total_bits": 171195392, "q_proj": { "group_size": { @@ -33709,7 +33709,7 @@ } }, { - "accuracy": 0.9820889919310024, + "accuracy": 0.9861153933268628, "total_bits": 173563904, "q_proj": { "group_size": { @@ -33761,7 +33761,7 @@ } }, { - "accuracy": 0.9819212372561819, + "accuracy": 0.9855584967881441, "total_bits": 174923264, "q_proj": { "group_size": { @@ -33825,7 +33825,7 @@ } }, { - "accuracy": 0.9832243367441391, + "accuracy": 0.9866977575921306, "total_bits": 175750144, "q_proj": { "group_size": { @@ -33889,7 +33889,7 @@ } }, { - "accuracy": 0.985870974334447, + "accuracy": 0.9886023730872885, "total_bits": 179253248, "q_proj": { "group_size": { @@ -33950,7 +33950,7 @@ } }, { - "accuracy": 0.9867824900797323, + "accuracy": 0.989564807644408, "total_bits": 181592064, "q_proj": { "group_size": { @@ -34011,7 +34011,7 @@ } }, { - "accuracy": 0.9919136155053581, + "accuracy": 0.9937973425993206, "total_bits": 220469248, "q_proj": { "group_size": { @@ -34072,7 +34072,7 @@ } }, { - "accuracy": 0.9933549004459852, + "accuracy": 0.9947491195052862, "total_bits": 223535104, "q_proj": { "group_size": { @@ -34133,7 +34133,7 @@ } }, { - "accuracy": 0.9943032974230224, + "accuracy": 0.9958191560948954, "total_bits": 253499392, "q_proj": { "group_size": { @@ -34185,7 +34185,7 @@ } }, { - "accuracy": 0.9966109868531164, + "accuracy": 0.9972895059219905, "total_bits": 265838592, "q_proj": { "group_size": { @@ -34237,7 +34237,7 @@ } }, { - "accuracy": 0.9984831992156902, + "accuracy": 0.9988775234791989, "total_bits": 337385472, "q_proj": { "group_size": { @@ -34289,11 +34289,11 @@ } } ], - "model.layers.17.block_sparse_moe": [ + "model.layers.17.mlp": [ { - "accuracy": 0.9048774967852392, - "total_bits": 1581846784, - "w1": { + "accuracy": 0.9030087170632262, + "total_bits": 395461696, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -34308,7 +34308,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -34323,7 +34323,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "6": 32, "3": 64, @@ -34343,9 +34343,9 @@ } }, { - "accuracy": 0.9083912894129753, - "total_bits": 1636897024, - "w1": { + "accuracy": 0.9060020450698703, + "total_bits": 409224256, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -34360,7 +34360,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -34375,7 +34375,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "6": 32, "3": 64, @@ -34395,9 +34395,9 @@ } }, { - "accuracy": 0.9211463316490776, - "total_bits": 1829089280, - "w1": { + "accuracy": 0.9215387941191071, + "total_bits": 457272320, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -34412,7 +34412,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -34427,7 +34427,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "5": 32, "3": 32 @@ -34444,9 +34444,9 @@ } }, { - "accuracy": 0.9240363555911341, - "total_bits": 2051911680, - "w1": { + "accuracy": 0.9262980246230176, + "total_bits": 512977920, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -34461,7 +34461,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -34476,7 +34476,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "5": 32, "4": 32 @@ -34493,9 +34493,9 @@ } }, { - "accuracy": 0.9553846740408948, - "total_bits": 2313589120, - "w1": { + "accuracy": 0.9511300618515203, + "total_bits": 578397280, + "gate_proj": { "group_size": { "4": 128, "3": 128 @@ -34510,7 +34510,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 128, "3": 128 @@ -34525,7 +34525,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128, @@ -34545,9 +34545,9 @@ } }, { - "accuracy": 0.9590523670378485, - "total_bits": 2371489792, - "w1": { + "accuracy": 0.9551026676046221, + "total_bits": 592872448, + "gate_proj": { "group_size": { "4": 32, "3": 32 @@ -34562,7 +34562,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32, "3": 32 @@ -34577,7 +34577,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 32, @@ -34597,9 +34597,9 @@ } }, { - "accuracy": 0.9634676532525766, - "total_bits": 2549817728, - "w1": { + "accuracy": 0.9615459810746343, + "total_bits": 637454432, + "gate_proj": { "group_size": { "4": 32, "3": 32 @@ -34614,7 +34614,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32, "3": 32 @@ -34629,7 +34629,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128 @@ -34646,9 +34646,9 @@ } }, { - "accuracy": 0.9773717696141255, - "total_bits": 2914965888, - "w1": { + "accuracy": 0.9742067357721297, + "total_bits": 728741472, + "gate_proj": { "group_size": { "4": 128 }, @@ -34660,7 +34660,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32 }, @@ -34672,7 +34672,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128 @@ -34689,9 +34689,9 @@ } }, { - "accuracy": 0.9791323247022534, - "total_bits": 2957905920, - "w1": { + "accuracy": 0.9763584552136692, + "total_bits": 739476480, + "gate_proj": { "group_size": { "4": 32 }, @@ -34703,7 +34703,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32 }, @@ -34715,7 +34715,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 32 @@ -34732,9 +34732,9 @@ } }, { - "accuracy": 0.9779354223589364, - "total_bits": 3006173568, - "w1": { + "accuracy": 0.9750363804694069, + "total_bits": 751543392, + "gate_proj": { "group_size": { "5": 128, "4": 128 @@ -34749,7 +34749,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "5": 128, "4": 128 @@ -34764,7 +34764,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "5": 128, @@ -34784,9 +34784,9 @@ } }, { - "accuracy": 0.9807450296847444, - "total_bits": 3064074240, - "w1": { + "accuracy": 0.9781509514309858, + "total_bits": 766018560, + "gate_proj": { "group_size": { "5": 32, "4": 32 @@ -34801,7 +34801,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "5": 32, "4": 32 @@ -34816,7 +34816,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "5": 32, @@ -34836,9 +34836,9 @@ } }, { - "accuracy": 0.9890571342487084, - "total_bits": 3698758016, - "w1": { + "accuracy": 0.9872377620225674, + "total_bits": 924689504, + "gate_proj": { "group_size": { "6": 128, "5": 128 @@ -34853,7 +34853,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 128, "5": 128 @@ -34868,7 +34868,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 128, @@ -34888,9 +34888,9 @@ } }, { - "accuracy": 0.9906672552031907, - "total_bits": 3756658688, - "w1": { + "accuracy": 0.9890835067796472, + "total_bits": 939164672, + "gate_proj": { "group_size": { "6": 32, "5": 32 @@ -34905,7 +34905,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 32, "5": 32 @@ -34920,7 +34920,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 32, @@ -34940,9 +34940,9 @@ } }, { - "accuracy": 0.9940811393684462, - "total_bits": 4278096256, - "w1": { + "accuracy": 0.992890218762975, + "total_bits": 1069524064, + "gate_proj": { "group_size": { "6": 128 }, @@ -34954,7 +34954,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 128 }, @@ -34966,7 +34966,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 128 @@ -34983,9 +34983,9 @@ } }, { - "accuracy": 0.9945923208234537, - "total_bits": 4441539584, - "w1": { + "accuracy": 0.9935184179094473, + "total_bits": 1110384896, + "gate_proj": { "group_size": { "8": 128, "6": 128 @@ -35000,7 +35000,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128, "6": 128 @@ -35015,7 +35015,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128, "6": 128 @@ -35032,9 +35032,9 @@ } }, { - "accuracy": 0.9954917905484572, - "total_bits": 4839998464, - "w1": { + "accuracy": 0.9948769234439456, + "total_bits": 1209999616, + "gate_proj": { "group_size": { "8": 128, "6": 128 @@ -35049,7 +35049,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128, "6": 128 @@ -35064,7 +35064,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128 }, @@ -35078,9 +35078,9 @@ } }, { - "accuracy": 0.9984438416033395, - "total_bits": 5662082048, - "w1": { + "accuracy": 0.9980801622395551, + "total_bits": 1415520512, + "gate_proj": { "group_size": { "8": 128 }, @@ -35092,7 +35092,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128 }, @@ -35104,7 +35104,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128 }, @@ -35120,7 +35120,7 @@ ], "model.layers.18.self_attn": [ { - "accuracy": 0.9426621355508503, + "accuracy": 0.9524316493617861, "total_bits": 89665536, "q_proj": { "group_size": { @@ -35184,7 +35184,7 @@ } }, { - "accuracy": 0.9423443537793661, + "accuracy": 0.9544516817519539, "total_bits": 92221440, "q_proj": { "group_size": { @@ -35248,7 +35248,7 @@ } }, { - "accuracy": 0.9530246604822183, + "accuracy": 0.9606708723463511, "total_bits": 95758848, "q_proj": { "group_size": { @@ -35312,7 +35312,7 @@ } }, { - "accuracy": 0.9621052176348472, + "accuracy": 0.9691502977358667, "total_bits": 112272384, "q_proj": { "group_size": { @@ -35376,7 +35376,7 @@ } }, { - "accuracy": 0.9709830131185683, + "accuracy": 0.9761827687585825, "total_bits": 132913152, "q_proj": { "group_size": { @@ -35440,7 +35440,7 @@ } }, { - "accuracy": 0.9718570904316086, + "accuracy": 0.9764769796870256, "total_bits": 132980224, "q_proj": { "group_size": { @@ -35504,7 +35504,7 @@ } }, { - "accuracy": 0.9808978106532442, + "accuracy": 0.9857286564319542, "total_bits": 169613312, "q_proj": { "group_size": { @@ -35556,7 +35556,7 @@ } }, { - "accuracy": 0.9825035657635645, + "accuracy": 0.9864044238470102, "total_bits": 169745920, "q_proj": { "group_size": { @@ -35608,7 +35608,7 @@ } }, { - "accuracy": 0.9844047992833351, + "accuracy": 0.9875787227218481, "total_bits": 171195392, "q_proj": { "group_size": { @@ -35660,7 +35660,7 @@ } }, { - "accuracy": 0.9850903565652276, + "accuracy": 0.9881641256475919, "total_bits": 173563904, "q_proj": { "group_size": { @@ -35712,7 +35712,7 @@ } }, { - "accuracy": 0.9853769444223297, + "accuracy": 0.9880209960239498, "total_bits": 174923264, "q_proj": { "group_size": { @@ -35776,7 +35776,7 @@ } }, { - "accuracy": 0.986380965497933, + "accuracy": 0.9888365769170617, "total_bits": 175750144, "q_proj": { "group_size": { @@ -35840,7 +35840,7 @@ } }, { - "accuracy": 0.9881471818018901, + "accuracy": 0.9901526825698582, "total_bits": 179253248, "q_proj": { "group_size": { @@ -35901,7 +35901,7 @@ } }, { - "accuracy": 0.9892791555214085, + "accuracy": 0.9910030119415176, "total_bits": 181592064, "q_proj": { "group_size": { @@ -35962,7 +35962,7 @@ } }, { - "accuracy": 0.9933352529218322, + "accuracy": 0.9945431236471785, "total_bits": 220469248, "q_proj": { "group_size": { @@ -36023,7 +36023,7 @@ } }, { - "accuracy": 0.9945772679179514, + "accuracy": 0.9954977838298011, "total_bits": 223535104, "q_proj": { "group_size": { @@ -36084,7 +36084,7 @@ } }, { - "accuracy": 0.9950140060268735, + "accuracy": 0.9962333541265443, "total_bits": 253499392, "q_proj": { "group_size": { @@ -36136,7 +36136,7 @@ } }, { - "accuracy": 0.9972415298260259, + "accuracy": 0.9976457134440639, "total_bits": 265838592, "q_proj": { "group_size": { @@ -36188,7 +36188,7 @@ } }, { - "accuracy": 0.9985959064215422, + "accuracy": 0.9989886529817197, "total_bits": 337385472, "q_proj": { "group_size": { @@ -36240,11 +36240,11 @@ } } ], - "model.layers.18.block_sparse_moe": [ + "model.layers.18.mlp": [ { - "accuracy": 0.9095662926372728, - "total_bits": 1581846784, - "w1": { + "accuracy": 0.9063287828313678, + "total_bits": 395461696, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -36259,7 +36259,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -36274,7 +36274,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "6": 32, "3": 64, @@ -36294,9 +36294,9 @@ } }, { - "accuracy": 0.912707301935083, - "total_bits": 1636897024, - "w1": { + "accuracy": 0.9090659596810216, + "total_bits": 409224256, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -36311,7 +36311,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -36326,7 +36326,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "6": 32, "3": 64, @@ -36346,9 +36346,9 @@ } }, { - "accuracy": 0.9245913122829638, - "total_bits": 1829089280, - "w1": { + "accuracy": 0.92380379238411, + "total_bits": 457272320, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -36363,7 +36363,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -36378,7 +36378,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "5": 32, "3": 32 @@ -36395,9 +36395,9 @@ } }, { - "accuracy": 0.9272654946697385, - "total_bits": 2051911680, - "w1": { + "accuracy": 0.9282308242430812, + "total_bits": 512977920, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -36412,7 +36412,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -36427,7 +36427,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "5": 32, "4": 32 @@ -36444,9 +36444,9 @@ } }, { - "accuracy": 0.9575127143608897, - "total_bits": 2313589120, - "w1": { + "accuracy": 0.9528292234202749, + "total_bits": 578397280, + "gate_proj": { "group_size": { "4": 128, "3": 128 @@ -36461,7 +36461,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 128, "3": 128 @@ -36476,7 +36476,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128, @@ -36496,9 +36496,9 @@ } }, { - "accuracy": 0.960948856155339, - "total_bits": 2371489792, - "w1": { + "accuracy": 0.9565392246371821, + "total_bits": 592872448, + "gate_proj": { "group_size": { "4": 32, "3": 32 @@ -36513,7 +36513,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32, "3": 32 @@ -36528,7 +36528,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 32, @@ -36548,9 +36548,9 @@ } }, { - "accuracy": 0.965100116145454, - "total_bits": 2549817728, - "w1": { + "accuracy": 0.9627073330706671, + "total_bits": 637454432, + "gate_proj": { "group_size": { "4": 32, "3": 32 @@ -36565,7 +36565,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32, "3": 32 @@ -36580,7 +36580,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128 @@ -36597,9 +36597,9 @@ } }, { - "accuracy": 0.9786706856010776, - "total_bits": 2914965888, - "w1": { + "accuracy": 0.9752998492239338, + "total_bits": 728741472, + "gate_proj": { "group_size": { "4": 128 }, @@ -36611,7 +36611,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32 }, @@ -36623,7 +36623,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128 @@ -36640,9 +36640,9 @@ } }, { - "accuracy": 0.9803212548753149, - "total_bits": 2957905920, - "w1": { + "accuracy": 0.9773440878152063, + "total_bits": 739476480, + "gate_proj": { "group_size": { "4": 32 }, @@ -36654,7 +36654,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32 }, @@ -36666,7 +36666,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 32 @@ -36683,9 +36683,9 @@ } }, { - "accuracy": 0.978994027662434, - "total_bits": 3006173568, - "w1": { + "accuracy": 0.9759262205150566, + "total_bits": 751543392, + "gate_proj": { "group_size": { "5": 128, "4": 128 @@ -36700,7 +36700,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "5": 128, "4": 128 @@ -36715,7 +36715,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "5": 128, @@ -36735,9 +36735,9 @@ } }, { - "accuracy": 0.9816417630859896, - "total_bits": 3064074240, - "w1": { + "accuracy": 0.9788604075775335, + "total_bits": 766018560, + "gate_proj": { "group_size": { "5": 32, "4": 32 @@ -36752,7 +36752,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "5": 32, "4": 32 @@ -36767,7 +36767,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "5": 32, @@ -36787,9 +36787,9 @@ } }, { - "accuracy": 0.9896233837904507, - "total_bits": 3698758016, - "w1": { + "accuracy": 0.987716553340617, + "total_bits": 924689504, + "gate_proj": { "group_size": { "6": 128, "5": 128 @@ -36804,7 +36804,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 128, "5": 128 @@ -36819,7 +36819,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 128, @@ -36839,9 +36839,9 @@ } }, { - "accuracy": 0.9911354199591044, - "total_bits": 3756658688, - "w1": { + "accuracy": 0.9894503384681517, + "total_bits": 939164672, + "gate_proj": { "group_size": { "6": 32, "5": 32 @@ -36856,7 +36856,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 32, "5": 32 @@ -36871,7 +36871,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 32, @@ -36891,9 +36891,9 @@ } }, { - "accuracy": 0.9944868792632693, - "total_bits": 4278096256, - "w1": { + "accuracy": 0.9932522047556153, + "total_bits": 1069524064, + "gate_proj": { "group_size": { "6": 128 }, @@ -36905,7 +36905,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 128 }, @@ -36917,7 +36917,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 128 @@ -36934,9 +36934,9 @@ } }, { - "accuracy": 0.9948771291804549, - "total_bits": 4441539584, - "w1": { + "accuracy": 0.9937487349379808, + "total_bits": 1110384896, + "gate_proj": { "group_size": { "8": 128, "6": 128 @@ -36951,7 +36951,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128, "6": 128 @@ -36966,7 +36966,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128, "6": 128 @@ -36983,9 +36983,9 @@ } }, { - "accuracy": 0.9957250996345752, - "total_bits": 4839998464, - "w1": { + "accuracy": 0.9950438506509128, + "total_bits": 1209999616, + "gate_proj": { "group_size": { "8": 128, "6": 128 @@ -37000,7 +37000,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128, "6": 128 @@ -37015,7 +37015,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128 }, @@ -37029,9 +37029,9 @@ } }, { - "accuracy": 0.9985609189055762, - "total_bits": 5662082048, - "w1": { + "accuracy": 0.9981912341164915, + "total_bits": 1415520512, + "gate_proj": { "group_size": { "8": 128 }, @@ -37043,7 +37043,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128 }, @@ -37055,7 +37055,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128 }, @@ -37071,7 +37071,7 @@ ], "model.layers.19.self_attn": [ { - "accuracy": 0.9370661077922896, + "accuracy": 0.9531734097552926, "total_bits": 89665536, "q_proj": { "group_size": { @@ -37135,7 +37135,7 @@ } }, { - "accuracy": 0.9386997762087145, + "accuracy": 0.9551613434757057, "total_bits": 92221440, "q_proj": { "group_size": { @@ -37199,7 +37199,7 @@ } }, { - "accuracy": 0.9530433086972487, + "accuracy": 0.9629494123356908, "total_bits": 95758848, "q_proj": { "group_size": { @@ -37263,7 +37263,7 @@ } }, { - "accuracy": 0.9605018681797542, + "accuracy": 0.9698111785477713, "total_bits": 112272384, "q_proj": { "group_size": { @@ -37327,7 +37327,7 @@ } }, { - "accuracy": 0.9676079006963655, + "accuracy": 0.9759254256557477, "total_bits": 132913152, "q_proj": { "group_size": { @@ -37391,7 +37391,7 @@ } }, { - "accuracy": 0.9704811785762247, + "accuracy": 0.9764532275792015, "total_bits": 132980224, "q_proj": { "group_size": { @@ -37455,7 +37455,7 @@ } }, { - "accuracy": 0.9798381700621623, + "accuracy": 0.9859687963402585, "total_bits": 169613312, "q_proj": { "group_size": { @@ -37507,7 +37507,7 @@ } }, { - "accuracy": 0.9818890725605582, + "accuracy": 0.9866338403601396, "total_bits": 169745920, "q_proj": { "group_size": { @@ -37559,7 +37559,7 @@ } }, { - "accuracy": 0.983230293973496, + "accuracy": 0.9877041297915735, "total_bits": 171195392, "q_proj": { "group_size": { @@ -37611,7 +37611,7 @@ } }, { - "accuracy": 0.9836836655771262, + "accuracy": 0.9881126121617854, "total_bits": 173563904, "q_proj": { "group_size": { @@ -37663,7 +37663,7 @@ } }, { - "accuracy": 0.9850478390917966, + "accuracy": 0.98816726839562, "total_bits": 174923264, "q_proj": { "group_size": { @@ -37727,7 +37727,7 @@ } }, { - "accuracy": 0.9858068748328247, + "accuracy": 0.9890394763972022, "total_bits": 175750144, "q_proj": { "group_size": { @@ -37791,7 +37791,7 @@ } }, { - "accuracy": 0.9880467953748608, + "accuracy": 0.99056251930367, "total_bits": 179253248, "q_proj": { "group_size": { @@ -37852,7 +37852,7 @@ } }, { - "accuracy": 0.9892469397746027, + "accuracy": 0.9915260557624462, "total_bits": 181592064, "q_proj": { "group_size": { @@ -37913,7 +37913,7 @@ } }, { - "accuracy": 0.9934651391139548, + "accuracy": 0.9949341131447765, "total_bits": 220469248, "q_proj": { "group_size": { @@ -37974,7 +37974,7 @@ } }, { - "accuracy": 0.9945724511053413, + "accuracy": 0.9957552230181662, "total_bits": 223535104, "q_proj": { "group_size": { @@ -38035,7 +38035,7 @@ } }, { - "accuracy": 0.9947791119890386, + "accuracy": 0.9962724672115751, "total_bits": 253499392, "q_proj": { "group_size": { @@ -38087,7 +38087,7 @@ } }, { - "accuracy": 0.9974138994921783, + "accuracy": 0.9978598439514539, "total_bits": 265838592, "q_proj": { "group_size": { @@ -38139,7 +38139,7 @@ } }, { - "accuracy": 0.9985662904371949, + "accuracy": 0.9989829504170062, "total_bits": 337385472, "q_proj": { "group_size": { @@ -38191,11 +38191,11 @@ } } ], - "model.layers.19.block_sparse_moe": [ + "model.layers.19.mlp": [ { - "accuracy": 0.9111451659547656, - "total_bits": 1581846784, - "w1": { + "accuracy": 0.9077638297488815, + "total_bits": 395461696, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -38210,7 +38210,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -38225,7 +38225,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "6": 32, "3": 64, @@ -38245,9 +38245,9 @@ } }, { - "accuracy": 0.9142973307324083, - "total_bits": 1636897024, - "w1": { + "accuracy": 0.9104348332866242, + "total_bits": 409224256, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -38262,7 +38262,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -38277,7 +38277,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "6": 32, "3": 64, @@ -38297,9 +38297,9 @@ } }, { - "accuracy": 0.9255013999186064, - "total_bits": 1829089280, - "w1": { + "accuracy": 0.9245737767533252, + "total_bits": 457272320, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -38314,7 +38314,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -38329,7 +38329,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "5": 32, "3": 32 @@ -38346,9 +38346,9 @@ } }, { - "accuracy": 0.9280337201137292, - "total_bits": 2051911680, - "w1": { + "accuracy": 0.9288125310681368, + "total_bits": 512977920, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -38363,7 +38363,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -38378,7 +38378,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "5": 32, "4": 32 @@ -38395,9 +38395,9 @@ } }, { - "accuracy": 0.957983608994829, - "total_bits": 2313589120, - "w1": { + "accuracy": 0.9534416763406051, + "total_bits": 578397280, + "gate_proj": { "group_size": { "4": 128, "3": 128 @@ -38412,7 +38412,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 128, "3": 128 @@ -38427,7 +38427,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128, @@ -38447,9 +38447,9 @@ } }, { - "accuracy": 0.9614456116564964, - "total_bits": 2371489792, - "w1": { + "accuracy": 0.957116788351222, + "total_bits": 592872448, + "gate_proj": { "group_size": { "4": 32, "3": 32 @@ -38464,7 +38464,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32, "3": 32 @@ -38479,7 +38479,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 32, @@ -38499,9 +38499,9 @@ } }, { - "accuracy": 0.9654145234901654, - "total_bits": 2549817728, - "w1": { + "accuracy": 0.9630837524799924, + "total_bits": 637454432, + "gate_proj": { "group_size": { "4": 32, "3": 32 @@ -38516,7 +38516,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32, "3": 32 @@ -38531,7 +38531,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128 @@ -38548,9 +38548,9 @@ } }, { - "accuracy": 0.978855999382703, - "total_bits": 2914965888, - "w1": { + "accuracy": 0.9756977125806244, + "total_bits": 728741472, + "gate_proj": { "group_size": { "4": 128 }, @@ -38562,7 +38562,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32 }, @@ -38574,7 +38574,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128 @@ -38591,9 +38591,9 @@ } }, { - "accuracy": 0.9805044677402628, - "total_bits": 2957905920, - "w1": { + "accuracy": 0.9777051095703715, + "total_bits": 739476480, + "gate_proj": { "group_size": { "4": 32 }, @@ -38605,7 +38605,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32 }, @@ -38617,7 +38617,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 32 @@ -38634,9 +38634,9 @@ } }, { - "accuracy": 0.9791233171953967, - "total_bits": 3006173568, - "w1": { + "accuracy": 0.9762726569744317, + "total_bits": 751543392, + "gate_proj": { "group_size": { "5": 128, "4": 128 @@ -38651,7 +38651,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "5": 128, "4": 128 @@ -38666,7 +38666,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "5": 128, @@ -38686,9 +38686,9 @@ } }, { - "accuracy": 0.9818246230286988, - "total_bits": 3064074240, - "w1": { + "accuracy": 0.9791760237789467, + "total_bits": 766018560, + "gate_proj": { "group_size": { "5": 32, "4": 32 @@ -38703,7 +38703,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "5": 32, "4": 32 @@ -38718,7 +38718,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "5": 32, @@ -38738,9 +38738,9 @@ } }, { - "accuracy": 0.9896588750232599, - "total_bits": 3698758016, - "w1": { + "accuracy": 0.9879158250742445, + "total_bits": 924689504, + "gate_proj": { "group_size": { "6": 128, "5": 128 @@ -38755,7 +38755,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 128, "5": 128 @@ -38770,7 +38770,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 128, @@ -38790,9 +38790,9 @@ } }, { - "accuracy": 0.9911828828289321, - "total_bits": 3756658688, - "w1": { + "accuracy": 0.9896092312655559, + "total_bits": 939164672, + "gate_proj": { "group_size": { "6": 32, "5": 32 @@ -38807,7 +38807,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 32, "5": 32 @@ -38822,7 +38822,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 32, @@ -38842,9 +38842,9 @@ } }, { - "accuracy": 0.994448157689093, - "total_bits": 4278096256, - "w1": { + "accuracy": 0.9933607235158745, + "total_bits": 1069524064, + "gate_proj": { "group_size": { "6": 128 }, @@ -38856,7 +38856,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 128 }, @@ -38868,7 +38868,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 128 @@ -38885,9 +38885,9 @@ } }, { - "accuracy": 0.9948650564214117, - "total_bits": 4441539584, - "w1": { + "accuracy": 0.9938603864468046, + "total_bits": 1110384896, + "gate_proj": { "group_size": { "8": 128, "6": 128 @@ -38902,7 +38902,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128, "6": 128 @@ -38917,7 +38917,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128, "6": 128 @@ -38934,9 +38934,9 @@ } }, { - "accuracy": 0.9956253841274271, - "total_bits": 4839998464, - "w1": { + "accuracy": 0.9950594631709943, + "total_bits": 1209999616, + "gate_proj": { "group_size": { "8": 128, "6": 128 @@ -38951,7 +38951,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128, "6": 128 @@ -38966,7 +38966,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128 }, @@ -38980,9 +38980,9 @@ } }, { - "accuracy": 0.9985413056381635, - "total_bits": 5662082048, - "w1": { + "accuracy": 0.9982136715705948, + "total_bits": 1415520512, + "gate_proj": { "group_size": { "8": 128 }, @@ -38994,7 +38994,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128 }, @@ -39006,7 +39006,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128 }, @@ -39022,7 +39022,7 @@ ], "model.layers.20.self_attn": [ { - "accuracy": 0.9457030035555363, + "accuracy": 0.954907306126858, "total_bits": 89665536, "q_proj": { "group_size": { @@ -39086,7 +39086,7 @@ } }, { - "accuracy": 0.9457965995136061, + "accuracy": 0.957181333221103, "total_bits": 92221440, "q_proj": { "group_size": { @@ -39150,7 +39150,7 @@ } }, { - "accuracy": 0.9582316289214712, + "accuracy": 0.9648269959970525, "total_bits": 95758848, "q_proj": { "group_size": { @@ -39214,7 +39214,7 @@ } }, { - "accuracy": 0.9663739424002797, + "accuracy": 0.9719980787485838, "total_bits": 112272384, "q_proj": { "group_size": { @@ -39278,7 +39278,7 @@ } }, { - "accuracy": 0.9724503584990376, + "accuracy": 0.9775758829261911, "total_bits": 132913152, "q_proj": { "group_size": { @@ -39342,7 +39342,7 @@ } }, { - "accuracy": 0.9729552895418907, + "accuracy": 0.9780622686406499, "total_bits": 132980224, "q_proj": { "group_size": { @@ -39406,7 +39406,7 @@ } }, { - "accuracy": 0.982453209249989, + "accuracy": 0.9860330916156894, "total_bits": 169613312, "q_proj": { "group_size": { @@ -39458,7 +39458,7 @@ } }, { - "accuracy": 0.9837940367624948, + "accuracy": 0.9872173377263703, "total_bits": 169745920, "q_proj": { "group_size": { @@ -39510,7 +39510,7 @@ } }, { - "accuracy": 0.983282304712032, + "accuracy": 0.9881648033318159, "total_bits": 171195392, "q_proj": { "group_size": { @@ -39562,7 +39562,7 @@ } }, { - "accuracy": 0.983277547271236, + "accuracy": 0.9886510241776705, "total_bits": 173563904, "q_proj": { "group_size": { @@ -39614,7 +39614,7 @@ } }, { - "accuracy": 0.9859663481382948, + "accuracy": 0.9888427957255197, "total_bits": 174923264, "q_proj": { "group_size": { @@ -39678,7 +39678,7 @@ } }, { - "accuracy": 0.9872883571507899, + "accuracy": 0.9891793454850191, "total_bits": 175750144, "q_proj": { "group_size": { @@ -39742,7 +39742,7 @@ } }, { - "accuracy": 0.9890850506918994, + "accuracy": 0.9911164162729523, "total_bits": 179253248, "q_proj": { "group_size": { @@ -39803,7 +39803,7 @@ } }, { - "accuracy": 0.9898241735553663, + "accuracy": 0.9918848455540443, "total_bits": 181592064, "q_proj": { "group_size": { @@ -39864,7 +39864,7 @@ } }, { - "accuracy": 0.9939548614385881, + "accuracy": 0.9951796877246938, "total_bits": 220469248, "q_proj": { "group_size": { @@ -39925,7 +39925,7 @@ } }, { - "accuracy": 0.9949813122787562, + "accuracy": 0.9957956023710338, "total_bits": 223535104, "q_proj": { "group_size": { @@ -39986,7 +39986,7 @@ } }, { - "accuracy": 0.9952861121365506, + "accuracy": 0.9965280096267203, "total_bits": 253499392, "q_proj": { "group_size": { @@ -40038,7 +40038,7 @@ } }, { - "accuracy": 0.9974552460112854, + "accuracy": 0.9978657021174991, "total_bits": 265838592, "q_proj": { "group_size": { @@ -40090,7 +40090,7 @@ } }, { - "accuracy": 0.9987537633247772, + "accuracy": 0.999070831292652, "total_bits": 337385472, "q_proj": { "group_size": { @@ -40142,11 +40142,11 @@ } } ], - "model.layers.20.block_sparse_moe": [ + "model.layers.20.mlp": [ { - "accuracy": 0.9119468076448691, - "total_bits": 1581846784, - "w1": { + "accuracy": 0.9071195949065058, + "total_bits": 395461696, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -40161,7 +40161,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -40176,7 +40176,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "6": 32, "3": 64, @@ -40196,9 +40196,9 @@ } }, { - "accuracy": 0.9149409561957184, - "total_bits": 1636897024, - "w1": { + "accuracy": 0.9097570459309378, + "total_bits": 409224256, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -40213,7 +40213,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -40228,7 +40228,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "6": 32, "3": 64, @@ -40248,9 +40248,9 @@ } }, { - "accuracy": 0.9257604556256219, - "total_bits": 1829089280, - "w1": { + "accuracy": 0.9237925363214392, + "total_bits": 457272320, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -40265,7 +40265,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -40280,7 +40280,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "5": 32, "3": 32 @@ -40297,9 +40297,9 @@ } }, { - "accuracy": 0.9282039166673234, - "total_bits": 2051911680, - "w1": { + "accuracy": 0.927994535353623, + "total_bits": 512977920, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -40314,7 +40314,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -40329,7 +40329,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "5": 32, "4": 32 @@ -40346,9 +40346,9 @@ } }, { - "accuracy": 0.9583988029901919, - "total_bits": 2313589120, - "w1": { + "accuracy": 0.9530657133773753, + "total_bits": 578397280, + "gate_proj": { "group_size": { "4": 128, "3": 128 @@ -40363,7 +40363,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 128, "3": 128 @@ -40378,7 +40378,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128, @@ -40398,9 +40398,9 @@ } }, { - "accuracy": 0.9617315246478507, - "total_bits": 2371489792, - "w1": { + "accuracy": 0.9567254463112668, + "total_bits": 592872448, + "gate_proj": { "group_size": { "4": 32, "3": 32 @@ -40415,7 +40415,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32, "3": 32 @@ -40430,7 +40430,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 32, @@ -40450,9 +40450,9 @@ } }, { - "accuracy": 0.9655564200917357, - "total_bits": 2549817728, - "w1": { + "accuracy": 0.9626581309069144, + "total_bits": 637454432, + "gate_proj": { "group_size": { "4": 32, "3": 32 @@ -40467,7 +40467,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32, "3": 32 @@ -40482,7 +40482,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128 @@ -40499,9 +40499,9 @@ } }, { - "accuracy": 0.9791019399601378, - "total_bits": 2914965888, - "w1": { + "accuracy": 0.9754905538437398, + "total_bits": 728741472, + "gate_proj": { "group_size": { "4": 128 }, @@ -40513,7 +40513,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32 }, @@ -40525,7 +40525,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128 @@ -40542,9 +40542,9 @@ } }, { - "accuracy": 0.9807611211742225, - "total_bits": 2957905920, - "w1": { + "accuracy": 0.9775416983389541, + "total_bits": 739476480, + "gate_proj": { "group_size": { "4": 32 }, @@ -40556,7 +40556,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32 }, @@ -40568,7 +40568,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 32 @@ -40585,9 +40585,9 @@ } }, { - "accuracy": 0.9794069032037729, - "total_bits": 3006173568, - "w1": { + "accuracy": 0.9760871630554137, + "total_bits": 751543392, + "gate_proj": { "group_size": { "5": 128, "4": 128 @@ -40602,7 +40602,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "5": 128, "4": 128 @@ -40617,7 +40617,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "5": 128, @@ -40637,9 +40637,9 @@ } }, { - "accuracy": 0.9820101140066981, - "total_bits": 3064074240, - "w1": { + "accuracy": 0.9789933036326578, + "total_bits": 766018560, + "gate_proj": { "group_size": { "5": 32, "4": 32 @@ -40654,7 +40654,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "5": 32, "4": 32 @@ -40669,7 +40669,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "5": 32, @@ -40689,9 +40689,9 @@ } }, { - "accuracy": 0.989843503988691, - "total_bits": 3698758016, - "w1": { + "accuracy": 0.9878273407781595, + "total_bits": 924689504, + "gate_proj": { "group_size": { "6": 128, "5": 128 @@ -40706,7 +40706,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 128, "5": 128 @@ -40721,7 +40721,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 128, @@ -40741,9 +40741,9 @@ } }, { - "accuracy": 0.9913209057903212, - "total_bits": 3756658688, - "w1": { + "accuracy": 0.9895302929768437, + "total_bits": 939164672, + "gate_proj": { "group_size": { "6": 32, "5": 32 @@ -40758,7 +40758,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 32, "5": 32 @@ -40773,7 +40773,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 32, @@ -40793,9 +40793,9 @@ } }, { - "accuracy": 0.9946271568463233, - "total_bits": 4278096256, - "w1": { + "accuracy": 0.9933468170807158, + "total_bits": 1069524064, + "gate_proj": { "group_size": { "6": 128 }, @@ -40807,7 +40807,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 128 }, @@ -40819,7 +40819,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 128 @@ -40836,9 +40836,9 @@ } }, { - "accuracy": 0.9949970869837623, - "total_bits": 4441539584, - "w1": { + "accuracy": 0.9938193710128728, + "total_bits": 1110384896, + "gate_proj": { "group_size": { "8": 128, "6": 128 @@ -40853,7 +40853,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128, "6": 128 @@ -40868,7 +40868,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128, "6": 128 @@ -40885,9 +40885,9 @@ } }, { - "accuracy": 0.9957431105676254, - "total_bits": 4839998464, - "w1": { + "accuracy": 0.9950146749615669, + "total_bits": 1209999616, + "gate_proj": { "group_size": { "8": 128, "6": 128 @@ -40902,7 +40902,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128, "6": 128 @@ -40917,7 +40917,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128 }, @@ -40931,9 +40931,9 @@ } }, { - "accuracy": 0.9986027003768342, - "total_bits": 5662082048, - "w1": { + "accuracy": 0.9982245541422775, + "total_bits": 1415520512, + "gate_proj": { "group_size": { "8": 128 }, @@ -40945,7 +40945,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128 }, @@ -40957,7 +40957,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128 }, @@ -40973,7 +40973,7 @@ ], "model.layers.21.self_attn": [ { - "accuracy": 0.9420547632402495, + "accuracy": 0.9523296772845482, "total_bits": 89665536, "q_proj": { "group_size": { @@ -41037,7 +41037,7 @@ } }, { - "accuracy": 0.9446317094721293, + "accuracy": 0.954428034197343, "total_bits": 92221440, "q_proj": { "group_size": { @@ -41101,7 +41101,7 @@ } }, { - "accuracy": 0.9559027793768206, + "accuracy": 0.9619468497602564, "total_bits": 95758848, "q_proj": { "group_size": { @@ -41165,7 +41165,7 @@ } }, { - "accuracy": 0.9638713286503365, + "accuracy": 0.9692018048739747, "total_bits": 112272384, "q_proj": { "group_size": { @@ -41229,7 +41229,7 @@ } }, { - "accuracy": 0.9709425744061407, + "accuracy": 0.9759856691387924, "total_bits": 132913152, "q_proj": { "group_size": { @@ -41293,7 +41293,7 @@ } }, { - "accuracy": 0.9718014126349437, + "accuracy": 0.9766665207908342, "total_bits": 132980224, "q_proj": { "group_size": { @@ -41357,7 +41357,7 @@ } }, { - "accuracy": 0.9812249879499799, + "accuracy": 0.98568585470907, "total_bits": 169613312, "q_proj": { "group_size": { @@ -41409,7 +41409,7 @@ } }, { - "accuracy": 0.9827954999514317, + "accuracy": 0.9864689111709595, "total_bits": 169745920, "q_proj": { "group_size": { @@ -41461,7 +41461,7 @@ } }, { - "accuracy": 0.9842038799175307, + "accuracy": 0.9873518646743736, "total_bits": 171195392, "q_proj": { "group_size": { @@ -41513,7 +41513,7 @@ } }, { - "accuracy": 0.9848067334509993, + "accuracy": 0.9879808859458488, "total_bits": 173563904, "q_proj": { "group_size": { @@ -41565,7 +41565,7 @@ } }, { - "accuracy": 0.9859570207956591, + "accuracy": 0.9882089223859734, "total_bits": 174923264, "q_proj": { "group_size": { @@ -41629,7 +41629,7 @@ } }, { - "accuracy": 0.9869164829877647, + "accuracy": 0.9887366007866436, "total_bits": 175750144, "q_proj": { "group_size": { @@ -41693,7 +41693,7 @@ } }, { - "accuracy": 0.9887005343150935, + "accuracy": 0.9906133468949089, "total_bits": 179253248, "q_proj": { "group_size": { @@ -41754,7 +41754,7 @@ } }, { - "accuracy": 0.9897141926186649, + "accuracy": 0.9914003869910774, "total_bits": 181592064, "q_proj": { "group_size": { @@ -41815,7 +41815,7 @@ } }, { - "accuracy": 0.9936692170847795, + "accuracy": 0.9948957266261507, "total_bits": 220469248, "q_proj": { "group_size": { @@ -41876,7 +41876,7 @@ } }, { - "accuracy": 0.9948098210764951, + "accuracy": 0.9957440825252745, "total_bits": 223535104, "q_proj": { "group_size": { @@ -41937,7 +41937,7 @@ } }, { - "accuracy": 0.9950313313226951, + "accuracy": 0.9963222838521592, "total_bits": 253499392, "q_proj": { "group_size": { @@ -41989,7 +41989,7 @@ } }, { - "accuracy": 0.9973175655192646, + "accuracy": 0.9977386644428694, "total_bits": 265838592, "q_proj": { "group_size": { @@ -42041,7 +42041,7 @@ } }, { - "accuracy": 0.9986545597264347, + "accuracy": 0.9990125996532457, "total_bits": 337385472, "q_proj": { "group_size": { @@ -42093,11 +42093,11 @@ } } ], - "model.layers.21.block_sparse_moe": [ + "model.layers.21.mlp": [ { - "accuracy": 0.9138193422634351, - "total_bits": 1581846784, - "w1": { + "accuracy": 0.9077479964808414, + "total_bits": 395461696, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -42112,7 +42112,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -42127,7 +42127,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "6": 32, "3": 64, @@ -42147,9 +42147,9 @@ } }, { - "accuracy": 0.9166999237709924, - "total_bits": 1636897024, - "w1": { + "accuracy": 0.9103414277104955, + "total_bits": 409224256, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -42164,7 +42164,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -42179,7 +42179,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "6": 32, "3": 64, @@ -42199,9 +42199,9 @@ } }, { - "accuracy": 0.9270215869734162, - "total_bits": 1829089280, - "w1": { + "accuracy": 0.9238953621763932, + "total_bits": 457272320, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -42216,7 +42216,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -42231,7 +42231,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "5": 32, "3": 32 @@ -42248,9 +42248,9 @@ } }, { - "accuracy": 0.9293562548333093, - "total_bits": 2051911680, - "w1": { + "accuracy": 0.9279653504490852, + "total_bits": 512977920, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -42265,7 +42265,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -42280,7 +42280,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "5": 32, "4": 32 @@ -42297,9 +42297,9 @@ } }, { - "accuracy": 0.9591238708480408, - "total_bits": 2313589120, - "w1": { + "accuracy": 0.953285093174169, + "total_bits": 578397280, + "gate_proj": { "group_size": { "4": 128, "3": 128 @@ -42314,7 +42314,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 128, "3": 128 @@ -42329,7 +42329,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128, @@ -42349,9 +42349,9 @@ } }, { - "accuracy": 0.9625386849633957, - "total_bits": 2371489792, - "w1": { + "accuracy": 0.9569780226600797, + "total_bits": 592872448, + "gate_proj": { "group_size": { "4": 32, "3": 32 @@ -42366,7 +42366,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32, "3": 32 @@ -42381,7 +42381,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 32, @@ -42401,9 +42401,9 @@ } }, { - "accuracy": 0.9662052853719184, - "total_bits": 2549817728, - "w1": { + "accuracy": 0.9627228156712494, + "total_bits": 637454432, + "gate_proj": { "group_size": { "4": 32, "3": 32 @@ -42418,7 +42418,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32, "3": 32 @@ -42433,7 +42433,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128 @@ -42450,9 +42450,9 @@ } }, { - "accuracy": 0.9793876067882306, - "total_bits": 2914965888, - "w1": { + "accuracy": 0.9755103276846441, + "total_bits": 728741472, + "gate_proj": { "group_size": { "4": 128 }, @@ -42464,7 +42464,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32 }, @@ -42476,7 +42476,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128 @@ -42493,9 +42493,9 @@ } }, { - "accuracy": 0.9810763406812361, - "total_bits": 2957905920, - "w1": { + "accuracy": 0.9775913820454949, + "total_bits": 739476480, + "gate_proj": { "group_size": { "4": 32 }, @@ -42507,7 +42507,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32 }, @@ -42519,7 +42519,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 32 @@ -42536,9 +42536,9 @@ } }, { - "accuracy": 0.9798199153554282, - "total_bits": 3006173568, - "w1": { + "accuracy": 0.9761809520913582, + "total_bits": 751543392, + "gate_proj": { "group_size": { "5": 128, "4": 128 @@ -42553,7 +42553,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "5": 128, "4": 128 @@ -42568,7 +42568,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "5": 128, @@ -42588,9 +42588,9 @@ } }, { - "accuracy": 0.9823928314604258, - "total_bits": 3064074240, - "w1": { + "accuracy": 0.9791046030035144, + "total_bits": 766018560, + "gate_proj": { "group_size": { "5": 32, "4": 32 @@ -42605,7 +42605,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "5": 32, "4": 32 @@ -42620,7 +42620,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "5": 32, @@ -42640,9 +42640,9 @@ } }, { - "accuracy": 0.9900380968067207, - "total_bits": 3698758016, - "w1": { + "accuracy": 0.9878605602946329, + "total_bits": 924689504, + "gate_proj": { "group_size": { "6": 128, "5": 128 @@ -42657,7 +42657,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 128, "5": 128 @@ -42672,7 +42672,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 128, @@ -42692,9 +42692,9 @@ } }, { - "accuracy": 0.9914966536391723, - "total_bits": 3756658688, - "w1": { + "accuracy": 0.9895728796867556, + "total_bits": 939164672, + "gate_proj": { "group_size": { "6": 32, "5": 32 @@ -42709,7 +42709,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 32, "5": 32 @@ -42724,7 +42724,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 32, @@ -42744,9 +42744,9 @@ } }, { - "accuracy": 0.9946434770122563, - "total_bits": 4278096256, - "w1": { + "accuracy": 0.9933122179323906, + "total_bits": 1069524064, + "gate_proj": { "group_size": { "6": 128 }, @@ -42758,7 +42758,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 128 }, @@ -42770,7 +42770,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 128 @@ -42787,9 +42787,9 @@ } }, { - "accuracy": 0.995071840349977, - "total_bits": 4441539584, - "w1": { + "accuracy": 0.9938407030299699, + "total_bits": 1110384896, + "gate_proj": { "group_size": { "8": 128, "6": 128 @@ -42804,7 +42804,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128, "6": 128 @@ -42819,7 +42819,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128, "6": 128 @@ -42836,9 +42836,9 @@ } }, { - "accuracy": 0.9957792213015062, - "total_bits": 4839998464, - "w1": { + "accuracy": 0.9949922702201691, + "total_bits": 1209999616, + "gate_proj": { "group_size": { "8": 128, "6": 128 @@ -42853,7 +42853,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128, "6": 128 @@ -42868,7 +42868,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128 }, @@ -42882,9 +42882,9 @@ } }, { - "accuracy": 0.998579159807904, - "total_bits": 5662082048, - "w1": { + "accuracy": 0.9981890379965893, + "total_bits": 1415520512, + "gate_proj": { "group_size": { "8": 128 }, @@ -42896,7 +42896,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128 }, @@ -42908,7 +42908,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128 }, @@ -42924,7 +42924,7 @@ ], "model.layers.22.self_attn": [ { - "accuracy": 0.9471930198763546, + "accuracy": 0.9578456751217967, "total_bits": 89665536, "q_proj": { "group_size": { @@ -42988,7 +42988,7 @@ } }, { - "accuracy": 0.9498191834672501, + "accuracy": 0.9580500263132548, "total_bits": 92221440, "q_proj": { "group_size": { @@ -43052,7 +43052,7 @@ } }, { - "accuracy": 0.9585351386156521, + "accuracy": 0.9657132287362689, "total_bits": 95758848, "q_proj": { "group_size": { @@ -43116,7 +43116,7 @@ } }, { - "accuracy": 0.9663898856624177, + "accuracy": 0.9726243508293441, "total_bits": 112272384, "q_proj": { "group_size": { @@ -43180,7 +43180,7 @@ } }, { - "accuracy": 0.9733349061325977, + "accuracy": 0.97845135825245, "total_bits": 132913152, "q_proj": { "group_size": { @@ -43244,7 +43244,7 @@ } }, { - "accuracy": 0.9750405227471339, + "accuracy": 0.9787132562089124, "total_bits": 132980224, "q_proj": { "group_size": { @@ -43308,7 +43308,7 @@ } }, { - "accuracy": 0.9819130654397764, + "accuracy": 0.9868623651564121, "total_bits": 169613312, "q_proj": { "group_size": { @@ -43360,7 +43360,7 @@ } }, { - "accuracy": 0.9843339896515796, + "accuracy": 0.9873158822914487, "total_bits": 169745920, "q_proj": { "group_size": { @@ -43412,7 +43412,7 @@ } }, { - "accuracy": 0.9855142012728673, + "accuracy": 0.9881812647524241, "total_bits": 171195392, "q_proj": { "group_size": { @@ -43464,7 +43464,7 @@ } }, { - "accuracy": 0.9860695932844752, + "accuracy": 0.9886689126295479, "total_bits": 173563904, "q_proj": { "group_size": { @@ -43516,7 +43516,7 @@ } }, { - "accuracy": 0.9868055958987066, + "accuracy": 0.9891152360574588, "total_bits": 174923264, "q_proj": { "group_size": { @@ -43580,7 +43580,7 @@ } }, { - "accuracy": 0.9880141178636175, + "accuracy": 0.9898226397896284, "total_bits": 175750144, "q_proj": { "group_size": { @@ -43644,7 +43644,7 @@ } }, { - "accuracy": 0.989518814597671, + "accuracy": 0.9913634545610923, "total_bits": 179253248, "q_proj": { "group_size": { @@ -43705,7 +43705,7 @@ } }, { - "accuracy": 0.9904098668576855, + "accuracy": 0.992065459318263, "total_bits": 181592064, "q_proj": { "group_size": { @@ -43766,7 +43766,7 @@ } }, { - "accuracy": 0.9941569331200107, + "accuracy": 0.995324229282376, "total_bits": 220469248, "q_proj": { "group_size": { @@ -43827,7 +43827,7 @@ } }, { - "accuracy": 0.9952152201930355, + "accuracy": 0.9960566735458806, "total_bits": 223535104, "q_proj": { "group_size": { @@ -43888,7 +43888,7 @@ } }, { - "accuracy": 0.9953905812734248, + "accuracy": 0.9965572694414541, "total_bits": 253499392, "q_proj": { "group_size": { @@ -43940,7 +43940,7 @@ } }, { - "accuracy": 0.9975850072101151, + "accuracy": 0.9979240818892753, "total_bits": 265838592, "q_proj": { "group_size": { @@ -43992,7 +43992,7 @@ } }, { - "accuracy": 0.9987754085940603, + "accuracy": 0.99908907170481, "total_bits": 337385472, "q_proj": { "group_size": { @@ -44044,11 +44044,11 @@ } } ], - "model.layers.22.block_sparse_moe": [ + "model.layers.22.mlp": [ { - "accuracy": 0.9176656027373514, - "total_bits": 1581846784, - "w1": { + "accuracy": 0.9110634605351248, + "total_bits": 395461696, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -44063,7 +44063,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -44078,7 +44078,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "6": 32, "3": 64, @@ -44098,9 +44098,9 @@ } }, { - "accuracy": 0.9204154336138776, - "total_bits": 1636897024, - "w1": { + "accuracy": 0.9135358394368699, + "total_bits": 409224256, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -44115,7 +44115,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -44130,7 +44130,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "6": 32, "3": 64, @@ -44150,9 +44150,9 @@ } }, { - "accuracy": 0.9299716722024114, - "total_bits": 1829089280, - "w1": { + "accuracy": 0.9263079705599108, + "total_bits": 457272320, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -44167,7 +44167,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -44182,7 +44182,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "5": 32, "3": 32 @@ -44199,9 +44199,9 @@ } }, { - "accuracy": 0.9321059900286951, - "total_bits": 2051911680, - "w1": { + "accuracy": 0.9301391608620945, + "total_bits": 512977920, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -44216,7 +44216,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -44231,7 +44231,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "5": 32, "4": 32 @@ -44248,9 +44248,9 @@ } }, { - "accuracy": 0.961089680559541, - "total_bits": 2313589120, - "w1": { + "accuracy": 0.9550361302926352, + "total_bits": 578397280, + "gate_proj": { "group_size": { "4": 128, "3": 128 @@ -44265,7 +44265,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 128, "3": 128 @@ -44280,7 +44280,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128, @@ -44300,9 +44300,9 @@ } }, { - "accuracy": 0.9641676722584587, - "total_bits": 2371489792, - "w1": { + "accuracy": 0.9585004191061384, + "total_bits": 592872448, + "gate_proj": { "group_size": { "4": 32, "3": 32 @@ -44317,7 +44317,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32, "3": 32 @@ -44332,7 +44332,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 32, @@ -44352,9 +44352,9 @@ } }, { - "accuracy": 0.9675364571770555, - "total_bits": 2549817728, - "w1": { + "accuracy": 0.9639187357143352, + "total_bits": 637454432, + "gate_proj": { "group_size": { "4": 32, "3": 32 @@ -44369,7 +44369,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32, "3": 32 @@ -44384,7 +44384,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128 @@ -44401,9 +44401,9 @@ } }, { - "accuracy": 0.9805391047542033, - "total_bits": 2914965888, - "w1": { + "accuracy": 0.9766080872596878, + "total_bits": 728741472, + "gate_proj": { "group_size": { "4": 128 }, @@ -44415,7 +44415,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32 }, @@ -44427,7 +44427,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128 @@ -44444,9 +44444,9 @@ } }, { - "accuracy": 0.9820854463369438, - "total_bits": 2957905920, - "w1": { + "accuracy": 0.9785486868043479, + "total_bits": 739476480, + "gate_proj": { "group_size": { "4": 32 }, @@ -44458,7 +44458,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32 }, @@ -44470,7 +44470,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 32 @@ -44487,9 +44487,9 @@ } }, { - "accuracy": 0.9807925557129478, - "total_bits": 3006173568, - "w1": { + "accuracy": 0.9771384122830472, + "total_bits": 751543392, + "gate_proj": { "group_size": { "5": 128, "4": 128 @@ -44504,7 +44504,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "5": 128, "4": 128 @@ -44519,7 +44519,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "5": 128, @@ -44539,9 +44539,9 @@ } }, { - "accuracy": 0.9832148885256365, - "total_bits": 3064074240, - "w1": { + "accuracy": 0.9798805322498083, + "total_bits": 766018560, + "gate_proj": { "group_size": { "5": 32, "4": 32 @@ -44556,7 +44556,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "5": 32, "4": 32 @@ -44571,7 +44571,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "5": 32, @@ -44591,9 +44591,9 @@ } }, { - "accuracy": 0.9905593977799934, - "total_bits": 3698758016, - "w1": { + "accuracy": 0.9883804876896504, + "total_bits": 924689504, + "gate_proj": { "group_size": { "6": 128, "5": 128 @@ -44608,7 +44608,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 128, "5": 128 @@ -44623,7 +44623,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 128, @@ -44643,9 +44643,9 @@ } }, { - "accuracy": 0.991917425349943, - "total_bits": 3756658688, - "w1": { + "accuracy": 0.9899783540075939, + "total_bits": 939164672, + "gate_proj": { "group_size": { "6": 32, "5": 32 @@ -44660,7 +44660,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 32, "5": 32 @@ -44675,7 +44675,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 32, @@ -44695,9 +44695,9 @@ } }, { - "accuracy": 0.9950281573508523, - "total_bits": 4278096256, - "w1": { + "accuracy": 0.9936762779804045, + "total_bits": 1069524064, + "gate_proj": { "group_size": { "6": 128 }, @@ -44709,7 +44709,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 128 }, @@ -44721,7 +44721,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 128 @@ -44738,9 +44738,9 @@ } }, { - "accuracy": 0.9953391299901628, - "total_bits": 4441539584, - "w1": { + "accuracy": 0.9940942086058816, + "total_bits": 1110384896, + "gate_proj": { "group_size": { "8": 128, "6": 128 @@ -44755,7 +44755,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128, "6": 128 @@ -44770,7 +44770,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128, "6": 128 @@ -44787,9 +44787,9 @@ } }, { - "accuracy": 0.9959801378120717, - "total_bits": 4839998464, - "w1": { + "accuracy": 0.9951598907967931, + "total_bits": 1209999616, + "gate_proj": { "group_size": { "8": 128, "6": 128 @@ -44804,7 +44804,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128, "6": 128 @@ -44819,7 +44819,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128 }, @@ -44833,9 +44833,9 @@ } }, { - "accuracy": 0.9986904501841453, - "total_bits": 5662082048, - "w1": { + "accuracy": 0.9982952485963899, + "total_bits": 1415520512, + "gate_proj": { "group_size": { "8": 128 }, @@ -44847,7 +44847,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128 }, @@ -44859,7 +44859,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128 }, @@ -44875,7 +44875,7 @@ ], "model.layers.23.self_attn": [ { - "accuracy": 0.9524432279561695, + "accuracy": 0.9595728124442854, "total_bits": 89665536, "q_proj": { "group_size": { @@ -44939,7 +44939,7 @@ } }, { - "accuracy": 0.9530360379109257, + "accuracy": 0.9615252045424361, "total_bits": 92221440, "q_proj": { "group_size": { @@ -45003,7 +45003,7 @@ } }, { - "accuracy": 0.9641064432890791, + "accuracy": 0.9694810767511004, "total_bits": 95758848, "q_proj": { "group_size": { @@ -45067,7 +45067,7 @@ } }, { - "accuracy": 0.9699162048728842, + "accuracy": 0.9751400953452838, "total_bits": 112272384, "q_proj": { "group_size": { @@ -45131,7 +45131,7 @@ } }, { - "accuracy": 0.9746215462096428, + "accuracy": 0.9792133980969849, "total_bits": 132913152, "q_proj": { "group_size": { @@ -45195,7 +45195,7 @@ } }, { - "accuracy": 0.9762060763804536, + "accuracy": 0.9799468236436185, "total_bits": 132980224, "q_proj": { "group_size": { @@ -45259,7 +45259,7 @@ } }, { - "accuracy": 0.9833128412597274, + "accuracy": 0.9871044558423915, "total_bits": 169613312, "q_proj": { "group_size": { @@ -45311,7 +45311,7 @@ } }, { - "accuracy": 0.9853125596909147, + "accuracy": 0.9879715916161474, "total_bits": 169745920, "q_proj": { "group_size": { @@ -45363,7 +45363,7 @@ } }, { - "accuracy": 0.986122182227279, + "accuracy": 0.9890916360983331, "total_bits": 171195392, "q_proj": { "group_size": { @@ -45415,7 +45415,7 @@ } }, { - "accuracy": 0.9864380322396755, + "accuracy": 0.9894038999306136, "total_bits": 173563904, "q_proj": { "group_size": { @@ -45467,7 +45467,7 @@ } }, { - "accuracy": 0.987989904555051, + "accuracy": 0.9897039030972672, "total_bits": 174923264, "q_proj": { "group_size": { @@ -45531,7 +45531,7 @@ } }, { - "accuracy": 0.9885625831390682, + "accuracy": 0.9904116756086679, "total_bits": 175750144, "q_proj": { "group_size": { @@ -45595,7 +45595,7 @@ } }, { - "accuracy": 0.99064354779885, + "accuracy": 0.992263870485323, "total_bits": 179253248, "q_proj": { "group_size": { @@ -45656,7 +45656,7 @@ } }, { - "accuracy": 0.991567972704376, + "accuracy": 0.9927829971144858, "total_bits": 181592064, "q_proj": { "group_size": { @@ -45717,7 +45717,7 @@ } }, { - "accuracy": 0.9945859536283502, + "accuracy": 0.995707672322169, "total_bits": 220469248, "q_proj": { "group_size": { @@ -45778,7 +45778,7 @@ } }, { - "accuracy": 0.9956835696571752, + "accuracy": 0.9963485920973318, "total_bits": 223535104, "q_proj": { "group_size": { @@ -45839,7 +45839,7 @@ } }, { - "accuracy": 0.9955232734914476, + "accuracy": 0.9967020811316999, "total_bits": 253499392, "q_proj": { "group_size": { @@ -45891,7 +45891,7 @@ } }, { - "accuracy": 0.9978930688046507, + "accuracy": 0.9982005578196166, "total_bits": 265838592, "q_proj": { "group_size": { @@ -45943,7 +45943,7 @@ } }, { - "accuracy": 0.9988199585135781, + "accuracy": 0.9991306516856543, "total_bits": 337385472, "q_proj": { "group_size": { @@ -45995,11 +45995,11 @@ } } ], - "model.layers.23.block_sparse_moe": [ + "model.layers.23.mlp": [ { - "accuracy": 0.9204967839545325, - "total_bits": 1581846784, - "w1": { + "accuracy": 0.913319766129318, + "total_bits": 395461696, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -46014,7 +46014,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -46029,7 +46029,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "6": 32, "3": 64, @@ -46049,9 +46049,9 @@ } }, { - "accuracy": 0.9230595927097296, - "total_bits": 1636897024, - "w1": { + "accuracy": 0.9156908383102793, + "total_bits": 409224256, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -46066,7 +46066,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -46081,7 +46081,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "6": 32, "3": 64, @@ -46101,9 +46101,9 @@ } }, { - "accuracy": 0.9317785262277252, - "total_bits": 1829089280, - "w1": { + "accuracy": 0.9276150712057164, + "total_bits": 457272320, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -46118,7 +46118,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -46133,7 +46133,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "5": 32, "3": 32 @@ -46150,9 +46150,9 @@ } }, { - "accuracy": 0.9337434821615094, - "total_bits": 2051911680, - "w1": { + "accuracy": 0.9312179004283327, + "total_bits": 512977920, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -46167,7 +46167,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -46182,7 +46182,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "5": 32, "4": 32 @@ -46199,9 +46199,9 @@ } }, { - "accuracy": 0.9621659109466955, - "total_bits": 2313589120, - "w1": { + "accuracy": 0.9560798008583093, + "total_bits": 578397280, + "gate_proj": { "group_size": { "4": 128, "3": 128 @@ -46216,7 +46216,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 128, "3": 128 @@ -46231,7 +46231,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128, @@ -46251,9 +46251,9 @@ } }, { - "accuracy": 0.9652149861580447, - "total_bits": 2371489792, - "w1": { + "accuracy": 0.959484016816867, + "total_bits": 592872448, + "gate_proj": { "group_size": { "4": 32, "3": 32 @@ -46268,7 +46268,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32, "3": 32 @@ -46283,7 +46283,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 32, @@ -46303,9 +46303,9 @@ } }, { - "accuracy": 0.9683266715391686, - "total_bits": 2549817728, - "w1": { + "accuracy": 0.9645623034356456, + "total_bits": 637454432, + "gate_proj": { "group_size": { "4": 32, "3": 32 @@ -46320,7 +46320,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32, "3": 32 @@ -46335,7 +46335,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128 @@ -46352,9 +46352,9 @@ } }, { - "accuracy": 0.9811087438444558, - "total_bits": 2914965888, - "w1": { + "accuracy": 0.9771620457207686, + "total_bits": 728741472, + "gate_proj": { "group_size": { "4": 128 }, @@ -46366,7 +46366,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32 }, @@ -46378,7 +46378,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128 @@ -46395,9 +46395,9 @@ } }, { - "accuracy": 0.9826179286465049, - "total_bits": 2957905920, - "w1": { + "accuracy": 0.9790527221599692, + "total_bits": 739476480, + "gate_proj": { "group_size": { "4": 32 }, @@ -46409,7 +46409,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32 }, @@ -46421,7 +46421,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 32 @@ -46438,9 +46438,9 @@ } }, { - "accuracy": 0.981307726255373, - "total_bits": 3006173568, - "w1": { + "accuracy": 0.977660745235258, + "total_bits": 751543392, + "gate_proj": { "group_size": { "5": 128, "4": 128 @@ -46455,7 +46455,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "5": 128, "4": 128 @@ -46470,7 +46470,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "5": 128, @@ -46490,9 +46490,9 @@ } }, { - "accuracy": 0.9836539208496872, - "total_bits": 3064074240, - "w1": { + "accuracy": 0.9803529323421811, + "total_bits": 766018560, + "gate_proj": { "group_size": { "5": 32, "4": 32 @@ -46507,7 +46507,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "5": 32, "4": 32 @@ -46522,7 +46522,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "5": 32, @@ -46542,9 +46542,9 @@ } }, { - "accuracy": 0.9907920802943408, - "total_bits": 3698758016, - "w1": { + "accuracy": 0.9886469580968352, + "total_bits": 924689504, + "gate_proj": { "group_size": { "6": 128, "5": 128 @@ -46559,7 +46559,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 128, "5": 128 @@ -46574,7 +46574,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 128, @@ -46594,9 +46594,9 @@ } }, { - "accuracy": 0.9921098601955333, - "total_bits": 3756658688, - "w1": { + "accuracy": 0.9902143362468403, + "total_bits": 939164672, + "gate_proj": { "group_size": { "6": 32, "5": 32 @@ -46611,7 +46611,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 32, "5": 32 @@ -46626,7 +46626,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 32, @@ -46646,9 +46646,9 @@ } }, { - "accuracy": 0.9951306316315344, - "total_bits": 4278096256, - "w1": { + "accuracy": 0.9938194972438443, + "total_bits": 1069524064, + "gate_proj": { "group_size": { "6": 128 }, @@ -46660,7 +46660,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 128 }, @@ -46672,7 +46672,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 128 @@ -46689,9 +46689,9 @@ } }, { - "accuracy": 0.9954302021620893, - "total_bits": 4441539584, - "w1": { + "accuracy": 0.9942268141251254, + "total_bits": 1110384896, + "gate_proj": { "group_size": { "8": 128, "6": 128 @@ -46706,7 +46706,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128, "6": 128 @@ -46721,7 +46721,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128, "6": 128 @@ -46738,9 +46738,9 @@ } }, { - "accuracy": 0.9960129390421667, - "total_bits": 4839998464, - "w1": { + "accuracy": 0.9952151365210548, + "total_bits": 1209999616, + "gate_proj": { "group_size": { "8": 128, "6": 128 @@ -46755,7 +46755,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128, "6": 128 @@ -46770,7 +46770,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128 }, @@ -46784,9 +46784,9 @@ } }, { - "accuracy": 0.9987152881099304, - "total_bits": 5662082048, - "w1": { + "accuracy": 0.9983235832463068, + "total_bits": 1415520512, + "gate_proj": { "group_size": { "8": 128 }, @@ -46798,7 +46798,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128 }, @@ -46810,7 +46810,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128 }, @@ -46826,7 +46826,7 @@ ], "model.layers.24.self_attn": [ { - "accuracy": 0.9499710435537916, + "accuracy": 0.958328434315167, "total_bits": 89665536, "q_proj": { "group_size": { @@ -46890,7 +46890,7 @@ } }, { - "accuracy": 0.950662459786001, + "accuracy": 0.959648142715818, "total_bits": 92221440, "q_proj": { "group_size": { @@ -46954,7 +46954,7 @@ } }, { - "accuracy": 0.9622133842816478, + "accuracy": 0.9673294343642498, "total_bits": 95758848, "q_proj": { "group_size": { @@ -47018,7 +47018,7 @@ } }, { - "accuracy": 0.968604315072298, + "accuracy": 0.9733665142404405, "total_bits": 112272384, "q_proj": { "group_size": { @@ -47082,7 +47082,7 @@ } }, { - "accuracy": 0.9742949208930919, + "accuracy": 0.9782990733847806, "total_bits": 132913152, "q_proj": { "group_size": { @@ -47146,7 +47146,7 @@ } }, { - "accuracy": 0.9749429447478369, + "accuracy": 0.9788336875897489, "total_bits": 132980224, "q_proj": { "group_size": { @@ -47210,7 +47210,7 @@ } }, { - "accuracy": 0.98349716043786, + "accuracy": 0.9866573128261065, "total_bits": 169613312, "q_proj": { "group_size": { @@ -47262,7 +47262,7 @@ } }, { - "accuracy": 0.9839948990725373, + "accuracy": 0.9868603820275319, "total_bits": 169745920, "q_proj": { "group_size": { @@ -47314,7 +47314,7 @@ } }, { - "accuracy": 0.9858845195880062, + "accuracy": 0.988208131766633, "total_bits": 171195392, "q_proj": { "group_size": { @@ -47366,7 +47366,7 @@ } }, { - "accuracy": 0.9862351651842657, + "accuracy": 0.9887175374223214, "total_bits": 173563904, "q_proj": { "group_size": { @@ -47418,7 +47418,7 @@ } }, { - "accuracy": 0.9871628187400731, + "accuracy": 0.988913990067024, "total_bits": 174923264, "q_proj": { "group_size": { @@ -47482,7 +47482,7 @@ } }, { - "accuracy": 0.9879845070505613, + "accuracy": 0.9895531169972137, "total_bits": 175750144, "q_proj": { "group_size": { @@ -47546,7 +47546,7 @@ } }, { - "accuracy": 0.9900748384381203, + "accuracy": 0.9915702124371341, "total_bits": 179253248, "q_proj": { "group_size": { @@ -47607,7 +47607,7 @@ } }, { - "accuracy": 0.9908129974562478, + "accuracy": 0.9923516503829313, "total_bits": 181592064, "q_proj": { "group_size": { @@ -47668,7 +47668,7 @@ } }, { - "accuracy": 0.9943481932492241, + "accuracy": 0.9954335956687206, "total_bits": 220469248, "q_proj": { "group_size": { @@ -47729,7 +47729,7 @@ } }, { - "accuracy": 0.9954483362012788, + "accuracy": 0.9961771102205507, "total_bits": 223535104, "q_proj": { "group_size": { @@ -47790,7 +47790,7 @@ } }, { - "accuracy": 0.9953681989205315, + "accuracy": 0.9964663893425543, "total_bits": 253499392, "q_proj": { "group_size": { @@ -47842,7 +47842,7 @@ } }, { - "accuracy": 0.9977228532385963, + "accuracy": 0.9980023772958176, "total_bits": 265838592, "q_proj": { "group_size": { @@ -47894,7 +47894,7 @@ } }, { - "accuracy": 0.9987915921466131, + "accuracy": 0.9990517964899098, "total_bits": 337385472, "q_proj": { "group_size": { @@ -47946,11 +47946,11 @@ } } ], - "model.layers.24.block_sparse_moe": [ + "model.layers.24.mlp": [ { - "accuracy": 0.9221052409786927, - "total_bits": 1581846784, - "w1": { + "accuracy": 0.9149401540818968, + "total_bits": 395461696, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -47965,7 +47965,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -47980,7 +47980,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "6": 32, "3": 64, @@ -48000,9 +48000,9 @@ } }, { - "accuracy": 0.9246442421878639, - "total_bits": 1636897024, - "w1": { + "accuracy": 0.917277310827845, + "total_bits": 409224256, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -48017,7 +48017,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -48032,7 +48032,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "6": 32, "3": 64, @@ -48052,9 +48052,9 @@ } }, { - "accuracy": 0.9328722651851804, - "total_bits": 1829089280, - "w1": { + "accuracy": 0.9286139678013952, + "total_bits": 457272320, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -48069,7 +48069,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -48084,7 +48084,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "5": 32, "3": 32 @@ -48101,9 +48101,9 @@ } }, { - "accuracy": 0.9347451446872008, - "total_bits": 2051911680, - "w1": { + "accuracy": 0.9320637866070396, + "total_bits": 512977920, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -48118,7 +48118,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -48133,7 +48133,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "5": 32, "4": 32 @@ -48150,9 +48150,9 @@ } }, { - "accuracy": 0.962751935187139, - "total_bits": 2313589120, - "w1": { + "accuracy": 0.9568382388863125, + "total_bits": 578397280, + "gate_proj": { "group_size": { "4": 128, "3": 128 @@ -48167,7 +48167,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 128, "3": 128 @@ -48182,7 +48182,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128, @@ -48202,9 +48202,9 @@ } }, { - "accuracy": 0.9657756126436748, - "total_bits": 2371489792, - "w1": { + "accuracy": 0.9601893195588338, + "total_bits": 592872448, + "gate_proj": { "group_size": { "4": 32, "3": 32 @@ -48219,7 +48219,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32, "3": 32 @@ -48234,7 +48234,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 32, @@ -48254,9 +48254,9 @@ } }, { - "accuracy": 0.9687173461639568, - "total_bits": 2549817728, - "w1": { + "accuracy": 0.9650305011554768, + "total_bits": 637454432, + "gate_proj": { "group_size": { "4": 32, "3": 32 @@ -48271,7 +48271,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32, "3": 32 @@ -48286,7 +48286,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128 @@ -48303,9 +48303,9 @@ } }, { - "accuracy": 0.9813323744425648, - "total_bits": 2914965888, - "w1": { + "accuracy": 0.9775330322843633, + "total_bits": 728741472, + "gate_proj": { "group_size": { "4": 128 }, @@ -48317,7 +48317,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32 }, @@ -48329,7 +48329,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128 @@ -48346,9 +48346,9 @@ } }, { - "accuracy": 0.9828240042062182, - "total_bits": 2957905920, - "w1": { + "accuracy": 0.9793986510975581, + "total_bits": 739476480, + "gate_proj": { "group_size": { "4": 32 }, @@ -48360,7 +48360,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32 }, @@ -48372,7 +48372,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 32 @@ -48389,9 +48389,9 @@ } }, { - "accuracy": 0.9815501413847271, - "total_bits": 3006173568, - "w1": { + "accuracy": 0.9780310926664817, + "total_bits": 751543392, + "gate_proj": { "group_size": { "5": 128, "4": 128 @@ -48406,7 +48406,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "5": 128, "4": 128 @@ -48421,7 +48421,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "5": 128, @@ -48441,9 +48441,9 @@ } }, { - "accuracy": 0.983885766546193, - "total_bits": 3064074240, - "w1": { + "accuracy": 0.9806905463828068, + "total_bits": 766018560, + "gate_proj": { "group_size": { "5": 32, "4": 32 @@ -48458,7 +48458,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "5": 32, "4": 32 @@ -48473,7 +48473,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "5": 32, @@ -48493,9 +48493,9 @@ } }, { - "accuracy": 0.9908952838496158, - "total_bits": 3698758016, - "w1": { + "accuracy": 0.9888370308877998, + "total_bits": 924689504, + "gate_proj": { "group_size": { "6": 128, "5": 128 @@ -48510,7 +48510,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 128, "5": 128 @@ -48525,7 +48525,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 128, @@ -48545,9 +48545,9 @@ } }, { - "accuracy": 0.992202242027576, - "total_bits": 3756658688, - "w1": { + "accuracy": 0.9903789020731653, + "total_bits": 939164672, + "gate_proj": { "group_size": { "6": 32, "5": 32 @@ -48562,7 +48562,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 32, "5": 32 @@ -48577,7 +48577,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 32, @@ -48597,9 +48597,9 @@ } }, { - "accuracy": 0.9951759405728233, - "total_bits": 4278096256, - "w1": { + "accuracy": 0.9939056997996216, + "total_bits": 1069524064, + "gate_proj": { "group_size": { "6": 128 }, @@ -48611,7 +48611,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 128 }, @@ -48623,7 +48623,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 128 @@ -48640,9 +48640,9 @@ } }, { - "accuracy": 0.9954795483009595, - "total_bits": 4441539584, - "w1": { + "accuracy": 0.9943226612736716, + "total_bits": 1110384896, + "gate_proj": { "group_size": { "8": 128, "6": 128 @@ -48657,7 +48657,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128, "6": 128 @@ -48672,7 +48672,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128, "6": 128 @@ -48689,9 +48689,9 @@ } }, { - "accuracy": 0.9960276171638581, - "total_bits": 4839998464, - "w1": { + "accuracy": 0.9952651692774931, + "total_bits": 1209999616, + "gate_proj": { "group_size": { "8": 128, "6": 128 @@ -48706,7 +48706,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128, "6": 128 @@ -48721,7 +48721,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128 }, @@ -48735,9 +48735,9 @@ } }, { - "accuracy": 0.9987286387752791, - "total_bits": 5662082048, - "w1": { + "accuracy": 0.9983450298528432, + "total_bits": 1415520512, + "gate_proj": { "group_size": { "8": 128 }, @@ -48749,7 +48749,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128 }, @@ -48761,7 +48761,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128 }, @@ -48777,7 +48777,7 @@ ], "model.layers.25.self_attn": [ { - "accuracy": 0.9513595190487409, + "accuracy": 0.9566334423265959, "total_bits": 89665536, "q_proj": { "group_size": { @@ -48841,7 +48841,7 @@ } }, { - "accuracy": 0.9528722892466345, + "accuracy": 0.959297574860485, "total_bits": 92221440, "q_proj": { "group_size": { @@ -48905,7 +48905,7 @@ } }, { - "accuracy": 0.9632582637040239, + "accuracy": 0.9669618104633532, "total_bits": 95758848, "q_proj": { "group_size": { @@ -48969,7 +48969,7 @@ } }, { - "accuracy": 0.9693719534889648, + "accuracy": 0.9726681299507618, "total_bits": 112272384, "q_proj": { "group_size": { @@ -49033,7 +49033,7 @@ } }, { - "accuracy": 0.9753816164049663, + "accuracy": 0.978045946967445, "total_bits": 132913152, "q_proj": { "group_size": { @@ -49097,7 +49097,7 @@ } }, { - "accuracy": 0.9758248702671967, + "accuracy": 0.978905658384687, "total_bits": 132980224, "q_proj": { "group_size": { @@ -49161,7 +49161,7 @@ } }, { - "accuracy": 0.9832230314220253, + "accuracy": 0.9863742733570305, "total_bits": 169613312, "q_proj": { "group_size": { @@ -49213,7 +49213,7 @@ } }, { - "accuracy": 0.9844039228107584, + "accuracy": 0.9871814380350866, "total_bits": 169745920, "q_proj": { "group_size": { @@ -49265,7 +49265,7 @@ } }, { - "accuracy": 0.9860253931072197, + "accuracy": 0.9881242306314801, "total_bits": 171195392, "q_proj": { "group_size": { @@ -49317,7 +49317,7 @@ } }, { - "accuracy": 0.9864884662000757, + "accuracy": 0.9885882342871475, "total_bits": 173563904, "q_proj": { "group_size": { @@ -49369,7 +49369,7 @@ } }, { - "accuracy": 0.9873601074183458, + "accuracy": 0.9889406066534943, "total_bits": 174923264, "q_proj": { "group_size": { @@ -49433,7 +49433,7 @@ } }, { - "accuracy": 0.9883699642592355, + "accuracy": 0.9898438788460273, "total_bits": 175750144, "q_proj": { "group_size": { @@ -49497,7 +49497,7 @@ } }, { - "accuracy": 0.9902751693551085, + "accuracy": 0.9916399287206954, "total_bits": 179253248, "q_proj": { "group_size": { @@ -49558,7 +49558,7 @@ } }, { - "accuracy": 0.9910399240049484, + "accuracy": 0.9924041338411993, "total_bits": 181592064, "q_proj": { "group_size": { @@ -49619,7 +49619,7 @@ } }, { - "accuracy": 0.9944777478292388, + "accuracy": 0.9954681898050598, "total_bits": 220469248, "q_proj": { "group_size": { @@ -49680,7 +49680,7 @@ } }, { - "accuracy": 0.9956043592384575, + "accuracy": 0.9961850665215599, "total_bits": 223535104, "q_proj": { "group_size": { @@ -49741,7 +49741,7 @@ } }, { - "accuracy": 0.995558013588092, + "accuracy": 0.9965010672366541, "total_bits": 253499392, "q_proj": { "group_size": { @@ -49793,7 +49793,7 @@ } }, { - "accuracy": 0.997688677866551, + "accuracy": 0.9979520296114252, "total_bits": 265838592, "q_proj": { "group_size": { @@ -49845,7 +49845,7 @@ } }, { - "accuracy": 0.9988272385457285, + "accuracy": 0.9990427896937992, "total_bits": 337385472, "q_proj": { "group_size": { @@ -49897,11 +49897,11 @@ } } ], - "model.layers.25.block_sparse_moe": [ + "model.layers.25.mlp": [ { - "accuracy": 0.9221451184467265, - "total_bits": 1581846784, - "w1": { + "accuracy": 0.9146759376714104, + "total_bits": 395461696, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -49916,7 +49916,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -49931,7 +49931,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "6": 32, "3": 64, @@ -49951,9 +49951,9 @@ } }, { - "accuracy": 0.9246156305858964, - "total_bits": 1636897024, - "w1": { + "accuracy": 0.9170219823718071, + "total_bits": 409224256, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -49968,7 +49968,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -49983,7 +49983,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "6": 32, "3": 64, @@ -50003,9 +50003,9 @@ } }, { - "accuracy": 0.9324987399342813, - "total_bits": 1829089280, - "w1": { + "accuracy": 0.9280731132940242, + "total_bits": 457272320, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -50020,7 +50020,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -50035,7 +50035,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "5": 32, "3": 32 @@ -50052,9 +50052,9 @@ } }, { - "accuracy": 0.9342960805485123, - "total_bits": 2051911680, - "w1": { + "accuracy": 0.9314445968913405, + "total_bits": 512977920, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -50069,7 +50069,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -50084,7 +50084,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "5": 32, "4": 32 @@ -50101,9 +50101,9 @@ } }, { - "accuracy": 0.9627234475981248, - "total_bits": 2313589120, - "w1": { + "accuracy": 0.956674035148401, + "total_bits": 578397280, + "gate_proj": { "group_size": { "4": 128, "3": 128 @@ -50118,7 +50118,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 128, "3": 128 @@ -50133,7 +50133,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128, @@ -50153,9 +50153,9 @@ } }, { - "accuracy": 0.9657524878061131, - "total_bits": 2371489792, - "w1": { + "accuracy": 0.9600346340356689, + "total_bits": 592872448, + "gate_proj": { "group_size": { "4": 32, "3": 32 @@ -50170,7 +50170,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32, "3": 32 @@ -50185,7 +50185,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 32, @@ -50205,9 +50205,9 @@ } }, { - "accuracy": 0.9685872759866088, - "total_bits": 2549817728, - "w1": { + "accuracy": 0.9647565111517906, + "total_bits": 637454432, + "gate_proj": { "group_size": { "4": 32, "3": 32 @@ -50222,7 +50222,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32, "3": 32 @@ -50237,7 +50237,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128 @@ -50254,9 +50254,9 @@ } }, { - "accuracy": 0.9812466469838431, - "total_bits": 2914965888, - "w1": { + "accuracy": 0.9774213667566839, + "total_bits": 728741472, + "gate_proj": { "group_size": { "4": 128 }, @@ -50268,7 +50268,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32 }, @@ -50280,7 +50280,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128 @@ -50297,9 +50297,9 @@ } }, { - "accuracy": 0.9827800785730544, - "total_bits": 2957905920, - "w1": { + "accuracy": 0.9792972804586354, + "total_bits": 739476480, + "gate_proj": { "group_size": { "4": 32 }, @@ -50311,7 +50311,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32 }, @@ -50323,7 +50323,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 32 @@ -50340,9 +50340,9 @@ } }, { - "accuracy": 0.9815366607845614, - "total_bits": 3006173568, - "w1": { + "accuracy": 0.9779517748638203, + "total_bits": 751543392, + "gate_proj": { "group_size": { "5": 128, "4": 128 @@ -50357,7 +50357,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "5": 128, "4": 128 @@ -50372,7 +50372,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "5": 128, @@ -50392,9 +50392,9 @@ } }, { - "accuracy": 0.9838506254416547, - "total_bits": 3064074240, - "w1": { + "accuracy": 0.9806102370157054, + "total_bits": 766018560, + "gate_proj": { "group_size": { "5": 32, "4": 32 @@ -50409,7 +50409,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "5": 32, "4": 32 @@ -50424,7 +50424,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "5": 32, @@ -50444,9 +50444,9 @@ } }, { - "accuracy": 0.9908647740161732, - "total_bits": 3698758016, - "w1": { + "accuracy": 0.988792732063877, + "total_bits": 924689504, + "gate_proj": { "group_size": { "6": 128, "5": 128 @@ -50461,7 +50461,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 128, "5": 128 @@ -50476,7 +50476,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 128, @@ -50496,9 +50496,9 @@ } }, { - "accuracy": 0.9921978681691384, - "total_bits": 3756658688, - "w1": { + "accuracy": 0.9903401779746147, + "total_bits": 939164672, + "gate_proj": { "group_size": { "6": 32, "5": 32 @@ -50513,7 +50513,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 32, "5": 32 @@ -50528,7 +50528,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 32, @@ -50548,9 +50548,9 @@ } }, { - "accuracy": 0.9951590524471708, - "total_bits": 4278096256, - "w1": { + "accuracy": 0.9938898375012765, + "total_bits": 1069524064, + "gate_proj": { "group_size": { "6": 128 }, @@ -50562,7 +50562,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 128 }, @@ -50574,7 +50574,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 128 @@ -50591,9 +50591,9 @@ } }, { - "accuracy": 0.9954690486070161, - "total_bits": 4441539584, - "w1": { + "accuracy": 0.9943056339888197, + "total_bits": 1110384896, + "gate_proj": { "group_size": { "8": 128, "6": 128 @@ -50608,7 +50608,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128, "6": 128 @@ -50623,7 +50623,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128, "6": 128 @@ -50640,9 +50640,9 @@ } }, { - "accuracy": 0.9959926817958292, - "total_bits": 4839998464, - "w1": { + "accuracy": 0.995217764149665, + "total_bits": 1209999616, + "gate_proj": { "group_size": { "8": 128, "6": 128 @@ -50657,7 +50657,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128, "6": 128 @@ -50672,7 +50672,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128 }, @@ -50686,9 +50686,9 @@ } }, { - "accuracy": 0.9987237213216232, - "total_bits": 5662082048, - "w1": { + "accuracy": 0.9983432813375992, + "total_bits": 1415520512, + "gate_proj": { "group_size": { "8": 128 }, @@ -50700,7 +50700,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128 }, @@ -50712,7 +50712,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128 }, @@ -50728,7 +50728,7 @@ ], "model.layers.26.self_attn": [ { - "accuracy": 0.9486259669065475, + "accuracy": 0.9566455139337402, "total_bits": 89665536, "q_proj": { "group_size": { @@ -50792,7 +50792,7 @@ } }, { - "accuracy": 0.9516875302713168, + "accuracy": 0.9581242944849164, "total_bits": 92221440, "q_proj": { "group_size": { @@ -50856,7 +50856,7 @@ } }, { - "accuracy": 0.9609920637387979, + "accuracy": 0.966118764720465, "total_bits": 95758848, "q_proj": { "group_size": { @@ -50920,7 +50920,7 @@ } }, { - "accuracy": 0.9688636825272912, + "accuracy": 0.9730661178105756, "total_bits": 112272384, "q_proj": { "group_size": { @@ -50984,7 +50984,7 @@ } }, { - "accuracy": 0.9745490510800952, + "accuracy": 0.978104499364762, "total_bits": 132913152, "q_proj": { "group_size": { @@ -51048,7 +51048,7 @@ } }, { - "accuracy": 0.9752972233844431, + "accuracy": 0.9780851406976581, "total_bits": 132980224, "q_proj": { "group_size": { @@ -51112,7 +51112,7 @@ } }, { - "accuracy": 0.9835192375865421, + "accuracy": 0.9865694990087497, "total_bits": 169613312, "q_proj": { "group_size": { @@ -51164,7 +51164,7 @@ } }, { - "accuracy": 0.9848933926244315, + "accuracy": 0.9871383167215084, "total_bits": 169745920, "q_proj": { "group_size": { @@ -51216,7 +51216,7 @@ } }, { - "accuracy": 0.9858258776856881, + "accuracy": 0.988191818671399, "total_bits": 171195392, "q_proj": { "group_size": { @@ -51268,7 +51268,7 @@ } }, { - "accuracy": 0.9864517002807636, + "accuracy": 0.9886489273784193, "total_bits": 173563904, "q_proj": { "group_size": { @@ -51320,7 +51320,7 @@ } }, { - "accuracy": 0.9871907775339327, + "accuracy": 0.9886849663514448, "total_bits": 174923264, "q_proj": { "group_size": { @@ -51384,7 +51384,7 @@ } }, { - "accuracy": 0.9879473856601276, + "accuracy": 0.9894484154095775, "total_bits": 175750144, "q_proj": { "group_size": { @@ -51448,7 +51448,7 @@ } }, { - "accuracy": 0.9899085606203267, + "accuracy": 0.9913366238684639, "total_bits": 179253248, "q_proj": { "group_size": { @@ -51509,7 +51509,7 @@ } }, { - "accuracy": 0.9905517408379206, + "accuracy": 0.9918915066671999, "total_bits": 181592064, "q_proj": { "group_size": { @@ -51570,7 +51570,7 @@ } }, { - "accuracy": 0.9944266022771204, + "accuracy": 0.9952366673387587, "total_bits": 220469248, "q_proj": { "group_size": { @@ -51631,7 +51631,7 @@ } }, { - "accuracy": 0.9953044906356617, + "accuracy": 0.9959825560033909, "total_bits": 223535104, "q_proj": { "group_size": { @@ -51692,7 +51692,7 @@ } }, { - "accuracy": 0.9957428037458541, + "accuracy": 0.9964470160806453, "total_bits": 253499392, "q_proj": { "group_size": { @@ -51744,7 +51744,7 @@ } }, { - "accuracy": 0.9975238920213949, + "accuracy": 0.99787969605409, "total_bits": 265838592, "q_proj": { "group_size": { @@ -51796,7 +51796,7 @@ } }, { - "accuracy": 0.998822687552188, + "accuracy": 0.9990449535207039, "total_bits": 337385472, "q_proj": { "group_size": { @@ -51848,11 +51848,11 @@ } } ], - "model.layers.26.block_sparse_moe": [ + "model.layers.26.mlp": [ { - "accuracy": 0.9222756347766048, - "total_bits": 1581846784, - "w1": { + "accuracy": 0.9147281146755344, + "total_bits": 395461696, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -51867,7 +51867,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -51882,7 +51882,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "6": 32, "3": 64, @@ -51902,9 +51902,9 @@ } }, { - "accuracy": 0.9248578573920225, - "total_bits": 1636897024, - "w1": { + "accuracy": 0.9171116361884695, + "total_bits": 409224256, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -51919,7 +51919,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -51934,7 +51934,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "6": 32, "3": 64, @@ -51954,9 +51954,9 @@ } }, { - "accuracy": 0.9327644543036034, - "total_bits": 1829089280, - "w1": { + "accuracy": 0.9280182117302167, + "total_bits": 457272320, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -51971,7 +51971,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -51986,7 +51986,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "5": 32, "3": 32 @@ -52003,9 +52003,9 @@ } }, { - "accuracy": 0.9345976147604615, - "total_bits": 2051911680, - "w1": { + "accuracy": 0.9313999792855037, + "total_bits": 512977920, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -52020,7 +52020,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -52035,7 +52035,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "5": 32, "4": 32 @@ -52052,9 +52052,9 @@ } }, { - "accuracy": 0.962923229034794, - "total_bits": 2313589120, - "w1": { + "accuracy": 0.9566992338942855, + "total_bits": 578397280, + "gate_proj": { "group_size": { "4": 128, "3": 128 @@ -52069,7 +52069,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 128, "3": 128 @@ -52084,7 +52084,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128, @@ -52104,9 +52104,9 @@ } }, { - "accuracy": 0.9659108238196686, - "total_bits": 2371489792, - "w1": { + "accuracy": 0.9600689535666453, + "total_bits": 592872448, + "gate_proj": { "group_size": { "4": 32, "3": 32 @@ -52121,7 +52121,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32, "3": 32 @@ -52136,7 +52136,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 32, @@ -52156,9 +52156,9 @@ } }, { - "accuracy": 0.9687649021415334, - "total_bits": 2549817728, - "w1": { + "accuracy": 0.9647411490349393, + "total_bits": 637454432, + "gate_proj": { "group_size": { "4": 32, "3": 32 @@ -52173,7 +52173,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32, "3": 32 @@ -52188,7 +52188,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128 @@ -52205,9 +52205,9 @@ } }, { - "accuracy": 0.98142835791958, - "total_bits": 2914965888, - "w1": { + "accuracy": 0.9773980775161794, + "total_bits": 728741472, + "gate_proj": { "group_size": { "4": 128 }, @@ -52219,7 +52219,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32 }, @@ -52231,7 +52231,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128 @@ -52248,9 +52248,9 @@ } }, { - "accuracy": 0.9829370938241482, - "total_bits": 2957905920, - "w1": { + "accuracy": 0.9792801884836272, + "total_bits": 739476480, + "gate_proj": { "group_size": { "4": 32 }, @@ -52262,7 +52262,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32 }, @@ -52274,7 +52274,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 32 @@ -52291,9 +52291,9 @@ } }, { - "accuracy": 0.9817414911659924, - "total_bits": 3006173568, - "w1": { + "accuracy": 0.9779526128090525, + "total_bits": 751543392, + "gate_proj": { "group_size": { "5": 128, "4": 128 @@ -52308,7 +52308,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "5": 128, "4": 128 @@ -52323,7 +52323,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "5": 128, @@ -52343,9 +52343,9 @@ } }, { - "accuracy": 0.9840268836307683, - "total_bits": 3064074240, - "w1": { + "accuracy": 0.9806135171828302, + "total_bits": 766018560, + "gate_proj": { "group_size": { "5": 32, "4": 32 @@ -52360,7 +52360,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "5": 32, "4": 32 @@ -52375,7 +52375,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "5": 32, @@ -52395,9 +52395,9 @@ } }, { - "accuracy": 0.9910298103564664, - "total_bits": 3698758016, - "w1": { + "accuracy": 0.9887902426141265, + "total_bits": 924689504, + "gate_proj": { "group_size": { "6": 128, "5": 128 @@ -52412,7 +52412,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 128, "5": 128 @@ -52427,7 +52427,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 128, @@ -52447,9 +52447,9 @@ } }, { - "accuracy": 0.9923039041704645, - "total_bits": 3756658688, - "w1": { + "accuracy": 0.9903400298698168, + "total_bits": 939164672, + "gate_proj": { "group_size": { "6": 32, "5": 32 @@ -52464,7 +52464,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 32, "5": 32 @@ -52479,7 +52479,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 32, @@ -52499,9 +52499,9 @@ } }, { - "accuracy": 0.9952229320463774, - "total_bits": 4278096256, - "w1": { + "accuracy": 0.993870962545962, + "total_bits": 1069524064, + "gate_proj": { "group_size": { "6": 128 }, @@ -52513,7 +52513,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 128 }, @@ -52525,7 +52525,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 128 @@ -52542,9 +52542,9 @@ } }, { - "accuracy": 0.9955559315061883, - "total_bits": 4441539584, - "w1": { + "accuracy": 0.9943068589108359, + "total_bits": 1110384896, + "gate_proj": { "group_size": { "8": 128, "6": 128 @@ -52559,7 +52559,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128, "6": 128 @@ -52574,7 +52574,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128, "6": 128 @@ -52591,9 +52591,9 @@ } }, { - "accuracy": 0.9960958779372863, - "total_bits": 4839998464, - "w1": { + "accuracy": 0.9952171477611715, + "total_bits": 1209999616, + "gate_proj": { "group_size": { "8": 128, "6": 128 @@ -52608,7 +52608,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128, "6": 128 @@ -52623,7 +52623,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128 }, @@ -52637,9 +52637,9 @@ } }, { - "accuracy": 0.9987341178654644, - "total_bits": 5662082048, - "w1": { + "accuracy": 0.9983355305096331, + "total_bits": 1415520512, + "gate_proj": { "group_size": { "8": 128 }, @@ -52651,7 +52651,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128 }, @@ -52663,7 +52663,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128 }, @@ -52679,7 +52679,7 @@ ], "model.layers.27.self_attn": [ { - "accuracy": 0.9442517226071734, + "accuracy": 0.9529953975426524, "total_bits": 89665536, "q_proj": { "group_size": { @@ -52743,7 +52743,7 @@ } }, { - "accuracy": 0.9469829584030729, + "accuracy": 0.9539746520550627, "total_bits": 92221440, "q_proj": { "group_size": { @@ -52807,7 +52807,7 @@ } }, { - "accuracy": 0.9566318827044022, + "accuracy": 0.9636065997183323, "total_bits": 95758848, "q_proj": { "group_size": { @@ -52871,7 +52871,7 @@ } }, { - "accuracy": 0.964533438023768, + "accuracy": 0.970960430114677, "total_bits": 112272384, "q_proj": { "group_size": { @@ -52935,7 +52935,7 @@ } }, { - "accuracy": 0.9715799759877356, + "accuracy": 0.97689851942031, "total_bits": 132913152, "q_proj": { "group_size": { @@ -52999,7 +52999,7 @@ } }, { - "accuracy": 0.9729564628122669, + "accuracy": 0.9772981004299301, "total_bits": 132980224, "q_proj": { "group_size": { @@ -53063,7 +53063,7 @@ } }, { - "accuracy": 0.981406915540758, + "accuracy": 0.9850431690482717, "total_bits": 169613312, "q_proj": { "group_size": { @@ -53115,7 +53115,7 @@ } }, { - "accuracy": 0.9823053215973472, + "accuracy": 0.9859511236611166, "total_bits": 169745920, "q_proj": { "group_size": { @@ -53167,7 +53167,7 @@ } }, { - "accuracy": 0.9838362307238736, + "accuracy": 0.9869644216899025, "total_bits": 171195392, "q_proj": { "group_size": { @@ -53219,7 +53219,7 @@ } }, { - "accuracy": 0.9845881582188764, + "accuracy": 0.9875384545149771, "total_bits": 173563904, "q_proj": { "group_size": { @@ -53271,7 +53271,7 @@ } }, { - "accuracy": 0.9860564412451104, + "accuracy": 0.9882454060410198, "total_bits": 174923264, "q_proj": { "group_size": { @@ -53335,7 +53335,7 @@ } }, { - "accuracy": 0.9871776697568988, + "accuracy": 0.9887564918516498, "total_bits": 175750144, "q_proj": { "group_size": { @@ -53399,7 +53399,7 @@ } }, { - "accuracy": 0.9887215339725739, + "accuracy": 0.9905703542754054, "total_bits": 179253248, "q_proj": { "group_size": { @@ -53460,7 +53460,7 @@ } }, { - "accuracy": 0.9896244840010217, + "accuracy": 0.9913679245173147, "total_bits": 181592064, "q_proj": { "group_size": { @@ -53521,7 +53521,7 @@ } }, { - "accuracy": 0.9936470297704402, + "accuracy": 0.9948423995734438, "total_bits": 220469248, "q_proj": { "group_size": { @@ -53582,7 +53582,7 @@ } }, { - "accuracy": 0.9948371632857934, + "accuracy": 0.9956286184514236, "total_bits": 223535104, "q_proj": { "group_size": { @@ -53643,7 +53643,7 @@ } }, { - "accuracy": 0.9949522354566541, + "accuracy": 0.9960845137309087, "total_bits": 253499392, "q_proj": { "group_size": { @@ -53695,7 +53695,7 @@ } }, { - "accuracy": 0.9972281611132386, + "accuracy": 0.9976874942045757, "total_bits": 265838592, "q_proj": { "group_size": { @@ -53747,7 +53747,7 @@ } }, { - "accuracy": 0.9986772067832613, + "accuracy": 0.9989690801533135, "total_bits": 337385472, "q_proj": { "group_size": { @@ -53799,11 +53799,11 @@ } } ], - "model.layers.27.block_sparse_moe": [ + "model.layers.27.mlp": [ { - "accuracy": 0.9208881749134314, - "total_bits": 1581846784, - "w1": { + "accuracy": 0.9119134459056353, + "total_bits": 395461696, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -53818,7 +53818,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -53833,7 +53833,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "6": 32, "3": 64, @@ -53853,9 +53853,9 @@ } }, { - "accuracy": 0.9235014735083831, - "total_bits": 1636897024, - "w1": { + "accuracy": 0.9143923938666519, + "total_bits": 409224256, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -53870,7 +53870,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -53885,7 +53885,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "6": 32, "3": 64, @@ -53905,9 +53905,9 @@ } }, { - "accuracy": 0.9314361455801287, - "total_bits": 1829089280, - "w1": { + "accuracy": 0.9254979401042587, + "total_bits": 457272320, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -53922,7 +53922,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -53937,7 +53937,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "5": 32, "3": 32 @@ -53954,9 +53954,9 @@ } }, { - "accuracy": 0.9333208040579369, - "total_bits": 2051911680, - "w1": { + "accuracy": 0.9290329853170797, + "total_bits": 512977920, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -53971,7 +53971,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -53986,7 +53986,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "5": 32, "4": 32 @@ -54003,9 +54003,9 @@ } }, { - "accuracy": 0.9619845160724301, - "total_bits": 2313589120, - "w1": { + "accuracy": 0.9551643996842598, + "total_bits": 578397280, + "gate_proj": { "group_size": { "4": 128, "3": 128 @@ -54020,7 +54020,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 128, "3": 128 @@ -54035,7 +54035,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128, @@ -54055,9 +54055,9 @@ } }, { - "accuracy": 0.9650827256080351, - "total_bits": 2371489792, - "w1": { + "accuracy": 0.9586752536461541, + "total_bits": 592872448, + "gate_proj": { "group_size": { "4": 32, "3": 32 @@ -54072,7 +54072,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32, "3": 32 @@ -54087,7 +54087,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 32, @@ -54107,9 +54107,9 @@ } }, { - "accuracy": 0.9679535704811937, - "total_bits": 2549817728, - "w1": { + "accuracy": 0.9634748657086962, + "total_bits": 637454432, + "gate_proj": { "group_size": { "4": 32, "3": 32 @@ -54124,7 +54124,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32, "3": 32 @@ -54139,7 +54139,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128 @@ -54156,9 +54156,9 @@ } }, { - "accuracy": 0.9807602280848905, - "total_bits": 2914965888, - "w1": { + "accuracy": 0.9764762125223091, + "total_bits": 728741472, + "gate_proj": { "group_size": { "4": 128 }, @@ -54170,7 +54170,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32 }, @@ -54182,7 +54182,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128 @@ -54199,9 +54199,9 @@ } }, { - "accuracy": 0.9823299683630466, - "total_bits": 2957905920, - "w1": { + "accuracy": 0.9784798018242183, + "total_bits": 739476480, + "gate_proj": { "group_size": { "4": 32 }, @@ -54213,7 +54213,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32 }, @@ -54225,7 +54225,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 32 @@ -54242,9 +54242,9 @@ } }, { - "accuracy": 0.9811228912715849, - "total_bits": 3006173568, - "w1": { + "accuracy": 0.9771588371184311, + "total_bits": 751543392, + "gate_proj": { "group_size": { "5": 128, "4": 128 @@ -54259,7 +54259,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "5": 128, "4": 128 @@ -54274,7 +54274,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "5": 128, @@ -54294,9 +54294,9 @@ } }, { - "accuracy": 0.9835154746021879, - "total_bits": 3064074240, - "w1": { + "accuracy": 0.9799348504625653, + "total_bits": 766018560, + "gate_proj": { "group_size": { "5": 32, "4": 32 @@ -54311,7 +54311,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "5": 32, "4": 32 @@ -54326,7 +54326,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "5": 32, @@ -54346,9 +54346,9 @@ } }, { - "accuracy": 0.9906613345895159, - "total_bits": 3698758016, - "w1": { + "accuracy": 0.9883958927208656, + "total_bits": 924689504, + "gate_proj": { "group_size": { "6": 128, "5": 128 @@ -54363,7 +54363,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 128, "5": 128 @@ -54378,7 +54378,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 128, @@ -54398,9 +54398,9 @@ } }, { - "accuracy": 0.9919989972718453, - "total_bits": 3756658688, - "w1": { + "accuracy": 0.9900037541397309, + "total_bits": 939164672, + "gate_proj": { "group_size": { "6": 32, "5": 32 @@ -54415,7 +54415,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 32, "5": 32 @@ -54430,7 +54430,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 32, @@ -54450,9 +54450,9 @@ } }, { - "accuracy": 0.9949931211181378, - "total_bits": 4278096256, - "w1": { + "accuracy": 0.9936266078760749, + "total_bits": 1069524064, + "gate_proj": { "group_size": { "6": 128 }, @@ -54464,7 +54464,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 128 }, @@ -54476,7 +54476,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 128 @@ -54493,9 +54493,9 @@ } }, { - "accuracy": 0.9953539220518187, - "total_bits": 4441539584, - "w1": { + "accuracy": 0.9941124011468339, + "total_bits": 1110384896, + "gate_proj": { "group_size": { "8": 128, "6": 128 @@ -54510,7 +54510,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128, "6": 128 @@ -54525,7 +54525,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128, "6": 128 @@ -54542,9 +54542,9 @@ } }, { - "accuracy": 0.995889257566121, - "total_bits": 4839998464, - "w1": { + "accuracy": 0.9950554836051244, + "total_bits": 1209999616, + "gate_proj": { "group_size": { "8": 128, "6": 128 @@ -54559,7 +54559,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128, "6": 128 @@ -54574,7 +54574,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128 }, @@ -54588,9 +54588,9 @@ } }, { - "accuracy": 0.9986937264850559, - "total_bits": 5662082048, - "w1": { + "accuracy": 0.9982740072116844, + "total_bits": 1415520512, + "gate_proj": { "group_size": { "8": 128 }, @@ -54602,7 +54602,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128 }, @@ -54614,7 +54614,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128 }, @@ -54630,7 +54630,7 @@ ], "model.layers.28.self_attn": [ { - "accuracy": 0.9372522989778143, + "accuracy": 0.9482620282000617, "total_bits": 89665536, "q_proj": { "group_size": { @@ -54694,7 +54694,7 @@ } }, { - "accuracy": 0.9399709017261079, + "accuracy": 0.9497155005994596, "total_bits": 92221440, "q_proj": { "group_size": { @@ -54758,7 +54758,7 @@ } }, { - "accuracy": 0.953845058518805, + "accuracy": 0.9606695809450588, "total_bits": 95758848, "q_proj": { "group_size": { @@ -54822,7 +54822,7 @@ } }, { - "accuracy": 0.9635215137938136, + "accuracy": 0.9691915019954506, "total_bits": 112272384, "q_proj": { "group_size": { @@ -54886,7 +54886,7 @@ } }, { - "accuracy": 0.9690023592036021, + "accuracy": 0.973943164179984, "total_bits": 132913152, "q_proj": { "group_size": { @@ -54950,7 +54950,7 @@ } }, { - "accuracy": 0.9697745632576315, + "accuracy": 0.9744764922284767, "total_bits": 132980224, "q_proj": { "group_size": { @@ -55014,7 +55014,7 @@ } }, { - "accuracy": 0.9815021029330397, + "accuracy": 0.9851749465849838, "total_bits": 169613312, "q_proj": { "group_size": { @@ -55066,7 +55066,7 @@ } }, { - "accuracy": 0.982008520317705, + "accuracy": 0.9854659755273085, "total_bits": 169745920, "q_proj": { "group_size": { @@ -55118,7 +55118,7 @@ } }, { - "accuracy": 0.9835018401867465, + "accuracy": 0.9865795140595812, "total_bits": 171195392, "q_proj": { "group_size": { @@ -55170,7 +55170,7 @@ } }, { - "accuracy": 0.9843551027343461, + "accuracy": 0.9870419336090747, "total_bits": 173563904, "q_proj": { "group_size": { @@ -55222,7 +55222,7 @@ } }, { - "accuracy": 0.9844869119850429, + "accuracy": 0.9869199116273146, "total_bits": 174923264, "q_proj": { "group_size": { @@ -55286,7 +55286,7 @@ } }, { - "accuracy": 0.9856589606619979, + "accuracy": 0.9878577027273806, "total_bits": 175750144, "q_proj": { "group_size": { @@ -55350,7 +55350,7 @@ } }, { - "accuracy": 0.9882959338199151, + "accuracy": 0.9900523995686519, "total_bits": 179253248, "q_proj": { "group_size": { @@ -55411,7 +55411,7 @@ } }, { - "accuracy": 0.9892216198832581, + "accuracy": 0.9908103465200647, "total_bits": 181592064, "q_proj": { "group_size": { @@ -55472,7 +55472,7 @@ } }, { - "accuracy": 0.9935238386357301, + "accuracy": 0.9945393592658404, "total_bits": 220469248, "q_proj": { "group_size": { @@ -55533,7 +55533,7 @@ } }, { - "accuracy": 0.9945825652562474, + "accuracy": 0.9953787808772177, "total_bits": 223535104, "q_proj": { "group_size": { @@ -55594,7 +55594,7 @@ } }, { - "accuracy": 0.9950757999046657, + "accuracy": 0.9960174795705825, "total_bits": 253499392, "q_proj": { "group_size": { @@ -55646,7 +55646,7 @@ } }, { - "accuracy": 0.9972921312345486, + "accuracy": 0.9976649379945899, "total_bits": 265838592, "q_proj": { "group_size": { @@ -55698,7 +55698,7 @@ } }, { - "accuracy": 0.998667761125896, + "accuracy": 0.9989359679166228, "total_bits": 337385472, "q_proj": { "group_size": { @@ -55750,11 +55750,11 @@ } } ], - "model.layers.28.block_sparse_moe": [ + "model.layers.28.mlp": [ { - "accuracy": 0.9171708159540829, - "total_bits": 1581846784, - "w1": { + "accuracy": 0.907466558641509, + "total_bits": 395461696, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -55769,7 +55769,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -55784,7 +55784,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "6": 32, "3": 64, @@ -55804,9 +55804,9 @@ } }, { - "accuracy": 0.9199309633358529, - "total_bits": 1636897024, - "w1": { + "accuracy": 0.9101024356327558, + "total_bits": 409224256, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -55821,7 +55821,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -55836,7 +55836,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "6": 32, "3": 64, @@ -55856,9 +55856,9 @@ } }, { - "accuracy": 0.9283658834664446, - "total_bits": 1829089280, - "w1": { + "accuracy": 0.9217034925363565, + "total_bits": 457272320, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -55873,7 +55873,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -55888,7 +55888,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "5": 32, "3": 32 @@ -55905,9 +55905,9 @@ } }, { - "accuracy": 0.9304677865615016, - "total_bits": 2051911680, - "w1": { + "accuracy": 0.9255249339498972, + "total_bits": 512977920, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -55922,7 +55922,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -55937,7 +55937,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "5": 32, "4": 32 @@ -55954,9 +55954,9 @@ } }, { - "accuracy": 0.9604277128451749, - "total_bits": 2313589120, - "w1": { + "accuracy": 0.9529471339559868, + "total_bits": 578397280, + "gate_proj": { "group_size": { "4": 128, "3": 128 @@ -55971,7 +55971,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 128, "3": 128 @@ -55986,7 +55986,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128, @@ -56006,9 +56006,9 @@ } }, { - "accuracy": 0.9636236376276142, - "total_bits": 2371489792, - "w1": { + "accuracy": 0.9565911702811718, + "total_bits": 592872448, + "gate_proj": { "group_size": { "4": 32, "3": 32 @@ -56023,7 +56023,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32, "3": 32 @@ -56038,7 +56038,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 32, @@ -56058,9 +56058,9 @@ } }, { - "accuracy": 0.9667280017349281, - "total_bits": 2549817728, - "w1": { + "accuracy": 0.9616312292453489, + "total_bits": 637454432, + "gate_proj": { "group_size": { "4": 32, "3": 32 @@ -56075,7 +56075,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32, "3": 32 @@ -56090,7 +56090,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128 @@ -56107,9 +56107,9 @@ } }, { - "accuracy": 0.9800177510631712, - "total_bits": 2914965888, - "w1": { + "accuracy": 0.9752057723112797, + "total_bits": 728741472, + "gate_proj": { "group_size": { "4": 128 }, @@ -56121,7 +56121,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32 }, @@ -56133,7 +56133,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128 @@ -56150,9 +56150,9 @@ } }, { - "accuracy": 0.9816904697861326, - "total_bits": 2957905920, - "w1": { + "accuracy": 0.9773073797849449, + "total_bits": 739476480, + "gate_proj": { "group_size": { "4": 32 }, @@ -56164,7 +56164,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32 }, @@ -56176,7 +56176,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 32 @@ -56193,9 +56193,9 @@ } }, { - "accuracy": 0.9805088709843787, - "total_bits": 3006173568, - "w1": { + "accuracy": 0.9760507405490467, + "total_bits": 751543392, + "gate_proj": { "group_size": { "5": 128, "4": 128 @@ -56210,7 +56210,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "5": 128, "4": 128 @@ -56225,7 +56225,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "5": 128, @@ -56245,9 +56245,9 @@ } }, { - "accuracy": 0.9829444338224436, - "total_bits": 3064074240, - "w1": { + "accuracy": 0.9789289100781867, + "total_bits": 766018560, + "gate_proj": { "group_size": { "5": 32, "4": 32 @@ -56262,7 +56262,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "5": 32, "4": 32 @@ -56277,7 +56277,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "5": 32, @@ -56297,9 +56297,9 @@ } }, { - "accuracy": 0.9904319891510042, - "total_bits": 3698758016, - "w1": { + "accuracy": 0.9878254177931108, + "total_bits": 924689504, + "gate_proj": { "group_size": { "6": 128, "5": 128 @@ -56314,7 +56314,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 128, "5": 128 @@ -56329,7 +56329,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 128, @@ -56349,9 +56349,9 @@ } }, { - "accuracy": 0.9917925807500356, - "total_bits": 3756658688, - "w1": { + "accuracy": 0.9895025102764761, + "total_bits": 939164672, + "gate_proj": { "group_size": { "6": 32, "5": 32 @@ -56366,7 +56366,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 32, "5": 32 @@ -56381,7 +56381,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 32, @@ -56401,9 +56401,9 @@ } }, { - "accuracy": 0.9948608006816357, - "total_bits": 4278096256, - "w1": { + "accuracy": 0.9932841452720919, + "total_bits": 1069524064, + "gate_proj": { "group_size": { "6": 128 }, @@ -56415,7 +56415,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 128 }, @@ -56427,7 +56427,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 128 @@ -56444,9 +56444,9 @@ } }, { - "accuracy": 0.9952604927269644, - "total_bits": 4441539584, - "w1": { + "accuracy": 0.9938231445113687, + "total_bits": 1110384896, + "gate_proj": { "group_size": { "8": 128, "6": 128 @@ -56461,7 +56461,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128, "6": 128 @@ -56476,7 +56476,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128, "6": 128 @@ -56493,9 +56493,9 @@ } }, { - "accuracy": 0.9958560369590199, - "total_bits": 4839998464, - "w1": { + "accuracy": 0.9948247685464785, + "total_bits": 1209999616, + "gate_proj": { "group_size": { "8": 128, "6": 128 @@ -56510,7 +56510,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128, "6": 128 @@ -56525,7 +56525,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128 }, @@ -56539,9 +56539,9 @@ } }, { - "accuracy": 0.9986312081332711, - "total_bits": 5662082048, - "w1": { + "accuracy": 0.9981694209303609, + "total_bits": 1415520512, + "gate_proj": { "group_size": { "8": 128 }, @@ -56553,7 +56553,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128 }, @@ -56565,7 +56565,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128 }, @@ -56581,7 +56581,7 @@ ], "model.layers.29.self_attn": [ { - "accuracy": 0.9399542720302155, + "accuracy": 0.9486677319203553, "total_bits": 89665536, "q_proj": { "group_size": { @@ -56645,7 +56645,7 @@ } }, { - "accuracy": 0.9415980632367887, + "accuracy": 0.950985316775347, "total_bits": 92221440, "q_proj": { "group_size": { @@ -56709,7 +56709,7 @@ } }, { - "accuracy": 0.9550248304087865, + "accuracy": 0.9608922221354748, "total_bits": 95758848, "q_proj": { "group_size": { @@ -56773,7 +56773,7 @@ } }, { - "accuracy": 0.9628188687523729, + "accuracy": 0.9681620175313008, "total_bits": 112272384, "q_proj": { "group_size": { @@ -56837,7 +56837,7 @@ } }, { - "accuracy": 0.9701426660543994, + "accuracy": 0.9745559038496331, "total_bits": 132913152, "q_proj": { "group_size": { @@ -56901,7 +56901,7 @@ } }, { - "accuracy": 0.9701379417000633, + "accuracy": 0.9748296869036398, "total_bits": 132980224, "q_proj": { "group_size": { @@ -56965,7 +56965,7 @@ } }, { - "accuracy": 0.9820624949704659, + "accuracy": 0.9851639794283792, "total_bits": 169613312, "q_proj": { "group_size": { @@ -57017,7 +57017,7 @@ } }, { - "accuracy": 0.982790415518378, + "accuracy": 0.9854088360933881, "total_bits": 169745920, "q_proj": { "group_size": { @@ -57069,7 +57069,7 @@ } }, { - "accuracy": 0.9843628367801246, + "accuracy": 0.9866753707786924, "total_bits": 171195392, "q_proj": { "group_size": { @@ -57121,7 +57121,7 @@ } }, { - "accuracy": 0.984925673735377, + "accuracy": 0.9871332803250927, "total_bits": 173563904, "q_proj": { "group_size": { @@ -57173,7 +57173,7 @@ } }, { - "accuracy": 0.9853824486857966, + "accuracy": 0.9869443939410543, "total_bits": 174923264, "q_proj": { "group_size": { @@ -57237,7 +57237,7 @@ } }, { - "accuracy": 0.9860384722093218, + "accuracy": 0.9879518838011121, "total_bits": 175750144, "q_proj": { "group_size": { @@ -57301,7 +57301,7 @@ } }, { - "accuracy": 0.9884231762078247, + "accuracy": 0.9901009005200314, "total_bits": 179253248, "q_proj": { "group_size": { @@ -57362,7 +57362,7 @@ } }, { - "accuracy": 0.9896586935133919, + "accuracy": 0.9909052236103698, "total_bits": 181592064, "q_proj": { "group_size": { @@ -57423,7 +57423,7 @@ } }, { - "accuracy": 0.9938314549722954, + "accuracy": 0.9946467591278059, "total_bits": 220469248, "q_proj": { "group_size": { @@ -57484,7 +57484,7 @@ } }, { - "accuracy": 0.9948071711451599, + "accuracy": 0.995431797284829, "total_bits": 223535104, "q_proj": { "group_size": { @@ -57545,7 +57545,7 @@ } }, { - "accuracy": 0.9953192318789661, + "accuracy": 0.9960963177298637, "total_bits": 253499392, "q_proj": { "group_size": { @@ -57597,7 +57597,7 @@ } }, { - "accuracy": 0.9972958471258416, + "accuracy": 0.9976569103371156, "total_bits": 265838592, "q_proj": { "group_size": { @@ -57649,7 +57649,7 @@ } }, { - "accuracy": 0.9987246007903626, + "accuracy": 0.9989515731852551, "total_bits": 337385472, "q_proj": { "group_size": { @@ -57701,11 +57701,11 @@ } } ], - "model.layers.29.block_sparse_moe": [ + "model.layers.29.mlp": [ { - "accuracy": 0.9114951401164657, - "total_bits": 1581846784, - "w1": { + "accuracy": 0.9011524323570101, + "total_bits": 395461696, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -57720,7 +57720,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -57735,7 +57735,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "6": 32, "3": 64, @@ -57755,9 +57755,9 @@ } }, { - "accuracy": 0.9145057397453409, - "total_bits": 1636897024, - "w1": { + "accuracy": 0.9040032927142947, + "total_bits": 409224256, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -57772,7 +57772,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -57787,7 +57787,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "6": 32, "3": 64, @@ -57807,9 +57807,9 @@ } }, { - "accuracy": 0.9236511982193119, - "total_bits": 1829089280, - "w1": { + "accuracy": 0.916344798513149, + "total_bits": 457272320, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -57824,7 +57824,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -57839,7 +57839,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "5": 32, "3": 32 @@ -57856,9 +57856,9 @@ } }, { - "accuracy": 0.9259848980919311, - "total_bits": 2051911680, - "w1": { + "accuracy": 0.9204414649621436, + "total_bits": 512977920, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -57873,7 +57873,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -57888,7 +57888,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "5": 32, "4": 32 @@ -57905,9 +57905,9 @@ } }, { - "accuracy": 0.9569801417620558, - "total_bits": 2313589120, - "w1": { + "accuracy": 0.9491528554966575, + "total_bits": 578397280, + "gate_proj": { "group_size": { "4": 128, "3": 128 @@ -57922,7 +57922,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 128, "3": 128 @@ -57937,7 +57937,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128, @@ -57957,9 +57957,9 @@ } }, { - "accuracy": 0.9608506985792988, - "total_bits": 2371489792, - "w1": { + "accuracy": 0.9534615783118888, + "total_bits": 592872448, + "gate_proj": { "group_size": { "4": 32, "3": 32 @@ -57974,7 +57974,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32, "3": 32 @@ -57989,7 +57989,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 32, @@ -58009,9 +58009,9 @@ } }, { - "accuracy": 0.9642969829667556, - "total_bits": 2549817728, - "w1": { + "accuracy": 0.9589279125395574, + "total_bits": 637454432, + "gate_proj": { "group_size": { "4": 32, "3": 32 @@ -58026,7 +58026,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32, "3": 32 @@ -58041,7 +58041,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128 @@ -58058,9 +58058,9 @@ } }, { - "accuracy": 0.9780479350470399, - "total_bits": 2914965888, - "w1": { + "accuracy": 0.9731179328733369, + "total_bits": 728741472, + "gate_proj": { "group_size": { "4": 128 }, @@ -58072,7 +58072,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32 }, @@ -58084,7 +58084,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128 @@ -58101,9 +58101,9 @@ } }, { - "accuracy": 0.9799381740097153, - "total_bits": 2957905920, - "w1": { + "accuracy": 0.9754466209560633, + "total_bits": 739476480, + "gate_proj": { "group_size": { "4": 32 }, @@ -58115,7 +58115,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32 }, @@ -58127,7 +58127,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 32 @@ -58144,9 +58144,9 @@ } }, { - "accuracy": 0.978538320154736, - "total_bits": 3006173568, - "w1": { + "accuracy": 0.973927722361527, + "total_bits": 751543392, + "gate_proj": { "group_size": { "5": 128, "4": 128 @@ -58161,7 +58161,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "5": 128, "4": 128 @@ -58176,7 +58176,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "5": 128, @@ -58196,9 +58196,9 @@ } }, { - "accuracy": 0.9814430636010671, - "total_bits": 3064074240, - "w1": { + "accuracy": 0.9772543269944819, + "total_bits": 766018560, + "gate_proj": { "group_size": { "5": 32, "4": 32 @@ -58213,7 +58213,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "5": 32, "4": 32 @@ -58228,7 +58228,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "5": 32, @@ -58248,9 +58248,9 @@ } }, { - "accuracy": 0.9893465798936392, - "total_bits": 3698758016, - "w1": { + "accuracy": 0.9866772730295595, + "total_bits": 924689504, + "gate_proj": { "group_size": { "6": 128, "5": 128 @@ -58265,7 +58265,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 128, "5": 128 @@ -58280,7 +58280,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 128, @@ -58300,9 +58300,9 @@ } }, { - "accuracy": 0.9910064476885294, - "total_bits": 3756658688, - "w1": { + "accuracy": 0.9886375218874922, + "total_bits": 939164672, + "gate_proj": { "group_size": { "6": 32, "5": 32 @@ -58317,7 +58317,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 32, "5": 32 @@ -58332,7 +58332,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 32, @@ -58352,9 +58352,9 @@ } }, { - "accuracy": 0.9941166113375833, - "total_bits": 4278096256, - "w1": { + "accuracy": 0.9925338110494378, + "total_bits": 1069524064, + "gate_proj": { "group_size": { "6": 128 }, @@ -58366,7 +58366,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 128 }, @@ -58378,7 +58378,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 128 @@ -58395,9 +58395,9 @@ } }, { - "accuracy": 0.9947050358413866, - "total_bits": 4441539584, - "w1": { + "accuracy": 0.9932456227313531, + "total_bits": 1110384896, + "gate_proj": { "group_size": { "8": 128, "6": 128 @@ -58412,7 +58412,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128, "6": 128 @@ -58427,7 +58427,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128, "6": 128 @@ -58444,9 +58444,9 @@ } }, { - "accuracy": 0.9953489905639895, - "total_bits": 4839998464, - "w1": { + "accuracy": 0.9943286007588827, + "total_bits": 1209999616, + "gate_proj": { "group_size": { "8": 128, "6": 128 @@ -58461,7 +58461,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128, "6": 128 @@ -58476,7 +58476,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128 }, @@ -58490,9 +58490,9 @@ } }, { - "accuracy": 0.9984351548329485, - "total_bits": 5662082048, - "w1": { + "accuracy": 0.9979788892363247, + "total_bits": 1415520512, + "gate_proj": { "group_size": { "8": 128 }, @@ -58504,7 +58504,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128 }, @@ -58516,7 +58516,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128 }, @@ -58532,7 +58532,7 @@ ], "model.layers.30.self_attn": [ { - "accuracy": 0.9307091906666756, + "accuracy": 0.9420245332937491, "total_bits": 89665536, "q_proj": { "group_size": { @@ -58596,7 +58596,7 @@ } }, { - "accuracy": 0.9331352622493317, + "accuracy": 0.9434712950728441, "total_bits": 92221440, "q_proj": { "group_size": { @@ -58660,7 +58660,7 @@ } }, { - "accuracy": 0.9472953494834273, + "accuracy": 0.953885415078778, "total_bits": 95758848, "q_proj": { "group_size": { @@ -58724,7 +58724,7 @@ } }, { - "accuracy": 0.9573088438299141, + "accuracy": 0.9624906360710922, "total_bits": 112272384, "q_proj": { "group_size": { @@ -58788,7 +58788,7 @@ } }, { - "accuracy": 0.9656897741124818, + "accuracy": 0.9706650646893602, "total_bits": 132913152, "q_proj": { "group_size": { @@ -58852,7 +58852,7 @@ } }, { - "accuracy": 0.9664703729121309, + "accuracy": 0.9713480981360925, "total_bits": 132980224, "q_proj": { "group_size": { @@ -58916,7 +58916,7 @@ } }, { - "accuracy": 0.9791824818147641, + "accuracy": 0.9829938919528535, "total_bits": 169613312, "q_proj": { "group_size": { @@ -58968,7 +58968,7 @@ } }, { - "accuracy": 0.9800174725977214, + "accuracy": 0.9833401454318511, "total_bits": 169745920, "q_proj": { "group_size": { @@ -59020,7 +59020,7 @@ } }, { - "accuracy": 0.9816427059765709, + "accuracy": 0.9844484847823256, "total_bits": 171195392, "q_proj": { "group_size": { @@ -59072,7 +59072,7 @@ } }, { - "accuracy": 0.9825277818170818, + "accuracy": 0.985214173205589, "total_bits": 173563904, "q_proj": { "group_size": { @@ -59124,7 +59124,7 @@ } }, { - "accuracy": 0.9831471301027035, + "accuracy": 0.9854797402788934, "total_bits": 174923264, "q_proj": { "group_size": { @@ -59188,7 +59188,7 @@ } }, { - "accuracy": 0.9837248535630735, + "accuracy": 0.9864956716467675, "total_bits": 175750144, "q_proj": { "group_size": { @@ -59252,7 +59252,7 @@ } }, { - "accuracy": 0.9870805295772458, + "accuracy": 0.988716923901321, "total_bits": 179253248, "q_proj": { "group_size": { @@ -59313,7 +59313,7 @@ } }, { - "accuracy": 0.9878849405795336, + "accuracy": 0.989641750770572, "total_bits": 181592064, "q_proj": { "group_size": { @@ -59374,7 +59374,7 @@ } }, { - "accuracy": 0.992843183566277, + "accuracy": 0.9939114387196145, "total_bits": 220469248, "q_proj": { "group_size": { @@ -59435,7 +59435,7 @@ } }, { - "accuracy": 0.9940619997091984, + "accuracy": 0.9948442267915724, "total_bits": 223535104, "q_proj": { "group_size": { @@ -59496,7 +59496,7 @@ } }, { - "accuracy": 0.9945325405876103, + "accuracy": 0.995559303825231, "total_bits": 253499392, "q_proj": { "group_size": { @@ -59548,7 +59548,7 @@ } }, { - "accuracy": 0.9969194934115206, + "accuracy": 0.9972948665779672, "total_bits": 265838592, "q_proj": { "group_size": { @@ -59600,7 +59600,7 @@ } }, { - "accuracy": 0.99853286201649, + "accuracy": 0.9987922162951068, "total_bits": 337385472, "q_proj": { "group_size": { @@ -59652,11 +59652,11 @@ } } ], - "model.layers.30.block_sparse_moe": [ + "model.layers.30.mlp": [ { - "accuracy": 0.8995548851395908, - "total_bits": 1581846784, - "w1": { + "accuracy": 0.8889711016887113, + "total_bits": 395461696, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -59671,7 +59671,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -59686,7 +59686,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "6": 32, "3": 64, @@ -59706,9 +59706,9 @@ } }, { - "accuracy": 0.9034415197215582, - "total_bits": 1636897024, - "w1": { + "accuracy": 0.8925078075967336, + "total_bits": 409224256, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -59723,7 +59723,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -59738,7 +59738,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "6": 32, "3": 64, @@ -59758,9 +59758,9 @@ } }, { - "accuracy": 0.9131332855475576, - "total_bits": 1829089280, - "w1": { + "accuracy": 0.9052478004443019, + "total_bits": 457272320, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -59775,7 +59775,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -59790,7 +59790,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "5": 32, "3": 32 @@ -59807,9 +59807,9 @@ } }, { - "accuracy": 0.9155313472606634, - "total_bits": 2051911680, - "w1": { + "accuracy": 0.9094294533133507, + "total_bits": 512977920, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -59824,7 +59824,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -59839,7 +59839,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "5": 32, "4": 32 @@ -59856,9 +59856,9 @@ } }, { - "accuracy": 0.9499536980139582, - "total_bits": 2313589120, - "w1": { + "accuracy": 0.9417006463596695, + "total_bits": 578397280, + "gate_proj": { "group_size": { "4": 128, "3": 128 @@ -59873,7 +59873,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 128, "3": 128 @@ -59888,7 +59888,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128, @@ -59908,9 +59908,9 @@ } }, { - "accuracy": 0.9558768392001328, - "total_bits": 2371489792, - "w1": { + "accuracy": 0.9478225972699492, + "total_bits": 592872448, + "gate_proj": { "group_size": { "4": 32, "3": 32 @@ -59925,7 +59925,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32, "3": 32 @@ -59940,7 +59940,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 32, @@ -59960,9 +59960,9 @@ } }, { - "accuracy": 0.9595174865895196, - "total_bits": 2549817728, - "w1": { + "accuracy": 0.9536097561450381, + "total_bits": 637454432, + "gate_proj": { "group_size": { "4": 32, "3": 32 @@ -59977,7 +59977,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32, "3": 32 @@ -59992,7 +59992,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128 @@ -60009,9 +60009,9 @@ } }, { - "accuracy": 0.975280621530194, - "total_bits": 2914965888, - "w1": { + "accuracy": 0.9698069707343453, + "total_bits": 728741472, + "gate_proj": { "group_size": { "4": 128 }, @@ -60023,7 +60023,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32 }, @@ -60035,7 +60035,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128 @@ -60052,9 +60052,9 @@ } }, { - "accuracy": 0.9773026373433439, - "total_bits": 2957905920, - "w1": { + "accuracy": 0.9723663945731363, + "total_bits": 739476480, + "gate_proj": { "group_size": { "4": 32 }, @@ -60066,7 +60066,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32 }, @@ -60078,7 +60078,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 32 @@ -60095,9 +60095,9 @@ } }, { - "accuracy": 0.9749600906905375, - "total_bits": 3006173568, - "w1": { + "accuracy": 0.9698809748025317, + "total_bits": 751543392, + "gate_proj": { "group_size": { "5": 128, "4": 128 @@ -60112,7 +60112,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "5": 128, "4": 128 @@ -60127,7 +60127,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "5": 128, @@ -60147,9 +60147,9 @@ } }, { - "accuracy": 0.9791758556016966, - "total_bits": 3064074240, - "w1": { + "accuracy": 0.9744535272842959, + "total_bits": 766018560, + "gate_proj": { "group_size": { "5": 32, "4": 32 @@ -60164,7 +60164,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "5": 32, "4": 32 @@ -60179,7 +60179,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "5": 32, @@ -60199,9 +60199,9 @@ } }, { - "accuracy": 0.987613779982846, - "total_bits": 3698758016, - "w1": { + "accuracy": 0.9845811534967077, + "total_bits": 924689504, + "gate_proj": { "group_size": { "6": 128, "5": 128 @@ -60216,7 +60216,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 128, "5": 128 @@ -60231,7 +60231,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 128, @@ -60251,9 +60251,9 @@ } }, { - "accuracy": 0.9899315809291837, - "total_bits": 3756658688, - "w1": { + "accuracy": 0.9872018207648867, + "total_bits": 939164672, + "gate_proj": { "group_size": { "6": 32, "5": 32 @@ -60268,7 +60268,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 32, "5": 32 @@ -60283,7 +60283,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 32, @@ -60303,9 +60303,9 @@ } }, { - "accuracy": 0.9932172610914629, - "total_bits": 4278096256, - "w1": { + "accuracy": 0.9913914405838832, + "total_bits": 1069524064, + "gate_proj": { "group_size": { "6": 128 }, @@ -60317,7 +60317,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 128 }, @@ -60329,7 +60329,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 128 @@ -60346,9 +60346,9 @@ } }, { - "accuracy": 0.9938334694965497, - "total_bits": 4441539584, - "w1": { + "accuracy": 0.9921281921775326, + "total_bits": 1110384896, + "gate_proj": { "group_size": { "8": 128, "6": 128 @@ -60363,7 +60363,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128, "6": 128 @@ -60378,7 +60378,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128, "6": 128 @@ -60395,9 +60395,9 @@ } }, { - "accuracy": 0.9944921806454659, - "total_bits": 4839998464, - "w1": { + "accuracy": 0.9932570086399976, + "total_bits": 1209999616, + "gate_proj": { "group_size": { "8": 128, "6": 128 @@ -60412,7 +60412,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128, "6": 128 @@ -60427,7 +60427,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128 }, @@ -60441,9 +60441,9 @@ } }, { - "accuracy": 0.9981943596963232, - "total_bits": 5662082048, - "w1": { + "accuracy": 0.9976164725498835, + "total_bits": 1415520512, + "gate_proj": { "group_size": { "8": 128 }, @@ -60455,7 +60455,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128 }, @@ -60467,7 +60467,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128 }, @@ -60483,7 +60483,7 @@ ], "model.layers.31.self_attn": [ { - "accuracy": 0.9271599058257907, + "accuracy": 0.9392062692265761, "total_bits": 89665536, "q_proj": { "group_size": { @@ -60547,7 +60547,7 @@ } }, { - "accuracy": 0.9318617582321167, + "accuracy": 0.9426114706224517, "total_bits": 92221440, "q_proj": { "group_size": { @@ -60611,7 +60611,7 @@ } }, { - "accuracy": 0.9461133338903126, + "accuracy": 0.9518489953326552, "total_bits": 95758848, "q_proj": { "group_size": { @@ -60675,7 +60675,7 @@ } }, { - "accuracy": 0.9602390233623355, + "accuracy": 0.9655042531851091, "total_bits": 112272384, "q_proj": { "group_size": { @@ -60739,7 +60739,7 @@ } }, { - "accuracy": 0.9655896744837886, + "accuracy": 0.9696774345479513, "total_bits": 132913152, "q_proj": { "group_size": { @@ -60803,7 +60803,7 @@ } }, { - "accuracy": 0.9663463224117693, + "accuracy": 0.9703531589751181, "total_bits": 132980224, "q_proj": { "group_size": { @@ -60867,7 +60867,7 @@ } }, { - "accuracy": 0.9783107272692417, + "accuracy": 0.9816436835221554, "total_bits": 169613312, "q_proj": { "group_size": { @@ -60919,7 +60919,7 @@ } }, { - "accuracy": 0.9791976095814454, + "accuracy": 0.9821891076862812, "total_bits": 169745920, "q_proj": { "group_size": { @@ -60971,7 +60971,7 @@ } }, { - "accuracy": 0.980646914067237, + "accuracy": 0.9835320717998242, "total_bits": 171195392, "q_proj": { "group_size": { @@ -61023,7 +61023,7 @@ } }, { - "accuracy": 0.9814471597048012, + "accuracy": 0.9843084864612472, "total_bits": 173563904, "q_proj": { "group_size": { @@ -61075,7 +61075,7 @@ } }, { - "accuracy": 0.9825099036587697, + "accuracy": 0.9850082155807238, "total_bits": 174923264, "q_proj": { "group_size": { @@ -61139,7 +61139,7 @@ } }, { - "accuracy": 0.9835082461175165, + "accuracy": 0.9857731914441836, "total_bits": 175750144, "q_proj": { "group_size": { @@ -61203,7 +61203,7 @@ } }, { - "accuracy": 0.9860568437725306, + "accuracy": 0.9877125305662814, "total_bits": 179253248, "q_proj": { "group_size": { @@ -61264,7 +61264,7 @@ } }, { - "accuracy": 0.9872314710366098, + "accuracy": 0.9885860363658714, "total_bits": 181592064, "q_proj": { "group_size": { @@ -61325,7 +61325,7 @@ } }, { - "accuracy": 0.9924810869423183, + "accuracy": 0.9933305377336709, "total_bits": 220469248, "q_proj": { "group_size": { @@ -61386,7 +61386,7 @@ } }, { - "accuracy": 0.9935416606205859, + "accuracy": 0.9942430182433638, "total_bits": 223535104, "q_proj": { "group_size": { @@ -61447,7 +61447,7 @@ } }, { - "accuracy": 0.9943635753895107, + "accuracy": 0.9951480827661917, "total_bits": 253499392, "q_proj": { "group_size": { @@ -61499,7 +61499,7 @@ } }, { - "accuracy": 0.9966034654552411, + "accuracy": 0.996929424974185, "total_bits": 265838592, "q_proj": { "group_size": { @@ -61551,7 +61551,7 @@ } }, { - "accuracy": 0.9984699864749258, + "accuracy": 0.998694088597978, "total_bits": 337385472, "q_proj": { "group_size": { @@ -61603,11 +61603,11 @@ } } ], - "model.layers.31.block_sparse_moe": [ + "model.layers.31.mlp": [ { - "accuracy": 0.8714894950389862, - "total_bits": 1581846784, - "w1": { + "accuracy": 0.8689838055717318, + "total_bits": 395461696, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -61622,7 +61622,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -61637,7 +61637,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "6": 32, "3": 64, @@ -61657,9 +61657,9 @@ } }, { - "accuracy": 0.8760565417377573, - "total_bits": 1636897024, - "w1": { + "accuracy": 0.8744133506950579, + "total_bits": 409224256, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -61674,7 +61674,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -61689,7 +61689,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "6": 32, "3": 64, @@ -61709,9 +61709,9 @@ } }, { - "accuracy": 0.886748741920057, - "total_bits": 1829089280, - "w1": { + "accuracy": 0.8858012404096753, + "total_bits": 457272320, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -61726,7 +61726,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -61741,7 +61741,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "5": 32, "3": 32 @@ -61758,9 +61758,9 @@ } }, { - "accuracy": 0.8896532644958872, - "total_bits": 2051911680, - "w1": { + "accuracy": 0.8897688961342761, + "total_bits": 512977920, + "gate_proj": { "group_size": { "3": 64, "2": 64 @@ -61775,7 +61775,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "3": 64, "2": 64 @@ -61790,7 +61790,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "5": 32, "4": 32 @@ -61807,9 +61807,9 @@ } }, { - "accuracy": 0.9353843307808826, - "total_bits": 2313589120, - "w1": { + "accuracy": 0.9366726726293564, + "total_bits": 578397280, + "gate_proj": { "group_size": { "4": 128, "3": 128 @@ -61824,7 +61824,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 128, "3": 128 @@ -61839,7 +61839,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128, @@ -61859,9 +61859,9 @@ } }, { - "accuracy": 0.9432532896140688, - "total_bits": 2371489792, - "w1": { + "accuracy": 0.943887605674957, + "total_bits": 592872448, + "gate_proj": { "group_size": { "4": 32, "3": 32 @@ -61876,7 +61876,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32, "3": 32 @@ -61891,7 +61891,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 32, @@ -61911,9 +61911,9 @@ } }, { - "accuracy": 0.9471299073805934, - "total_bits": 2549817728, - "w1": { + "accuracy": 0.9489450508630589, + "total_bits": 637454432, + "gate_proj": { "group_size": { "4": 32, "3": 32 @@ -61928,7 +61928,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32, "3": 32 @@ -61943,7 +61943,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128 @@ -61960,9 +61960,9 @@ } }, { - "accuracy": 0.9675060651804271, - "total_bits": 2914965888, - "w1": { + "accuracy": 0.9671246874097147, + "total_bits": 728741472, + "gate_proj": { "group_size": { "4": 128 }, @@ -61974,7 +61974,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32 }, @@ -61986,7 +61986,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 128 @@ -62003,9 +62003,9 @@ } }, { - "accuracy": 0.9702967721478719, - "total_bits": 2957905920, - "w1": { + "accuracy": 0.9698382914066315, + "total_bits": 739476480, + "gate_proj": { "group_size": { "4": 32 }, @@ -62017,7 +62017,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "4": 32 }, @@ -62029,7 +62029,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "4": 32 @@ -62046,9 +62046,9 @@ } }, { - "accuracy": 0.9674451356067469, - "total_bits": 3006173568, - "w1": { + "accuracy": 0.9675754502807793, + "total_bits": 751543392, + "gate_proj": { "group_size": { "5": 128, "4": 128 @@ -62063,7 +62063,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "5": 128, "4": 128 @@ -62078,7 +62078,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "5": 128, @@ -62098,9 +62098,9 @@ } }, { - "accuracy": 0.9726809794083238, - "total_bits": 3064074240, - "w1": { + "accuracy": 0.9723473833383698, + "total_bits": 766018560, + "gate_proj": { "group_size": { "5": 32, "4": 32 @@ -62115,7 +62115,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "5": 32, "4": 32 @@ -62130,7 +62130,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "5": 32, @@ -62150,9 +62150,9 @@ } }, { - "accuracy": 0.9837010373900595, - "total_bits": 3698758016, - "w1": { + "accuracy": 0.9832064151077679, + "total_bits": 924689504, + "gate_proj": { "group_size": { "6": 128, "5": 128 @@ -62167,7 +62167,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 128, "5": 128 @@ -62182,7 +62182,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 128, @@ -62202,9 +62202,9 @@ } }, { - "accuracy": 0.9866214421528735, - "total_bits": 3756658688, - "w1": { + "accuracy": 0.9860335649236253, + "total_bits": 939164672, + "gate_proj": { "group_size": { "6": 32, "5": 32 @@ -62219,7 +62219,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 32, "5": 32 @@ -62234,7 +62234,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 32, @@ -62254,9 +62254,9 @@ } }, { - "accuracy": 0.9910344576316052, - "total_bits": 4278096256, - "w1": { + "accuracy": 0.9904524395242333, + "total_bits": 1069524064, + "gate_proj": { "group_size": { "6": 128 }, @@ -62268,7 +62268,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "6": 128 }, @@ -62280,7 +62280,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 32, "6": 128 @@ -62297,9 +62297,9 @@ } }, { - "accuracy": 0.9917836127508628, - "total_bits": 4441539584, - "w1": { + "accuracy": 0.9912659053347612, + "total_bits": 1110384896, + "gate_proj": { "group_size": { "8": 128, "6": 128 @@ -62314,7 +62314,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128, "6": 128 @@ -62329,7 +62329,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128, "6": 128 @@ -62346,9 +62346,9 @@ } }, { - "accuracy": 0.9925049688838619, - "total_bits": 4839998464, - "w1": { + "accuracy": 0.9923111496395186, + "total_bits": 1209999616, + "gate_proj": { "group_size": { "8": 128, "6": 128 @@ -62363,7 +62363,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128, "6": 128 @@ -62378,7 +62378,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128 }, @@ -62392,9 +62392,9 @@ } }, { - "accuracy": 0.9974016365350077, - "total_bits": 5662082048, - "w1": { + "accuracy": 0.9970870429380355, + "total_bits": 1415520512, + "gate_proj": { "group_size": { "8": 128 }, @@ -62406,7 +62406,7 @@ ], "scale_bits": 4 }, - "w3": { + "up_proj": { "group_size": { "8": 128 }, @@ -62418,7 +62418,7 @@ ], "scale_bits": 4 }, - "w2": { + "down_proj": { "group_size": { "8": 128 },