clean
results.json CHANGED (+0 -148)
@@ -5785,43 +5785,6 @@
       "accuracy": 0.5723270440251572
     }
   },
-  "data/ifeval_best_of_k/Qwen2.5-7B-RM-09192024.json": {
-    "all": {
-      "accuracy": 0.5726562500000001,
-      "area_under_curve": 0.5536947485942088,
-      "loss": 0.1320898115634918,
-      "mean_max_score": 0.58177734375,
-      "mean_end_score": 0.565234375
-    },
-    "gemma-2-9b-it": {
-      "accuracy": 0.5796875,
-      "area_under_curve": 0.5556161705870118,
-      "loss": 0.11947629928588867,
-      "mean_max_score": 0.62125,
-      "mean_end_score": 0.58953125
-    },
-    "gpt-4o-mini-2024-07-18": {
-      "accuracy": 0.61875,
-      "area_under_curve": 0.5427620686093066,
-      "loss": 0.1205466079711914,
-      "mean_max_score": 0.631328125,
-      "mean_end_score": 0.605390625
-    },
-    "Meta-Llama-3-8B-Instruct": {
-      "accuracy": 0.58125,
-      "area_under_curve": 0.5553819052297072,
-      "loss": 0.14538890838623048,
-      "mean_max_score": 0.60421875,
-      "mean_end_score": 0.55078125
-    },
-    "claude-3-haiku-20240307": {
-      "accuracy": 0.5109375,
-      "area_under_curve": 0.5500825653737205,
-      "loss": 0.15359460830688476,
-      "mean_max_score": 0.54703125,
-      "mean_end_score": 0.515625
-    }
-  },
   "data/ifeval_best_of_k/internlm2-1_8b-reward.json": {
     "all": {
       "accuracy": 0.5386718749999999,
@@ -6189,43 +6152,6 @@
       "mean_end_score": 0.606015625
     }
   },
-  "data/ifeval_best_of_k/Qwen2.5-72B-RM-09242024.json": {
-    "all": {
-      "accuracy": 0.591796875,
-      "area_under_curve": 0.5682828198209156,
-      "loss": 0.11081918239593506,
-      "mean_max_score": 0.61642578125,
-      "mean_end_score": 0.61126953125
-    },
-    "gemma-2-9b-it": {
-      "accuracy": 0.615625,
-      "area_under_curve": 0.5724099728839697,
-      "loss": 0.08845264434814454,
-      "mean_max_score": 0.667265625,
-      "mean_end_score": 0.6484375
-    },
-    "gpt-4o-mini-2024-07-18": {
-      "accuracy": 0.6046875,
-      "area_under_curve": 0.5579399685462639,
-      "loss": 0.1147395133972168,
-      "mean_max_score": 0.644296875,
-      "mean_end_score": 0.6171875
-    },
-    "Meta-Llama-3-8B-Instruct": {
-      "accuracy": 0.6140625,
-      "area_under_curve": 0.576997247648311,
-      "loss": 0.11122642517089844,
-      "mean_max_score": 0.64453125,
-      "mean_end_score": 0.636640625
-    },
-    "claude-3-haiku-20240307": {
-      "accuracy": 0.5328125,
-      "area_under_curve": 0.5592622087343524,
-      "loss": 0.13447074890136718,
-      "mean_max_score": 0.5696875,
-      "mean_end_score": 0.543359375
-    }
-  },
   "data/ifeval_best_of_k/nemotron-4-340b-reward.json": {
     "all": {
       "accuracy": 0.6265624999999999,
@@ -6300,43 +6226,6 @@
       "mean_end_score": 0.578125
     }
   },
-  "data/ifeval_best_of_k/Llama-3.1-8B-Instruct-RM-Test.json": {
-    "all": {
-      "accuracy": 0.5953124999999999,
-      "area_under_curve": 0.5659010925349728,
-      "loss": 0.11261327266693115,
-      "mean_max_score": 0.61439453125,
-      "mean_end_score": 0.60623046875
-    },
-    "gemma-2-9b-it": {
-      "accuracy": 0.6,
-      "area_under_curve": 0.5742011950437376,
-      "loss": 0.08687259674072266,
-      "mean_max_score": 0.67890625,
-      "mean_end_score": 0.6640625
-    },
-    "gpt-4o-mini-2024-07-18": {
-      "accuracy": 0.5984375,
-      "area_under_curve": 0.5628933527191842,
-      "loss": 0.10282745361328124,
-      "mean_max_score": 0.655625,
-      "mean_end_score": 0.644375
-    },
-    "Meta-Llama-3-8B-Instruct": {
-      "accuracy": 0.603125,
-      "area_under_curve": 0.5555893773327166,
-      "loss": 0.12582313537597656,
-      "mean_max_score": 0.618515625,
-      "mean_end_score": 0.578125
-    },
-    "claude-3-haiku-20240307": {
-      "accuracy": 0.5796874999999999,
-      "area_under_curve": 0.5637145211028964,
-      "loss": 0.13854501724243165,
-      "mean_max_score": 0.564296875,
-      "mean_end_score": 0.5390625
-    }
-  },
   "data/ifeval_best_of_k/Starling-RM-7B-alpha.json": {
     "all": {
       "accuracy": 0.5406249999999999,
@@ -6411,43 +6300,6 @@
       "mean_end_score": 0.484375
     }
   },
-  "data/ifeval_best_of_k/Llama-3.1-70B-RM-09172024.json": {
-    "all": {
-      "accuracy": 0.630078125,
-      "area_under_curve": 0.5902905300669057,
-      "loss": 0.09440629482269287,
-      "mean_max_score": 0.64310546875,
-      "mean_end_score": 0.62984375
-    },
-    "gemma-2-9b-it": {
-      "accuracy": 0.6375,
-      "area_under_curve": 0.6064561485832756,
-      "loss": 0.07111602783203125,
-      "mean_max_score": 0.709375,
-      "mean_end_score": 0.6953125
-    },
-    "gpt-4o-mini-2024-07-18": {
-      "accuracy": 0.6359374999999999,
-      "area_under_curve": 0.5804507982664724,
-      "loss": 0.08310569763183594,
-      "mean_max_score": 0.693203125,
-      "mean_end_score": 0.6759375
-    },
-    "Meta-Llama-3-8B-Instruct": {
-      "accuracy": 0.6468750000000001,
-      "area_under_curve": 0.5893750619966321,
-      "loss": 0.10088687896728515,
-      "mean_max_score": 0.653359375,
-      "mean_end_score": 0.6171875
-    },
-    "claude-3-haiku-20240307": {
-      "accuracy": 0.6000000000000001,
-      "area_under_curve": 0.585711467200442,
-      "loss": 0.12550268173217774,
-      "mean_max_score": 0.588984375,
-      "mean_end_score": 0.53125
-    }
-  },
   "data/ifeval_best_of_k/Skywork-Reward-Gemma-2-27B.json": {
     "all": {
       "accuracy": 0.537890625,
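All four removed blocks share the same shape: one entry per reward-model results file, holding an "all" aggregate plus per-judge sub-blocks with accuracy, area_under_curve, loss, mean_max_score, and mean_end_score. A minimal sketch of how a cleanup like this commit could be scripted is shown below; the script itself is hypothetical (not part of this repository), and it assumes the per-file blocks sit directly under the top-level object of results.json, which the diff hunks do not show.

# prune_results.py -- hypothetical cleanup sketch, mirroring this "clean" commit:
# drop selected reward-model entries from results.json.
import json

# Keys removed in the hunks above.
KEYS_TO_DROP = [
    "data/ifeval_best_of_k/Qwen2.5-7B-RM-09192024.json",
    "data/ifeval_best_of_k/Qwen2.5-72B-RM-09242024.json",
    "data/ifeval_best_of_k/Llama-3.1-8B-Instruct-RM-Test.json",
    "data/ifeval_best_of_k/Llama-3.1-70B-RM-09172024.json",
]

with open("results.json", encoding="utf-8") as f:
    results = json.load(f)

for key in KEYS_TO_DROP:
    results.pop(key, None)  # ignore keys that are already absent

with open("results.json", "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2)
    f.write("\n")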