{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9998751404669747, "eval_steps": 1000, "global_step": 4004, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000249719066050693, "grad_norm": 0.34765625, "learning_rate": 1.2468827930174565e-08, "logits/chosen": -2.450503349304199, "logits/rejected": -2.672837734222412, "logps/chosen": -21.34674835205078, "logps/rejected": -42.586097717285156, "loss": 0.5, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.00249719066050693, "grad_norm": 0.2890625, "learning_rate": 1.2468827930174566e-07, "logits/chosen": -2.275761604309082, "logits/rejected": -2.479705333709717, "logps/chosen": -22.14301300048828, "logps/rejected": -63.31869888305664, "loss": 0.5, "rewards/accuracies": 0.5, "rewards/chosen": -0.00027842415147460997, "rewards/margins": -0.00017310140538029373, "rewards/rejected": -0.0001053227242664434, "step": 10 }, { "epoch": 0.00499438132101386, "grad_norm": 0.318359375, "learning_rate": 2.493765586034913e-07, "logits/chosen": -2.2202348709106445, "logits/rejected": -2.429389238357544, "logps/chosen": -21.814502716064453, "logps/rejected": -61.35728073120117, "loss": 0.5, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 8.430716843577102e-05, "rewards/margins": 0.00037039705784991384, "rewards/rejected": -0.00028608986758627, "step": 20 }, { "epoch": 0.007491571981520789, "grad_norm": 0.26171875, "learning_rate": 3.7406483790523695e-07, "logits/chosen": -2.14150333404541, "logits/rejected": -2.3708083629608154, "logps/chosen": -22.1105899810791, "logps/rejected": -52.95900344848633, "loss": 0.5001, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -9.514805424259976e-05, "rewards/margins": -5.593679452431388e-05, "rewards/rejected": -3.9211259718285874e-05, "step": 30 }, { "epoch": 0.00998876264202772, "grad_norm": 0.27734375, "learning_rate": 4.987531172069826e-07, "logits/chosen": -2.1455249786376953, "logits/rejected": -2.362419605255127, "logps/chosen": -22.628782272338867, "logps/rejected": -63.2244873046875, "loss": 0.4998, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.0004498485941439867, "rewards/margins": 0.0016190257156267762, "rewards/rejected": -0.0011691770050674677, "step": 40 }, { "epoch": 0.012485953302534648, "grad_norm": 0.212890625, "learning_rate": 6.234413965087283e-07, "logits/chosen": -2.2349250316619873, "logits/rejected": -2.495819568634033, "logps/chosen": -22.863269805908203, "logps/rejected": -59.4576416015625, "loss": 0.4998, "rewards/accuracies": 0.75, "rewards/chosen": 0.0006220145733095706, "rewards/margins": 0.0015806708252057433, "rewards/rejected": -0.0009586562518961728, "step": 50 }, { "epoch": 0.014983143963041578, "grad_norm": 0.328125, "learning_rate": 7.481296758104739e-07, "logits/chosen": -2.169523239135742, "logits/rejected": -2.3751749992370605, "logps/chosen": -22.777694702148438, "logps/rejected": -68.83964538574219, "loss": 0.4992, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.0019476842135190964, "rewards/margins": 0.004409968852996826, "rewards/rejected": -0.002462285105139017, "step": 60 }, { "epoch": 0.017480334623548508, "grad_norm": 0.28515625, "learning_rate": 8.728179551122195e-07, "logits/chosen": -2.286738634109497, "logits/rejected": -2.4896113872528076, "logps/chosen": -21.078710556030273, "logps/rejected": -50.04187774658203, "loss": 0.4985, "rewards/accuracies": 1.0, "rewards/chosen": 0.004810997284948826, "rewards/margins": 0.007176141254603863, "rewards/rejected": -0.0023651437368243933, "step": 70 }, { "epoch": 0.01997752528405544, "grad_norm": 0.279296875, "learning_rate": 9.975062344139653e-07, "logits/chosen": -2.144176959991455, "logits/rejected": -2.352398633956909, "logps/chosen": -21.391971588134766, "logps/rejected": -56.86810302734375, "loss": 0.4969, "rewards/accuracies": 1.0, "rewards/chosen": 0.010686805471777916, "rewards/margins": 0.01409011147916317, "rewards/rejected": -0.003403306705877185, "step": 80 }, { "epoch": 0.02247471594456237, "grad_norm": 0.34765625, "learning_rate": 1.1221945137157108e-06, "logits/chosen": -2.2526628971099854, "logits/rejected": -2.430774211883545, "logps/chosen": -19.845823287963867, "logps/rejected": -51.37982177734375, "loss": 0.4949, "rewards/accuracies": 1.0, "rewards/chosen": 0.019541995599865913, "rewards/margins": 0.021860197186470032, "rewards/rejected": -0.0023182008881121874, "step": 90 }, { "epoch": 0.024971906605069295, "grad_norm": 0.353515625, "learning_rate": 1.2468827930174565e-06, "logits/chosen": -2.1313042640686035, "logits/rejected": -2.3720927238464355, "logps/chosen": -20.160160064697266, "logps/rejected": -66.42484283447266, "loss": 0.4924, "rewards/accuracies": 1.0, "rewards/chosen": 0.031359124928712845, "rewards/margins": 0.03116660751402378, "rewards/rejected": 0.00019251916091889143, "step": 100 }, { "epoch": 0.027469097265576226, "grad_norm": 0.33203125, "learning_rate": 1.3715710723192023e-06, "logits/chosen": -2.1676554679870605, "logits/rejected": -2.389533758163452, "logps/chosen": -17.833478927612305, "logps/rejected": -60.63257598876953, "loss": 0.4879, "rewards/accuracies": 1.0, "rewards/chosen": 0.04782567545771599, "rewards/margins": 0.05032258480787277, "rewards/rejected": -0.002496910747140646, "step": 110 }, { "epoch": 0.029966287926083156, "grad_norm": 0.68359375, "learning_rate": 1.4962593516209478e-06, "logits/chosen": -2.1279516220092773, "logits/rejected": -2.343705177307129, "logps/chosen": -15.757919311523438, "logps/rejected": -51.14020919799805, "loss": 0.4836, "rewards/accuracies": 1.0, "rewards/chosen": 0.06653784960508347, "rewards/margins": 0.06694493442773819, "rewards/rejected": -0.0004070843569934368, "step": 120 }, { "epoch": 0.032463478586590086, "grad_norm": 0.337890625, "learning_rate": 1.6209476309226935e-06, "logits/chosen": -2.3082690238952637, "logits/rejected": -2.5344271659851074, "logps/chosen": -12.95374870300293, "logps/rejected": -53.89298629760742, "loss": 0.4766, "rewards/accuracies": 1.0, "rewards/chosen": 0.09254685044288635, "rewards/margins": 0.09660454094409943, "rewards/rejected": -0.0040576886385679245, "step": 130 }, { "epoch": 0.034960669247097016, "grad_norm": 0.291015625, "learning_rate": 1.745635910224439e-06, "logits/chosen": -2.1814446449279785, "logits/rejected": -2.40262508392334, "logps/chosen": -11.56260871887207, "logps/rejected": -71.49890899658203, "loss": 0.4714, "rewards/accuracies": 1.0, "rewards/chosen": 0.11319079250097275, "rewards/margins": 0.11961270868778229, "rewards/rejected": -0.006421914789825678, "step": 140 }, { "epoch": 0.037457859907603946, "grad_norm": 0.28125, "learning_rate": 1.8703241895261848e-06, "logits/chosen": -2.2549407482147217, "logits/rejected": -2.4583637714385986, "logps/chosen": -8.707418441772461, "logps/rejected": -56.646148681640625, "loss": 0.4655, "rewards/accuracies": 1.0, "rewards/chosen": 0.13278979063034058, "rewards/margins": 0.14516989886760712, "rewards/rejected": -0.012380105443298817, "step": 150 }, { "epoch": 0.03995505056811088, "grad_norm": 0.26171875, "learning_rate": 1.9950124688279305e-06, "logits/chosen": -2.261176586151123, "logits/rejected": -2.454853057861328, "logps/chosen": -7.25634765625, "logps/rejected": -62.16912841796875, "loss": 0.4591, "rewards/accuracies": 1.0, "rewards/chosen": 0.14732162654399872, "rewards/margins": 0.1813906729221344, "rewards/rejected": -0.034069035202264786, "step": 160 }, { "epoch": 0.04245224122861781, "grad_norm": 0.228515625, "learning_rate": 2.119700748129676e-06, "logits/chosen": -2.413883686065674, "logits/rejected": -2.6421730518341064, "logps/chosen": -5.5545244216918945, "logps/rejected": -54.24146270751953, "loss": 0.4528, "rewards/accuracies": 1.0, "rewards/chosen": 0.16115576028823853, "rewards/margins": 0.21780212223529816, "rewards/rejected": -0.05664635822176933, "step": 170 }, { "epoch": 0.04494943188912474, "grad_norm": 0.310546875, "learning_rate": 2.2443890274314216e-06, "logits/chosen": -2.123264789581299, "logits/rejected": -2.3629353046417236, "logps/chosen": -5.675574779510498, "logps/rejected": -81.35579681396484, "loss": 0.448, "rewards/accuracies": 1.0, "rewards/chosen": 0.16601073741912842, "rewards/margins": 0.2519921362400055, "rewards/rejected": -0.08598136156797409, "step": 180 }, { "epoch": 0.04744662254963167, "grad_norm": 0.2734375, "learning_rate": 2.3690773067331675e-06, "logits/chosen": -2.162355899810791, "logits/rejected": -2.4037208557128906, "logps/chosen": -4.741239547729492, "logps/rejected": -69.67314147949219, "loss": 0.4382, "rewards/accuracies": 1.0, "rewards/chosen": 0.17391221225261688, "rewards/margins": 0.32386231422424316, "rewards/rejected": -0.14995010197162628, "step": 190 }, { "epoch": 0.04994381321013859, "grad_norm": 0.326171875, "learning_rate": 2.493765586034913e-06, "logits/chosen": -2.232464551925659, "logits/rejected": -2.461862087249756, "logps/chosen": -4.306845664978027, "logps/rejected": -70.49752807617188, "loss": 0.429, "rewards/accuracies": 1.0, "rewards/chosen": 0.18084710836410522, "rewards/margins": 0.39341551065444946, "rewards/rejected": -0.21256835758686066, "step": 200 }, { "epoch": 0.05244100387064552, "grad_norm": 0.28515625, "learning_rate": 2.6184538653366586e-06, "logits/chosen": -2.2186341285705566, "logits/rejected": -2.4293782711029053, "logps/chosen": -2.813771963119507, "logps/rejected": -77.77786254882812, "loss": 0.4186, "rewards/accuracies": 1.0, "rewards/chosen": 0.18635782599449158, "rewards/margins": 0.4745180010795593, "rewards/rejected": -0.28816017508506775, "step": 210 }, { "epoch": 0.05493819453115245, "grad_norm": 0.279296875, "learning_rate": 2.7431421446384045e-06, "logits/chosen": -2.2114510536193848, "logits/rejected": -2.423021078109741, "logps/chosen": -2.7164266109466553, "logps/rejected": -93.01399230957031, "loss": 0.4086, "rewards/accuracies": 1.0, "rewards/chosen": 0.19216802716255188, "rewards/margins": 0.5540723204612732, "rewards/rejected": -0.3619043231010437, "step": 220 }, { "epoch": 0.05743538519165938, "grad_norm": 0.30859375, "learning_rate": 2.86783042394015e-06, "logits/chosen": -2.2182869911193848, "logits/rejected": -2.4157519340515137, "logps/chosen": -2.1753125190734863, "logps/rejected": -96.47676086425781, "loss": 0.3976, "rewards/accuracies": 1.0, "rewards/chosen": 0.19363494217395782, "rewards/margins": 0.6491508483886719, "rewards/rejected": -0.45551595091819763, "step": 230 }, { "epoch": 0.05993257585216631, "grad_norm": 0.41796875, "learning_rate": 2.9925187032418956e-06, "logits/chosen": -2.303800344467163, "logits/rejected": -2.5223240852355957, "logps/chosen": -2.2545647621154785, "logps/rejected": -115.70625305175781, "loss": 0.3757, "rewards/accuracies": 1.0, "rewards/chosen": 0.19811145961284637, "rewards/margins": 0.8461275100708008, "rewards/rejected": -0.6480159759521484, "step": 240 }, { "epoch": 0.06242976651267324, "grad_norm": 0.408203125, "learning_rate": 3.117206982543641e-06, "logits/chosen": -2.2053685188293457, "logits/rejected": -2.415367841720581, "logps/chosen": -2.1990444660186768, "logps/rejected": -140.34054565429688, "loss": 0.3542, "rewards/accuracies": 1.0, "rewards/chosen": 0.19701093435287476, "rewards/margins": 1.0766099691390991, "rewards/rejected": -0.8795989751815796, "step": 250 }, { "epoch": 0.06492695717318017, "grad_norm": 0.45703125, "learning_rate": 3.241895261845387e-06, "logits/chosen": -2.224290132522583, "logits/rejected": -2.4391043186187744, "logps/chosen": -1.894426941871643, "logps/rejected": -191.0155029296875, "loss": 0.3217, "rewards/accuracies": 1.0, "rewards/chosen": 0.19716738164424896, "rewards/margins": 1.5208184719085693, "rewards/rejected": -1.3236511945724487, "step": 260 }, { "epoch": 0.0674241478336871, "grad_norm": 0.353515625, "learning_rate": 3.3665835411471326e-06, "logits/chosen": -2.1803958415985107, "logits/rejected": -2.3852007389068604, "logps/chosen": -2.2776474952697754, "logps/rejected": -256.2982177734375, "loss": 0.2905, "rewards/accuracies": 1.0, "rewards/chosen": 0.19975684583187103, "rewards/margins": 2.1906659603118896, "rewards/rejected": -1.990909218788147, "step": 270 }, { "epoch": 0.06992133849419403, "grad_norm": 0.1494140625, "learning_rate": 3.491271820448878e-06, "logits/chosen": -2.089259624481201, "logits/rejected": -2.2738101482391357, "logps/chosen": -3.7932281494140625, "logps/rejected": -315.3883361816406, "loss": 0.2858, "rewards/accuracies": 1.0, "rewards/chosen": 0.19718244671821594, "rewards/margins": 2.653756856918335, "rewards/rejected": -2.4565746784210205, "step": 280 }, { "epoch": 0.07241852915470096, "grad_norm": 0.138671875, "learning_rate": 3.615960099750624e-06, "logits/chosen": -2.136627674102783, "logits/rejected": -2.336648941040039, "logps/chosen": -2.27809476852417, "logps/rejected": -309.0271911621094, "loss": 0.281, "rewards/accuracies": 1.0, "rewards/chosen": 0.19811423122882843, "rewards/margins": 2.7192320823669434, "rewards/rejected": -2.521117687225342, "step": 290 }, { "epoch": 0.07491571981520789, "grad_norm": 0.1826171875, "learning_rate": 3.7406483790523696e-06, "logits/chosen": -2.1298162937164307, "logits/rejected": -2.3403031826019287, "logps/chosen": -2.7181735038757324, "logps/rejected": -379.2640075683594, "loss": 0.2649, "rewards/accuracies": 1.0, "rewards/chosen": 0.1967845857143402, "rewards/margins": 3.449932813644409, "rewards/rejected": -3.253148317337036, "step": 300 }, { "epoch": 0.07741291047571482, "grad_norm": 0.189453125, "learning_rate": 3.8653366583541155e-06, "logits/chosen": -2.0690829753875732, "logits/rejected": -2.240788459777832, "logps/chosen": -2.222135066986084, "logps/rejected": -404.05157470703125, "loss": 0.2741, "rewards/accuracies": 1.0, "rewards/chosen": 0.19702208042144775, "rewards/margins": 3.7276394367218018, "rewards/rejected": -3.5306174755096436, "step": 310 }, { "epoch": 0.07991010113622175, "grad_norm": 0.16796875, "learning_rate": 3.990024937655861e-06, "logits/chosen": -2.0671050548553467, "logits/rejected": -2.24275279045105, "logps/chosen": -2.2376856803894043, "logps/rejected": -507.495849609375, "loss": 0.2612, "rewards/accuracies": 1.0, "rewards/chosen": 0.1967649608850479, "rewards/margins": 4.618912696838379, "rewards/rejected": -4.422147750854492, "step": 320 }, { "epoch": 0.08240729179672868, "grad_norm": 0.1611328125, "learning_rate": 4.114713216957607e-06, "logits/chosen": -2.137000560760498, "logits/rejected": -2.287095546722412, "logps/chosen": -2.6727747917175293, "logps/rejected": -397.1515808105469, "loss": 0.2652, "rewards/accuracies": 1.0, "rewards/chosen": 0.1976507008075714, "rewards/margins": 3.678623914718628, "rewards/rejected": -3.480973482131958, "step": 330 }, { "epoch": 0.08490448245723561, "grad_norm": 0.21484375, "learning_rate": 4.239401496259352e-06, "logits/chosen": -2.0656325817108154, "logits/rejected": -2.2314834594726562, "logps/chosen": -2.123012065887451, "logps/rejected": -494.6885681152344, "loss": 0.2573, "rewards/accuracies": 1.0, "rewards/chosen": 0.19615662097930908, "rewards/margins": 4.528371810913086, "rewards/rejected": -4.332215309143066, "step": 340 }, { "epoch": 0.08740167311774254, "grad_norm": 0.1259765625, "learning_rate": 4.364089775561098e-06, "logits/chosen": -2.1637234687805176, "logits/rejected": -2.3083388805389404, "logps/chosen": -2.9447762966156006, "logps/rejected": -453.163330078125, "loss": 0.264, "rewards/accuracies": 1.0, "rewards/chosen": 0.18918542563915253, "rewards/margins": 4.223211288452148, "rewards/rejected": -4.034026145935059, "step": 350 }, { "epoch": 0.08989886377824947, "grad_norm": 0.376953125, "learning_rate": 4.488778054862843e-06, "logits/chosen": -2.1501951217651367, "logits/rejected": -2.341325521469116, "logps/chosen": -4.00003719329834, "logps/rejected": -510.6114196777344, "loss": 0.2492, "rewards/accuracies": 1.0, "rewards/chosen": 0.18887588381767273, "rewards/margins": 4.7241339683532715, "rewards/rejected": -4.5352582931518555, "step": 360 }, { "epoch": 0.0923960544387564, "grad_norm": 0.228515625, "learning_rate": 4.6134663341645895e-06, "logits/chosen": -2.152017593383789, "logits/rejected": -2.326498508453369, "logps/chosen": -3.2789077758789062, "logps/rejected": -488.865966796875, "loss": 0.2472, "rewards/accuracies": 1.0, "rewards/chosen": 0.19166019558906555, "rewards/margins": 4.529562473297119, "rewards/rejected": -4.337902069091797, "step": 370 }, { "epoch": 0.09489324509926333, "grad_norm": 0.546875, "learning_rate": 4.738154613466335e-06, "logits/chosen": -2.0966598987579346, "logits/rejected": -2.3076987266540527, "logps/chosen": -3.7783362865448, "logps/rejected": -743.3594970703125, "loss": 0.2398, "rewards/accuracies": 1.0, "rewards/chosen": 0.18687334656715393, "rewards/margins": 6.980570316314697, "rewards/rejected": -6.793696403503418, "step": 380 }, { "epoch": 0.09739043575977026, "grad_norm": 0.1982421875, "learning_rate": 4.862842892768081e-06, "logits/chosen": -2.1418652534484863, "logits/rejected": -2.30336332321167, "logps/chosen": -2.9560298919677734, "logps/rejected": -607.9320068359375, "loss": 0.2388, "rewards/accuracies": 1.0, "rewards/chosen": 0.18996365368366241, "rewards/margins": 5.754693031311035, "rewards/rejected": -5.564728736877441, "step": 390 }, { "epoch": 0.09988762642027718, "grad_norm": 0.1630859375, "learning_rate": 4.987531172069826e-06, "logits/chosen": -2.0703442096710205, "logits/rejected": -2.2270889282226562, "logps/chosen": -2.578680992126465, "logps/rejected": -683.11083984375, "loss": 0.2415, "rewards/accuracies": 1.0, "rewards/chosen": 0.19460181891918182, "rewards/margins": 6.500932216644287, "rewards/rejected": -6.30633020401001, "step": 400 }, { "epoch": 0.10238481708078412, "grad_norm": 0.2021484375, "learning_rate": 4.999923022460671e-06, "logits/chosen": -2.0380523204803467, "logits/rejected": -2.2315127849578857, "logps/chosen": -4.086075782775879, "logps/rejected": -833.37255859375, "loss": 0.2328, "rewards/accuracies": 1.0, "rewards/chosen": 0.19286975264549255, "rewards/margins": 7.908270835876465, "rewards/rejected": -7.715400695800781, "step": 410 }, { "epoch": 0.10488200774129104, "grad_norm": 0.248046875, "learning_rate": 4.999656933348981e-06, "logits/chosen": -2.244335174560547, "logits/rejected": -2.4024062156677246, "logps/chosen": -2.923116445541382, "logps/rejected": -593.464599609375, "loss": 0.241, "rewards/accuracies": 1.0, "rewards/chosen": 0.19478723406791687, "rewards/margins": 5.600838661193848, "rewards/rejected": -5.4060516357421875, "step": 420 }, { "epoch": 0.10737919840179798, "grad_norm": 0.359375, "learning_rate": 4.99920080255011e-06, "logits/chosen": -2.077357769012451, "logits/rejected": -2.282799243927002, "logps/chosen": -2.9383771419525146, "logps/rejected": -852.4064331054688, "loss": 0.231, "rewards/accuracies": 1.0, "rewards/chosen": 0.19819210469722748, "rewards/margins": 8.028984069824219, "rewards/rejected": -7.830792427062988, "step": 430 }, { "epoch": 0.1098763890623049, "grad_norm": 0.171875, "learning_rate": 4.998554664742362e-06, "logits/chosen": -2.148183822631836, "logits/rejected": -2.3020401000976562, "logps/chosen": -1.9824367761611938, "logps/rejected": -745.6473999023438, "loss": 0.2322, "rewards/accuracies": 1.0, "rewards/chosen": 0.1946963667869568, "rewards/margins": 7.165565490722656, "rewards/rejected": -6.9708685874938965, "step": 440 }, { "epoch": 0.11237357972281184, "grad_norm": 0.13671875, "learning_rate": 4.997718569049726e-06, "logits/chosen": -2.094149351119995, "logits/rejected": -2.2727301120758057, "logps/chosen": -3.559483051300049, "logps/rejected": -817.2952270507812, "loss": 0.2319, "rewards/accuracies": 1.0, "rewards/chosen": 0.19730597734451294, "rewards/margins": 7.785311222076416, "rewards/rejected": -7.588005065917969, "step": 450 }, { "epoch": 0.11487077038331876, "grad_norm": 0.1171875, "learning_rate": 4.9966925790381404e-06, "logits/chosen": -2.1491434574127197, "logits/rejected": -2.301217555999756, "logps/chosen": -1.5461114645004272, "logps/rejected": -810.7796020507812, "loss": 0.2326, "rewards/accuracies": 1.0, "rewards/chosen": 0.19482001662254333, "rewards/margins": 7.743639945983887, "rewards/rejected": -7.548819541931152, "step": 460 }, { "epoch": 0.1173679610438257, "grad_norm": 0.1435546875, "learning_rate": 4.995476772710657e-06, "logits/chosen": -2.1041364669799805, "logits/rejected": -2.3101038932800293, "logps/chosen": -3.1227645874023438, "logps/rejected": -963.2913208007812, "loss": 0.2321, "rewards/accuracies": 1.0, "rewards/chosen": 0.1952921450138092, "rewards/margins": 9.208600044250488, "rewards/rejected": -9.013307571411133, "step": 470 }, { "epoch": 0.11986515170433262, "grad_norm": 0.08544921875, "learning_rate": 4.994071242501516e-06, "logits/chosen": -2.1944689750671387, "logits/rejected": -2.371983051300049, "logps/chosen": -2.822134494781494, "logps/rejected": -869.8029174804688, "loss": 0.2298, "rewards/accuracies": 1.0, "rewards/chosen": 0.19522327184677124, "rewards/margins": 8.3977632522583, "rewards/rejected": -8.20253849029541, "step": 480 }, { "epoch": 0.12236234236483956, "grad_norm": 0.0634765625, "learning_rate": 4.992476095269112e-06, "logits/chosen": -2.2050843238830566, "logits/rejected": -2.3897545337677, "logps/chosen": -1.4868861436843872, "logps/rejected": -922.6173095703125, "loss": 0.2305, "rewards/accuracies": 1.0, "rewards/chosen": 0.2005070000886917, "rewards/margins": 8.763871192932129, "rewards/rejected": -8.563364028930664, "step": 490 }, { "epoch": 0.12485953302534648, "grad_norm": 0.369140625, "learning_rate": 4.990691452287877e-06, "logits/chosen": -2.042813777923584, "logits/rejected": -2.213289976119995, "logps/chosen": -2.393306016921997, "logps/rejected": -886.4241943359375, "loss": 0.2303, "rewards/accuracies": 1.0, "rewards/chosen": 0.20110133290290833, "rewards/margins": 8.474591255187988, "rewards/rejected": -8.273489952087402, "step": 500 }, { "epoch": 0.1273567236858534, "grad_norm": 0.3984375, "learning_rate": 4.988717449239056e-06, "logits/chosen": -2.093723773956299, "logits/rejected": -2.2634453773498535, "logps/chosen": -1.9311176538467407, "logps/rejected": -851.02734375, "loss": 0.2347, "rewards/accuracies": 1.0, "rewards/chosen": 0.19648316502571106, "rewards/margins": 8.190296173095703, "rewards/rejected": -7.993813991546631, "step": 510 }, { "epoch": 0.12985391434636034, "grad_norm": 0.1630859375, "learning_rate": 4.98655423620039e-06, "logits/chosen": -2.1161797046661377, "logits/rejected": -2.3049392700195312, "logps/chosen": -1.9681230783462524, "logps/rejected": -963.2742919921875, "loss": 0.2275, "rewards/accuracies": 1.0, "rewards/chosen": 0.20181334018707275, "rewards/margins": 9.243757247924805, "rewards/rejected": -9.041942596435547, "step": 520 }, { "epoch": 0.13235110500686728, "grad_norm": 0.07373046875, "learning_rate": 4.984201977634711e-06, "logits/chosen": -2.223388195037842, "logits/rejected": -2.4297728538513184, "logps/chosen": -2.4097044467926025, "logps/rejected": -1106.8994140625, "loss": 0.2278, "rewards/accuracies": 1.0, "rewards/chosen": 0.2010197937488556, "rewards/margins": 10.71354866027832, "rewards/rejected": -10.512530326843262, "step": 530 }, { "epoch": 0.1348482956673742, "grad_norm": 0.11279296875, "learning_rate": 4.9816608523774345e-06, "logits/chosen": -2.119506359100342, "logits/rejected": -2.305849552154541, "logps/chosen": -2.257546901702881, "logps/rejected": -930.5267333984375, "loss": 0.2306, "rewards/accuracies": 1.0, "rewards/chosen": 0.19777485728263855, "rewards/margins": 8.905478477478027, "rewards/rejected": -8.707704544067383, "step": 540 }, { "epoch": 0.13734548632788113, "grad_norm": 0.07470703125, "learning_rate": 4.978931053622964e-06, "logits/chosen": -2.1544103622436523, "logits/rejected": -2.354814052581787, "logps/chosen": -1.3565616607666016, "logps/rejected": -950.23681640625, "loss": 0.2307, "rewards/accuracies": 1.0, "rewards/chosen": 0.2015368640422821, "rewards/margins": 9.141637802124023, "rewards/rejected": -8.940099716186523, "step": 550 }, { "epoch": 0.13984267698838806, "grad_norm": 0.035400390625, "learning_rate": 4.9760127889100044e-06, "logits/chosen": -2.1699581146240234, "logits/rejected": -2.3422303199768066, "logps/chosen": -1.4560916423797607, "logps/rejected": -1047.3670654296875, "loss": 0.23, "rewards/accuracies": 1.0, "rewards/chosen": 0.19995173811912537, "rewards/margins": 10.138322830200195, "rewards/rejected": -9.938371658325195, "step": 560 }, { "epoch": 0.142339867648895, "grad_norm": 0.11328125, "learning_rate": 4.972906280105781e-06, "logits/chosen": -2.0392138957977295, "logits/rejected": -2.2401204109191895, "logps/chosen": -2.1844277381896973, "logps/rejected": -998.3021240234375, "loss": 0.2281, "rewards/accuracies": 1.0, "rewards/chosen": 0.20644374191761017, "rewards/margins": 9.561029434204102, "rewards/rejected": -9.354585647583008, "step": 570 }, { "epoch": 0.1448370583094019, "grad_norm": 0.0625, "learning_rate": 4.969611763389175e-06, "logits/chosen": -2.2010245323181152, "logits/rejected": -2.3933498859405518, "logps/chosen": -2.1393237113952637, "logps/rejected": -925.5234375, "loss": 0.2289, "rewards/accuracies": 1.0, "rewards/chosen": 0.1972513645887375, "rewards/margins": 8.949918746948242, "rewards/rejected": -8.752666473388672, "step": 580 }, { "epoch": 0.14733424896990885, "grad_norm": 0.0751953125, "learning_rate": 4.966129489232762e-06, "logits/chosen": -2.1333353519439697, "logits/rejected": -2.3556675910949707, "logps/chosen": -2.2460904121398926, "logps/rejected": -1139.327392578125, "loss": 0.226, "rewards/accuracies": 1.0, "rewards/chosen": 0.20897097885608673, "rewards/margins": 10.926295280456543, "rewards/rejected": -10.717325210571289, "step": 590 }, { "epoch": 0.14983143963041579, "grad_norm": 0.12890625, "learning_rate": 4.962459722383775e-06, "logits/chosen": -2.095088243484497, "logits/rejected": -2.2931671142578125, "logps/chosen": -2.7135472297668457, "logps/rejected": -1181.6075439453125, "loss": 0.2292, "rewards/accuracies": 1.0, "rewards/chosen": 0.20306305587291718, "rewards/margins": 11.351381301879883, "rewards/rejected": -11.148316383361816, "step": 600 }, { "epoch": 0.15232863029092272, "grad_norm": 0.0303955078125, "learning_rate": 4.958602741843975e-06, "logits/chosen": -2.0957350730895996, "logits/rejected": -2.3226089477539062, "logps/chosen": -2.8655078411102295, "logps/rejected": -1118.1968994140625, "loss": 0.2277, "rewards/accuracies": 1.0, "rewards/chosen": 0.19633761048316956, "rewards/margins": 10.744343757629395, "rewards/rejected": -10.548004150390625, "step": 610 }, { "epoch": 0.15482582095142963, "grad_norm": 0.107421875, "learning_rate": 4.954558840848437e-06, "logits/chosen": -2.211951494216919, "logits/rejected": -2.3932459354400635, "logps/chosen": -1.5332846641540527, "logps/rejected": -932.4984130859375, "loss": 0.2285, "rewards/accuracies": 1.0, "rewards/chosen": 0.20827274024486542, "rewards/margins": 9.049389839172363, "rewards/rejected": -8.841116905212402, "step": 620 }, { "epoch": 0.15732301161193657, "grad_norm": 0.052978515625, "learning_rate": 4.950328326843258e-06, "logits/chosen": -2.073488712310791, "logits/rejected": -2.2822651863098145, "logps/chosen": -0.9946017265319824, "logps/rejected": -1086.56689453125, "loss": 0.2291, "rewards/accuracies": 1.0, "rewards/chosen": 0.20773670077323914, "rewards/margins": 10.345720291137695, "rewards/rejected": -10.137983322143555, "step": 630 }, { "epoch": 0.1598202022724435, "grad_norm": 0.08203125, "learning_rate": 4.945911521462182e-06, "logits/chosen": -2.2225770950317383, "logits/rejected": -2.412863254547119, "logps/chosen": -1.7764488458633423, "logps/rejected": -1141.6427001953125, "loss": 0.2286, "rewards/accuracies": 1.0, "rewards/chosen": 0.20446841418743134, "rewards/margins": 11.000048637390137, "rewards/rejected": -10.79557991027832, "step": 640 }, { "epoch": 0.16231739293295044, "grad_norm": 0.16796875, "learning_rate": 4.941308760502149e-06, "logits/chosen": -2.211944341659546, "logits/rejected": -2.371511697769165, "logps/chosen": -2.542166233062744, "logps/rejected": -972.3176879882812, "loss": 0.2319, "rewards/accuracies": 1.0, "rewards/chosen": 0.200990229845047, "rewards/margins": 9.126736640930176, "rewards/rejected": -8.925745964050293, "step": 650 }, { "epoch": 0.16481458359345735, "grad_norm": 0.134765625, "learning_rate": 4.936520393897762e-06, "logits/chosen": -2.1870148181915283, "logits/rejected": -2.4076366424560547, "logps/chosen": -2.055567979812622, "logps/rejected": -1019.7349853515625, "loss": 0.2287, "rewards/accuracies": 1.0, "rewards/chosen": 0.21482977271080017, "rewards/margins": 9.75967788696289, "rewards/rejected": -9.54484748840332, "step": 660 }, { "epoch": 0.1673117742539643, "grad_norm": 0.042724609375, "learning_rate": 4.931546785694684e-06, "logits/chosen": -2.207019090652466, "logits/rejected": -2.411149740219116, "logps/chosen": -1.447061538696289, "logps/rejected": -1274.262451171875, "loss": 0.2264, "rewards/accuracies": 1.0, "rewards/chosen": 0.2061387598514557, "rewards/margins": 12.430362701416016, "rewards/rejected": -12.224225044250488, "step": 670 }, { "epoch": 0.16980896491447123, "grad_norm": 0.031982421875, "learning_rate": 4.926388314021964e-06, "logits/chosen": -2.245506763458252, "logits/rejected": -2.439272403717041, "logps/chosen": -1.3953222036361694, "logps/rejected": -1066.398193359375, "loss": 0.2262, "rewards/accuracies": 1.0, "rewards/chosen": 0.210698202252388, "rewards/margins": 10.378253936767578, "rewards/rejected": -10.167555809020996, "step": 680 }, { "epoch": 0.17230615557497814, "grad_norm": 0.049560546875, "learning_rate": 4.921045371063283e-06, "logits/chosen": -2.235975980758667, "logits/rejected": -2.42988920211792, "logps/chosen": -0.8631747961044312, "logps/rejected": -1208.173095703125, "loss": 0.2262, "rewards/accuracies": 1.0, "rewards/chosen": 0.2110184133052826, "rewards/margins": 11.736184120178223, "rewards/rejected": -11.525165557861328, "step": 690 }, { "epoch": 0.17480334623548507, "grad_norm": 0.049072265625, "learning_rate": 4.915518363027142e-06, "logits/chosen": -2.29992938041687, "logits/rejected": -2.4797685146331787, "logps/chosen": -0.5947138667106628, "logps/rejected": -1052.22216796875, "loss": 0.2272, "rewards/accuracies": 1.0, "rewards/chosen": 0.2057635486125946, "rewards/margins": 10.251365661621094, "rewards/rejected": -10.045602798461914, "step": 700 }, { "epoch": 0.177300536895992, "grad_norm": 0.0625, "learning_rate": 4.909807710115977e-06, "logits/chosen": -2.0681312084198, "logits/rejected": -2.245760202407837, "logps/chosen": -1.667133092880249, "logps/rejected": -1234.741943359375, "loss": 0.2287, "rewards/accuracies": 1.0, "rewards/chosen": 0.19957289099693298, "rewards/margins": 12.013802528381348, "rewards/rejected": -11.814229011535645, "step": 710 }, { "epoch": 0.17979772755649895, "grad_norm": 0.064453125, "learning_rate": 4.903913846494211e-06, "logits/chosen": -2.0854830741882324, "logits/rejected": -2.318626880645752, "logps/chosen": -1.4859822988510132, "logps/rejected": -1401.390625, "loss": 0.2247, "rewards/accuracies": 1.0, "rewards/chosen": 0.21390756964683533, "rewards/margins": 13.512557983398438, "rewards/rejected": -13.298650741577148, "step": 720 }, { "epoch": 0.18229491821700586, "grad_norm": 0.049072265625, "learning_rate": 4.897837220255251e-06, "logits/chosen": -2.105733633041382, "logits/rejected": -2.273578643798828, "logps/chosen": -1.5127496719360352, "logps/rejected": -1189.6934814453125, "loss": 0.2282, "rewards/accuracies": 1.0, "rewards/chosen": 0.21006134152412415, "rewards/margins": 11.488363265991211, "rewards/rejected": -11.278302192687988, "step": 730 }, { "epoch": 0.1847921088775128, "grad_norm": 0.06982421875, "learning_rate": 4.891578293387413e-06, "logits/chosen": -2.1760973930358887, "logits/rejected": -2.3570103645324707, "logps/chosen": -1.769789695739746, "logps/rejected": -1201.271240234375, "loss": 0.2279, "rewards/accuracies": 1.0, "rewards/chosen": 0.20501787960529327, "rewards/margins": 11.696678161621094, "rewards/rejected": -11.491661071777344, "step": 740 }, { "epoch": 0.18728929953801973, "grad_norm": 0.031982421875, "learning_rate": 4.885137541738808e-06, "logits/chosen": -2.141007423400879, "logits/rejected": -2.313952922821045, "logps/chosen": -0.702928900718689, "logps/rejected": -1086.88330078125, "loss": 0.227, "rewards/accuracies": 1.0, "rewards/chosen": 0.20619484782218933, "rewards/margins": 10.44408893585205, "rewards/rejected": -10.237894058227539, "step": 750 }, { "epoch": 0.18978649019852667, "grad_norm": 0.09033203125, "learning_rate": 4.878515454981153e-06, "logits/chosen": -2.0163445472717285, "logits/rejected": -2.219290256500244, "logps/chosen": -1.4322102069854736, "logps/rejected": -1299.561767578125, "loss": 0.2251, "rewards/accuracies": 1.0, "rewards/chosen": 0.20874173939228058, "rewards/margins": 12.519464492797852, "rewards/rejected": -12.310722351074219, "step": 760 }, { "epoch": 0.19228368085903358, "grad_norm": 0.0654296875, "learning_rate": 4.8717125365725545e-06, "logits/chosen": -2.2308189868927, "logits/rejected": -2.3827383518218994, "logps/chosen": -1.321045160293579, "logps/rejected": -954.9481201171875, "loss": 0.2298, "rewards/accuracies": 1.0, "rewards/chosen": 0.2083440124988556, "rewards/margins": 9.28177547454834, "rewards/rejected": -9.073431015014648, "step": 770 }, { "epoch": 0.19478087151954052, "grad_norm": 0.05029296875, "learning_rate": 4.864729303719221e-06, "logits/chosen": -2.1831257343292236, "logits/rejected": -2.386863946914673, "logps/chosen": -1.462869644165039, "logps/rejected": -1309.128662109375, "loss": 0.2249, "rewards/accuracies": 1.0, "rewards/chosen": 0.21388690173625946, "rewards/margins": 12.65107250213623, "rewards/rejected": -12.437185287475586, "step": 780 }, { "epoch": 0.19727806218004745, "grad_norm": 0.1259765625, "learning_rate": 4.857566287336152e-06, "logits/chosen": -2.125136375427246, "logits/rejected": -2.3306586742401123, "logps/chosen": -1.5712594985961914, "logps/rejected": -1211.277587890625, "loss": 0.2289, "rewards/accuracies": 1.0, "rewards/chosen": 0.21211902797222137, "rewards/margins": 11.6867094039917, "rewards/rejected": -11.474590301513672, "step": 790 }, { "epoch": 0.19977525284055436, "grad_norm": 0.11376953125, "learning_rate": 4.850224032006765e-06, "logits/chosen": -2.226292610168457, "logits/rejected": -2.4260332584381104, "logps/chosen": -1.096842885017395, "logps/rejected": -1190.5208740234375, "loss": 0.2266, "rewards/accuracies": 1.0, "rewards/chosen": 0.21329161524772644, "rewards/margins": 11.604973793029785, "rewards/rejected": -11.391681671142578, "step": 800 }, { "epoch": 0.2022724435010613, "grad_norm": 0.080078125, "learning_rate": 4.8427030959414984e-06, "logits/chosen": -2.0340332984924316, "logits/rejected": -2.239582061767578, "logps/chosen": -1.4298118352890015, "logps/rejected": -1246.587158203125, "loss": 0.2286, "rewards/accuracies": 1.0, "rewards/chosen": 0.203706294298172, "rewards/margins": 12.136808395385742, "rewards/rejected": -11.933099746704102, "step": 810 }, { "epoch": 0.20476963416156824, "grad_norm": 0.0400390625, "learning_rate": 4.835004050935369e-06, "logits/chosen": -2.142270803451538, "logits/rejected": -2.3261685371398926, "logps/chosen": -2.205761432647705, "logps/rejected": -1209.187744140625, "loss": 0.2294, "rewards/accuracies": 1.0, "rewards/chosen": 0.21324896812438965, "rewards/margins": 11.711974143981934, "rewards/rejected": -11.498725891113281, "step": 820 }, { "epoch": 0.20726682482207517, "grad_norm": 0.0257568359375, "learning_rate": 4.8271274823245e-06, "logits/chosen": -2.130068778991699, "logits/rejected": -2.303924083709717, "logps/chosen": -1.5450295209884644, "logps/rejected": -1218.6636962890625, "loss": 0.2285, "rewards/accuracies": 1.0, "rewards/chosen": 0.2086503505706787, "rewards/margins": 11.845584869384766, "rewards/rejected": -11.636935234069824, "step": 830 }, { "epoch": 0.20976401548258208, "grad_norm": 0.0791015625, "learning_rate": 4.8190739889416264e-06, "logits/chosen": -2.1227643489837646, "logits/rejected": -2.3156332969665527, "logps/chosen": -1.4759693145751953, "logps/rejected": -1314.2388916015625, "loss": 0.226, "rewards/accuracies": 1.0, "rewards/chosen": 0.21140392124652863, "rewards/margins": 12.794939994812012, "rewards/rejected": -12.583536148071289, "step": 840 }, { "epoch": 0.21226120614308902, "grad_norm": 0.0264892578125, "learning_rate": 4.810844183070553e-06, "logits/chosen": -2.2195773124694824, "logits/rejected": -2.416642665863037, "logps/chosen": -1.3944060802459717, "logps/rejected": -1100.637939453125, "loss": 0.2267, "rewards/accuracies": 1.0, "rewards/chosen": 0.20567412674427032, "rewards/margins": 10.635955810546875, "rewards/rejected": -10.430280685424805, "step": 850 }, { "epoch": 0.21475839680359596, "grad_norm": 0.05712890625, "learning_rate": 4.802438690399622e-06, "logits/chosen": -2.170403480529785, "logits/rejected": -2.3731253147125244, "logps/chosen": -0.7113627195358276, "logps/rejected": -1192.8896484375, "loss": 0.2279, "rewards/accuracies": 1.0, "rewards/chosen": 0.20982804894447327, "rewards/margins": 11.532899856567383, "rewards/rejected": -11.32307243347168, "step": 860 }, { "epoch": 0.2172555874641029, "grad_norm": 0.06201171875, "learning_rate": 4.793858149974129e-06, "logits/chosen": -2.134357452392578, "logits/rejected": -2.3488316535949707, "logps/chosen": -1.1498069763183594, "logps/rejected": -1405.57177734375, "loss": 0.2269, "rewards/accuracies": 1.0, "rewards/chosen": 0.2079639434814453, "rewards/margins": 13.713908195495605, "rewards/rejected": -13.505943298339844, "step": 870 }, { "epoch": 0.2197527781246098, "grad_norm": 0.03857421875, "learning_rate": 4.785103214147747e-06, "logits/chosen": -2.244509220123291, "logits/rejected": -2.446852445602417, "logps/chosen": -1.082582950592041, "logps/rejected": -1192.0093994140625, "loss": 0.2264, "rewards/accuracies": 1.0, "rewards/chosen": 0.20774182677268982, "rewards/margins": 11.592524528503418, "rewards/rejected": -11.384782791137695, "step": 880 }, { "epoch": 0.22224996878511674, "grad_norm": 0.0118408203125, "learning_rate": 4.776174548532926e-06, "logits/chosen": -2.1576988697052, "logits/rejected": -2.3463644981384277, "logps/chosen": -1.1917221546173096, "logps/rejected": -1265.5885009765625, "loss": 0.2264, "rewards/accuracies": 1.0, "rewards/chosen": 0.20792751014232635, "rewards/margins": 12.278467178344727, "rewards/rejected": -12.070539474487305, "step": 890 }, { "epoch": 0.22474715944562368, "grad_norm": 0.060546875, "learning_rate": 4.767072831950288e-06, "logits/chosen": -2.2008862495422363, "logits/rejected": -2.402891159057617, "logps/chosen": -1.2017600536346436, "logps/rejected": -1313.045654296875, "loss": 0.2255, "rewards/accuracies": 1.0, "rewards/chosen": 0.2119072675704956, "rewards/margins": 12.807563781738281, "rewards/rejected": -12.59565544128418, "step": 900 }, { "epoch": 0.22724435010613062, "grad_norm": 0.091796875, "learning_rate": 4.7577987563770226e-06, "logits/chosen": -2.1067652702331543, "logits/rejected": -2.324591875076294, "logps/chosen": -2.000681161880493, "logps/rejected": -1264.68115234375, "loss": 0.228, "rewards/accuracies": 1.0, "rewards/chosen": 0.2102380096912384, "rewards/margins": 12.193601608276367, "rewards/rejected": -11.983363151550293, "step": 910 }, { "epoch": 0.22974154076663753, "grad_norm": 0.059814453125, "learning_rate": 4.748353026894273e-06, "logits/chosen": -2.1624951362609863, "logits/rejected": -2.3448517322540283, "logps/chosen": -1.4960781335830688, "logps/rejected": -1188.14990234375, "loss": 0.2268, "rewards/accuracies": 1.0, "rewards/chosen": 0.2139265537261963, "rewards/margins": 11.510043144226074, "rewards/rejected": -11.29611587524414, "step": 920 }, { "epoch": 0.23223873142714446, "grad_norm": 0.080078125, "learning_rate": 4.738736361633532e-06, "logits/chosen": -2.25258207321167, "logits/rejected": -2.4271512031555176, "logps/chosen": -1.7973697185516357, "logps/rejected": -1126.24267578125, "loss": 0.228, "rewards/accuracies": 1.0, "rewards/chosen": 0.20842309296131134, "rewards/margins": 10.903474807739258, "rewards/rejected": -10.695051193237305, "step": 930 }, { "epoch": 0.2347359220876514, "grad_norm": 0.06103515625, "learning_rate": 4.728949491722046e-06, "logits/chosen": -2.274840831756592, "logits/rejected": -2.4521872997283936, "logps/chosen": -0.652289092540741, "logps/rejected": -1062.56494140625, "loss": 0.2295, "rewards/accuracies": 1.0, "rewards/chosen": 0.20718173682689667, "rewards/margins": 10.335628509521484, "rewards/rejected": -10.128446578979492, "step": 940 }, { "epoch": 0.2372331127481583, "grad_norm": 0.0751953125, "learning_rate": 4.718993161227231e-06, "logits/chosen": -2.172180414199829, "logits/rejected": -2.4125022888183594, "logps/chosen": -1.2400215864181519, "logps/rejected": -1376.037841796875, "loss": 0.2256, "rewards/accuracies": 1.0, "rewards/chosen": 0.21632233262062073, "rewards/margins": 13.414273262023926, "rewards/rejected": -13.197952270507812, "step": 950 }, { "epoch": 0.23973030340866525, "grad_norm": 0.00982666015625, "learning_rate": 4.708868127100098e-06, "logits/chosen": -2.2069010734558105, "logits/rejected": -2.3836076259613037, "logps/chosen": -0.6828838586807251, "logps/rejected": -1159.0107421875, "loss": 0.2269, "rewards/accuracies": 1.0, "rewards/chosen": 0.20559605956077576, "rewards/margins": 11.286005973815918, "rewards/rejected": -11.080410957336426, "step": 960 }, { "epoch": 0.24222749406917218, "grad_norm": 0.08740234375, "learning_rate": 4.6985751591177075e-06, "logits/chosen": -2.0572152137756348, "logits/rejected": -2.2502310276031494, "logps/chosen": -1.7850786447525024, "logps/rejected": -1321.8499755859375, "loss": 0.2266, "rewards/accuracies": 1.0, "rewards/chosen": 0.212154358625412, "rewards/margins": 12.830732345581055, "rewards/rejected": -12.618578910827637, "step": 970 }, { "epoch": 0.24472468472967912, "grad_norm": 0.09716796875, "learning_rate": 4.688115039824648e-06, "logits/chosen": -2.1182241439819336, "logits/rejected": -2.292884349822998, "logps/chosen": -0.9138596653938293, "logps/rejected": -1220.1195068359375, "loss": 0.2269, "rewards/accuracies": 1.0, "rewards/chosen": 0.2072029858827591, "rewards/margins": 11.845842361450195, "rewards/rejected": -11.638639450073242, "step": 980 }, { "epoch": 0.24722187539018603, "grad_norm": 0.1005859375, "learning_rate": 4.677488564473535e-06, "logits/chosen": -2.076742649078369, "logits/rejected": -2.280050754547119, "logps/chosen": -2.1341259479522705, "logps/rejected": -1361.389404296875, "loss": 0.2274, "rewards/accuracies": 1.0, "rewards/chosen": 0.20143508911132812, "rewards/margins": 13.227249145507812, "rewards/rejected": -13.0258150100708, "step": 990 }, { "epoch": 0.24971906605069297, "grad_norm": 0.15625, "learning_rate": 4.666696540964556e-06, "logits/chosen": -2.205030918121338, "logits/rejected": -2.380605697631836, "logps/chosen": -1.0865452289581299, "logps/rejected": -1183.8802490234375, "loss": 0.2255, "rewards/accuracies": 1.0, "rewards/chosen": 0.21310412883758545, "rewards/margins": 11.559179306030273, "rewards/rejected": -11.346075057983398, "step": 1000 }, { "epoch": 0.24971906605069297, "eval_logits/chosen": -2.551421880722046, "eval_logits/rejected": -2.637223482131958, "eval_logps/chosen": -0.39880600571632385, "eval_logps/rejected": -585.1870727539062, "eval_loss": 0.22298085689544678, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": 0.25514695048332214, "eval_rewards/margins": 5.658298015594482, "eval_rewards/rejected": -5.403151035308838, "eval_runtime": 0.6597, "eval_samples_per_second": 7.579, "eval_steps_per_second": 4.548, "step": 1000 }, { "epoch": 0.2522162567111999, "grad_norm": 0.0361328125, "learning_rate": 4.6557397897840454e-06, "logits/chosen": -2.226627826690674, "logits/rejected": -2.434197187423706, "logps/chosen": -1.4807536602020264, "logps/rejected": -1233.5753173828125, "loss": 0.2295, "rewards/accuracies": 1.0, "rewards/chosen": 0.21030649542808533, "rewards/margins": 11.924067497253418, "rewards/rejected": -11.713762283325195, "step": 1010 }, { "epoch": 0.2547134473717068, "grad_norm": 0.0311279296875, "learning_rate": 4.644619143942108e-06, "logits/chosen": -2.1962525844573975, "logits/rejected": -2.418130397796631, "logps/chosen": -1.2743520736694336, "logps/rejected": -1324.01123046875, "loss": 0.225, "rewards/accuracies": 1.0, "rewards/chosen": 0.2114056795835495, "rewards/margins": 12.735904693603516, "rewards/rejected": -12.524497985839844, "step": 1020 }, { "epoch": 0.2572106380322138, "grad_norm": 0.1162109375, "learning_rate": 4.633335448909284e-06, "logits/chosen": -2.0575506687164307, "logits/rejected": -2.2430522441864014, "logps/chosen": -1.6322782039642334, "logps/rejected": -1251.030029296875, "loss": 0.2257, "rewards/accuracies": 1.0, "rewards/chosen": 0.21490998566150665, "rewards/margins": 12.10401725769043, "rewards/rejected": -11.889106750488281, "step": 1030 }, { "epoch": 0.2597078286927207, "grad_norm": 0.10400390625, "learning_rate": 4.621889562552272e-06, "logits/chosen": -2.1623690128326416, "logits/rejected": -2.387530565261841, "logps/chosen": -1.5265319347381592, "logps/rejected": -1406.755615234375, "loss": 0.2268, "rewards/accuracies": 1.0, "rewards/chosen": 0.21330364048480988, "rewards/margins": 13.666200637817383, "rewards/rejected": -13.452896118164062, "step": 1040 }, { "epoch": 0.2622050193532276, "grad_norm": 0.134765625, "learning_rate": 4.610282355068707e-06, "logits/chosen": -2.265820264816284, "logits/rejected": -2.481659412384033, "logps/chosen": -1.5380371809005737, "logps/rejected": -1449.8046875, "loss": 0.2253, "rewards/accuracies": 1.0, "rewards/chosen": 0.2142389565706253, "rewards/margins": 14.062037467956543, "rewards/rejected": -13.847798347473145, "step": 1050 }, { "epoch": 0.26470221001373456, "grad_norm": 0.06787109375, "learning_rate": 4.598514708921006e-06, "logits/chosen": -2.249868869781494, "logits/rejected": -2.466034412384033, "logps/chosen": -0.7143852710723877, "logps/rejected": -1382.494140625, "loss": 0.227, "rewards/accuracies": 1.0, "rewards/chosen": 0.2079528272151947, "rewards/margins": 13.4636812210083, "rewards/rejected": -13.255727767944336, "step": 1060 }, { "epoch": 0.26719940067424147, "grad_norm": 0.01153564453125, "learning_rate": 4.5865875187692695e-06, "logits/chosen": -2.1900734901428223, "logits/rejected": -2.3761203289031982, "logps/chosen": -1.549536943435669, "logps/rejected": -1185.685791015625, "loss": 0.2282, "rewards/accuracies": 1.0, "rewards/chosen": 0.20429477095603943, "rewards/margins": 11.534225463867188, "rewards/rejected": -11.32992935180664, "step": 1070 }, { "epoch": 0.2696965913347484, "grad_norm": 0.0830078125, "learning_rate": 4.57450169140327e-06, "logits/chosen": -2.0554583072662354, "logits/rejected": -2.273556709289551, "logps/chosen": -1.3945400714874268, "logps/rejected": -1522.8463134765625, "loss": 0.2259, "rewards/accuracies": 1.0, "rewards/chosen": 0.2084466516971588, "rewards/margins": 14.89411449432373, "rewards/rejected": -14.685667037963867, "step": 1080 }, { "epoch": 0.27219378199525535, "grad_norm": 0.1416015625, "learning_rate": 4.562258145673507e-06, "logits/chosen": -2.20988392829895, "logits/rejected": -2.4358487129211426, "logps/chosen": -1.0550658702850342, "logps/rejected": -1489.2562255859375, "loss": 0.2251, "rewards/accuracies": 1.0, "rewards/chosen": 0.20840421319007874, "rewards/margins": 14.553556442260742, "rewards/rejected": -14.34515380859375, "step": 1090 }, { "epoch": 0.27469097265576226, "grad_norm": 0.017578125, "learning_rate": 4.549857812421353e-06, "logits/chosen": -2.1285512447357178, "logits/rejected": -2.318908929824829, "logps/chosen": -0.753593921661377, "logps/rejected": -1319.107666015625, "loss": 0.2258, "rewards/accuracies": 1.0, "rewards/chosen": 0.20573386549949646, "rewards/margins": 12.884170532226562, "rewards/rejected": -12.678436279296875, "step": 1100 }, { "epoch": 0.2771881633162692, "grad_norm": 0.050537109375, "learning_rate": 4.537301634408281e-06, "logits/chosen": -2.1442999839782715, "logits/rejected": -2.34287691116333, "logps/chosen": -0.9622041583061218, "logps/rejected": -1223.08837890625, "loss": 0.226, "rewards/accuracies": 1.0, "rewards/chosen": 0.21334879100322723, "rewards/margins": 11.921293258666992, "rewards/rejected": -11.707944869995117, "step": 1110 }, { "epoch": 0.27968535397677613, "grad_norm": 0.027099609375, "learning_rate": 4.52459056624419e-06, "logits/chosen": -2.198021173477173, "logits/rejected": -2.3665783405303955, "logps/chosen": -1.6707994937896729, "logps/rejected": -1209.2952880859375, "loss": 0.2269, "rewards/accuracies": 1.0, "rewards/chosen": 0.20634672045707703, "rewards/margins": 11.70842170715332, "rewards/rejected": -11.502074241638184, "step": 1120 }, { "epoch": 0.28218254463728304, "grad_norm": 0.0458984375, "learning_rate": 4.51172557431483e-06, "logits/chosen": -2.0804460048675537, "logits/rejected": -2.27351713180542, "logps/chosen": -1.3884862661361694, "logps/rejected": -1267.9599609375, "loss": 0.227, "rewards/accuracies": 1.0, "rewards/chosen": 0.20677968859672546, "rewards/margins": 12.219032287597656, "rewards/rejected": -12.012252807617188, "step": 1130 }, { "epoch": 0.28467973529779, "grad_norm": 0.0751953125, "learning_rate": 4.49870763670833e-06, "logits/chosen": -2.1440179347991943, "logits/rejected": -2.3646531105041504, "logps/chosen": -0.9940131306648254, "logps/rejected": -1360.1025390625, "loss": 0.2245, "rewards/accuracies": 1.0, "rewards/chosen": 0.2132159024477005, "rewards/margins": 13.244120597839355, "rewards/rejected": -13.030904769897461, "step": 1140 }, { "epoch": 0.2871769259582969, "grad_norm": 0.060546875, "learning_rate": 4.4855377431408335e-06, "logits/chosen": -2.124523639678955, "logits/rejected": -2.308046817779541, "logps/chosen": -1.051758885383606, "logps/rejected": -1258.587158203125, "loss": 0.2254, "rewards/accuracies": 1.0, "rewards/chosen": 0.21350452303886414, "rewards/margins": 12.086160659790039, "rewards/rejected": -11.872655868530273, "step": 1150 }, { "epoch": 0.2896741166188038, "grad_norm": 0.07275390625, "learning_rate": 4.472216894881261e-06, "logits/chosen": -2.12388277053833, "logits/rejected": -2.2992734909057617, "logps/chosen": -1.0673718452453613, "logps/rejected": -1227.642822265625, "loss": 0.2261, "rewards/accuracies": 1.0, "rewards/chosen": 0.21056973934173584, "rewards/margins": 11.97436809539795, "rewards/rejected": -11.763797760009766, "step": 1160 }, { "epoch": 0.2921713072793108, "grad_norm": 0.062255859375, "learning_rate": 4.4587461046751815e-06, "logits/chosen": -2.165827512741089, "logits/rejected": -2.366560697555542, "logps/chosen": -1.3018419742584229, "logps/rejected": -1152.0526123046875, "loss": 0.2272, "rewards/accuracies": 1.0, "rewards/chosen": 0.2166510820388794, "rewards/margins": 11.213326454162598, "rewards/rejected": -10.996675491333008, "step": 1170 }, { "epoch": 0.2946684979398177, "grad_norm": 0.0218505859375, "learning_rate": 4.44512639666781e-06, "logits/chosen": -2.153282642364502, "logits/rejected": -2.3281288146972656, "logps/chosen": -0.8735140562057495, "logps/rejected": -1144.37744140625, "loss": 0.2288, "rewards/accuracies": 1.0, "rewards/chosen": 0.20929470658302307, "rewards/margins": 11.165544509887695, "rewards/rejected": -10.956250190734863, "step": 1180 }, { "epoch": 0.29716568860032466, "grad_norm": 0.2451171875, "learning_rate": 4.431358806326158e-06, "logits/chosen": -2.0921244621276855, "logits/rejected": -2.2888898849487305, "logps/chosen": -1.9632396697998047, "logps/rejected": -1334.217041015625, "loss": 0.2274, "rewards/accuracies": 1.0, "rewards/chosen": 0.21045899391174316, "rewards/margins": 12.872146606445312, "rewards/rejected": -12.661687850952148, "step": 1190 }, { "epoch": 0.29966287926083157, "grad_norm": 0.08349609375, "learning_rate": 4.4174443803603e-06, "logits/chosen": -2.1807141304016113, "logits/rejected": -2.35149884223938, "logps/chosen": -1.1249208450317383, "logps/rejected": -1231.4007568359375, "loss": 0.2276, "rewards/accuracies": 1.0, "rewards/chosen": 0.2088349312543869, "rewards/margins": 11.981757164001465, "rewards/rejected": -11.772923469543457, "step": 1200 }, { "epoch": 0.3021600699213385, "grad_norm": 0.054931640625, "learning_rate": 4.4033841766438e-06, "logits/chosen": -2.153378486633301, "logits/rejected": -2.333552598953247, "logps/chosen": -1.4812664985656738, "logps/rejected": -1186.764404296875, "loss": 0.2282, "rewards/accuracies": 1.0, "rewards/chosen": 0.21333375573158264, "rewards/margins": 11.490147590637207, "rewards/rejected": -11.276814460754395, "step": 1210 }, { "epoch": 0.30465726058184545, "grad_norm": 0.0262451171875, "learning_rate": 4.389179264133281e-06, "logits/chosen": -2.232697010040283, "logits/rejected": -2.418818235397339, "logps/chosen": -0.8499106168746948, "logps/rejected": -1287.507568359375, "loss": 0.2263, "rewards/accuracies": 1.0, "rewards/chosen": 0.20431029796600342, "rewards/margins": 12.579316139221191, "rewards/rejected": -12.375005722045898, "step": 1220 }, { "epoch": 0.30715445124235236, "grad_norm": 0.0478515625, "learning_rate": 4.374830722787159e-06, "logits/chosen": -2.2435195446014404, "logits/rejected": -2.4646503925323486, "logps/chosen": -0.5742496252059937, "logps/rejected": -1343.397216796875, "loss": 0.2276, "rewards/accuracies": 1.0, "rewards/chosen": 0.2151050567626953, "rewards/margins": 13.130419731140137, "rewards/rejected": -12.915315628051758, "step": 1230 }, { "epoch": 0.30965164190285926, "grad_norm": 0.05615234375, "learning_rate": 4.360339643483533e-06, "logits/chosen": -2.2148001194000244, "logits/rejected": -2.421738862991333, "logps/chosen": -1.9802653789520264, "logps/rejected": -1262.169189453125, "loss": 0.2275, "rewards/accuracies": 1.0, "rewards/chosen": 0.20881418883800507, "rewards/margins": 12.177266120910645, "rewards/rejected": -11.968450546264648, "step": 1240 }, { "epoch": 0.31214883256336623, "grad_norm": 0.01348876953125, "learning_rate": 4.345707127937253e-06, "logits/chosen": -2.1191718578338623, "logits/rejected": -2.344691753387451, "logps/chosen": -0.9136890172958374, "logps/rejected": -1512.323974609375, "loss": 0.2245, "rewards/accuracies": 1.0, "rewards/chosen": 0.21416035294532776, "rewards/margins": 14.802743911743164, "rewards/rejected": -14.588582992553711, "step": 1250 }, { "epoch": 0.31464602322387314, "grad_norm": 0.0751953125, "learning_rate": 4.330934288616154e-06, "logits/chosen": -2.1469109058380127, "logits/rejected": -2.3361592292785645, "logps/chosen": -1.4744806289672852, "logps/rejected": -1288.8616943359375, "loss": 0.2258, "rewards/accuracies": 1.0, "rewards/chosen": 0.2088310271501541, "rewards/margins": 12.5834379196167, "rewards/rejected": -12.374608039855957, "step": 1260 }, { "epoch": 0.31714321388438005, "grad_norm": 0.03369140625, "learning_rate": 4.316022248656485e-06, "logits/chosen": -2.0783493518829346, "logits/rejected": -2.3048255443573, "logps/chosen": -1.100656270980835, "logps/rejected": -1277.9552001953125, "loss": 0.2262, "rewards/accuracies": 1.0, "rewards/chosen": 0.20990002155303955, "rewards/margins": 12.193166732788086, "rewards/rejected": -11.983266830444336, "step": 1270 }, { "epoch": 0.319640404544887, "grad_norm": 0.0400390625, "learning_rate": 4.3009721417775166e-06, "logits/chosen": -2.1016387939453125, "logits/rejected": -2.3064982891082764, "logps/chosen": -1.263979196548462, "logps/rejected": -1323.89599609375, "loss": 0.2257, "rewards/accuracies": 1.0, "rewards/chosen": 0.21145665645599365, "rewards/margins": 12.815747261047363, "rewards/rejected": -12.604291915893555, "step": 1280 }, { "epoch": 0.3221375952053939, "grad_norm": 0.10986328125, "learning_rate": 4.285785112195346e-06, "logits/chosen": -2.188570976257324, "logits/rejected": -2.397493600845337, "logps/chosen": -2.353158473968506, "logps/rejected": -1393.356201171875, "loss": 0.2264, "rewards/accuracies": 1.0, "rewards/chosen": 0.20123986899852753, "rewards/margins": 13.566085815429688, "rewards/rejected": -13.364847183227539, "step": 1290 }, { "epoch": 0.3246347858659009, "grad_norm": 0.04345703125, "learning_rate": 4.27046231453591e-06, "logits/chosen": -2.115800142288208, "logits/rejected": -2.314438819885254, "logps/chosen": -1.3714869022369385, "logps/rejected": -1331.2506103515625, "loss": 0.2254, "rewards/accuracies": 1.0, "rewards/chosen": 0.20796707272529602, "rewards/margins": 12.886337280273438, "rewards/rejected": -12.678369522094727, "step": 1300 }, { "epoch": 0.3271319765264078, "grad_norm": 0.0322265625, "learning_rate": 4.255004913747196e-06, "logits/chosen": -2.1591382026672363, "logits/rejected": -2.3501150608062744, "logps/chosen": -0.8996777534484863, "logps/rejected": -1417.157470703125, "loss": 0.2263, "rewards/accuracies": 1.0, "rewards/chosen": 0.2074543684720993, "rewards/margins": 13.838354110717773, "rewards/rejected": -13.630900382995605, "step": 1310 }, { "epoch": 0.3296291671869147, "grad_norm": 0.05126953125, "learning_rate": 4.2394140850106825e-06, "logits/chosen": -2.0840930938720703, "logits/rejected": -2.285808801651001, "logps/chosen": -0.9041382670402527, "logps/rejected": -1322.038818359375, "loss": 0.2264, "rewards/accuracies": 1.0, "rewards/chosen": 0.2139444649219513, "rewards/margins": 12.818634033203125, "rewards/rejected": -12.604690551757812, "step": 1320 }, { "epoch": 0.33212635784742167, "grad_norm": 0.045166015625, "learning_rate": 4.223691013651986e-06, "logits/chosen": -2.141530990600586, "logits/rejected": -2.363454580307007, "logps/chosen": -2.294220209121704, "logps/rejected": -1329.7213134765625, "loss": 0.2239, "rewards/accuracies": 1.0, "rewards/chosen": 0.2177181988954544, "rewards/margins": 12.63646411895752, "rewards/rejected": -12.418745040893555, "step": 1330 }, { "epoch": 0.3346235485079286, "grad_norm": 0.039794921875, "learning_rate": 4.207836895050748e-06, "logits/chosen": -2.263815402984619, "logits/rejected": -2.524907350540161, "logps/chosen": -0.85591059923172, "logps/rejected": -1496.051513671875, "loss": 0.2258, "rewards/accuracies": 1.0, "rewards/chosen": 0.21331222355365753, "rewards/margins": 14.541677474975586, "rewards/rejected": -14.32836627960205, "step": 1340 }, { "epoch": 0.3371207391684355, "grad_norm": 0.056640625, "learning_rate": 4.1918529345497525e-06, "logits/chosen": -2.1795644760131836, "logits/rejected": -2.345736026763916, "logps/chosen": -1.1188920736312866, "logps/rejected": -1032.299560546875, "loss": 0.2274, "rewards/accuracies": 1.0, "rewards/chosen": 0.21017661690711975, "rewards/margins": 9.974283218383789, "rewards/rejected": -9.764104843139648, "step": 1350 }, { "epoch": 0.33961792982894246, "grad_norm": 0.061767578125, "learning_rate": 4.175740347363289e-06, "logits/chosen": -2.2571511268615723, "logits/rejected": -2.450302839279175, "logps/chosen": -2.4634203910827637, "logps/rejected": -1143.845703125, "loss": 0.2276, "rewards/accuracies": 1.0, "rewards/chosen": 0.20782017707824707, "rewards/margins": 10.989904403686523, "rewards/rejected": -10.782083511352539, "step": 1360 }, { "epoch": 0.34211512048944936, "grad_norm": 0.021240234375, "learning_rate": 4.159500358484759e-06, "logits/chosen": -2.104897975921631, "logits/rejected": -2.321760654449463, "logps/chosen": -1.1564667224884033, "logps/rejected": -1532.8436279296875, "loss": 0.2261, "rewards/accuracies": 1.0, "rewards/chosen": 0.21173422038555145, "rewards/margins": 14.948999404907227, "rewards/rejected": -14.737266540527344, "step": 1370 }, { "epoch": 0.3446123111499563, "grad_norm": 0.0306396484375, "learning_rate": 4.143134202593549e-06, "logits/chosen": -2.1347815990448, "logits/rejected": -2.3222789764404297, "logps/chosen": -2.063771963119507, "logps/rejected": -1179.3240966796875, "loss": 0.2263, "rewards/accuracies": 1.0, "rewards/chosen": 0.216390922665596, "rewards/margins": 11.309762001037598, "rewards/rejected": -11.093371391296387, "step": 1380 }, { "epoch": 0.34710950181046324, "grad_norm": 0.03955078125, "learning_rate": 4.126643123961158e-06, "logits/chosen": -2.216097354888916, "logits/rejected": -2.431462049484253, "logps/chosen": -1.3367359638214111, "logps/rejected": -1441.5928955078125, "loss": 0.2263, "rewards/accuracies": 1.0, "rewards/chosen": 0.2119736224412918, "rewards/margins": 14.054840087890625, "rewards/rejected": -13.842867851257324, "step": 1390 }, { "epoch": 0.34960669247097015, "grad_norm": 0.07470703125, "learning_rate": 4.110028376356599e-06, "logits/chosen": -2.194693088531494, "logits/rejected": -2.394153118133545, "logps/chosen": -2.143383264541626, "logps/rejected": -1089.128173828125, "loss": 0.2268, "rewards/accuracies": 1.0, "rewards/chosen": 0.2104116678237915, "rewards/margins": 10.493813514709473, "rewards/rejected": -10.283400535583496, "step": 1400 }, { "epoch": 0.3521038831314771, "grad_norm": 0.03369140625, "learning_rate": 4.093291222951079e-06, "logits/chosen": -2.1454501152038574, "logits/rejected": -2.360769033432007, "logps/chosen": -1.1339516639709473, "logps/rejected": -1363.47119140625, "loss": 0.2251, "rewards/accuracies": 1.0, "rewards/chosen": 0.209524005651474, "rewards/margins": 13.2172269821167, "rewards/rejected": -13.007702827453613, "step": 1410 }, { "epoch": 0.354601073791984, "grad_norm": 0.057373046875, "learning_rate": 4.076432936221965e-06, "logits/chosen": -2.135999917984009, "logits/rejected": -2.3061912059783936, "logps/chosen": -0.5820466876029968, "logps/rejected": -1179.7847900390625, "loss": 0.2283, "rewards/accuracies": 1.0, "rewards/chosen": 0.2054443657398224, "rewards/margins": 11.520200729370117, "rewards/rejected": -11.314754486083984, "step": 1420 }, { "epoch": 0.35709826445249093, "grad_norm": 0.019775390625, "learning_rate": 4.059454797856039e-06, "logits/chosen": -2.172046184539795, "logits/rejected": -2.342928171157837, "logps/chosen": -0.7546096444129944, "logps/rejected": -1167.744873046875, "loss": 0.2286, "rewards/accuracies": 1.0, "rewards/chosen": 0.20582588016986847, "rewards/margins": 11.390329360961914, "rewards/rejected": -11.184503555297852, "step": 1430 }, { "epoch": 0.3595954551129979, "grad_norm": 0.043212890625, "learning_rate": 4.042358098652057e-06, "logits/chosen": -2.244403123855591, "logits/rejected": -2.4426932334899902, "logps/chosen": -1.5733036994934082, "logps/rejected": -1163.822998046875, "loss": 0.2252, "rewards/accuracies": 1.0, "rewards/chosen": 0.21082696318626404, "rewards/margins": 11.297124862670898, "rewards/rejected": -11.086297988891602, "step": 1440 }, { "epoch": 0.3620926457735048, "grad_norm": 0.046142578125, "learning_rate": 4.025144138422615e-06, "logits/chosen": -2.189898729324341, "logits/rejected": -2.393465757369995, "logps/chosen": -1.2910453081130981, "logps/rejected": -1412.8597412109375, "loss": 0.227, "rewards/accuracies": 1.0, "rewards/chosen": 0.21461701393127441, "rewards/margins": 13.799296379089355, "rewards/rejected": -13.584680557250977, "step": 1450 }, { "epoch": 0.3645898364340117, "grad_norm": 0.0169677734375, "learning_rate": 4.007814225895321e-06, "logits/chosen": -2.170092821121216, "logits/rejected": -2.3824923038482666, "logps/chosen": -0.8392337560653687, "logps/rejected": -1365.531005859375, "loss": 0.2266, "rewards/accuracies": 1.0, "rewards/chosen": 0.20813941955566406, "rewards/margins": 13.32819652557373, "rewards/rejected": -13.120054244995117, "step": 1460 }, { "epoch": 0.3670870270945187, "grad_norm": 0.017333984375, "learning_rate": 3.990369678613303e-06, "logits/chosen": -2.0936970710754395, "logits/rejected": -2.3042235374450684, "logps/chosen": -1.4599825143814087, "logps/rejected": -1356.390869140625, "loss": 0.225, "rewards/accuracies": 1.0, "rewards/chosen": 0.21089884638786316, "rewards/margins": 12.903648376464844, "rewards/rejected": -12.6927490234375, "step": 1470 }, { "epoch": 0.3695842177550256, "grad_norm": 0.0240478515625, "learning_rate": 3.97281182283504e-06, "logits/chosen": -2.157559871673584, "logits/rejected": -2.371856927871704, "logps/chosen": -1.3865526914596558, "logps/rejected": -1416.440185546875, "loss": 0.227, "rewards/accuracies": 1.0, "rewards/chosen": 0.20923642814159393, "rewards/margins": 13.767707824707031, "rewards/rejected": -13.558469772338867, "step": 1480 }, { "epoch": 0.3720814084155325, "grad_norm": 0.099609375, "learning_rate": 3.955141993433526e-06, "logits/chosen": -2.2016472816467285, "logits/rejected": -2.3889071941375732, "logps/chosen": -1.0489656925201416, "logps/rejected": -1286.4302978515625, "loss": 0.2253, "rewards/accuracies": 1.0, "rewards/chosen": 0.21162299811840057, "rewards/margins": 12.558609962463379, "rewards/rejected": -12.3469877243042, "step": 1490 }, { "epoch": 0.37457859907603946, "grad_norm": 0.053466796875, "learning_rate": 3.937361533794784e-06, "logits/chosen": -2.1290640830993652, "logits/rejected": -2.337486505508423, "logps/chosen": -1.496525526046753, "logps/rejected": -1124.3212890625, "loss": 0.2277, "rewards/accuracies": 1.0, "rewards/chosen": 0.21092331409454346, "rewards/margins": 10.851540565490723, "rewards/rejected": -10.640616416931152, "step": 1500 }, { "epoch": 0.3770757897365464, "grad_norm": 0.0233154296875, "learning_rate": 3.919471795715738e-06, "logits/chosen": -2.18410587310791, "logits/rejected": -2.3675644397735596, "logps/chosen": -0.84355628490448, "logps/rejected": -1166.61279296875, "loss": 0.226, "rewards/accuracies": 1.0, "rewards/chosen": 0.2062278687953949, "rewards/margins": 11.381316184997559, "rewards/rejected": -11.175088882446289, "step": 1510 }, { "epoch": 0.37957298039705334, "grad_norm": 0.0390625, "learning_rate": 3.901474139301433e-06, "logits/chosen": -2.0796847343444824, "logits/rejected": -2.264577627182007, "logps/chosen": -0.6843720078468323, "logps/rejected": -1241.1590576171875, "loss": 0.2259, "rewards/accuracies": 1.0, "rewards/chosen": 0.21053218841552734, "rewards/margins": 12.031414031982422, "rewards/rejected": -11.820880889892578, "step": 1520 }, { "epoch": 0.38207017105756025, "grad_norm": 0.0869140625, "learning_rate": 3.883369932861634e-06, "logits/chosen": -2.2165303230285645, "logits/rejected": -2.3859565258026123, "logps/chosen": -1.1263262033462524, "logps/rejected": -1200.8397216796875, "loss": 0.2251, "rewards/accuracies": 1.0, "rewards/chosen": 0.20847392082214355, "rewards/margins": 11.745917320251465, "rewards/rejected": -11.537444114685059, "step": 1530 }, { "epoch": 0.38456736171806716, "grad_norm": 0.06298828125, "learning_rate": 3.865160552806796e-06, "logits/chosen": -2.262539863586426, "logits/rejected": -2.4538345336914062, "logps/chosen": -1.3924305438995361, "logps/rejected": -1240.5035400390625, "loss": 0.2259, "rewards/accuracies": 1.0, "rewards/chosen": 0.20712998509407043, "rewards/margins": 12.123323440551758, "rewards/rejected": -11.916193008422852, "step": 1540 }, { "epoch": 0.3870645523785741, "grad_norm": 0.041015625, "learning_rate": 3.84684738354342e-06, "logits/chosen": -2.267106771469116, "logits/rejected": -2.4566650390625, "logps/chosen": -2.0142922401428223, "logps/rejected": -1211.2545166015625, "loss": 0.2271, "rewards/accuracies": 1.0, "rewards/chosen": 0.2093096524477005, "rewards/margins": 11.762309074401855, "rewards/rejected": -11.552999496459961, "step": 1550 }, { "epoch": 0.38956174303908103, "grad_norm": 0.07958984375, "learning_rate": 3.828431817368798e-06, "logits/chosen": -2.141620397567749, "logits/rejected": -2.33925199508667, "logps/chosen": -1.531597375869751, "logps/rejected": -1257.968994140625, "loss": 0.228, "rewards/accuracies": 1.0, "rewards/chosen": 0.2043263465166092, "rewards/margins": 12.106410026550293, "rewards/rejected": -11.902084350585938, "step": 1560 }, { "epoch": 0.39205893369958794, "grad_norm": 0.07763671875, "learning_rate": 3.8099152543651684e-06, "logits/chosen": -2.3559296131134033, "logits/rejected": -2.583070993423462, "logps/chosen": -0.7891671061515808, "logps/rejected": -1441.2958984375, "loss": 0.2257, "rewards/accuracies": 1.0, "rewards/chosen": 0.20671992003917694, "rewards/margins": 14.086430549621582, "rewards/rejected": -13.87971019744873, "step": 1570 }, { "epoch": 0.3945561243600949, "grad_norm": 0.03857421875, "learning_rate": 3.791299102293261e-06, "logits/chosen": -2.1035549640655518, "logits/rejected": -2.3072731494903564, "logps/chosen": -1.0839884281158447, "logps/rejected": -1459.4197998046875, "loss": 0.2256, "rewards/accuracies": 1.0, "rewards/chosen": 0.21341785788536072, "rewards/margins": 14.197916984558105, "rewards/rejected": -13.98449993133545, "step": 1580 }, { "epoch": 0.3970533150206018, "grad_norm": 0.03369140625, "learning_rate": 3.7725847764852774e-06, "logits/chosen": -2.10914945602417, "logits/rejected": -2.3385162353515625, "logps/chosen": -1.6078799962997437, "logps/rejected": -1307.208740234375, "loss": 0.2257, "rewards/accuracies": 1.0, "rewards/chosen": 0.2162178009748459, "rewards/margins": 12.54298210144043, "rewards/rejected": -12.326765060424805, "step": 1590 }, { "epoch": 0.3995505056811087, "grad_norm": 0.0267333984375, "learning_rate": 3.7537736997372833e-06, "logits/chosen": -2.1722114086151123, "logits/rejected": -2.3555681705474854, "logps/chosen": -1.133063793182373, "logps/rejected": -1113.764404296875, "loss": 0.2263, "rewards/accuracies": 1.0, "rewards/chosen": 0.21135945618152618, "rewards/margins": 10.682828903198242, "rewards/rejected": -10.471468925476074, "step": 1600 }, { "epoch": 0.4020476963416157, "grad_norm": 0.020751953125, "learning_rate": 3.734867302201038e-06, "logits/chosen": -2.2481324672698975, "logits/rejected": -2.4178614616394043, "logps/chosen": -0.7748688459396362, "logps/rejected": -1153.1929931640625, "loss": 0.2264, "rewards/accuracies": 1.0, "rewards/chosen": 0.2061961144208908, "rewards/margins": 11.231634140014648, "rewards/rejected": -11.02543830871582, "step": 1610 }, { "epoch": 0.4045448870021226, "grad_norm": 0.046630859375, "learning_rate": 3.7158670212752666e-06, "logits/chosen": -2.158440113067627, "logits/rejected": -2.3695878982543945, "logps/chosen": -0.685897946357727, "logps/rejected": -1294.4326171875, "loss": 0.227, "rewards/accuracies": 1.0, "rewards/chosen": 0.2151576578617096, "rewards/margins": 12.627668380737305, "rewards/rejected": -12.412511825561523, "step": 1620 }, { "epoch": 0.40704207766262956, "grad_norm": 0.015869140625, "learning_rate": 3.696774301496376e-06, "logits/chosen": -2.2252297401428223, "logits/rejected": -2.4217424392700195, "logps/chosen": -0.6748331785202026, "logps/rejected": -1261.10009765625, "loss": 0.2248, "rewards/accuracies": 1.0, "rewards/chosen": 0.21283042430877686, "rewards/margins": 12.33554458618164, "rewards/rejected": -12.122715950012207, "step": 1630 }, { "epoch": 0.4095392683231365, "grad_norm": 0.0283203125, "learning_rate": 3.677590594428629e-06, "logits/chosen": -2.159726619720459, "logits/rejected": -2.3402228355407715, "logps/chosen": -0.9869475364685059, "logps/rejected": -1201.0703125, "loss": 0.2279, "rewards/accuracies": 1.0, "rewards/chosen": 0.20935773849487305, "rewards/margins": 11.699995994567871, "rewards/rejected": -11.490636825561523, "step": 1640 }, { "epoch": 0.4120364589836434, "grad_norm": 0.07470703125, "learning_rate": 3.658317358553794e-06, "logits/chosen": -2.1311771869659424, "logits/rejected": -2.3283205032348633, "logps/chosen": -0.7873401045799255, "logps/rejected": -1318.947265625, "loss": 0.2256, "rewards/accuracies": 1.0, "rewards/chosen": 0.20913653075695038, "rewards/margins": 12.813528060913086, "rewards/rejected": -12.604392051696777, "step": 1650 }, { "epoch": 0.41453364964415035, "grad_norm": 0.06494140625, "learning_rate": 3.638956059160252e-06, "logits/chosen": -2.180502414703369, "logits/rejected": -2.3862075805664062, "logps/chosen": -1.0054365396499634, "logps/rejected": -1342.7799072265625, "loss": 0.2253, "rewards/accuracies": 1.0, "rewards/chosen": 0.21474532783031464, "rewards/margins": 13.147130966186523, "rewards/rejected": -12.932388305664062, "step": 1660 }, { "epoch": 0.41703084030465726, "grad_norm": 0.03369140625, "learning_rate": 3.6195081682315972e-06, "logits/chosen": -2.2029502391815186, "logits/rejected": -2.3754451274871826, "logps/chosen": -1.1696422100067139, "logps/rejected": -1324.997802734375, "loss": 0.2252, "rewards/accuracies": 1.0, "rewards/chosen": 0.20706875622272491, "rewards/margins": 12.972146987915039, "rewards/rejected": -12.765077590942383, "step": 1670 }, { "epoch": 0.41952803096516417, "grad_norm": 0.05126953125, "learning_rate": 3.5999751643347342e-06, "logits/chosen": -2.126647472381592, "logits/rejected": -2.32842755317688, "logps/chosen": -1.3129024505615234, "logps/rejected": -1431.062255859375, "loss": 0.2246, "rewards/accuracies": 1.0, "rewards/chosen": 0.2164861261844635, "rewards/margins": 13.894182205200195, "rewards/rejected": -13.677694320678711, "step": 1680 }, { "epoch": 0.42202522162567113, "grad_norm": 0.05126953125, "learning_rate": 3.5803585325074536e-06, "logits/chosen": -2.1573426723480225, "logits/rejected": -2.3461415767669678, "logps/chosen": -0.5849089622497559, "logps/rejected": -1369.0498046875, "loss": 0.2265, "rewards/accuracies": 1.0, "rewards/chosen": 0.20605847239494324, "rewards/margins": 13.385258674621582, "rewards/rejected": -13.179201126098633, "step": 1690 }, { "epoch": 0.42452241228617804, "grad_norm": 0.041015625, "learning_rate": 3.5606597641455387e-06, "logits/chosen": -2.201714515686035, "logits/rejected": -2.3846235275268555, "logps/chosen": -1.2365072965621948, "logps/rejected": -1268.500732421875, "loss": 0.2273, "rewards/accuracies": 1.0, "rewards/chosen": 0.20618323981761932, "rewards/margins": 12.352148056030273, "rewards/rejected": -12.145965576171875, "step": 1700 }, { "epoch": 0.427019602946685, "grad_norm": 0.6015625, "learning_rate": 3.540880356889376e-06, "logits/chosen": -2.204244375228882, "logits/rejected": -2.37742280960083, "logps/chosen": -1.9021589756011963, "logps/rejected": -1228.02685546875, "loss": 0.228, "rewards/accuracies": 1.0, "rewards/chosen": 0.1988053023815155, "rewards/margins": 11.881242752075195, "rewards/rejected": -11.682435035705566, "step": 1710 }, { "epoch": 0.4295167936071919, "grad_norm": 0.05712890625, "learning_rate": 3.5210218145100934e-06, "logits/chosen": -2.1249117851257324, "logits/rejected": -2.343653917312622, "logps/chosen": -0.9779669642448425, "logps/rejected": -1107.069580078125, "loss": 0.2291, "rewards/accuracies": 1.0, "rewards/chosen": 0.20971660315990448, "rewards/margins": 10.748934745788574, "rewards/rejected": -10.53921890258789, "step": 1720 }, { "epoch": 0.4320139842676988, "grad_norm": 0.04931640625, "learning_rate": 3.5010856467952335e-06, "logits/chosen": -2.135411262512207, "logits/rejected": -2.3283915519714355, "logps/chosen": -1.680784821510315, "logps/rejected": -1203.44873046875, "loss": 0.2277, "rewards/accuracies": 1.0, "rewards/chosen": 0.21119749546051025, "rewards/margins": 11.60279655456543, "rewards/rejected": -11.391599655151367, "step": 1730 }, { "epoch": 0.4345111749282058, "grad_norm": 0.061279296875, "learning_rate": 3.4810733694339687e-06, "logits/chosen": -2.227553367614746, "logits/rejected": -2.4453303813934326, "logps/chosen": -1.1945085525512695, "logps/rejected": -1365.62158203125, "loss": 0.2255, "rewards/accuracies": 1.0, "rewards/chosen": 0.21306195855140686, "rewards/margins": 13.283732414245605, "rewards/rejected": -13.070669174194336, "step": 1740 }, { "epoch": 0.4370083655887127, "grad_norm": 0.026123046875, "learning_rate": 3.4609865039018676e-06, "logits/chosen": -2.2143800258636475, "logits/rejected": -2.38647198677063, "logps/chosen": -0.3982168138027191, "logps/rejected": -1256.0924072265625, "loss": 0.2282, "rewards/accuracies": 1.0, "rewards/chosen": 0.20522812008857727, "rewards/margins": 12.254903793334961, "rewards/rejected": -12.049676895141602, "step": 1750 }, { "epoch": 0.4395055562492196, "grad_norm": 0.017578125, "learning_rate": 3.4408265773452226e-06, "logits/chosen": -2.132845401763916, "logits/rejected": -2.32383394241333, "logps/chosen": -0.7928985953330994, "logps/rejected": -1260.4219970703125, "loss": 0.2281, "rewards/accuracies": 1.0, "rewards/chosen": 0.21432673931121826, "rewards/margins": 12.288119316101074, "rewards/rejected": -12.07379150390625, "step": 1760 }, { "epoch": 0.4420027469097266, "grad_norm": 0.027587890625, "learning_rate": 3.420595122464942e-06, "logits/chosen": -2.2310843467712402, "logits/rejected": -2.43049693107605, "logps/chosen": -1.0165212154388428, "logps/rejected": -1248.940673828125, "loss": 0.2265, "rewards/accuracies": 1.0, "rewards/chosen": 0.20808283984661102, "rewards/margins": 12.158212661743164, "rewards/rejected": -11.950130462646484, "step": 1770 }, { "epoch": 0.4444999375702335, "grad_norm": 0.05029296875, "learning_rate": 3.4002936774000284e-06, "logits/chosen": -2.129657030105591, "logits/rejected": -2.3626723289489746, "logps/chosen": -0.534063994884491, "logps/rejected": -1597.343017578125, "loss": 0.2245, "rewards/accuracies": 1.0, "rewards/chosen": 0.21577997505664825, "rewards/margins": 15.636571884155273, "rewards/rejected": -15.420791625976562, "step": 1780 }, { "epoch": 0.4469971282307404, "grad_norm": 0.02587890625, "learning_rate": 3.3799237856106348e-06, "logits/chosen": -2.1293628215789795, "logits/rejected": -2.3366832733154297, "logps/chosen": -0.6109465956687927, "logps/rejected": -1318.239990234375, "loss": 0.2255, "rewards/accuracies": 1.0, "rewards/chosen": 0.2080315351486206, "rewards/margins": 12.848733901977539, "rewards/rejected": -12.640703201293945, "step": 1790 }, { "epoch": 0.44949431889124736, "grad_norm": 0.060546875, "learning_rate": 3.35948699576072e-06, "logits/chosen": -2.0792922973632812, "logits/rejected": -2.285391330718994, "logps/chosen": -0.9549906849861145, "logps/rejected": -1534.51953125, "loss": 0.2244, "rewards/accuracies": 1.0, "rewards/chosen": 0.21579799056053162, "rewards/margins": 14.997169494628906, "rewards/rejected": -14.7813720703125, "step": 1800 }, { "epoch": 0.45199150955175427, "grad_norm": 0.09033203125, "learning_rate": 3.3389848616003085e-06, "logits/chosen": -2.169448137283325, "logits/rejected": -2.34112286567688, "logps/chosen": -1.1561418771743774, "logps/rejected": -1331.464111328125, "loss": 0.2289, "rewards/accuracies": 1.0, "rewards/chosen": 0.20999138057231903, "rewards/margins": 12.991872787475586, "rewards/rejected": -12.781880378723145, "step": 1810 }, { "epoch": 0.45448870021226123, "grad_norm": 0.024169921875, "learning_rate": 3.3184189418473674e-06, "logits/chosen": -2.0690829753875732, "logits/rejected": -2.2553791999816895, "logps/chosen": -0.737138032913208, "logps/rejected": -1278.2681884765625, "loss": 0.2266, "rewards/accuracies": 1.0, "rewards/chosen": 0.20780067145824432, "rewards/margins": 12.481771469116211, "rewards/rejected": -12.273969650268555, "step": 1820 }, { "epoch": 0.45698589087276814, "grad_norm": 0.0810546875, "learning_rate": 3.2977908000692925e-06, "logits/chosen": -2.1408801078796387, "logits/rejected": -2.3243911266326904, "logps/chosen": -1.5268166065216064, "logps/rejected": -1405.22412109375, "loss": 0.2274, "rewards/accuracies": 1.0, "rewards/chosen": 0.20781424641609192, "rewards/margins": 13.740381240844727, "rewards/rejected": -13.532565116882324, "step": 1830 }, { "epoch": 0.45948308153327505, "grad_norm": 0.04736328125, "learning_rate": 3.2771020045640435e-06, "logits/chosen": -2.286168336868286, "logits/rejected": -2.4684412479400635, "logps/chosen": -0.6708983182907104, "logps/rejected": -1134.7979736328125, "loss": 0.2259, "rewards/accuracies": 1.0, "rewards/chosen": 0.21890632808208466, "rewards/margins": 11.039240837097168, "rewards/rejected": -10.820335388183594, "step": 1840 }, { "epoch": 0.461980272193782, "grad_norm": 0.099609375, "learning_rate": 3.256354128240907e-06, "logits/chosen": -2.06745982170105, "logits/rejected": -2.248892307281494, "logps/chosen": -1.6344282627105713, "logps/rejected": -1263.974853515625, "loss": 0.226, "rewards/accuracies": 1.0, "rewards/chosen": 0.21318969130516052, "rewards/margins": 12.202125549316406, "rewards/rejected": -11.988935470581055, "step": 1850 }, { "epoch": 0.4644774628542889, "grad_norm": 0.0654296875, "learning_rate": 3.235548748500914e-06, "logits/chosen": -2.3071300983428955, "logits/rejected": -2.500091314315796, "logps/chosen": -1.0427045822143555, "logps/rejected": -1357.378662109375, "loss": 0.2275, "rewards/accuracies": 1.0, "rewards/chosen": 0.20457443594932556, "rewards/margins": 13.286227226257324, "rewards/rejected": -13.081652641296387, "step": 1860 }, { "epoch": 0.46697465351479583, "grad_norm": 0.04248046875, "learning_rate": 3.214687447116913e-06, "logits/chosen": -2.10600209236145, "logits/rejected": -2.302873373031616, "logps/chosen": -0.6546305418014526, "logps/rejected": -1224.43359375, "loss": 0.2272, "rewards/accuracies": 1.0, "rewards/chosen": 0.20871946215629578, "rewards/margins": 11.745490074157715, "rewards/rejected": -11.536770820617676, "step": 1870 }, { "epoch": 0.4694718441753028, "grad_norm": 0.01104736328125, "learning_rate": 3.193771810113313e-06, "logits/chosen": -2.1570992469787598, "logits/rejected": -2.384364604949951, "logps/chosen": -1.154052495956421, "logps/rejected": -1359.59619140625, "loss": 0.2249, "rewards/accuracies": 1.0, "rewards/chosen": 0.21566633880138397, "rewards/margins": 13.259126663208008, "rewards/rejected": -13.043458938598633, "step": 1880 }, { "epoch": 0.4719690348358097, "grad_norm": 0.0174560546875, "learning_rate": 3.1728034276455032e-06, "logits/chosen": -2.138918399810791, "logits/rejected": -2.335463047027588, "logps/chosen": -0.595456600189209, "logps/rejected": -1286.499267578125, "loss": 0.2265, "rewards/accuracies": 1.0, "rewards/chosen": 0.21340537071228027, "rewards/margins": 12.50808048248291, "rewards/rejected": -12.294673919677734, "step": 1890 }, { "epoch": 0.4744662254963166, "grad_norm": 0.0191650390625, "learning_rate": 3.1517838938789597e-06, "logits/chosen": -2.1312789916992188, "logits/rejected": -2.3574845790863037, "logps/chosen": -1.0333608388900757, "logps/rejected": -1402.2928466796875, "loss": 0.2258, "rewards/accuracies": 1.0, "rewards/chosen": 0.21823573112487793, "rewards/margins": 13.426950454711914, "rewards/rejected": -13.208715438842773, "step": 1900 }, { "epoch": 0.4769634161568236, "grad_norm": 0.021240234375, "learning_rate": 3.130714806868041e-06, "logits/chosen": -2.1018803119659424, "logits/rejected": -2.2899601459503174, "logps/chosen": -1.5672905445098877, "logps/rejected": -1282.0211181640625, "loss": 0.2247, "rewards/accuracies": 1.0, "rewards/chosen": 0.2161625623703003, "rewards/margins": 12.432838439941406, "rewards/rejected": -12.216676712036133, "step": 1910 }, { "epoch": 0.4794606068173305, "grad_norm": 0.03369140625, "learning_rate": 3.1095977684344976e-06, "logits/chosen": -2.1870434284210205, "logits/rejected": -2.3968632221221924, "logps/chosen": -0.9621860384941101, "logps/rejected": -1362.8802490234375, "loss": 0.2252, "rewards/accuracies": 1.0, "rewards/chosen": 0.21506325900554657, "rewards/margins": 13.313095092773438, "rewards/rejected": -13.098034858703613, "step": 1920 }, { "epoch": 0.48195779747783746, "grad_norm": 0.052978515625, "learning_rate": 3.0884343840456874e-06, "logits/chosen": -2.2485427856445312, "logits/rejected": -2.4523234367370605, "logps/chosen": -0.8971269726753235, "logps/rejected": -1507.810791015625, "loss": 0.226, "rewards/accuracies": 1.0, "rewards/chosen": 0.20877547562122345, "rewards/margins": 14.738133430480957, "rewards/rejected": -14.529356002807617, "step": 1930 }, { "epoch": 0.48445498813834437, "grad_norm": 0.07568359375, "learning_rate": 3.0672262626925174e-06, "logits/chosen": -2.148587942123413, "logits/rejected": -2.359325408935547, "logps/chosen": -2.250260353088379, "logps/rejected": -1421.3468017578125, "loss": 0.2252, "rewards/accuracies": 1.0, "rewards/chosen": 0.22058424353599548, "rewards/margins": 13.836527824401855, "rewards/rejected": -13.615945816040039, "step": 1940 }, { "epoch": 0.4869521787988513, "grad_norm": 0.06640625, "learning_rate": 3.0459750167671147e-06, "logits/chosen": -2.1717689037323, "logits/rejected": -2.403097629547119, "logps/chosen": -1.1346304416656494, "logps/rejected": -1519.8033447265625, "loss": 0.2253, "rewards/accuracies": 1.0, "rewards/chosen": 0.21193809807300568, "rewards/margins": 14.729626655578613, "rewards/rejected": -14.517687797546387, "step": 1950 }, { "epoch": 0.48944936945935824, "grad_norm": 0.12890625, "learning_rate": 3.024682261940247e-06, "logits/chosen": -2.1400859355926514, "logits/rejected": -2.3196842670440674, "logps/chosen": -1.9256393909454346, "logps/rejected": -1212.4700927734375, "loss": 0.2259, "rewards/accuracies": 1.0, "rewards/chosen": 0.21558912098407745, "rewards/margins": 11.704329490661621, "rewards/rejected": -11.488740921020508, "step": 1960 }, { "epoch": 0.49194656011986515, "grad_norm": 0.053466796875, "learning_rate": 3.0033496170384803e-06, "logits/chosen": -2.2003872394561768, "logits/rejected": -2.384770154953003, "logps/chosen": -0.6797516345977783, "logps/rejected": -1223.4056396484375, "loss": 0.2273, "rewards/accuracies": 1.0, "rewards/chosen": 0.20655830204486847, "rewards/margins": 11.95020866394043, "rewards/rejected": -11.743650436401367, "step": 1970 }, { "epoch": 0.49444375078037206, "grad_norm": 0.05126953125, "learning_rate": 2.9819787039211068e-06, "logits/chosen": -2.1409530639648438, "logits/rejected": -2.3441128730773926, "logps/chosen": -1.6590759754180908, "logps/rejected": -1320.5748291015625, "loss": 0.225, "rewards/accuracies": 1.0, "rewards/chosen": 0.21551513671875, "rewards/margins": 12.755599021911621, "rewards/rejected": -12.540084838867188, "step": 1980 }, { "epoch": 0.496940941440879, "grad_norm": 0.015869140625, "learning_rate": 2.960571147356845e-06, "logits/chosen": -2.2252392768859863, "logits/rejected": -2.4482040405273438, "logps/chosen": -0.6751580238342285, "logps/rejected": -1514.2879638671875, "loss": 0.2233, "rewards/accuracies": 1.0, "rewards/chosen": 0.22100117802619934, "rewards/margins": 14.839349746704102, "rewards/rejected": -14.618349075317383, "step": 1990 }, { "epoch": 0.49943813210138593, "grad_norm": 0.11376953125, "learning_rate": 2.9391285749003046e-06, "logits/chosen": -2.1313652992248535, "logits/rejected": -2.3276991844177246, "logps/chosen": -1.28163743019104, "logps/rejected": -1614.152099609375, "loss": 0.2252, "rewards/accuracies": 1.0, "rewards/chosen": 0.21389129757881165, "rewards/margins": 15.772817611694336, "rewards/rejected": -15.558927536010742, "step": 2000 }, { "epoch": 0.49943813210138593, "eval_logits/chosen": -2.568960428237915, "eval_logits/rejected": -2.656001329421997, "eval_logps/chosen": -0.1526380479335785, "eval_logps/rejected": -643.470458984375, "eval_loss": 0.2215292751789093, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": 0.25760865211486816, "eval_rewards/margins": 6.243593215942383, "eval_rewards/rejected": -5.985984802246094, "eval_runtime": 0.6593, "eval_samples_per_second": 7.584, "eval_steps_per_second": 4.551, "step": 2000 }, { "epoch": 0.5019353227618929, "grad_norm": 0.072265625, "learning_rate": 2.9176526167682543e-06, "logits/chosen": -2.0913753509521484, "logits/rejected": -2.273857593536377, "logps/chosen": -0.7355623841285706, "logps/rejected": -1363.037841796875, "loss": 0.228, "rewards/accuracies": 1.0, "rewards/chosen": 0.2061166763305664, "rewards/margins": 13.315282821655273, "rewards/rejected": -13.109164237976074, "step": 2010 }, { "epoch": 0.5044325134223998, "grad_norm": 0.0242919921875, "learning_rate": 2.8961449057156775e-06, "logits/chosen": -2.1776702404022217, "logits/rejected": -2.3788368701934814, "logps/chosen": -1.159735918045044, "logps/rejected": -1370.439697265625, "loss": 0.2245, "rewards/accuracies": 1.0, "rewards/chosen": 0.21482279896736145, "rewards/margins": 13.327527046203613, "rewards/rejected": -13.112703323364258, "step": 2020 }, { "epoch": 0.5069297040829067, "grad_norm": 0.0654296875, "learning_rate": 2.874607076911642e-06, "logits/chosen": -2.1823270320892334, "logits/rejected": -2.400944471359253, "logps/chosen": -1.355530023574829, "logps/rejected": -1275.2886962890625, "loss": 0.2253, "rewards/accuracies": 1.0, "rewards/chosen": 0.2216695249080658, "rewards/margins": 12.436738967895508, "rewards/rejected": -12.215067863464355, "step": 2030 }, { "epoch": 0.5094268947434136, "grad_norm": 0.06689453125, "learning_rate": 2.8530407678149806e-06, "logits/chosen": -2.1733579635620117, "logits/rejected": -2.3787028789520264, "logps/chosen": -2.122178554534912, "logps/rejected": -1217.6248779296875, "loss": 0.2275, "rewards/accuracies": 1.0, "rewards/chosen": 0.21247439086437225, "rewards/margins": 11.738574028015137, "rewards/rejected": -11.526100158691406, "step": 2040 }, { "epoch": 0.5119240854039205, "grad_norm": 0.0164794921875, "learning_rate": 2.8314476180498003e-06, "logits/chosen": -2.010568618774414, "logits/rejected": -2.1947145462036133, "logps/chosen": -0.8790448904037476, "logps/rejected": -1320.770263671875, "loss": 0.2275, "rewards/accuracies": 1.0, "rewards/chosen": 0.20843760669231415, "rewards/margins": 12.884744644165039, "rewards/rejected": -12.67630672454834, "step": 2050 }, { "epoch": 0.5144212760644276, "grad_norm": 0.037109375, "learning_rate": 2.8098292692808253e-06, "logits/chosen": -2.1951942443847656, "logits/rejected": -2.3474528789520264, "logps/chosen": -0.8600829839706421, "logps/rejected": -1061.1048583984375, "loss": 0.2279, "rewards/accuracies": 1.0, "rewards/chosen": 0.20944657921791077, "rewards/margins": 10.36804485321045, "rewards/rejected": -10.158597946166992, "step": 2060 }, { "epoch": 0.5169184667249345, "grad_norm": 0.0224609375, "learning_rate": 2.7881873650885904e-06, "logits/chosen": -2.1963181495666504, "logits/rejected": -2.3679440021514893, "logps/chosen": -0.8357653617858887, "logps/rejected": -1268.226318359375, "loss": 0.2271, "rewards/accuracies": 1.0, "rewards/chosen": 0.21145395934581757, "rewards/margins": 12.395639419555664, "rewards/rejected": -12.184186935424805, "step": 2070 }, { "epoch": 0.5194156573854414, "grad_norm": 0.03955078125, "learning_rate": 2.7665235508444772e-06, "logits/chosen": -2.131880044937134, "logits/rejected": -2.329930067062378, "logps/chosen": -0.8339768648147583, "logps/rejected": -1511.36962890625, "loss": 0.2278, "rewards/accuracies": 1.0, "rewards/chosen": 0.2037159651517868, "rewards/margins": 14.78296184539795, "rewards/rejected": -14.579244613647461, "step": 2080 }, { "epoch": 0.5219128480459483, "grad_norm": 0.1123046875, "learning_rate": 2.7448394735856275e-06, "logits/chosen": -2.0990092754364014, "logits/rejected": -2.317046642303467, "logps/chosen": -0.900246798992157, "logps/rejected": -1560.1123046875, "loss": 0.2273, "rewards/accuracies": 1.0, "rewards/chosen": 0.20990662276744843, "rewards/margins": 15.233263969421387, "rewards/rejected": -15.023355484008789, "step": 2090 }, { "epoch": 0.5244100387064552, "grad_norm": 0.05810546875, "learning_rate": 2.723136781889722e-06, "logits/chosen": -2.221381664276123, "logits/rejected": -2.4073383808135986, "logps/chosen": -1.555213451385498, "logps/rejected": -1313.25439453125, "loss": 0.2281, "rewards/accuracies": 1.0, "rewards/chosen": 0.21150963008403778, "rewards/margins": 12.850160598754883, "rewards/rejected": -12.638651847839355, "step": 2100 }, { "epoch": 0.5269072293669622, "grad_norm": 0.031494140625, "learning_rate": 2.7014171257496414e-06, "logits/chosen": -2.224299669265747, "logits/rejected": -2.4082083702087402, "logps/chosen": -1.5661276578903198, "logps/rejected": -1288.989013671875, "loss": 0.2263, "rewards/accuracies": 1.0, "rewards/chosen": 0.2089545726776123, "rewards/margins": 12.412330627441406, "rewards/rejected": -12.203374862670898, "step": 2110 }, { "epoch": 0.5294044200274691, "grad_norm": 0.044189453125, "learning_rate": 2.6796821564480237e-06, "logits/chosen": -2.143993854522705, "logits/rejected": -2.3330225944519043, "logps/chosen": -1.3014509677886963, "logps/rejected": -1159.53271484375, "loss": 0.2263, "rewards/accuracies": 1.0, "rewards/chosen": 0.21552510559558868, "rewards/margins": 11.207192420959473, "rewards/rejected": -10.991667747497559, "step": 2120 }, { "epoch": 0.531901610687976, "grad_norm": 0.032958984375, "learning_rate": 2.6579335264317253e-06, "logits/chosen": -2.2805047035217285, "logits/rejected": -2.4840075969696045, "logps/chosen": -0.6564453840255737, "logps/rejected": -1376.549560546875, "loss": 0.2269, "rewards/accuracies": 1.0, "rewards/chosen": 0.20875303447246552, "rewards/margins": 13.3289794921875, "rewards/rejected": -13.120226860046387, "step": 2130 }, { "epoch": 0.5343988013484829, "grad_norm": 0.02587890625, "learning_rate": 2.6361728891861843e-06, "logits/chosen": -2.044534206390381, "logits/rejected": -2.263455629348755, "logps/chosen": -2.359926462173462, "logps/rejected": -1182.7542724609375, "loss": 0.2261, "rewards/accuracies": 1.0, "rewards/chosen": 0.21868690848350525, "rewards/margins": 11.231551170349121, "rewards/rejected": -11.01286506652832, "step": 2140 }, { "epoch": 0.5368959920089899, "grad_norm": 0.0235595703125, "learning_rate": 2.614401899109716e-06, "logits/chosen": -2.2184996604919434, "logits/rejected": -2.4115943908691406, "logps/chosen": -0.7188009023666382, "logps/rejected": -1362.302490234375, "loss": 0.2252, "rewards/accuracies": 1.0, "rewards/chosen": 0.20849958062171936, "rewards/margins": 13.323092460632324, "rewards/rejected": -13.114593505859375, "step": 2150 }, { "epoch": 0.5393931826694968, "grad_norm": 0.023681640625, "learning_rate": 2.5926222113877282e-06, "logits/chosen": -2.2279531955718994, "logits/rejected": -2.4470245838165283, "logps/chosen": -0.8932285308837891, "logps/rejected": -1380.791748046875, "loss": 0.2261, "rewards/accuracies": 1.0, "rewards/chosen": 0.20549210906028748, "rewards/margins": 13.206730842590332, "rewards/rejected": -13.001237869262695, "step": 2160 }, { "epoch": 0.5418903733300038, "grad_norm": 0.09619140625, "learning_rate": 2.570835481866889e-06, "logits/chosen": -2.122584819793701, "logits/rejected": -2.3029303550720215, "logps/chosen": -0.6316767334938049, "logps/rejected": -1331.388916015625, "loss": 0.2263, "rewards/accuracies": 1.0, "rewards/chosen": 0.20623505115509033, "rewards/margins": 13.001462936401367, "rewards/rejected": -12.795228958129883, "step": 2170 }, { "epoch": 0.5443875639905107, "grad_norm": 0.04248046875, "learning_rate": 2.5490433669292337e-06, "logits/chosen": -2.044675350189209, "logits/rejected": -2.251300811767578, "logps/chosen": -0.7981548309326172, "logps/rejected": -1485.2850341796875, "loss": 0.2262, "rewards/accuracies": 1.0, "rewards/chosen": 0.2164611518383026, "rewards/margins": 14.53178596496582, "rewards/rejected": -14.315322875976562, "step": 2180 }, { "epoch": 0.5468847546510176, "grad_norm": 0.038330078125, "learning_rate": 2.527247523366232e-06, "logits/chosen": -2.2029881477355957, "logits/rejected": -2.4012579917907715, "logps/chosen": -1.3100454807281494, "logps/rejected": -1426.16357421875, "loss": 0.2252, "rewards/accuracies": 1.0, "rewards/chosen": 0.2108200490474701, "rewards/margins": 13.932962417602539, "rewards/rejected": -13.722142219543457, "step": 2190 }, { "epoch": 0.5493819453115245, "grad_norm": 0.02978515625, "learning_rate": 2.5054496082528336e-06, "logits/chosen": -2.263662576675415, "logits/rejected": -2.4767444133758545, "logps/chosen": -0.6738319993019104, "logps/rejected": -1380.506103515625, "loss": 0.2246, "rewards/accuracies": 1.0, "rewards/chosen": 0.21471872925758362, "rewards/margins": 13.528160095214844, "rewards/rejected": -13.313441276550293, "step": 2200 }, { "epoch": 0.5518791359720314, "grad_norm": 0.03955078125, "learning_rate": 2.483651278821481e-06, "logits/chosen": -2.2110023498535156, "logits/rejected": -2.4015591144561768, "logps/chosen": -1.228434443473816, "logps/rejected": -1266.2230224609375, "loss": 0.2262, "rewards/accuracies": 1.0, "rewards/chosen": 0.20852570235729218, "rewards/margins": 12.340888977050781, "rewards/rejected": -12.13236141204834, "step": 2210 }, { "epoch": 0.5543763266325384, "grad_norm": 0.0238037109375, "learning_rate": 2.4618541923361166e-06, "logits/chosen": -2.3842873573303223, "logits/rejected": -2.558562994003296, "logps/chosen": -1.321533203125, "logps/rejected": -1156.223876953125, "loss": 0.2268, "rewards/accuracies": 1.0, "rewards/chosen": 0.20611576735973358, "rewards/margins": 11.165016174316406, "rewards/rejected": -10.958898544311523, "step": 2220 }, { "epoch": 0.5568735172930454, "grad_norm": 0.06005859375, "learning_rate": 2.4400600059661836e-06, "logits/chosen": -2.069483757019043, "logits/rejected": -2.31620717048645, "logps/chosen": -1.093656301498413, "logps/rejected": -1508.9503173828125, "loss": 0.2254, "rewards/accuracies": 1.0, "rewards/chosen": 0.2100195437669754, "rewards/margins": 14.735700607299805, "rewards/rejected": -14.52568244934082, "step": 2230 }, { "epoch": 0.5593707079535523, "grad_norm": 0.009765625, "learning_rate": 2.41827037666064e-06, "logits/chosen": -2.2314319610595703, "logits/rejected": -2.4116859436035156, "logps/chosen": -0.6631449460983276, "logps/rejected": -1216.8101806640625, "loss": 0.2259, "rewards/accuracies": 1.0, "rewards/chosen": 0.2116110622882843, "rewards/margins": 11.8889799118042, "rewards/rejected": -11.677370071411133, "step": 2240 }, { "epoch": 0.5618678986140592, "grad_norm": 0.035888671875, "learning_rate": 2.396486961021983e-06, "logits/chosen": -2.156050443649292, "logits/rejected": -2.355743885040283, "logps/chosen": -0.5853773951530457, "logps/rejected": -1307.397705078125, "loss": 0.2272, "rewards/accuracies": 1.0, "rewards/chosen": 0.21641604602336884, "rewards/margins": 12.776580810546875, "rewards/rejected": -12.560165405273438, "step": 2250 }, { "epoch": 0.5643650892745661, "grad_norm": 0.0247802734375, "learning_rate": 2.3747114151802993e-06, "logits/chosen": -2.2995388507843018, "logits/rejected": -2.4979677200317383, "logps/chosen": -1.0234979391098022, "logps/rejected": -1314.0380859375, "loss": 0.2272, "rewards/accuracies": 1.0, "rewards/chosen": 0.2088872194290161, "rewards/margins": 12.864030838012695, "rewards/rejected": -12.655143737792969, "step": 2260 }, { "epoch": 0.566862279935073, "grad_norm": 0.04345703125, "learning_rate": 2.352945394667363e-06, "logits/chosen": -2.087890386581421, "logits/rejected": -2.308422803878784, "logps/chosen": -0.9035698771476746, "logps/rejected": -1510.1090087890625, "loss": 0.2248, "rewards/accuracies": 1.0, "rewards/chosen": 0.2111314833164215, "rewards/margins": 14.675390243530273, "rewards/rejected": -14.464259147644043, "step": 2270 }, { "epoch": 0.56935947059558, "grad_norm": 0.126953125, "learning_rate": 2.3311905542907627e-06, "logits/chosen": -2.234039545059204, "logits/rejected": -2.428889751434326, "logps/chosen": -0.797686755657196, "logps/rejected": -1220.7269287109375, "loss": 0.2268, "rewards/accuracies": 1.0, "rewards/chosen": 0.2120717316865921, "rewards/margins": 11.918030738830566, "rewards/rejected": -11.70595932006836, "step": 2280 }, { "epoch": 0.5718566612560869, "grad_norm": 0.06396484375, "learning_rate": 2.30944854800809e-06, "logits/chosen": -2.1873550415039062, "logits/rejected": -2.3636820316314697, "logps/chosen": -0.8641906976699829, "logps/rejected": -1375.240478515625, "loss": 0.2275, "rewards/accuracies": 1.0, "rewards/chosen": 0.21045894920825958, "rewards/margins": 13.458274841308594, "rewards/rejected": -13.247815132141113, "step": 2290 }, { "epoch": 0.5743538519165938, "grad_norm": 0.0224609375, "learning_rate": 2.287721028801204e-06, "logits/chosen": -2.147500991821289, "logits/rejected": -2.3285794258117676, "logps/chosen": -1.5540382862091064, "logps/rejected": -1261.9169921875, "loss": 0.2262, "rewards/accuracies": 1.0, "rewards/chosen": 0.2090359479188919, "rewards/margins": 12.258954048156738, "rewards/rejected": -12.049917221069336, "step": 2300 }, { "epoch": 0.5768510425771007, "grad_norm": 0.224609375, "learning_rate": 2.26600964855055e-06, "logits/chosen": -2.2112767696380615, "logits/rejected": -2.387683868408203, "logps/chosen": -1.0878078937530518, "logps/rejected": -1259.334716796875, "loss": 0.2262, "rewards/accuracies": 1.0, "rewards/chosen": 0.2047530710697174, "rewards/margins": 12.326273918151855, "rewards/rejected": -12.121520042419434, "step": 2310 }, { "epoch": 0.5793482332376076, "grad_norm": 0.033203125, "learning_rate": 2.244316057909573e-06, "logits/chosen": -2.179072856903076, "logits/rejected": -2.3518600463867188, "logps/chosen": -0.5903832912445068, "logps/rejected": -1252.9005126953125, "loss": 0.2288, "rewards/accuracies": 1.0, "rewards/chosen": 0.20970389246940613, "rewards/margins": 12.249414443969727, "rewards/rejected": -12.039710998535156, "step": 2320 }, { "epoch": 0.5818454238981147, "grad_norm": 0.043212890625, "learning_rate": 2.2226419061792282e-06, "logits/chosen": -2.2571616172790527, "logits/rejected": -2.4548702239990234, "logps/chosen": -0.747587263584137, "logps/rejected": -1403.0311279296875, "loss": 0.2256, "rewards/accuracies": 1.0, "rewards/chosen": 0.20638947188854218, "rewards/margins": 13.71589183807373, "rewards/rejected": -13.509503364562988, "step": 2330 }, { "epoch": 0.5843426145586216, "grad_norm": 0.0079345703125, "learning_rate": 2.200988841182589e-06, "logits/chosen": -2.1915557384490967, "logits/rejected": -2.3925371170043945, "logps/chosen": -0.653125524520874, "logps/rejected": -1481.6878662109375, "loss": 0.2271, "rewards/accuracies": 1.0, "rewards/chosen": 0.20729561150074005, "rewards/margins": 14.506765365600586, "rewards/rejected": -14.299470901489258, "step": 2340 }, { "epoch": 0.5868398052191285, "grad_norm": 0.0286865234375, "learning_rate": 2.179358509139559e-06, "logits/chosen": -2.149214267730713, "logits/rejected": -2.344883680343628, "logps/chosen": -2.6051526069641113, "logps/rejected": -1142.56201171875, "loss": 0.2253, "rewards/accuracies": 1.0, "rewards/chosen": 0.21485964953899384, "rewards/margins": 11.022435188293457, "rewards/rejected": -10.807573318481445, "step": 2350 }, { "epoch": 0.5893369958796354, "grad_norm": 0.041748046875, "learning_rate": 2.1577525545417254e-06, "logits/chosen": -2.1596992015838623, "logits/rejected": -2.3585286140441895, "logps/chosen": -0.6524207592010498, "logps/rejected": -1219.5198974609375, "loss": 0.227, "rewards/accuracies": 1.0, "rewards/chosen": 0.2145775854587555, "rewards/margins": 11.869647026062012, "rewards/rejected": -11.655069351196289, "step": 2360 }, { "epoch": 0.5918341865401423, "grad_norm": 0.203125, "learning_rate": 2.1361726200273293e-06, "logits/chosen": -2.247102737426758, "logits/rejected": -2.4553802013397217, "logps/chosen": -1.189576506614685, "logps/rejected": -1349.142578125, "loss": 0.2259, "rewards/accuracies": 1.0, "rewards/chosen": 0.21253642439842224, "rewards/margins": 13.139638900756836, "rewards/rejected": -12.927103042602539, "step": 2370 }, { "epoch": 0.5943313772006493, "grad_norm": 0.035888671875, "learning_rate": 2.1146203462563773e-06, "logits/chosen": -2.302658796310425, "logits/rejected": -2.4925646781921387, "logps/chosen": -0.5675852298736572, "logps/rejected": -1279.3642578125, "loss": 0.2272, "rewards/accuracies": 1.0, "rewards/chosen": 0.20174381136894226, "rewards/margins": 12.505022048950195, "rewards/rejected": -12.303278923034668, "step": 2380 }, { "epoch": 0.5968285678611562, "grad_norm": 0.024169921875, "learning_rate": 2.0930973717859117e-06, "logits/chosen": -2.3194613456726074, "logits/rejected": -2.526947498321533, "logps/chosen": -0.6186977624893188, "logps/rejected": -1298.871826171875, "loss": 0.226, "rewards/accuracies": 1.0, "rewards/chosen": 0.21502625942230225, "rewards/margins": 12.666671752929688, "rewards/rejected": -12.45164680480957, "step": 2390 }, { "epoch": 0.5993257585216631, "grad_norm": 0.0150146484375, "learning_rate": 2.0716053329454337e-06, "logits/chosen": -2.0586659908294678, "logits/rejected": -2.262817621231079, "logps/chosen": -1.2787067890167236, "logps/rejected": -1338.0716552734375, "loss": 0.2271, "rewards/accuracies": 1.0, "rewards/chosen": 0.21538551151752472, "rewards/margins": 12.998420715332031, "rewards/rejected": -12.783034324645996, "step": 2400 }, { "epoch": 0.60182294918217, "grad_norm": 0.03369140625, "learning_rate": 2.0501458637124963e-06, "logits/chosen": -2.1946122646331787, "logits/rejected": -2.4308152198791504, "logps/chosen": -0.9974037408828735, "logps/rejected": -1574.956787109375, "loss": 0.2249, "rewards/accuracies": 1.0, "rewards/chosen": 0.21448758244514465, "rewards/margins": 15.443408012390137, "rewards/rejected": -15.228919982910156, "step": 2410 }, { "epoch": 0.604320139842677, "grad_norm": 0.021484375, "learning_rate": 2.0287205955884812e-06, "logits/chosen": -2.1859405040740967, "logits/rejected": -2.419334888458252, "logps/chosen": -1.4137351512908936, "logps/rejected": -1243.725830078125, "loss": 0.2266, "rewards/accuracies": 1.0, "rewards/chosen": 0.21700558066368103, "rewards/margins": 11.884050369262695, "rewards/rejected": -11.667045593261719, "step": 2420 }, { "epoch": 0.6068173305031839, "grad_norm": 0.0595703125, "learning_rate": 2.0073311574745583e-06, "logits/chosen": -2.162872791290283, "logits/rejected": -2.378561019897461, "logps/chosen": -0.7903895378112793, "logps/rejected": -1427.887939453125, "loss": 0.2243, "rewards/accuracies": 1.0, "rewards/chosen": 0.21638064086437225, "rewards/margins": 13.9492769241333, "rewards/rejected": -13.73289680480957, "step": 2430 }, { "epoch": 0.6093145211636909, "grad_norm": 0.037841796875, "learning_rate": 1.9859791755478453e-06, "logits/chosen": -2.1776349544525146, "logits/rejected": -2.3626227378845215, "logps/chosen": -1.0283732414245605, "logps/rejected": -1148.4774169921875, "loss": 0.2259, "rewards/accuracies": 1.0, "rewards/chosen": 0.21198368072509766, "rewards/margins": 11.212942123413086, "rewards/rejected": -11.000959396362305, "step": 2440 }, { "epoch": 0.6118117118241978, "grad_norm": 0.0311279296875, "learning_rate": 1.9646662731377737e-06, "logits/chosen": -2.130434989929199, "logits/rejected": -2.3274593353271484, "logps/chosen": -0.7933204770088196, "logps/rejected": -1231.31201171875, "loss": 0.2282, "rewards/accuracies": 1.0, "rewards/chosen": 0.20909221470355988, "rewards/margins": 11.990662574768066, "rewards/rejected": -11.781569480895996, "step": 2450 }, { "epoch": 0.6143089024847047, "grad_norm": 0.06689453125, "learning_rate": 1.9433940706026743e-06, "logits/chosen": -2.162235736846924, "logits/rejected": -2.3636813163757324, "logps/chosen": -0.8596396446228027, "logps/rejected": -1512.643798828125, "loss": 0.2267, "rewards/accuracies": 1.0, "rewards/chosen": 0.21136781573295593, "rewards/margins": 14.79273796081543, "rewards/rejected": -14.581372261047363, "step": 2460 }, { "epoch": 0.6168060931452116, "grad_norm": 0.040283203125, "learning_rate": 1.9221641852065807e-06, "logits/chosen": -2.153958797454834, "logits/rejected": -2.322754383087158, "logps/chosen": -0.7868290543556213, "logps/rejected": -1277.087890625, "loss": 0.2275, "rewards/accuracies": 1.0, "rewards/chosen": 0.21162231266498566, "rewards/margins": 12.481648445129395, "rewards/rejected": -12.270025253295898, "step": 2470 }, { "epoch": 0.6193032838057185, "grad_norm": 0.0302734375, "learning_rate": 1.9009782309962805e-06, "logits/chosen": -2.2541210651397705, "logits/rejected": -2.451572895050049, "logps/chosen": -0.9773980379104614, "logps/rejected": -1259.029296875, "loss": 0.2243, "rewards/accuracies": 1.0, "rewards/chosen": 0.21588608622550964, "rewards/margins": 12.217048645019531, "rewards/rejected": -12.001164436340332, "step": 2480 }, { "epoch": 0.6218004744662256, "grad_norm": 0.042236328125, "learning_rate": 1.8798378186785979e-06, "logits/chosen": -2.208289623260498, "logits/rejected": -2.3975791931152344, "logps/chosen": -0.47841542959213257, "logps/rejected": -1317.6165771484375, "loss": 0.2268, "rewards/accuracies": 1.0, "rewards/chosen": 0.21394380927085876, "rewards/margins": 12.884051322937012, "rewards/rejected": -12.670106887817383, "step": 2490 }, { "epoch": 0.6242976651267325, "grad_norm": 0.0172119140625, "learning_rate": 1.8587445554979404e-06, "logits/chosen": -2.054529905319214, "logits/rejected": -2.2491745948791504, "logps/chosen": -0.9916723370552063, "logps/rejected": -1467.903076171875, "loss": 0.2261, "rewards/accuracies": 1.0, "rewards/chosen": 0.21281161904335022, "rewards/margins": 14.350473403930664, "rewards/rejected": -14.137661933898926, "step": 2500 }, { "epoch": 0.6267948557872394, "grad_norm": 0.04052734375, "learning_rate": 1.8377000451141013e-06, "logits/chosen": -2.1033387184143066, "logits/rejected": -2.311828136444092, "logps/chosen": -1.013270616531372, "logps/rejected": -1430.32568359375, "loss": 0.2256, "rewards/accuracies": 1.0, "rewards/chosen": 0.2092626839876175, "rewards/margins": 13.935432434082031, "rewards/rejected": -13.726168632507324, "step": 2510 }, { "epoch": 0.6292920464477463, "grad_norm": 0.033203125, "learning_rate": 1.8167058874803405e-06, "logits/chosen": -2.2198266983032227, "logits/rejected": -2.435263156890869, "logps/chosen": -1.5374799966812134, "logps/rejected": -1410.561279296875, "loss": 0.2258, "rewards/accuracies": 1.0, "rewards/chosen": 0.21642649173736572, "rewards/margins": 13.645418167114258, "rewards/rejected": -13.428991317749023, "step": 2520 }, { "epoch": 0.6317892371082532, "grad_norm": 0.025390625, "learning_rate": 1.7957636787217451e-06, "logits/chosen": -2.1474337577819824, "logits/rejected": -2.3489108085632324, "logps/chosen": -0.525337278842926, "logps/rejected": -1465.3909912109375, "loss": 0.2257, "rewards/accuracies": 1.0, "rewards/chosen": 0.21127943694591522, "rewards/margins": 14.354647636413574, "rewards/rejected": -14.1433687210083, "step": 2530 }, { "epoch": 0.6342864277687601, "grad_norm": 0.045166015625, "learning_rate": 1.7748750110138768e-06, "logits/chosen": -2.1010701656341553, "logits/rejected": -2.3061635494232178, "logps/chosen": -1.495689034461975, "logps/rejected": -1522.001953125, "loss": 0.2248, "rewards/accuracies": 1.0, "rewards/chosen": 0.21690087020397186, "rewards/margins": 14.7809476852417, "rewards/rejected": -14.564045906066895, "step": 2540 }, { "epoch": 0.6367836184292671, "grad_norm": 0.0771484375, "learning_rate": 1.7540414724617282e-06, "logits/chosen": -2.0483648777008057, "logits/rejected": -2.2502453327178955, "logps/chosen": -1.7171008586883545, "logps/rejected": -1322.4296875, "loss": 0.2253, "rewards/accuracies": 1.0, "rewards/chosen": 0.21715514361858368, "rewards/margins": 12.821990966796875, "rewards/rejected": -12.604835510253906, "step": 2550 }, { "epoch": 0.639280809089774, "grad_norm": 0.0155029296875, "learning_rate": 1.7332646469789827e-06, "logits/chosen": -2.2271251678466797, "logits/rejected": -2.4021248817443848, "logps/chosen": -0.7044438719749451, "logps/rejected": -1151.026611328125, "loss": 0.2256, "rewards/accuracies": 1.0, "rewards/chosen": 0.21061739325523376, "rewards/margins": 11.253252983093262, "rewards/rejected": -11.042635917663574, "step": 2560 }, { "epoch": 0.6417779997502809, "grad_norm": 0.031494140625, "learning_rate": 1.7125461141675881e-06, "logits/chosen": -2.115159034729004, "logits/rejected": -2.321096181869507, "logps/chosen": -1.5179011821746826, "logps/rejected": -1341.9727783203125, "loss": 0.2269, "rewards/accuracies": 1.0, "rewards/chosen": 0.21012921631336212, "rewards/margins": 13.043965339660645, "rewards/rejected": -12.833836555480957, "step": 2570 }, { "epoch": 0.6442751904107878, "grad_norm": 0.07177734375, "learning_rate": 1.6918874491976744e-06, "logits/chosen": -2.262359619140625, "logits/rejected": -2.4549667835235596, "logps/chosen": -1.1417173147201538, "logps/rejected": -1349.908203125, "loss": 0.2249, "rewards/accuracies": 1.0, "rewards/chosen": 0.21297034621238708, "rewards/margins": 13.129191398620605, "rewards/rejected": -12.916219711303711, "step": 2580 }, { "epoch": 0.6467723810712948, "grad_norm": 0.019287109375, "learning_rate": 1.6712902226877917e-06, "logits/chosen": -2.1325788497924805, "logits/rejected": -2.323542356491089, "logps/chosen": -1.002483606338501, "logps/rejected": -1407.6922607421875, "loss": 0.2264, "rewards/accuracies": 1.0, "rewards/chosen": 0.21138958632946014, "rewards/margins": 13.752557754516602, "rewards/rejected": -13.541168212890625, "step": 2590 }, { "epoch": 0.6492695717318018, "grad_norm": 0.046875, "learning_rate": 1.6507560005854977e-06, "logits/chosen": -2.0466830730438232, "logits/rejected": -2.254211664199829, "logps/chosen": -1.2699908018112183, "logps/rejected": -1284.965576171875, "loss": 0.2255, "rewards/accuracies": 1.0, "rewards/chosen": 0.22062024474143982, "rewards/margins": 12.382702827453613, "rewards/rejected": -12.16208267211914, "step": 2600 }, { "epoch": 0.6517667623923087, "grad_norm": 0.0283203125, "learning_rate": 1.6302863440483121e-06, "logits/chosen": -2.102281093597412, "logits/rejected": -2.344468832015991, "logps/chosen": -0.9672495722770691, "logps/rejected": -1371.63232421875, "loss": 0.2237, "rewards/accuracies": 1.0, "rewards/chosen": 0.2286236733198166, "rewards/margins": 13.317922592163086, "rewards/rejected": -13.089300155639648, "step": 2610 }, { "epoch": 0.6542639530528156, "grad_norm": 0.10302734375, "learning_rate": 1.6098828093250203e-06, "logits/chosen": -2.012927770614624, "logits/rejected": -2.23055100440979, "logps/chosen": -2.223574161529541, "logps/rejected": -1439.184326171875, "loss": 0.2261, "rewards/accuracies": 1.0, "rewards/chosen": 0.2114640474319458, "rewards/margins": 13.80018424987793, "rewards/rejected": -13.588720321655273, "step": 2620 }, { "epoch": 0.6567611437133225, "grad_norm": 0.0201416015625, "learning_rate": 1.5895469476373545e-06, "logits/chosen": -2.0998306274414062, "logits/rejected": -2.284853935241699, "logps/chosen": -1.0365889072418213, "logps/rejected": -1287.3863525390625, "loss": 0.2258, "rewards/accuracies": 1.0, "rewards/chosen": 0.21419724822044373, "rewards/margins": 12.467586517333984, "rewards/rejected": -12.253389358520508, "step": 2630 }, { "epoch": 0.6592583343738294, "grad_norm": 0.048583984375, "learning_rate": 1.5692803050620642e-06, "logits/chosen": -2.1266770362854004, "logits/rejected": -2.341489553451538, "logps/chosen": -1.9875209331512451, "logps/rejected": -1219.6407470703125, "loss": 0.2258, "rewards/accuracies": 1.0, "rewards/chosen": 0.21611304581165314, "rewards/margins": 11.721773147583008, "rewards/rejected": -11.505661010742188, "step": 2640 }, { "epoch": 0.6617555250343363, "grad_norm": 0.045166015625, "learning_rate": 1.5490844224133717e-06, "logits/chosen": -2.178802251815796, "logits/rejected": -2.3850629329681396, "logps/chosen": -1.1978418827056885, "logps/rejected": -1456.7591552734375, "loss": 0.2253, "rewards/accuracies": 1.0, "rewards/chosen": 0.2064187228679657, "rewards/margins": 14.242889404296875, "rewards/rejected": -14.036470413208008, "step": 2650 }, { "epoch": 0.6642527156948433, "grad_norm": 0.02978515625, "learning_rate": 1.528960835125822e-06, "logits/chosen": -2.3235323429107666, "logits/rejected": -2.508779525756836, "logps/chosen": -0.7140904664993286, "logps/rejected": -1262.5396728515625, "loss": 0.2256, "rewards/accuracies": 1.0, "rewards/chosen": 0.2111320048570633, "rewards/margins": 12.3548002243042, "rewards/rejected": -12.143668174743652, "step": 2660 }, { "epoch": 0.6667499063553503, "grad_norm": 0.04541015625, "learning_rate": 1.5089110731375568e-06, "logits/chosen": -2.1535146236419678, "logits/rejected": -2.346010446548462, "logps/chosen": -1.2154910564422607, "logps/rejected": -1353.01416015625, "loss": 0.226, "rewards/accuracies": 1.0, "rewards/chosen": 0.21193823218345642, "rewards/margins": 13.174649238586426, "rewards/rejected": -12.962712287902832, "step": 2670 }, { "epoch": 0.6692470970158572, "grad_norm": 0.0224609375, "learning_rate": 1.4889366607739925e-06, "logits/chosen": -2.2847390174865723, "logits/rejected": -2.437983989715576, "logps/chosen": -0.47022026777267456, "logps/rejected": -1079.610595703125, "loss": 0.2268, "rewards/accuracies": 1.0, "rewards/chosen": 0.20733828842639923, "rewards/margins": 10.533978462219238, "rewards/rejected": -10.326639175415039, "step": 2680 }, { "epoch": 0.6717442876763641, "grad_norm": 0.0517578125, "learning_rate": 1.4690391166319307e-06, "logits/chosen": -2.091798782348633, "logits/rejected": -2.286367177963257, "logps/chosen": -0.8848400115966797, "logps/rejected": -1370.623046875, "loss": 0.2255, "rewards/accuracies": 1.0, "rewards/chosen": 0.21229073405265808, "rewards/margins": 13.300837516784668, "rewards/rejected": -13.088546752929688, "step": 2690 }, { "epoch": 0.674241478336871, "grad_norm": 0.058837890625, "learning_rate": 1.4492199534641055e-06, "logits/chosen": -2.1903884410858154, "logits/rejected": -2.389869451522827, "logps/chosen": -0.7620021104812622, "logps/rejected": -1357.733642578125, "loss": 0.2269, "rewards/accuracies": 1.0, "rewards/chosen": 0.2085207998752594, "rewards/margins": 13.300318717956543, "rewards/rejected": -13.091796875, "step": 2700 }, { "epoch": 0.676738668997378, "grad_norm": 0.050048828125, "learning_rate": 1.429480678064174e-06, "logits/chosen": -2.1907572746276855, "logits/rejected": -2.4412574768066406, "logps/chosen": -1.4903779029846191, "logps/rejected": -1532.8353271484375, "loss": 0.2235, "rewards/accuracies": 1.0, "rewards/chosen": 0.22088858485221863, "rewards/margins": 14.948209762573242, "rewards/rejected": -14.727320671081543, "step": 2710 }, { "epoch": 0.6792358596578849, "grad_norm": 0.04833984375, "learning_rate": 1.4098227911521523e-06, "logits/chosen": -2.1927340030670166, "logits/rejected": -2.384458065032959, "logps/chosen": -1.0519030094146729, "logps/rejected": -1408.5384521484375, "loss": 0.2247, "rewards/accuracies": 1.0, "rewards/chosen": 0.21749384701251984, "rewards/margins": 13.769442558288574, "rewards/rejected": -13.551948547363281, "step": 2720 }, { "epoch": 0.6817330503183918, "grad_norm": 0.0206298828125, "learning_rate": 1.3902477872603295e-06, "logits/chosen": -2.292635440826416, "logits/rejected": -2.4606173038482666, "logps/chosen": -1.3724099397659302, "logps/rejected": -1059.629638671875, "loss": 0.2292, "rewards/accuracies": 1.0, "rewards/chosen": 0.2094014585018158, "rewards/margins": 10.156023025512695, "rewards/rejected": -9.946621894836426, "step": 2730 }, { "epoch": 0.6842302409788987, "grad_norm": 0.06494140625, "learning_rate": 1.370757154619638e-06, "logits/chosen": -2.2135720252990723, "logits/rejected": -2.4035539627075195, "logps/chosen": -0.8492560386657715, "logps/rejected": -1440.1517333984375, "loss": 0.2249, "rewards/accuracies": 1.0, "rewards/chosen": 0.21952596306800842, "rewards/margins": 13.991949081420898, "rewards/rejected": -13.772422790527344, "step": 2740 }, { "epoch": 0.6867274316394056, "grad_norm": 0.0546875, "learning_rate": 1.3513523750465049e-06, "logits/chosen": -2.2055509090423584, "logits/rejected": -2.3952600955963135, "logps/chosen": -0.848610520362854, "logps/rejected": -1253.37841796875, "loss": 0.2269, "rewards/accuracies": 1.0, "rewards/chosen": 0.2106543481349945, "rewards/margins": 12.153672218322754, "rewards/rejected": -11.94301700592041, "step": 2750 }, { "epoch": 0.6892246222999125, "grad_norm": 0.0078125, "learning_rate": 1.332034923830199e-06, "logits/chosen": -2.1199612617492676, "logits/rejected": -2.3331620693206787, "logps/chosen": -0.572918176651001, "logps/rejected": -1314.5574951171875, "loss": 0.2245, "rewards/accuracies": 1.0, "rewards/chosen": 0.21192285418510437, "rewards/margins": 12.847297668457031, "rewards/rejected": -12.635372161865234, "step": 2760 }, { "epoch": 0.6917218129604196, "grad_norm": 0.05322265625, "learning_rate": 1.31280626962067e-06, "logits/chosen": -2.242522716522217, "logits/rejected": -2.4255213737487793, "logps/chosen": -0.6031197905540466, "logps/rejected": -1176.1724853515625, "loss": 0.2251, "rewards/accuracies": 1.0, "rewards/chosen": 0.21441105008125305, "rewards/margins": 11.398508071899414, "rewards/rejected": -11.18409538269043, "step": 2770 }, { "epoch": 0.6942190036209265, "grad_norm": 0.03759765625, "learning_rate": 1.2936678743168813e-06, "logits/chosen": -2.1787726879119873, "logits/rejected": -2.379664659500122, "logps/chosen": -0.6903184652328491, "logps/rejected": -1316.2584228515625, "loss": 0.228, "rewards/accuracies": 1.0, "rewards/chosen": 0.2146139144897461, "rewards/margins": 12.865110397338867, "rewards/rejected": -12.650495529174805, "step": 2780 }, { "epoch": 0.6967161942814334, "grad_norm": 0.033935546875, "learning_rate": 1.2746211929556777e-06, "logits/chosen": -2.1566481590270996, "logits/rejected": -2.4140141010284424, "logps/chosen": -0.8048852682113647, "logps/rejected": -1725.7884521484375, "loss": 0.2244, "rewards/accuracies": 1.0, "rewards/chosen": 0.21056988835334778, "rewards/margins": 16.911666870117188, "rewards/rejected": -16.70109748840332, "step": 2790 }, { "epoch": 0.6992133849419403, "grad_norm": 0.1259765625, "learning_rate": 1.2556676736011558e-06, "logits/chosen": -2.1705546379089355, "logits/rejected": -2.36136531829834, "logps/chosen": -1.7305570840835571, "logps/rejected": -1468.9334716796875, "loss": 0.2251, "rewards/accuracies": 1.0, "rewards/chosen": 0.21282191574573517, "rewards/margins": 14.295863151550293, "rewards/rejected": -14.083041191101074, "step": 2800 }, { "epoch": 0.7017105756024472, "grad_norm": 0.0264892578125, "learning_rate": 1.2368087572345772e-06, "logits/chosen": -2.2008700370788574, "logits/rejected": -2.3622145652770996, "logps/chosen": -0.9749493598937988, "logps/rejected": -1153.006103515625, "loss": 0.2278, "rewards/accuracies": 1.0, "rewards/chosen": 0.20674769580364227, "rewards/margins": 11.259106636047363, "rewards/rejected": -11.052358627319336, "step": 2810 }, { "epoch": 0.7042077662629542, "grad_norm": 0.07275390625, "learning_rate": 1.2180458776448067e-06, "logits/chosen": -2.183065891265869, "logits/rejected": -2.4031364917755127, "logps/chosen": -1.3278162479400635, "logps/rejected": -1352.5931396484375, "loss": 0.2276, "rewards/accuracies": 1.0, "rewards/chosen": 0.21398906409740448, "rewards/margins": 13.097040176391602, "rewards/rejected": -12.883050918579102, "step": 2820 }, { "epoch": 0.7067049569234611, "grad_norm": 0.04150390625, "learning_rate": 1.1993804613193158e-06, "logits/chosen": -2.166015625, "logits/rejected": -2.376171827316284, "logps/chosen": -0.8504392504692078, "logps/rejected": -1218.2906494140625, "loss": 0.2242, "rewards/accuracies": 1.0, "rewards/chosen": 0.2224057912826538, "rewards/margins": 11.764514923095703, "rewards/rejected": -11.542108535766602, "step": 2830 }, { "epoch": 0.709202147583968, "grad_norm": 0.0791015625, "learning_rate": 1.1808139273357232e-06, "logits/chosen": -2.1249091625213623, "logits/rejected": -2.324924945831299, "logps/chosen": -1.2602803707122803, "logps/rejected": -1440.6927490234375, "loss": 0.2271, "rewards/accuracies": 1.0, "rewards/chosen": 0.21478672325611115, "rewards/margins": 13.95526123046875, "rewards/rejected": -13.740473747253418, "step": 2840 }, { "epoch": 0.711699338244475, "grad_norm": 0.0186767578125, "learning_rate": 1.1623476872539108e-06, "logits/chosen": -2.1342732906341553, "logits/rejected": -2.3520121574401855, "logps/chosen": -1.0939338207244873, "logps/rejected": -1569.9661865234375, "loss": 0.2256, "rewards/accuracies": 1.0, "rewards/chosen": 0.209732323884964, "rewards/margins": 15.359413146972656, "rewards/rejected": -15.149681091308594, "step": 2850 }, { "epoch": 0.7141965289049819, "grad_norm": 0.0206298828125, "learning_rate": 1.1439831450087032e-06, "logits/chosen": -2.1833555698394775, "logits/rejected": -2.408240795135498, "logps/chosen": -1.4031983613967896, "logps/rejected": -1495.2554931640625, "loss": 0.2271, "rewards/accuracies": 1.0, "rewards/chosen": 0.20904704928398132, "rewards/margins": 14.592279434204102, "rewards/rejected": -14.383232116699219, "step": 2860 }, { "epoch": 0.7166937195654888, "grad_norm": 0.048828125, "learning_rate": 1.1257216968031357e-06, "logits/chosen": -2.1499791145324707, "logits/rejected": -2.3467013835906982, "logps/chosen": -0.6740778684616089, "logps/rejected": -1315.198486328125, "loss": 0.2264, "rewards/accuracies": 1.0, "rewards/chosen": 0.2084966003894806, "rewards/margins": 12.877813339233398, "rewards/rejected": -12.669316291809082, "step": 2870 }, { "epoch": 0.7191909102259958, "grad_norm": 0.099609375, "learning_rate": 1.1075647310022974e-06, "logits/chosen": -2.293015956878662, "logits/rejected": -2.477437973022461, "logps/chosen": -0.6577932238578796, "logps/rejected": -1144.9639892578125, "loss": 0.2249, "rewards/accuracies": 1.0, "rewards/chosen": 0.21200187504291534, "rewards/margins": 11.185698509216309, "rewards/rejected": -10.973695755004883, "step": 2880 }, { "epoch": 0.7216881008865027, "grad_norm": 0.0341796875, "learning_rate": 1.0895136280277863e-06, "logits/chosen": -2.1305599212646484, "logits/rejected": -2.3395919799804688, "logps/chosen": -0.9710084795951843, "logps/rejected": -1521.902099609375, "loss": 0.2256, "rewards/accuracies": 1.0, "rewards/chosen": 0.21521084010601044, "rewards/margins": 14.770858764648438, "rewards/rejected": -14.555648803710938, "step": 2890 }, { "epoch": 0.7241852915470096, "grad_norm": 0.02685546875, "learning_rate": 1.0715697602527542e-06, "logits/chosen": -1.9920504093170166, "logits/rejected": -2.2198596000671387, "logps/chosen": -0.49225324392318726, "logps/rejected": -1440.2822265625, "loss": 0.2255, "rewards/accuracies": 1.0, "rewards/chosen": 0.21176087856292725, "rewards/margins": 13.888765335083008, "rewards/rejected": -13.67700481414795, "step": 2900 }, { "epoch": 0.7266824822075165, "grad_norm": 0.05322265625, "learning_rate": 1.0537344918975708e-06, "logits/chosen": -2.1923391819000244, "logits/rejected": -2.3587305545806885, "logps/chosen": -2.3005270957946777, "logps/rejected": -1118.677490234375, "loss": 0.2245, "rewards/accuracies": 1.0, "rewards/chosen": 0.22067633271217346, "rewards/margins": 10.68152141571045, "rewards/rejected": -10.460844993591309, "step": 2910 }, { "epoch": 0.7291796728680234, "grad_norm": 0.0277099609375, "learning_rate": 1.036009178926107e-06, "logits/chosen": -2.162017822265625, "logits/rejected": -2.350229263305664, "logps/chosen": -0.4403456151485443, "logps/rejected": -1365.908203125, "loss": 0.2254, "rewards/accuracies": 1.0, "rewards/chosen": 0.21478745341300964, "rewards/margins": 13.359176635742188, "rewards/rejected": -13.144391059875488, "step": 2920 }, { "epoch": 0.7316768635285305, "grad_norm": 0.2041015625, "learning_rate": 1.0183951689426438e-06, "logits/chosen": -2.0874218940734863, "logits/rejected": -2.286980152130127, "logps/chosen": -1.1334517002105713, "logps/rejected": -1574.8843994140625, "loss": 0.2267, "rewards/accuracies": 1.0, "rewards/chosen": 0.20781561732292175, "rewards/margins": 15.404953002929688, "rewards/rejected": -15.197137832641602, "step": 2930 }, { "epoch": 0.7341740541890374, "grad_norm": 0.033203125, "learning_rate": 1.0008938010894156e-06, "logits/chosen": -2.05769419670105, "logits/rejected": -2.291485548019409, "logps/chosen": -0.6213763356208801, "logps/rejected": -1545.57421875, "loss": 0.2253, "rewards/accuracies": 1.0, "rewards/chosen": 0.21286337077617645, "rewards/margins": 15.127001762390137, "rewards/rejected": -14.914140701293945, "step": 2940 }, { "epoch": 0.7366712448495443, "grad_norm": 0.06005859375, "learning_rate": 9.83506405944804e-07, "logits/chosen": -2.0132200717926025, "logits/rejected": -2.2228617668151855, "logps/chosen": -1.0132176876068115, "logps/rejected": -1225.736572265625, "loss": 0.2248, "rewards/accuracies": 1.0, "rewards/chosen": 0.21595139801502228, "rewards/margins": 11.760801315307617, "rewards/rejected": -11.544851303100586, "step": 2950 }, { "epoch": 0.7391684355100512, "grad_norm": 0.02099609375, "learning_rate": 9.662343054221743e-07, "logits/chosen": -2.038722515106201, "logits/rejected": -2.254706621170044, "logps/chosen": -1.0080900192260742, "logps/rejected": -1486.7254638671875, "loss": 0.2248, "rewards/accuracies": 1.0, "rewards/chosen": 0.21779987215995789, "rewards/margins": 14.319659233093262, "rewards/rejected": -14.101860046386719, "step": 2960 }, { "epoch": 0.7416656261705581, "grad_norm": 0.033447265625, "learning_rate": 9.490788126693754e-07, "logits/chosen": -2.05572247505188, "logits/rejected": -2.270496129989624, "logps/chosen": -1.580960988998413, "logps/rejected": -1349.623779296875, "loss": 0.2259, "rewards/accuracies": 1.0, "rewards/chosen": 0.21096165478229523, "rewards/margins": 13.037325859069824, "rewards/rejected": -12.826364517211914, "step": 2970 }, { "epoch": 0.744162816831065, "grad_norm": 0.034423828125, "learning_rate": 9.32041231968904e-07, "logits/chosen": -2.135493040084839, "logits/rejected": -2.3431620597839355, "logps/chosen": -0.692672848701477, "logps/rejected": -1422.2606201171875, "loss": 0.2249, "rewards/accuracies": 1.0, "rewards/chosen": 0.21278159320354462, "rewards/margins": 13.839566230773926, "rewards/rejected": -13.626785278320312, "step": 2980 }, { "epoch": 0.746660007491572, "grad_norm": 0.038330078125, "learning_rate": 9.151228586387464e-07, "logits/chosen": -2.1877083778381348, "logits/rejected": -2.3766164779663086, "logps/chosen": -0.7439475655555725, "logps/rejected": -1241.116943359375, "loss": 0.2292, "rewards/accuracies": 1.0, "rewards/chosen": 0.2107519656419754, "rewards/margins": 12.074844360351562, "rewards/rejected": -11.864092826843262, "step": 2990 }, { "epoch": 0.7491571981520789, "grad_norm": 0.051025390625, "learning_rate": 8.983249789338941e-07, "logits/chosen": -2.150568723678589, "logits/rejected": -2.329155445098877, "logps/chosen": -0.8139511346817017, "logps/rejected": -1264.535888671875, "loss": 0.2264, "rewards/accuracies": 1.0, "rewards/chosen": 0.20758719742298126, "rewards/margins": 12.365687370300293, "rewards/rejected": -12.158101081848145, "step": 3000 }, { "epoch": 0.7491571981520789, "eval_logits/chosen": -2.5715415477752686, "eval_logits/rejected": -2.65895676612854, "eval_logps/chosen": -0.12666501104831696, "eval_logps/rejected": -650.5204467773438, "eval_loss": 0.22132699191570282, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": 0.25786837935447693, "eval_rewards/margins": 6.314352512359619, "eval_rewards/rejected": -6.056484222412109, "eval_runtime": 0.6559, "eval_samples_per_second": 7.623, "eval_steps_per_second": 4.574, "step": 3000 }, { "epoch": 0.7516543888125858, "grad_norm": 0.0194091796875, "learning_rate": 8.816488699485593e-07, "logits/chosen": -2.176842212677002, "logits/rejected": -2.3571324348449707, "logps/chosen": -0.4218795895576477, "logps/rejected": -1318.904052734375, "loss": 0.2254, "rewards/accuracies": 1.0, "rewards/chosen": 0.21097974479198456, "rewards/margins": 12.884817123413086, "rewards/rejected": -12.67383861541748, "step": 3010 }, { "epoch": 0.7541515794730927, "grad_norm": 0.041015625, "learning_rate": 8.650957995190784e-07, "logits/chosen": -2.1513025760650635, "logits/rejected": -2.3777430057525635, "logps/chosen": -1.3863859176635742, "logps/rejected": -1556.2415771484375, "loss": 0.2249, "rewards/accuracies": 1.0, "rewards/chosen": 0.21383562684059143, "rewards/margins": 15.212430000305176, "rewards/rejected": -14.9985933303833, "step": 3020 }, { "epoch": 0.7566487701335997, "grad_norm": 0.029541015625, "learning_rate": 8.486670261275193e-07, "logits/chosen": -2.252506732940674, "logits/rejected": -2.452782392501831, "logps/chosen": -0.9220790863037109, "logps/rejected": -1333.130615234375, "loss": 0.226, "rewards/accuracies": 1.0, "rewards/chosen": 0.20681767165660858, "rewards/margins": 13.058314323425293, "rewards/rejected": -12.851496696472168, "step": 3030 }, { "epoch": 0.7591459607941067, "grad_norm": 0.0244140625, "learning_rate": 8.32363798806011e-07, "logits/chosen": -2.2259833812713623, "logits/rejected": -2.4163031578063965, "logps/chosen": -0.6000443696975708, "logps/rejected": -1395.551513671875, "loss": 0.2265, "rewards/accuracies": 1.0, "rewards/chosen": 0.21475176513195038, "rewards/margins": 13.636807441711426, "rewards/rejected": -13.42205810546875, "step": 3040 }, { "epoch": 0.7616431514546136, "grad_norm": 0.02490234375, "learning_rate": 8.161873570417742e-07, "logits/chosen": -2.1769793033599854, "logits/rejected": -2.3828330039978027, "logps/chosen": -0.49710139632225037, "logps/rejected": -1448.161865234375, "loss": 0.2243, "rewards/accuracies": 1.0, "rewards/chosen": 0.2149210423231125, "rewards/margins": 14.16187858581543, "rewards/rejected": -13.946958541870117, "step": 3050 }, { "epoch": 0.7641403421151205, "grad_norm": 0.04931640625, "learning_rate": 8.001389306828897e-07, "logits/chosen": -2.094914674758911, "logits/rejected": -2.325759172439575, "logps/chosen": -1.7604669332504272, "logps/rejected": -1601.197998046875, "loss": 0.2249, "rewards/accuracies": 1.0, "rewards/chosen": 0.2165949046611786, "rewards/margins": 15.435786247253418, "rewards/rejected": -15.219189643859863, "step": 3060 }, { "epoch": 0.7666375327756274, "grad_norm": 0.056884765625, "learning_rate": 7.842197398447993e-07, "logits/chosen": -2.119885206222534, "logits/rejected": -2.3199260234832764, "logps/chosen": -1.7511274814605713, "logps/rejected": -1411.588623046875, "loss": 0.2263, "rewards/accuracies": 1.0, "rewards/chosen": 0.2108723670244217, "rewards/margins": 13.716115951538086, "rewards/rejected": -13.505243301391602, "step": 3070 }, { "epoch": 0.7691347234361343, "grad_norm": 0.050048828125, "learning_rate": 7.684309948175414e-07, "logits/chosen": -2.0922672748565674, "logits/rejected": -2.2642672061920166, "logps/chosen": -0.6221259832382202, "logps/rejected": -1387.583251953125, "loss": 0.2247, "rewards/accuracies": 1.0, "rewards/chosen": 0.2143571376800537, "rewards/margins": 13.555415153503418, "rewards/rejected": -13.341056823730469, "step": 3080 }, { "epoch": 0.7716319140966412, "grad_norm": 0.0257568359375, "learning_rate": 7.527738959737371e-07, "logits/chosen": -2.1526269912719727, "logits/rejected": -2.3576555252075195, "logps/chosen": -1.0096280574798584, "logps/rejected": -1377.4000244140625, "loss": 0.2246, "rewards/accuracies": 1.0, "rewards/chosen": 0.2141624391078949, "rewards/margins": 13.408686637878418, "rewards/rejected": -13.194523811340332, "step": 3090 }, { "epoch": 0.7741291047571482, "grad_norm": 0.054443359375, "learning_rate": 7.372496336773269e-07, "logits/chosen": -2.1142802238464355, "logits/rejected": -2.297616958618164, "logps/chosen": -0.8569754362106323, "logps/rejected": -1148.1827392578125, "loss": 0.2277, "rewards/accuracies": 1.0, "rewards/chosen": 0.21184638142585754, "rewards/margins": 11.140499114990234, "rewards/rejected": -10.9286527633667, "step": 3100 }, { "epoch": 0.7766262954176552, "grad_norm": 0.0341796875, "learning_rate": 7.218593881930744e-07, "logits/chosen": -2.2074034214019775, "logits/rejected": -2.391183853149414, "logps/chosen": -0.8030778765678406, "logps/rejected": -1229.139404296875, "loss": 0.2271, "rewards/accuracies": 1.0, "rewards/chosen": 0.2128894329071045, "rewards/margins": 12.000519752502441, "rewards/rejected": -11.787630081176758, "step": 3110 }, { "epoch": 0.7791234860781621, "grad_norm": 0.040283203125, "learning_rate": 7.066043295968342e-07, "logits/chosen": -2.1711983680725098, "logits/rejected": -2.370105266571045, "logps/chosen": -1.8718315362930298, "logps/rejected": -1323.237060546875, "loss": 0.2263, "rewards/accuracies": 1.0, "rewards/chosen": 0.2125745713710785, "rewards/margins": 12.693571090698242, "rewards/rejected": -12.480997085571289, "step": 3120 }, { "epoch": 0.781620676738669, "grad_norm": 0.048828125, "learning_rate": 6.914856176865891e-07, "logits/chosen": -2.255979537963867, "logits/rejected": -2.4583592414855957, "logps/chosen": -1.3440260887145996, "logps/rejected": -1252.6439208984375, "loss": 0.2271, "rewards/accuracies": 1.0, "rewards/chosen": 0.205407053232193, "rewards/margins": 12.165246963500977, "rewards/rejected": -11.959839820861816, "step": 3130 }, { "epoch": 0.7841178673991759, "grad_norm": 0.05322265625, "learning_rate": 6.765044018942804e-07, "logits/chosen": -2.2532248497009277, "logits/rejected": -2.4564757347106934, "logps/chosen": -0.6921563744544983, "logps/rejected": -1213.1630859375, "loss": 0.2251, "rewards/accuracies": 1.0, "rewards/chosen": 0.20914144814014435, "rewards/margins": 11.834319114685059, "rewards/rejected": -11.625177383422852, "step": 3140 }, { "epoch": 0.7866150580596829, "grad_norm": 0.034423828125, "learning_rate": 6.616618211984169e-07, "logits/chosen": -2.1614041328430176, "logits/rejected": -2.3516342639923096, "logps/chosen": -0.3677124083042145, "logps/rejected": -1380.3824462890625, "loss": 0.2254, "rewards/accuracies": 1.0, "rewards/chosen": 0.2100847065448761, "rewards/margins": 13.517425537109375, "rewards/rejected": -13.307340621948242, "step": 3150 }, { "epoch": 0.7891122487201898, "grad_norm": 0.046630859375, "learning_rate": 6.469590040374799e-07, "logits/chosen": -2.108102560043335, "logits/rejected": -2.3011136054992676, "logps/chosen": -0.5627329349517822, "logps/rejected": -1450.8382568359375, "loss": 0.2246, "rewards/accuracies": 1.0, "rewards/chosen": 0.21635802090168, "rewards/margins": 14.105690002441406, "rewards/rejected": -13.88933277130127, "step": 3160 }, { "epoch": 0.7916094393806967, "grad_norm": 0.038818359375, "learning_rate": 6.32397068224136e-07, "logits/chosen": -2.2220847606658936, "logits/rejected": -2.4407782554626465, "logps/chosen": -0.8599641919136047, "logps/rejected": -1339.21826171875, "loss": 0.2275, "rewards/accuracies": 1.0, "rewards/chosen": 0.21565623581409454, "rewards/margins": 13.00416088104248, "rewards/rejected": -12.788503646850586, "step": 3170 }, { "epoch": 0.7941066300412036, "grad_norm": 0.41015625, "learning_rate": 6.17977120860249e-07, "logits/chosen": -2.208421230316162, "logits/rejected": -2.4098763465881348, "logps/chosen": -1.8245439529418945, "logps/rejected": -1350.473876953125, "loss": 0.2257, "rewards/accuracies": 1.0, "rewards/chosen": 0.21073952317237854, "rewards/margins": 13.214599609375, "rewards/rejected": -13.003860473632812, "step": 3180 }, { "epoch": 0.7966038207017105, "grad_norm": 0.03857421875, "learning_rate": 6.037002582527121e-07, "logits/chosen": -2.1419100761413574, "logits/rejected": -2.3298497200012207, "logps/chosen": -0.7109914422035217, "logps/rejected": -1332.651123046875, "loss": 0.2261, "rewards/accuracies": 1.0, "rewards/chosen": 0.21053273975849152, "rewards/margins": 12.884849548339844, "rewards/rejected": -12.67431640625, "step": 3190 }, { "epoch": 0.7991010113622175, "grad_norm": 0.0263671875, "learning_rate": 5.895675658300981e-07, "logits/chosen": -2.310133934020996, "logits/rejected": -2.4916586875915527, "logps/chosen": -0.809489905834198, "logps/rejected": -1163.7008056640625, "loss": 0.2252, "rewards/accuracies": 1.0, "rewards/chosen": 0.21247372031211853, "rewards/margins": 11.380758285522461, "rewards/rejected": -11.168285369873047, "step": 3200 }, { "epoch": 0.8015982020227245, "grad_norm": 0.049072265625, "learning_rate": 5.755801180601381e-07, "logits/chosen": -2.2009828090667725, "logits/rejected": -2.4239349365234375, "logps/chosen": -0.8167581558227539, "logps/rejected": -1351.5517578125, "loss": 0.225, "rewards/accuracies": 1.0, "rewards/chosen": 0.21433226764202118, "rewards/margins": 13.204305648803711, "rewards/rejected": -12.989973068237305, "step": 3210 }, { "epoch": 0.8040953926832314, "grad_norm": 0.0390625, "learning_rate": 5.617389783680307e-07, "logits/chosen": -2.080115795135498, "logits/rejected": -2.3226914405822754, "logps/chosen": -0.7443311810493469, "logps/rejected": -1530.034423828125, "loss": 0.224, "rewards/accuracies": 1.0, "rewards/chosen": 0.21728749573230743, "rewards/margins": 14.894986152648926, "rewards/rejected": -14.677696228027344, "step": 3220 }, { "epoch": 0.8065925833437383, "grad_norm": 0.232421875, "learning_rate": 5.48045199055596e-07, "logits/chosen": -2.1640877723693848, "logits/rejected": -2.36962628364563, "logps/chosen": -1.1329087018966675, "logps/rejected": -1273.104736328125, "loss": 0.2282, "rewards/accuracies": 1.0, "rewards/chosen": 0.20660333335399628, "rewards/margins": 12.42498779296875, "rewards/rejected": -12.218384742736816, "step": 3230 }, { "epoch": 0.8090897740042452, "grad_norm": 0.043212890625, "learning_rate": 5.344998212212704e-07, "logits/chosen": -2.091491937637329, "logits/rejected": -2.327758312225342, "logps/chosen": -1.4844013452529907, "logps/rejected": -1502.2427978515625, "loss": 0.224, "rewards/accuracies": 1.0, "rewards/chosen": 0.2210853397846222, "rewards/margins": 14.54228687286377, "rewards/rejected": -14.321202278137207, "step": 3240 }, { "epoch": 0.8115869646647521, "grad_norm": 0.03564453125, "learning_rate": 5.211038746809551e-07, "logits/chosen": -2.192322015762329, "logits/rejected": -2.3808138370513916, "logps/chosen": -0.5706063508987427, "logps/rejected": -1285.364013671875, "loss": 0.227, "rewards/accuracies": 1.0, "rewards/chosen": 0.20710210502147675, "rewards/margins": 12.564165115356445, "rewards/rejected": -12.357062339782715, "step": 3250 }, { "epoch": 0.8140841553252591, "grad_norm": 0.07275390625, "learning_rate": 5.078583778897216e-07, "logits/chosen": -2.1883485317230225, "logits/rejected": -2.3633830547332764, "logps/chosen": -1.4209082126617432, "logps/rejected": -1214.212890625, "loss": 0.2275, "rewards/accuracies": 1.0, "rewards/chosen": 0.21422162652015686, "rewards/margins": 11.81783390045166, "rewards/rejected": -11.603612899780273, "step": 3260 }, { "epoch": 0.816581345985766, "grad_norm": 0.044677734375, "learning_rate": 4.94764337864384e-07, "logits/chosen": -2.2724106311798096, "logits/rejected": -2.4622254371643066, "logps/chosen": -0.9480986595153809, "logps/rejected": -1307.112548828125, "loss": 0.2255, "rewards/accuracies": 1.0, "rewards/chosen": 0.2102733850479126, "rewards/margins": 12.694864273071289, "rewards/rejected": -12.484590530395508, "step": 3270 }, { "epoch": 0.819078536646273, "grad_norm": 0.0203857421875, "learning_rate": 4.818227501069328e-07, "logits/chosen": -2.2342686653137207, "logits/rejected": -2.4815187454223633, "logps/chosen": -1.358782410621643, "logps/rejected": -1722.28125, "loss": 0.2233, "rewards/accuracies": 1.0, "rewards/chosen": 0.21825866401195526, "rewards/margins": 16.87509536743164, "rewards/rejected": -16.656835556030273, "step": 3280 }, { "epoch": 0.8215757273067799, "grad_norm": 0.031982421875, "learning_rate": 4.690345985288572e-07, "logits/chosen": -2.1274971961975098, "logits/rejected": -2.328562021255493, "logps/chosen": -1.1761976480484009, "logps/rejected": -1403.730224609375, "loss": 0.2264, "rewards/accuracies": 1.0, "rewards/chosen": 0.213038831949234, "rewards/margins": 13.633008003234863, "rewards/rejected": -13.41996955871582, "step": 3290 }, { "epoch": 0.8240729179672868, "grad_norm": 0.01953125, "learning_rate": 4.5640085537633633e-07, "logits/chosen": -2.1780600547790527, "logits/rejected": -2.418370008468628, "logps/chosen": -1.041684865951538, "logps/rejected": -1449.9898681640625, "loss": 0.2235, "rewards/accuracies": 1.0, "rewards/chosen": 0.21910138428211212, "rewards/margins": 14.122426986694336, "rewards/rejected": -13.903326034545898, "step": 3300 }, { "epoch": 0.8265701086277938, "grad_norm": 0.04052734375, "learning_rate": 4.439224811563211e-07, "logits/chosen": -2.0584537982940674, "logits/rejected": -2.258396625518799, "logps/chosen": -0.6486183404922485, "logps/rejected": -1476.123291015625, "loss": 0.2255, "rewards/accuracies": 1.0, "rewards/chosen": 0.21381358802318573, "rewards/margins": 14.360862731933594, "rewards/rejected": -14.147050857543945, "step": 3310 }, { "epoch": 0.8290672992883007, "grad_norm": 0.04345703125, "learning_rate": 4.316004245635158e-07, "logits/chosen": -2.147899866104126, "logits/rejected": -2.3480546474456787, "logps/chosen": -1.0383152961730957, "logps/rejected": -1587.1812744140625, "loss": 0.2271, "rewards/accuracies": 1.0, "rewards/chosen": 0.2098073661327362, "rewards/margins": 15.521720886230469, "rewards/rejected": -15.311912536621094, "step": 3320 }, { "epoch": 0.8315644899488076, "grad_norm": 0.01336669921875, "learning_rate": 4.194356224082455e-07, "logits/chosen": -2.0754525661468506, "logits/rejected": -2.304088592529297, "logps/chosen": -0.6566920876502991, "logps/rejected": -1547.8634033203125, "loss": 0.227, "rewards/accuracies": 1.0, "rewards/chosen": 0.20915034413337708, "rewards/margins": 15.037984848022461, "rewards/rejected": -14.828834533691406, "step": 3330 }, { "epoch": 0.8340616806093145, "grad_norm": 0.0283203125, "learning_rate": 4.074289995452338e-07, "logits/chosen": -2.141746997833252, "logits/rejected": -2.3306097984313965, "logps/chosen": -0.9173293113708496, "logps/rejected": -1333.7867431640625, "loss": 0.2251, "rewards/accuracies": 1.0, "rewards/chosen": 0.2176503688097, "rewards/margins": 13.049924850463867, "rewards/rejected": -12.832275390625, "step": 3340 }, { "epoch": 0.8365588712698214, "grad_norm": 0.01904296875, "learning_rate": 3.9558146880329246e-07, "logits/chosen": -2.1531293392181396, "logits/rejected": -2.3555006980895996, "logps/chosen": -1.041372537612915, "logps/rejected": -1363.6673583984375, "loss": 0.2268, "rewards/accuracies": 1.0, "rewards/chosen": 0.21647608280181885, "rewards/margins": 13.151887893676758, "rewards/rejected": -12.935412406921387, "step": 3350 }, { "epoch": 0.8390560619303283, "grad_norm": 0.0869140625, "learning_rate": 3.838939309159187e-07, "logits/chosen": -2.150744915008545, "logits/rejected": -2.3291797637939453, "logps/chosen": -0.6859675645828247, "logps/rejected": -1347.5732421875, "loss": 0.2263, "rewards/accuracies": 1.0, "rewards/chosen": 0.21168136596679688, "rewards/margins": 13.16771411895752, "rewards/rejected": -12.956031799316406, "step": 3360 }, { "epoch": 0.8415532525908354, "grad_norm": 0.0213623046875, "learning_rate": 3.723672744528162e-07, "logits/chosen": -2.225355863571167, "logits/rejected": -2.434971570968628, "logps/chosen": -0.7719963788986206, "logps/rejected": -1404.882568359375, "loss": 0.2245, "rewards/accuracies": 1.0, "rewards/chosen": 0.2120278775691986, "rewards/margins": 13.696516036987305, "rewards/rejected": -13.484487533569336, "step": 3370 }, { "epoch": 0.8440504432513423, "grad_norm": 0.0220947265625, "learning_rate": 3.6100237575233647e-07, "logits/chosen": -2.2835781574249268, "logits/rejected": -2.459686279296875, "logps/chosen": -0.8155478239059448, "logps/rejected": -1183.509033203125, "loss": 0.2259, "rewards/accuracies": 1.0, "rewards/chosen": 0.21422457695007324, "rewards/margins": 11.59019660949707, "rewards/rejected": -11.375970840454102, "step": 3380 }, { "epoch": 0.8465476339118492, "grad_norm": 0.047607421875, "learning_rate": 3.4980009885486054e-07, "logits/chosen": -2.2139523029327393, "logits/rejected": -2.3762905597686768, "logps/chosen": -0.49865931272506714, "logps/rejected": -1125.1737060546875, "loss": 0.2277, "rewards/accuracies": 1.0, "rewards/chosen": 0.20752505958080292, "rewards/margins": 10.996343612670898, "rewards/rejected": -10.788819313049316, "step": 3390 }, { "epoch": 0.8490448245723561, "grad_norm": 0.01556396484375, "learning_rate": 3.3876129543710197e-07, "logits/chosen": -2.184354305267334, "logits/rejected": -2.3724493980407715, "logps/chosen": -0.690311074256897, "logps/rejected": -1528.7587890625, "loss": 0.2248, "rewards/accuracies": 1.0, "rewards/chosen": 0.21172885596752167, "rewards/margins": 14.938085556030273, "rewards/rejected": -14.726354598999023, "step": 3400 }, { "epoch": 0.851542015232863, "grad_norm": 0.01495361328125, "learning_rate": 3.2788680474735687e-07, "logits/chosen": -2.1705164909362793, "logits/rejected": -2.373166561126709, "logps/chosen": -0.5612165927886963, "logps/rejected": -1317.1187744140625, "loss": 0.2251, "rewards/accuracies": 1.0, "rewards/chosen": 0.20889082551002502, "rewards/margins": 12.858512878417969, "rewards/rejected": -12.649621963500977, "step": 3410 }, { "epoch": 0.85403920589337, "grad_norm": 0.00872802734375, "learning_rate": 3.1717745354170214e-07, "logits/chosen": -2.071406841278076, "logits/rejected": -2.2939293384552, "logps/chosen": -0.8238442540168762, "logps/rejected": -1532.26513671875, "loss": 0.2244, "rewards/accuracies": 1.0, "rewards/chosen": 0.21248868107795715, "rewards/margins": 15.00029468536377, "rewards/rejected": -14.787805557250977, "step": 3420 }, { "epoch": 0.8565363965538769, "grad_norm": 0.021484375, "learning_rate": 3.0663405602113727e-07, "logits/chosen": -2.24153208732605, "logits/rejected": -2.467984676361084, "logps/chosen": -0.9753687977790833, "logps/rejected": -1389.8427734375, "loss": 0.225, "rewards/accuracies": 1.0, "rewards/chosen": 0.20957298576831818, "rewards/margins": 13.579081535339355, "rewards/rejected": -13.369508743286133, "step": 3430 }, { "epoch": 0.8590335872143838, "grad_norm": 0.047607421875, "learning_rate": 2.9625741376968107e-07, "logits/chosen": -2.060586452484131, "logits/rejected": -2.3030850887298584, "logps/chosen": -2.972282886505127, "logps/rejected": -1365.322265625, "loss": 0.226, "rewards/accuracies": 1.0, "rewards/chosen": 0.21505007147789001, "rewards/margins": 12.994850158691406, "rewards/rejected": -12.77979850769043, "step": 3440 }, { "epoch": 0.8615307778748907, "grad_norm": 0.02001953125, "learning_rate": 2.8604831569343324e-07, "logits/chosen": -2.2799830436706543, "logits/rejected": -2.4574227333068848, "logps/chosen": -0.9521903991699219, "logps/rejected": -1208.1971435546875, "loss": 0.2266, "rewards/accuracies": 1.0, "rewards/chosen": 0.21667388081550598, "rewards/margins": 11.710010528564453, "rewards/rejected": -11.493337631225586, "step": 3450 }, { "epoch": 0.8640279685353977, "grad_norm": 0.028564453125, "learning_rate": 2.760075379605942e-07, "logits/chosen": -2.1184418201446533, "logits/rejected": -2.292738199234009, "logps/chosen": -0.882199764251709, "logps/rejected": -1400.0753173828125, "loss": 0.2253, "rewards/accuracies": 1.0, "rewards/chosen": 0.20649878680706024, "rewards/margins": 13.685522079467773, "rewards/rejected": -13.479023933410645, "step": 3460 }, { "epoch": 0.8665251591959046, "grad_norm": 0.166015625, "learning_rate": 2.661358439424552e-07, "logits/chosen": -2.1794090270996094, "logits/rejected": -2.3647027015686035, "logps/chosen": -0.8141934275627136, "logps/rejected": -1179.304931640625, "loss": 0.2276, "rewards/accuracies": 1.0, "rewards/chosen": 0.21081428229808807, "rewards/margins": 11.469918251037598, "rewards/rejected": -11.25910472869873, "step": 3470 }, { "epoch": 0.8690223498564116, "grad_norm": 0.0262451171875, "learning_rate": 2.564339841553615e-07, "logits/chosen": -2.1696417331695557, "logits/rejected": -2.341275453567505, "logps/chosen": -0.6168124675750732, "logps/rejected": -1255.4180908203125, "loss": 0.2276, "rewards/accuracies": 1.0, "rewards/chosen": 0.20439627766609192, "rewards/margins": 12.246126174926758, "rewards/rejected": -12.041730880737305, "step": 3480 }, { "epoch": 0.8715195405169185, "grad_norm": 0.212890625, "learning_rate": 2.469026962036539e-07, "logits/chosen": -2.155325412750244, "logits/rejected": -2.346266984939575, "logps/chosen": -1.7188537120819092, "logps/rejected": -1198.50634765625, "loss": 0.2282, "rewards/accuracies": 1.0, "rewards/chosen": 0.21203342080116272, "rewards/margins": 11.423583984375, "rewards/rejected": -11.211549758911133, "step": 3490 }, { "epoch": 0.8740167311774254, "grad_norm": 0.033447265625, "learning_rate": 2.3754270472358786e-07, "logits/chosen": -2.1500706672668457, "logits/rejected": -2.346287965774536, "logps/chosen": -1.2322837114334106, "logps/rejected": -1203.8701171875, "loss": 0.2269, "rewards/accuracies": 1.0, "rewards/chosen": 0.2134041041135788, "rewards/margins": 11.585506439208984, "rewards/rejected": -11.372100830078125, "step": 3500 }, { "epoch": 0.8765139218379323, "grad_norm": 0.0419921875, "learning_rate": 2.283547213282458e-07, "logits/chosen": -2.26165509223938, "logits/rejected": -2.4591403007507324, "logps/chosen": -1.2189921140670776, "logps/rejected": -1291.148193359375, "loss": 0.2255, "rewards/accuracies": 1.0, "rewards/chosen": 0.21497786045074463, "rewards/margins": 12.45503044128418, "rewards/rejected": -12.24005126953125, "step": 3510 }, { "epoch": 0.8790111124984392, "grad_norm": 0.0400390625, "learning_rate": 2.1933944455343166e-07, "logits/chosen": -1.9996531009674072, "logits/rejected": -2.232881784439087, "logps/chosen": -1.0685181617736816, "logps/rejected": -1328.8775634765625, "loss": 0.2251, "rewards/accuracies": 1.0, "rewards/chosen": 0.21257129311561584, "rewards/margins": 12.904253959655762, "rewards/rejected": -12.691683769226074, "step": 3520 }, { "epoch": 0.8815083031589462, "grad_norm": 0.01324462890625, "learning_rate": 2.104975598045647e-07, "logits/chosen": -2.1279807090759277, "logits/rejected": -2.3155367374420166, "logps/chosen": -0.7418814897537231, "logps/rejected": -1234.801025390625, "loss": 0.226, "rewards/accuracies": 1.0, "rewards/chosen": 0.21231353282928467, "rewards/margins": 12.064626693725586, "rewards/rejected": -11.852312088012695, "step": 3530 }, { "epoch": 0.8840054938194531, "grad_norm": 0.04248046875, "learning_rate": 2.018297393045701e-07, "logits/chosen": -2.169581651687622, "logits/rejected": -2.334414005279541, "logps/chosen": -1.1093792915344238, "logps/rejected": -1281.266357421875, "loss": 0.2262, "rewards/accuracies": 1.0, "rewards/chosen": 0.209160715341568, "rewards/margins": 12.48926067352295, "rewards/rejected": -12.280099868774414, "step": 3540 }, { "epoch": 0.8865026844799601, "grad_norm": 0.0888671875, "learning_rate": 1.9333664204277236e-07, "logits/chosen": -2.0912182331085205, "logits/rejected": -2.292468309402466, "logps/chosen": -0.8641373515129089, "logps/rejected": -1473.767333984375, "loss": 0.2265, "rewards/accuracies": 1.0, "rewards/chosen": 0.21468326449394226, "rewards/margins": 14.221132278442383, "rewards/rejected": -14.006446838378906, "step": 3550 }, { "epoch": 0.888999875140467, "grad_norm": 0.017578125, "learning_rate": 1.8501891372479124e-07, "logits/chosen": -2.155086040496826, "logits/rejected": -2.3607256412506104, "logps/chosen": -1.0332701206207275, "logps/rejected": -1407.046875, "loss": 0.2253, "rewards/accuracies": 1.0, "rewards/chosen": 0.21279795467853546, "rewards/margins": 13.675623893737793, "rewards/rejected": -13.4628267288208, "step": 3560 }, { "epoch": 0.8914970658009739, "grad_norm": 0.10498046875, "learning_rate": 1.7687718672345533e-07, "logits/chosen": -2.1115050315856934, "logits/rejected": -2.295365810394287, "logps/chosen": -1.0766206979751587, "logps/rejected": -1537.2154541015625, "loss": 0.225, "rewards/accuracies": 1.0, "rewards/chosen": 0.21170708537101746, "rewards/margins": 15.0249662399292, "rewards/rejected": -14.813258171081543, "step": 3570 }, { "epoch": 0.8939942564614808, "grad_norm": 0.03515625, "learning_rate": 1.689120800307212e-07, "logits/chosen": -2.010655403137207, "logits/rejected": -2.2343146800994873, "logps/chosen": -0.6959076523780823, "logps/rejected": -1583.5645751953125, "loss": 0.2245, "rewards/accuracies": 1.0, "rewards/chosen": 0.2187151461839676, "rewards/margins": 15.251518249511719, "rewards/rejected": -15.032801628112793, "step": 3580 }, { "epoch": 0.8964914471219878, "grad_norm": 0.041259765625, "learning_rate": 1.6112419921061357e-07, "logits/chosen": -2.149298906326294, "logits/rejected": -2.3335325717926025, "logps/chosen": -1.0091092586517334, "logps/rejected": -1295.9100341796875, "loss": 0.2272, "rewards/accuracies": 1.0, "rewards/chosen": 0.2126895934343338, "rewards/margins": 12.669659614562988, "rewards/rejected": -12.456971168518066, "step": 3590 }, { "epoch": 0.8989886377824947, "grad_norm": 0.05615234375, "learning_rate": 1.5351413635318807e-07, "logits/chosen": -2.2476723194122314, "logits/rejected": -2.4481379985809326, "logps/chosen": -1.025138258934021, "logps/rejected": -1300.3070068359375, "loss": 0.2253, "rewards/accuracies": 1.0, "rewards/chosen": 0.21075649559497833, "rewards/margins": 12.60840892791748, "rewards/rejected": -12.397652626037598, "step": 3600 }, { "epoch": 0.9014858284430016, "grad_norm": 0.0279541015625, "learning_rate": 1.460824700295138e-07, "logits/chosen": -2.246796131134033, "logits/rejected": -2.438882350921631, "logps/chosen": -1.5276464223861694, "logps/rejected": -1376.2548828125, "loss": 0.2262, "rewards/accuracies": 1.0, "rewards/chosen": 0.21834063529968262, "rewards/margins": 13.44012451171875, "rewards/rejected": -13.221783638000488, "step": 3610 }, { "epoch": 0.9039830191035085, "grad_norm": 0.0157470703125, "learning_rate": 1.3882976524768694e-07, "logits/chosen": -2.2246479988098145, "logits/rejected": -2.397996425628662, "logps/chosen": -1.2670552730560303, "logps/rejected": -1179.010986328125, "loss": 0.2277, "rewards/accuracies": 1.0, "rewards/chosen": 0.21077945828437805, "rewards/margins": 11.500974655151367, "rewards/rejected": -11.290196418762207, "step": 3620 }, { "epoch": 0.9064802097640154, "grad_norm": 0.044677734375, "learning_rate": 1.3175657340987664e-07, "logits/chosen": -2.1487388610839844, "logits/rejected": -2.334177255630493, "logps/chosen": -0.5317996740341187, "logps/rejected": -1380.195556640625, "loss": 0.2264, "rewards/accuracies": 1.0, "rewards/chosen": 0.2111264169216156, "rewards/margins": 13.50303840637207, "rewards/rejected": -13.291911125183105, "step": 3630 }, { "epoch": 0.9089774004245225, "grad_norm": 0.052978515625, "learning_rate": 1.2486343227040122e-07, "logits/chosen": -2.2575807571411133, "logits/rejected": -2.471717357635498, "logps/chosen": -1.4988012313842773, "logps/rejected": -1318.032470703125, "loss": 0.226, "rewards/accuracies": 1.0, "rewards/chosen": 0.22440342605113983, "rewards/margins": 12.804231643676758, "rewards/rejected": -12.579828262329102, "step": 3640 }, { "epoch": 0.9114745910850294, "grad_norm": 0.0654296875, "learning_rate": 1.181508658948452e-07, "logits/chosen": -2.189079999923706, "logits/rejected": -2.372708559036255, "logps/chosen": -0.8293665051460266, "logps/rejected": -1286.6043701171875, "loss": 0.2268, "rewards/accuracies": 1.0, "rewards/chosen": 0.21110188961029053, "rewards/margins": 12.531554222106934, "rewards/rejected": -12.320451736450195, "step": 3650 }, { "epoch": 0.9139717817455363, "grad_norm": 0.018798828125, "learning_rate": 1.1161938462021627e-07, "logits/chosen": -2.082040309906006, "logits/rejected": -2.2717125415802, "logps/chosen": -1.0598349571228027, "logps/rejected": -1245.3990478515625, "loss": 0.228, "rewards/accuracies": 1.0, "rewards/chosen": 0.21748106181621552, "rewards/margins": 12.090206146240234, "rewards/rejected": -11.872724533081055, "step": 3660 }, { "epoch": 0.9164689724060432, "grad_norm": 0.06982421875, "learning_rate": 1.0526948501614536e-07, "logits/chosen": -2.103464126586914, "logits/rejected": -2.3152968883514404, "logps/chosen": -1.075402021408081, "logps/rejected": -1461.2308349609375, "loss": 0.2268, "rewards/accuracies": 1.0, "rewards/chosen": 0.21564142405986786, "rewards/margins": 14.216550827026367, "rewards/rejected": -14.000910758972168, "step": 3670 }, { "epoch": 0.9189661630665501, "grad_norm": 0.06982421875, "learning_rate": 9.910164984713477e-08, "logits/chosen": -2.1121301651000977, "logits/rejected": -2.327693223953247, "logps/chosen": -1.3442718982696533, "logps/rejected": -1471.4466552734375, "loss": 0.2261, "rewards/accuracies": 1.0, "rewards/chosen": 0.21234098076820374, "rewards/margins": 14.340484619140625, "rewards/rejected": -14.128143310546875, "step": 3680 }, { "epoch": 0.921463353727057, "grad_norm": 0.06005859375, "learning_rate": 9.311634803585323e-08, "logits/chosen": -2.1561217308044434, "logits/rejected": -2.3662197589874268, "logps/chosen": -0.8007721900939941, "logps/rejected": -1469.8878173828125, "loss": 0.2286, "rewards/accuracies": 1.0, "rewards/chosen": 0.2085859775543213, "rewards/margins": 14.383298873901367, "rewards/rejected": -14.174713134765625, "step": 3690 }, { "epoch": 0.923960544387564, "grad_norm": 0.068359375, "learning_rate": 8.7314034627487e-08, "logits/chosen": -2.203339099884033, "logits/rejected": -2.40441632270813, "logps/chosen": -0.5535265207290649, "logps/rejected": -1442.5123291015625, "loss": 0.2263, "rewards/accuracies": 1.0, "rewards/chosen": 0.20969831943511963, "rewards/margins": 14.11566162109375, "rewards/rejected": -13.905962944030762, "step": 3700 }, { "epoch": 0.9264577350480709, "grad_norm": 0.04833984375, "learning_rate": 8.16951507551439e-08, "logits/chosen": -2.2100465297698975, "logits/rejected": -2.394742250442505, "logps/chosen": -1.0725539922714233, "logps/rejected": -1294.114990234375, "loss": 0.2255, "rewards/accuracies": 1.0, "rewards/chosen": 0.21020770072937012, "rewards/margins": 12.512723922729492, "rewards/rejected": -12.302515029907227, "step": 3710 }, { "epoch": 0.9289549257085778, "grad_norm": 0.01544189453125, "learning_rate": 7.626012360631291e-08, "logits/chosen": -2.2372231483459473, "logits/rejected": -2.4310178756713867, "logps/chosen": -1.1079142093658447, "logps/rejected": -1298.0045166015625, "loss": 0.2261, "rewards/accuracies": 1.0, "rewards/chosen": 0.21124926209449768, "rewards/margins": 12.63310718536377, "rewards/rejected": -12.421857833862305, "step": 3720 }, { "epoch": 0.9314521163690848, "grad_norm": 0.03466796875, "learning_rate": 7.100936639038936e-08, "logits/chosen": -2.0344414710998535, "logits/rejected": -2.2667644023895264, "logps/chosen": -1.0317838191986084, "logps/rejected": -1655.9033203125, "loss": 0.224, "rewards/accuracies": 1.0, "rewards/chosen": 0.2157924920320511, "rewards/margins": 16.17062759399414, "rewards/rejected": -15.954833984375, "step": 3730 }, { "epoch": 0.9339493070295917, "grad_norm": 0.00531005859375, "learning_rate": 6.594327830725916e-08, "logits/chosen": -2.162308931350708, "logits/rejected": -2.371338367462158, "logps/chosen": -0.7821828722953796, "logps/rejected": -1442.607177734375, "loss": 0.2251, "rewards/accuracies": 1.0, "rewards/chosen": 0.21128495037555695, "rewards/margins": 14.142976760864258, "rewards/rejected": -13.931692123413086, "step": 3740 }, { "epoch": 0.9364464976900987, "grad_norm": 0.043701171875, "learning_rate": 6.106224451694592e-08, "logits/chosen": -2.1930558681488037, "logits/rejected": -2.386634111404419, "logps/chosen": -0.6907114386558533, "logps/rejected": -1420.8463134765625, "loss": 0.225, "rewards/accuracies": 1.0, "rewards/chosen": 0.21480241417884827, "rewards/margins": 13.8462495803833, "rewards/rejected": -13.631448745727539, "step": 3750 }, { "epoch": 0.9389436883506056, "grad_norm": 0.04736328125, "learning_rate": 5.636663611033266e-08, "logits/chosen": -2.058790683746338, "logits/rejected": -2.274880886077881, "logps/chosen": -0.41397613286972046, "logps/rejected": -1431.2852783203125, "loss": 0.2266, "rewards/accuracies": 1.0, "rewards/chosen": 0.20900818705558777, "rewards/margins": 14.00958251953125, "rewards/rejected": -13.800572395324707, "step": 3760 }, { "epoch": 0.9414408790111125, "grad_norm": 0.019287109375, "learning_rate": 5.185681008094579e-08, "logits/chosen": -2.251438617706299, "logits/rejected": -2.4458415508270264, "logps/chosen": -1.0221302509307861, "logps/rejected": -1385.7362060546875, "loss": 0.2249, "rewards/accuracies": 1.0, "rewards/chosen": 0.21662044525146484, "rewards/margins": 13.469772338867188, "rewards/rejected": -13.253152847290039, "step": 3770 }, { "epoch": 0.9439380696716194, "grad_norm": 0.03857421875, "learning_rate": 4.753310929781513e-08, "logits/chosen": -2.206300973892212, "logits/rejected": -2.3716189861297607, "logps/chosen": -0.6498397588729858, "logps/rejected": -1291.197021484375, "loss": 0.2258, "rewards/accuracies": 1.0, "rewards/chosen": 0.21270425617694855, "rewards/margins": 12.625164985656738, "rewards/rejected": -12.412460327148438, "step": 3780 }, { "epoch": 0.9464352603321263, "grad_norm": 0.033447265625, "learning_rate": 4.3395862479405914e-08, "logits/chosen": -2.1362087726593018, "logits/rejected": -2.332123041152954, "logps/chosen": -1.0763086080551147, "logps/rejected": -1387.7713623046875, "loss": 0.2244, "rewards/accuracies": 1.0, "rewards/chosen": 0.2146589756011963, "rewards/margins": 13.39326286315918, "rewards/rejected": -13.178604125976562, "step": 3790 }, { "epoch": 0.9489324509926332, "grad_norm": 0.0155029296875, "learning_rate": 3.9445384168628474e-08, "logits/chosen": -2.291581869125366, "logits/rejected": -2.500275135040283, "logps/chosen": -1.0031490325927734, "logps/rejected": -1276.675537109375, "loss": 0.226, "rewards/accuracies": 1.0, "rewards/chosen": 0.20909054577350616, "rewards/margins": 12.3539457321167, "rewards/rejected": -12.144854545593262, "step": 3800 }, { "epoch": 0.9514296416531403, "grad_norm": 0.0284423828125, "learning_rate": 3.5681974708923484e-08, "logits/chosen": -2.1034350395202637, "logits/rejected": -2.2940239906311035, "logps/chosen": -0.8783596158027649, "logps/rejected": -1220.660888671875, "loss": 0.2256, "rewards/accuracies": 1.0, "rewards/chosen": 0.21768565475940704, "rewards/margins": 11.806703567504883, "rewards/rejected": -11.589017868041992, "step": 3810 }, { "epoch": 0.9539268323136472, "grad_norm": 0.047119140625, "learning_rate": 3.210592022142717e-08, "logits/chosen": -2.1330649852752686, "logits/rejected": -2.2985074520111084, "logps/chosen": -0.7123221158981323, "logps/rejected": -1336.329833984375, "loss": 0.2258, "rewards/accuracies": 1.0, "rewards/chosen": 0.20608916878700256, "rewards/margins": 12.982263565063477, "rewards/rejected": -12.776172637939453, "step": 3820 }, { "epoch": 0.9564240229741541, "grad_norm": 0.021484375, "learning_rate": 2.8717492583220095e-08, "logits/chosen": -2.225675106048584, "logits/rejected": -2.428712844848633, "logps/chosen": -0.8774779438972473, "logps/rejected": -1398.2039794921875, "loss": 0.2253, "rewards/accuracies": 1.0, "rewards/chosen": 0.20993375778198242, "rewards/margins": 13.676666259765625, "rewards/rejected": -13.4667329788208, "step": 3830 }, { "epoch": 0.958921213634661, "grad_norm": 0.03271484375, "learning_rate": 2.551694940665539e-08, "logits/chosen": -2.163163423538208, "logits/rejected": -2.351386070251465, "logps/chosen": -0.9975617527961731, "logps/rejected": -1255.6383056640625, "loss": 0.2265, "rewards/accuracies": 1.0, "rewards/chosen": 0.21191437542438507, "rewards/margins": 12.254827499389648, "rewards/rejected": -12.042913436889648, "step": 3840 }, { "epoch": 0.9614184042951679, "grad_norm": 0.055908203125, "learning_rate": 2.2504534019774092e-08, "logits/chosen": -2.3171262741088867, "logits/rejected": -2.492202043533325, "logps/chosen": -0.872540295124054, "logps/rejected": -1181.1051025390625, "loss": 0.2264, "rewards/accuracies": 1.0, "rewards/chosen": 0.21220049262046814, "rewards/margins": 11.461995124816895, "rewards/rejected": -11.24979305267334, "step": 3850 }, { "epoch": 0.9639155949556749, "grad_norm": 0.032958984375, "learning_rate": 1.9680475447805826e-08, "logits/chosen": -2.1993744373321533, "logits/rejected": -2.380159378051758, "logps/chosen": -0.721504807472229, "logps/rejected": -1297.561767578125, "loss": 0.2275, "rewards/accuracies": 1.0, "rewards/chosen": 0.20789849758148193, "rewards/margins": 12.683464050292969, "rewards/rejected": -12.475565910339355, "step": 3860 }, { "epoch": 0.9664127856161818, "grad_norm": 0.0201416015625, "learning_rate": 1.70449883957563e-08, "logits/chosen": -2.232905626296997, "logits/rejected": -2.4287447929382324, "logps/chosen": -2.1762092113494873, "logps/rejected": -1314.664794921875, "loss": 0.2253, "rewards/accuracies": 1.0, "rewards/chosen": 0.21345773339271545, "rewards/margins": 12.700533866882324, "rewards/rejected": -12.487077713012695, "step": 3870 }, { "epoch": 0.9689099762766887, "grad_norm": 0.072265625, "learning_rate": 1.4598273232083182e-08, "logits/chosen": -2.198019027709961, "logits/rejected": -2.3671329021453857, "logps/chosen": -0.9621660113334656, "logps/rejected": -1280.2039794921875, "loss": 0.2287, "rewards/accuracies": 1.0, "rewards/chosen": 0.20697855949401855, "rewards/margins": 12.497517585754395, "rewards/rejected": -12.29054069519043, "step": 3880 }, { "epoch": 0.9714071669371956, "grad_norm": 0.0478515625, "learning_rate": 1.2340515973464917e-08, "logits/chosen": -2.1526544094085693, "logits/rejected": -2.3664348125457764, "logps/chosen": -1.546007752418518, "logps/rejected": -1401.54638671875, "loss": 0.2249, "rewards/accuracies": 1.0, "rewards/chosen": 0.20804882049560547, "rewards/margins": 13.68072509765625, "rewards/rejected": -13.472674369812012, "step": 3890 }, { "epoch": 0.9739043575977026, "grad_norm": 0.016357421875, "learning_rate": 1.0271888270655118e-08, "logits/chosen": -2.043034076690674, "logits/rejected": -2.229666233062744, "logps/chosen": -0.9901046752929688, "logps/rejected": -1281.4815673828125, "loss": 0.2265, "rewards/accuracies": 1.0, "rewards/chosen": 0.2106485813856125, "rewards/margins": 12.358712196350098, "rewards/rejected": -12.148063659667969, "step": 3900 }, { "epoch": 0.9764015482582095, "grad_norm": 0.07666015625, "learning_rate": 8.392547395435769e-09, "logits/chosen": -2.374267101287842, "logits/rejected": -2.551339626312256, "logps/chosen": -1.2009716033935547, "logps/rejected": -1176.1605224609375, "loss": 0.2263, "rewards/accuracies": 1.0, "rewards/chosen": 0.20642951130867004, "rewards/margins": 11.431352615356445, "rewards/rejected": -11.224924087524414, "step": 3910 }, { "epoch": 0.9788987389187165, "grad_norm": 0.0732421875, "learning_rate": 6.702636228657911e-09, "logits/chosen": -2.262585163116455, "logits/rejected": -2.4511070251464844, "logps/chosen": -0.7528651356697083, "logps/rejected": -1265.910400390625, "loss": 0.2256, "rewards/accuracies": 1.0, "rewards/chosen": 0.21303972601890564, "rewards/margins": 12.352733612060547, "rewards/rejected": -12.139693260192871, "step": 3920 }, { "epoch": 0.9813959295792234, "grad_norm": 0.11572265625, "learning_rate": 5.2022832493800465e-09, "logits/chosen": -2.3309874534606934, "logits/rejected": -2.5094618797302246, "logps/chosen": -0.8482611775398254, "logps/rejected": -1145.723876953125, "loss": 0.225, "rewards/accuracies": 1.0, "rewards/chosen": 0.2170940339565277, "rewards/margins": 11.18010139465332, "rewards/rejected": -10.963006973266602, "step": 3930 }, { "epoch": 0.9838931202397303, "grad_norm": 0.0174560546875, "learning_rate": 3.891602525100124e-09, "logits/chosen": -2.202822208404541, "logits/rejected": -2.4167404174804688, "logps/chosen": -0.8022462725639343, "logps/rejected": -1359.097412109375, "loss": 0.2251, "rewards/accuracies": 1.0, "rewards/chosen": 0.21217508614063263, "rewards/margins": 13.16607666015625, "rewards/rejected": -12.953901290893555, "step": 3940 }, { "epoch": 0.9863903109002372, "grad_norm": 0.0947265625, "learning_rate": 2.7706937030827495e-09, "logits/chosen": -2.245856285095215, "logits/rejected": -2.436892032623291, "logps/chosen": -1.236242651939392, "logps/rejected": -1134.9066162109375, "loss": 0.226, "rewards/accuracies": 1.0, "rewards/chosen": 0.20883974432945251, "rewards/margins": 10.997222900390625, "rewards/rejected": -10.788381576538086, "step": 3950 }, { "epoch": 0.9888875015607441, "grad_norm": 0.07568359375, "learning_rate": 1.839642002783859e-09, "logits/chosen": -2.1721549034118652, "logits/rejected": -2.3608601093292236, "logps/chosen": -0.9914839863777161, "logps/rejected": -1147.5926513671875, "loss": 0.2273, "rewards/accuracies": 1.0, "rewards/chosen": 0.21253648400306702, "rewards/margins": 11.109753608703613, "rewards/rejected": -10.897214889526367, "step": 3960 }, { "epoch": 0.9913846922212511, "grad_norm": 0.0308837890625, "learning_rate": 1.0985182093714574e-09, "logits/chosen": -2.2215476036071777, "logits/rejected": -2.3835880756378174, "logps/chosen": -0.42377692461013794, "logps/rejected": -1237.712646484375, "loss": 0.2256, "rewards/accuracies": 1.0, "rewards/chosen": 0.20775683224201202, "rewards/margins": 12.072611808776855, "rewards/rejected": -11.86485481262207, "step": 3970 }, { "epoch": 0.993881882881758, "grad_norm": 0.1875, "learning_rate": 5.473786683440896e-10, "logits/chosen": -2.119377613067627, "logits/rejected": -2.3185195922851562, "logps/chosen": -1.0564239025115967, "logps/rejected": -1471.339111328125, "loss": 0.2259, "rewards/accuracies": 1.0, "rewards/chosen": 0.21374066174030304, "rewards/margins": 14.393136978149414, "rewards/rejected": -14.17939567565918, "step": 3980 }, { "epoch": 0.996379073542265, "grad_norm": 0.0311279296875, "learning_rate": 1.862652812467669e-10, "logits/chosen": -2.1754400730133057, "logits/rejected": -2.3970232009887695, "logps/chosen": -1.259765863418579, "logps/rejected": -1448.65576171875, "loss": 0.2242, "rewards/accuracies": 1.0, "rewards/chosen": 0.2176629602909088, "rewards/margins": 13.863238334655762, "rewards/rejected": -13.645576477050781, "step": 3990 }, { "epoch": 0.9988762642027719, "grad_norm": 0.035400390625, "learning_rate": 1.5205502486292932e-11, "logits/chosen": -2.143209934234619, "logits/rejected": -2.34411883354187, "logps/chosen": -0.6734473705291748, "logps/rejected": -1441.0018310546875, "loss": 0.2262, "rewards/accuracies": 1.0, "rewards/chosen": 0.20581206679344177, "rewards/margins": 14.11164379119873, "rewards/rejected": -13.905832290649414, "step": 4000 }, { "epoch": 0.9988762642027719, "eval_logits/chosen": -2.571059465408325, "eval_logits/rejected": -2.6589972972869873, "eval_logps/chosen": -0.11967950314283371, "eval_logps/rejected": -652.1184692382812, "eval_loss": 0.22132086753845215, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": 0.2579382359981537, "eval_rewards/margins": 6.330402374267578, "eval_rewards/rejected": -6.072464466094971, "eval_runtime": 0.656, "eval_samples_per_second": 7.622, "eval_steps_per_second": 4.573, "step": 4000 }, { "epoch": 0.9998751404669747, "step": 4004, "total_flos": 0.0, "train_loss": 0.2426841035559699, "train_runtime": 8271.4989, "train_samples_per_second": 1.936, "train_steps_per_second": 0.484 } ], "logging_steps": 10, "max_steps": 4004, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }