diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,6109 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9998751404669747, + "eval_steps": 1000, + "global_step": 4004, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.000249719066050693, + "grad_norm": 0.34765625, + "learning_rate": 1.2468827930174565e-08, + "logits/chosen": -2.450503349304199, + "logits/rejected": -2.672837734222412, + "logps/chosen": -21.34674835205078, + "logps/rejected": -42.586097717285156, + "loss": 0.5, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.00249719066050693, + "grad_norm": 0.2890625, + "learning_rate": 1.2468827930174566e-07, + "logits/chosen": -2.275761604309082, + "logits/rejected": -2.479705333709717, + "logps/chosen": -22.14301300048828, + "logps/rejected": -63.31869888305664, + "loss": 0.5, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.00027842415147460997, + "rewards/margins": -0.00017310140538029373, + "rewards/rejected": -0.0001053227242664434, + "step": 10 + }, + { + "epoch": 0.00499438132101386, + "grad_norm": 0.318359375, + "learning_rate": 2.493765586034913e-07, + "logits/chosen": -2.2202348709106445, + "logits/rejected": -2.429389238357544, + "logps/chosen": -21.814502716064453, + "logps/rejected": -61.35728073120117, + "loss": 0.5, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 8.430716843577102e-05, + "rewards/margins": 0.00037039705784991384, + "rewards/rejected": -0.00028608986758627, + "step": 20 + }, + { + "epoch": 0.007491571981520789, + "grad_norm": 0.26171875, + "learning_rate": 3.7406483790523695e-07, + "logits/chosen": -2.14150333404541, + "logits/rejected": -2.3708083629608154, + "logps/chosen": -22.1105899810791, + "logps/rejected": -52.95900344848633, + "loss": 0.5001, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -9.514805424259976e-05, + "rewards/margins": -5.593679452431388e-05, + "rewards/rejected": -3.9211259718285874e-05, + "step": 30 + }, + { + "epoch": 0.00998876264202772, + "grad_norm": 0.27734375, + "learning_rate": 4.987531172069826e-07, + "logits/chosen": -2.1455249786376953, + "logits/rejected": -2.362419605255127, + "logps/chosen": -22.628782272338867, + "logps/rejected": -63.2244873046875, + "loss": 0.4998, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.0004498485941439867, + "rewards/margins": 0.0016190257156267762, + "rewards/rejected": -0.0011691770050674677, + "step": 40 + }, + { + "epoch": 0.012485953302534648, + "grad_norm": 0.212890625, + "learning_rate": 6.234413965087283e-07, + "logits/chosen": -2.2349250316619873, + "logits/rejected": -2.495819568634033, + "logps/chosen": -22.863269805908203, + "logps/rejected": -59.4576416015625, + "loss": 0.4998, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0006220145733095706, + "rewards/margins": 0.0015806708252057433, + "rewards/rejected": -0.0009586562518961728, + "step": 50 + }, + { + "epoch": 0.014983143963041578, + "grad_norm": 0.328125, + "learning_rate": 7.481296758104739e-07, + "logits/chosen": -2.169523239135742, + "logits/rejected": -2.3751749992370605, + "logps/chosen": -22.777694702148438, + "logps/rejected": -68.83964538574219, + "loss": 0.4992, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.0019476842135190964, + "rewards/margins": 0.004409968852996826, + "rewards/rejected": -0.002462285105139017, + "step": 60 + }, + { + "epoch": 0.017480334623548508, + "grad_norm": 0.28515625, + "learning_rate": 8.728179551122195e-07, + "logits/chosen": -2.286738634109497, + "logits/rejected": -2.4896113872528076, + "logps/chosen": -21.078710556030273, + "logps/rejected": -50.04187774658203, + "loss": 0.4985, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.004810997284948826, + "rewards/margins": 0.007176141254603863, + "rewards/rejected": -0.0023651437368243933, + "step": 70 + }, + { + "epoch": 0.01997752528405544, + "grad_norm": 0.279296875, + "learning_rate": 9.975062344139653e-07, + "logits/chosen": -2.144176959991455, + "logits/rejected": -2.352398633956909, + "logps/chosen": -21.391971588134766, + "logps/rejected": -56.86810302734375, + "loss": 0.4969, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.010686805471777916, + "rewards/margins": 0.01409011147916317, + "rewards/rejected": -0.003403306705877185, + "step": 80 + }, + { + "epoch": 0.02247471594456237, + "grad_norm": 0.34765625, + "learning_rate": 1.1221945137157108e-06, + "logits/chosen": -2.2526628971099854, + "logits/rejected": -2.430774211883545, + "logps/chosen": -19.845823287963867, + "logps/rejected": -51.37982177734375, + "loss": 0.4949, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.019541995599865913, + "rewards/margins": 0.021860197186470032, + "rewards/rejected": -0.0023182008881121874, + "step": 90 + }, + { + "epoch": 0.024971906605069295, + "grad_norm": 0.353515625, + "learning_rate": 1.2468827930174565e-06, + "logits/chosen": -2.1313042640686035, + "logits/rejected": -2.3720927238464355, + "logps/chosen": -20.160160064697266, + "logps/rejected": -66.42484283447266, + "loss": 0.4924, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.031359124928712845, + "rewards/margins": 0.03116660751402378, + "rewards/rejected": 0.00019251916091889143, + "step": 100 + }, + { + "epoch": 0.027469097265576226, + "grad_norm": 0.33203125, + "learning_rate": 1.3715710723192023e-06, + "logits/chosen": -2.1676554679870605, + "logits/rejected": -2.389533758163452, + "logps/chosen": -17.833478927612305, + "logps/rejected": -60.63257598876953, + "loss": 0.4879, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04782567545771599, + "rewards/margins": 0.05032258480787277, + "rewards/rejected": -0.002496910747140646, + "step": 110 + }, + { + "epoch": 0.029966287926083156, + "grad_norm": 0.68359375, + "learning_rate": 1.4962593516209478e-06, + "logits/chosen": -2.1279516220092773, + "logits/rejected": -2.343705177307129, + "logps/chosen": -15.757919311523438, + "logps/rejected": -51.14020919799805, + "loss": 0.4836, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06653784960508347, + "rewards/margins": 0.06694493442773819, + "rewards/rejected": -0.0004070843569934368, + "step": 120 + }, + { + "epoch": 0.032463478586590086, + "grad_norm": 0.337890625, + "learning_rate": 1.6209476309226935e-06, + "logits/chosen": -2.3082690238952637, + "logits/rejected": -2.5344271659851074, + "logps/chosen": -12.95374870300293, + "logps/rejected": -53.89298629760742, + "loss": 0.4766, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09254685044288635, + "rewards/margins": 0.09660454094409943, + "rewards/rejected": -0.0040576886385679245, + "step": 130 + }, + { + "epoch": 0.034960669247097016, + "grad_norm": 0.291015625, + "learning_rate": 1.745635910224439e-06, + "logits/chosen": -2.1814446449279785, + "logits/rejected": -2.40262508392334, + "logps/chosen": -11.56260871887207, + "logps/rejected": -71.49890899658203, + "loss": 0.4714, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11319079250097275, + "rewards/margins": 0.11961270868778229, + "rewards/rejected": -0.006421914789825678, + "step": 140 + }, + { + "epoch": 0.037457859907603946, + "grad_norm": 0.28125, + "learning_rate": 1.8703241895261848e-06, + "logits/chosen": -2.2549407482147217, + "logits/rejected": -2.4583637714385986, + "logps/chosen": -8.707418441772461, + "logps/rejected": -56.646148681640625, + "loss": 0.4655, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.13278979063034058, + "rewards/margins": 0.14516989886760712, + "rewards/rejected": -0.012380105443298817, + "step": 150 + }, + { + "epoch": 0.03995505056811088, + "grad_norm": 0.26171875, + "learning_rate": 1.9950124688279305e-06, + "logits/chosen": -2.261176586151123, + "logits/rejected": -2.454853057861328, + "logps/chosen": -7.25634765625, + "logps/rejected": -62.16912841796875, + "loss": 0.4591, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14732162654399872, + "rewards/margins": 0.1813906729221344, + "rewards/rejected": -0.034069035202264786, + "step": 160 + }, + { + "epoch": 0.04245224122861781, + "grad_norm": 0.228515625, + "learning_rate": 2.119700748129676e-06, + "logits/chosen": -2.413883686065674, + "logits/rejected": -2.6421730518341064, + "logps/chosen": -5.5545244216918945, + "logps/rejected": -54.24146270751953, + "loss": 0.4528, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.16115576028823853, + "rewards/margins": 0.21780212223529816, + "rewards/rejected": -0.05664635822176933, + "step": 170 + }, + { + "epoch": 0.04494943188912474, + "grad_norm": 0.310546875, + "learning_rate": 2.2443890274314216e-06, + "logits/chosen": -2.123264789581299, + "logits/rejected": -2.3629353046417236, + "logps/chosen": -5.675574779510498, + "logps/rejected": -81.35579681396484, + "loss": 0.448, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.16601073741912842, + "rewards/margins": 0.2519921362400055, + "rewards/rejected": -0.08598136156797409, + "step": 180 + }, + { + "epoch": 0.04744662254963167, + "grad_norm": 0.2734375, + "learning_rate": 2.3690773067331675e-06, + "logits/chosen": -2.162355899810791, + "logits/rejected": -2.4037208557128906, + "logps/chosen": -4.741239547729492, + "logps/rejected": -69.67314147949219, + "loss": 0.4382, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.17391221225261688, + "rewards/margins": 0.32386231422424316, + "rewards/rejected": -0.14995010197162628, + "step": 190 + }, + { + "epoch": 0.04994381321013859, + "grad_norm": 0.326171875, + "learning_rate": 2.493765586034913e-06, + "logits/chosen": -2.232464551925659, + "logits/rejected": -2.461862087249756, + "logps/chosen": -4.306845664978027, + "logps/rejected": -70.49752807617188, + "loss": 0.429, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.18084710836410522, + "rewards/margins": 0.39341551065444946, + "rewards/rejected": -0.21256835758686066, + "step": 200 + }, + { + "epoch": 0.05244100387064552, + "grad_norm": 0.28515625, + "learning_rate": 2.6184538653366586e-06, + "logits/chosen": -2.2186341285705566, + "logits/rejected": -2.4293782711029053, + "logps/chosen": -2.813771963119507, + "logps/rejected": -77.77786254882812, + "loss": 0.4186, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.18635782599449158, + "rewards/margins": 0.4745180010795593, + "rewards/rejected": -0.28816017508506775, + "step": 210 + }, + { + "epoch": 0.05493819453115245, + "grad_norm": 0.279296875, + "learning_rate": 2.7431421446384045e-06, + "logits/chosen": -2.2114510536193848, + "logits/rejected": -2.423021078109741, + "logps/chosen": -2.7164266109466553, + "logps/rejected": -93.01399230957031, + "loss": 0.4086, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19216802716255188, + "rewards/margins": 0.5540723204612732, + "rewards/rejected": -0.3619043231010437, + "step": 220 + }, + { + "epoch": 0.05743538519165938, + "grad_norm": 0.30859375, + "learning_rate": 2.86783042394015e-06, + "logits/chosen": -2.2182869911193848, + "logits/rejected": -2.4157519340515137, + "logps/chosen": -2.1753125190734863, + "logps/rejected": -96.47676086425781, + "loss": 0.3976, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19363494217395782, + "rewards/margins": 0.6491508483886719, + "rewards/rejected": -0.45551595091819763, + "step": 230 + }, + { + "epoch": 0.05993257585216631, + "grad_norm": 0.41796875, + "learning_rate": 2.9925187032418956e-06, + "logits/chosen": -2.303800344467163, + "logits/rejected": -2.5223240852355957, + "logps/chosen": -2.2545647621154785, + "logps/rejected": -115.70625305175781, + "loss": 0.3757, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19811145961284637, + "rewards/margins": 0.8461275100708008, + "rewards/rejected": -0.6480159759521484, + "step": 240 + }, + { + "epoch": 0.06242976651267324, + "grad_norm": 0.408203125, + "learning_rate": 3.117206982543641e-06, + "logits/chosen": -2.2053685188293457, + "logits/rejected": -2.415367841720581, + "logps/chosen": -2.1990444660186768, + "logps/rejected": -140.34054565429688, + "loss": 0.3542, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19701093435287476, + "rewards/margins": 1.0766099691390991, + "rewards/rejected": -0.8795989751815796, + "step": 250 + }, + { + "epoch": 0.06492695717318017, + "grad_norm": 0.45703125, + "learning_rate": 3.241895261845387e-06, + "logits/chosen": -2.224290132522583, + "logits/rejected": -2.4391043186187744, + "logps/chosen": -1.894426941871643, + "logps/rejected": -191.0155029296875, + "loss": 0.3217, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19716738164424896, + "rewards/margins": 1.5208184719085693, + "rewards/rejected": -1.3236511945724487, + "step": 260 + }, + { + "epoch": 0.0674241478336871, + "grad_norm": 0.353515625, + "learning_rate": 3.3665835411471326e-06, + "logits/chosen": -2.1803958415985107, + "logits/rejected": -2.3852007389068604, + "logps/chosen": -2.2776474952697754, + "logps/rejected": -256.2982177734375, + "loss": 0.2905, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19975684583187103, + "rewards/margins": 2.1906659603118896, + "rewards/rejected": -1.990909218788147, + "step": 270 + }, + { + "epoch": 0.06992133849419403, + "grad_norm": 0.1494140625, + "learning_rate": 3.491271820448878e-06, + "logits/chosen": -2.089259624481201, + "logits/rejected": -2.2738101482391357, + "logps/chosen": -3.7932281494140625, + "logps/rejected": -315.3883361816406, + "loss": 0.2858, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19718244671821594, + "rewards/margins": 2.653756856918335, + "rewards/rejected": -2.4565746784210205, + "step": 280 + }, + { + "epoch": 0.07241852915470096, + "grad_norm": 0.138671875, + "learning_rate": 3.615960099750624e-06, + "logits/chosen": -2.136627674102783, + "logits/rejected": -2.336648941040039, + "logps/chosen": -2.27809476852417, + "logps/rejected": -309.0271911621094, + "loss": 0.281, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19811423122882843, + "rewards/margins": 2.7192320823669434, + "rewards/rejected": -2.521117687225342, + "step": 290 + }, + { + "epoch": 0.07491571981520789, + "grad_norm": 0.1826171875, + "learning_rate": 3.7406483790523696e-06, + "logits/chosen": -2.1298162937164307, + "logits/rejected": -2.3403031826019287, + "logps/chosen": -2.7181735038757324, + "logps/rejected": -379.2640075683594, + "loss": 0.2649, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1967845857143402, + "rewards/margins": 3.449932813644409, + "rewards/rejected": -3.253148317337036, + "step": 300 + }, + { + "epoch": 0.07741291047571482, + "grad_norm": 0.189453125, + "learning_rate": 3.8653366583541155e-06, + "logits/chosen": -2.0690829753875732, + "logits/rejected": -2.240788459777832, + "logps/chosen": -2.222135066986084, + "logps/rejected": -404.05157470703125, + "loss": 0.2741, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19702208042144775, + "rewards/margins": 3.7276394367218018, + "rewards/rejected": -3.5306174755096436, + "step": 310 + }, + { + "epoch": 0.07991010113622175, + "grad_norm": 0.16796875, + "learning_rate": 3.990024937655861e-06, + "logits/chosen": -2.0671050548553467, + "logits/rejected": -2.24275279045105, + "logps/chosen": -2.2376856803894043, + "logps/rejected": -507.495849609375, + "loss": 0.2612, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1967649608850479, + "rewards/margins": 4.618912696838379, + "rewards/rejected": -4.422147750854492, + "step": 320 + }, + { + "epoch": 0.08240729179672868, + "grad_norm": 0.1611328125, + "learning_rate": 4.114713216957607e-06, + "logits/chosen": -2.137000560760498, + "logits/rejected": -2.287095546722412, + "logps/chosen": -2.6727747917175293, + "logps/rejected": -397.1515808105469, + "loss": 0.2652, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1976507008075714, + "rewards/margins": 3.678623914718628, + "rewards/rejected": -3.480973482131958, + "step": 330 + }, + { + "epoch": 0.08490448245723561, + "grad_norm": 0.21484375, + "learning_rate": 4.239401496259352e-06, + "logits/chosen": -2.0656325817108154, + "logits/rejected": -2.2314834594726562, + "logps/chosen": -2.123012065887451, + "logps/rejected": -494.6885681152344, + "loss": 0.2573, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19615662097930908, + "rewards/margins": 4.528371810913086, + "rewards/rejected": -4.332215309143066, + "step": 340 + }, + { + "epoch": 0.08740167311774254, + "grad_norm": 0.1259765625, + "learning_rate": 4.364089775561098e-06, + "logits/chosen": -2.1637234687805176, + "logits/rejected": -2.3083388805389404, + "logps/chosen": -2.9447762966156006, + "logps/rejected": -453.163330078125, + "loss": 0.264, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.18918542563915253, + "rewards/margins": 4.223211288452148, + "rewards/rejected": -4.034026145935059, + "step": 350 + }, + { + "epoch": 0.08989886377824947, + "grad_norm": 0.376953125, + "learning_rate": 4.488778054862843e-06, + "logits/chosen": -2.1501951217651367, + "logits/rejected": -2.341325521469116, + "logps/chosen": -4.00003719329834, + "logps/rejected": -510.6114196777344, + "loss": 0.2492, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.18887588381767273, + "rewards/margins": 4.7241339683532715, + "rewards/rejected": -4.5352582931518555, + "step": 360 + }, + { + "epoch": 0.0923960544387564, + "grad_norm": 0.228515625, + "learning_rate": 4.6134663341645895e-06, + "logits/chosen": -2.152017593383789, + "logits/rejected": -2.326498508453369, + "logps/chosen": -3.2789077758789062, + "logps/rejected": -488.865966796875, + "loss": 0.2472, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19166019558906555, + "rewards/margins": 4.529562473297119, + "rewards/rejected": -4.337902069091797, + "step": 370 + }, + { + "epoch": 0.09489324509926333, + "grad_norm": 0.546875, + "learning_rate": 4.738154613466335e-06, + "logits/chosen": -2.0966598987579346, + "logits/rejected": -2.3076987266540527, + "logps/chosen": -3.7783362865448, + "logps/rejected": -743.3594970703125, + "loss": 0.2398, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.18687334656715393, + "rewards/margins": 6.980570316314697, + "rewards/rejected": -6.793696403503418, + "step": 380 + }, + { + "epoch": 0.09739043575977026, + "grad_norm": 0.1982421875, + "learning_rate": 4.862842892768081e-06, + "logits/chosen": -2.1418652534484863, + "logits/rejected": -2.30336332321167, + "logps/chosen": -2.9560298919677734, + "logps/rejected": -607.9320068359375, + "loss": 0.2388, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.18996365368366241, + "rewards/margins": 5.754693031311035, + "rewards/rejected": -5.564728736877441, + "step": 390 + }, + { + "epoch": 0.09988762642027718, + "grad_norm": 0.1630859375, + "learning_rate": 4.987531172069826e-06, + "logits/chosen": -2.0703442096710205, + "logits/rejected": -2.2270889282226562, + "logps/chosen": -2.578680992126465, + "logps/rejected": -683.11083984375, + "loss": 0.2415, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19460181891918182, + "rewards/margins": 6.500932216644287, + "rewards/rejected": -6.30633020401001, + "step": 400 + }, + { + "epoch": 0.10238481708078412, + "grad_norm": 0.2021484375, + "learning_rate": 4.999923022460671e-06, + "logits/chosen": -2.0380523204803467, + "logits/rejected": -2.2315127849578857, + "logps/chosen": -4.086075782775879, + "logps/rejected": -833.37255859375, + "loss": 0.2328, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19286975264549255, + "rewards/margins": 7.908270835876465, + "rewards/rejected": -7.715400695800781, + "step": 410 + }, + { + "epoch": 0.10488200774129104, + "grad_norm": 0.248046875, + "learning_rate": 4.999656933348981e-06, + "logits/chosen": -2.244335174560547, + "logits/rejected": -2.4024062156677246, + "logps/chosen": -2.923116445541382, + "logps/rejected": -593.464599609375, + "loss": 0.241, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19478723406791687, + "rewards/margins": 5.600838661193848, + "rewards/rejected": -5.4060516357421875, + "step": 420 + }, + { + "epoch": 0.10737919840179798, + "grad_norm": 0.359375, + "learning_rate": 4.99920080255011e-06, + "logits/chosen": -2.077357769012451, + "logits/rejected": -2.282799243927002, + "logps/chosen": -2.9383771419525146, + "logps/rejected": -852.4064331054688, + "loss": 0.231, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19819210469722748, + "rewards/margins": 8.028984069824219, + "rewards/rejected": -7.830792427062988, + "step": 430 + }, + { + "epoch": 0.1098763890623049, + "grad_norm": 0.171875, + "learning_rate": 4.998554664742362e-06, + "logits/chosen": -2.148183822631836, + "logits/rejected": -2.3020401000976562, + "logps/chosen": -1.9824367761611938, + "logps/rejected": -745.6473999023438, + "loss": 0.2322, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1946963667869568, + "rewards/margins": 7.165565490722656, + "rewards/rejected": -6.9708685874938965, + "step": 440 + }, + { + "epoch": 0.11237357972281184, + "grad_norm": 0.13671875, + "learning_rate": 4.997718569049726e-06, + "logits/chosen": -2.094149351119995, + "logits/rejected": -2.2727301120758057, + "logps/chosen": -3.559483051300049, + "logps/rejected": -817.2952270507812, + "loss": 0.2319, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19730597734451294, + "rewards/margins": 7.785311222076416, + "rewards/rejected": -7.588005065917969, + "step": 450 + }, + { + "epoch": 0.11487077038331876, + "grad_norm": 0.1171875, + "learning_rate": 4.9966925790381404e-06, + "logits/chosen": -2.1491434574127197, + "logits/rejected": -2.301217555999756, + "logps/chosen": -1.5461114645004272, + "logps/rejected": -810.7796020507812, + "loss": 0.2326, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19482001662254333, + "rewards/margins": 7.743639945983887, + "rewards/rejected": -7.548819541931152, + "step": 460 + }, + { + "epoch": 0.1173679610438257, + "grad_norm": 0.1435546875, + "learning_rate": 4.995476772710657e-06, + "logits/chosen": -2.1041364669799805, + "logits/rejected": -2.3101038932800293, + "logps/chosen": -3.1227645874023438, + "logps/rejected": -963.2913208007812, + "loss": 0.2321, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1952921450138092, + "rewards/margins": 9.208600044250488, + "rewards/rejected": -9.013307571411133, + "step": 470 + }, + { + "epoch": 0.11986515170433262, + "grad_norm": 0.08544921875, + "learning_rate": 4.994071242501516e-06, + "logits/chosen": -2.1944689750671387, + "logits/rejected": -2.371983051300049, + "logps/chosen": -2.822134494781494, + "logps/rejected": -869.8029174804688, + "loss": 0.2298, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19522327184677124, + "rewards/margins": 8.3977632522583, + "rewards/rejected": -8.20253849029541, + "step": 480 + }, + { + "epoch": 0.12236234236483956, + "grad_norm": 0.0634765625, + "learning_rate": 4.992476095269112e-06, + "logits/chosen": -2.2050843238830566, + "logits/rejected": -2.3897545337677, + "logps/chosen": -1.4868861436843872, + "logps/rejected": -922.6173095703125, + "loss": 0.2305, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2005070000886917, + "rewards/margins": 8.763871192932129, + "rewards/rejected": -8.563364028930664, + "step": 490 + }, + { + "epoch": 0.12485953302534648, + "grad_norm": 0.369140625, + "learning_rate": 4.990691452287877e-06, + "logits/chosen": -2.042813777923584, + "logits/rejected": -2.213289976119995, + "logps/chosen": -2.393306016921997, + "logps/rejected": -886.4241943359375, + "loss": 0.2303, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20110133290290833, + "rewards/margins": 8.474591255187988, + "rewards/rejected": -8.273489952087402, + "step": 500 + }, + { + "epoch": 0.1273567236858534, + "grad_norm": 0.3984375, + "learning_rate": 4.988717449239056e-06, + "logits/chosen": -2.093723773956299, + "logits/rejected": -2.2634453773498535, + "logps/chosen": -1.9311176538467407, + "logps/rejected": -851.02734375, + "loss": 0.2347, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19648316502571106, + "rewards/margins": 8.190296173095703, + "rewards/rejected": -7.993813991546631, + "step": 510 + }, + { + "epoch": 0.12985391434636034, + "grad_norm": 0.1630859375, + "learning_rate": 4.98655423620039e-06, + "logits/chosen": -2.1161797046661377, + "logits/rejected": -2.3049392700195312, + "logps/chosen": -1.9681230783462524, + "logps/rejected": -963.2742919921875, + "loss": 0.2275, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20181334018707275, + "rewards/margins": 9.243757247924805, + "rewards/rejected": -9.041942596435547, + "step": 520 + }, + { + "epoch": 0.13235110500686728, + "grad_norm": 0.07373046875, + "learning_rate": 4.984201977634711e-06, + "logits/chosen": -2.223388195037842, + "logits/rejected": -2.4297728538513184, + "logps/chosen": -2.4097044467926025, + "logps/rejected": -1106.8994140625, + "loss": 0.2278, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2010197937488556, + "rewards/margins": 10.71354866027832, + "rewards/rejected": -10.512530326843262, + "step": 530 + }, + { + "epoch": 0.1348482956673742, + "grad_norm": 0.11279296875, + "learning_rate": 4.9816608523774345e-06, + "logits/chosen": -2.119506359100342, + "logits/rejected": -2.305849552154541, + "logps/chosen": -2.257546901702881, + "logps/rejected": -930.5267333984375, + "loss": 0.2306, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19777485728263855, + "rewards/margins": 8.905478477478027, + "rewards/rejected": -8.707704544067383, + "step": 540 + }, + { + "epoch": 0.13734548632788113, + "grad_norm": 0.07470703125, + "learning_rate": 4.978931053622964e-06, + "logits/chosen": -2.1544103622436523, + "logits/rejected": -2.354814052581787, + "logps/chosen": -1.3565616607666016, + "logps/rejected": -950.23681640625, + "loss": 0.2307, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2015368640422821, + "rewards/margins": 9.141637802124023, + "rewards/rejected": -8.940099716186523, + "step": 550 + }, + { + "epoch": 0.13984267698838806, + "grad_norm": 0.035400390625, + "learning_rate": 4.9760127889100044e-06, + "logits/chosen": -2.1699581146240234, + "logits/rejected": -2.3422303199768066, + "logps/chosen": -1.4560916423797607, + "logps/rejected": -1047.3670654296875, + "loss": 0.23, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19995173811912537, + "rewards/margins": 10.138322830200195, + "rewards/rejected": -9.938371658325195, + "step": 560 + }, + { + "epoch": 0.142339867648895, + "grad_norm": 0.11328125, + "learning_rate": 4.972906280105781e-06, + "logits/chosen": -2.0392138957977295, + "logits/rejected": -2.2401204109191895, + "logps/chosen": -2.1844277381896973, + "logps/rejected": -998.3021240234375, + "loss": 0.2281, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20644374191761017, + "rewards/margins": 9.561029434204102, + "rewards/rejected": -9.354585647583008, + "step": 570 + }, + { + "epoch": 0.1448370583094019, + "grad_norm": 0.0625, + "learning_rate": 4.969611763389175e-06, + "logits/chosen": -2.2010245323181152, + "logits/rejected": -2.3933498859405518, + "logps/chosen": -2.1393237113952637, + "logps/rejected": -925.5234375, + "loss": 0.2289, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1972513645887375, + "rewards/margins": 8.949918746948242, + "rewards/rejected": -8.752666473388672, + "step": 580 + }, + { + "epoch": 0.14733424896990885, + "grad_norm": 0.0751953125, + "learning_rate": 4.966129489232762e-06, + "logits/chosen": -2.1333353519439697, + "logits/rejected": -2.3556675910949707, + "logps/chosen": -2.2460904121398926, + "logps/rejected": -1139.327392578125, + "loss": 0.226, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20897097885608673, + "rewards/margins": 10.926295280456543, + "rewards/rejected": -10.717325210571289, + "step": 590 + }, + { + "epoch": 0.14983143963041579, + "grad_norm": 0.12890625, + "learning_rate": 4.962459722383775e-06, + "logits/chosen": -2.095088243484497, + "logits/rejected": -2.2931671142578125, + "logps/chosen": -2.7135472297668457, + "logps/rejected": -1181.6075439453125, + "loss": 0.2292, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20306305587291718, + "rewards/margins": 11.351381301879883, + "rewards/rejected": -11.148316383361816, + "step": 600 + }, + { + "epoch": 0.15232863029092272, + "grad_norm": 0.0303955078125, + "learning_rate": 4.958602741843975e-06, + "logits/chosen": -2.0957350730895996, + "logits/rejected": -2.3226089477539062, + "logps/chosen": -2.8655078411102295, + "logps/rejected": -1118.1968994140625, + "loss": 0.2277, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19633761048316956, + "rewards/margins": 10.744343757629395, + "rewards/rejected": -10.548004150390625, + "step": 610 + }, + { + "epoch": 0.15482582095142963, + "grad_norm": 0.107421875, + "learning_rate": 4.954558840848437e-06, + "logits/chosen": -2.211951494216919, + "logits/rejected": -2.3932459354400635, + "logps/chosen": -1.5332846641540527, + "logps/rejected": -932.4984130859375, + "loss": 0.2285, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20827274024486542, + "rewards/margins": 9.049389839172363, + "rewards/rejected": -8.841116905212402, + "step": 620 + }, + { + "epoch": 0.15732301161193657, + "grad_norm": 0.052978515625, + "learning_rate": 4.950328326843258e-06, + "logits/chosen": -2.073488712310791, + "logits/rejected": -2.2822651863098145, + "logps/chosen": -0.9946017265319824, + "logps/rejected": -1086.56689453125, + "loss": 0.2291, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20773670077323914, + "rewards/margins": 10.345720291137695, + "rewards/rejected": -10.137983322143555, + "step": 630 + }, + { + "epoch": 0.1598202022724435, + "grad_norm": 0.08203125, + "learning_rate": 4.945911521462182e-06, + "logits/chosen": -2.2225770950317383, + "logits/rejected": -2.412863254547119, + "logps/chosen": -1.7764488458633423, + "logps/rejected": -1141.6427001953125, + "loss": 0.2286, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20446841418743134, + "rewards/margins": 11.000048637390137, + "rewards/rejected": -10.79557991027832, + "step": 640 + }, + { + "epoch": 0.16231739293295044, + "grad_norm": 0.16796875, + "learning_rate": 4.941308760502149e-06, + "logits/chosen": -2.211944341659546, + "logits/rejected": -2.371511697769165, + "logps/chosen": -2.542166233062744, + "logps/rejected": -972.3176879882812, + "loss": 0.2319, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.200990229845047, + "rewards/margins": 9.126736640930176, + "rewards/rejected": -8.925745964050293, + "step": 650 + }, + { + "epoch": 0.16481458359345735, + "grad_norm": 0.134765625, + "learning_rate": 4.936520393897762e-06, + "logits/chosen": -2.1870148181915283, + "logits/rejected": -2.4076366424560547, + "logps/chosen": -2.055567979812622, + "logps/rejected": -1019.7349853515625, + "loss": 0.2287, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21482977271080017, + "rewards/margins": 9.75967788696289, + "rewards/rejected": -9.54484748840332, + "step": 660 + }, + { + "epoch": 0.1673117742539643, + "grad_norm": 0.042724609375, + "learning_rate": 4.931546785694684e-06, + "logits/chosen": -2.207019090652466, + "logits/rejected": -2.411149740219116, + "logps/chosen": -1.447061538696289, + "logps/rejected": -1274.262451171875, + "loss": 0.2264, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2061387598514557, + "rewards/margins": 12.430362701416016, + "rewards/rejected": -12.224225044250488, + "step": 670 + }, + { + "epoch": 0.16980896491447123, + "grad_norm": 0.031982421875, + "learning_rate": 4.926388314021964e-06, + "logits/chosen": -2.245506763458252, + "logits/rejected": -2.439272403717041, + "logps/chosen": -1.3953222036361694, + "logps/rejected": -1066.398193359375, + "loss": 0.2262, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.210698202252388, + "rewards/margins": 10.378253936767578, + "rewards/rejected": -10.167555809020996, + "step": 680 + }, + { + "epoch": 0.17230615557497814, + "grad_norm": 0.049560546875, + "learning_rate": 4.921045371063283e-06, + "logits/chosen": -2.235975980758667, + "logits/rejected": -2.42988920211792, + "logps/chosen": -0.8631747961044312, + "logps/rejected": -1208.173095703125, + "loss": 0.2262, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2110184133052826, + "rewards/margins": 11.736184120178223, + "rewards/rejected": -11.525165557861328, + "step": 690 + }, + { + "epoch": 0.17480334623548507, + "grad_norm": 0.049072265625, + "learning_rate": 4.915518363027142e-06, + "logits/chosen": -2.29992938041687, + "logits/rejected": -2.4797685146331787, + "logps/chosen": -0.5947138667106628, + "logps/rejected": -1052.22216796875, + "loss": 0.2272, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2057635486125946, + "rewards/margins": 10.251365661621094, + "rewards/rejected": -10.045602798461914, + "step": 700 + }, + { + "epoch": 0.177300536895992, + "grad_norm": 0.0625, + "learning_rate": 4.909807710115977e-06, + "logits/chosen": -2.0681312084198, + "logits/rejected": -2.245760202407837, + "logps/chosen": -1.667133092880249, + "logps/rejected": -1234.741943359375, + "loss": 0.2287, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19957289099693298, + "rewards/margins": 12.013802528381348, + "rewards/rejected": -11.814229011535645, + "step": 710 + }, + { + "epoch": 0.17979772755649895, + "grad_norm": 0.064453125, + "learning_rate": 4.903913846494211e-06, + "logits/chosen": -2.0854830741882324, + "logits/rejected": -2.318626880645752, + "logps/chosen": -1.4859822988510132, + "logps/rejected": -1401.390625, + "loss": 0.2247, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21390756964683533, + "rewards/margins": 13.512557983398438, + "rewards/rejected": -13.298650741577148, + "step": 720 + }, + { + "epoch": 0.18229491821700586, + "grad_norm": 0.049072265625, + "learning_rate": 4.897837220255251e-06, + "logits/chosen": -2.105733633041382, + "logits/rejected": -2.273578643798828, + "logps/chosen": -1.5127496719360352, + "logps/rejected": -1189.6934814453125, + "loss": 0.2282, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21006134152412415, + "rewards/margins": 11.488363265991211, + "rewards/rejected": -11.278302192687988, + "step": 730 + }, + { + "epoch": 0.1847921088775128, + "grad_norm": 0.06982421875, + "learning_rate": 4.891578293387413e-06, + "logits/chosen": -2.1760973930358887, + "logits/rejected": -2.3570103645324707, + "logps/chosen": -1.769789695739746, + "logps/rejected": -1201.271240234375, + "loss": 0.2279, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20501787960529327, + "rewards/margins": 11.696678161621094, + "rewards/rejected": -11.491661071777344, + "step": 740 + }, + { + "epoch": 0.18728929953801973, + "grad_norm": 0.031982421875, + "learning_rate": 4.885137541738808e-06, + "logits/chosen": -2.141007423400879, + "logits/rejected": -2.313952922821045, + "logps/chosen": -0.702928900718689, + "logps/rejected": -1086.88330078125, + "loss": 0.227, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20619484782218933, + "rewards/margins": 10.44408893585205, + "rewards/rejected": -10.237894058227539, + "step": 750 + }, + { + "epoch": 0.18978649019852667, + "grad_norm": 0.09033203125, + "learning_rate": 4.878515454981153e-06, + "logits/chosen": -2.0163445472717285, + "logits/rejected": -2.219290256500244, + "logps/chosen": -1.4322102069854736, + "logps/rejected": -1299.561767578125, + "loss": 0.2251, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20874173939228058, + "rewards/margins": 12.519464492797852, + "rewards/rejected": -12.310722351074219, + "step": 760 + }, + { + "epoch": 0.19228368085903358, + "grad_norm": 0.0654296875, + "learning_rate": 4.8717125365725545e-06, + "logits/chosen": -2.2308189868927, + "logits/rejected": -2.3827383518218994, + "logps/chosen": -1.321045160293579, + "logps/rejected": -954.9481201171875, + "loss": 0.2298, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2083440124988556, + "rewards/margins": 9.28177547454834, + "rewards/rejected": -9.073431015014648, + "step": 770 + }, + { + "epoch": 0.19478087151954052, + "grad_norm": 0.05029296875, + "learning_rate": 4.864729303719221e-06, + "logits/chosen": -2.1831257343292236, + "logits/rejected": -2.386863946914673, + "logps/chosen": -1.462869644165039, + "logps/rejected": -1309.128662109375, + "loss": 0.2249, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21388690173625946, + "rewards/margins": 12.65107250213623, + "rewards/rejected": -12.437185287475586, + "step": 780 + }, + { + "epoch": 0.19727806218004745, + "grad_norm": 0.1259765625, + "learning_rate": 4.857566287336152e-06, + "logits/chosen": -2.125136375427246, + "logits/rejected": -2.3306586742401123, + "logps/chosen": -1.5712594985961914, + "logps/rejected": -1211.277587890625, + "loss": 0.2289, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21211902797222137, + "rewards/margins": 11.6867094039917, + "rewards/rejected": -11.474590301513672, + "step": 790 + }, + { + "epoch": 0.19977525284055436, + "grad_norm": 0.11376953125, + "learning_rate": 4.850224032006765e-06, + "logits/chosen": -2.226292610168457, + "logits/rejected": -2.4260332584381104, + "logps/chosen": -1.096842885017395, + "logps/rejected": -1190.5208740234375, + "loss": 0.2266, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21329161524772644, + "rewards/margins": 11.604973793029785, + "rewards/rejected": -11.391681671142578, + "step": 800 + }, + { + "epoch": 0.2022724435010613, + "grad_norm": 0.080078125, + "learning_rate": 4.8427030959414984e-06, + "logits/chosen": -2.0340332984924316, + "logits/rejected": -2.239582061767578, + "logps/chosen": -1.4298118352890015, + "logps/rejected": -1246.587158203125, + "loss": 0.2286, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.203706294298172, + "rewards/margins": 12.136808395385742, + "rewards/rejected": -11.933099746704102, + "step": 810 + }, + { + "epoch": 0.20476963416156824, + "grad_norm": 0.0400390625, + "learning_rate": 4.835004050935369e-06, + "logits/chosen": -2.142270803451538, + "logits/rejected": -2.3261685371398926, + "logps/chosen": -2.205761432647705, + "logps/rejected": -1209.187744140625, + "loss": 0.2294, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21324896812438965, + "rewards/margins": 11.711974143981934, + "rewards/rejected": -11.498725891113281, + "step": 820 + }, + { + "epoch": 0.20726682482207517, + "grad_norm": 0.0257568359375, + "learning_rate": 4.8271274823245e-06, + "logits/chosen": -2.130068778991699, + "logits/rejected": -2.303924083709717, + "logps/chosen": -1.5450295209884644, + "logps/rejected": -1218.6636962890625, + "loss": 0.2285, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2086503505706787, + "rewards/margins": 11.845584869384766, + "rewards/rejected": -11.636935234069824, + "step": 830 + }, + { + "epoch": 0.20976401548258208, + "grad_norm": 0.0791015625, + "learning_rate": 4.8190739889416264e-06, + "logits/chosen": -2.1227643489837646, + "logits/rejected": -2.3156332969665527, + "logps/chosen": -1.4759693145751953, + "logps/rejected": -1314.2388916015625, + "loss": 0.226, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21140392124652863, + "rewards/margins": 12.794939994812012, + "rewards/rejected": -12.583536148071289, + "step": 840 + }, + { + "epoch": 0.21226120614308902, + "grad_norm": 0.0264892578125, + "learning_rate": 4.810844183070553e-06, + "logits/chosen": -2.2195773124694824, + "logits/rejected": -2.416642665863037, + "logps/chosen": -1.3944060802459717, + "logps/rejected": -1100.637939453125, + "loss": 0.2267, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20567412674427032, + "rewards/margins": 10.635955810546875, + "rewards/rejected": -10.430280685424805, + "step": 850 + }, + { + "epoch": 0.21475839680359596, + "grad_norm": 0.05712890625, + "learning_rate": 4.802438690399622e-06, + "logits/chosen": -2.170403480529785, + "logits/rejected": -2.3731253147125244, + "logps/chosen": -0.7113627195358276, + "logps/rejected": -1192.8896484375, + "loss": 0.2279, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20982804894447327, + "rewards/margins": 11.532899856567383, + "rewards/rejected": -11.32307243347168, + "step": 860 + }, + { + "epoch": 0.2172555874641029, + "grad_norm": 0.06201171875, + "learning_rate": 4.793858149974129e-06, + "logits/chosen": -2.134357452392578, + "logits/rejected": -2.3488316535949707, + "logps/chosen": -1.1498069763183594, + "logps/rejected": -1405.57177734375, + "loss": 0.2269, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2079639434814453, + "rewards/margins": 13.713908195495605, + "rewards/rejected": -13.505943298339844, + "step": 870 + }, + { + "epoch": 0.2197527781246098, + "grad_norm": 0.03857421875, + "learning_rate": 4.785103214147747e-06, + "logits/chosen": -2.244509220123291, + "logits/rejected": -2.446852445602417, + "logps/chosen": -1.082582950592041, + "logps/rejected": -1192.0093994140625, + "loss": 0.2264, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20774182677268982, + "rewards/margins": 11.592524528503418, + "rewards/rejected": -11.384782791137695, + "step": 880 + }, + { + "epoch": 0.22224996878511674, + "grad_norm": 0.0118408203125, + "learning_rate": 4.776174548532926e-06, + "logits/chosen": -2.1576988697052, + "logits/rejected": -2.3463644981384277, + "logps/chosen": -1.1917221546173096, + "logps/rejected": -1265.5885009765625, + "loss": 0.2264, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20792751014232635, + "rewards/margins": 12.278467178344727, + "rewards/rejected": -12.070539474487305, + "step": 890 + }, + { + "epoch": 0.22474715944562368, + "grad_norm": 0.060546875, + "learning_rate": 4.767072831950288e-06, + "logits/chosen": -2.2008862495422363, + "logits/rejected": -2.402891159057617, + "logps/chosen": -1.2017600536346436, + "logps/rejected": -1313.045654296875, + "loss": 0.2255, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2119072675704956, + "rewards/margins": 12.807563781738281, + "rewards/rejected": -12.59565544128418, + "step": 900 + }, + { + "epoch": 0.22724435010613062, + "grad_norm": 0.091796875, + "learning_rate": 4.7577987563770226e-06, + "logits/chosen": -2.1067652702331543, + "logits/rejected": -2.324591875076294, + "logps/chosen": -2.000681161880493, + "logps/rejected": -1264.68115234375, + "loss": 0.228, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2102380096912384, + "rewards/margins": 12.193601608276367, + "rewards/rejected": -11.983363151550293, + "step": 910 + }, + { + "epoch": 0.22974154076663753, + "grad_norm": 0.059814453125, + "learning_rate": 4.748353026894273e-06, + "logits/chosen": -2.1624951362609863, + "logits/rejected": -2.3448517322540283, + "logps/chosen": -1.4960781335830688, + "logps/rejected": -1188.14990234375, + "loss": 0.2268, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2139265537261963, + "rewards/margins": 11.510043144226074, + "rewards/rejected": -11.29611587524414, + "step": 920 + }, + { + "epoch": 0.23223873142714446, + "grad_norm": 0.080078125, + "learning_rate": 4.738736361633532e-06, + "logits/chosen": -2.25258207321167, + "logits/rejected": -2.4271512031555176, + "logps/chosen": -1.7973697185516357, + "logps/rejected": -1126.24267578125, + "loss": 0.228, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20842309296131134, + "rewards/margins": 10.903474807739258, + "rewards/rejected": -10.695051193237305, + "step": 930 + }, + { + "epoch": 0.2347359220876514, + "grad_norm": 0.06103515625, + "learning_rate": 4.728949491722046e-06, + "logits/chosen": -2.274840831756592, + "logits/rejected": -2.4521872997283936, + "logps/chosen": -0.652289092540741, + "logps/rejected": -1062.56494140625, + "loss": 0.2295, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20718173682689667, + "rewards/margins": 10.335628509521484, + "rewards/rejected": -10.128446578979492, + "step": 940 + }, + { + "epoch": 0.2372331127481583, + "grad_norm": 0.0751953125, + "learning_rate": 4.718993161227231e-06, + "logits/chosen": -2.172180414199829, + "logits/rejected": -2.4125022888183594, + "logps/chosen": -1.2400215864181519, + "logps/rejected": -1376.037841796875, + "loss": 0.2256, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21632233262062073, + "rewards/margins": 13.414273262023926, + "rewards/rejected": -13.197952270507812, + "step": 950 + }, + { + "epoch": 0.23973030340866525, + "grad_norm": 0.00982666015625, + "learning_rate": 4.708868127100098e-06, + "logits/chosen": -2.2069010734558105, + "logits/rejected": -2.3836076259613037, + "logps/chosen": -0.6828838586807251, + "logps/rejected": -1159.0107421875, + "loss": 0.2269, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20559605956077576, + "rewards/margins": 11.286005973815918, + "rewards/rejected": -11.080410957336426, + "step": 960 + }, + { + "epoch": 0.24222749406917218, + "grad_norm": 0.08740234375, + "learning_rate": 4.6985751591177075e-06, + "logits/chosen": -2.0572152137756348, + "logits/rejected": -2.2502310276031494, + "logps/chosen": -1.7850786447525024, + "logps/rejected": -1321.8499755859375, + "loss": 0.2266, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.212154358625412, + "rewards/margins": 12.830732345581055, + "rewards/rejected": -12.618578910827637, + "step": 970 + }, + { + "epoch": 0.24472468472967912, + "grad_norm": 0.09716796875, + "learning_rate": 4.688115039824648e-06, + "logits/chosen": -2.1182241439819336, + "logits/rejected": -2.292884349822998, + "logps/chosen": -0.9138596653938293, + "logps/rejected": -1220.1195068359375, + "loss": 0.2269, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2072029858827591, + "rewards/margins": 11.845842361450195, + "rewards/rejected": -11.638639450073242, + "step": 980 + }, + { + "epoch": 0.24722187539018603, + "grad_norm": 0.1005859375, + "learning_rate": 4.677488564473535e-06, + "logits/chosen": -2.076742649078369, + "logits/rejected": -2.280050754547119, + "logps/chosen": -2.1341259479522705, + "logps/rejected": -1361.389404296875, + "loss": 0.2274, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20143508911132812, + "rewards/margins": 13.227249145507812, + "rewards/rejected": -13.0258150100708, + "step": 990 + }, + { + "epoch": 0.24971906605069297, + "grad_norm": 0.15625, + "learning_rate": 4.666696540964556e-06, + "logits/chosen": -2.205030918121338, + "logits/rejected": -2.380605697631836, + "logps/chosen": -1.0865452289581299, + "logps/rejected": -1183.8802490234375, + "loss": 0.2255, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21310412883758545, + "rewards/margins": 11.559179306030273, + "rewards/rejected": -11.346075057983398, + "step": 1000 + }, + { + "epoch": 0.24971906605069297, + "eval_logits/chosen": -2.551421880722046, + "eval_logits/rejected": -2.637223482131958, + "eval_logps/chosen": -0.39880600571632385, + "eval_logps/rejected": -585.1870727539062, + "eval_loss": 0.22298085689544678, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 0.25514695048332214, + "eval_rewards/margins": 5.658298015594482, + "eval_rewards/rejected": -5.403151035308838, + "eval_runtime": 0.6597, + "eval_samples_per_second": 7.579, + "eval_steps_per_second": 4.548, + "step": 1000 + }, + { + "epoch": 0.2522162567111999, + "grad_norm": 0.0361328125, + "learning_rate": 4.6557397897840454e-06, + "logits/chosen": -2.226627826690674, + "logits/rejected": -2.434197187423706, + "logps/chosen": -1.4807536602020264, + "logps/rejected": -1233.5753173828125, + "loss": 0.2295, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21030649542808533, + "rewards/margins": 11.924067497253418, + "rewards/rejected": -11.713762283325195, + "step": 1010 + }, + { + "epoch": 0.2547134473717068, + "grad_norm": 0.0311279296875, + "learning_rate": 4.644619143942108e-06, + "logits/chosen": -2.1962525844573975, + "logits/rejected": -2.418130397796631, + "logps/chosen": -1.2743520736694336, + "logps/rejected": -1324.01123046875, + "loss": 0.225, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2114056795835495, + "rewards/margins": 12.735904693603516, + "rewards/rejected": -12.524497985839844, + "step": 1020 + }, + { + "epoch": 0.2572106380322138, + "grad_norm": 0.1162109375, + "learning_rate": 4.633335448909284e-06, + "logits/chosen": -2.0575506687164307, + "logits/rejected": -2.2430522441864014, + "logps/chosen": -1.6322782039642334, + "logps/rejected": -1251.030029296875, + "loss": 0.2257, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21490998566150665, + "rewards/margins": 12.10401725769043, + "rewards/rejected": -11.889106750488281, + "step": 1030 + }, + { + "epoch": 0.2597078286927207, + "grad_norm": 0.10400390625, + "learning_rate": 4.621889562552272e-06, + "logits/chosen": -2.1623690128326416, + "logits/rejected": -2.387530565261841, + "logps/chosen": -1.5265319347381592, + "logps/rejected": -1406.755615234375, + "loss": 0.2268, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21330364048480988, + "rewards/margins": 13.666200637817383, + "rewards/rejected": -13.452896118164062, + "step": 1040 + }, + { + "epoch": 0.2622050193532276, + "grad_norm": 0.134765625, + "learning_rate": 4.610282355068707e-06, + "logits/chosen": -2.265820264816284, + "logits/rejected": -2.481659412384033, + "logps/chosen": -1.5380371809005737, + "logps/rejected": -1449.8046875, + "loss": 0.2253, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2142389565706253, + "rewards/margins": 14.062037467956543, + "rewards/rejected": -13.847798347473145, + "step": 1050 + }, + { + "epoch": 0.26470221001373456, + "grad_norm": 0.06787109375, + "learning_rate": 4.598514708921006e-06, + "logits/chosen": -2.249868869781494, + "logits/rejected": -2.466034412384033, + "logps/chosen": -0.7143852710723877, + "logps/rejected": -1382.494140625, + "loss": 0.227, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2079528272151947, + "rewards/margins": 13.4636812210083, + "rewards/rejected": -13.255727767944336, + "step": 1060 + }, + { + "epoch": 0.26719940067424147, + "grad_norm": 0.01153564453125, + "learning_rate": 4.5865875187692695e-06, + "logits/chosen": -2.1900734901428223, + "logits/rejected": -2.3761203289031982, + "logps/chosen": -1.549536943435669, + "logps/rejected": -1185.685791015625, + "loss": 0.2282, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20429477095603943, + "rewards/margins": 11.534225463867188, + "rewards/rejected": -11.32992935180664, + "step": 1070 + }, + { + "epoch": 0.2696965913347484, + "grad_norm": 0.0830078125, + "learning_rate": 4.57450169140327e-06, + "logits/chosen": -2.0554583072662354, + "logits/rejected": -2.273556709289551, + "logps/chosen": -1.3945400714874268, + "logps/rejected": -1522.8463134765625, + "loss": 0.2259, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2084466516971588, + "rewards/margins": 14.89411449432373, + "rewards/rejected": -14.685667037963867, + "step": 1080 + }, + { + "epoch": 0.27219378199525535, + "grad_norm": 0.1416015625, + "learning_rate": 4.562258145673507e-06, + "logits/chosen": -2.20988392829895, + "logits/rejected": -2.4358487129211426, + "logps/chosen": -1.0550658702850342, + "logps/rejected": -1489.2562255859375, + "loss": 0.2251, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20840421319007874, + "rewards/margins": 14.553556442260742, + "rewards/rejected": -14.34515380859375, + "step": 1090 + }, + { + "epoch": 0.27469097265576226, + "grad_norm": 0.017578125, + "learning_rate": 4.549857812421353e-06, + "logits/chosen": -2.1285512447357178, + "logits/rejected": -2.318908929824829, + "logps/chosen": -0.753593921661377, + "logps/rejected": -1319.107666015625, + "loss": 0.2258, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20573386549949646, + "rewards/margins": 12.884170532226562, + "rewards/rejected": -12.678436279296875, + "step": 1100 + }, + { + "epoch": 0.2771881633162692, + "grad_norm": 0.050537109375, + "learning_rate": 4.537301634408281e-06, + "logits/chosen": -2.1442999839782715, + "logits/rejected": -2.34287691116333, + "logps/chosen": -0.9622041583061218, + "logps/rejected": -1223.08837890625, + "loss": 0.226, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21334879100322723, + "rewards/margins": 11.921293258666992, + "rewards/rejected": -11.707944869995117, + "step": 1110 + }, + { + "epoch": 0.27968535397677613, + "grad_norm": 0.027099609375, + "learning_rate": 4.52459056624419e-06, + "logits/chosen": -2.198021173477173, + "logits/rejected": -2.3665783405303955, + "logps/chosen": -1.6707994937896729, + "logps/rejected": -1209.2952880859375, + "loss": 0.2269, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20634672045707703, + "rewards/margins": 11.70842170715332, + "rewards/rejected": -11.502074241638184, + "step": 1120 + }, + { + "epoch": 0.28218254463728304, + "grad_norm": 0.0458984375, + "learning_rate": 4.51172557431483e-06, + "logits/chosen": -2.0804460048675537, + "logits/rejected": -2.27351713180542, + "logps/chosen": -1.3884862661361694, + "logps/rejected": -1267.9599609375, + "loss": 0.227, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20677968859672546, + "rewards/margins": 12.219032287597656, + "rewards/rejected": -12.012252807617188, + "step": 1130 + }, + { + "epoch": 0.28467973529779, + "grad_norm": 0.0751953125, + "learning_rate": 4.49870763670833e-06, + "logits/chosen": -2.1440179347991943, + "logits/rejected": -2.3646531105041504, + "logps/chosen": -0.9940131306648254, + "logps/rejected": -1360.1025390625, + "loss": 0.2245, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2132159024477005, + "rewards/margins": 13.244120597839355, + "rewards/rejected": -13.030904769897461, + "step": 1140 + }, + { + "epoch": 0.2871769259582969, + "grad_norm": 0.060546875, + "learning_rate": 4.4855377431408335e-06, + "logits/chosen": -2.124523639678955, + "logits/rejected": -2.308046817779541, + "logps/chosen": -1.051758885383606, + "logps/rejected": -1258.587158203125, + "loss": 0.2254, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21350452303886414, + "rewards/margins": 12.086160659790039, + "rewards/rejected": -11.872655868530273, + "step": 1150 + }, + { + "epoch": 0.2896741166188038, + "grad_norm": 0.07275390625, + "learning_rate": 4.472216894881261e-06, + "logits/chosen": -2.12388277053833, + "logits/rejected": -2.2992734909057617, + "logps/chosen": -1.0673718452453613, + "logps/rejected": -1227.642822265625, + "loss": 0.2261, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21056973934173584, + "rewards/margins": 11.97436809539795, + "rewards/rejected": -11.763797760009766, + "step": 1160 + }, + { + "epoch": 0.2921713072793108, + "grad_norm": 0.062255859375, + "learning_rate": 4.4587461046751815e-06, + "logits/chosen": -2.165827512741089, + "logits/rejected": -2.366560697555542, + "logps/chosen": -1.3018419742584229, + "logps/rejected": -1152.0526123046875, + "loss": 0.2272, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2166510820388794, + "rewards/margins": 11.213326454162598, + "rewards/rejected": -10.996675491333008, + "step": 1170 + }, + { + "epoch": 0.2946684979398177, + "grad_norm": 0.0218505859375, + "learning_rate": 4.44512639666781e-06, + "logits/chosen": -2.153282642364502, + "logits/rejected": -2.3281288146972656, + "logps/chosen": -0.8735140562057495, + "logps/rejected": -1144.37744140625, + "loss": 0.2288, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20929470658302307, + "rewards/margins": 11.165544509887695, + "rewards/rejected": -10.956250190734863, + "step": 1180 + }, + { + "epoch": 0.29716568860032466, + "grad_norm": 0.2451171875, + "learning_rate": 4.431358806326158e-06, + "logits/chosen": -2.0921244621276855, + "logits/rejected": -2.2888898849487305, + "logps/chosen": -1.9632396697998047, + "logps/rejected": -1334.217041015625, + "loss": 0.2274, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21045899391174316, + "rewards/margins": 12.872146606445312, + "rewards/rejected": -12.661687850952148, + "step": 1190 + }, + { + "epoch": 0.29966287926083157, + "grad_norm": 0.08349609375, + "learning_rate": 4.4174443803603e-06, + "logits/chosen": -2.1807141304016113, + "logits/rejected": -2.35149884223938, + "logps/chosen": -1.1249208450317383, + "logps/rejected": -1231.4007568359375, + "loss": 0.2276, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2088349312543869, + "rewards/margins": 11.981757164001465, + "rewards/rejected": -11.772923469543457, + "step": 1200 + }, + { + "epoch": 0.3021600699213385, + "grad_norm": 0.054931640625, + "learning_rate": 4.4033841766438e-06, + "logits/chosen": -2.153378486633301, + "logits/rejected": -2.333552598953247, + "logps/chosen": -1.4812664985656738, + "logps/rejected": -1186.764404296875, + "loss": 0.2282, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21333375573158264, + "rewards/margins": 11.490147590637207, + "rewards/rejected": -11.276814460754395, + "step": 1210 + }, + { + "epoch": 0.30465726058184545, + "grad_norm": 0.0262451171875, + "learning_rate": 4.389179264133281e-06, + "logits/chosen": -2.232697010040283, + "logits/rejected": -2.418818235397339, + "logps/chosen": -0.8499106168746948, + "logps/rejected": -1287.507568359375, + "loss": 0.2263, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20431029796600342, + "rewards/margins": 12.579316139221191, + "rewards/rejected": -12.375005722045898, + "step": 1220 + }, + { + "epoch": 0.30715445124235236, + "grad_norm": 0.0478515625, + "learning_rate": 4.374830722787159e-06, + "logits/chosen": -2.2435195446014404, + "logits/rejected": -2.4646503925323486, + "logps/chosen": -0.5742496252059937, + "logps/rejected": -1343.397216796875, + "loss": 0.2276, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2151050567626953, + "rewards/margins": 13.130419731140137, + "rewards/rejected": -12.915315628051758, + "step": 1230 + }, + { + "epoch": 0.30965164190285926, + "grad_norm": 0.05615234375, + "learning_rate": 4.360339643483533e-06, + "logits/chosen": -2.2148001194000244, + "logits/rejected": -2.421738862991333, + "logps/chosen": -1.9802653789520264, + "logps/rejected": -1262.169189453125, + "loss": 0.2275, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20881418883800507, + "rewards/margins": 12.177266120910645, + "rewards/rejected": -11.968450546264648, + "step": 1240 + }, + { + "epoch": 0.31214883256336623, + "grad_norm": 0.01348876953125, + "learning_rate": 4.345707127937253e-06, + "logits/chosen": -2.1191718578338623, + "logits/rejected": -2.344691753387451, + "logps/chosen": -0.9136890172958374, + "logps/rejected": -1512.323974609375, + "loss": 0.2245, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21416035294532776, + "rewards/margins": 14.802743911743164, + "rewards/rejected": -14.588582992553711, + "step": 1250 + }, + { + "epoch": 0.31464602322387314, + "grad_norm": 0.0751953125, + "learning_rate": 4.330934288616154e-06, + "logits/chosen": -2.1469109058380127, + "logits/rejected": -2.3361592292785645, + "logps/chosen": -1.4744806289672852, + "logps/rejected": -1288.8616943359375, + "loss": 0.2258, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2088310271501541, + "rewards/margins": 12.5834379196167, + "rewards/rejected": -12.374608039855957, + "step": 1260 + }, + { + "epoch": 0.31714321388438005, + "grad_norm": 0.03369140625, + "learning_rate": 4.316022248656485e-06, + "logits/chosen": -2.0783493518829346, + "logits/rejected": -2.3048255443573, + "logps/chosen": -1.100656270980835, + "logps/rejected": -1277.9552001953125, + "loss": 0.2262, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20990002155303955, + "rewards/margins": 12.193166732788086, + "rewards/rejected": -11.983266830444336, + "step": 1270 + }, + { + "epoch": 0.319640404544887, + "grad_norm": 0.0400390625, + "learning_rate": 4.3009721417775166e-06, + "logits/chosen": -2.1016387939453125, + "logits/rejected": -2.3064982891082764, + "logps/chosen": -1.263979196548462, + "logps/rejected": -1323.89599609375, + "loss": 0.2257, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21145665645599365, + "rewards/margins": 12.815747261047363, + "rewards/rejected": -12.604291915893555, + "step": 1280 + }, + { + "epoch": 0.3221375952053939, + "grad_norm": 0.10986328125, + "learning_rate": 4.285785112195346e-06, + "logits/chosen": -2.188570976257324, + "logits/rejected": -2.397493600845337, + "logps/chosen": -2.353158473968506, + "logps/rejected": -1393.356201171875, + "loss": 0.2264, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20123986899852753, + "rewards/margins": 13.566085815429688, + "rewards/rejected": -13.364847183227539, + "step": 1290 + }, + { + "epoch": 0.3246347858659009, + "grad_norm": 0.04345703125, + "learning_rate": 4.27046231453591e-06, + "logits/chosen": -2.115800142288208, + "logits/rejected": -2.314438819885254, + "logps/chosen": -1.3714869022369385, + "logps/rejected": -1331.2506103515625, + "loss": 0.2254, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20796707272529602, + "rewards/margins": 12.886337280273438, + "rewards/rejected": -12.678369522094727, + "step": 1300 + }, + { + "epoch": 0.3271319765264078, + "grad_norm": 0.0322265625, + "learning_rate": 4.255004913747196e-06, + "logits/chosen": -2.1591382026672363, + "logits/rejected": -2.3501150608062744, + "logps/chosen": -0.8996777534484863, + "logps/rejected": -1417.157470703125, + "loss": 0.2263, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2074543684720993, + "rewards/margins": 13.838354110717773, + "rewards/rejected": -13.630900382995605, + "step": 1310 + }, + { + "epoch": 0.3296291671869147, + "grad_norm": 0.05126953125, + "learning_rate": 4.2394140850106825e-06, + "logits/chosen": -2.0840930938720703, + "logits/rejected": -2.285808801651001, + "logps/chosen": -0.9041382670402527, + "logps/rejected": -1322.038818359375, + "loss": 0.2264, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2139444649219513, + "rewards/margins": 12.818634033203125, + "rewards/rejected": -12.604690551757812, + "step": 1320 + }, + { + "epoch": 0.33212635784742167, + "grad_norm": 0.045166015625, + "learning_rate": 4.223691013651986e-06, + "logits/chosen": -2.141530990600586, + "logits/rejected": -2.363454580307007, + "logps/chosen": -2.294220209121704, + "logps/rejected": -1329.7213134765625, + "loss": 0.2239, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2177181988954544, + "rewards/margins": 12.63646411895752, + "rewards/rejected": -12.418745040893555, + "step": 1330 + }, + { + "epoch": 0.3346235485079286, + "grad_norm": 0.039794921875, + "learning_rate": 4.207836895050748e-06, + "logits/chosen": -2.263815402984619, + "logits/rejected": -2.524907350540161, + "logps/chosen": -0.85591059923172, + "logps/rejected": -1496.051513671875, + "loss": 0.2258, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21331222355365753, + "rewards/margins": 14.541677474975586, + "rewards/rejected": -14.32836627960205, + "step": 1340 + }, + { + "epoch": 0.3371207391684355, + "grad_norm": 0.056640625, + "learning_rate": 4.1918529345497525e-06, + "logits/chosen": -2.1795644760131836, + "logits/rejected": -2.345736026763916, + "logps/chosen": -1.1188920736312866, + "logps/rejected": -1032.299560546875, + "loss": 0.2274, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21017661690711975, + "rewards/margins": 9.974283218383789, + "rewards/rejected": -9.764104843139648, + "step": 1350 + }, + { + "epoch": 0.33961792982894246, + "grad_norm": 0.061767578125, + "learning_rate": 4.175740347363289e-06, + "logits/chosen": -2.2571511268615723, + "logits/rejected": -2.450302839279175, + "logps/chosen": -2.4634203910827637, + "logps/rejected": -1143.845703125, + "loss": 0.2276, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20782017707824707, + "rewards/margins": 10.989904403686523, + "rewards/rejected": -10.782083511352539, + "step": 1360 + }, + { + "epoch": 0.34211512048944936, + "grad_norm": 0.021240234375, + "learning_rate": 4.159500358484759e-06, + "logits/chosen": -2.104897975921631, + "logits/rejected": -2.321760654449463, + "logps/chosen": -1.1564667224884033, + "logps/rejected": -1532.8436279296875, + "loss": 0.2261, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21173422038555145, + "rewards/margins": 14.948999404907227, + "rewards/rejected": -14.737266540527344, + "step": 1370 + }, + { + "epoch": 0.3446123111499563, + "grad_norm": 0.0306396484375, + "learning_rate": 4.143134202593549e-06, + "logits/chosen": -2.1347815990448, + "logits/rejected": -2.3222789764404297, + "logps/chosen": -2.063771963119507, + "logps/rejected": -1179.3240966796875, + "loss": 0.2263, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.216390922665596, + "rewards/margins": 11.309762001037598, + "rewards/rejected": -11.093371391296387, + "step": 1380 + }, + { + "epoch": 0.34710950181046324, + "grad_norm": 0.03955078125, + "learning_rate": 4.126643123961158e-06, + "logits/chosen": -2.216097354888916, + "logits/rejected": -2.431462049484253, + "logps/chosen": -1.3367359638214111, + "logps/rejected": -1441.5928955078125, + "loss": 0.2263, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2119736224412918, + "rewards/margins": 14.054840087890625, + "rewards/rejected": -13.842867851257324, + "step": 1390 + }, + { + "epoch": 0.34960669247097015, + "grad_norm": 0.07470703125, + "learning_rate": 4.110028376356599e-06, + "logits/chosen": -2.194693088531494, + "logits/rejected": -2.394153118133545, + "logps/chosen": -2.143383264541626, + "logps/rejected": -1089.128173828125, + "loss": 0.2268, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2104116678237915, + "rewards/margins": 10.493813514709473, + "rewards/rejected": -10.283400535583496, + "step": 1400 + }, + { + "epoch": 0.3521038831314771, + "grad_norm": 0.03369140625, + "learning_rate": 4.093291222951079e-06, + "logits/chosen": -2.1454501152038574, + "logits/rejected": -2.360769033432007, + "logps/chosen": -1.1339516639709473, + "logps/rejected": -1363.47119140625, + "loss": 0.2251, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.209524005651474, + "rewards/margins": 13.2172269821167, + "rewards/rejected": -13.007702827453613, + "step": 1410 + }, + { + "epoch": 0.354601073791984, + "grad_norm": 0.057373046875, + "learning_rate": 4.076432936221965e-06, + "logits/chosen": -2.135999917984009, + "logits/rejected": -2.3061912059783936, + "logps/chosen": -0.5820466876029968, + "logps/rejected": -1179.7847900390625, + "loss": 0.2283, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2054443657398224, + "rewards/margins": 11.520200729370117, + "rewards/rejected": -11.314754486083984, + "step": 1420 + }, + { + "epoch": 0.35709826445249093, + "grad_norm": 0.019775390625, + "learning_rate": 4.059454797856039e-06, + "logits/chosen": -2.172046184539795, + "logits/rejected": -2.342928171157837, + "logps/chosen": -0.7546096444129944, + "logps/rejected": -1167.744873046875, + "loss": 0.2286, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20582588016986847, + "rewards/margins": 11.390329360961914, + "rewards/rejected": -11.184503555297852, + "step": 1430 + }, + { + "epoch": 0.3595954551129979, + "grad_norm": 0.043212890625, + "learning_rate": 4.042358098652057e-06, + "logits/chosen": -2.244403123855591, + "logits/rejected": -2.4426932334899902, + "logps/chosen": -1.5733036994934082, + "logps/rejected": -1163.822998046875, + "loss": 0.2252, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21082696318626404, + "rewards/margins": 11.297124862670898, + "rewards/rejected": -11.086297988891602, + "step": 1440 + }, + { + "epoch": 0.3620926457735048, + "grad_norm": 0.046142578125, + "learning_rate": 4.025144138422615e-06, + "logits/chosen": -2.189898729324341, + "logits/rejected": -2.393465757369995, + "logps/chosen": -1.2910453081130981, + "logps/rejected": -1412.8597412109375, + "loss": 0.227, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21461701393127441, + "rewards/margins": 13.799296379089355, + "rewards/rejected": -13.584680557250977, + "step": 1450 + }, + { + "epoch": 0.3645898364340117, + "grad_norm": 0.0169677734375, + "learning_rate": 4.007814225895321e-06, + "logits/chosen": -2.170092821121216, + "logits/rejected": -2.3824923038482666, + "logps/chosen": -0.8392337560653687, + "logps/rejected": -1365.531005859375, + "loss": 0.2266, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20813941955566406, + "rewards/margins": 13.32819652557373, + "rewards/rejected": -13.120054244995117, + "step": 1460 + }, + { + "epoch": 0.3670870270945187, + "grad_norm": 0.017333984375, + "learning_rate": 3.990369678613303e-06, + "logits/chosen": -2.0936970710754395, + "logits/rejected": -2.3042235374450684, + "logps/chosen": -1.4599825143814087, + "logps/rejected": -1356.390869140625, + "loss": 0.225, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21089884638786316, + "rewards/margins": 12.903648376464844, + "rewards/rejected": -12.6927490234375, + "step": 1470 + }, + { + "epoch": 0.3695842177550256, + "grad_norm": 0.0240478515625, + "learning_rate": 3.97281182283504e-06, + "logits/chosen": -2.157559871673584, + "logits/rejected": -2.371856927871704, + "logps/chosen": -1.3865526914596558, + "logps/rejected": -1416.440185546875, + "loss": 0.227, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20923642814159393, + "rewards/margins": 13.767707824707031, + "rewards/rejected": -13.558469772338867, + "step": 1480 + }, + { + "epoch": 0.3720814084155325, + "grad_norm": 0.099609375, + "learning_rate": 3.955141993433526e-06, + "logits/chosen": -2.2016472816467285, + "logits/rejected": -2.3889071941375732, + "logps/chosen": -1.0489656925201416, + "logps/rejected": -1286.4302978515625, + "loss": 0.2253, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21162299811840057, + "rewards/margins": 12.558609962463379, + "rewards/rejected": -12.3469877243042, + "step": 1490 + }, + { + "epoch": 0.37457859907603946, + "grad_norm": 0.053466796875, + "learning_rate": 3.937361533794784e-06, + "logits/chosen": -2.1290640830993652, + "logits/rejected": -2.337486505508423, + "logps/chosen": -1.496525526046753, + "logps/rejected": -1124.3212890625, + "loss": 0.2277, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21092331409454346, + "rewards/margins": 10.851540565490723, + "rewards/rejected": -10.640616416931152, + "step": 1500 + }, + { + "epoch": 0.3770757897365464, + "grad_norm": 0.0233154296875, + "learning_rate": 3.919471795715738e-06, + "logits/chosen": -2.18410587310791, + "logits/rejected": -2.3675644397735596, + "logps/chosen": -0.84355628490448, + "logps/rejected": -1166.61279296875, + "loss": 0.226, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2062278687953949, + "rewards/margins": 11.381316184997559, + "rewards/rejected": -11.175088882446289, + "step": 1510 + }, + { + "epoch": 0.37957298039705334, + "grad_norm": 0.0390625, + "learning_rate": 3.901474139301433e-06, + "logits/chosen": -2.0796847343444824, + "logits/rejected": -2.264577627182007, + "logps/chosen": -0.6843720078468323, + "logps/rejected": -1241.1590576171875, + "loss": 0.2259, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21053218841552734, + "rewards/margins": 12.031414031982422, + "rewards/rejected": -11.820880889892578, + "step": 1520 + }, + { + "epoch": 0.38207017105756025, + "grad_norm": 0.0869140625, + "learning_rate": 3.883369932861634e-06, + "logits/chosen": -2.2165303230285645, + "logits/rejected": -2.3859565258026123, + "logps/chosen": -1.1263262033462524, + "logps/rejected": -1200.8397216796875, + "loss": 0.2251, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20847392082214355, + "rewards/margins": 11.745917320251465, + "rewards/rejected": -11.537444114685059, + "step": 1530 + }, + { + "epoch": 0.38456736171806716, + "grad_norm": 0.06298828125, + "learning_rate": 3.865160552806796e-06, + "logits/chosen": -2.262539863586426, + "logits/rejected": -2.4538345336914062, + "logps/chosen": -1.3924305438995361, + "logps/rejected": -1240.5035400390625, + "loss": 0.2259, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20712998509407043, + "rewards/margins": 12.123323440551758, + "rewards/rejected": -11.916193008422852, + "step": 1540 + }, + { + "epoch": 0.3870645523785741, + "grad_norm": 0.041015625, + "learning_rate": 3.84684738354342e-06, + "logits/chosen": -2.267106771469116, + "logits/rejected": -2.4566650390625, + "logps/chosen": -2.0142922401428223, + "logps/rejected": -1211.2545166015625, + "loss": 0.2271, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2093096524477005, + "rewards/margins": 11.762309074401855, + "rewards/rejected": -11.552999496459961, + "step": 1550 + }, + { + "epoch": 0.38956174303908103, + "grad_norm": 0.07958984375, + "learning_rate": 3.828431817368798e-06, + "logits/chosen": -2.141620397567749, + "logits/rejected": -2.33925199508667, + "logps/chosen": -1.531597375869751, + "logps/rejected": -1257.968994140625, + "loss": 0.228, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2043263465166092, + "rewards/margins": 12.106410026550293, + "rewards/rejected": -11.902084350585938, + "step": 1560 + }, + { + "epoch": 0.39205893369958794, + "grad_norm": 0.07763671875, + "learning_rate": 3.8099152543651684e-06, + "logits/chosen": -2.3559296131134033, + "logits/rejected": -2.583070993423462, + "logps/chosen": -0.7891671061515808, + "logps/rejected": -1441.2958984375, + "loss": 0.2257, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20671992003917694, + "rewards/margins": 14.086430549621582, + "rewards/rejected": -13.87971019744873, + "step": 1570 + }, + { + "epoch": 0.3945561243600949, + "grad_norm": 0.03857421875, + "learning_rate": 3.791299102293261e-06, + "logits/chosen": -2.1035549640655518, + "logits/rejected": -2.3072731494903564, + "logps/chosen": -1.0839884281158447, + "logps/rejected": -1459.4197998046875, + "loss": 0.2256, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21341785788536072, + "rewards/margins": 14.197916984558105, + "rewards/rejected": -13.98449993133545, + "step": 1580 + }, + { + "epoch": 0.3970533150206018, + "grad_norm": 0.03369140625, + "learning_rate": 3.7725847764852774e-06, + "logits/chosen": -2.10914945602417, + "logits/rejected": -2.3385162353515625, + "logps/chosen": -1.6078799962997437, + "logps/rejected": -1307.208740234375, + "loss": 0.2257, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2162178009748459, + "rewards/margins": 12.54298210144043, + "rewards/rejected": -12.326765060424805, + "step": 1590 + }, + { + "epoch": 0.3995505056811087, + "grad_norm": 0.0267333984375, + "learning_rate": 3.7537736997372833e-06, + "logits/chosen": -2.1722114086151123, + "logits/rejected": -2.3555681705474854, + "logps/chosen": -1.133063793182373, + "logps/rejected": -1113.764404296875, + "loss": 0.2263, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21135945618152618, + "rewards/margins": 10.682828903198242, + "rewards/rejected": -10.471468925476074, + "step": 1600 + }, + { + "epoch": 0.4020476963416157, + "grad_norm": 0.020751953125, + "learning_rate": 3.734867302201038e-06, + "logits/chosen": -2.2481324672698975, + "logits/rejected": -2.4178614616394043, + "logps/chosen": -0.7748688459396362, + "logps/rejected": -1153.1929931640625, + "loss": 0.2264, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2061961144208908, + "rewards/margins": 11.231634140014648, + "rewards/rejected": -11.02543830871582, + "step": 1610 + }, + { + "epoch": 0.4045448870021226, + "grad_norm": 0.046630859375, + "learning_rate": 3.7158670212752666e-06, + "logits/chosen": -2.158440113067627, + "logits/rejected": -2.3695878982543945, + "logps/chosen": -0.685897946357727, + "logps/rejected": -1294.4326171875, + "loss": 0.227, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2151576578617096, + "rewards/margins": 12.627668380737305, + "rewards/rejected": -12.412511825561523, + "step": 1620 + }, + { + "epoch": 0.40704207766262956, + "grad_norm": 0.015869140625, + "learning_rate": 3.696774301496376e-06, + "logits/chosen": -2.2252297401428223, + "logits/rejected": -2.4217424392700195, + "logps/chosen": -0.6748331785202026, + "logps/rejected": -1261.10009765625, + "loss": 0.2248, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21283042430877686, + "rewards/margins": 12.33554458618164, + "rewards/rejected": -12.122715950012207, + "step": 1630 + }, + { + "epoch": 0.4095392683231365, + "grad_norm": 0.0283203125, + "learning_rate": 3.677590594428629e-06, + "logits/chosen": -2.159726619720459, + "logits/rejected": -2.3402228355407715, + "logps/chosen": -0.9869475364685059, + "logps/rejected": -1201.0703125, + "loss": 0.2279, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20935773849487305, + "rewards/margins": 11.699995994567871, + "rewards/rejected": -11.490636825561523, + "step": 1640 + }, + { + "epoch": 0.4120364589836434, + "grad_norm": 0.07470703125, + "learning_rate": 3.658317358553794e-06, + "logits/chosen": -2.1311771869659424, + "logits/rejected": -2.3283205032348633, + "logps/chosen": -0.7873401045799255, + "logps/rejected": -1318.947265625, + "loss": 0.2256, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20913653075695038, + "rewards/margins": 12.813528060913086, + "rewards/rejected": -12.604392051696777, + "step": 1650 + }, + { + "epoch": 0.41453364964415035, + "grad_norm": 0.06494140625, + "learning_rate": 3.638956059160252e-06, + "logits/chosen": -2.180502414703369, + "logits/rejected": -2.3862075805664062, + "logps/chosen": -1.0054365396499634, + "logps/rejected": -1342.7799072265625, + "loss": 0.2253, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21474532783031464, + "rewards/margins": 13.147130966186523, + "rewards/rejected": -12.932388305664062, + "step": 1660 + }, + { + "epoch": 0.41703084030465726, + "grad_norm": 0.03369140625, + "learning_rate": 3.6195081682315972e-06, + "logits/chosen": -2.2029502391815186, + "logits/rejected": -2.3754451274871826, + "logps/chosen": -1.1696422100067139, + "logps/rejected": -1324.997802734375, + "loss": 0.2252, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20706875622272491, + "rewards/margins": 12.972146987915039, + "rewards/rejected": -12.765077590942383, + "step": 1670 + }, + { + "epoch": 0.41952803096516417, + "grad_norm": 0.05126953125, + "learning_rate": 3.5999751643347342e-06, + "logits/chosen": -2.126647472381592, + "logits/rejected": -2.32842755317688, + "logps/chosen": -1.3129024505615234, + "logps/rejected": -1431.062255859375, + "loss": 0.2246, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2164861261844635, + "rewards/margins": 13.894182205200195, + "rewards/rejected": -13.677694320678711, + "step": 1680 + }, + { + "epoch": 0.42202522162567113, + "grad_norm": 0.05126953125, + "learning_rate": 3.5803585325074536e-06, + "logits/chosen": -2.1573426723480225, + "logits/rejected": -2.3461415767669678, + "logps/chosen": -0.5849089622497559, + "logps/rejected": -1369.0498046875, + "loss": 0.2265, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20605847239494324, + "rewards/margins": 13.385258674621582, + "rewards/rejected": -13.179201126098633, + "step": 1690 + }, + { + "epoch": 0.42452241228617804, + "grad_norm": 0.041015625, + "learning_rate": 3.5606597641455387e-06, + "logits/chosen": -2.201714515686035, + "logits/rejected": -2.3846235275268555, + "logps/chosen": -1.2365072965621948, + "logps/rejected": -1268.500732421875, + "loss": 0.2273, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20618323981761932, + "rewards/margins": 12.352148056030273, + "rewards/rejected": -12.145965576171875, + "step": 1700 + }, + { + "epoch": 0.427019602946685, + "grad_norm": 0.6015625, + "learning_rate": 3.540880356889376e-06, + "logits/chosen": -2.204244375228882, + "logits/rejected": -2.37742280960083, + "logps/chosen": -1.9021589756011963, + "logps/rejected": -1228.02685546875, + "loss": 0.228, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1988053023815155, + "rewards/margins": 11.881242752075195, + "rewards/rejected": -11.682435035705566, + "step": 1710 + }, + { + "epoch": 0.4295167936071919, + "grad_norm": 0.05712890625, + "learning_rate": 3.5210218145100934e-06, + "logits/chosen": -2.1249117851257324, + "logits/rejected": -2.343653917312622, + "logps/chosen": -0.9779669642448425, + "logps/rejected": -1107.069580078125, + "loss": 0.2291, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20971660315990448, + "rewards/margins": 10.748934745788574, + "rewards/rejected": -10.53921890258789, + "step": 1720 + }, + { + "epoch": 0.4320139842676988, + "grad_norm": 0.04931640625, + "learning_rate": 3.5010856467952335e-06, + "logits/chosen": -2.135411262512207, + "logits/rejected": -2.3283915519714355, + "logps/chosen": -1.680784821510315, + "logps/rejected": -1203.44873046875, + "loss": 0.2277, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21119749546051025, + "rewards/margins": 11.60279655456543, + "rewards/rejected": -11.391599655151367, + "step": 1730 + }, + { + "epoch": 0.4345111749282058, + "grad_norm": 0.061279296875, + "learning_rate": 3.4810733694339687e-06, + "logits/chosen": -2.227553367614746, + "logits/rejected": -2.4453303813934326, + "logps/chosen": -1.1945085525512695, + "logps/rejected": -1365.62158203125, + "loss": 0.2255, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21306195855140686, + "rewards/margins": 13.283732414245605, + "rewards/rejected": -13.070669174194336, + "step": 1740 + }, + { + "epoch": 0.4370083655887127, + "grad_norm": 0.026123046875, + "learning_rate": 3.4609865039018676e-06, + "logits/chosen": -2.2143800258636475, + "logits/rejected": -2.38647198677063, + "logps/chosen": -0.3982168138027191, + "logps/rejected": -1256.0924072265625, + "loss": 0.2282, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20522812008857727, + "rewards/margins": 12.254903793334961, + "rewards/rejected": -12.049676895141602, + "step": 1750 + }, + { + "epoch": 0.4395055562492196, + "grad_norm": 0.017578125, + "learning_rate": 3.4408265773452226e-06, + "logits/chosen": -2.132845401763916, + "logits/rejected": -2.32383394241333, + "logps/chosen": -0.7928985953330994, + "logps/rejected": -1260.4219970703125, + "loss": 0.2281, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21432673931121826, + "rewards/margins": 12.288119316101074, + "rewards/rejected": -12.07379150390625, + "step": 1760 + }, + { + "epoch": 0.4420027469097266, + "grad_norm": 0.027587890625, + "learning_rate": 3.420595122464942e-06, + "logits/chosen": -2.2310843467712402, + "logits/rejected": -2.43049693107605, + "logps/chosen": -1.0165212154388428, + "logps/rejected": -1248.940673828125, + "loss": 0.2265, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20808283984661102, + "rewards/margins": 12.158212661743164, + "rewards/rejected": -11.950130462646484, + "step": 1770 + }, + { + "epoch": 0.4444999375702335, + "grad_norm": 0.05029296875, + "learning_rate": 3.4002936774000284e-06, + "logits/chosen": -2.129657030105591, + "logits/rejected": -2.3626723289489746, + "logps/chosen": -0.534063994884491, + "logps/rejected": -1597.343017578125, + "loss": 0.2245, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21577997505664825, + "rewards/margins": 15.636571884155273, + "rewards/rejected": -15.420791625976562, + "step": 1780 + }, + { + "epoch": 0.4469971282307404, + "grad_norm": 0.02587890625, + "learning_rate": 3.3799237856106348e-06, + "logits/chosen": -2.1293628215789795, + "logits/rejected": -2.3366832733154297, + "logps/chosen": -0.6109465956687927, + "logps/rejected": -1318.239990234375, + "loss": 0.2255, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2080315351486206, + "rewards/margins": 12.848733901977539, + "rewards/rejected": -12.640703201293945, + "step": 1790 + }, + { + "epoch": 0.44949431889124736, + "grad_norm": 0.060546875, + "learning_rate": 3.35948699576072e-06, + "logits/chosen": -2.0792922973632812, + "logits/rejected": -2.285391330718994, + "logps/chosen": -0.9549906849861145, + "logps/rejected": -1534.51953125, + "loss": 0.2244, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21579799056053162, + "rewards/margins": 14.997169494628906, + "rewards/rejected": -14.7813720703125, + "step": 1800 + }, + { + "epoch": 0.45199150955175427, + "grad_norm": 0.09033203125, + "learning_rate": 3.3389848616003085e-06, + "logits/chosen": -2.169448137283325, + "logits/rejected": -2.34112286567688, + "logps/chosen": -1.1561418771743774, + "logps/rejected": -1331.464111328125, + "loss": 0.2289, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20999138057231903, + "rewards/margins": 12.991872787475586, + "rewards/rejected": -12.781880378723145, + "step": 1810 + }, + { + "epoch": 0.45448870021226123, + "grad_norm": 0.024169921875, + "learning_rate": 3.3184189418473674e-06, + "logits/chosen": -2.0690829753875732, + "logits/rejected": -2.2553791999816895, + "logps/chosen": -0.737138032913208, + "logps/rejected": -1278.2681884765625, + "loss": 0.2266, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20780067145824432, + "rewards/margins": 12.481771469116211, + "rewards/rejected": -12.273969650268555, + "step": 1820 + }, + { + "epoch": 0.45698589087276814, + "grad_norm": 0.0810546875, + "learning_rate": 3.2977908000692925e-06, + "logits/chosen": -2.1408801078796387, + "logits/rejected": -2.3243911266326904, + "logps/chosen": -1.5268166065216064, + "logps/rejected": -1405.22412109375, + "loss": 0.2274, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20781424641609192, + "rewards/margins": 13.740381240844727, + "rewards/rejected": -13.532565116882324, + "step": 1830 + }, + { + "epoch": 0.45948308153327505, + "grad_norm": 0.04736328125, + "learning_rate": 3.2771020045640435e-06, + "logits/chosen": -2.286168336868286, + "logits/rejected": -2.4684412479400635, + "logps/chosen": -0.6708983182907104, + "logps/rejected": -1134.7979736328125, + "loss": 0.2259, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21890632808208466, + "rewards/margins": 11.039240837097168, + "rewards/rejected": -10.820335388183594, + "step": 1840 + }, + { + "epoch": 0.461980272193782, + "grad_norm": 0.099609375, + "learning_rate": 3.256354128240907e-06, + "logits/chosen": -2.06745982170105, + "logits/rejected": -2.248892307281494, + "logps/chosen": -1.6344282627105713, + "logps/rejected": -1263.974853515625, + "loss": 0.226, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21318969130516052, + "rewards/margins": 12.202125549316406, + "rewards/rejected": -11.988935470581055, + "step": 1850 + }, + { + "epoch": 0.4644774628542889, + "grad_norm": 0.0654296875, + "learning_rate": 3.235548748500914e-06, + "logits/chosen": -2.3071300983428955, + "logits/rejected": -2.500091314315796, + "logps/chosen": -1.0427045822143555, + "logps/rejected": -1357.378662109375, + "loss": 0.2275, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20457443594932556, + "rewards/margins": 13.286227226257324, + "rewards/rejected": -13.081652641296387, + "step": 1860 + }, + { + "epoch": 0.46697465351479583, + "grad_norm": 0.04248046875, + "learning_rate": 3.214687447116913e-06, + "logits/chosen": -2.10600209236145, + "logits/rejected": -2.302873373031616, + "logps/chosen": -0.6546305418014526, + "logps/rejected": -1224.43359375, + "loss": 0.2272, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20871946215629578, + "rewards/margins": 11.745490074157715, + "rewards/rejected": -11.536770820617676, + "step": 1870 + }, + { + "epoch": 0.4694718441753028, + "grad_norm": 0.01104736328125, + "learning_rate": 3.193771810113313e-06, + "logits/chosen": -2.1570992469787598, + "logits/rejected": -2.384364604949951, + "logps/chosen": -1.154052495956421, + "logps/rejected": -1359.59619140625, + "loss": 0.2249, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21566633880138397, + "rewards/margins": 13.259126663208008, + "rewards/rejected": -13.043458938598633, + "step": 1880 + }, + { + "epoch": 0.4719690348358097, + "grad_norm": 0.0174560546875, + "learning_rate": 3.1728034276455032e-06, + "logits/chosen": -2.138918399810791, + "logits/rejected": -2.335463047027588, + "logps/chosen": -0.595456600189209, + "logps/rejected": -1286.499267578125, + "loss": 0.2265, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21340537071228027, + "rewards/margins": 12.50808048248291, + "rewards/rejected": -12.294673919677734, + "step": 1890 + }, + { + "epoch": 0.4744662254963166, + "grad_norm": 0.0191650390625, + "learning_rate": 3.1517838938789597e-06, + "logits/chosen": -2.1312789916992188, + "logits/rejected": -2.3574845790863037, + "logps/chosen": -1.0333608388900757, + "logps/rejected": -1402.2928466796875, + "loss": 0.2258, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21823573112487793, + "rewards/margins": 13.426950454711914, + "rewards/rejected": -13.208715438842773, + "step": 1900 + }, + { + "epoch": 0.4769634161568236, + "grad_norm": 0.021240234375, + "learning_rate": 3.130714806868041e-06, + "logits/chosen": -2.1018803119659424, + "logits/rejected": -2.2899601459503174, + "logps/chosen": -1.5672905445098877, + "logps/rejected": -1282.0211181640625, + "loss": 0.2247, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2161625623703003, + "rewards/margins": 12.432838439941406, + "rewards/rejected": -12.216676712036133, + "step": 1910 + }, + { + "epoch": 0.4794606068173305, + "grad_norm": 0.03369140625, + "learning_rate": 3.1095977684344976e-06, + "logits/chosen": -2.1870434284210205, + "logits/rejected": -2.3968632221221924, + "logps/chosen": -0.9621860384941101, + "logps/rejected": -1362.8802490234375, + "loss": 0.2252, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21506325900554657, + "rewards/margins": 13.313095092773438, + "rewards/rejected": -13.098034858703613, + "step": 1920 + }, + { + "epoch": 0.48195779747783746, + "grad_norm": 0.052978515625, + "learning_rate": 3.0884343840456874e-06, + "logits/chosen": -2.2485427856445312, + "logits/rejected": -2.4523234367370605, + "logps/chosen": -0.8971269726753235, + "logps/rejected": -1507.810791015625, + "loss": 0.226, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20877547562122345, + "rewards/margins": 14.738133430480957, + "rewards/rejected": -14.529356002807617, + "step": 1930 + }, + { + "epoch": 0.48445498813834437, + "grad_norm": 0.07568359375, + "learning_rate": 3.0672262626925174e-06, + "logits/chosen": -2.148587942123413, + "logits/rejected": -2.359325408935547, + "logps/chosen": -2.250260353088379, + "logps/rejected": -1421.3468017578125, + "loss": 0.2252, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.22058424353599548, + "rewards/margins": 13.836527824401855, + "rewards/rejected": -13.615945816040039, + "step": 1940 + }, + { + "epoch": 0.4869521787988513, + "grad_norm": 0.06640625, + "learning_rate": 3.0459750167671147e-06, + "logits/chosen": -2.1717689037323, + "logits/rejected": -2.403097629547119, + "logps/chosen": -1.1346304416656494, + "logps/rejected": -1519.8033447265625, + "loss": 0.2253, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21193809807300568, + "rewards/margins": 14.729626655578613, + "rewards/rejected": -14.517687797546387, + "step": 1950 + }, + { + "epoch": 0.48944936945935824, + "grad_norm": 0.12890625, + "learning_rate": 3.024682261940247e-06, + "logits/chosen": -2.1400859355926514, + "logits/rejected": -2.3196842670440674, + "logps/chosen": -1.9256393909454346, + "logps/rejected": -1212.4700927734375, + "loss": 0.2259, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21558912098407745, + "rewards/margins": 11.704329490661621, + "rewards/rejected": -11.488740921020508, + "step": 1960 + }, + { + "epoch": 0.49194656011986515, + "grad_norm": 0.053466796875, + "learning_rate": 3.0033496170384803e-06, + "logits/chosen": -2.2003872394561768, + "logits/rejected": -2.384770154953003, + "logps/chosen": -0.6797516345977783, + "logps/rejected": -1223.4056396484375, + "loss": 0.2273, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20655830204486847, + "rewards/margins": 11.95020866394043, + "rewards/rejected": -11.743650436401367, + "step": 1970 + }, + { + "epoch": 0.49444375078037206, + "grad_norm": 0.05126953125, + "learning_rate": 2.9819787039211068e-06, + "logits/chosen": -2.1409530639648438, + "logits/rejected": -2.3441128730773926, + "logps/chosen": -1.6590759754180908, + "logps/rejected": -1320.5748291015625, + "loss": 0.225, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21551513671875, + "rewards/margins": 12.755599021911621, + "rewards/rejected": -12.540084838867188, + "step": 1980 + }, + { + "epoch": 0.496940941440879, + "grad_norm": 0.015869140625, + "learning_rate": 2.960571147356845e-06, + "logits/chosen": -2.2252392768859863, + "logits/rejected": -2.4482040405273438, + "logps/chosen": -0.6751580238342285, + "logps/rejected": -1514.2879638671875, + "loss": 0.2233, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.22100117802619934, + "rewards/margins": 14.839349746704102, + "rewards/rejected": -14.618349075317383, + "step": 1990 + }, + { + "epoch": 0.49943813210138593, + "grad_norm": 0.11376953125, + "learning_rate": 2.9391285749003046e-06, + "logits/chosen": -2.1313652992248535, + "logits/rejected": -2.3276991844177246, + "logps/chosen": -1.28163743019104, + "logps/rejected": -1614.152099609375, + "loss": 0.2252, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21389129757881165, + "rewards/margins": 15.772817611694336, + "rewards/rejected": -15.558927536010742, + "step": 2000 + }, + { + "epoch": 0.49943813210138593, + "eval_logits/chosen": -2.568960428237915, + "eval_logits/rejected": -2.656001329421997, + "eval_logps/chosen": -0.1526380479335785, + "eval_logps/rejected": -643.470458984375, + "eval_loss": 0.2215292751789093, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 0.25760865211486816, + "eval_rewards/margins": 6.243593215942383, + "eval_rewards/rejected": -5.985984802246094, + "eval_runtime": 0.6593, + "eval_samples_per_second": 7.584, + "eval_steps_per_second": 4.551, + "step": 2000 + }, + { + "epoch": 0.5019353227618929, + "grad_norm": 0.072265625, + "learning_rate": 2.9176526167682543e-06, + "logits/chosen": -2.0913753509521484, + "logits/rejected": -2.273857593536377, + "logps/chosen": -0.7355623841285706, + "logps/rejected": -1363.037841796875, + "loss": 0.228, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2061166763305664, + "rewards/margins": 13.315282821655273, + "rewards/rejected": -13.109164237976074, + "step": 2010 + }, + { + "epoch": 0.5044325134223998, + "grad_norm": 0.0242919921875, + "learning_rate": 2.8961449057156775e-06, + "logits/chosen": -2.1776702404022217, + "logits/rejected": -2.3788368701934814, + "logps/chosen": -1.159735918045044, + "logps/rejected": -1370.439697265625, + "loss": 0.2245, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21482279896736145, + "rewards/margins": 13.327527046203613, + "rewards/rejected": -13.112703323364258, + "step": 2020 + }, + { + "epoch": 0.5069297040829067, + "grad_norm": 0.0654296875, + "learning_rate": 2.874607076911642e-06, + "logits/chosen": -2.1823270320892334, + "logits/rejected": -2.400944471359253, + "logps/chosen": -1.355530023574829, + "logps/rejected": -1275.2886962890625, + "loss": 0.2253, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2216695249080658, + "rewards/margins": 12.436738967895508, + "rewards/rejected": -12.215067863464355, + "step": 2030 + }, + { + "epoch": 0.5094268947434136, + "grad_norm": 0.06689453125, + "learning_rate": 2.8530407678149806e-06, + "logits/chosen": -2.1733579635620117, + "logits/rejected": -2.3787028789520264, + "logps/chosen": -2.122178554534912, + "logps/rejected": -1217.6248779296875, + "loss": 0.2275, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21247439086437225, + "rewards/margins": 11.738574028015137, + "rewards/rejected": -11.526100158691406, + "step": 2040 + }, + { + "epoch": 0.5119240854039205, + "grad_norm": 0.0164794921875, + "learning_rate": 2.8314476180498003e-06, + "logits/chosen": -2.010568618774414, + "logits/rejected": -2.1947145462036133, + "logps/chosen": -0.8790448904037476, + "logps/rejected": -1320.770263671875, + "loss": 0.2275, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20843760669231415, + "rewards/margins": 12.884744644165039, + "rewards/rejected": -12.67630672454834, + "step": 2050 + }, + { + "epoch": 0.5144212760644276, + "grad_norm": 0.037109375, + "learning_rate": 2.8098292692808253e-06, + "logits/chosen": -2.1951942443847656, + "logits/rejected": -2.3474528789520264, + "logps/chosen": -0.8600829839706421, + "logps/rejected": -1061.1048583984375, + "loss": 0.2279, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20944657921791077, + "rewards/margins": 10.36804485321045, + "rewards/rejected": -10.158597946166992, + "step": 2060 + }, + { + "epoch": 0.5169184667249345, + "grad_norm": 0.0224609375, + "learning_rate": 2.7881873650885904e-06, + "logits/chosen": -2.1963181495666504, + "logits/rejected": -2.3679440021514893, + "logps/chosen": -0.8357653617858887, + "logps/rejected": -1268.226318359375, + "loss": 0.2271, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21145395934581757, + "rewards/margins": 12.395639419555664, + "rewards/rejected": -12.184186935424805, + "step": 2070 + }, + { + "epoch": 0.5194156573854414, + "grad_norm": 0.03955078125, + "learning_rate": 2.7665235508444772e-06, + "logits/chosen": -2.131880044937134, + "logits/rejected": -2.329930067062378, + "logps/chosen": -0.8339768648147583, + "logps/rejected": -1511.36962890625, + "loss": 0.2278, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2037159651517868, + "rewards/margins": 14.78296184539795, + "rewards/rejected": -14.579244613647461, + "step": 2080 + }, + { + "epoch": 0.5219128480459483, + "grad_norm": 0.1123046875, + "learning_rate": 2.7448394735856275e-06, + "logits/chosen": -2.0990092754364014, + "logits/rejected": -2.317046642303467, + "logps/chosen": -0.900246798992157, + "logps/rejected": -1560.1123046875, + "loss": 0.2273, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20990662276744843, + "rewards/margins": 15.233263969421387, + "rewards/rejected": -15.023355484008789, + "step": 2090 + }, + { + "epoch": 0.5244100387064552, + "grad_norm": 0.05810546875, + "learning_rate": 2.723136781889722e-06, + "logits/chosen": -2.221381664276123, + "logits/rejected": -2.4073383808135986, + "logps/chosen": -1.555213451385498, + "logps/rejected": -1313.25439453125, + "loss": 0.2281, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21150963008403778, + "rewards/margins": 12.850160598754883, + "rewards/rejected": -12.638651847839355, + "step": 2100 + }, + { + "epoch": 0.5269072293669622, + "grad_norm": 0.031494140625, + "learning_rate": 2.7014171257496414e-06, + "logits/chosen": -2.224299669265747, + "logits/rejected": -2.4082083702087402, + "logps/chosen": -1.5661276578903198, + "logps/rejected": -1288.989013671875, + "loss": 0.2263, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2089545726776123, + "rewards/margins": 12.412330627441406, + "rewards/rejected": -12.203374862670898, + "step": 2110 + }, + { + "epoch": 0.5294044200274691, + "grad_norm": 0.044189453125, + "learning_rate": 2.6796821564480237e-06, + "logits/chosen": -2.143993854522705, + "logits/rejected": -2.3330225944519043, + "logps/chosen": -1.3014509677886963, + "logps/rejected": -1159.53271484375, + "loss": 0.2263, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21552510559558868, + "rewards/margins": 11.207192420959473, + "rewards/rejected": -10.991667747497559, + "step": 2120 + }, + { + "epoch": 0.531901610687976, + "grad_norm": 0.032958984375, + "learning_rate": 2.6579335264317253e-06, + "logits/chosen": -2.2805047035217285, + "logits/rejected": -2.4840075969696045, + "logps/chosen": -0.6564453840255737, + "logps/rejected": -1376.549560546875, + "loss": 0.2269, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20875303447246552, + "rewards/margins": 13.3289794921875, + "rewards/rejected": -13.120226860046387, + "step": 2130 + }, + { + "epoch": 0.5343988013484829, + "grad_norm": 0.02587890625, + "learning_rate": 2.6361728891861843e-06, + "logits/chosen": -2.044534206390381, + "logits/rejected": -2.263455629348755, + "logps/chosen": -2.359926462173462, + "logps/rejected": -1182.7542724609375, + "loss": 0.2261, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21868690848350525, + "rewards/margins": 11.231551170349121, + "rewards/rejected": -11.01286506652832, + "step": 2140 + }, + { + "epoch": 0.5368959920089899, + "grad_norm": 0.0235595703125, + "learning_rate": 2.614401899109716e-06, + "logits/chosen": -2.2184996604919434, + "logits/rejected": -2.4115943908691406, + "logps/chosen": -0.7188009023666382, + "logps/rejected": -1362.302490234375, + "loss": 0.2252, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20849958062171936, + "rewards/margins": 13.323092460632324, + "rewards/rejected": -13.114593505859375, + "step": 2150 + }, + { + "epoch": 0.5393931826694968, + "grad_norm": 0.023681640625, + "learning_rate": 2.5926222113877282e-06, + "logits/chosen": -2.2279531955718994, + "logits/rejected": -2.4470245838165283, + "logps/chosen": -0.8932285308837891, + "logps/rejected": -1380.791748046875, + "loss": 0.2261, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20549210906028748, + "rewards/margins": 13.206730842590332, + "rewards/rejected": -13.001237869262695, + "step": 2160 + }, + { + "epoch": 0.5418903733300038, + "grad_norm": 0.09619140625, + "learning_rate": 2.570835481866889e-06, + "logits/chosen": -2.122584819793701, + "logits/rejected": -2.3029303550720215, + "logps/chosen": -0.6316767334938049, + "logps/rejected": -1331.388916015625, + "loss": 0.2263, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20623505115509033, + "rewards/margins": 13.001462936401367, + "rewards/rejected": -12.795228958129883, + "step": 2170 + }, + { + "epoch": 0.5443875639905107, + "grad_norm": 0.04248046875, + "learning_rate": 2.5490433669292337e-06, + "logits/chosen": -2.044675350189209, + "logits/rejected": -2.251300811767578, + "logps/chosen": -0.7981548309326172, + "logps/rejected": -1485.2850341796875, + "loss": 0.2262, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2164611518383026, + "rewards/margins": 14.53178596496582, + "rewards/rejected": -14.315322875976562, + "step": 2180 + }, + { + "epoch": 0.5468847546510176, + "grad_norm": 0.038330078125, + "learning_rate": 2.527247523366232e-06, + "logits/chosen": -2.2029881477355957, + "logits/rejected": -2.4012579917907715, + "logps/chosen": -1.3100454807281494, + "logps/rejected": -1426.16357421875, + "loss": 0.2252, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2108200490474701, + "rewards/margins": 13.932962417602539, + "rewards/rejected": -13.722142219543457, + "step": 2190 + }, + { + "epoch": 0.5493819453115245, + "grad_norm": 0.02978515625, + "learning_rate": 2.5054496082528336e-06, + "logits/chosen": -2.263662576675415, + "logits/rejected": -2.4767444133758545, + "logps/chosen": -0.6738319993019104, + "logps/rejected": -1380.506103515625, + "loss": 0.2246, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21471872925758362, + "rewards/margins": 13.528160095214844, + "rewards/rejected": -13.313441276550293, + "step": 2200 + }, + { + "epoch": 0.5518791359720314, + "grad_norm": 0.03955078125, + "learning_rate": 2.483651278821481e-06, + "logits/chosen": -2.2110023498535156, + "logits/rejected": -2.4015591144561768, + "logps/chosen": -1.228434443473816, + "logps/rejected": -1266.2230224609375, + "loss": 0.2262, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20852570235729218, + "rewards/margins": 12.340888977050781, + "rewards/rejected": -12.13236141204834, + "step": 2210 + }, + { + "epoch": 0.5543763266325384, + "grad_norm": 0.0238037109375, + "learning_rate": 2.4618541923361166e-06, + "logits/chosen": -2.3842873573303223, + "logits/rejected": -2.558562994003296, + "logps/chosen": -1.321533203125, + "logps/rejected": -1156.223876953125, + "loss": 0.2268, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20611576735973358, + "rewards/margins": 11.165016174316406, + "rewards/rejected": -10.958898544311523, + "step": 2220 + }, + { + "epoch": 0.5568735172930454, + "grad_norm": 0.06005859375, + "learning_rate": 2.4400600059661836e-06, + "logits/chosen": -2.069483757019043, + "logits/rejected": -2.31620717048645, + "logps/chosen": -1.093656301498413, + "logps/rejected": -1508.9503173828125, + "loss": 0.2254, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2100195437669754, + "rewards/margins": 14.735700607299805, + "rewards/rejected": -14.52568244934082, + "step": 2230 + }, + { + "epoch": 0.5593707079535523, + "grad_norm": 0.009765625, + "learning_rate": 2.41827037666064e-06, + "logits/chosen": -2.2314319610595703, + "logits/rejected": -2.4116859436035156, + "logps/chosen": -0.6631449460983276, + "logps/rejected": -1216.8101806640625, + "loss": 0.2259, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2116110622882843, + "rewards/margins": 11.8889799118042, + "rewards/rejected": -11.677370071411133, + "step": 2240 + }, + { + "epoch": 0.5618678986140592, + "grad_norm": 0.035888671875, + "learning_rate": 2.396486961021983e-06, + "logits/chosen": -2.156050443649292, + "logits/rejected": -2.355743885040283, + "logps/chosen": -0.5853773951530457, + "logps/rejected": -1307.397705078125, + "loss": 0.2272, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21641604602336884, + "rewards/margins": 12.776580810546875, + "rewards/rejected": -12.560165405273438, + "step": 2250 + }, + { + "epoch": 0.5643650892745661, + "grad_norm": 0.0247802734375, + "learning_rate": 2.3747114151802993e-06, + "logits/chosen": -2.2995388507843018, + "logits/rejected": -2.4979677200317383, + "logps/chosen": -1.0234979391098022, + "logps/rejected": -1314.0380859375, + "loss": 0.2272, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2088872194290161, + "rewards/margins": 12.864030838012695, + "rewards/rejected": -12.655143737792969, + "step": 2260 + }, + { + "epoch": 0.566862279935073, + "grad_norm": 0.04345703125, + "learning_rate": 2.352945394667363e-06, + "logits/chosen": -2.087890386581421, + "logits/rejected": -2.308422803878784, + "logps/chosen": -0.9035698771476746, + "logps/rejected": -1510.1090087890625, + "loss": 0.2248, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2111314833164215, + "rewards/margins": 14.675390243530273, + "rewards/rejected": -14.464259147644043, + "step": 2270 + }, + { + "epoch": 0.56935947059558, + "grad_norm": 0.126953125, + "learning_rate": 2.3311905542907627e-06, + "logits/chosen": -2.234039545059204, + "logits/rejected": -2.428889751434326, + "logps/chosen": -0.797686755657196, + "logps/rejected": -1220.7269287109375, + "loss": 0.2268, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2120717316865921, + "rewards/margins": 11.918030738830566, + "rewards/rejected": -11.70595932006836, + "step": 2280 + }, + { + "epoch": 0.5718566612560869, + "grad_norm": 0.06396484375, + "learning_rate": 2.30944854800809e-06, + "logits/chosen": -2.1873550415039062, + "logits/rejected": -2.3636820316314697, + "logps/chosen": -0.8641906976699829, + "logps/rejected": -1375.240478515625, + "loss": 0.2275, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21045894920825958, + "rewards/margins": 13.458274841308594, + "rewards/rejected": -13.247815132141113, + "step": 2290 + }, + { + "epoch": 0.5743538519165938, + "grad_norm": 0.0224609375, + "learning_rate": 2.287721028801204e-06, + "logits/chosen": -2.147500991821289, + "logits/rejected": -2.3285794258117676, + "logps/chosen": -1.5540382862091064, + "logps/rejected": -1261.9169921875, + "loss": 0.2262, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2090359479188919, + "rewards/margins": 12.258954048156738, + "rewards/rejected": -12.049917221069336, + "step": 2300 + }, + { + "epoch": 0.5768510425771007, + "grad_norm": 0.224609375, + "learning_rate": 2.26600964855055e-06, + "logits/chosen": -2.2112767696380615, + "logits/rejected": -2.387683868408203, + "logps/chosen": -1.0878078937530518, + "logps/rejected": -1259.334716796875, + "loss": 0.2262, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2047530710697174, + "rewards/margins": 12.326273918151855, + "rewards/rejected": -12.121520042419434, + "step": 2310 + }, + { + "epoch": 0.5793482332376076, + "grad_norm": 0.033203125, + "learning_rate": 2.244316057909573e-06, + "logits/chosen": -2.179072856903076, + "logits/rejected": -2.3518600463867188, + "logps/chosen": -0.5903832912445068, + "logps/rejected": -1252.9005126953125, + "loss": 0.2288, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20970389246940613, + "rewards/margins": 12.249414443969727, + "rewards/rejected": -12.039710998535156, + "step": 2320 + }, + { + "epoch": 0.5818454238981147, + "grad_norm": 0.043212890625, + "learning_rate": 2.2226419061792282e-06, + "logits/chosen": -2.2571616172790527, + "logits/rejected": -2.4548702239990234, + "logps/chosen": -0.747587263584137, + "logps/rejected": -1403.0311279296875, + "loss": 0.2256, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20638947188854218, + "rewards/margins": 13.71589183807373, + "rewards/rejected": -13.509503364562988, + "step": 2330 + }, + { + "epoch": 0.5843426145586216, + "grad_norm": 0.0079345703125, + "learning_rate": 2.200988841182589e-06, + "logits/chosen": -2.1915557384490967, + "logits/rejected": -2.3925371170043945, + "logps/chosen": -0.653125524520874, + "logps/rejected": -1481.6878662109375, + "loss": 0.2271, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20729561150074005, + "rewards/margins": 14.506765365600586, + "rewards/rejected": -14.299470901489258, + "step": 2340 + }, + { + "epoch": 0.5868398052191285, + "grad_norm": 0.0286865234375, + "learning_rate": 2.179358509139559e-06, + "logits/chosen": -2.149214267730713, + "logits/rejected": -2.344883680343628, + "logps/chosen": -2.6051526069641113, + "logps/rejected": -1142.56201171875, + "loss": 0.2253, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21485964953899384, + "rewards/margins": 11.022435188293457, + "rewards/rejected": -10.807573318481445, + "step": 2350 + }, + { + "epoch": 0.5893369958796354, + "grad_norm": 0.041748046875, + "learning_rate": 2.1577525545417254e-06, + "logits/chosen": -2.1596992015838623, + "logits/rejected": -2.3585286140441895, + "logps/chosen": -0.6524207592010498, + "logps/rejected": -1219.5198974609375, + "loss": 0.227, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2145775854587555, + "rewards/margins": 11.869647026062012, + "rewards/rejected": -11.655069351196289, + "step": 2360 + }, + { + "epoch": 0.5918341865401423, + "grad_norm": 0.203125, + "learning_rate": 2.1361726200273293e-06, + "logits/chosen": -2.247102737426758, + "logits/rejected": -2.4553802013397217, + "logps/chosen": -1.189576506614685, + "logps/rejected": -1349.142578125, + "loss": 0.2259, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21253642439842224, + "rewards/margins": 13.139638900756836, + "rewards/rejected": -12.927103042602539, + "step": 2370 + }, + { + "epoch": 0.5943313772006493, + "grad_norm": 0.035888671875, + "learning_rate": 2.1146203462563773e-06, + "logits/chosen": -2.302658796310425, + "logits/rejected": -2.4925646781921387, + "logps/chosen": -0.5675852298736572, + "logps/rejected": -1279.3642578125, + "loss": 0.2272, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20174381136894226, + "rewards/margins": 12.505022048950195, + "rewards/rejected": -12.303278923034668, + "step": 2380 + }, + { + "epoch": 0.5968285678611562, + "grad_norm": 0.024169921875, + "learning_rate": 2.0930973717859117e-06, + "logits/chosen": -2.3194613456726074, + "logits/rejected": -2.526947498321533, + "logps/chosen": -0.6186977624893188, + "logps/rejected": -1298.871826171875, + "loss": 0.226, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21502625942230225, + "rewards/margins": 12.666671752929688, + "rewards/rejected": -12.45164680480957, + "step": 2390 + }, + { + "epoch": 0.5993257585216631, + "grad_norm": 0.0150146484375, + "learning_rate": 2.0716053329454337e-06, + "logits/chosen": -2.0586659908294678, + "logits/rejected": -2.262817621231079, + "logps/chosen": -1.2787067890167236, + "logps/rejected": -1338.0716552734375, + "loss": 0.2271, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21538551151752472, + "rewards/margins": 12.998420715332031, + "rewards/rejected": -12.783034324645996, + "step": 2400 + }, + { + "epoch": 0.60182294918217, + "grad_norm": 0.03369140625, + "learning_rate": 2.0501458637124963e-06, + "logits/chosen": -2.1946122646331787, + "logits/rejected": -2.4308152198791504, + "logps/chosen": -0.9974037408828735, + "logps/rejected": -1574.956787109375, + "loss": 0.2249, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21448758244514465, + "rewards/margins": 15.443408012390137, + "rewards/rejected": -15.228919982910156, + "step": 2410 + }, + { + "epoch": 0.604320139842677, + "grad_norm": 0.021484375, + "learning_rate": 2.0287205955884812e-06, + "logits/chosen": -2.1859405040740967, + "logits/rejected": -2.419334888458252, + "logps/chosen": -1.4137351512908936, + "logps/rejected": -1243.725830078125, + "loss": 0.2266, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21700558066368103, + "rewards/margins": 11.884050369262695, + "rewards/rejected": -11.667045593261719, + "step": 2420 + }, + { + "epoch": 0.6068173305031839, + "grad_norm": 0.0595703125, + "learning_rate": 2.0073311574745583e-06, + "logits/chosen": -2.162872791290283, + "logits/rejected": -2.378561019897461, + "logps/chosen": -0.7903895378112793, + "logps/rejected": -1427.887939453125, + "loss": 0.2243, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21638064086437225, + "rewards/margins": 13.9492769241333, + "rewards/rejected": -13.73289680480957, + "step": 2430 + }, + { + "epoch": 0.6093145211636909, + "grad_norm": 0.037841796875, + "learning_rate": 1.9859791755478453e-06, + "logits/chosen": -2.1776349544525146, + "logits/rejected": -2.3626227378845215, + "logps/chosen": -1.0283732414245605, + "logps/rejected": -1148.4774169921875, + "loss": 0.2259, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21198368072509766, + "rewards/margins": 11.212942123413086, + "rewards/rejected": -11.000959396362305, + "step": 2440 + }, + { + "epoch": 0.6118117118241978, + "grad_norm": 0.0311279296875, + "learning_rate": 1.9646662731377737e-06, + "logits/chosen": -2.130434989929199, + "logits/rejected": -2.3274593353271484, + "logps/chosen": -0.7933204770088196, + "logps/rejected": -1231.31201171875, + "loss": 0.2282, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20909221470355988, + "rewards/margins": 11.990662574768066, + "rewards/rejected": -11.781569480895996, + "step": 2450 + }, + { + "epoch": 0.6143089024847047, + "grad_norm": 0.06689453125, + "learning_rate": 1.9433940706026743e-06, + "logits/chosen": -2.162235736846924, + "logits/rejected": -2.3636813163757324, + "logps/chosen": -0.8596396446228027, + "logps/rejected": -1512.643798828125, + "loss": 0.2267, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21136781573295593, + "rewards/margins": 14.79273796081543, + "rewards/rejected": -14.581372261047363, + "step": 2460 + }, + { + "epoch": 0.6168060931452116, + "grad_norm": 0.040283203125, + "learning_rate": 1.9221641852065807e-06, + "logits/chosen": -2.153958797454834, + "logits/rejected": -2.322754383087158, + "logps/chosen": -0.7868290543556213, + "logps/rejected": -1277.087890625, + "loss": 0.2275, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21162231266498566, + "rewards/margins": 12.481648445129395, + "rewards/rejected": -12.270025253295898, + "step": 2470 + }, + { + "epoch": 0.6193032838057185, + "grad_norm": 0.0302734375, + "learning_rate": 1.9009782309962805e-06, + "logits/chosen": -2.2541210651397705, + "logits/rejected": -2.451572895050049, + "logps/chosen": -0.9773980379104614, + "logps/rejected": -1259.029296875, + "loss": 0.2243, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21588608622550964, + "rewards/margins": 12.217048645019531, + "rewards/rejected": -12.001164436340332, + "step": 2480 + }, + { + "epoch": 0.6218004744662256, + "grad_norm": 0.042236328125, + "learning_rate": 1.8798378186785979e-06, + "logits/chosen": -2.208289623260498, + "logits/rejected": -2.3975791931152344, + "logps/chosen": -0.47841542959213257, + "logps/rejected": -1317.6165771484375, + "loss": 0.2268, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21394380927085876, + "rewards/margins": 12.884051322937012, + "rewards/rejected": -12.670106887817383, + "step": 2490 + }, + { + "epoch": 0.6242976651267325, + "grad_norm": 0.0172119140625, + "learning_rate": 1.8587445554979404e-06, + "logits/chosen": -2.054529905319214, + "logits/rejected": -2.2491745948791504, + "logps/chosen": -0.9916723370552063, + "logps/rejected": -1467.903076171875, + "loss": 0.2261, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21281161904335022, + "rewards/margins": 14.350473403930664, + "rewards/rejected": -14.137661933898926, + "step": 2500 + }, + { + "epoch": 0.6267948557872394, + "grad_norm": 0.04052734375, + "learning_rate": 1.8377000451141013e-06, + "logits/chosen": -2.1033387184143066, + "logits/rejected": -2.311828136444092, + "logps/chosen": -1.013270616531372, + "logps/rejected": -1430.32568359375, + "loss": 0.2256, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2092626839876175, + "rewards/margins": 13.935432434082031, + "rewards/rejected": -13.726168632507324, + "step": 2510 + }, + { + "epoch": 0.6292920464477463, + "grad_norm": 0.033203125, + "learning_rate": 1.8167058874803405e-06, + "logits/chosen": -2.2198266983032227, + "logits/rejected": -2.435263156890869, + "logps/chosen": -1.5374799966812134, + "logps/rejected": -1410.561279296875, + "loss": 0.2258, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21642649173736572, + "rewards/margins": 13.645418167114258, + "rewards/rejected": -13.428991317749023, + "step": 2520 + }, + { + "epoch": 0.6317892371082532, + "grad_norm": 0.025390625, + "learning_rate": 1.7957636787217451e-06, + "logits/chosen": -2.1474337577819824, + "logits/rejected": -2.3489108085632324, + "logps/chosen": -0.525337278842926, + "logps/rejected": -1465.3909912109375, + "loss": 0.2257, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21127943694591522, + "rewards/margins": 14.354647636413574, + "rewards/rejected": -14.1433687210083, + "step": 2530 + }, + { + "epoch": 0.6342864277687601, + "grad_norm": 0.045166015625, + "learning_rate": 1.7748750110138768e-06, + "logits/chosen": -2.1010701656341553, + "logits/rejected": -2.3061635494232178, + "logps/chosen": -1.495689034461975, + "logps/rejected": -1522.001953125, + "loss": 0.2248, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21690087020397186, + "rewards/margins": 14.7809476852417, + "rewards/rejected": -14.564045906066895, + "step": 2540 + }, + { + "epoch": 0.6367836184292671, + "grad_norm": 0.0771484375, + "learning_rate": 1.7540414724617282e-06, + "logits/chosen": -2.0483648777008057, + "logits/rejected": -2.2502453327178955, + "logps/chosen": -1.7171008586883545, + "logps/rejected": -1322.4296875, + "loss": 0.2253, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21715514361858368, + "rewards/margins": 12.821990966796875, + "rewards/rejected": -12.604835510253906, + "step": 2550 + }, + { + "epoch": 0.639280809089774, + "grad_norm": 0.0155029296875, + "learning_rate": 1.7332646469789827e-06, + "logits/chosen": -2.2271251678466797, + "logits/rejected": -2.4021248817443848, + "logps/chosen": -0.7044438719749451, + "logps/rejected": -1151.026611328125, + "loss": 0.2256, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21061739325523376, + "rewards/margins": 11.253252983093262, + "rewards/rejected": -11.042635917663574, + "step": 2560 + }, + { + "epoch": 0.6417779997502809, + "grad_norm": 0.031494140625, + "learning_rate": 1.7125461141675881e-06, + "logits/chosen": -2.115159034729004, + "logits/rejected": -2.321096181869507, + "logps/chosen": -1.5179011821746826, + "logps/rejected": -1341.9727783203125, + "loss": 0.2269, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21012921631336212, + "rewards/margins": 13.043965339660645, + "rewards/rejected": -12.833836555480957, + "step": 2570 + }, + { + "epoch": 0.6442751904107878, + "grad_norm": 0.07177734375, + "learning_rate": 1.6918874491976744e-06, + "logits/chosen": -2.262359619140625, + "logits/rejected": -2.4549667835235596, + "logps/chosen": -1.1417173147201538, + "logps/rejected": -1349.908203125, + "loss": 0.2249, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21297034621238708, + "rewards/margins": 13.129191398620605, + "rewards/rejected": -12.916219711303711, + "step": 2580 + }, + { + "epoch": 0.6467723810712948, + "grad_norm": 0.019287109375, + "learning_rate": 1.6712902226877917e-06, + "logits/chosen": -2.1325788497924805, + "logits/rejected": -2.323542356491089, + "logps/chosen": -1.002483606338501, + "logps/rejected": -1407.6922607421875, + "loss": 0.2264, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21138958632946014, + "rewards/margins": 13.752557754516602, + "rewards/rejected": -13.541168212890625, + "step": 2590 + }, + { + "epoch": 0.6492695717318018, + "grad_norm": 0.046875, + "learning_rate": 1.6507560005854977e-06, + "logits/chosen": -2.0466830730438232, + "logits/rejected": -2.254211664199829, + "logps/chosen": -1.2699908018112183, + "logps/rejected": -1284.965576171875, + "loss": 0.2255, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.22062024474143982, + "rewards/margins": 12.382702827453613, + "rewards/rejected": -12.16208267211914, + "step": 2600 + }, + { + "epoch": 0.6517667623923087, + "grad_norm": 0.0283203125, + "learning_rate": 1.6302863440483121e-06, + "logits/chosen": -2.102281093597412, + "logits/rejected": -2.344468832015991, + "logps/chosen": -0.9672495722770691, + "logps/rejected": -1371.63232421875, + "loss": 0.2237, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2286236733198166, + "rewards/margins": 13.317922592163086, + "rewards/rejected": -13.089300155639648, + "step": 2610 + }, + { + "epoch": 0.6542639530528156, + "grad_norm": 0.10302734375, + "learning_rate": 1.6098828093250203e-06, + "logits/chosen": -2.012927770614624, + "logits/rejected": -2.23055100440979, + "logps/chosen": -2.223574161529541, + "logps/rejected": -1439.184326171875, + "loss": 0.2261, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2114640474319458, + "rewards/margins": 13.80018424987793, + "rewards/rejected": -13.588720321655273, + "step": 2620 + }, + { + "epoch": 0.6567611437133225, + "grad_norm": 0.0201416015625, + "learning_rate": 1.5895469476373545e-06, + "logits/chosen": -2.0998306274414062, + "logits/rejected": -2.284853935241699, + "logps/chosen": -1.0365889072418213, + "logps/rejected": -1287.3863525390625, + "loss": 0.2258, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21419724822044373, + "rewards/margins": 12.467586517333984, + "rewards/rejected": -12.253389358520508, + "step": 2630 + }, + { + "epoch": 0.6592583343738294, + "grad_norm": 0.048583984375, + "learning_rate": 1.5692803050620642e-06, + "logits/chosen": -2.1266770362854004, + "logits/rejected": -2.341489553451538, + "logps/chosen": -1.9875209331512451, + "logps/rejected": -1219.6407470703125, + "loss": 0.2258, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21611304581165314, + "rewards/margins": 11.721773147583008, + "rewards/rejected": -11.505661010742188, + "step": 2640 + }, + { + "epoch": 0.6617555250343363, + "grad_norm": 0.045166015625, + "learning_rate": 1.5490844224133717e-06, + "logits/chosen": -2.178802251815796, + "logits/rejected": -2.3850629329681396, + "logps/chosen": -1.1978418827056885, + "logps/rejected": -1456.7591552734375, + "loss": 0.2253, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2064187228679657, + "rewards/margins": 14.242889404296875, + "rewards/rejected": -14.036470413208008, + "step": 2650 + }, + { + "epoch": 0.6642527156948433, + "grad_norm": 0.02978515625, + "learning_rate": 1.528960835125822e-06, + "logits/chosen": -2.3235323429107666, + "logits/rejected": -2.508779525756836, + "logps/chosen": -0.7140904664993286, + "logps/rejected": -1262.5396728515625, + "loss": 0.2256, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2111320048570633, + "rewards/margins": 12.3548002243042, + "rewards/rejected": -12.143668174743652, + "step": 2660 + }, + { + "epoch": 0.6667499063553503, + "grad_norm": 0.04541015625, + "learning_rate": 1.5089110731375568e-06, + "logits/chosen": -2.1535146236419678, + "logits/rejected": -2.346010446548462, + "logps/chosen": -1.2154910564422607, + "logps/rejected": -1353.01416015625, + "loss": 0.226, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21193823218345642, + "rewards/margins": 13.174649238586426, + "rewards/rejected": -12.962712287902832, + "step": 2670 + }, + { + "epoch": 0.6692470970158572, + "grad_norm": 0.0224609375, + "learning_rate": 1.4889366607739925e-06, + "logits/chosen": -2.2847390174865723, + "logits/rejected": -2.437983989715576, + "logps/chosen": -0.47022026777267456, + "logps/rejected": -1079.610595703125, + "loss": 0.2268, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20733828842639923, + "rewards/margins": 10.533978462219238, + "rewards/rejected": -10.326639175415039, + "step": 2680 + }, + { + "epoch": 0.6717442876763641, + "grad_norm": 0.0517578125, + "learning_rate": 1.4690391166319307e-06, + "logits/chosen": -2.091798782348633, + "logits/rejected": -2.286367177963257, + "logps/chosen": -0.8848400115966797, + "logps/rejected": -1370.623046875, + "loss": 0.2255, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21229073405265808, + "rewards/margins": 13.300837516784668, + "rewards/rejected": -13.088546752929688, + "step": 2690 + }, + { + "epoch": 0.674241478336871, + "grad_norm": 0.058837890625, + "learning_rate": 1.4492199534641055e-06, + "logits/chosen": -2.1903884410858154, + "logits/rejected": -2.389869451522827, + "logps/chosen": -0.7620021104812622, + "logps/rejected": -1357.733642578125, + "loss": 0.2269, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2085207998752594, + "rewards/margins": 13.300318717956543, + "rewards/rejected": -13.091796875, + "step": 2700 + }, + { + "epoch": 0.676738668997378, + "grad_norm": 0.050048828125, + "learning_rate": 1.429480678064174e-06, + "logits/chosen": -2.1907572746276855, + "logits/rejected": -2.4412574768066406, + "logps/chosen": -1.4903779029846191, + "logps/rejected": -1532.8353271484375, + "loss": 0.2235, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.22088858485221863, + "rewards/margins": 14.948209762573242, + "rewards/rejected": -14.727320671081543, + "step": 2710 + }, + { + "epoch": 0.6792358596578849, + "grad_norm": 0.04833984375, + "learning_rate": 1.4098227911521523e-06, + "logits/chosen": -2.1927340030670166, + "logits/rejected": -2.384458065032959, + "logps/chosen": -1.0519030094146729, + "logps/rejected": -1408.5384521484375, + "loss": 0.2247, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21749384701251984, + "rewards/margins": 13.769442558288574, + "rewards/rejected": -13.551948547363281, + "step": 2720 + }, + { + "epoch": 0.6817330503183918, + "grad_norm": 0.0206298828125, + "learning_rate": 1.3902477872603295e-06, + "logits/chosen": -2.292635440826416, + "logits/rejected": -2.4606173038482666, + "logps/chosen": -1.3724099397659302, + "logps/rejected": -1059.629638671875, + "loss": 0.2292, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2094014585018158, + "rewards/margins": 10.156023025512695, + "rewards/rejected": -9.946621894836426, + "step": 2730 + }, + { + "epoch": 0.6842302409788987, + "grad_norm": 0.06494140625, + "learning_rate": 1.370757154619638e-06, + "logits/chosen": -2.2135720252990723, + "logits/rejected": -2.4035539627075195, + "logps/chosen": -0.8492560386657715, + "logps/rejected": -1440.1517333984375, + "loss": 0.2249, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21952596306800842, + "rewards/margins": 13.991949081420898, + "rewards/rejected": -13.772422790527344, + "step": 2740 + }, + { + "epoch": 0.6867274316394056, + "grad_norm": 0.0546875, + "learning_rate": 1.3513523750465049e-06, + "logits/chosen": -2.2055509090423584, + "logits/rejected": -2.3952600955963135, + "logps/chosen": -0.848610520362854, + "logps/rejected": -1253.37841796875, + "loss": 0.2269, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2106543481349945, + "rewards/margins": 12.153672218322754, + "rewards/rejected": -11.94301700592041, + "step": 2750 + }, + { + "epoch": 0.6892246222999125, + "grad_norm": 0.0078125, + "learning_rate": 1.332034923830199e-06, + "logits/chosen": -2.1199612617492676, + "logits/rejected": -2.3331620693206787, + "logps/chosen": -0.572918176651001, + "logps/rejected": -1314.5574951171875, + "loss": 0.2245, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21192285418510437, + "rewards/margins": 12.847297668457031, + "rewards/rejected": -12.635372161865234, + "step": 2760 + }, + { + "epoch": 0.6917218129604196, + "grad_norm": 0.05322265625, + "learning_rate": 1.31280626962067e-06, + "logits/chosen": -2.242522716522217, + "logits/rejected": -2.4255213737487793, + "logps/chosen": -0.6031197905540466, + "logps/rejected": -1176.1724853515625, + "loss": 0.2251, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21441105008125305, + "rewards/margins": 11.398508071899414, + "rewards/rejected": -11.18409538269043, + "step": 2770 + }, + { + "epoch": 0.6942190036209265, + "grad_norm": 0.03759765625, + "learning_rate": 1.2936678743168813e-06, + "logits/chosen": -2.1787726879119873, + "logits/rejected": -2.379664659500122, + "logps/chosen": -0.6903184652328491, + "logps/rejected": -1316.2584228515625, + "loss": 0.228, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2146139144897461, + "rewards/margins": 12.865110397338867, + "rewards/rejected": -12.650495529174805, + "step": 2780 + }, + { + "epoch": 0.6967161942814334, + "grad_norm": 0.033935546875, + "learning_rate": 1.2746211929556777e-06, + "logits/chosen": -2.1566481590270996, + "logits/rejected": -2.4140141010284424, + "logps/chosen": -0.8048852682113647, + "logps/rejected": -1725.7884521484375, + "loss": 0.2244, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21056988835334778, + "rewards/margins": 16.911666870117188, + "rewards/rejected": -16.70109748840332, + "step": 2790 + }, + { + "epoch": 0.6992133849419403, + "grad_norm": 0.1259765625, + "learning_rate": 1.2556676736011558e-06, + "logits/chosen": -2.1705546379089355, + "logits/rejected": -2.36136531829834, + "logps/chosen": -1.7305570840835571, + "logps/rejected": -1468.9334716796875, + "loss": 0.2251, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21282191574573517, + "rewards/margins": 14.295863151550293, + "rewards/rejected": -14.083041191101074, + "step": 2800 + }, + { + "epoch": 0.7017105756024472, + "grad_norm": 0.0264892578125, + "learning_rate": 1.2368087572345772e-06, + "logits/chosen": -2.2008700370788574, + "logits/rejected": -2.3622145652770996, + "logps/chosen": -0.9749493598937988, + "logps/rejected": -1153.006103515625, + "loss": 0.2278, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20674769580364227, + "rewards/margins": 11.259106636047363, + "rewards/rejected": -11.052358627319336, + "step": 2810 + }, + { + "epoch": 0.7042077662629542, + "grad_norm": 0.07275390625, + "learning_rate": 1.2180458776448067e-06, + "logits/chosen": -2.183065891265869, + "logits/rejected": -2.4031364917755127, + "logps/chosen": -1.3278162479400635, + "logps/rejected": -1352.5931396484375, + "loss": 0.2276, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21398906409740448, + "rewards/margins": 13.097040176391602, + "rewards/rejected": -12.883050918579102, + "step": 2820 + }, + { + "epoch": 0.7067049569234611, + "grad_norm": 0.04150390625, + "learning_rate": 1.1993804613193158e-06, + "logits/chosen": -2.166015625, + "logits/rejected": -2.376171827316284, + "logps/chosen": -0.8504392504692078, + "logps/rejected": -1218.2906494140625, + "loss": 0.2242, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2224057912826538, + "rewards/margins": 11.764514923095703, + "rewards/rejected": -11.542108535766602, + "step": 2830 + }, + { + "epoch": 0.709202147583968, + "grad_norm": 0.0791015625, + "learning_rate": 1.1808139273357232e-06, + "logits/chosen": -2.1249091625213623, + "logits/rejected": -2.324924945831299, + "logps/chosen": -1.2602803707122803, + "logps/rejected": -1440.6927490234375, + "loss": 0.2271, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21478672325611115, + "rewards/margins": 13.95526123046875, + "rewards/rejected": -13.740473747253418, + "step": 2840 + }, + { + "epoch": 0.711699338244475, + "grad_norm": 0.0186767578125, + "learning_rate": 1.1623476872539108e-06, + "logits/chosen": -2.1342732906341553, + "logits/rejected": -2.3520121574401855, + "logps/chosen": -1.0939338207244873, + "logps/rejected": -1569.9661865234375, + "loss": 0.2256, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.209732323884964, + "rewards/margins": 15.359413146972656, + "rewards/rejected": -15.149681091308594, + "step": 2850 + }, + { + "epoch": 0.7141965289049819, + "grad_norm": 0.0206298828125, + "learning_rate": 1.1439831450087032e-06, + "logits/chosen": -2.1833555698394775, + "logits/rejected": -2.408240795135498, + "logps/chosen": -1.4031983613967896, + "logps/rejected": -1495.2554931640625, + "loss": 0.2271, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20904704928398132, + "rewards/margins": 14.592279434204102, + "rewards/rejected": -14.383232116699219, + "step": 2860 + }, + { + "epoch": 0.7166937195654888, + "grad_norm": 0.048828125, + "learning_rate": 1.1257216968031357e-06, + "logits/chosen": -2.1499791145324707, + "logits/rejected": -2.3467013835906982, + "logps/chosen": -0.6740778684616089, + "logps/rejected": -1315.198486328125, + "loss": 0.2264, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2084966003894806, + "rewards/margins": 12.877813339233398, + "rewards/rejected": -12.669316291809082, + "step": 2870 + }, + { + "epoch": 0.7191909102259958, + "grad_norm": 0.099609375, + "learning_rate": 1.1075647310022974e-06, + "logits/chosen": -2.293015956878662, + "logits/rejected": -2.477437973022461, + "logps/chosen": -0.6577932238578796, + "logps/rejected": -1144.9639892578125, + "loss": 0.2249, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21200187504291534, + "rewards/margins": 11.185698509216309, + "rewards/rejected": -10.973695755004883, + "step": 2880 + }, + { + "epoch": 0.7216881008865027, + "grad_norm": 0.0341796875, + "learning_rate": 1.0895136280277863e-06, + "logits/chosen": -2.1305599212646484, + "logits/rejected": -2.3395919799804688, + "logps/chosen": -0.9710084795951843, + "logps/rejected": -1521.902099609375, + "loss": 0.2256, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21521084010601044, + "rewards/margins": 14.770858764648438, + "rewards/rejected": -14.555648803710938, + "step": 2890 + }, + { + "epoch": 0.7241852915470096, + "grad_norm": 0.02685546875, + "learning_rate": 1.0715697602527542e-06, + "logits/chosen": -1.9920504093170166, + "logits/rejected": -2.2198596000671387, + "logps/chosen": -0.49225324392318726, + "logps/rejected": -1440.2822265625, + "loss": 0.2255, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21176087856292725, + "rewards/margins": 13.888765335083008, + "rewards/rejected": -13.67700481414795, + "step": 2900 + }, + { + "epoch": 0.7266824822075165, + "grad_norm": 0.05322265625, + "learning_rate": 1.0537344918975708e-06, + "logits/chosen": -2.1923391819000244, + "logits/rejected": -2.3587305545806885, + "logps/chosen": -2.3005270957946777, + "logps/rejected": -1118.677490234375, + "loss": 0.2245, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.22067633271217346, + "rewards/margins": 10.68152141571045, + "rewards/rejected": -10.460844993591309, + "step": 2910 + }, + { + "epoch": 0.7291796728680234, + "grad_norm": 0.0277099609375, + "learning_rate": 1.036009178926107e-06, + "logits/chosen": -2.162017822265625, + "logits/rejected": -2.350229263305664, + "logps/chosen": -0.4403456151485443, + "logps/rejected": -1365.908203125, + "loss": 0.2254, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21478745341300964, + "rewards/margins": 13.359176635742188, + "rewards/rejected": -13.144391059875488, + "step": 2920 + }, + { + "epoch": 0.7316768635285305, + "grad_norm": 0.2041015625, + "learning_rate": 1.0183951689426438e-06, + "logits/chosen": -2.0874218940734863, + "logits/rejected": -2.286980152130127, + "logps/chosen": -1.1334517002105713, + "logps/rejected": -1574.8843994140625, + "loss": 0.2267, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20781561732292175, + "rewards/margins": 15.404953002929688, + "rewards/rejected": -15.197137832641602, + "step": 2930 + }, + { + "epoch": 0.7341740541890374, + "grad_norm": 0.033203125, + "learning_rate": 1.0008938010894156e-06, + "logits/chosen": -2.05769419670105, + "logits/rejected": -2.291485548019409, + "logps/chosen": -0.6213763356208801, + "logps/rejected": -1545.57421875, + "loss": 0.2253, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21286337077617645, + "rewards/margins": 15.127001762390137, + "rewards/rejected": -14.914140701293945, + "step": 2940 + }, + { + "epoch": 0.7366712448495443, + "grad_norm": 0.06005859375, + "learning_rate": 9.83506405944804e-07, + "logits/chosen": -2.0132200717926025, + "logits/rejected": -2.2228617668151855, + "logps/chosen": -1.0132176876068115, + "logps/rejected": -1225.736572265625, + "loss": 0.2248, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21595139801502228, + "rewards/margins": 11.760801315307617, + "rewards/rejected": -11.544851303100586, + "step": 2950 + }, + { + "epoch": 0.7391684355100512, + "grad_norm": 0.02099609375, + "learning_rate": 9.662343054221743e-07, + "logits/chosen": -2.038722515106201, + "logits/rejected": -2.254706621170044, + "logps/chosen": -1.0080900192260742, + "logps/rejected": -1486.7254638671875, + "loss": 0.2248, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21779987215995789, + "rewards/margins": 14.319659233093262, + "rewards/rejected": -14.101860046386719, + "step": 2960 + }, + { + "epoch": 0.7416656261705581, + "grad_norm": 0.033447265625, + "learning_rate": 9.490788126693754e-07, + "logits/chosen": -2.05572247505188, + "logits/rejected": -2.270496129989624, + "logps/chosen": -1.580960988998413, + "logps/rejected": -1349.623779296875, + "loss": 0.2259, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21096165478229523, + "rewards/margins": 13.037325859069824, + "rewards/rejected": -12.826364517211914, + "step": 2970 + }, + { + "epoch": 0.744162816831065, + "grad_norm": 0.034423828125, + "learning_rate": 9.32041231968904e-07, + "logits/chosen": -2.135493040084839, + "logits/rejected": -2.3431620597839355, + "logps/chosen": -0.692672848701477, + "logps/rejected": -1422.2606201171875, + "loss": 0.2249, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21278159320354462, + "rewards/margins": 13.839566230773926, + "rewards/rejected": -13.626785278320312, + "step": 2980 + }, + { + "epoch": 0.746660007491572, + "grad_norm": 0.038330078125, + "learning_rate": 9.151228586387464e-07, + "logits/chosen": -2.1877083778381348, + "logits/rejected": -2.3766164779663086, + "logps/chosen": -0.7439475655555725, + "logps/rejected": -1241.116943359375, + "loss": 0.2292, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2107519656419754, + "rewards/margins": 12.074844360351562, + "rewards/rejected": -11.864092826843262, + "step": 2990 + }, + { + "epoch": 0.7491571981520789, + "grad_norm": 0.051025390625, + "learning_rate": 8.983249789338941e-07, + "logits/chosen": -2.150568723678589, + "logits/rejected": -2.329155445098877, + "logps/chosen": -0.8139511346817017, + "logps/rejected": -1264.535888671875, + "loss": 0.2264, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20758719742298126, + "rewards/margins": 12.365687370300293, + "rewards/rejected": -12.158101081848145, + "step": 3000 + }, + { + "epoch": 0.7491571981520789, + "eval_logits/chosen": -2.5715415477752686, + "eval_logits/rejected": -2.65895676612854, + "eval_logps/chosen": -0.12666501104831696, + "eval_logps/rejected": -650.5204467773438, + "eval_loss": 0.22132699191570282, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 0.25786837935447693, + "eval_rewards/margins": 6.314352512359619, + "eval_rewards/rejected": -6.056484222412109, + "eval_runtime": 0.6559, + "eval_samples_per_second": 7.623, + "eval_steps_per_second": 4.574, + "step": 3000 + }, + { + "epoch": 0.7516543888125858, + "grad_norm": 0.0194091796875, + "learning_rate": 8.816488699485593e-07, + "logits/chosen": -2.176842212677002, + "logits/rejected": -2.3571324348449707, + "logps/chosen": -0.4218795895576477, + "logps/rejected": -1318.904052734375, + "loss": 0.2254, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21097974479198456, + "rewards/margins": 12.884817123413086, + "rewards/rejected": -12.67383861541748, + "step": 3010 + }, + { + "epoch": 0.7541515794730927, + "grad_norm": 0.041015625, + "learning_rate": 8.650957995190784e-07, + "logits/chosen": -2.1513025760650635, + "logits/rejected": -2.3777430057525635, + "logps/chosen": -1.3863859176635742, + "logps/rejected": -1556.2415771484375, + "loss": 0.2249, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21383562684059143, + "rewards/margins": 15.212430000305176, + "rewards/rejected": -14.9985933303833, + "step": 3020 + }, + { + "epoch": 0.7566487701335997, + "grad_norm": 0.029541015625, + "learning_rate": 8.486670261275193e-07, + "logits/chosen": -2.252506732940674, + "logits/rejected": -2.452782392501831, + "logps/chosen": -0.9220790863037109, + "logps/rejected": -1333.130615234375, + "loss": 0.226, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20681767165660858, + "rewards/margins": 13.058314323425293, + "rewards/rejected": -12.851496696472168, + "step": 3030 + }, + { + "epoch": 0.7591459607941067, + "grad_norm": 0.0244140625, + "learning_rate": 8.32363798806011e-07, + "logits/chosen": -2.2259833812713623, + "logits/rejected": -2.4163031578063965, + "logps/chosen": -0.6000443696975708, + "logps/rejected": -1395.551513671875, + "loss": 0.2265, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21475176513195038, + "rewards/margins": 13.636807441711426, + "rewards/rejected": -13.42205810546875, + "step": 3040 + }, + { + "epoch": 0.7616431514546136, + "grad_norm": 0.02490234375, + "learning_rate": 8.161873570417742e-07, + "logits/chosen": -2.1769793033599854, + "logits/rejected": -2.3828330039978027, + "logps/chosen": -0.49710139632225037, + "logps/rejected": -1448.161865234375, + "loss": 0.2243, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2149210423231125, + "rewards/margins": 14.16187858581543, + "rewards/rejected": -13.946958541870117, + "step": 3050 + }, + { + "epoch": 0.7641403421151205, + "grad_norm": 0.04931640625, + "learning_rate": 8.001389306828897e-07, + "logits/chosen": -2.094914674758911, + "logits/rejected": -2.325759172439575, + "logps/chosen": -1.7604669332504272, + "logps/rejected": -1601.197998046875, + "loss": 0.2249, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2165949046611786, + "rewards/margins": 15.435786247253418, + "rewards/rejected": -15.219189643859863, + "step": 3060 + }, + { + "epoch": 0.7666375327756274, + "grad_norm": 0.056884765625, + "learning_rate": 7.842197398447993e-07, + "logits/chosen": -2.119885206222534, + "logits/rejected": -2.3199260234832764, + "logps/chosen": -1.7511274814605713, + "logps/rejected": -1411.588623046875, + "loss": 0.2263, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2108723670244217, + "rewards/margins": 13.716115951538086, + "rewards/rejected": -13.505243301391602, + "step": 3070 + }, + { + "epoch": 0.7691347234361343, + "grad_norm": 0.050048828125, + "learning_rate": 7.684309948175414e-07, + "logits/chosen": -2.0922672748565674, + "logits/rejected": -2.2642672061920166, + "logps/chosen": -0.6221259832382202, + "logps/rejected": -1387.583251953125, + "loss": 0.2247, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2143571376800537, + "rewards/margins": 13.555415153503418, + "rewards/rejected": -13.341056823730469, + "step": 3080 + }, + { + "epoch": 0.7716319140966412, + "grad_norm": 0.0257568359375, + "learning_rate": 7.527738959737371e-07, + "logits/chosen": -2.1526269912719727, + "logits/rejected": -2.3576555252075195, + "logps/chosen": -1.0096280574798584, + "logps/rejected": -1377.4000244140625, + "loss": 0.2246, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2141624391078949, + "rewards/margins": 13.408686637878418, + "rewards/rejected": -13.194523811340332, + "step": 3090 + }, + { + "epoch": 0.7741291047571482, + "grad_norm": 0.054443359375, + "learning_rate": 7.372496336773269e-07, + "logits/chosen": -2.1142802238464355, + "logits/rejected": -2.297616958618164, + "logps/chosen": -0.8569754362106323, + "logps/rejected": -1148.1827392578125, + "loss": 0.2277, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21184638142585754, + "rewards/margins": 11.140499114990234, + "rewards/rejected": -10.9286527633667, + "step": 3100 + }, + { + "epoch": 0.7766262954176552, + "grad_norm": 0.0341796875, + "learning_rate": 7.218593881930744e-07, + "logits/chosen": -2.2074034214019775, + "logits/rejected": -2.391183853149414, + "logps/chosen": -0.8030778765678406, + "logps/rejected": -1229.139404296875, + "loss": 0.2271, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2128894329071045, + "rewards/margins": 12.000519752502441, + "rewards/rejected": -11.787630081176758, + "step": 3110 + }, + { + "epoch": 0.7791234860781621, + "grad_norm": 0.040283203125, + "learning_rate": 7.066043295968342e-07, + "logits/chosen": -2.1711983680725098, + "logits/rejected": -2.370105266571045, + "logps/chosen": -1.8718315362930298, + "logps/rejected": -1323.237060546875, + "loss": 0.2263, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2125745713710785, + "rewards/margins": 12.693571090698242, + "rewards/rejected": -12.480997085571289, + "step": 3120 + }, + { + "epoch": 0.781620676738669, + "grad_norm": 0.048828125, + "learning_rate": 6.914856176865891e-07, + "logits/chosen": -2.255979537963867, + "logits/rejected": -2.4583592414855957, + "logps/chosen": -1.3440260887145996, + "logps/rejected": -1252.6439208984375, + "loss": 0.2271, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.205407053232193, + "rewards/margins": 12.165246963500977, + "rewards/rejected": -11.959839820861816, + "step": 3130 + }, + { + "epoch": 0.7841178673991759, + "grad_norm": 0.05322265625, + "learning_rate": 6.765044018942804e-07, + "logits/chosen": -2.2532248497009277, + "logits/rejected": -2.4564757347106934, + "logps/chosen": -0.6921563744544983, + "logps/rejected": -1213.1630859375, + "loss": 0.2251, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20914144814014435, + "rewards/margins": 11.834319114685059, + "rewards/rejected": -11.625177383422852, + "step": 3140 + }, + { + "epoch": 0.7866150580596829, + "grad_norm": 0.034423828125, + "learning_rate": 6.616618211984169e-07, + "logits/chosen": -2.1614041328430176, + "logits/rejected": -2.3516342639923096, + "logps/chosen": -0.3677124083042145, + "logps/rejected": -1380.3824462890625, + "loss": 0.2254, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2100847065448761, + "rewards/margins": 13.517425537109375, + "rewards/rejected": -13.307340621948242, + "step": 3150 + }, + { + "epoch": 0.7891122487201898, + "grad_norm": 0.046630859375, + "learning_rate": 6.469590040374799e-07, + "logits/chosen": -2.108102560043335, + "logits/rejected": -2.3011136054992676, + "logps/chosen": -0.5627329349517822, + "logps/rejected": -1450.8382568359375, + "loss": 0.2246, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21635802090168, + "rewards/margins": 14.105690002441406, + "rewards/rejected": -13.88933277130127, + "step": 3160 + }, + { + "epoch": 0.7916094393806967, + "grad_norm": 0.038818359375, + "learning_rate": 6.32397068224136e-07, + "logits/chosen": -2.2220847606658936, + "logits/rejected": -2.4407782554626465, + "logps/chosen": -0.8599641919136047, + "logps/rejected": -1339.21826171875, + "loss": 0.2275, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21565623581409454, + "rewards/margins": 13.00416088104248, + "rewards/rejected": -12.788503646850586, + "step": 3170 + }, + { + "epoch": 0.7941066300412036, + "grad_norm": 0.41015625, + "learning_rate": 6.17977120860249e-07, + "logits/chosen": -2.208421230316162, + "logits/rejected": -2.4098763465881348, + "logps/chosen": -1.8245439529418945, + "logps/rejected": -1350.473876953125, + "loss": 0.2257, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21073952317237854, + "rewards/margins": 13.214599609375, + "rewards/rejected": -13.003860473632812, + "step": 3180 + }, + { + "epoch": 0.7966038207017105, + "grad_norm": 0.03857421875, + "learning_rate": 6.037002582527121e-07, + "logits/chosen": -2.1419100761413574, + "logits/rejected": -2.3298497200012207, + "logps/chosen": -0.7109914422035217, + "logps/rejected": -1332.651123046875, + "loss": 0.2261, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21053273975849152, + "rewards/margins": 12.884849548339844, + "rewards/rejected": -12.67431640625, + "step": 3190 + }, + { + "epoch": 0.7991010113622175, + "grad_norm": 0.0263671875, + "learning_rate": 5.895675658300981e-07, + "logits/chosen": -2.310133934020996, + "logits/rejected": -2.4916586875915527, + "logps/chosen": -0.809489905834198, + "logps/rejected": -1163.7008056640625, + "loss": 0.2252, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21247372031211853, + "rewards/margins": 11.380758285522461, + "rewards/rejected": -11.168285369873047, + "step": 3200 + }, + { + "epoch": 0.8015982020227245, + "grad_norm": 0.049072265625, + "learning_rate": 5.755801180601381e-07, + "logits/chosen": -2.2009828090667725, + "logits/rejected": -2.4239349365234375, + "logps/chosen": -0.8167581558227539, + "logps/rejected": -1351.5517578125, + "loss": 0.225, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21433226764202118, + "rewards/margins": 13.204305648803711, + "rewards/rejected": -12.989973068237305, + "step": 3210 + }, + { + "epoch": 0.8040953926832314, + "grad_norm": 0.0390625, + "learning_rate": 5.617389783680307e-07, + "logits/chosen": -2.080115795135498, + "logits/rejected": -2.3226914405822754, + "logps/chosen": -0.7443311810493469, + "logps/rejected": -1530.034423828125, + "loss": 0.224, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21728749573230743, + "rewards/margins": 14.894986152648926, + "rewards/rejected": -14.677696228027344, + "step": 3220 + }, + { + "epoch": 0.8065925833437383, + "grad_norm": 0.232421875, + "learning_rate": 5.48045199055596e-07, + "logits/chosen": -2.1640877723693848, + "logits/rejected": -2.36962628364563, + "logps/chosen": -1.1329087018966675, + "logps/rejected": -1273.104736328125, + "loss": 0.2282, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20660333335399628, + "rewards/margins": 12.42498779296875, + "rewards/rejected": -12.218384742736816, + "step": 3230 + }, + { + "epoch": 0.8090897740042452, + "grad_norm": 0.043212890625, + "learning_rate": 5.344998212212704e-07, + "logits/chosen": -2.091491937637329, + "logits/rejected": -2.327758312225342, + "logps/chosen": -1.4844013452529907, + "logps/rejected": -1502.2427978515625, + "loss": 0.224, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2210853397846222, + "rewards/margins": 14.54228687286377, + "rewards/rejected": -14.321202278137207, + "step": 3240 + }, + { + "epoch": 0.8115869646647521, + "grad_norm": 0.03564453125, + "learning_rate": 5.211038746809551e-07, + "logits/chosen": -2.192322015762329, + "logits/rejected": -2.3808138370513916, + "logps/chosen": -0.5706063508987427, + "logps/rejected": -1285.364013671875, + "loss": 0.227, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20710210502147675, + "rewards/margins": 12.564165115356445, + "rewards/rejected": -12.357062339782715, + "step": 3250 + }, + { + "epoch": 0.8140841553252591, + "grad_norm": 0.07275390625, + "learning_rate": 5.078583778897216e-07, + "logits/chosen": -2.1883485317230225, + "logits/rejected": -2.3633830547332764, + "logps/chosen": -1.4209082126617432, + "logps/rejected": -1214.212890625, + "loss": 0.2275, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21422162652015686, + "rewards/margins": 11.81783390045166, + "rewards/rejected": -11.603612899780273, + "step": 3260 + }, + { + "epoch": 0.816581345985766, + "grad_norm": 0.044677734375, + "learning_rate": 4.94764337864384e-07, + "logits/chosen": -2.2724106311798096, + "logits/rejected": -2.4622254371643066, + "logps/chosen": -0.9480986595153809, + "logps/rejected": -1307.112548828125, + "loss": 0.2255, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2102733850479126, + "rewards/margins": 12.694864273071289, + "rewards/rejected": -12.484590530395508, + "step": 3270 + }, + { + "epoch": 0.819078536646273, + "grad_norm": 0.0203857421875, + "learning_rate": 4.818227501069328e-07, + "logits/chosen": -2.2342686653137207, + "logits/rejected": -2.4815187454223633, + "logps/chosen": -1.358782410621643, + "logps/rejected": -1722.28125, + "loss": 0.2233, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21825866401195526, + "rewards/margins": 16.87509536743164, + "rewards/rejected": -16.656835556030273, + "step": 3280 + }, + { + "epoch": 0.8215757273067799, + "grad_norm": 0.031982421875, + "learning_rate": 4.690345985288572e-07, + "logits/chosen": -2.1274971961975098, + "logits/rejected": -2.328562021255493, + "logps/chosen": -1.1761976480484009, + "logps/rejected": -1403.730224609375, + "loss": 0.2264, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.213038831949234, + "rewards/margins": 13.633008003234863, + "rewards/rejected": -13.41996955871582, + "step": 3290 + }, + { + "epoch": 0.8240729179672868, + "grad_norm": 0.01953125, + "learning_rate": 4.5640085537633633e-07, + "logits/chosen": -2.1780600547790527, + "logits/rejected": -2.418370008468628, + "logps/chosen": -1.041684865951538, + "logps/rejected": -1449.9898681640625, + "loss": 0.2235, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21910138428211212, + "rewards/margins": 14.122426986694336, + "rewards/rejected": -13.903326034545898, + "step": 3300 + }, + { + "epoch": 0.8265701086277938, + "grad_norm": 0.04052734375, + "learning_rate": 4.439224811563211e-07, + "logits/chosen": -2.0584537982940674, + "logits/rejected": -2.258396625518799, + "logps/chosen": -0.6486183404922485, + "logps/rejected": -1476.123291015625, + "loss": 0.2255, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21381358802318573, + "rewards/margins": 14.360862731933594, + "rewards/rejected": -14.147050857543945, + "step": 3310 + }, + { + "epoch": 0.8290672992883007, + "grad_norm": 0.04345703125, + "learning_rate": 4.316004245635158e-07, + "logits/chosen": -2.147899866104126, + "logits/rejected": -2.3480546474456787, + "logps/chosen": -1.0383152961730957, + "logps/rejected": -1587.1812744140625, + "loss": 0.2271, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2098073661327362, + "rewards/margins": 15.521720886230469, + "rewards/rejected": -15.311912536621094, + "step": 3320 + }, + { + "epoch": 0.8315644899488076, + "grad_norm": 0.01336669921875, + "learning_rate": 4.194356224082455e-07, + "logits/chosen": -2.0754525661468506, + "logits/rejected": -2.304088592529297, + "logps/chosen": -0.6566920876502991, + "logps/rejected": -1547.8634033203125, + "loss": 0.227, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20915034413337708, + "rewards/margins": 15.037984848022461, + "rewards/rejected": -14.828834533691406, + "step": 3330 + }, + { + "epoch": 0.8340616806093145, + "grad_norm": 0.0283203125, + "learning_rate": 4.074289995452338e-07, + "logits/chosen": -2.141746997833252, + "logits/rejected": -2.3306097984313965, + "logps/chosen": -0.9173293113708496, + "logps/rejected": -1333.7867431640625, + "loss": 0.2251, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2176503688097, + "rewards/margins": 13.049924850463867, + "rewards/rejected": -12.832275390625, + "step": 3340 + }, + { + "epoch": 0.8365588712698214, + "grad_norm": 0.01904296875, + "learning_rate": 3.9558146880329246e-07, + "logits/chosen": -2.1531293392181396, + "logits/rejected": -2.3555006980895996, + "logps/chosen": -1.041372537612915, + "logps/rejected": -1363.6673583984375, + "loss": 0.2268, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21647608280181885, + "rewards/margins": 13.151887893676758, + "rewards/rejected": -12.935412406921387, + "step": 3350 + }, + { + "epoch": 0.8390560619303283, + "grad_norm": 0.0869140625, + "learning_rate": 3.838939309159187e-07, + "logits/chosen": -2.150744915008545, + "logits/rejected": -2.3291797637939453, + "logps/chosen": -0.6859675645828247, + "logps/rejected": -1347.5732421875, + "loss": 0.2263, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21168136596679688, + "rewards/margins": 13.16771411895752, + "rewards/rejected": -12.956031799316406, + "step": 3360 + }, + { + "epoch": 0.8415532525908354, + "grad_norm": 0.0213623046875, + "learning_rate": 3.723672744528162e-07, + "logits/chosen": -2.225355863571167, + "logits/rejected": -2.434971570968628, + "logps/chosen": -0.7719963788986206, + "logps/rejected": -1404.882568359375, + "loss": 0.2245, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2120278775691986, + "rewards/margins": 13.696516036987305, + "rewards/rejected": -13.484487533569336, + "step": 3370 + }, + { + "epoch": 0.8440504432513423, + "grad_norm": 0.0220947265625, + "learning_rate": 3.6100237575233647e-07, + "logits/chosen": -2.2835781574249268, + "logits/rejected": -2.459686279296875, + "logps/chosen": -0.8155478239059448, + "logps/rejected": -1183.509033203125, + "loss": 0.2259, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21422457695007324, + "rewards/margins": 11.59019660949707, + "rewards/rejected": -11.375970840454102, + "step": 3380 + }, + { + "epoch": 0.8465476339118492, + "grad_norm": 0.047607421875, + "learning_rate": 3.4980009885486054e-07, + "logits/chosen": -2.2139523029327393, + "logits/rejected": -2.3762905597686768, + "logps/chosen": -0.49865931272506714, + "logps/rejected": -1125.1737060546875, + "loss": 0.2277, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20752505958080292, + "rewards/margins": 10.996343612670898, + "rewards/rejected": -10.788819313049316, + "step": 3390 + }, + { + "epoch": 0.8490448245723561, + "grad_norm": 0.01556396484375, + "learning_rate": 3.3876129543710197e-07, + "logits/chosen": -2.184354305267334, + "logits/rejected": -2.3724493980407715, + "logps/chosen": -0.690311074256897, + "logps/rejected": -1528.7587890625, + "loss": 0.2248, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21172885596752167, + "rewards/margins": 14.938085556030273, + "rewards/rejected": -14.726354598999023, + "step": 3400 + }, + { + "epoch": 0.851542015232863, + "grad_norm": 0.01495361328125, + "learning_rate": 3.2788680474735687e-07, + "logits/chosen": -2.1705164909362793, + "logits/rejected": -2.373166561126709, + "logps/chosen": -0.5612165927886963, + "logps/rejected": -1317.1187744140625, + "loss": 0.2251, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20889082551002502, + "rewards/margins": 12.858512878417969, + "rewards/rejected": -12.649621963500977, + "step": 3410 + }, + { + "epoch": 0.85403920589337, + "grad_norm": 0.00872802734375, + "learning_rate": 3.1717745354170214e-07, + "logits/chosen": -2.071406841278076, + "logits/rejected": -2.2939293384552, + "logps/chosen": -0.8238442540168762, + "logps/rejected": -1532.26513671875, + "loss": 0.2244, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21248868107795715, + "rewards/margins": 15.00029468536377, + "rewards/rejected": -14.787805557250977, + "step": 3420 + }, + { + "epoch": 0.8565363965538769, + "grad_norm": 0.021484375, + "learning_rate": 3.0663405602113727e-07, + "logits/chosen": -2.24153208732605, + "logits/rejected": -2.467984676361084, + "logps/chosen": -0.9753687977790833, + "logps/rejected": -1389.8427734375, + "loss": 0.225, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20957298576831818, + "rewards/margins": 13.579081535339355, + "rewards/rejected": -13.369508743286133, + "step": 3430 + }, + { + "epoch": 0.8590335872143838, + "grad_norm": 0.047607421875, + "learning_rate": 2.9625741376968107e-07, + "logits/chosen": -2.060586452484131, + "logits/rejected": -2.3030850887298584, + "logps/chosen": -2.972282886505127, + "logps/rejected": -1365.322265625, + "loss": 0.226, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21505007147789001, + "rewards/margins": 12.994850158691406, + "rewards/rejected": -12.77979850769043, + "step": 3440 + }, + { + "epoch": 0.8615307778748907, + "grad_norm": 0.02001953125, + "learning_rate": 2.8604831569343324e-07, + "logits/chosen": -2.2799830436706543, + "logits/rejected": -2.4574227333068848, + "logps/chosen": -0.9521903991699219, + "logps/rejected": -1208.1971435546875, + "loss": 0.2266, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21667388081550598, + "rewards/margins": 11.710010528564453, + "rewards/rejected": -11.493337631225586, + "step": 3450 + }, + { + "epoch": 0.8640279685353977, + "grad_norm": 0.028564453125, + "learning_rate": 2.760075379605942e-07, + "logits/chosen": -2.1184418201446533, + "logits/rejected": -2.292738199234009, + "logps/chosen": -0.882199764251709, + "logps/rejected": -1400.0753173828125, + "loss": 0.2253, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20649878680706024, + "rewards/margins": 13.685522079467773, + "rewards/rejected": -13.479023933410645, + "step": 3460 + }, + { + "epoch": 0.8665251591959046, + "grad_norm": 0.166015625, + "learning_rate": 2.661358439424552e-07, + "logits/chosen": -2.1794090270996094, + "logits/rejected": -2.3647027015686035, + "logps/chosen": -0.8141934275627136, + "logps/rejected": -1179.304931640625, + "loss": 0.2276, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21081428229808807, + "rewards/margins": 11.469918251037598, + "rewards/rejected": -11.25910472869873, + "step": 3470 + }, + { + "epoch": 0.8690223498564116, + "grad_norm": 0.0262451171875, + "learning_rate": 2.564339841553615e-07, + "logits/chosen": -2.1696417331695557, + "logits/rejected": -2.341275453567505, + "logps/chosen": -0.6168124675750732, + "logps/rejected": -1255.4180908203125, + "loss": 0.2276, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20439627766609192, + "rewards/margins": 12.246126174926758, + "rewards/rejected": -12.041730880737305, + "step": 3480 + }, + { + "epoch": 0.8715195405169185, + "grad_norm": 0.212890625, + "learning_rate": 2.469026962036539e-07, + "logits/chosen": -2.155325412750244, + "logits/rejected": -2.346266984939575, + "logps/chosen": -1.7188537120819092, + "logps/rejected": -1198.50634765625, + "loss": 0.2282, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21203342080116272, + "rewards/margins": 11.423583984375, + "rewards/rejected": -11.211549758911133, + "step": 3490 + }, + { + "epoch": 0.8740167311774254, + "grad_norm": 0.033447265625, + "learning_rate": 2.3754270472358786e-07, + "logits/chosen": -2.1500706672668457, + "logits/rejected": -2.346287965774536, + "logps/chosen": -1.2322837114334106, + "logps/rejected": -1203.8701171875, + "loss": 0.2269, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2134041041135788, + "rewards/margins": 11.585506439208984, + "rewards/rejected": -11.372100830078125, + "step": 3500 + }, + { + "epoch": 0.8765139218379323, + "grad_norm": 0.0419921875, + "learning_rate": 2.283547213282458e-07, + "logits/chosen": -2.26165509223938, + "logits/rejected": -2.4591403007507324, + "logps/chosen": -1.2189921140670776, + "logps/rejected": -1291.148193359375, + "loss": 0.2255, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21497786045074463, + "rewards/margins": 12.45503044128418, + "rewards/rejected": -12.24005126953125, + "step": 3510 + }, + { + "epoch": 0.8790111124984392, + "grad_norm": 0.0400390625, + "learning_rate": 2.1933944455343166e-07, + "logits/chosen": -1.9996531009674072, + "logits/rejected": -2.232881784439087, + "logps/chosen": -1.0685181617736816, + "logps/rejected": -1328.8775634765625, + "loss": 0.2251, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21257129311561584, + "rewards/margins": 12.904253959655762, + "rewards/rejected": -12.691683769226074, + "step": 3520 + }, + { + "epoch": 0.8815083031589462, + "grad_norm": 0.01324462890625, + "learning_rate": 2.104975598045647e-07, + "logits/chosen": -2.1279807090759277, + "logits/rejected": -2.3155367374420166, + "logps/chosen": -0.7418814897537231, + "logps/rejected": -1234.801025390625, + "loss": 0.226, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21231353282928467, + "rewards/margins": 12.064626693725586, + "rewards/rejected": -11.852312088012695, + "step": 3530 + }, + { + "epoch": 0.8840054938194531, + "grad_norm": 0.04248046875, + "learning_rate": 2.018297393045701e-07, + "logits/chosen": -2.169581651687622, + "logits/rejected": -2.334414005279541, + "logps/chosen": -1.1093792915344238, + "logps/rejected": -1281.266357421875, + "loss": 0.2262, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.209160715341568, + "rewards/margins": 12.48926067352295, + "rewards/rejected": -12.280099868774414, + "step": 3540 + }, + { + "epoch": 0.8865026844799601, + "grad_norm": 0.0888671875, + "learning_rate": 1.9333664204277236e-07, + "logits/chosen": -2.0912182331085205, + "logits/rejected": -2.292468309402466, + "logps/chosen": -0.8641373515129089, + "logps/rejected": -1473.767333984375, + "loss": 0.2265, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21468326449394226, + "rewards/margins": 14.221132278442383, + "rewards/rejected": -14.006446838378906, + "step": 3550 + }, + { + "epoch": 0.888999875140467, + "grad_norm": 0.017578125, + "learning_rate": 1.8501891372479124e-07, + "logits/chosen": -2.155086040496826, + "logits/rejected": -2.3607256412506104, + "logps/chosen": -1.0332701206207275, + "logps/rejected": -1407.046875, + "loss": 0.2253, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21279795467853546, + "rewards/margins": 13.675623893737793, + "rewards/rejected": -13.4628267288208, + "step": 3560 + }, + { + "epoch": 0.8914970658009739, + "grad_norm": 0.10498046875, + "learning_rate": 1.7687718672345533e-07, + "logits/chosen": -2.1115050315856934, + "logits/rejected": -2.295365810394287, + "logps/chosen": -1.0766206979751587, + "logps/rejected": -1537.2154541015625, + "loss": 0.225, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21170708537101746, + "rewards/margins": 15.0249662399292, + "rewards/rejected": -14.813258171081543, + "step": 3570 + }, + { + "epoch": 0.8939942564614808, + "grad_norm": 0.03515625, + "learning_rate": 1.689120800307212e-07, + "logits/chosen": -2.010655403137207, + "logits/rejected": -2.2343146800994873, + "logps/chosen": -0.6959076523780823, + "logps/rejected": -1583.5645751953125, + "loss": 0.2245, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2187151461839676, + "rewards/margins": 15.251518249511719, + "rewards/rejected": -15.032801628112793, + "step": 3580 + }, + { + "epoch": 0.8964914471219878, + "grad_norm": 0.041259765625, + "learning_rate": 1.6112419921061357e-07, + "logits/chosen": -2.149298906326294, + "logits/rejected": -2.3335325717926025, + "logps/chosen": -1.0091092586517334, + "logps/rejected": -1295.9100341796875, + "loss": 0.2272, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2126895934343338, + "rewards/margins": 12.669659614562988, + "rewards/rejected": -12.456971168518066, + "step": 3590 + }, + { + "epoch": 0.8989886377824947, + "grad_norm": 0.05615234375, + "learning_rate": 1.5351413635318807e-07, + "logits/chosen": -2.2476723194122314, + "logits/rejected": -2.4481379985809326, + "logps/chosen": -1.025138258934021, + "logps/rejected": -1300.3070068359375, + "loss": 0.2253, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21075649559497833, + "rewards/margins": 12.60840892791748, + "rewards/rejected": -12.397652626037598, + "step": 3600 + }, + { + "epoch": 0.9014858284430016, + "grad_norm": 0.0279541015625, + "learning_rate": 1.460824700295138e-07, + "logits/chosen": -2.246796131134033, + "logits/rejected": -2.438882350921631, + "logps/chosen": -1.5276464223861694, + "logps/rejected": -1376.2548828125, + "loss": 0.2262, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21834063529968262, + "rewards/margins": 13.44012451171875, + "rewards/rejected": -13.221783638000488, + "step": 3610 + }, + { + "epoch": 0.9039830191035085, + "grad_norm": 0.0157470703125, + "learning_rate": 1.3882976524768694e-07, + "logits/chosen": -2.2246479988098145, + "logits/rejected": -2.397996425628662, + "logps/chosen": -1.2670552730560303, + "logps/rejected": -1179.010986328125, + "loss": 0.2277, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21077945828437805, + "rewards/margins": 11.500974655151367, + "rewards/rejected": -11.290196418762207, + "step": 3620 + }, + { + "epoch": 0.9064802097640154, + "grad_norm": 0.044677734375, + "learning_rate": 1.3175657340987664e-07, + "logits/chosen": -2.1487388610839844, + "logits/rejected": -2.334177255630493, + "logps/chosen": -0.5317996740341187, + "logps/rejected": -1380.195556640625, + "loss": 0.2264, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2111264169216156, + "rewards/margins": 13.50303840637207, + "rewards/rejected": -13.291911125183105, + "step": 3630 + }, + { + "epoch": 0.9089774004245225, + "grad_norm": 0.052978515625, + "learning_rate": 1.2486343227040122e-07, + "logits/chosen": -2.2575807571411133, + "logits/rejected": -2.471717357635498, + "logps/chosen": -1.4988012313842773, + "logps/rejected": -1318.032470703125, + "loss": 0.226, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.22440342605113983, + "rewards/margins": 12.804231643676758, + "rewards/rejected": -12.579828262329102, + "step": 3640 + }, + { + "epoch": 0.9114745910850294, + "grad_norm": 0.0654296875, + "learning_rate": 1.181508658948452e-07, + "logits/chosen": -2.189079999923706, + "logits/rejected": -2.372708559036255, + "logps/chosen": -0.8293665051460266, + "logps/rejected": -1286.6043701171875, + "loss": 0.2268, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21110188961029053, + "rewards/margins": 12.531554222106934, + "rewards/rejected": -12.320451736450195, + "step": 3650 + }, + { + "epoch": 0.9139717817455363, + "grad_norm": 0.018798828125, + "learning_rate": 1.1161938462021627e-07, + "logits/chosen": -2.082040309906006, + "logits/rejected": -2.2717125415802, + "logps/chosen": -1.0598349571228027, + "logps/rejected": -1245.3990478515625, + "loss": 0.228, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21748106181621552, + "rewards/margins": 12.090206146240234, + "rewards/rejected": -11.872724533081055, + "step": 3660 + }, + { + "epoch": 0.9164689724060432, + "grad_norm": 0.06982421875, + "learning_rate": 1.0526948501614536e-07, + "logits/chosen": -2.103464126586914, + "logits/rejected": -2.3152968883514404, + "logps/chosen": -1.075402021408081, + "logps/rejected": -1461.2308349609375, + "loss": 0.2268, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21564142405986786, + "rewards/margins": 14.216550827026367, + "rewards/rejected": -14.000910758972168, + "step": 3670 + }, + { + "epoch": 0.9189661630665501, + "grad_norm": 0.06982421875, + "learning_rate": 9.910164984713477e-08, + "logits/chosen": -2.1121301651000977, + "logits/rejected": -2.327693223953247, + "logps/chosen": -1.3442718982696533, + "logps/rejected": -1471.4466552734375, + "loss": 0.2261, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21234098076820374, + "rewards/margins": 14.340484619140625, + "rewards/rejected": -14.128143310546875, + "step": 3680 + }, + { + "epoch": 0.921463353727057, + "grad_norm": 0.06005859375, + "learning_rate": 9.311634803585323e-08, + "logits/chosen": -2.1561217308044434, + "logits/rejected": -2.3662197589874268, + "logps/chosen": -0.8007721900939941, + "logps/rejected": -1469.8878173828125, + "loss": 0.2286, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2085859775543213, + "rewards/margins": 14.383298873901367, + "rewards/rejected": -14.174713134765625, + "step": 3690 + }, + { + "epoch": 0.923960544387564, + "grad_norm": 0.068359375, + "learning_rate": 8.7314034627487e-08, + "logits/chosen": -2.203339099884033, + "logits/rejected": -2.40441632270813, + "logps/chosen": -0.5535265207290649, + "logps/rejected": -1442.5123291015625, + "loss": 0.2263, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20969831943511963, + "rewards/margins": 14.11566162109375, + "rewards/rejected": -13.905962944030762, + "step": 3700 + }, + { + "epoch": 0.9264577350480709, + "grad_norm": 0.04833984375, + "learning_rate": 8.16951507551439e-08, + "logits/chosen": -2.2100465297698975, + "logits/rejected": -2.394742250442505, + "logps/chosen": -1.0725539922714233, + "logps/rejected": -1294.114990234375, + "loss": 0.2255, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21020770072937012, + "rewards/margins": 12.512723922729492, + "rewards/rejected": -12.302515029907227, + "step": 3710 + }, + { + "epoch": 0.9289549257085778, + "grad_norm": 0.01544189453125, + "learning_rate": 7.626012360631291e-08, + "logits/chosen": -2.2372231483459473, + "logits/rejected": -2.4310178756713867, + "logps/chosen": -1.1079142093658447, + "logps/rejected": -1298.0045166015625, + "loss": 0.2261, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21124926209449768, + "rewards/margins": 12.63310718536377, + "rewards/rejected": -12.421857833862305, + "step": 3720 + }, + { + "epoch": 0.9314521163690848, + "grad_norm": 0.03466796875, + "learning_rate": 7.100936639038936e-08, + "logits/chosen": -2.0344414710998535, + "logits/rejected": -2.2667644023895264, + "logps/chosen": -1.0317838191986084, + "logps/rejected": -1655.9033203125, + "loss": 0.224, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2157924920320511, + "rewards/margins": 16.17062759399414, + "rewards/rejected": -15.954833984375, + "step": 3730 + }, + { + "epoch": 0.9339493070295917, + "grad_norm": 0.00531005859375, + "learning_rate": 6.594327830725916e-08, + "logits/chosen": -2.162308931350708, + "logits/rejected": -2.371338367462158, + "logps/chosen": -0.7821828722953796, + "logps/rejected": -1442.607177734375, + "loss": 0.2251, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21128495037555695, + "rewards/margins": 14.142976760864258, + "rewards/rejected": -13.931692123413086, + "step": 3740 + }, + { + "epoch": 0.9364464976900987, + "grad_norm": 0.043701171875, + "learning_rate": 6.106224451694592e-08, + "logits/chosen": -2.1930558681488037, + "logits/rejected": -2.386634111404419, + "logps/chosen": -0.6907114386558533, + "logps/rejected": -1420.8463134765625, + "loss": 0.225, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21480241417884827, + "rewards/margins": 13.8462495803833, + "rewards/rejected": -13.631448745727539, + "step": 3750 + }, + { + "epoch": 0.9389436883506056, + "grad_norm": 0.04736328125, + "learning_rate": 5.636663611033266e-08, + "logits/chosen": -2.058790683746338, + "logits/rejected": -2.274880886077881, + "logps/chosen": -0.41397613286972046, + "logps/rejected": -1431.2852783203125, + "loss": 0.2266, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20900818705558777, + "rewards/margins": 14.00958251953125, + "rewards/rejected": -13.800572395324707, + "step": 3760 + }, + { + "epoch": 0.9414408790111125, + "grad_norm": 0.019287109375, + "learning_rate": 5.185681008094579e-08, + "logits/chosen": -2.251438617706299, + "logits/rejected": -2.4458415508270264, + "logps/chosen": -1.0221302509307861, + "logps/rejected": -1385.7362060546875, + "loss": 0.2249, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21662044525146484, + "rewards/margins": 13.469772338867188, + "rewards/rejected": -13.253152847290039, + "step": 3770 + }, + { + "epoch": 0.9439380696716194, + "grad_norm": 0.03857421875, + "learning_rate": 4.753310929781513e-08, + "logits/chosen": -2.206300973892212, + "logits/rejected": -2.3716189861297607, + "logps/chosen": -0.6498397588729858, + "logps/rejected": -1291.197021484375, + "loss": 0.2258, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21270425617694855, + "rewards/margins": 12.625164985656738, + "rewards/rejected": -12.412460327148438, + "step": 3780 + }, + { + "epoch": 0.9464352603321263, + "grad_norm": 0.033447265625, + "learning_rate": 4.3395862479405914e-08, + "logits/chosen": -2.1362087726593018, + "logits/rejected": -2.332123041152954, + "logps/chosen": -1.0763086080551147, + "logps/rejected": -1387.7713623046875, + "loss": 0.2244, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2146589756011963, + "rewards/margins": 13.39326286315918, + "rewards/rejected": -13.178604125976562, + "step": 3790 + }, + { + "epoch": 0.9489324509926332, + "grad_norm": 0.0155029296875, + "learning_rate": 3.9445384168628474e-08, + "logits/chosen": -2.291581869125366, + "logits/rejected": -2.500275135040283, + "logps/chosen": -1.0031490325927734, + "logps/rejected": -1276.675537109375, + "loss": 0.226, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20909054577350616, + "rewards/margins": 12.3539457321167, + "rewards/rejected": -12.144854545593262, + "step": 3800 + }, + { + "epoch": 0.9514296416531403, + "grad_norm": 0.0284423828125, + "learning_rate": 3.5681974708923484e-08, + "logits/chosen": -2.1034350395202637, + "logits/rejected": -2.2940239906311035, + "logps/chosen": -0.8783596158027649, + "logps/rejected": -1220.660888671875, + "loss": 0.2256, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21768565475940704, + "rewards/margins": 11.806703567504883, + "rewards/rejected": -11.589017868041992, + "step": 3810 + }, + { + "epoch": 0.9539268323136472, + "grad_norm": 0.047119140625, + "learning_rate": 3.210592022142717e-08, + "logits/chosen": -2.1330649852752686, + "logits/rejected": -2.2985074520111084, + "logps/chosen": -0.7123221158981323, + "logps/rejected": -1336.329833984375, + "loss": 0.2258, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20608916878700256, + "rewards/margins": 12.982263565063477, + "rewards/rejected": -12.776172637939453, + "step": 3820 + }, + { + "epoch": 0.9564240229741541, + "grad_norm": 0.021484375, + "learning_rate": 2.8717492583220095e-08, + "logits/chosen": -2.225675106048584, + "logits/rejected": -2.428712844848633, + "logps/chosen": -0.8774779438972473, + "logps/rejected": -1398.2039794921875, + "loss": 0.2253, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20993375778198242, + "rewards/margins": 13.676666259765625, + "rewards/rejected": -13.4667329788208, + "step": 3830 + }, + { + "epoch": 0.958921213634661, + "grad_norm": 0.03271484375, + "learning_rate": 2.551694940665539e-08, + "logits/chosen": -2.163163423538208, + "logits/rejected": -2.351386070251465, + "logps/chosen": -0.9975617527961731, + "logps/rejected": -1255.6383056640625, + "loss": 0.2265, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21191437542438507, + "rewards/margins": 12.254827499389648, + "rewards/rejected": -12.042913436889648, + "step": 3840 + }, + { + "epoch": 0.9614184042951679, + "grad_norm": 0.055908203125, + "learning_rate": 2.2504534019774092e-08, + "logits/chosen": -2.3171262741088867, + "logits/rejected": -2.492202043533325, + "logps/chosen": -0.872540295124054, + "logps/rejected": -1181.1051025390625, + "loss": 0.2264, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21220049262046814, + "rewards/margins": 11.461995124816895, + "rewards/rejected": -11.24979305267334, + "step": 3850 + }, + { + "epoch": 0.9639155949556749, + "grad_norm": 0.032958984375, + "learning_rate": 1.9680475447805826e-08, + "logits/chosen": -2.1993744373321533, + "logits/rejected": -2.380159378051758, + "logps/chosen": -0.721504807472229, + "logps/rejected": -1297.561767578125, + "loss": 0.2275, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20789849758148193, + "rewards/margins": 12.683464050292969, + "rewards/rejected": -12.475565910339355, + "step": 3860 + }, + { + "epoch": 0.9664127856161818, + "grad_norm": 0.0201416015625, + "learning_rate": 1.70449883957563e-08, + "logits/chosen": -2.232905626296997, + "logits/rejected": -2.4287447929382324, + "logps/chosen": -2.1762092113494873, + "logps/rejected": -1314.664794921875, + "loss": 0.2253, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21345773339271545, + "rewards/margins": 12.700533866882324, + "rewards/rejected": -12.487077713012695, + "step": 3870 + }, + { + "epoch": 0.9689099762766887, + "grad_norm": 0.072265625, + "learning_rate": 1.4598273232083182e-08, + "logits/chosen": -2.198019027709961, + "logits/rejected": -2.3671329021453857, + "logps/chosen": -0.9621660113334656, + "logps/rejected": -1280.2039794921875, + "loss": 0.2287, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20697855949401855, + "rewards/margins": 12.497517585754395, + "rewards/rejected": -12.29054069519043, + "step": 3880 + }, + { + "epoch": 0.9714071669371956, + "grad_norm": 0.0478515625, + "learning_rate": 1.2340515973464917e-08, + "logits/chosen": -2.1526544094085693, + "logits/rejected": -2.3664348125457764, + "logps/chosen": -1.546007752418518, + "logps/rejected": -1401.54638671875, + "loss": 0.2249, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20804882049560547, + "rewards/margins": 13.68072509765625, + "rewards/rejected": -13.472674369812012, + "step": 3890 + }, + { + "epoch": 0.9739043575977026, + "grad_norm": 0.016357421875, + "learning_rate": 1.0271888270655118e-08, + "logits/chosen": -2.043034076690674, + "logits/rejected": -2.229666233062744, + "logps/chosen": -0.9901046752929688, + "logps/rejected": -1281.4815673828125, + "loss": 0.2265, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2106485813856125, + "rewards/margins": 12.358712196350098, + "rewards/rejected": -12.148063659667969, + "step": 3900 + }, + { + "epoch": 0.9764015482582095, + "grad_norm": 0.07666015625, + "learning_rate": 8.392547395435769e-09, + "logits/chosen": -2.374267101287842, + "logits/rejected": -2.551339626312256, + "logps/chosen": -1.2009716033935547, + "logps/rejected": -1176.1605224609375, + "loss": 0.2263, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20642951130867004, + "rewards/margins": 11.431352615356445, + "rewards/rejected": -11.224924087524414, + "step": 3910 + }, + { + "epoch": 0.9788987389187165, + "grad_norm": 0.0732421875, + "learning_rate": 6.702636228657911e-09, + "logits/chosen": -2.262585163116455, + "logits/rejected": -2.4511070251464844, + "logps/chosen": -0.7528651356697083, + "logps/rejected": -1265.910400390625, + "loss": 0.2256, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21303972601890564, + "rewards/margins": 12.352733612060547, + "rewards/rejected": -12.139693260192871, + "step": 3920 + }, + { + "epoch": 0.9813959295792234, + "grad_norm": 0.11572265625, + "learning_rate": 5.2022832493800465e-09, + "logits/chosen": -2.3309874534606934, + "logits/rejected": -2.5094618797302246, + "logps/chosen": -0.8482611775398254, + "logps/rejected": -1145.723876953125, + "loss": 0.225, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2170940339565277, + "rewards/margins": 11.18010139465332, + "rewards/rejected": -10.963006973266602, + "step": 3930 + }, + { + "epoch": 0.9838931202397303, + "grad_norm": 0.0174560546875, + "learning_rate": 3.891602525100124e-09, + "logits/chosen": -2.202822208404541, + "logits/rejected": -2.4167404174804688, + "logps/chosen": -0.8022462725639343, + "logps/rejected": -1359.097412109375, + "loss": 0.2251, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21217508614063263, + "rewards/margins": 13.16607666015625, + "rewards/rejected": -12.953901290893555, + "step": 3940 + }, + { + "epoch": 0.9863903109002372, + "grad_norm": 0.0947265625, + "learning_rate": 2.7706937030827495e-09, + "logits/chosen": -2.245856285095215, + "logits/rejected": -2.436892032623291, + "logps/chosen": -1.236242651939392, + "logps/rejected": -1134.9066162109375, + "loss": 0.226, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20883974432945251, + "rewards/margins": 10.997222900390625, + "rewards/rejected": -10.788381576538086, + "step": 3950 + }, + { + "epoch": 0.9888875015607441, + "grad_norm": 0.07568359375, + "learning_rate": 1.839642002783859e-09, + "logits/chosen": -2.1721549034118652, + "logits/rejected": -2.3608601093292236, + "logps/chosen": -0.9914839863777161, + "logps/rejected": -1147.5926513671875, + "loss": 0.2273, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21253648400306702, + "rewards/margins": 11.109753608703613, + "rewards/rejected": -10.897214889526367, + "step": 3960 + }, + { + "epoch": 0.9913846922212511, + "grad_norm": 0.0308837890625, + "learning_rate": 1.0985182093714574e-09, + "logits/chosen": -2.2215476036071777, + "logits/rejected": -2.3835880756378174, + "logps/chosen": -0.42377692461013794, + "logps/rejected": -1237.712646484375, + "loss": 0.2256, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20775683224201202, + "rewards/margins": 12.072611808776855, + "rewards/rejected": -11.86485481262207, + "step": 3970 + }, + { + "epoch": 0.993881882881758, + "grad_norm": 0.1875, + "learning_rate": 5.473786683440896e-10, + "logits/chosen": -2.119377613067627, + "logits/rejected": -2.3185195922851562, + "logps/chosen": -1.0564239025115967, + "logps/rejected": -1471.339111328125, + "loss": 0.2259, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21374066174030304, + "rewards/margins": 14.393136978149414, + "rewards/rejected": -14.17939567565918, + "step": 3980 + }, + { + "epoch": 0.996379073542265, + "grad_norm": 0.0311279296875, + "learning_rate": 1.862652812467669e-10, + "logits/chosen": -2.1754400730133057, + "logits/rejected": -2.3970232009887695, + "logps/chosen": -1.259765863418579, + "logps/rejected": -1448.65576171875, + "loss": 0.2242, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2176629602909088, + "rewards/margins": 13.863238334655762, + "rewards/rejected": -13.645576477050781, + "step": 3990 + }, + { + "epoch": 0.9988762642027719, + "grad_norm": 0.035400390625, + "learning_rate": 1.5205502486292932e-11, + "logits/chosen": -2.143209934234619, + "logits/rejected": -2.34411883354187, + "logps/chosen": -0.6734473705291748, + "logps/rejected": -1441.0018310546875, + "loss": 0.2262, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20581206679344177, + "rewards/margins": 14.11164379119873, + "rewards/rejected": -13.905832290649414, + "step": 4000 + }, + { + "epoch": 0.9988762642027719, + "eval_logits/chosen": -2.571059465408325, + "eval_logits/rejected": -2.6589972972869873, + "eval_logps/chosen": -0.11967950314283371, + "eval_logps/rejected": -652.1184692382812, + "eval_loss": 0.22132086753845215, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": 0.2579382359981537, + "eval_rewards/margins": 6.330402374267578, + "eval_rewards/rejected": -6.072464466094971, + "eval_runtime": 0.656, + "eval_samples_per_second": 7.622, + "eval_steps_per_second": 4.573, + "step": 4000 + }, + { + "epoch": 0.9998751404669747, + "step": 4004, + "total_flos": 0.0, + "train_loss": 0.2426841035559699, + "train_runtime": 8271.4989, + "train_samples_per_second": 1.936, + "train_steps_per_second": 0.484 + } + ], + "logging_steps": 10, + "max_steps": 4004, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}