diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,6383 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 100, + "global_step": 3821, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 0.9140625, + "learning_rate": 1.3054830287206268e-08, + "logits/chosen": -3.0349411964416504, + "logits/rejected": -2.9776864051818848, + "logps/chosen": -456.54913330078125, + "logps/rejected": -495.31854248046875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 0.8984375, + "learning_rate": 1.3054830287206266e-07, + "logits/chosen": -2.916372776031494, + "logits/rejected": -2.8596677780151367, + "logps/chosen": -410.91595458984375, + "logps/rejected": -357.0899353027344, + "loss": 0.6933, + "rewards/accuracies": 0.4513888955116272, + "rewards/chosen": -0.00023399748897645622, + "rewards/margins": -0.0003936050634365529, + "rewards/rejected": 0.0001596075453562662, + "step": 10 + }, + { + "epoch": 0.01, + "grad_norm": 1.03125, + "learning_rate": 2.610966057441253e-07, + "logits/chosen": -2.9189207553863525, + "logits/rejected": -2.8314878940582275, + "logps/chosen": -452.43377685546875, + "logps/rejected": -340.9873046875, + "loss": 0.6934, + "rewards/accuracies": 0.45625001192092896, + "rewards/chosen": -0.0003274443151894957, + "rewards/margins": -0.0004972027963958681, + "rewards/rejected": 0.00016975855396594852, + "step": 20 + }, + { + "epoch": 0.01, + "grad_norm": 0.9921875, + "learning_rate": 3.9164490861618804e-07, + "logits/chosen": -2.922006607055664, + "logits/rejected": -2.8864428997039795, + "logps/chosen": -397.01446533203125, + "logps/rejected": -380.24017333984375, + "loss": 0.6932, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": 0.00024825072614476085, + "rewards/margins": -0.00015650910791009665, + "rewards/rejected": 0.00040475986315868795, + "step": 30 + }, + { + "epoch": 0.01, + "grad_norm": 1.1171875, + "learning_rate": 5.221932114882506e-07, + "logits/chosen": -2.840169906616211, + "logits/rejected": -2.8260841369628906, + "logps/chosen": -335.4841613769531, + "logps/rejected": -324.1778259277344, + "loss": 0.6928, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.0013349488144740462, + "rewards/margins": 0.0006949803791940212, + "rewards/rejected": 0.000639968435280025, + "step": 40 + }, + { + "epoch": 0.01, + "grad_norm": 0.875, + "learning_rate": 6.527415143603135e-07, + "logits/chosen": -2.9089934825897217, + "logits/rejected": -2.8778884410858154, + "logps/chosen": -403.4610900878906, + "logps/rejected": -344.0279846191406, + "loss": 0.693, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.001873858505859971, + "rewards/margins": 0.00040313409408554435, + "rewards/rejected": 0.0014707243535667658, + "step": 50 + }, + { + "epoch": 0.02, + "grad_norm": 0.83203125, + "learning_rate": 7.832898172323761e-07, + "logits/chosen": -2.842329502105713, + "logits/rejected": -2.808168888092041, + "logps/chosen": -382.2471923828125, + "logps/rejected": -331.0003662109375, + "loss": 0.693, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": 0.0033431951887905598, + "rewards/margins": 0.00034264856367371976, + "rewards/rejected": 0.0030005467124283314, + "step": 60 + }, + { + "epoch": 0.02, + "grad_norm": 1.4765625, + "learning_rate": 9.138381201044387e-07, + "logits/chosen": -2.939213991165161, + "logits/rejected": -2.8729026317596436, + "logps/chosen": -406.10760498046875, + "logps/rejected": -353.65576171875, + "loss": 0.6935, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.003925986122339964, + "rewards/margins": -0.0006284945411607623, + "rewards/rejected": 0.004554481245577335, + "step": 70 + }, + { + "epoch": 0.02, + "grad_norm": 1.4453125, + "learning_rate": 1.0443864229765013e-06, + "logits/chosen": -2.8512609004974365, + "logits/rejected": -2.8039116859436035, + "logps/chosen": -399.90130615234375, + "logps/rejected": -359.86932373046875, + "loss": 0.6924, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": 0.005817199591547251, + "rewards/margins": 0.0015620887279510498, + "rewards/rejected": 0.004255110863596201, + "step": 80 + }, + { + "epoch": 0.02, + "grad_norm": 0.97265625, + "learning_rate": 1.1749347258485642e-06, + "logits/chosen": -2.8811564445495605, + "logits/rejected": -2.8479971885681152, + "logps/chosen": -381.4345703125, + "logps/rejected": -349.1830139160156, + "loss": 0.692, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.009745048359036446, + "rewards/margins": 0.0022289410699158907, + "rewards/rejected": 0.007516107521951199, + "step": 90 + }, + { + "epoch": 0.03, + "grad_norm": 0.90234375, + "learning_rate": 1.305483028720627e-06, + "logits/chosen": -2.8783085346221924, + "logits/rejected": -2.8482470512390137, + "logps/chosen": -379.60736083984375, + "logps/rejected": -338.56134033203125, + "loss": 0.6921, + "rewards/accuracies": 0.53125, + "rewards/chosen": 0.01279132068157196, + "rewards/margins": 0.002047107554972172, + "rewards/rejected": 0.010744214989244938, + "step": 100 + }, + { + "epoch": 0.03, + "eval_logits/chosen": -2.836320638656616, + "eval_logits/rejected": -2.784111261367798, + "eval_logps/chosen": -394.6286315917969, + "eval_logps/rejected": -350.2682800292969, + "eval_loss": 0.6922685503959656, + "eval_rewards/accuracies": 0.5644999742507935, + "eval_rewards/chosen": 0.015961581841111183, + "eval_rewards/margins": 0.0018089638324454427, + "eval_rewards/rejected": 0.014152619056403637, + "eval_runtime": 347.8719, + "eval_samples_per_second": 5.749, + "eval_steps_per_second": 0.719, + "step": 100 + }, + { + "epoch": 0.03, + "grad_norm": 0.8359375, + "learning_rate": 1.4360313315926894e-06, + "logits/chosen": -2.9356420040130615, + "logits/rejected": -2.863084554672241, + "logps/chosen": -409.4505310058594, + "logps/rejected": -340.94085693359375, + "loss": 0.6926, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.015902375802397728, + "rewards/margins": 0.0012320507084950805, + "rewards/rejected": 0.014670324511826038, + "step": 110 + }, + { + "epoch": 0.03, + "grad_norm": 0.9140625, + "learning_rate": 1.5665796344647521e-06, + "logits/chosen": -2.9147744178771973, + "logits/rejected": -2.8579554557800293, + "logps/chosen": -428.4613342285156, + "logps/rejected": -384.51239013671875, + "loss": 0.6912, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": 0.019384171813726425, + "rewards/margins": 0.004034861922264099, + "rewards/rejected": 0.0153493108227849, + "step": 120 + }, + { + "epoch": 0.03, + "grad_norm": 1.0390625, + "learning_rate": 1.6971279373368146e-06, + "logits/chosen": -2.9081578254699707, + "logits/rejected": -2.849456787109375, + "logps/chosen": -383.9152526855469, + "logps/rejected": -361.4400329589844, + "loss": 0.6923, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.014263955876231194, + "rewards/margins": 0.0018312319880351424, + "rewards/rejected": 0.012432724237442017, + "step": 130 + }, + { + "epoch": 0.04, + "grad_norm": 0.984375, + "learning_rate": 1.8276762402088774e-06, + "logits/chosen": -2.9242119789123535, + "logits/rejected": -2.800161600112915, + "logps/chosen": -417.3861389160156, + "logps/rejected": -330.3959045410156, + "loss": 0.6914, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": 0.016321910545229912, + "rewards/margins": 0.0035601272247731686, + "rewards/rejected": 0.01276178378611803, + "step": 140 + }, + { + "epoch": 0.04, + "grad_norm": 1.6875, + "learning_rate": 1.9582245430809403e-06, + "logits/chosen": -2.943554639816284, + "logits/rejected": -2.8792669773101807, + "logps/chosen": -413.8592224121094, + "logps/rejected": -343.1801452636719, + "loss": 0.6911, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.018247485160827637, + "rewards/margins": 0.0042217145673930645, + "rewards/rejected": 0.014025771990418434, + "step": 150 + }, + { + "epoch": 0.04, + "grad_norm": 0.96484375, + "learning_rate": 2.0887728459530026e-06, + "logits/chosen": -2.8945369720458984, + "logits/rejected": -2.8573861122131348, + "logps/chosen": -378.51092529296875, + "logps/rejected": -363.04132080078125, + "loss": 0.6911, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": 0.019316475838422775, + "rewards/margins": 0.004308086819946766, + "rewards/rejected": 0.01500838715583086, + "step": 160 + }, + { + "epoch": 0.04, + "grad_norm": 1.203125, + "learning_rate": 2.2193211488250653e-06, + "logits/chosen": -2.8808963298797607, + "logits/rejected": -2.8329708576202393, + "logps/chosen": -337.3949279785156, + "logps/rejected": -312.98773193359375, + "loss": 0.6898, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.022478580474853516, + "rewards/margins": 0.006769159343093634, + "rewards/rejected": 0.015709420666098595, + "step": 170 + }, + { + "epoch": 0.05, + "grad_norm": 0.75, + "learning_rate": 2.3498694516971284e-06, + "logits/chosen": -2.8937134742736816, + "logits/rejected": -2.837139129638672, + "logps/chosen": -379.23760986328125, + "logps/rejected": -340.9472351074219, + "loss": 0.6901, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": 0.02863329090178013, + "rewards/margins": 0.006192624568939209, + "rewards/rejected": 0.02244066260755062, + "step": 180 + }, + { + "epoch": 0.05, + "grad_norm": 1.140625, + "learning_rate": 2.4804177545691907e-06, + "logits/chosen": -2.944746494293213, + "logits/rejected": -2.892685651779175, + "logps/chosen": -402.75653076171875, + "logps/rejected": -342.20172119140625, + "loss": 0.6916, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.03445928543806076, + "rewards/margins": 0.003415898187085986, + "rewards/rejected": 0.031043391674757004, + "step": 190 + }, + { + "epoch": 0.05, + "grad_norm": 1.546875, + "learning_rate": 2.610966057441254e-06, + "logits/chosen": -2.8978257179260254, + "logits/rejected": -2.837639570236206, + "logps/chosen": -374.3539733886719, + "logps/rejected": -313.32257080078125, + "loss": 0.6894, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.040689874440431595, + "rewards/margins": 0.008053514175117016, + "rewards/rejected": 0.032636359333992004, + "step": 200 + }, + { + "epoch": 0.05, + "eval_logits/chosen": -2.8333005905151367, + "eval_logits/rejected": -2.781090021133423, + "eval_logps/chosen": -391.8948669433594, + "eval_logps/rejected": -348.1495361328125, + "eval_loss": 0.6894406080245972, + "eval_rewards/accuracies": 0.5920000076293945, + "eval_rewards/chosen": 0.04329930990934372, + "eval_rewards/margins": 0.007959411479532719, + "eval_rewards/rejected": 0.03533989191055298, + "eval_runtime": 347.8469, + "eval_samples_per_second": 5.75, + "eval_steps_per_second": 0.719, + "step": 200 + }, + { + "epoch": 0.05, + "grad_norm": 0.89453125, + "learning_rate": 2.741514360313316e-06, + "logits/chosen": -2.9700427055358887, + "logits/rejected": -2.8941903114318848, + "logps/chosen": -390.8605041503906, + "logps/rejected": -332.96624755859375, + "loss": 0.6898, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": 0.04209558665752411, + "rewards/margins": 0.0070165605284273624, + "rewards/rejected": 0.035079024732112885, + "step": 210 + }, + { + "epoch": 0.06, + "grad_norm": 0.859375, + "learning_rate": 2.872062663185379e-06, + "logits/chosen": -2.862724542617798, + "logits/rejected": -2.8397350311279297, + "logps/chosen": -380.3348083496094, + "logps/rejected": -332.48809814453125, + "loss": 0.6896, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.04277118295431137, + "rewards/margins": 0.007822849787771702, + "rewards/rejected": 0.034948334097862244, + "step": 220 + }, + { + "epoch": 0.06, + "grad_norm": 0.9765625, + "learning_rate": 3.0026109660574416e-06, + "logits/chosen": -2.980745792388916, + "logits/rejected": -2.915937900543213, + "logps/chosen": -444.210205078125, + "logps/rejected": -392.0636291503906, + "loss": 0.6886, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": 0.04879799112677574, + "rewards/margins": 0.009785487316548824, + "rewards/rejected": 0.039012499153614044, + "step": 230 + }, + { + "epoch": 0.06, + "grad_norm": 0.9453125, + "learning_rate": 3.1331592689295043e-06, + "logits/chosen": -2.897602081298828, + "logits/rejected": -2.86655592918396, + "logps/chosen": -422.80157470703125, + "logps/rejected": -391.47808837890625, + "loss": 0.6864, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": 0.05084707587957382, + "rewards/margins": 0.014665389433503151, + "rewards/rejected": 0.03618168458342552, + "step": 240 + }, + { + "epoch": 0.07, + "grad_norm": 0.8671875, + "learning_rate": 3.263707571801567e-06, + "logits/chosen": -2.885866165161133, + "logits/rejected": -2.8721957206726074, + "logps/chosen": -390.76702880859375, + "logps/rejected": -342.5933837890625, + "loss": 0.6892, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.05445907637476921, + "rewards/margins": 0.009773282334208488, + "rewards/rejected": 0.044685788452625275, + "step": 250 + }, + { + "epoch": 0.07, + "grad_norm": 1.0, + "learning_rate": 3.3942558746736293e-06, + "logits/chosen": -2.9248204231262207, + "logits/rejected": -2.853421449661255, + "logps/chosen": -404.9168701171875, + "logps/rejected": -348.4195861816406, + "loss": 0.686, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": 0.0569482147693634, + "rewards/margins": 0.0154123455286026, + "rewards/rejected": 0.041535865515470505, + "step": 260 + }, + { + "epoch": 0.07, + "grad_norm": 0.68359375, + "learning_rate": 3.524804177545692e-06, + "logits/chosen": -2.888120174407959, + "logits/rejected": -2.8404757976531982, + "logps/chosen": -384.75872802734375, + "logps/rejected": -330.0838928222656, + "loss": 0.6859, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": 0.05628432705998421, + "rewards/margins": 0.016307855024933815, + "rewards/rejected": 0.039976466447114944, + "step": 270 + }, + { + "epoch": 0.07, + "grad_norm": 0.875, + "learning_rate": 3.6553524804177547e-06, + "logits/chosen": -2.906266212463379, + "logits/rejected": -2.832149028778076, + "logps/chosen": -387.818115234375, + "logps/rejected": -337.62017822265625, + "loss": 0.6871, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.060541294515132904, + "rewards/margins": 0.013487743213772774, + "rewards/rejected": 0.04705354943871498, + "step": 280 + }, + { + "epoch": 0.08, + "grad_norm": 0.94921875, + "learning_rate": 3.7859007832898174e-06, + "logits/chosen": -2.888960361480713, + "logits/rejected": -2.847351551055908, + "logps/chosen": -411.20904541015625, + "logps/rejected": -362.61474609375, + "loss": 0.6837, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.07661953568458557, + "rewards/margins": 0.022713415324687958, + "rewards/rejected": 0.053906120359897614, + "step": 290 + }, + { + "epoch": 0.08, + "grad_norm": 0.92578125, + "learning_rate": 3.9164490861618806e-06, + "logits/chosen": -2.868332862854004, + "logits/rejected": -2.7791736125946045, + "logps/chosen": -369.7691345214844, + "logps/rejected": -329.0905456542969, + "loss": 0.6815, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.07754194736480713, + "rewards/margins": 0.024631675332784653, + "rewards/rejected": 0.05291026830673218, + "step": 300 + }, + { + "epoch": 0.08, + "eval_logits/chosen": -2.8349289894104004, + "eval_logits/rejected": -2.783784866333008, + "eval_logps/chosen": -388.1691589355469, + "eval_logps/rejected": -345.5898132324219, + "eval_loss": 0.6844429969787598, + "eval_rewards/accuracies": 0.6025000214576721, + "eval_rewards/chosen": 0.08055612444877625, + "eval_rewards/margins": 0.019618848338723183, + "eval_rewards/rejected": 0.06093727424740791, + "eval_runtime": 347.9802, + "eval_samples_per_second": 5.747, + "eval_steps_per_second": 0.718, + "step": 300 + }, + { + "epoch": 0.08, + "grad_norm": 1.125, + "learning_rate": 4.046997389033943e-06, + "logits/chosen": -2.9738924503326416, + "logits/rejected": -2.891376495361328, + "logps/chosen": -419.47601318359375, + "logps/rejected": -330.5395202636719, + "loss": 0.6852, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.08353392034769058, + "rewards/margins": 0.01769311912357807, + "rewards/rejected": 0.06584079563617706, + "step": 310 + }, + { + "epoch": 0.08, + "grad_norm": 0.83984375, + "learning_rate": 4.177545691906005e-06, + "logits/chosen": -2.9381539821624756, + "logits/rejected": -2.868596076965332, + "logps/chosen": -382.90325927734375, + "logps/rejected": -343.45361328125, + "loss": 0.6849, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": 0.07833738625049591, + "rewards/margins": 0.018442410975694656, + "rewards/rejected": 0.059894971549510956, + "step": 320 + }, + { + "epoch": 0.09, + "grad_norm": 0.87109375, + "learning_rate": 4.308093994778068e-06, + "logits/chosen": -2.8898563385009766, + "logits/rejected": -2.863262176513672, + "logps/chosen": -379.0876159667969, + "logps/rejected": -343.98748779296875, + "loss": 0.6769, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": 0.11513115465641022, + "rewards/margins": 0.03892900422215462, + "rewards/rejected": 0.0762021541595459, + "step": 330 + }, + { + "epoch": 0.09, + "grad_norm": 0.8984375, + "learning_rate": 4.4386422976501306e-06, + "logits/chosen": -2.924078941345215, + "logits/rejected": -2.884129762649536, + "logps/chosen": -412.20428466796875, + "logps/rejected": -369.37615966796875, + "loss": 0.6837, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": 0.10442419350147247, + "rewards/margins": 0.021783817559480667, + "rewards/rejected": 0.0826403871178627, + "step": 340 + }, + { + "epoch": 0.09, + "grad_norm": 0.921875, + "learning_rate": 4.569190600522193e-06, + "logits/chosen": -2.8822202682495117, + "logits/rejected": -2.832418441772461, + "logps/chosen": -419.82122802734375, + "logps/rejected": -383.4366455078125, + "loss": 0.6807, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": 0.1065782904624939, + "rewards/margins": 0.02887110412120819, + "rewards/rejected": 0.0777071863412857, + "step": 350 + }, + { + "epoch": 0.09, + "grad_norm": 0.6328125, + "learning_rate": 4.699738903394257e-06, + "logits/chosen": -2.9021358489990234, + "logits/rejected": -2.8665103912353516, + "logps/chosen": -353.05645751953125, + "logps/rejected": -322.67803955078125, + "loss": 0.6916, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.07063518464565277, + "rewards/margins": 0.0060135251842439175, + "rewards/rejected": 0.06462165713310242, + "step": 360 + }, + { + "epoch": 0.1, + "grad_norm": 1.109375, + "learning_rate": 4.8302872062663196e-06, + "logits/chosen": -2.914665699005127, + "logits/rejected": -2.8427698612213135, + "logps/chosen": -405.73809814453125, + "logps/rejected": -331.1866149902344, + "loss": 0.6722, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": 0.09655407071113586, + "rewards/margins": 0.05310269445180893, + "rewards/rejected": 0.043451376259326935, + "step": 370 + }, + { + "epoch": 0.1, + "grad_norm": 0.94140625, + "learning_rate": 4.9608355091383814e-06, + "logits/chosen": -2.900359869003296, + "logits/rejected": -2.81068754196167, + "logps/chosen": -427.6316833496094, + "logps/rejected": -363.28839111328125, + "loss": 0.6717, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.10045112669467926, + "rewards/margins": 0.053518980741500854, + "rewards/rejected": 0.046932149678468704, + "step": 380 + }, + { + "epoch": 0.1, + "grad_norm": 1.3984375, + "learning_rate": 4.9999488562447675e-06, + "logits/chosen": -2.8918914794921875, + "logits/rejected": -2.845247745513916, + "logps/chosen": -412.29071044921875, + "logps/rejected": -371.7367248535156, + "loss": 0.6796, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": 0.07834922522306442, + "rewards/margins": 0.031100135296583176, + "rewards/rejected": 0.04724908620119095, + "step": 390 + }, + { + "epoch": 0.1, + "grad_norm": 0.796875, + "learning_rate": 4.999698361256577e-06, + "logits/chosen": -2.9121220111846924, + "logits/rejected": -2.850386619567871, + "logps/chosen": -381.02825927734375, + "logps/rejected": -322.06927490234375, + "loss": 0.6869, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": 0.08469127863645554, + "rewards/margins": 0.020099209621548653, + "rewards/rejected": 0.06459207087755203, + "step": 400 + }, + { + "epoch": 0.1, + "eval_logits/chosen": -2.842348098754883, + "eval_logits/rejected": -2.7931265830993652, + "eval_logps/chosen": -390.1521911621094, + "eval_logps/rejected": -348.9979248046875, + "eval_loss": 0.6788274645805359, + "eval_rewards/accuracies": 0.612500011920929, + "eval_rewards/chosen": 0.0607261136174202, + "eval_rewards/margins": 0.0338701568543911, + "eval_rewards/rejected": 0.026855960488319397, + "eval_runtime": 347.7786, + "eval_samples_per_second": 5.751, + "eval_steps_per_second": 0.719, + "step": 400 + }, + { + "epoch": 0.11, + "grad_norm": 1.1328125, + "learning_rate": 4.999239142174581e-06, + "logits/chosen": -2.8766117095947266, + "logits/rejected": -2.8555123805999756, + "logps/chosen": -361.97406005859375, + "logps/rejected": -350.51287841796875, + "loss": 0.6959, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.05293396860361099, + "rewards/margins": 0.0049603343941271305, + "rewards/rejected": 0.0479736328125, + "step": 410 + }, + { + "epoch": 0.11, + "grad_norm": 1.0859375, + "learning_rate": 4.99857123734344e-06, + "logits/chosen": -2.8863377571105957, + "logits/rejected": -2.8117470741271973, + "logps/chosen": -351.9708251953125, + "logps/rejected": -313.9596252441406, + "loss": 0.6861, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.033623430877923965, + "rewards/margins": 0.01676887646317482, + "rewards/rejected": 0.016854556277394295, + "step": 420 + }, + { + "epoch": 0.11, + "grad_norm": 1.0703125, + "learning_rate": 4.997694702533016e-06, + "logits/chosen": -2.8675060272216797, + "logits/rejected": -2.8136143684387207, + "logps/chosen": -408.8647155761719, + "logps/rejected": -369.70257568359375, + "loss": 0.686, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.041368208825588226, + "rewards/margins": 0.02153196558356285, + "rewards/rejected": 0.019836245104670525, + "step": 430 + }, + { + "epoch": 0.12, + "grad_norm": 0.78515625, + "learning_rate": 4.996609610933713e-06, + "logits/chosen": -2.9551219940185547, + "logits/rejected": -2.9198532104492188, + "logps/chosen": -399.39410400390625, + "logps/rejected": -351.37982177734375, + "loss": 0.6821, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.03481561690568924, + "rewards/margins": 0.02588939666748047, + "rewards/rejected": 0.00892622210085392, + "step": 440 + }, + { + "epoch": 0.12, + "grad_norm": 0.85546875, + "learning_rate": 4.995316053150366e-06, + "logits/chosen": -2.879997968673706, + "logits/rejected": -2.857247829437256, + "logps/chosen": -394.7357482910156, + "logps/rejected": -354.9325256347656, + "loss": 0.6736, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": 0.04896073415875435, + "rewards/margins": 0.04451023414731026, + "rewards/rejected": 0.004450496751815081, + "step": 450 + }, + { + "epoch": 0.12, + "grad_norm": 1.1796875, + "learning_rate": 4.9938141371946815e-06, + "logits/chosen": -2.89152455329895, + "logits/rejected": -2.8554794788360596, + "logps/chosen": -394.02325439453125, + "logps/rejected": -363.3119201660156, + "loss": 0.6753, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.05705699324607849, + "rewards/margins": 0.047898683696985245, + "rewards/rejected": 0.009158318862318993, + "step": 460 + }, + { + "epoch": 0.12, + "grad_norm": 1.7109375, + "learning_rate": 4.992103988476206e-06, + "logits/chosen": -2.8964645862579346, + "logits/rejected": -2.844935417175293, + "logps/chosen": -369.896484375, + "logps/rejected": -337.79278564453125, + "loss": 0.6741, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": 0.05254621058702469, + "rewards/margins": 0.04723441228270531, + "rewards/rejected": 0.005311795976012945, + "step": 470 + }, + { + "epoch": 0.13, + "grad_norm": 0.94921875, + "learning_rate": 4.990185749791866e-06, + "logits/chosen": -2.928675413131714, + "logits/rejected": -2.873516798019409, + "logps/chosen": -383.92852783203125, + "logps/rejected": -358.11566162109375, + "loss": 0.6722, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.039998866617679596, + "rewards/margins": 0.04847729206085205, + "rewards/rejected": -0.008478422649204731, + "step": 480 + }, + { + "epoch": 0.13, + "grad_norm": 1.078125, + "learning_rate": 4.9880595813140395e-06, + "logits/chosen": -2.9350996017456055, + "logits/rejected": -2.8882603645324707, + "logps/chosen": -420.064697265625, + "logps/rejected": -359.68072509765625, + "loss": 0.6711, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": 0.030888009816408157, + "rewards/margins": 0.04867198318243027, + "rewards/rejected": -0.01778397336602211, + "step": 490 + }, + { + "epoch": 0.13, + "grad_norm": 1.03125, + "learning_rate": 4.985725660577184e-06, + "logits/chosen": -2.916215658187866, + "logits/rejected": -2.8541104793548584, + "logps/chosen": -411.7395935058594, + "logps/rejected": -343.6163330078125, + "loss": 0.6744, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.018574411049485207, + "rewards/margins": 0.047007013112306595, + "rewards/rejected": -0.028432602062821388, + "step": 500 + }, + { + "epoch": 0.13, + "eval_logits/chosen": -2.8370718955993652, + "eval_logits/rejected": -2.7888970375061035, + "eval_logps/chosen": -393.79833984375, + "eval_logps/rejected": -354.1763610839844, + "eval_loss": 0.6723790168762207, + "eval_rewards/accuracies": 0.6209999918937683, + "eval_rewards/chosen": 0.02426437847316265, + "eval_rewards/margins": 0.04919267073273659, + "eval_rewards/rejected": -0.024928290396928787, + "eval_runtime": 347.9, + "eval_samples_per_second": 5.749, + "eval_steps_per_second": 0.719, + "step": 500 + }, + { + "epoch": 0.13, + "grad_norm": 1.1171875, + "learning_rate": 4.983184182463009e-06, + "logits/chosen": -2.90047025680542, + "logits/rejected": -2.8580737113952637, + "logps/chosen": -407.69781494140625, + "logps/rejected": -361.30267333984375, + "loss": 0.6753, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.039911769330501556, + "rewards/margins": 0.046458516269922256, + "rewards/rejected": -0.0065467446111142635, + "step": 510 + }, + { + "epoch": 0.14, + "grad_norm": 1.2734375, + "learning_rate": 4.980435359184203e-06, + "logits/chosen": -2.914069414138794, + "logits/rejected": -2.8952417373657227, + "logps/chosen": -400.2842712402344, + "logps/rejected": -370.97039794921875, + "loss": 0.6663, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.015612982213497162, + "rewards/margins": 0.06524328887462616, + "rewards/rejected": -0.0496302992105484, + "step": 520 + }, + { + "epoch": 0.14, + "grad_norm": 1.0859375, + "learning_rate": 4.9774794202667236e-06, + "logits/chosen": -2.8638806343078613, + "logits/rejected": -2.8652124404907227, + "logps/chosen": -400.7289123535156, + "logps/rejected": -399.452880859375, + "loss": 0.6814, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.001105122035369277, + "rewards/margins": 0.03718414530158043, + "rewards/rejected": -0.03828927129507065, + "step": 530 + }, + { + "epoch": 0.14, + "grad_norm": 1.125, + "learning_rate": 4.974316612530615e-06, + "logits/chosen": -2.869264841079712, + "logits/rejected": -2.8013787269592285, + "logps/chosen": -414.38800048828125, + "logps/rejected": -343.013671875, + "loss": 0.6495, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.005649521015584469, + "rewards/margins": 0.10599911212921143, + "rewards/rejected": -0.11164864152669907, + "step": 540 + }, + { + "epoch": 0.14, + "grad_norm": 1.015625, + "learning_rate": 4.970947200069416e-06, + "logits/chosen": -2.867119073867798, + "logits/rejected": -2.8448452949523926, + "logps/chosen": -406.0426940917969, + "logps/rejected": -376.4588623046875, + "loss": 0.666, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": 0.004353336989879608, + "rewards/margins": 0.07087867707014084, + "rewards/rejected": -0.06652534753084183, + "step": 550 + }, + { + "epoch": 0.15, + "grad_norm": 1.078125, + "learning_rate": 4.967371464228096e-06, + "logits/chosen": -2.9276793003082275, + "logits/rejected": -2.8994839191436768, + "logps/chosen": -393.2682189941406, + "logps/rejected": -381.3654479980469, + "loss": 0.6694, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.01845262572169304, + "rewards/margins": 0.057842254638671875, + "rewards/rejected": -0.07629488408565521, + "step": 560 + }, + { + "epoch": 0.15, + "grad_norm": 1.1328125, + "learning_rate": 4.963589703579569e-06, + "logits/chosen": -2.9889473915100098, + "logits/rejected": -2.9415230751037598, + "logps/chosen": -441.64227294921875, + "logps/rejected": -392.90789794921875, + "loss": 0.6794, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.01707800105214119, + "rewards/margins": 0.04300212487578392, + "rewards/rejected": -0.06008012965321541, + "step": 570 + }, + { + "epoch": 0.15, + "grad_norm": 1.2421875, + "learning_rate": 4.9596022338997615e-06, + "logits/chosen": -2.9623188972473145, + "logits/rejected": -2.8731496334075928, + "logps/chosen": -446.22662353515625, + "logps/rejected": -381.77752685546875, + "loss": 0.656, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.03769092634320259, + "rewards/margins": 0.09427281469106674, + "rewards/rejected": -0.13196374475955963, + "step": 580 + }, + { + "epoch": 0.15, + "grad_norm": 1.203125, + "learning_rate": 4.955409388141243e-06, + "logits/chosen": -2.857860565185547, + "logits/rejected": -2.8171629905700684, + "logps/chosen": -391.1763916015625, + "logps/rejected": -357.4033203125, + "loss": 0.663, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.09089629352092743, + "rewards/margins": 0.07482485473155975, + "rewards/rejected": -0.165721133351326, + "step": 590 + }, + { + "epoch": 0.16, + "grad_norm": 2.46875, + "learning_rate": 4.951011516405429e-06, + "logits/chosen": -2.895017147064209, + "logits/rejected": -2.8843705654144287, + "logps/chosen": -395.64947509765625, + "logps/rejected": -366.892822265625, + "loss": 0.6679, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.04552796110510826, + "rewards/margins": 0.06777463853359222, + "rewards/rejected": -0.11330260336399078, + "step": 600 + }, + { + "epoch": 0.16, + "eval_logits/chosen": -2.817946672439575, + "eval_logits/rejected": -2.7708613872528076, + "eval_logps/chosen": -401.88262939453125, + "eval_logps/rejected": -365.14019775390625, + "eval_loss": 0.6624515652656555, + "eval_rewards/accuracies": 0.6265000104904175, + "eval_rewards/chosen": -0.05657815560698509, + "eval_rewards/margins": 0.07798823714256287, + "eval_rewards/rejected": -0.13456639647483826, + "eval_runtime": 348.0683, + "eval_samples_per_second": 5.746, + "eval_steps_per_second": 0.718, + "step": 600 + }, + { + "epoch": 0.16, + "grad_norm": 1.375, + "learning_rate": 4.946408985913344e-06, + "logits/chosen": -2.8852925300598145, + "logits/rejected": -2.841823101043701, + "logps/chosen": -383.7059020996094, + "logps/rejected": -347.17669677734375, + "loss": 0.6691, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.05533725023269653, + "rewards/margins": 0.0660722628235817, + "rewards/rejected": -0.12140952050685883, + "step": 610 + }, + { + "epoch": 0.16, + "grad_norm": 1.8515625, + "learning_rate": 4.941602180974958e-06, + "logits/chosen": -2.8929924964904785, + "logits/rejected": -2.801697254180908, + "logps/chosen": -444.3966369628906, + "logps/rejected": -354.19598388671875, + "loss": 0.6443, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.15692010521888733, + "rewards/margins": 0.11467301845550537, + "rewards/rejected": -0.2715931236743927, + "step": 620 + }, + { + "epoch": 0.16, + "grad_norm": 1.8671875, + "learning_rate": 4.936591502957101e-06, + "logits/chosen": -2.893341541290283, + "logits/rejected": -2.8479011058807373, + "logps/chosen": -392.2441711425781, + "logps/rejected": -361.94268798828125, + "loss": 0.6421, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.15643954277038574, + "rewards/margins": 0.12407402694225311, + "rewards/rejected": -0.28051358461380005, + "step": 630 + }, + { + "epoch": 0.17, + "grad_norm": 1.90625, + "learning_rate": 4.931377370249946e-06, + "logits/chosen": -2.8857052326202393, + "logits/rejected": -2.811861515045166, + "logps/chosen": -418.484619140625, + "logps/rejected": -368.8498840332031, + "loss": 0.6506, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.21659335494041443, + "rewards/margins": 0.10539694130420685, + "rewards/rejected": -0.3219902813434601, + "step": 640 + }, + { + "epoch": 0.17, + "grad_norm": 1.65625, + "learning_rate": 4.925960218232073e-06, + "logits/chosen": -2.8835432529449463, + "logits/rejected": -2.842060089111328, + "logps/chosen": -399.4352722167969, + "logps/rejected": -387.4463806152344, + "loss": 0.6506, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.17046542465686798, + "rewards/margins": 0.11528462171554565, + "rewards/rejected": -0.28575003147125244, + "step": 650 + }, + { + "epoch": 0.17, + "grad_norm": 3.625, + "learning_rate": 4.920340499234116e-06, + "logits/chosen": -2.8580405712127686, + "logits/rejected": -2.7751071453094482, + "logps/chosen": -410.6004943847656, + "logps/rejected": -364.0141296386719, + "loss": 0.6449, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.15695621073246002, + "rewards/margins": 0.1257764846086502, + "rewards/rejected": -0.28273266553878784, + "step": 660 + }, + { + "epoch": 0.18, + "grad_norm": 3.09375, + "learning_rate": 4.914518682500995e-06, + "logits/chosen": -2.93772292137146, + "logits/rejected": -2.882884979248047, + "logps/chosen": -430.3829650878906, + "logps/rejected": -389.68328857421875, + "loss": 0.6462, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.19247478246688843, + "rewards/margins": 0.12777897715568542, + "rewards/rejected": -0.32025375962257385, + "step": 670 + }, + { + "epoch": 0.18, + "grad_norm": 1.6015625, + "learning_rate": 4.9084952541527315e-06, + "logits/chosen": -2.8791463375091553, + "logits/rejected": -2.821669101715088, + "logps/chosen": -436.94708251953125, + "logps/rejected": -379.05267333984375, + "loss": 0.6408, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.30955275893211365, + "rewards/margins": 0.13841886818408966, + "rewards/rejected": -0.4479715824127197, + "step": 680 + }, + { + "epoch": 0.18, + "grad_norm": 2.4375, + "learning_rate": 4.902270717143858e-06, + "logits/chosen": -2.887767791748047, + "logits/rejected": -2.8599493503570557, + "logps/chosen": -401.97064208984375, + "logps/rejected": -403.107421875, + "loss": 0.661, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.3961626887321472, + "rewards/margins": 0.1000264510512352, + "rewards/rejected": -0.4961891770362854, + "step": 690 + }, + { + "epoch": 0.18, + "grad_norm": 1.78125, + "learning_rate": 4.895845591221427e-06, + "logits/chosen": -2.8757412433624268, + "logits/rejected": -2.86216139793396, + "logps/chosen": -415.5210876464844, + "logps/rejected": -404.1314697265625, + "loss": 0.637, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.2919083535671234, + "rewards/margins": 0.1432960480451584, + "rewards/rejected": -0.435204416513443, + "step": 700 + }, + { + "epoch": 0.18, + "eval_logits/chosen": -2.8051064014434814, + "eval_logits/rejected": -2.7595512866973877, + "eval_logps/chosen": -421.9038391113281, + "eval_logps/rejected": -388.2210998535156, + "eval_loss": 0.6554521918296814, + "eval_rewards/accuracies": 0.6290000081062317, + "eval_rewards/chosen": -0.256790429353714, + "eval_rewards/margins": 0.10858490318059921, + "eval_rewards/rejected": -0.3653753399848938, + "eval_runtime": 348.1124, + "eval_samples_per_second": 5.745, + "eval_steps_per_second": 0.718, + "step": 700 + }, + { + "epoch": 0.19, + "grad_norm": 2.625, + "learning_rate": 4.8892204128816e-06, + "logits/chosen": -2.8942840099334717, + "logits/rejected": -2.857219934463501, + "logps/chosen": -418.62322998046875, + "logps/rejected": -394.4143981933594, + "loss": 0.6587, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.1779194325208664, + "rewards/margins": 0.0996006429195404, + "rewards/rejected": -0.2775201201438904, + "step": 710 + }, + { + "epoch": 0.19, + "grad_norm": 1.6328125, + "learning_rate": 4.882395735324864e-06, + "logits/chosen": -2.8712573051452637, + "logits/rejected": -2.8021373748779297, + "logps/chosen": -414.42999267578125, + "logps/rejected": -377.20977783203125, + "loss": 0.6622, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.1302914321422577, + "rewards/margins": 0.09119856357574463, + "rewards/rejected": -0.22148998081684113, + "step": 720 + }, + { + "epoch": 0.19, + "grad_norm": 2.21875, + "learning_rate": 4.87537212840983e-06, + "logits/chosen": -2.872545003890991, + "logits/rejected": -2.815669059753418, + "logps/chosen": -410.472412109375, + "logps/rejected": -358.6063232421875, + "loss": 0.6462, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.07536014169454575, + "rewards/margins": 0.11603609472513199, + "rewards/rejected": -0.19139623641967773, + "step": 730 + }, + { + "epoch": 0.19, + "grad_norm": 3.96875, + "learning_rate": 4.8681501786056545e-06, + "logits/chosen": -2.844766139984131, + "logits/rejected": -2.786101818084717, + "logps/chosen": -370.5934143066406, + "logps/rejected": -329.69647216796875, + "loss": 0.6229, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.21907174587249756, + "rewards/margins": 0.19244422018527985, + "rewards/rejected": -0.41151589155197144, + "step": 740 + }, + { + "epoch": 0.2, + "grad_norm": 1.5, + "learning_rate": 4.860730488943068e-06, + "logits/chosen": -2.8021976947784424, + "logits/rejected": -2.775529384613037, + "logps/chosen": -399.1451110839844, + "logps/rejected": -384.2425842285156, + "loss": 0.653, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.397163063287735, + "rewards/margins": 0.11699549108743668, + "rewards/rejected": -0.5141586065292358, + "step": 750 + }, + { + "epoch": 0.2, + "grad_norm": 2.109375, + "learning_rate": 4.853113678964022e-06, + "logits/chosen": -2.816556453704834, + "logits/rejected": -2.7907676696777344, + "logps/chosen": -448.0435485839844, + "logps/rejected": -432.1190490722656, + "loss": 0.6524, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4579244554042816, + "rewards/margins": 0.1237117201089859, + "rewards/rejected": -0.5816361904144287, + "step": 760 + }, + { + "epoch": 0.2, + "grad_norm": 2.71875, + "learning_rate": 4.845300384669958e-06, + "logits/chosen": -2.839500665664673, + "logits/rejected": -2.799598455429077, + "logps/chosen": -418.523193359375, + "logps/rejected": -380.72637939453125, + "loss": 0.6546, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.3512226939201355, + "rewards/margins": 0.12142340838909149, + "rewards/rejected": -0.4726460874080658, + "step": 770 + }, + { + "epoch": 0.2, + "grad_norm": 1.9296875, + "learning_rate": 4.837291258468701e-06, + "logits/chosen": -2.8736701011657715, + "logits/rejected": -2.815397262573242, + "logps/chosen": -448.5498046875, + "logps/rejected": -413.5223693847656, + "loss": 0.6451, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.3000739514827728, + "rewards/margins": 0.14568910002708435, + "rewards/rejected": -0.4457630515098572, + "step": 780 + }, + { + "epoch": 0.21, + "grad_norm": 2.453125, + "learning_rate": 4.829086969119984e-06, + "logits/chosen": -2.8422582149505615, + "logits/rejected": -2.8511698246002197, + "logps/chosen": -416.14312744140625, + "logps/rejected": -407.68487548828125, + "loss": 0.6656, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.430819571018219, + "rewards/margins": 0.0904776006937027, + "rewards/rejected": -0.5212971568107605, + "step": 790 + }, + { + "epoch": 0.21, + "grad_norm": 2.109375, + "learning_rate": 4.820688201679605e-06, + "logits/chosen": -2.9085071086883545, + "logits/rejected": -2.8099634647369385, + "logps/chosen": -441.54296875, + "logps/rejected": -363.7981262207031, + "loss": 0.6166, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.40520063042640686, + "rewards/margins": 0.19839458167552948, + "rewards/rejected": -0.6035951972007751, + "step": 800 + }, + { + "epoch": 0.21, + "eval_logits/chosen": -2.7961480617523193, + "eval_logits/rejected": -2.7522692680358887, + "eval_logps/chosen": -435.57562255859375, + "eval_logps/rejected": -403.91156005859375, + "eval_loss": 0.6488239765167236, + "eval_rewards/accuracies": 0.6320000290870667, + "eval_rewards/chosen": -0.39350807666778564, + "eval_rewards/margins": 0.12877221405506134, + "eval_rewards/rejected": -0.5222803354263306, + "eval_runtime": 347.7457, + "eval_samples_per_second": 5.751, + "eval_steps_per_second": 0.719, + "step": 800 + }, + { + "epoch": 0.21, + "grad_norm": 1.7421875, + "learning_rate": 4.8120956574422315e-06, + "logits/chosen": -2.8743693828582764, + "logits/rejected": -2.890942096710205, + "logps/chosen": -443.807373046875, + "logps/rejected": -438.0557556152344, + "loss": 0.6621, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.4286819100379944, + "rewards/margins": 0.10433633625507355, + "rewards/rejected": -0.5330182313919067, + "step": 810 + }, + { + "epoch": 0.21, + "grad_norm": 2.421875, + "learning_rate": 4.803310053882831e-06, + "logits/chosen": -2.844068765640259, + "logits/rejected": -2.8668532371520996, + "logps/chosen": -392.5022277832031, + "logps/rejected": -408.68621826171875, + "loss": 0.6549, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.4813667833805084, + "rewards/margins": 0.12066853046417236, + "rewards/rejected": -0.6020352244377136, + "step": 820 + }, + { + "epoch": 0.22, + "grad_norm": 2.375, + "learning_rate": 4.794332124596775e-06, + "logits/chosen": -2.8786587715148926, + "logits/rejected": -2.8575031757354736, + "logps/chosen": -460.0040588378906, + "logps/rejected": -449.45977783203125, + "loss": 0.6536, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.4967479705810547, + "rewards/margins": 0.12318827956914902, + "rewards/rejected": -0.6199362874031067, + "step": 830 + }, + { + "epoch": 0.22, + "grad_norm": 2.671875, + "learning_rate": 4.785162619238575e-06, + "logits/chosen": -2.8527016639709473, + "logits/rejected": -2.792952060699463, + "logps/chosen": -423.39801025390625, + "logps/rejected": -387.58355712890625, + "loss": 0.654, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.43838852643966675, + "rewards/margins": 0.12122461944818497, + "rewards/rejected": -0.5596131086349487, + "step": 840 + }, + { + "epoch": 0.22, + "grad_norm": 2.09375, + "learning_rate": 4.775802303459288e-06, + "logits/chosen": -2.8446707725524902, + "logits/rejected": -2.8177928924560547, + "logps/chosen": -430.2727966308594, + "logps/rejected": -414.4742736816406, + "loss": 0.6639, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.44889992475509644, + "rewards/margins": 0.10808303207159042, + "rewards/rejected": -0.5569829940795898, + "step": 850 + }, + { + "epoch": 0.23, + "grad_norm": 2.734375, + "learning_rate": 4.766251958842589e-06, + "logits/chosen": -2.8193821907043457, + "logits/rejected": -2.782836675643921, + "logps/chosen": -438.9239196777344, + "logps/rejected": -410.18048095703125, + "loss": 0.6483, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.36728745698928833, + "rewards/margins": 0.1387586146593094, + "rewards/rejected": -0.5060460567474365, + "step": 860 + }, + { + "epoch": 0.23, + "grad_norm": 3.046875, + "learning_rate": 4.7565123828395066e-06, + "logits/chosen": -2.7876265048980713, + "logits/rejected": -2.7442731857299805, + "logps/chosen": -412.59539794921875, + "logps/rejected": -410.63653564453125, + "loss": 0.655, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.2634727656841278, + "rewards/margins": 0.13101360201835632, + "rewards/rejected": -0.39448636770248413, + "step": 870 + }, + { + "epoch": 0.23, + "grad_norm": 1.8828125, + "learning_rate": 4.746584388701831e-06, + "logits/chosen": -2.830331802368164, + "logits/rejected": -2.815990924835205, + "logps/chosen": -431.3417053222656, + "logps/rejected": -406.23236083984375, + "loss": 0.6457, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.308138370513916, + "rewards/margins": 0.13623470067977905, + "rewards/rejected": -0.44437307119369507, + "step": 880 + }, + { + "epoch": 0.23, + "grad_norm": 2.765625, + "learning_rate": 4.736468805414218e-06, + "logits/chosen": -2.8040425777435303, + "logits/rejected": -2.8051490783691406, + "logps/chosen": -414.428466796875, + "logps/rejected": -427.8641052246094, + "loss": 0.6559, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.3916015923023224, + "rewards/margins": 0.15443366765975952, + "rewards/rejected": -0.5460351705551147, + "step": 890 + }, + { + "epoch": 0.24, + "grad_norm": 2.1875, + "learning_rate": 4.7261664776249595e-06, + "logits/chosen": -2.7597362995147705, + "logits/rejected": -2.7200279235839844, + "logps/chosen": -403.2427673339844, + "logps/rejected": -386.6829528808594, + "loss": 0.6335, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.49025583267211914, + "rewards/margins": 0.1762937605381012, + "rewards/rejected": -0.666549563407898, + "step": 900 + }, + { + "epoch": 0.24, + "eval_logits/chosen": -2.7764461040496826, + "eval_logits/rejected": -2.7324697971343994, + "eval_logps/chosen": -441.37982177734375, + "eval_logps/rejected": -412.10833740234375, + "eval_loss": 0.6458185911178589, + "eval_rewards/accuracies": 0.6380000114440918, + "eval_rewards/chosen": -0.4515506625175476, + "eval_rewards/margins": 0.1526976376771927, + "eval_rewards/rejected": -0.6042482256889343, + "eval_runtime": 347.8684, + "eval_samples_per_second": 5.749, + "eval_steps_per_second": 0.719, + "step": 900 + }, + { + "epoch": 0.24, + "grad_norm": 3.421875, + "learning_rate": 4.715678265575463e-06, + "logits/chosen": -2.8672194480895996, + "logits/rejected": -2.7797975540161133, + "logps/chosen": -462.484375, + "logps/rejected": -388.0072326660156, + "loss": 0.6518, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.3822649419307709, + "rewards/margins": 0.13505886495113373, + "rewards/rejected": -0.5173237919807434, + "step": 910 + }, + { + "epoch": 0.24, + "grad_norm": 2.0, + "learning_rate": 4.705005045028415e-06, + "logits/chosen": -2.7902486324310303, + "logits/rejected": -2.733579158782959, + "logps/chosen": -440.94390869140625, + "logps/rejected": -411.362060546875, + "loss": 0.6272, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.4113929867744446, + "rewards/margins": 0.18324372172355652, + "rewards/rejected": -0.5946367383003235, + "step": 920 + }, + { + "epoch": 0.24, + "grad_norm": 2.765625, + "learning_rate": 4.694147707194659e-06, + "logits/chosen": -2.855236768722534, + "logits/rejected": -2.8222384452819824, + "logps/chosen": -445.28094482421875, + "logps/rejected": -420.7135314941406, + "loss": 0.6181, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.42793193459510803, + "rewards/margins": 0.2128107249736786, + "rewards/rejected": -0.6407425999641418, + "step": 930 + }, + { + "epoch": 0.25, + "grad_norm": 11.5, + "learning_rate": 4.683107158658782e-06, + "logits/chosen": -2.787015676498413, + "logits/rejected": -2.7473626136779785, + "logps/chosen": -461.5128479003906, + "logps/rejected": -439.97930908203125, + "loss": 0.6182, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.39246460795402527, + "rewards/margins": 0.2291673719882965, + "rewards/rejected": -0.6216319799423218, + "step": 940 + }, + { + "epoch": 0.25, + "grad_norm": 2.25, + "learning_rate": 4.671884321303407e-06, + "logits/chosen": -2.8133838176727295, + "logits/rejected": -2.7668540477752686, + "logps/chosen": -431.5006408691406, + "logps/rejected": -404.2099609375, + "loss": 0.6353, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.518032431602478, + "rewards/margins": 0.17736072838306427, + "rewards/rejected": -0.6953932046890259, + "step": 950 + }, + { + "epoch": 0.25, + "grad_norm": 2.4375, + "learning_rate": 4.660480132232224e-06, + "logits/chosen": -2.8185486793518066, + "logits/rejected": -2.793464183807373, + "logps/chosen": -465.65814208984375, + "logps/rejected": -436.23931884765625, + "loss": 0.6434, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.6132515668869019, + "rewards/margins": 0.16154471039772034, + "rewards/rejected": -0.7747962474822998, + "step": 960 + }, + { + "epoch": 0.25, + "grad_norm": 4.3125, + "learning_rate": 4.6488955436917414e-06, + "logits/chosen": -2.827209234237671, + "logits/rejected": -2.7537784576416016, + "logps/chosen": -473.32196044921875, + "logps/rejected": -427.2911071777344, + "loss": 0.5966, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.6051194071769714, + "rewards/margins": 0.27736273407936096, + "rewards/rejected": -0.8824821710586548, + "step": 970 + }, + { + "epoch": 0.26, + "grad_norm": 4.34375, + "learning_rate": 4.6371315229917644e-06, + "logits/chosen": -2.8257224559783936, + "logits/rejected": -2.7971818447113037, + "logps/chosen": -481.39697265625, + "logps/rejected": -462.51416015625, + "loss": 0.6356, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.6484049558639526, + "rewards/margins": 0.18180248141288757, + "rewards/rejected": -0.8302074670791626, + "step": 980 + }, + { + "epoch": 0.26, + "grad_norm": 2.171875, + "learning_rate": 4.625189052424638e-06, + "logits/chosen": -2.8246514797210693, + "logits/rejected": -2.7782371044158936, + "logps/chosen": -443.04925537109375, + "logps/rejected": -426.6200256347656, + "loss": 0.6097, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7901213765144348, + "rewards/margins": 0.251308411359787, + "rewards/rejected": -1.0414297580718994, + "step": 990 + }, + { + "epoch": 0.26, + "grad_norm": 2.1875, + "learning_rate": 4.613069129183218e-06, + "logits/chosen": -2.8568129539489746, + "logits/rejected": -2.7981626987457275, + "logps/chosen": -530.5858154296875, + "logps/rejected": -487.3711853027344, + "loss": 0.6286, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.7988755106925964, + "rewards/margins": 0.19343645870685577, + "rewards/rejected": -0.9923120737075806, + "step": 1000 + }, + { + "epoch": 0.26, + "eval_logits/chosen": -2.753098964691162, + "eval_logits/rejected": -2.7123119831085205, + "eval_logps/chosen": -483.1429443359375, + "eval_logps/rejected": -456.1025695800781, + "eval_loss": 0.6405959725379944, + "eval_rewards/accuracies": 0.625, + "eval_rewards/chosen": -0.869181215763092, + "eval_rewards/margins": 0.17500866949558258, + "eval_rewards/rejected": -1.0441899299621582, + "eval_runtime": 347.7912, + "eval_samples_per_second": 5.751, + "eval_steps_per_second": 0.719, + "step": 1000 + }, + { + "epoch": 0.26, + "grad_norm": 4.3125, + "learning_rate": 4.600772765277607e-06, + "logits/chosen": -2.7716658115386963, + "logits/rejected": -2.7464160919189453, + "logps/chosen": -449.4707946777344, + "logps/rejected": -440.67181396484375, + "loss": 0.6483, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.828794002532959, + "rewards/margins": 0.16667340695858002, + "rewards/rejected": -0.995467483997345, + "step": 1010 + }, + { + "epoch": 0.27, + "grad_norm": 2.15625, + "learning_rate": 4.588300987450652e-06, + "logits/chosen": -2.82257342338562, + "logits/rejected": -2.7864184379577637, + "logps/chosen": -431.7757263183594, + "logps/rejected": -390.1167907714844, + "loss": 0.6416, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.5249059200286865, + "rewards/margins": 0.17124707996845245, + "rewards/rejected": -0.6961530447006226, + "step": 1020 + }, + { + "epoch": 0.27, + "grad_norm": 2.265625, + "learning_rate": 4.5756548370922136e-06, + "logits/chosen": -2.77396821975708, + "logits/rejected": -2.745842456817627, + "logps/chosen": -399.4544982910156, + "logps/rejected": -385.6885070800781, + "loss": 0.6561, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.313784658908844, + "rewards/margins": 0.1287391483783722, + "rewards/rejected": -0.44252386689186096, + "step": 1030 + }, + { + "epoch": 0.27, + "grad_norm": 3.46875, + "learning_rate": 4.562835370152206e-06, + "logits/chosen": -2.8060081005096436, + "logits/rejected": -2.755823850631714, + "logps/chosen": -465.9139709472656, + "logps/rejected": -425.3163146972656, + "loss": 0.6176, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.2053132951259613, + "rewards/margins": 0.21412332355976105, + "rewards/rejected": -0.41943663358688354, + "step": 1040 + }, + { + "epoch": 0.27, + "grad_norm": 2.734375, + "learning_rate": 4.54984365705243e-06, + "logits/chosen": -2.8254618644714355, + "logits/rejected": -2.7887418270111084, + "logps/chosen": -431.9395446777344, + "logps/rejected": -413.49468994140625, + "loss": 0.6406, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.3133324682712555, + "rewards/margins": 0.14582258462905884, + "rewards/rejected": -0.45915499329566956, + "step": 1050 + }, + { + "epoch": 0.28, + "grad_norm": 2.3125, + "learning_rate": 4.536680782597191e-06, + "logits/chosen": -2.744650363922119, + "logits/rejected": -2.7300617694854736, + "logps/chosen": -397.44451904296875, + "logps/rejected": -379.4835205078125, + "loss": 0.6429, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3276776671409607, + "rewards/margins": 0.16546614468097687, + "rewards/rejected": -0.49314385652542114, + "step": 1060 + }, + { + "epoch": 0.28, + "grad_norm": 2.703125, + "learning_rate": 4.523347845882718e-06, + "logits/chosen": -2.803278684616089, + "logits/rejected": -2.7433362007141113, + "logps/chosen": -464.9649963378906, + "logps/rejected": -400.2354431152344, + "loss": 0.6388, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.34644556045532227, + "rewards/margins": 0.17325380444526672, + "rewards/rejected": -0.5196993947029114, + "step": 1070 + }, + { + "epoch": 0.28, + "grad_norm": 1.875, + "learning_rate": 4.50984596020539e-06, + "logits/chosen": -2.730520248413086, + "logits/rejected": -2.7134251594543457, + "logps/chosen": -448.7145080566406, + "logps/rejected": -413.89276123046875, + "loss": 0.6438, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.39723989367485046, + "rewards/margins": 0.15517649054527283, + "rewards/rejected": -0.5524164438247681, + "step": 1080 + }, + { + "epoch": 0.29, + "grad_norm": 2.59375, + "learning_rate": 4.4961762529687745e-06, + "logits/chosen": -2.788811206817627, + "logits/rejected": -2.7599310874938965, + "logps/chosen": -435.73614501953125, + "logps/rejected": -406.4289245605469, + "loss": 0.6569, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4202515482902527, + "rewards/margins": 0.11943890154361725, + "rewards/rejected": -0.5396904945373535, + "step": 1090 + }, + { + "epoch": 0.29, + "grad_norm": 3.453125, + "learning_rate": 4.482339865589492e-06, + "logits/chosen": -2.809835195541382, + "logits/rejected": -2.7414846420288086, + "logps/chosen": -434.068115234375, + "logps/rejected": -373.0021667480469, + "loss": 0.669, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.4107758104801178, + "rewards/margins": 0.09588425606489182, + "rewards/rejected": -0.5066600441932678, + "step": 1100 + }, + { + "epoch": 0.29, + "eval_logits/chosen": -2.7354044914245605, + "eval_logits/rejected": -2.6946027278900146, + "eval_logps/chosen": -430.67889404296875, + "eval_logps/rejected": -401.52215576171875, + "eval_loss": 0.6406324505805969, + "eval_rewards/accuracies": 0.6365000009536743, + "eval_rewards/chosen": -0.3445412218570709, + "eval_rewards/margins": 0.1538446694612503, + "eval_rewards/rejected": -0.49838587641716003, + "eval_runtime": 347.8119, + "eval_samples_per_second": 5.75, + "eval_steps_per_second": 0.719, + "step": 1100 + }, + { + "epoch": 0.29, + "grad_norm": 2.3125, + "learning_rate": 4.468337953401909e-06, + "logits/chosen": -2.8207192420959473, + "logits/rejected": -2.8083157539367676, + "logps/chosen": -434.90960693359375, + "logps/rejected": -418.67315673828125, + "loss": 0.6428, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.29851752519607544, + "rewards/margins": 0.14117324352264404, + "rewards/rejected": -0.4396907389163971, + "step": 1110 + }, + { + "epoch": 0.29, + "grad_norm": 3.0625, + "learning_rate": 4.45417168556166e-06, + "logits/chosen": -2.775874614715576, + "logits/rejected": -2.747631788253784, + "logps/chosen": -398.10650634765625, + "logps/rejected": -393.90191650390625, + "loss": 0.6401, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.26665258407592773, + "rewards/margins": 0.15709388256072998, + "rewards/rejected": -0.4237464964389801, + "step": 1120 + }, + { + "epoch": 0.3, + "grad_norm": 2.140625, + "learning_rate": 4.439842244948036e-06, + "logits/chosen": -2.780519723892212, + "logits/rejected": -2.726839065551758, + "logps/chosen": -421.2362365722656, + "logps/rejected": -409.1251525878906, + "loss": 0.6628, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.3165889382362366, + "rewards/margins": 0.10841169208288193, + "rewards/rejected": -0.4250006675720215, + "step": 1130 + }, + { + "epoch": 0.3, + "grad_norm": 3.484375, + "learning_rate": 4.425350828065204e-06, + "logits/chosen": -2.8075780868530273, + "logits/rejected": -2.7242207527160645, + "logps/chosen": -451.6830139160156, + "logps/rejected": -388.5299987792969, + "loss": 0.6071, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.22056348621845245, + "rewards/margins": 0.22768864035606384, + "rewards/rejected": -0.4482521116733551, + "step": 1140 + }, + { + "epoch": 0.3, + "grad_norm": 2.203125, + "learning_rate": 4.410698644942303e-06, + "logits/chosen": -2.8118491172790527, + "logits/rejected": -2.7709052562713623, + "logps/chosen": -445.9686584472656, + "logps/rejected": -409.93218994140625, + "loss": 0.6404, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.3341691493988037, + "rewards/margins": 0.16431960463523865, + "rewards/rejected": -0.49848875403404236, + "step": 1150 + }, + { + "epoch": 0.3, + "grad_norm": 2.203125, + "learning_rate": 4.395886919032406e-06, + "logits/chosen": -2.73759126663208, + "logits/rejected": -2.6830625534057617, + "logps/chosen": -443.0125427246094, + "logps/rejected": -411.4647521972656, + "loss": 0.6215, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.43005380034446716, + "rewards/margins": 0.20315060019493103, + "rewards/rejected": -0.6332044005393982, + "step": 1160 + }, + { + "epoch": 0.31, + "grad_norm": 1.9921875, + "learning_rate": 4.380916887110366e-06, + "logits/chosen": -2.8012502193450928, + "logits/rejected": -2.733692169189453, + "logps/chosen": -441.42498779296875, + "logps/rejected": -387.70147705078125, + "loss": 0.6355, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.5223753452301025, + "rewards/margins": 0.18315255641937256, + "rewards/rejected": -0.7055279016494751, + "step": 1170 + }, + { + "epoch": 0.31, + "grad_norm": 2.9375, + "learning_rate": 4.365789799169539e-06, + "logits/chosen": -2.682478427886963, + "logits/rejected": -2.7226719856262207, + "logps/chosen": -434.474365234375, + "logps/rejected": -434.96771240234375, + "loss": 0.6547, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.4838402271270752, + "rewards/margins": 0.1491493284702301, + "rewards/rejected": -0.6329895257949829, + "step": 1180 + }, + { + "epoch": 0.31, + "grad_norm": 2.265625, + "learning_rate": 4.350506918317416e-06, + "logits/chosen": -2.769261121749878, + "logits/rejected": -2.7192516326904297, + "logps/chosen": -421.7001953125, + "logps/rejected": -409.52642822265625, + "loss": 0.6475, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.4103321135044098, + "rewards/margins": 0.1710616797208786, + "rewards/rejected": -0.5813937783241272, + "step": 1190 + }, + { + "epoch": 0.31, + "grad_norm": 3.328125, + "learning_rate": 4.335069520670149e-06, + "logits/chosen": -2.7317872047424316, + "logits/rejected": -2.693282127380371, + "logps/chosen": -409.9453125, + "logps/rejected": -401.80670166015625, + "loss": 0.6723, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.508698582649231, + "rewards/margins": 0.11917855590581894, + "rewards/rejected": -0.6278771162033081, + "step": 1200 + }, + { + "epoch": 0.31, + "eval_logits/chosen": -2.7076668739318848, + "eval_logits/rejected": -2.6700782775878906, + "eval_logps/chosen": -442.416259765625, + "eval_logps/rejected": -415.98406982421875, + "eval_loss": 0.6357947587966919, + "eval_rewards/accuracies": 0.6424999833106995, + "eval_rewards/chosen": -0.4619145095348358, + "eval_rewards/margins": 0.18109098076820374, + "eval_rewards/rejected": -0.6430054903030396, + "eval_runtime": 348.1294, + "eval_samples_per_second": 5.745, + "eval_steps_per_second": 0.718, + "step": 1200 + }, + { + "epoch": 0.32, + "grad_norm": 2.796875, + "learning_rate": 4.319478895246e-06, + "logits/chosen": -2.737431526184082, + "logits/rejected": -2.6825287342071533, + "logps/chosen": -423.69134521484375, + "logps/rejected": -388.00445556640625, + "loss": 0.631, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.4585542678833008, + "rewards/margins": 0.19313645362854004, + "rewards/rejected": -0.6516907811164856, + "step": 1210 + }, + { + "epoch": 0.32, + "grad_norm": 2.296875, + "learning_rate": 4.303736343857704e-06, + "logits/chosen": -2.806246519088745, + "logits/rejected": -2.7676374912261963, + "logps/chosen": -441.5145568847656, + "logps/rejected": -449.9791564941406, + "loss": 0.6441, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4756022095680237, + "rewards/margins": 0.1646694839000702, + "rewards/rejected": -0.6402716636657715, + "step": 1220 + }, + { + "epoch": 0.32, + "grad_norm": 3.015625, + "learning_rate": 4.287843181003772e-06, + "logits/chosen": -2.7820241451263428, + "logits/rejected": -2.729149341583252, + "logps/chosen": -496.11297607421875, + "logps/rejected": -433.9457092285156, + "loss": 0.6333, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.5403844118118286, + "rewards/margins": 0.1901749074459076, + "rewards/rejected": -0.7305592894554138, + "step": 1230 + }, + { + "epoch": 0.32, + "grad_norm": 2.15625, + "learning_rate": 4.27180073375873e-06, + "logits/chosen": -2.7426555156707764, + "logits/rejected": -2.711142063140869, + "logps/chosen": -488.27862548828125, + "logps/rejected": -438.2783203125, + "loss": 0.6226, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5886915922164917, + "rewards/margins": 0.22011339664459229, + "rewards/rejected": -0.8088048696517944, + "step": 1240 + }, + { + "epoch": 0.33, + "grad_norm": 3.0, + "learning_rate": 4.255610341662304e-06, + "logits/chosen": -2.769096851348877, + "logits/rejected": -2.7023282051086426, + "logps/chosen": -451.603759765625, + "logps/rejected": -415.9313049316406, + "loss": 0.6332, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.6167994141578674, + "rewards/margins": 0.1871766597032547, + "rewards/rejected": -0.8039760589599609, + "step": 1250 + }, + { + "epoch": 0.33, + "grad_norm": 2.046875, + "learning_rate": 4.2392733566075764e-06, + "logits/chosen": -2.7416205406188965, + "logits/rejected": -2.7040672302246094, + "logps/chosen": -444.20526123046875, + "logps/rejected": -424.52972412109375, + "loss": 0.65, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.5938581228256226, + "rewards/margins": 0.1559075564146042, + "rewards/rejected": -0.7497657537460327, + "step": 1260 + }, + { + "epoch": 0.33, + "grad_norm": 2.875, + "learning_rate": 4.2227911427280975e-06, + "logits/chosen": -2.7380728721618652, + "logits/rejected": -2.6828763484954834, + "logps/chosen": -430.66961669921875, + "logps/rejected": -395.87994384765625, + "loss": 0.6311, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.5714975595474243, + "rewards/margins": 0.20155255496501923, + "rewards/rejected": -0.7730501294136047, + "step": 1270 + }, + { + "epoch": 0.33, + "grad_norm": 2.90625, + "learning_rate": 4.206165076283983e-06, + "logits/chosen": -2.7856247425079346, + "logits/rejected": -2.7396748065948486, + "logps/chosen": -438.529296875, + "logps/rejected": -418.1429138183594, + "loss": 0.6182, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.6402245759963989, + "rewards/margins": 0.21725162863731384, + "rewards/rejected": -0.8574762344360352, + "step": 1280 + }, + { + "epoch": 0.34, + "grad_norm": 2.984375, + "learning_rate": 4.189396545546995e-06, + "logits/chosen": -2.7590296268463135, + "logits/rejected": -2.7307534217834473, + "logps/chosen": -455.66064453125, + "logps/rejected": -426.7330017089844, + "loss": 0.6707, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.744498074054718, + "rewards/margins": 0.12376417219638824, + "rewards/rejected": -0.8682621717453003, + "step": 1290 + }, + { + "epoch": 0.34, + "grad_norm": 3.125, + "learning_rate": 4.172486950684627e-06, + "logits/chosen": -2.773496150970459, + "logits/rejected": -2.764591693878174, + "logps/chosen": -448.2276306152344, + "logps/rejected": -449.07598876953125, + "loss": 0.605, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6141419410705566, + "rewards/margins": 0.2553822696208954, + "rewards/rejected": -0.8695241808891296, + "step": 1300 + }, + { + "epoch": 0.34, + "eval_logits/chosen": -2.7122175693511963, + "eval_logits/rejected": -2.6764349937438965, + "eval_logps/chosen": -465.1626892089844, + "eval_logps/rejected": -440.71441650390625, + "eval_loss": 0.6297281384468079, + "eval_rewards/accuracies": 0.6434999704360962, + "eval_rewards/chosen": -0.6893790364265442, + "eval_rewards/margins": 0.20092952251434326, + "eval_rewards/rejected": -0.8903085589408875, + "eval_runtime": 348.1512, + "eval_samples_per_second": 5.745, + "eval_steps_per_second": 0.718, + "step": 1300 + }, + { + "epoch": 0.34, + "grad_norm": 3.4375, + "learning_rate": 4.155437703643182e-06, + "logits/chosen": -2.8083739280700684, + "logits/rejected": -2.752487897872925, + "logps/chosen": -442.2110900878906, + "logps/rejected": -403.32525634765625, + "loss": 0.6226, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6493631601333618, + "rewards/margins": 0.21884135901927948, + "rewards/rejected": -0.8682045936584473, + "step": 1310 + }, + { + "epoch": 0.35, + "grad_norm": 3.09375, + "learning_rate": 4.138250228029882e-06, + "logits/chosen": -2.7438082695007324, + "logits/rejected": -2.7184457778930664, + "logps/chosen": -465.0062561035156, + "logps/rejected": -466.47674560546875, + "loss": 0.6516, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7264341711997986, + "rewards/margins": 0.1592479646205902, + "rewards/rejected": -0.8856821060180664, + "step": 1320 + }, + { + "epoch": 0.35, + "grad_norm": 2.5, + "learning_rate": 4.120925958993994e-06, + "logits/chosen": -2.721998453140259, + "logits/rejected": -2.7064220905303955, + "logps/chosen": -416.6083984375, + "logps/rejected": -412.7229919433594, + "loss": 0.6416, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.6872702836990356, + "rewards/margins": 0.16901233792304993, + "rewards/rejected": -0.8562827110290527, + "step": 1330 + }, + { + "epoch": 0.35, + "grad_norm": 3.84375, + "learning_rate": 4.103466343106999e-06, + "logits/chosen": -2.7732884883880615, + "logits/rejected": -2.7553248405456543, + "logps/chosen": -469.11907958984375, + "logps/rejected": -438.9508361816406, + "loss": 0.6312, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.6161693930625916, + "rewards/margins": 0.18880559504032135, + "rewards/rejected": -0.8049749135971069, + "step": 1340 + }, + { + "epoch": 0.35, + "grad_norm": 2.96875, + "learning_rate": 4.085872838241797e-06, + "logits/chosen": -2.713508129119873, + "logits/rejected": -2.668172836303711, + "logps/chosen": -456.80804443359375, + "logps/rejected": -435.1529846191406, + "loss": 0.6418, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.6335210204124451, + "rewards/margins": 0.1759893149137497, + "rewards/rejected": -0.8095104098320007, + "step": 1350 + }, + { + "epoch": 0.36, + "grad_norm": 2.734375, + "learning_rate": 4.06814691345098e-06, + "logits/chosen": -2.730811595916748, + "logits/rejected": -2.6753551959991455, + "logps/chosen": -447.3460388183594, + "logps/rejected": -414.4151306152344, + "loss": 0.6275, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6381969451904297, + "rewards/margins": 0.1885562241077423, + "rewards/rejected": -0.8267530202865601, + "step": 1360 + }, + { + "epoch": 0.36, + "grad_norm": 2.75, + "learning_rate": 4.050290048844171e-06, + "logits/chosen": -2.7473931312561035, + "logits/rejected": -2.7274961471557617, + "logps/chosen": -464.76397705078125, + "logps/rejected": -457.59674072265625, + "loss": 0.6418, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.6140081882476807, + "rewards/margins": 0.1607227772474289, + "rewards/rejected": -0.7747309803962708, + "step": 1370 + }, + { + "epoch": 0.36, + "grad_norm": 3.21875, + "learning_rate": 4.032303735464422e-06, + "logits/chosen": -2.832428455352783, + "logits/rejected": -2.760331153869629, + "logps/chosen": -478.40185546875, + "logps/rejected": -440.13043212890625, + "loss": 0.628, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.6477630734443665, + "rewards/margins": 0.22839903831481934, + "rewards/rejected": -0.876162052154541, + "step": 1380 + }, + { + "epoch": 0.36, + "grad_norm": 2.953125, + "learning_rate": 4.014189475163727e-06, + "logits/chosen": -2.720529079437256, + "logits/rejected": -2.6880316734313965, + "logps/chosen": -449.77996826171875, + "logps/rejected": -435.7823791503906, + "loss": 0.6226, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.6553342938423157, + "rewards/margins": 0.21635405719280243, + "rewards/rejected": -0.8716884851455688, + "step": 1390 + }, + { + "epoch": 0.37, + "grad_norm": 2.96875, + "learning_rate": 3.995948780477605e-06, + "logits/chosen": -2.7746074199676514, + "logits/rejected": -2.7223048210144043, + "logps/chosen": -469.13836669921875, + "logps/rejected": -439.2986755371094, + "loss": 0.6361, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.6836882829666138, + "rewards/margins": 0.19334295392036438, + "rewards/rejected": -0.8770312070846558, + "step": 1400 + }, + { + "epoch": 0.37, + "eval_logits/chosen": -2.7091026306152344, + "eval_logits/rejected": -2.6711199283599854, + "eval_logps/chosen": -467.6648254394531, + "eval_logps/rejected": -444.7496337890625, + "eval_loss": 0.6266594529151917, + "eval_rewards/accuracies": 0.6504999995231628, + "eval_rewards/chosen": -0.7144004702568054, + "eval_rewards/margins": 0.2162601202726364, + "eval_rewards/rejected": -0.930660605430603, + "eval_runtime": 347.968, + "eval_samples_per_second": 5.748, + "eval_steps_per_second": 0.718, + "step": 1400 + }, + { + "epoch": 0.37, + "grad_norm": 2.375, + "learning_rate": 3.977583174498816e-06, + "logits/chosen": -2.7707862854003906, + "logits/rejected": -2.7450509071350098, + "logps/chosen": -476.1197204589844, + "logps/rejected": -445.98419189453125, + "loss": 0.6258, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.7250741720199585, + "rewards/margins": 0.21456794440746307, + "rewards/rejected": -0.9396421313285828, + "step": 1410 + }, + { + "epoch": 0.37, + "grad_norm": 2.40625, + "learning_rate": 3.959094190750172e-06, + "logits/chosen": -2.780161142349243, + "logits/rejected": -2.7392024993896484, + "logps/chosen": -497.08758544921875, + "logps/rejected": -469.8622131347656, + "loss": 0.6158, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.7258157730102539, + "rewards/margins": 0.2515636384487152, + "rewards/rejected": -0.9773795008659363, + "step": 1420 + }, + { + "epoch": 0.37, + "grad_norm": 2.78125, + "learning_rate": 3.9404833730564975e-06, + "logits/chosen": -2.685615062713623, + "logits/rejected": -2.663395404815674, + "logps/chosen": -459.1412048339844, + "logps/rejected": -449.1641540527344, + "loss": 0.627, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.7857618927955627, + "rewards/margins": 0.22839538753032684, + "rewards/rejected": -1.0141572952270508, + "step": 1430 + }, + { + "epoch": 0.38, + "grad_norm": 2.484375, + "learning_rate": 3.921752275415712e-06, + "logits/chosen": -2.772688388824463, + "logits/rejected": -2.7538001537323, + "logps/chosen": -472.8526916503906, + "logps/rejected": -457.0860900878906, + "loss": 0.6102, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9769464731216431, + "rewards/margins": 0.26345548033714294, + "rewards/rejected": -1.2404019832611084, + "step": 1440 + }, + { + "epoch": 0.38, + "grad_norm": 2.6875, + "learning_rate": 3.902902461869079e-06, + "logits/chosen": -2.7669315338134766, + "logits/rejected": -2.722926378250122, + "logps/chosen": -466.6268615722656, + "logps/rejected": -452.5586853027344, + "loss": 0.6191, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.0403883457183838, + "rewards/margins": 0.2461782991886139, + "rewards/rejected": -1.2865667343139648, + "step": 1450 + }, + { + "epoch": 0.38, + "grad_norm": 4.625, + "learning_rate": 3.883935506370605e-06, + "logits/chosen": -2.698967456817627, + "logits/rejected": -2.680738925933838, + "logps/chosen": -471.3136291503906, + "logps/rejected": -444.6326599121094, + "loss": 0.6268, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.9365707635879517, + "rewards/margins": 0.22086432576179504, + "rewards/rejected": -1.1574350595474243, + "step": 1460 + }, + { + "epoch": 0.38, + "grad_norm": 2.25, + "learning_rate": 3.864852992655617e-06, + "logits/chosen": -2.724776268005371, + "logits/rejected": -2.6953415870666504, + "logps/chosen": -462.2669982910156, + "logps/rejected": -459.33428955078125, + "loss": 0.591, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.8260868787765503, + "rewards/margins": 0.2951758801937103, + "rewards/rejected": -1.121262788772583, + "step": 1470 + }, + { + "epoch": 0.39, + "grad_norm": 2.96875, + "learning_rate": 3.845656514108516e-06, + "logits/chosen": -2.7450900077819824, + "logits/rejected": -2.7022852897644043, + "logps/chosen": -477.535888671875, + "logps/rejected": -422.33587646484375, + "loss": 0.6484, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.9595173597335815, + "rewards/margins": 0.19346167147159576, + "rewards/rejected": -1.1529791355133057, + "step": 1480 + }, + { + "epoch": 0.39, + "grad_norm": 2.734375, + "learning_rate": 3.826347673629738e-06, + "logits/chosen": -2.7165746688842773, + "logits/rejected": -2.6513075828552246, + "logps/chosen": -472.983642578125, + "logps/rejected": -456.36956787109375, + "loss": 0.6028, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.9271873235702515, + "rewards/margins": 0.28870025277137756, + "rewards/rejected": -1.2158875465393066, + "step": 1490 + }, + { + "epoch": 0.39, + "grad_norm": 4.21875, + "learning_rate": 3.8069280835019062e-06, + "logits/chosen": -2.6848392486572266, + "logits/rejected": -2.6425843238830566, + "logps/chosen": -507.7676696777344, + "logps/rejected": -480.82501220703125, + "loss": 0.6085, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.9639270901679993, + "rewards/margins": 0.28983956575393677, + "rewards/rejected": -1.253766655921936, + "step": 1500 + }, + { + "epoch": 0.39, + "eval_logits/chosen": -2.67969012260437, + "eval_logits/rejected": -2.643465995788574, + "eval_logps/chosen": -501.5469055175781, + "eval_logps/rejected": -482.52557373046875, + "eval_loss": 0.6213365197181702, + "eval_rewards/accuracies": 0.6489999890327454, + "eval_rewards/chosen": -1.053221344947815, + "eval_rewards/margins": 0.25519895553588867, + "eval_rewards/rejected": -1.308420181274414, + "eval_runtime": 348.1901, + "eval_samples_per_second": 5.744, + "eval_steps_per_second": 0.718, + "step": 1500 + }, + { + "epoch": 0.4, + "grad_norm": 3.921875, + "learning_rate": 3.7873993652552077e-06, + "logits/chosen": -2.711010456085205, + "logits/rejected": -2.683377981185913, + "logps/chosen": -460.9742736816406, + "logps/rejected": -446.6480407714844, + "loss": 0.6928, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0935702323913574, + "rewards/margins": 0.11124851554632187, + "rewards/rejected": -1.2048187255859375, + "step": 1510 + }, + { + "epoch": 0.4, + "grad_norm": 2.953125, + "learning_rate": 3.7677631495319953e-06, + "logits/chosen": -2.748260021209717, + "logits/rejected": -2.724116802215576, + "logps/chosen": -488.9134826660156, + "logps/rejected": -478.4491271972656, + "loss": 0.6052, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9258454442024231, + "rewards/margins": 0.2710776627063751, + "rewards/rejected": -1.196923017501831, + "step": 1520 + }, + { + "epoch": 0.4, + "grad_norm": 2.984375, + "learning_rate": 3.748021075950633e-06, + "logits/chosen": -2.722660779953003, + "logits/rejected": -2.7004356384277344, + "logps/chosen": -515.2189331054688, + "logps/rejected": -489.676025390625, + "loss": 0.6558, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.9935762286186218, + "rewards/margins": 0.16242071986198425, + "rewards/rejected": -1.1559970378875732, + "step": 1530 + }, + { + "epoch": 0.4, + "grad_norm": 2.515625, + "learning_rate": 3.7281747929685824e-06, + "logits/chosen": -2.693944215774536, + "logits/rejected": -2.662226438522339, + "logps/chosen": -472.21392822265625, + "logps/rejected": -458.48651123046875, + "loss": 0.6344, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.1790839433670044, + "rewards/margins": 0.1845519244670868, + "rewards/rejected": -1.363635778427124, + "step": 1540 + }, + { + "epoch": 0.41, + "grad_norm": 2.75, + "learning_rate": 3.7082259577447604e-06, + "logits/chosen": -2.7512943744659424, + "logits/rejected": -2.7197911739349365, + "logps/chosen": -525.9802856445312, + "logps/rejected": -505.2586975097656, + "loss": 0.6247, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.1584722995758057, + "rewards/margins": 0.232163667678833, + "rewards/rejected": -1.3906362056732178, + "step": 1550 + }, + { + "epoch": 0.41, + "grad_norm": 2.859375, + "learning_rate": 3.6881762360011688e-06, + "logits/chosen": -2.755866050720215, + "logits/rejected": -2.70156192779541, + "logps/chosen": -549.6534423828125, + "logps/rejected": -496.219482421875, + "loss": 0.626, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2471288442611694, + "rewards/margins": 0.21645119786262512, + "rewards/rejected": -1.4635800123214722, + "step": 1560 + }, + { + "epoch": 0.41, + "grad_norm": 3.4375, + "learning_rate": 3.668027301883802e-06, + "logits/chosen": -2.751842498779297, + "logits/rejected": -2.7206082344055176, + "logps/chosen": -509.11871337890625, + "logps/rejected": -488.1609802246094, + "loss": 0.6118, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.2405083179473877, + "rewards/margins": 0.24299952387809753, + "rewards/rejected": -1.483507752418518, + "step": 1570 + }, + { + "epoch": 0.41, + "grad_norm": 2.78125, + "learning_rate": 3.64778083782286e-06, + "logits/chosen": -2.7066681385040283, + "logits/rejected": -2.7152516841888428, + "logps/chosen": -505.77618408203125, + "logps/rejected": -542.4833374023438, + "loss": 0.602, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.2058336734771729, + "rewards/margins": 0.2781684398651123, + "rewards/rejected": -1.4840023517608643, + "step": 1580 + }, + { + "epoch": 0.42, + "grad_norm": 2.96875, + "learning_rate": 3.627438534392268e-06, + "logits/chosen": -2.750699520111084, + "logits/rejected": -2.7490906715393066, + "logps/chosen": -485.8789978027344, + "logps/rejected": -506.06243896484375, + "loss": 0.6448, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.244022250175476, + "rewards/margins": 0.20155465602874756, + "rewards/rejected": -1.4455769062042236, + "step": 1590 + }, + { + "epoch": 0.42, + "grad_norm": 2.78125, + "learning_rate": 3.607002090168506e-06, + "logits/chosen": -2.6556179523468018, + "logits/rejected": -2.649691343307495, + "logps/chosen": -509.494873046875, + "logps/rejected": -483.21875, + "loss": 0.6317, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.0604232549667358, + "rewards/margins": 0.21748527884483337, + "rewards/rejected": -1.2779085636138916, + "step": 1600 + }, + { + "epoch": 0.42, + "eval_logits/chosen": -2.6505815982818604, + "eval_logits/rejected": -2.617206573486328, + "eval_logps/chosen": -508.6858215332031, + "eval_logps/rejected": -489.9322814941406, + "eval_loss": 0.619657576084137, + "eval_rewards/accuracies": 0.6489999890327454, + "eval_rewards/chosen": -1.1246099472045898, + "eval_rewards/margins": 0.2578776180744171, + "eval_rewards/rejected": -1.3824876546859741, + "eval_runtime": 347.7803, + "eval_samples_per_second": 5.751, + "eval_steps_per_second": 0.719, + "step": 1600 + }, + { + "epoch": 0.42, + "grad_norm": 7.125, + "learning_rate": 3.586473211588787e-06, + "logits/chosen": -2.710347890853882, + "logits/rejected": -2.695310354232788, + "logps/chosen": -473.04541015625, + "logps/rejected": -494.8163146972656, + "loss": 0.5936, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.0344406366348267, + "rewards/margins": 0.3075736463069916, + "rewards/rejected": -1.342014193534851, + "step": 1610 + }, + { + "epoch": 0.42, + "grad_norm": 4.5, + "learning_rate": 3.5658536128085623e-06, + "logits/chosen": -2.7335174083709717, + "logits/rejected": -2.673346996307373, + "logps/chosen": -509.60595703125, + "logps/rejected": -487.2217712402344, + "loss": 0.614, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.1273787021636963, + "rewards/margins": 0.2849898934364319, + "rewards/rejected": -1.4123685359954834, + "step": 1620 + }, + { + "epoch": 0.43, + "grad_norm": 4.90625, + "learning_rate": 3.545145015558399e-06, + "logits/chosen": -2.588106155395508, + "logits/rejected": -2.6156704425811768, + "logps/chosen": -476.9378967285156, + "logps/rejected": -474.3050842285156, + "loss": 0.6187, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.2659618854522705, + "rewards/margins": 0.28414902091026306, + "rewards/rejected": -1.5501108169555664, + "step": 1630 + }, + { + "epoch": 0.43, + "grad_norm": 3.09375, + "learning_rate": 3.5243491490002056e-06, + "logits/chosen": -2.6407034397125244, + "logits/rejected": -2.6249217987060547, + "logps/chosen": -523.3114013671875, + "logps/rejected": -515.78564453125, + "loss": 0.6534, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.3326655626296997, + "rewards/margins": 0.2217075377702713, + "rewards/rejected": -1.554373025894165, + "step": 1640 + }, + { + "epoch": 0.43, + "grad_norm": 3.0, + "learning_rate": 3.503467749582857e-06, + "logits/chosen": -2.678744077682495, + "logits/rejected": -2.6236231327056885, + "logps/chosen": -500.12322998046875, + "logps/rejected": -453.09375, + "loss": 0.6798, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.1778029203414917, + "rewards/margins": 0.14696446061134338, + "rewards/rejected": -1.3247674703598022, + "step": 1650 + }, + { + "epoch": 0.43, + "grad_norm": 2.515625, + "learning_rate": 3.4825025608971947e-06, + "logits/chosen": -2.6644175052642822, + "logits/rejected": -2.6675162315368652, + "logps/chosen": -451.5030822753906, + "logps/rejected": -466.20867919921875, + "loss": 0.6371, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.1360504627227783, + "rewards/margins": 0.20215868949890137, + "rewards/rejected": -1.3382090330123901, + "step": 1660 + }, + { + "epoch": 0.44, + "grad_norm": 3.421875, + "learning_rate": 3.4614553335304407e-06, + "logits/chosen": -2.714818239212036, + "logits/rejected": -2.6459782123565674, + "logps/chosen": -528.015380859375, + "logps/rejected": -481.2875061035156, + "loss": 0.6269, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.1526925563812256, + "rewards/margins": 0.24837598204612732, + "rewards/rejected": -1.4010684490203857, + "step": 1670 + }, + { + "epoch": 0.44, + "grad_norm": 2.8125, + "learning_rate": 3.4403278249200222e-06, + "logits/chosen": -2.7072434425354004, + "logits/rejected": -2.6440227031707764, + "logps/chosen": -518.8106689453125, + "logps/rejected": -480.94091796875, + "loss": 0.5745, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.9318562746047974, + "rewards/margins": 0.3667041063308716, + "rewards/rejected": -1.298560380935669, + "step": 1680 + }, + { + "epoch": 0.44, + "grad_norm": 3.0625, + "learning_rate": 3.4191217992068293e-06, + "logits/chosen": -2.739563465118408, + "logits/rejected": -2.681410312652588, + "logps/chosen": -504.88580322265625, + "logps/rejected": -463.4295959472656, + "loss": 0.5969, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.9455466270446777, + "rewards/margins": 0.3077097535133362, + "rewards/rejected": -1.2532564401626587, + "step": 1690 + }, + { + "epoch": 0.44, + "grad_norm": 4.53125, + "learning_rate": 3.3978390270879056e-06, + "logits/chosen": -2.6793107986450195, + "logits/rejected": -2.6711511611938477, + "logps/chosen": -440.7421875, + "logps/rejected": -438.96209716796875, + "loss": 0.6702, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.124517798423767, + "rewards/margins": 0.14334309101104736, + "rewards/rejected": -1.267861008644104, + "step": 1700 + }, + { + "epoch": 0.44, + "eval_logits/chosen": -2.6761555671691895, + "eval_logits/rejected": -2.6407337188720703, + "eval_logps/chosen": -496.5814514160156, + "eval_logps/rejected": -478.1268310546875, + "eval_loss": 0.6182043552398682, + "eval_rewards/accuracies": 0.652999997138977, + "eval_rewards/chosen": -1.0035661458969116, + "eval_rewards/margins": 0.26086705923080444, + "eval_rewards/rejected": -1.2644333839416504, + "eval_runtime": 347.9034, + "eval_samples_per_second": 5.749, + "eval_steps_per_second": 0.719, + "step": 1700 + }, + { + "epoch": 0.45, + "grad_norm": 4.0, + "learning_rate": 3.3764812856685995e-06, + "logits/chosen": -2.7096104621887207, + "logits/rejected": -2.717935562133789, + "logps/chosen": -451.46923828125, + "logps/rejected": -466.99798583984375, + "loss": 0.6448, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9781481027603149, + "rewards/margins": 0.2028496265411377, + "rewards/rejected": -1.180997610092163, + "step": 1710 + }, + { + "epoch": 0.45, + "grad_norm": 3.953125, + "learning_rate": 3.3550503583141726e-06, + "logits/chosen": -2.759023427963257, + "logits/rejected": -2.7278056144714355, + "logps/chosen": -501.0248107910156, + "logps/rejected": -491.77374267578125, + "loss": 0.5937, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.9347749948501587, + "rewards/margins": 0.3128504455089569, + "rewards/rejected": -1.2476253509521484, + "step": 1720 + }, + { + "epoch": 0.45, + "grad_norm": 3.40625, + "learning_rate": 3.3335480345008907e-06, + "logits/chosen": -2.640856981277466, + "logits/rejected": -2.624553918838501, + "logps/chosen": -483.8340759277344, + "logps/rejected": -468.92327880859375, + "loss": 0.637, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.9758474230766296, + "rewards/margins": 0.2623240053653717, + "rewards/rejected": -1.2381714582443237, + "step": 1730 + }, + { + "epoch": 0.46, + "grad_norm": 3.015625, + "learning_rate": 3.3119761096666055e-06, + "logits/chosen": -2.6590754985809326, + "logits/rejected": -2.620539426803589, + "logps/chosen": -497.45587158203125, + "logps/rejected": -465.5917053222656, + "loss": 0.6199, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.884781002998352, + "rewards/margins": 0.24650093913078308, + "rewards/rejected": -1.1312817335128784, + "step": 1740 + }, + { + "epoch": 0.46, + "grad_norm": 3.625, + "learning_rate": 3.290336385060832e-06, + "logits/chosen": -2.7382700443267822, + "logits/rejected": -2.6750712394714355, + "logps/chosen": -496.0645446777344, + "logps/rejected": -476.931884765625, + "loss": 0.6089, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.1136571168899536, + "rewards/margins": 0.2823924124240875, + "rewards/rejected": -1.3960494995117188, + "step": 1750 + }, + { + "epoch": 0.46, + "grad_norm": 3.359375, + "learning_rate": 3.268630667594348e-06, + "logits/chosen": -2.6461880207061768, + "logits/rejected": -2.6458840370178223, + "logps/chosen": -484.7660217285156, + "logps/rejected": -470.3802795410156, + "loss": 0.6207, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.033327341079712, + "rewards/margins": 0.2565310001373291, + "rewards/rejected": -1.2898584604263306, + "step": 1760 + }, + { + "epoch": 0.46, + "grad_norm": 3.21875, + "learning_rate": 3.2468607696883147e-06, + "logits/chosen": -2.664067268371582, + "logits/rejected": -2.641017436981201, + "logps/chosen": -490.5843200683594, + "logps/rejected": -511.4412536621094, + "loss": 0.5835, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.0362211465835571, + "rewards/margins": 0.35183295607566833, + "rewards/rejected": -1.3880541324615479, + "step": 1770 + }, + { + "epoch": 0.47, + "grad_norm": 3.46875, + "learning_rate": 3.225028509122944e-06, + "logits/chosen": -2.7072222232818604, + "logits/rejected": -2.6669273376464844, + "logps/chosen": -479.81292724609375, + "logps/rejected": -471.8095703125, + "loss": 0.6306, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.1294889450073242, + "rewards/margins": 0.23517270386219025, + "rewards/rejected": -1.3646615743637085, + "step": 1780 + }, + { + "epoch": 0.47, + "grad_norm": 5.15625, + "learning_rate": 3.2031357088857083e-06, + "logits/chosen": -2.6887664794921875, + "logits/rejected": -2.672578811645508, + "logps/chosen": -530.8712158203125, + "logps/rejected": -537.0011596679688, + "loss": 0.6336, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.256080985069275, + "rewards/margins": 0.2619550824165344, + "rewards/rejected": -1.518036127090454, + "step": 1790 + }, + { + "epoch": 0.47, + "grad_norm": 5.1875, + "learning_rate": 3.181184197019127e-06, + "logits/chosen": -2.5891640186309814, + "logits/rejected": -2.568915843963623, + "logps/chosen": -489.94512939453125, + "logps/rejected": -538.85986328125, + "loss": 0.5658, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.305600881576538, + "rewards/margins": 0.43258899450302124, + "rewards/rejected": -1.738189935684204, + "step": 1800 + }, + { + "epoch": 0.47, + "eval_logits/chosen": -2.618206739425659, + "eval_logits/rejected": -2.5865581035614014, + "eval_logps/chosen": -531.0144653320312, + "eval_logps/rejected": -515.1605834960938, + "eval_loss": 0.6218886375427246, + "eval_rewards/accuracies": 0.6445000171661377, + "eval_rewards/chosen": -1.3478968143463135, + "eval_rewards/margins": 0.28687387704849243, + "eval_rewards/rejected": -1.6347707509994507, + "eval_runtime": 347.7653, + "eval_samples_per_second": 5.751, + "eval_steps_per_second": 0.719, + "step": 1800 + }, + { + "epoch": 0.47, + "grad_norm": 3.3125, + "learning_rate": 3.159175806468126e-06, + "logits/chosen": -2.5783843994140625, + "logits/rejected": -2.530768871307373, + "logps/chosen": -519.810546875, + "logps/rejected": -486.0419921875, + "loss": 0.6143, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.3787872791290283, + "rewards/margins": 0.29414287209510803, + "rewards/rejected": -1.672930121421814, + "step": 1810 + }, + { + "epoch": 0.48, + "grad_norm": 11.125, + "learning_rate": 3.1371123749269804e-06, + "logits/chosen": -2.6292014122009277, + "logits/rejected": -2.6139864921569824, + "logps/chosen": -565.1087646484375, + "logps/rejected": -546.5718994140625, + "loss": 0.6793, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.386059284210205, + "rewards/margins": 0.1806623637676239, + "rewards/rejected": -1.5667215585708618, + "step": 1820 + }, + { + "epoch": 0.48, + "grad_norm": 3.515625, + "learning_rate": 3.114995744685877e-06, + "logits/chosen": -2.6707286834716797, + "logits/rejected": -2.669381618499756, + "logps/chosen": -514.181640625, + "logps/rejected": -503.86663818359375, + "loss": 0.6624, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.390178918838501, + "rewards/margins": 0.18112266063690186, + "rewards/rejected": -1.5713016986846924, + "step": 1830 + }, + { + "epoch": 0.48, + "grad_norm": 4.3125, + "learning_rate": 3.0928277624770743e-06, + "logits/chosen": -2.7397656440734863, + "logits/rejected": -2.692636013031006, + "logps/chosen": -547.605224609375, + "logps/rejected": -530.4255981445312, + "loss": 0.5952, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2703100442886353, + "rewards/margins": 0.33880940079689026, + "rewards/rejected": -1.6091196537017822, + "step": 1840 + }, + { + "epoch": 0.48, + "grad_norm": 2.703125, + "learning_rate": 3.070610279320708e-06, + "logits/chosen": -2.7308197021484375, + "logits/rejected": -2.691286087036133, + "logps/chosen": -530.3106689453125, + "logps/rejected": -512.4490356445312, + "loss": 0.5837, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.1016675233840942, + "rewards/margins": 0.3237997591495514, + "rewards/rejected": -1.4254672527313232, + "step": 1850 + }, + { + "epoch": 0.49, + "grad_norm": 2.71875, + "learning_rate": 3.0483451503702264e-06, + "logits/chosen": -2.739076614379883, + "logits/rejected": -2.7019972801208496, + "logps/chosen": -540.4013671875, + "logps/rejected": -528.4253540039062, + "loss": 0.6168, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.127809762954712, + "rewards/margins": 0.2989148199558258, + "rewards/rejected": -1.4267246723175049, + "step": 1860 + }, + { + "epoch": 0.49, + "grad_norm": 4.03125, + "learning_rate": 3.0260342347574916e-06, + "logits/chosen": -2.6942806243896484, + "logits/rejected": -2.652289628982544, + "logps/chosen": -524.1945190429688, + "logps/rejected": -504.88848876953125, + "loss": 0.5845, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.0817714929580688, + "rewards/margins": 0.3258208930492401, + "rewards/rejected": -1.4075922966003418, + "step": 1870 + }, + { + "epoch": 0.49, + "grad_norm": 3.78125, + "learning_rate": 3.0036793954375358e-06, + "logits/chosen": -2.7198615074157715, + "logits/rejected": -2.676637649536133, + "logps/chosen": -517.9676513671875, + "logps/rejected": -481.9654846191406, + "loss": 0.5941, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1094682216644287, + "rewards/margins": 0.3417368233203888, + "rewards/rejected": -1.4512050151824951, + "step": 1880 + }, + { + "epoch": 0.49, + "grad_norm": 5.0625, + "learning_rate": 2.981282499033009e-06, + "logits/chosen": -2.70827054977417, + "logits/rejected": -2.6766083240509033, + "logps/chosen": -514.5819091796875, + "logps/rejected": -497.3114318847656, + "loss": 0.6435, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.134453535079956, + "rewards/margins": 0.22488650679588318, + "rewards/rejected": -1.3593400716781616, + "step": 1890 + }, + { + "epoch": 0.5, + "grad_norm": 2.21875, + "learning_rate": 2.9588454156783163e-06, + "logits/chosen": -2.6851532459259033, + "logits/rejected": -2.6381797790527344, + "logps/chosen": -517.7092895507812, + "logps/rejected": -501.8164978027344, + "loss": 0.6039, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.9668029546737671, + "rewards/margins": 0.30051741003990173, + "rewards/rejected": -1.2673202753067017, + "step": 1900 + }, + { + "epoch": 0.5, + "eval_logits/chosen": -2.674211025238037, + "eval_logits/rejected": -2.6375982761383057, + "eval_logps/chosen": -486.3656311035156, + "eval_logps/rejected": -468.8457946777344, + "eval_loss": 0.6153793931007385, + "eval_rewards/accuracies": 0.6629999876022339, + "eval_rewards/chosen": -0.9014082551002502, + "eval_rewards/margins": 0.2702144682407379, + "eval_rewards/rejected": -1.1716225147247314, + "eval_runtime": 347.781, + "eval_samples_per_second": 5.751, + "eval_steps_per_second": 0.719, + "step": 1900 + }, + { + "epoch": 0.5, + "grad_norm": 3.515625, + "learning_rate": 2.9363700188634597e-06, + "logits/chosen": -2.7260212898254395, + "logits/rejected": -2.7005722522735596, + "logps/chosen": -478.63385009765625, + "logps/rejected": -446.9085998535156, + "loss": 0.6221, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.8971524238586426, + "rewards/margins": 0.24045062065124512, + "rewards/rejected": -1.1376030445098877, + "step": 1910 + }, + { + "epoch": 0.5, + "grad_norm": 2.828125, + "learning_rate": 2.9138581852776053e-06, + "logits/chosen": -2.7050108909606934, + "logits/rejected": -2.6705167293548584, + "logps/chosen": -476.31854248046875, + "logps/rejected": -468.4576110839844, + "loss": 0.5804, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.7747586369514465, + "rewards/margins": 0.34002387523651123, + "rewards/rejected": -1.1147825717926025, + "step": 1920 + }, + { + "epoch": 0.51, + "grad_norm": 3.0, + "learning_rate": 2.8913117946523805e-06, + "logits/chosen": -2.6942856311798096, + "logits/rejected": -2.6358091831207275, + "logps/chosen": -478.71759033203125, + "logps/rejected": -443.03668212890625, + "loss": 0.5989, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.8447883725166321, + "rewards/margins": 0.32821527123451233, + "rewards/rejected": -1.1730036735534668, + "step": 1930 + }, + { + "epoch": 0.51, + "grad_norm": 2.609375, + "learning_rate": 2.8687327296049126e-06, + "logits/chosen": -2.701122760772705, + "logits/rejected": -2.682640790939331, + "logps/chosen": -484.0836486816406, + "logps/rejected": -488.54425048828125, + "loss": 0.611, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.8852831125259399, + "rewards/margins": 0.296100914478302, + "rewards/rejected": -1.1813839673995972, + "step": 1940 + }, + { + "epoch": 0.51, + "grad_norm": 3.171875, + "learning_rate": 2.8461228754806376e-06, + "logits/chosen": -2.728684902191162, + "logits/rejected": -2.6673216819763184, + "logps/chosen": -502.921875, + "logps/rejected": -477.1438903808594, + "loss": 0.6062, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9080885052680969, + "rewards/margins": 0.27347394824028015, + "rewards/rejected": -1.1815625429153442, + "step": 1950 + }, + { + "epoch": 0.51, + "grad_norm": 2.265625, + "learning_rate": 2.823484120195865e-06, + "logits/chosen": -2.741576671600342, + "logits/rejected": -2.6904985904693604, + "logps/chosen": -522.7763671875, + "logps/rejected": -488.14080810546875, + "loss": 0.5742, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.9640465974807739, + "rewards/margins": 0.36099857091903687, + "rewards/rejected": -1.325045108795166, + "step": 1960 + }, + { + "epoch": 0.52, + "grad_norm": 4.5625, + "learning_rate": 2.8008183540801486e-06, + "logits/chosen": -2.70404052734375, + "logits/rejected": -2.65217924118042, + "logps/chosen": -512.2633666992188, + "logps/rejected": -466.35870361328125, + "loss": 0.6164, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.0254846811294556, + "rewards/margins": 0.28917989134788513, + "rewards/rejected": -1.3146644830703735, + "step": 1970 + }, + { + "epoch": 0.52, + "grad_norm": 2.96875, + "learning_rate": 2.7781274697184353e-06, + "logits/chosen": -2.6516964435577393, + "logits/rejected": -2.666243076324463, + "logps/chosen": -464.35076904296875, + "logps/rejected": -491.3067932128906, + "loss": 0.628, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0909736156463623, + "rewards/margins": 0.2263956516981125, + "rewards/rejected": -1.3173692226409912, + "step": 1980 + }, + { + "epoch": 0.52, + "grad_norm": 3.640625, + "learning_rate": 2.7554133617930397e-06, + "logits/chosen": -2.6758196353912354, + "logits/rejected": -2.6307640075683594, + "logps/chosen": -480.7010803222656, + "logps/rejected": -462.5948791503906, + "loss": 0.6001, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.9626979827880859, + "rewards/margins": 0.29944872856140137, + "rewards/rejected": -1.2621467113494873, + "step": 1990 + }, + { + "epoch": 0.52, + "grad_norm": 3.78125, + "learning_rate": 2.7326779269254363e-06, + "logits/chosen": -2.732978105545044, + "logits/rejected": -2.6916584968566895, + "logps/chosen": -537.5288696289062, + "logps/rejected": -481.4242248535156, + "loss": 0.6173, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.0523701906204224, + "rewards/margins": 0.30254489183425903, + "rewards/rejected": -1.354914903640747, + "step": 2000 + }, + { + "epoch": 0.52, + "eval_logits/chosen": -2.6580026149749756, + "eval_logits/rejected": -2.6232478618621826, + "eval_logps/chosen": -511.5793151855469, + "eval_logps/rejected": -496.3810119628906, + "eval_loss": 0.6120737791061401, + "eval_rewards/accuracies": 0.6575000286102295, + "eval_rewards/chosen": -1.1535453796386719, + "eval_rewards/margins": 0.29342907667160034, + "eval_rewards/rejected": -1.4469746351242065, + "eval_runtime": 347.7542, + "eval_samples_per_second": 5.751, + "eval_steps_per_second": 0.719, + "step": 2000 + }, + { + "epoch": 0.53, + "grad_norm": 3.171875, + "learning_rate": 2.7099230635178954e-06, + "logits/chosen": -2.6824750900268555, + "logits/rejected": -2.6871109008789062, + "logps/chosen": -506.978759765625, + "logps/rejected": -510.89569091796875, + "loss": 0.6114, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.1684012413024902, + "rewards/margins": 0.284812867641449, + "rewards/rejected": -1.4532140493392944, + "step": 2010 + }, + { + "epoch": 0.53, + "grad_norm": 4.03125, + "learning_rate": 2.6871506715949608e-06, + "logits/chosen": -2.732447624206543, + "logits/rejected": -2.692403554916382, + "logps/chosen": -490.4368591308594, + "logps/rejected": -467.0401306152344, + "loss": 0.6204, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.0760316848754883, + "rewards/margins": 0.24762101471424103, + "rewards/rejected": -1.3236526250839233, + "step": 2020 + }, + { + "epoch": 0.53, + "grad_norm": 3.65625, + "learning_rate": 2.6643626526448063e-06, + "logits/chosen": -2.7549805641174316, + "logits/rejected": -2.7140159606933594, + "logps/chosen": -533.2722778320312, + "logps/rejected": -497.967041015625, + "loss": 0.5817, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.9551935195922852, + "rewards/margins": 0.3695451319217682, + "rewards/rejected": -1.324738621711731, + "step": 2030 + }, + { + "epoch": 0.53, + "grad_norm": 2.703125, + "learning_rate": 2.6415609094604562e-06, + "logits/chosen": -2.7105820178985596, + "logits/rejected": -2.7078709602355957, + "logps/chosen": -511.4810485839844, + "logps/rejected": -498.16705322265625, + "loss": 0.6262, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.0229012966156006, + "rewards/margins": 0.2677188515663147, + "rewards/rejected": -1.290619969367981, + "step": 2040 + }, + { + "epoch": 0.54, + "grad_norm": 2.984375, + "learning_rate": 2.618747345980904e-06, + "logits/chosen": -2.7516863346099854, + "logits/rejected": -2.6846871376037598, + "logps/chosen": -489.28924560546875, + "logps/rejected": -436.2731018066406, + "loss": 0.6068, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1246410608291626, + "rewards/margins": 0.29066139459609985, + "rewards/rejected": -1.4153025150299072, + "step": 2050 + }, + { + "epoch": 0.54, + "grad_norm": 3.8125, + "learning_rate": 2.595923867132136e-06, + "logits/chosen": -2.744058132171631, + "logits/rejected": -2.71818208694458, + "logps/chosen": -528.1295166015625, + "logps/rejected": -513.4764404296875, + "loss": 0.604, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.1548454761505127, + "rewards/margins": 0.3176589012145996, + "rewards/rejected": -1.4725043773651123, + "step": 2060 + }, + { + "epoch": 0.54, + "grad_norm": 3.640625, + "learning_rate": 2.5730923786680672e-06, + "logits/chosen": -2.7094826698303223, + "logits/rejected": -2.7104077339172363, + "logps/chosen": -496.2884826660156, + "logps/rejected": -523.0441284179688, + "loss": 0.629, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.2200782299041748, + "rewards/margins": 0.2470276653766632, + "rewards/rejected": -1.4671061038970947, + "step": 2070 + }, + { + "epoch": 0.54, + "grad_norm": 2.90625, + "learning_rate": 2.5502547870114137e-06, + "logits/chosen": -2.7130637168884277, + "logits/rejected": -2.6693716049194336, + "logps/chosen": -506.1182556152344, + "logps/rejected": -476.56866455078125, + "loss": 0.6481, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.2350577116012573, + "rewards/margins": 0.21940436959266663, + "rewards/rejected": -1.4544621706008911, + "step": 2080 + }, + { + "epoch": 0.55, + "grad_norm": 3.328125, + "learning_rate": 2.527412999094507e-06, + "logits/chosen": -2.6736958026885986, + "logits/rejected": -2.632709264755249, + "logps/chosen": -545.9395751953125, + "logps/rejected": -542.23583984375, + "loss": 0.6028, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.1226942539215088, + "rewards/margins": 0.33349329233169556, + "rewards/rejected": -1.4561874866485596, + "step": 2090 + }, + { + "epoch": 0.55, + "grad_norm": 5.34375, + "learning_rate": 2.504568922200064e-06, + "logits/chosen": -2.6629996299743652, + "logits/rejected": -2.6413462162017822, + "logps/chosen": -490.80059814453125, + "logps/rejected": -472.08319091796875, + "loss": 0.62, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.1775743961334229, + "rewards/margins": 0.25760284066200256, + "rewards/rejected": -1.4351773262023926, + "step": 2100 + }, + { + "epoch": 0.55, + "eval_logits/chosen": -2.662881851196289, + "eval_logits/rejected": -2.627786159515381, + "eval_logps/chosen": -512.2247314453125, + "eval_logps/rejected": -496.9116516113281, + "eval_loss": 0.6116329431533813, + "eval_rewards/accuracies": 0.6650000214576721, + "eval_rewards/chosen": -1.1599992513656616, + "eval_rewards/margins": 0.29228222370147705, + "eval_rewards/rejected": -1.4522814750671387, + "eval_runtime": 347.7217, + "eval_samples_per_second": 5.752, + "eval_steps_per_second": 0.719, + "step": 2100 + }, + { + "epoch": 0.55, + "grad_norm": 3.40625, + "learning_rate": 2.4817244638019333e-06, + "logits/chosen": -2.703998565673828, + "logits/rejected": -2.6619462966918945, + "logps/chosen": -514.3597412109375, + "logps/rejected": -477.54974365234375, + "loss": 0.6243, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.1271086931228638, + "rewards/margins": 0.2521311044692993, + "rewards/rejected": -1.379239797592163, + "step": 2110 + }, + { + "epoch": 0.55, + "grad_norm": 3.125, + "learning_rate": 2.4588815314058155e-06, + "logits/chosen": -2.6903960704803467, + "logits/rejected": -2.680983304977417, + "logps/chosen": -468.71826171875, + "logps/rejected": -445.9656677246094, + "loss": 0.6159, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.9441890716552734, + "rewards/margins": 0.2759680449962616, + "rewards/rejected": -1.2201570272445679, + "step": 2120 + }, + { + "epoch": 0.56, + "grad_norm": 3.34375, + "learning_rate": 2.4360420323899922e-06, + "logits/chosen": -2.6948788166046143, + "logits/rejected": -2.668149471282959, + "logps/chosen": -500.23248291015625, + "logps/rejected": -478.42437744140625, + "loss": 0.6166, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.892300009727478, + "rewards/margins": 0.28213489055633545, + "rewards/rejected": -1.174435019493103, + "step": 2130 + }, + { + "epoch": 0.56, + "grad_norm": 3.109375, + "learning_rate": 2.4132078738460585e-06, + "logits/chosen": -2.7459912300109863, + "logits/rejected": -2.701768398284912, + "logps/chosen": -485.0846252441406, + "logps/rejected": -448.44140625, + "loss": 0.5966, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.9321788549423218, + "rewards/margins": 0.30308184027671814, + "rewards/rejected": -1.2352608442306519, + "step": 2140 + }, + { + "epoch": 0.56, + "grad_norm": 3.40625, + "learning_rate": 2.3903809624196826e-06, + "logits/chosen": -2.7058706283569336, + "logits/rejected": -2.6738340854644775, + "logps/chosen": -463.795654296875, + "logps/rejected": -433.58721923828125, + "loss": 0.6387, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.0220860242843628, + "rewards/margins": 0.22584767639636993, + "rewards/rejected": -1.2479338645935059, + "step": 2150 + }, + { + "epoch": 0.57, + "grad_norm": 5.0, + "learning_rate": 2.3675632041513978e-06, + "logits/chosen": -2.757347583770752, + "logits/rejected": -2.6829416751861572, + "logps/chosen": -521.1004638671875, + "logps/rejected": -448.514404296875, + "loss": 0.5971, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9433916211128235, + "rewards/margins": 0.3374863266944885, + "rewards/rejected": -1.2808778285980225, + "step": 2160 + }, + { + "epoch": 0.57, + "grad_norm": 3.21875, + "learning_rate": 2.3447565043174533e-06, + "logits/chosen": -2.7074646949768066, + "logits/rejected": -2.6540534496307373, + "logps/chosen": -489.44866943359375, + "logps/rejected": -449.8168029785156, + "loss": 0.6124, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.0346986055374146, + "rewards/margins": 0.2758663594722748, + "rewards/rejected": -1.3105649948120117, + "step": 2170 + }, + { + "epoch": 0.57, + "grad_norm": 4.5625, + "learning_rate": 2.321962767270724e-06, + "logits/chosen": -2.69421124458313, + "logits/rejected": -2.6716065406799316, + "logps/chosen": -481.97833251953125, + "logps/rejected": -441.35443115234375, + "loss": 0.614, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.0301321744918823, + "rewards/margins": 0.2506099343299866, + "rewards/rejected": -1.2807420492172241, + "step": 2180 + }, + { + "epoch": 0.57, + "grad_norm": 3.171875, + "learning_rate": 2.299183896281692e-06, + "logits/chosen": -2.66270112991333, + "logits/rejected": -2.640023946762085, + "logps/chosen": -484.82354736328125, + "logps/rejected": -490.209716796875, + "loss": 0.642, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.0577830076217651, + "rewards/margins": 0.22141680121421814, + "rewards/rejected": -1.2791998386383057, + "step": 2190 + }, + { + "epoch": 0.58, + "grad_norm": 4.375, + "learning_rate": 2.2764217933795297e-06, + "logits/chosen": -2.7031655311584473, + "logits/rejected": -2.675705909729004, + "logps/chosen": -489.45269775390625, + "logps/rejected": -480.9483337402344, + "loss": 0.5957, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.9436511993408203, + "rewards/margins": 0.3099278509616852, + "rewards/rejected": -1.2535789012908936, + "step": 2200 + }, + { + "epoch": 0.58, + "eval_logits/chosen": -2.6674294471740723, + "eval_logits/rejected": -2.631662130355835, + "eval_logps/chosen": -492.14892578125, + "eval_logps/rejected": -475.9957580566406, + "eval_loss": 0.6131682991981506, + "eval_rewards/accuracies": 0.6654999852180481, + "eval_rewards/chosen": -0.959242045879364, + "eval_rewards/margins": 0.28388017416000366, + "eval_rewards/rejected": -1.2431222200393677, + "eval_runtime": 347.6695, + "eval_samples_per_second": 5.753, + "eval_steps_per_second": 0.719, + "step": 2200 + }, + { + "epoch": 0.58, + "grad_norm": 3.890625, + "learning_rate": 2.2536783591932786e-06, + "logits/chosen": -2.764758586883545, + "logits/rejected": -2.708240032196045, + "logps/chosen": -508.2550354003906, + "logps/rejected": -496.92022705078125, + "loss": 0.6149, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.9579612612724304, + "rewards/margins": 0.28434932231903076, + "rewards/rejected": -1.242310643196106, + "step": 2210 + }, + { + "epoch": 0.58, + "grad_norm": 2.96875, + "learning_rate": 2.230955492793149e-06, + "logits/chosen": -2.628192901611328, + "logits/rejected": -2.6235828399658203, + "logps/chosen": -510.7039489746094, + "logps/rejected": -509.13592529296875, + "loss": 0.6373, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.038236141204834, + "rewards/margins": 0.2541654407978058, + "rewards/rejected": -1.2924015522003174, + "step": 2220 + }, + { + "epoch": 0.58, + "grad_norm": 3.65625, + "learning_rate": 2.208255091531947e-06, + "logits/chosen": -2.65356707572937, + "logits/rejected": -2.639221668243408, + "logps/chosen": -514.1550903320312, + "logps/rejected": -486.2848205566406, + "loss": 0.6169, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.0424216985702515, + "rewards/margins": 0.28543582558631897, + "rewards/rejected": -1.3278576135635376, + "step": 2230 + }, + { + "epoch": 0.59, + "grad_norm": 3.421875, + "learning_rate": 2.1855790508866435e-06, + "logits/chosen": -2.676987409591675, + "logits/rejected": -2.6617724895477295, + "logps/chosen": -531.5819091796875, + "logps/rejected": -520.0554809570312, + "loss": 0.6348, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.0140860080718994, + "rewards/margins": 0.2539765238761902, + "rewards/rejected": -1.2680623531341553, + "step": 2240 + }, + { + "epoch": 0.59, + "grad_norm": 2.71875, + "learning_rate": 2.162929264300107e-06, + "logits/chosen": -2.7012898921966553, + "logits/rejected": -2.684084415435791, + "logps/chosen": -495.6434631347656, + "logps/rejected": -480.5547790527344, + "loss": 0.58, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.9610779881477356, + "rewards/margins": 0.3606341779232025, + "rewards/rejected": -1.3217121362686157, + "step": 2250 + }, + { + "epoch": 0.59, + "grad_norm": 4.25, + "learning_rate": 2.1403076230230006e-06, + "logits/chosen": -2.6837801933288574, + "logits/rejected": -2.6432583332061768, + "logps/chosen": -504.1858825683594, + "logps/rejected": -483.0741271972656, + "loss": 0.6411, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.0147348642349243, + "rewards/margins": 0.23686587810516357, + "rewards/rejected": -1.2516006231307983, + "step": 2260 + }, + { + "epoch": 0.59, + "grad_norm": 3.78125, + "learning_rate": 2.11771601595586e-06, + "logits/chosen": -2.679379940032959, + "logits/rejected": -2.6332995891571045, + "logps/chosen": -515.0089721679688, + "logps/rejected": -473.6858825683594, + "loss": 0.6223, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.9973223805427551, + "rewards/margins": 0.3110753893852234, + "rewards/rejected": -1.3083977699279785, + "step": 2270 + }, + { + "epoch": 0.6, + "grad_norm": 2.953125, + "learning_rate": 2.0951563294913737e-06, + "logits/chosen": -2.688920497894287, + "logits/rejected": -2.629225492477417, + "logps/chosen": -494.71026611328125, + "logps/rejected": -466.7489318847656, + "loss": 0.575, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.9701055288314819, + "rewards/margins": 0.36299929022789, + "rewards/rejected": -1.3331048488616943, + "step": 2280 + }, + { + "epoch": 0.6, + "grad_norm": 3.0, + "learning_rate": 2.0726304473568693e-06, + "logits/chosen": -2.679488182067871, + "logits/rejected": -2.660877227783203, + "logps/chosen": -480.1832580566406, + "logps/rejected": -459.4652404785156, + "loss": 0.5981, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.9950377345085144, + "rewards/margins": 0.2864169776439667, + "rewards/rejected": -1.2814548015594482, + "step": 2290 + }, + { + "epoch": 0.6, + "grad_norm": 3.828125, + "learning_rate": 2.050140250457023e-06, + "logits/chosen": -2.7407097816467285, + "logits/rejected": -2.6703708171844482, + "logps/chosen": -514.2717895507812, + "logps/rejected": -493.68157958984375, + "loss": 0.6093, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.067603588104248, + "rewards/margins": 0.3232964277267456, + "rewards/rejected": -1.390899896621704, + "step": 2300 + }, + { + "epoch": 0.6, + "eval_logits/chosen": -2.661860704421997, + "eval_logits/rejected": -2.6282894611358643, + "eval_logps/chosen": -505.5737609863281, + "eval_logps/rejected": -489.7906494140625, + "eval_loss": 0.6137638092041016, + "eval_rewards/accuracies": 0.6625000238418579, + "eval_rewards/chosen": -1.0934895277023315, + "eval_rewards/margins": 0.2875814139842987, + "eval_rewards/rejected": -1.3810709714889526, + "eval_runtime": 348.0995, + "eval_samples_per_second": 5.745, + "eval_steps_per_second": 0.718, + "step": 2300 + }, + { + "epoch": 0.6, + "grad_norm": 3.4375, + "learning_rate": 2.0276876167168042e-06, + "logits/chosen": -2.611936092376709, + "logits/rejected": -2.5979480743408203, + "logps/chosen": -461.7267150878906, + "logps/rejected": -432.57012939453125, + "loss": 0.6371, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.189932107925415, + "rewards/margins": 0.21743163466453552, + "rewards/rejected": -1.4073638916015625, + "step": 2310 + }, + { + "epoch": 0.61, + "grad_norm": 4.75, + "learning_rate": 2.0052744209246682e-06, + "logits/chosen": -2.705918788909912, + "logits/rejected": -2.672658920288086, + "logps/chosen": -493.9043884277344, + "logps/rejected": -467.73699951171875, + "loss": 0.629, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.0233014822006226, + "rewards/margins": 0.26420658826828003, + "rewards/rejected": -1.2875080108642578, + "step": 2320 + }, + { + "epoch": 0.61, + "grad_norm": 4.8125, + "learning_rate": 1.9829025345760127e-06, + "logits/chosen": -2.700793743133545, + "logits/rejected": -2.6994223594665527, + "logps/chosen": -514.3643798828125, + "logps/rejected": -513.8361206054688, + "loss": 0.6417, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.9839721918106079, + "rewards/margins": 0.20725660026073456, + "rewards/rejected": -1.1912287473678589, + "step": 2330 + }, + { + "epoch": 0.61, + "grad_norm": 4.53125, + "learning_rate": 1.9605738257169115e-06, + "logits/chosen": -2.6701126098632812, + "logits/rejected": -2.639960289001465, + "logps/chosen": -459.0147399902344, + "logps/rejected": -447.525634765625, + "loss": 0.6503, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.0298200845718384, + "rewards/margins": 0.19326387345790863, + "rewards/rejected": -1.2230839729309082, + "step": 2340 + }, + { + "epoch": 0.62, + "grad_norm": 4.34375, + "learning_rate": 1.9382901587881275e-06, + "logits/chosen": -2.7172811031341553, + "logits/rejected": -2.704822063446045, + "logps/chosen": -486.2060546875, + "logps/rejected": -455.1549377441406, + "loss": 0.6126, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.9620019197463989, + "rewards/margins": 0.29844018816947937, + "rewards/rejected": -1.2604421377182007, + "step": 2350 + }, + { + "epoch": 0.62, + "grad_norm": 2.84375, + "learning_rate": 1.916053394469437e-06, + "logits/chosen": -2.7060694694519043, + "logits/rejected": -2.650001049041748, + "logps/chosen": -501.8260803222656, + "logps/rejected": -487.3619079589844, + "loss": 0.586, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.9851962327957153, + "rewards/margins": 0.33760061860084534, + "rewards/rejected": -1.3227968215942383, + "step": 2360 + }, + { + "epoch": 0.62, + "grad_norm": 3.125, + "learning_rate": 1.8938653895242604e-06, + "logits/chosen": -2.707420825958252, + "logits/rejected": -2.6578211784362793, + "logps/chosen": -503.98114013671875, + "logps/rejected": -489.624755859375, + "loss": 0.5859, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.100565791130066, + "rewards/margins": 0.3875434398651123, + "rewards/rejected": -1.4881092309951782, + "step": 2370 + }, + { + "epoch": 0.62, + "grad_norm": 5.4375, + "learning_rate": 1.8717279966446267e-06, + "logits/chosen": -2.6093525886535645, + "logits/rejected": -2.603379726409912, + "logps/chosen": -481.91998291015625, + "logps/rejected": -481.68890380859375, + "loss": 0.6485, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.1177568435668945, + "rewards/margins": 0.2344493865966797, + "rewards/rejected": -1.3522062301635742, + "step": 2380 + }, + { + "epoch": 0.63, + "grad_norm": 3.234375, + "learning_rate": 1.8496430642964698e-06, + "logits/chosen": -2.685995101928711, + "logits/rejected": -2.6452722549438477, + "logps/chosen": -505.78961181640625, + "logps/rejected": -489.805419921875, + "loss": 0.6215, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.0744038820266724, + "rewards/margins": 0.29028916358947754, + "rewards/rejected": -1.3646929264068604, + "step": 2390 + }, + { + "epoch": 0.63, + "grad_norm": 3.828125, + "learning_rate": 1.827612436565286e-06, + "logits/chosen": -2.692858934402466, + "logits/rejected": -2.651174306869507, + "logps/chosen": -500.21099853515625, + "logps/rejected": -488.8199157714844, + "loss": 0.6009, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.0547789335250854, + "rewards/margins": 0.330585777759552, + "rewards/rejected": -1.3853647708892822, + "step": 2400 + }, + { + "epoch": 0.63, + "eval_logits/chosen": -2.6432461738586426, + "eval_logits/rejected": -2.608781337738037, + "eval_logps/chosen": -501.4175109863281, + "eval_logps/rejected": -486.4694519042969, + "eval_loss": 0.6107898950576782, + "eval_rewards/accuracies": 0.6610000133514404, + "eval_rewards/chosen": -1.051926612854004, + "eval_rewards/margins": 0.2959325611591339, + "eval_rewards/rejected": -1.3478593826293945, + "eval_runtime": 347.9473, + "eval_samples_per_second": 5.748, + "eval_steps_per_second": 0.718, + "step": 2400 + }, + { + "epoch": 0.63, + "grad_norm": 4.21875, + "learning_rate": 1.8056379530021492e-06, + "logits/chosen": -2.7207000255584717, + "logits/rejected": -2.6985878944396973, + "logps/chosen": -473.2940979003906, + "logps/rejected": -458.9827575683594, + "loss": 0.6073, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.0358033180236816, + "rewards/margins": 0.3086177110671997, + "rewards/rejected": -1.344421148300171, + "step": 2410 + }, + { + "epoch": 0.63, + "grad_norm": 5.0, + "learning_rate": 1.7837214484701154e-06, + "logits/chosen": -2.704127311706543, + "logits/rejected": -2.6785435676574707, + "logps/chosen": -496.7994689941406, + "logps/rejected": -474.5835876464844, + "loss": 0.5987, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.0449730157852173, + "rewards/margins": 0.3369174897670746, + "rewards/rejected": -1.3818905353546143, + "step": 2420 + }, + { + "epoch": 0.64, + "grad_norm": 5.1875, + "learning_rate": 1.7618647529910043e-06, + "logits/chosen": -2.6866023540496826, + "logits/rejected": -2.659691095352173, + "logps/chosen": -503.9330139160156, + "logps/rejected": -494.0077209472656, + "loss": 0.6131, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.1332998275756836, + "rewards/margins": 0.2737593650817871, + "rewards/rejected": -1.4070589542388916, + "step": 2430 + }, + { + "epoch": 0.64, + "grad_norm": 3.390625, + "learning_rate": 1.7400696915925996e-06, + "logits/chosen": -2.696007251739502, + "logits/rejected": -2.6478171348571777, + "logps/chosen": -518.2098999023438, + "logps/rejected": -465.0953063964844, + "loss": 0.6265, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.170287013053894, + "rewards/margins": 0.31294775009155273, + "rewards/rejected": -1.4832347631454468, + "step": 2440 + }, + { + "epoch": 0.64, + "grad_norm": 4.625, + "learning_rate": 1.718338084156254e-06, + "logits/chosen": -2.61708927154541, + "logits/rejected": -2.5760369300842285, + "logps/chosen": -529.7711181640625, + "logps/rejected": -493.506591796875, + "loss": 0.6125, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.097019076347351, + "rewards/margins": 0.28592607378959656, + "rewards/rejected": -1.3829452991485596, + "step": 2450 + }, + { + "epoch": 0.64, + "grad_norm": 2.765625, + "learning_rate": 1.6966717452649372e-06, + "logits/chosen": -2.737034320831299, + "logits/rejected": -2.706390857696533, + "logps/chosen": -520.4344482421875, + "logps/rejected": -464.23492431640625, + "loss": 0.5894, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.011051058769226, + "rewards/margins": 0.3496933877468109, + "rewards/rejected": -1.360744595527649, + "step": 2460 + }, + { + "epoch": 0.65, + "grad_norm": 3.25, + "learning_rate": 1.6750724840517103e-06, + "logits/chosen": -2.6961193084716797, + "logits/rejected": -2.677546501159668, + "logps/chosen": -491.8929748535156, + "logps/rejected": -502.51959228515625, + "loss": 0.6301, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.006574273109436, + "rewards/margins": 0.23815563321113586, + "rewards/rejected": -1.244729995727539, + "step": 2470 + }, + { + "epoch": 0.65, + "grad_norm": 4.75, + "learning_rate": 1.6535421040486686e-06, + "logits/chosen": -2.604997158050537, + "logits/rejected": -2.5811421871185303, + "logps/chosen": -477.0950622558594, + "logps/rejected": -455.02410888671875, + "loss": 0.5967, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0661613941192627, + "rewards/margins": 0.30554550886154175, + "rewards/rejected": -1.3717070817947388, + "step": 2480 + }, + { + "epoch": 0.65, + "grad_norm": 3.234375, + "learning_rate": 1.6320824030363458e-06, + "logits/chosen": -2.6581666469573975, + "logits/rejected": -2.642603874206543, + "logps/chosen": -471.28521728515625, + "logps/rejected": -457.45794677734375, + "loss": 0.6281, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.046363115310669, + "rewards/margins": 0.2672274708747864, + "rewards/rejected": -1.3135906457901, + "step": 2490 + }, + { + "epoch": 0.65, + "grad_norm": 4.125, + "learning_rate": 1.6106951728936028e-06, + "logits/chosen": -2.718479633331299, + "logits/rejected": -2.663071870803833, + "logps/chosen": -489.6497497558594, + "logps/rejected": -492.8302307128906, + "loss": 0.5988, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.0412070751190186, + "rewards/margins": 0.3211270868778229, + "rewards/rejected": -1.3623343706130981, + "step": 2500 + }, + { + "epoch": 0.65, + "eval_logits/chosen": -2.647676706314087, + "eval_logits/rejected": -2.6143462657928467, + "eval_logps/chosen": -500.49822998046875, + "eval_logps/rejected": -485.87298583984375, + "eval_loss": 0.6108289957046509, + "eval_rewards/accuracies": 0.6589999794960022, + "eval_rewards/chosen": -1.0427342653274536, + "eval_rewards/margins": 0.299160897731781, + "eval_rewards/rejected": -1.3418951034545898, + "eval_runtime": 347.8052, + "eval_samples_per_second": 5.75, + "eval_steps_per_second": 0.719, + "step": 2500 + }, + { + "epoch": 0.66, + "grad_norm": 3.59375, + "learning_rate": 1.5893821994479996e-06, + "logits/chosen": -2.711664915084839, + "logits/rejected": -2.6842312812805176, + "logps/chosen": -523.2516479492188, + "logps/rejected": -486.0882873535156, + "loss": 0.6146, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.0051112174987793, + "rewards/margins": 0.2787768244743347, + "rewards/rejected": -1.2838881015777588, + "step": 2510 + }, + { + "epoch": 0.66, + "grad_norm": 3.03125, + "learning_rate": 1.5681452623266868e-06, + "logits/chosen": -2.6525261402130127, + "logits/rejected": -2.5945048332214355, + "logps/chosen": -518.8192138671875, + "logps/rejected": -473.05615234375, + "loss": 0.551, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.9474281072616577, + "rewards/margins": 0.451382577419281, + "rewards/rejected": -1.398810625076294, + "step": 2520 + }, + { + "epoch": 0.66, + "grad_norm": 5.03125, + "learning_rate": 1.5469861348078014e-06, + "logits/chosen": -2.678679943084717, + "logits/rejected": -2.631782054901123, + "logps/chosen": -477.3982849121094, + "logps/rejected": -477.5099182128906, + "loss": 0.6084, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.0620421171188354, + "rewards/margins": 0.28218984603881836, + "rewards/rejected": -1.3442319631576538, + "step": 2530 + }, + { + "epoch": 0.66, + "grad_norm": 3.265625, + "learning_rate": 1.5259065836724035e-06, + "logits/chosen": -2.6359200477600098, + "logits/rejected": -2.611725330352783, + "logps/chosen": -475.1856994628906, + "logps/rejected": -478.42254638671875, + "loss": 0.6265, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.0813992023468018, + "rewards/margins": 0.27633678913116455, + "rewards/rejected": -1.3577358722686768, + "step": 2540 + }, + { + "epoch": 0.67, + "grad_norm": 5.1875, + "learning_rate": 1.5049083690569456e-06, + "logits/chosen": -2.6500914096832275, + "logits/rejected": -2.6187453269958496, + "logps/chosen": -469.1063537597656, + "logps/rejected": -479.3811950683594, + "loss": 0.6122, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.0489628314971924, + "rewards/margins": 0.3200768828392029, + "rewards/rejected": -1.36903977394104, + "step": 2550 + }, + { + "epoch": 0.67, + "grad_norm": 4.46875, + "learning_rate": 1.4839932443063057e-06, + "logits/chosen": -2.6609740257263184, + "logits/rejected": -2.613036632537842, + "logps/chosen": -528.0738525390625, + "logps/rejected": -479.12847900390625, + "loss": 0.5824, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.0216680765151978, + "rewards/margins": 0.3447554111480713, + "rewards/rejected": -1.3664233684539795, + "step": 2560 + }, + { + "epoch": 0.67, + "grad_norm": 3.875, + "learning_rate": 1.4631629558273803e-06, + "logits/chosen": -2.6330864429473877, + "logits/rejected": -2.6078662872314453, + "logps/chosen": -480.5138244628906, + "logps/rejected": -466.89813232421875, + "loss": 0.6461, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.1947067975997925, + "rewards/margins": 0.2124728262424469, + "rewards/rejected": -1.4071797132492065, + "step": 2570 + }, + { + "epoch": 0.68, + "grad_norm": 3.546875, + "learning_rate": 1.4424192429432657e-06, + "logits/chosen": -2.6624321937561035, + "logits/rejected": -2.634147882461548, + "logps/chosen": -490.93731689453125, + "logps/rejected": -508.0350646972656, + "loss": 0.6106, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.0125614404678345, + "rewards/margins": 0.31729286909103394, + "rewards/rejected": -1.3298542499542236, + "step": 2580 + }, + { + "epoch": 0.68, + "grad_norm": 4.25, + "learning_rate": 1.421763837748016e-06, + "logits/chosen": -2.6459295749664307, + "logits/rejected": -2.632132053375244, + "logps/chosen": -476.1249084472656, + "logps/rejected": -480.6978454589844, + "loss": 0.601, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0420411825180054, + "rewards/margins": 0.32264775037765503, + "rewards/rejected": -1.3646891117095947, + "step": 2590 + }, + { + "epoch": 0.68, + "grad_norm": 4.1875, + "learning_rate": 1.401198464962021e-06, + "logits/chosen": -2.6568338871002197, + "logits/rejected": -2.613276481628418, + "logps/chosen": -504.14825439453125, + "logps/rejected": -478.09832763671875, + "loss": 0.606, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0711807012557983, + "rewards/margins": 0.307957261800766, + "rewards/rejected": -1.3791382312774658, + "step": 2600 + }, + { + "epoch": 0.68, + "eval_logits/chosen": -2.6304023265838623, + "eval_logits/rejected": -2.5973665714263916, + "eval_logps/chosen": -498.1077575683594, + "eval_logps/rejected": -483.6012878417969, + "eval_loss": 0.611174464225769, + "eval_rewards/accuracies": 0.6545000076293945, + "eval_rewards/chosen": -1.0188294649124146, + "eval_rewards/margins": 0.30034834146499634, + "eval_rewards/rejected": -1.3191777467727661, + "eval_runtime": 347.7844, + "eval_samples_per_second": 5.751, + "eval_steps_per_second": 0.719, + "step": 2600 + }, + { + "epoch": 0.68, + "grad_norm": 2.921875, + "learning_rate": 1.3807248417879896e-06, + "logits/chosen": -2.691493034362793, + "logits/rejected": -2.6732335090637207, + "logps/chosen": -509.51202392578125, + "logps/rejected": -488.71826171875, + "loss": 0.5973, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9540036916732788, + "rewards/margins": 0.33757534623146057, + "rewards/rejected": -1.2915791273117065, + "step": 2610 + }, + { + "epoch": 0.69, + "grad_norm": 4.8125, + "learning_rate": 1.3603446777675665e-06, + "logits/chosen": -2.6027607917785645, + "logits/rejected": -2.576066255569458, + "logps/chosen": -490.2715759277344, + "logps/rejected": -477.48992919921875, + "loss": 0.6103, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.0267969369888306, + "rewards/margins": 0.3205047845840454, + "rewards/rejected": -1.347301721572876, + "step": 2620 + }, + { + "epoch": 0.69, + "grad_norm": 3.15625, + "learning_rate": 1.3400596746385817e-06, + "logits/chosen": -2.6897635459899902, + "logits/rejected": -2.6353797912597656, + "logps/chosen": -514.4585571289062, + "logps/rejected": -483.48565673828125, + "loss": 0.6332, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.041078805923462, + "rewards/margins": 0.2664690613746643, + "rewards/rejected": -1.3075478076934814, + "step": 2630 + }, + { + "epoch": 0.69, + "grad_norm": 3.8125, + "learning_rate": 1.3198715261929587e-06, + "logits/chosen": -2.7139453887939453, + "logits/rejected": -2.6701016426086426, + "logps/chosen": -471.3026428222656, + "logps/rejected": -466.50689697265625, + "loss": 0.5969, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.0769729614257812, + "rewards/margins": 0.31179046630859375, + "rewards/rejected": -1.3887633085250854, + "step": 2640 + }, + { + "epoch": 0.69, + "grad_norm": 4.15625, + "learning_rate": 1.2997819181352823e-06, + "logits/chosen": -2.697122573852539, + "logits/rejected": -2.6447901725769043, + "logps/chosen": -536.4827270507812, + "logps/rejected": -519.1363525390625, + "loss": 0.6039, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.9090393781661987, + "rewards/margins": 0.34489864110946655, + "rewards/rejected": -1.25393807888031, + "step": 2650 + }, + { + "epoch": 0.7, + "grad_norm": 6.28125, + "learning_rate": 1.2797925279420454e-06, + "logits/chosen": -2.68558931350708, + "logits/rejected": -2.6322624683380127, + "logps/chosen": -508.85711669921875, + "logps/rejected": -504.25762939453125, + "loss": 0.5892, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.9868243932723999, + "rewards/margins": 0.371315598487854, + "rewards/rejected": -1.358140230178833, + "step": 2660 + }, + { + "epoch": 0.7, + "grad_norm": 4.5625, + "learning_rate": 1.2599050247215764e-06, + "logits/chosen": -2.6563174724578857, + "logits/rejected": -2.6325182914733887, + "logps/chosen": -482.51422119140625, + "logps/rejected": -471.16632080078125, + "loss": 0.5774, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.011075496673584, + "rewards/margins": 0.36485710740089417, + "rewards/rejected": -1.3759326934814453, + "step": 2670 + }, + { + "epoch": 0.7, + "grad_norm": 3.8125, + "learning_rate": 1.2401210690746705e-06, + "logits/chosen": -2.649975299835205, + "logits/rejected": -2.6116244792938232, + "logps/chosen": -506.97186279296875, + "logps/rejected": -472.00714111328125, + "loss": 0.6322, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.0412980318069458, + "rewards/margins": 0.2263387143611908, + "rewards/rejected": -1.2676366567611694, + "step": 2680 + }, + { + "epoch": 0.7, + "grad_norm": 4.15625, + "learning_rate": 1.2204423129559306e-06, + "logits/chosen": -2.704833984375, + "logits/rejected": -2.6896462440490723, + "logps/chosen": -495.80877685546875, + "logps/rejected": -516.6968994140625, + "loss": 0.5828, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.0221621990203857, + "rewards/margins": 0.38270777463912964, + "rewards/rejected": -1.404869794845581, + "step": 2690 + }, + { + "epoch": 0.71, + "grad_norm": 5.09375, + "learning_rate": 1.20087039953583e-06, + "logits/chosen": -2.675306558609009, + "logits/rejected": -2.650527000427246, + "logps/chosen": -492.5806579589844, + "logps/rejected": -478.58258056640625, + "loss": 0.6118, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0126721858978271, + "rewards/margins": 0.3190918564796448, + "rewards/rejected": -1.3317642211914062, + "step": 2700 + }, + { + "epoch": 0.71, + "eval_logits/chosen": -2.627389669418335, + "eval_logits/rejected": -2.594527244567871, + "eval_logps/chosen": -504.3044738769531, + "eval_logps/rejected": -490.2562255859375, + "eval_loss": 0.6105741262435913, + "eval_rewards/accuracies": 0.659500002861023, + "eval_rewards/chosen": -1.0807968378067017, + "eval_rewards/margins": 0.3049302399158478, + "eval_rewards/rejected": -1.385727047920227, + "eval_runtime": 347.7409, + "eval_samples_per_second": 5.751, + "eval_steps_per_second": 0.719, + "step": 2700 + }, + { + "epoch": 0.71, + "grad_norm": 3.90625, + "learning_rate": 1.181406963063507e-06, + "logits/chosen": -2.6303482055664062, + "logits/rejected": -2.6323742866516113, + "logps/chosen": -503.49786376953125, + "logps/rejected": -513.8046875, + "loss": 0.6299, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.0185651779174805, + "rewards/margins": 0.29084575176239014, + "rewards/rejected": -1.3094110488891602, + "step": 2710 + }, + { + "epoch": 0.71, + "grad_norm": 7.09375, + "learning_rate": 1.1620536287303052e-06, + "logits/chosen": -2.663734197616577, + "logits/rejected": -2.6387317180633545, + "logps/chosen": -552.3123779296875, + "logps/rejected": -516.0528564453125, + "loss": 0.6352, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.1312639713287354, + "rewards/margins": 0.2474222630262375, + "rewards/rejected": -1.3786863088607788, + "step": 2720 + }, + { + "epoch": 0.71, + "grad_norm": 3.53125, + "learning_rate": 1.1428120125340717e-06, + "logits/chosen": -2.66949725151062, + "logits/rejected": -2.630190849304199, + "logps/chosen": -495.2144470214844, + "logps/rejected": -458.618408203125, + "loss": 0.5847, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1124293804168701, + "rewards/margins": 0.3431801199913025, + "rewards/rejected": -1.4556094408035278, + "step": 2730 + }, + { + "epoch": 0.72, + "grad_norm": 4.75, + "learning_rate": 1.123683721144223e-06, + "logits/chosen": -2.6718780994415283, + "logits/rejected": -2.6413564682006836, + "logps/chosen": -537.8629150390625, + "logps/rejected": -509.77056884765625, + "loss": 0.631, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.1508649587631226, + "rewards/margins": 0.2626705765724182, + "rewards/rejected": -1.4135355949401855, + "step": 2740 + }, + { + "epoch": 0.72, + "grad_norm": 4.40625, + "learning_rate": 1.1046703517675848e-06, + "logits/chosen": -2.6759867668151855, + "logits/rejected": -2.66206955909729, + "logps/chosen": -496.76458740234375, + "logps/rejected": -523.6871948242188, + "loss": 0.6281, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.131340503692627, + "rewards/margins": 0.26272568106651306, + "rewards/rejected": -1.3940664529800415, + "step": 2750 + }, + { + "epoch": 0.72, + "grad_norm": 5.03125, + "learning_rate": 1.085773492015028e-06, + "logits/chosen": -2.6411399841308594, + "logits/rejected": -2.598507881164551, + "logps/chosen": -491.46722412109375, + "logps/rejected": -468.1087951660156, + "loss": 0.5897, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.1277272701263428, + "rewards/margins": 0.36656227707862854, + "rewards/rejected": -1.494289517402649, + "step": 2760 + }, + { + "epoch": 0.72, + "grad_norm": 4.96875, + "learning_rate": 1.0669947197689034e-06, + "logits/chosen": -2.6684410572052, + "logits/rejected": -2.619168996810913, + "logps/chosen": -516.2086181640625, + "logps/rejected": -494.1477966308594, + "loss": 0.608, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.11739182472229, + "rewards/margins": 0.31762081384658813, + "rewards/rejected": -1.4350125789642334, + "step": 2770 + }, + { + "epoch": 0.73, + "grad_norm": 3.765625, + "learning_rate": 1.048335603051291e-06, + "logits/chosen": -2.632700204849243, + "logits/rejected": -2.6073365211486816, + "logps/chosen": -529.2457885742188, + "logps/rejected": -526.523681640625, + "loss": 0.5543, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.0624479055404663, + "rewards/margins": 0.49154725670814514, + "rewards/rejected": -1.5539953708648682, + "step": 2780 + }, + { + "epoch": 0.73, + "grad_norm": 4.46875, + "learning_rate": 1.0297976998930665e-06, + "logits/chosen": -2.6516880989074707, + "logits/rejected": -2.6374242305755615, + "logps/chosen": -508.30126953125, + "logps/rejected": -489.03680419921875, + "loss": 0.5965, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.1453887224197388, + "rewards/margins": 0.35155534744262695, + "rewards/rejected": -1.4969440698623657, + "step": 2790 + }, + { + "epoch": 0.73, + "grad_norm": 4.0, + "learning_rate": 1.0113825582038078e-06, + "logits/chosen": -2.6636240482330322, + "logits/rejected": -2.643951654434204, + "logps/chosen": -511.94073486328125, + "logps/rejected": -502.1498107910156, + "loss": 0.6134, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.1842275857925415, + "rewards/margins": 0.28109192848205566, + "rewards/rejected": -1.4653196334838867, + "step": 2800 + }, + { + "epoch": 0.73, + "eval_logits/chosen": -2.6303441524505615, + "eval_logits/rejected": -2.5978312492370605, + "eval_logps/chosen": -511.7178955078125, + "eval_logps/rejected": -498.03662109375, + "eval_loss": 0.6096385717391968, + "eval_rewards/accuracies": 0.6585000157356262, + "eval_rewards/chosen": -1.1549309492111206, + "eval_rewards/margins": 0.3085997700691223, + "eval_rewards/rejected": -1.4635308980941772, + "eval_runtime": 347.7207, + "eval_samples_per_second": 5.752, + "eval_steps_per_second": 0.719, + "step": 2800 + }, + { + "epoch": 0.74, + "grad_norm": 3.15625, + "learning_rate": 9.930917156425477e-07, + "logits/chosen": -2.677860736846924, + "logits/rejected": -2.6445260047912598, + "logps/chosen": -506.71746826171875, + "logps/rejected": -513.6080932617188, + "loss": 0.6076, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.1613363027572632, + "rewards/margins": 0.32913991808891296, + "rewards/rejected": -1.490476369857788, + "step": 2810 + }, + { + "epoch": 0.74, + "grad_norm": 3.796875, + "learning_rate": 9.749266994893756e-07, + "logits/chosen": -2.6382384300231934, + "logits/rejected": -2.5947837829589844, + "logps/chosen": -486.73236083984375, + "logps/rejected": -469.90087890625, + "loss": 0.6641, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.2020397186279297, + "rewards/margins": 0.1584251970052719, + "rewards/rejected": -1.3604648113250732, + "step": 2820 + }, + { + "epoch": 0.74, + "grad_norm": 4.3125, + "learning_rate": 9.56889026517913e-07, + "logits/chosen": -2.6399216651916504, + "logits/rejected": -2.635921001434326, + "logps/chosen": -508.64031982421875, + "logps/rejected": -471.86297607421875, + "loss": 0.6411, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.1394246816635132, + "rewards/margins": 0.22725781798362732, + "rewards/rejected": -1.3666824102401733, + "step": 2830 + }, + { + "epoch": 0.74, + "grad_norm": 5.15625, + "learning_rate": 9.389802028686617e-07, + "logits/chosen": -2.6898536682128906, + "logits/rejected": -2.655017137527466, + "logps/chosen": -504.71441650390625, + "logps/rejected": -477.46258544921875, + "loss": 0.6408, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.1481832265853882, + "rewards/margins": 0.22253009676933289, + "rewards/rejected": -1.370713472366333, + "step": 2840 + }, + { + "epoch": 0.75, + "grad_norm": 4.125, + "learning_rate": 9.212017239232427e-07, + "logits/chosen": -2.66972017288208, + "logits/rejected": -2.6421947479248047, + "logps/chosen": -510.1853942871094, + "logps/rejected": -497.7322692871094, + "loss": 0.5764, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.0694119930267334, + "rewards/margins": 0.37935250997543335, + "rewards/rejected": -1.448764681816101, + "step": 2850 + }, + { + "epoch": 0.75, + "grad_norm": 3.171875, + "learning_rate": 9.03555074179533e-07, + "logits/chosen": -2.651383638381958, + "logits/rejected": -2.6579291820526123, + "logps/chosen": -488.3523864746094, + "logps/rejected": -507.35272216796875, + "loss": 0.5909, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.9926462173461914, + "rewards/margins": 0.34531423449516296, + "rewards/rejected": -1.3379603624343872, + "step": 2860 + }, + { + "epoch": 0.75, + "grad_norm": 4.25, + "learning_rate": 8.860417271277067e-07, + "logits/chosen": -2.722695827484131, + "logits/rejected": -2.719184398651123, + "logps/chosen": -504.15966796875, + "logps/rejected": -503.24078369140625, + "loss": 0.6191, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.9827505350112915, + "rewards/margins": 0.25871509313583374, + "rewards/rejected": -1.2414658069610596, + "step": 2870 + }, + { + "epoch": 0.75, + "grad_norm": 6.25, + "learning_rate": 8.686631451272029e-07, + "logits/chosen": -2.6997554302215576, + "logits/rejected": -2.6661736965179443, + "logps/chosen": -492.560791015625, + "logps/rejected": -473.57403564453125, + "loss": 0.6363, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.149613618850708, + "rewards/margins": 0.23957280814647675, + "rewards/rejected": -1.3891866207122803, + "step": 2880 + }, + { + "epoch": 0.76, + "grad_norm": 4.40625, + "learning_rate": 8.514207792846168e-07, + "logits/chosen": -2.7065916061401367, + "logits/rejected": -2.6797971725463867, + "logps/chosen": -494.64764404296875, + "logps/rejected": -473.16558837890625, + "loss": 0.6074, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.1044108867645264, + "rewards/margins": 0.31540459394454956, + "rewards/rejected": -1.4198153018951416, + "step": 2890 + }, + { + "epoch": 0.76, + "grad_norm": 3.828125, + "learning_rate": 8.343160693325356e-07, + "logits/chosen": -2.6506381034851074, + "logits/rejected": -2.6390433311462402, + "logps/chosen": -493.5978088378906, + "logps/rejected": -498.072509765625, + "loss": 0.6159, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.0802371501922607, + "rewards/margins": 0.29849973320961, + "rewards/rejected": -1.378736972808838, + "step": 2900 + }, + { + "epoch": 0.76, + "eval_logits/chosen": -2.649965763092041, + "eval_logits/rejected": -2.6174795627593994, + "eval_logps/chosen": -501.72564697265625, + "eval_logps/rejected": -486.77386474609375, + "eval_loss": 0.6097070574760437, + "eval_rewards/accuracies": 0.6585000157356262, + "eval_rewards/chosen": -1.0550086498260498, + "eval_rewards/margins": 0.2958948612213135, + "eval_rewards/rejected": -1.3509035110473633, + "eval_runtime": 348.4416, + "eval_samples_per_second": 5.74, + "eval_steps_per_second": 0.717, + "step": 2900 + }, + { + "epoch": 0.76, + "grad_norm": 4.0, + "learning_rate": 8.173504435093174e-07, + "logits/chosen": -2.640998363494873, + "logits/rejected": -2.595895290374756, + "logps/chosen": -470.3514709472656, + "logps/rejected": -444.5486755371094, + "loss": 0.5996, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.065406084060669, + "rewards/margins": 0.3272332549095154, + "rewards/rejected": -1.3926395177841187, + "step": 2910 + }, + { + "epoch": 0.76, + "grad_norm": 3.96875, + "learning_rate": 8.00525318439836e-07, + "logits/chosen": -2.6588757038116455, + "logits/rejected": -2.636864423751831, + "logps/chosen": -520.3292846679688, + "logps/rejected": -518.36328125, + "loss": 0.6253, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.0216096639633179, + "rewards/margins": 0.25970107316970825, + "rewards/rejected": -1.281310796737671, + "step": 2920 + }, + { + "epoch": 0.77, + "grad_norm": 3.484375, + "learning_rate": 7.838420990171927e-07, + "logits/chosen": -2.7186899185180664, + "logits/rejected": -2.671178102493286, + "logps/chosen": -507.38653564453125, + "logps/rejected": -495.093994140625, + "loss": 0.6065, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.029103398323059, + "rewards/margins": 0.2967410683631897, + "rewards/rejected": -1.325844407081604, + "step": 2930 + }, + { + "epoch": 0.77, + "grad_norm": 3.5625, + "learning_rate": 7.673021782854084e-07, + "logits/chosen": -2.6119322776794434, + "logits/rejected": -2.5850017070770264, + "logps/chosen": -498.72705078125, + "logps/rejected": -450.4840393066406, + "loss": 0.6209, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.052886724472046, + "rewards/margins": 0.27135077118873596, + "rewards/rejected": -1.32423734664917, + "step": 2940 + }, + { + "epoch": 0.77, + "grad_norm": 4.03125, + "learning_rate": 7.509069373231039e-07, + "logits/chosen": -2.6534907817840576, + "logits/rejected": -2.6120922565460205, + "logps/chosen": -490.2318420410156, + "logps/rejected": -474.75811767578125, + "loss": 0.5999, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.0758155584335327, + "rewards/margins": 0.31523874402046204, + "rewards/rejected": -1.391054391860962, + "step": 2950 + }, + { + "epoch": 0.77, + "grad_norm": 3.75, + "learning_rate": 7.346577451281822e-07, + "logits/chosen": -2.648266315460205, + "logits/rejected": -2.6395068168640137, + "logps/chosen": -505.58984375, + "logps/rejected": -491.96356201171875, + "loss": 0.595, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.0662881135940552, + "rewards/margins": 0.33815911412239075, + "rewards/rejected": -1.404447317123413, + "step": 2960 + }, + { + "epoch": 0.78, + "grad_norm": 5.03125, + "learning_rate": 7.185559585035138e-07, + "logits/chosen": -2.666665554046631, + "logits/rejected": -2.6183128356933594, + "logps/chosen": -520.7285766601562, + "logps/rejected": -519.2037963867188, + "loss": 0.5844, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.031095266342163, + "rewards/margins": 0.3654022812843323, + "rewards/rejected": -1.3964974880218506, + "step": 2970 + }, + { + "epoch": 0.78, + "grad_norm": 3.90625, + "learning_rate": 7.026029219436504e-07, + "logits/chosen": -2.6764984130859375, + "logits/rejected": -2.6465909481048584, + "logps/chosen": -487.6134338378906, + "logps/rejected": -479.92742919921875, + "loss": 0.6011, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.068189263343811, + "rewards/margins": 0.2995051443576813, + "rewards/rejected": -1.36769437789917, + "step": 2980 + }, + { + "epoch": 0.78, + "grad_norm": 2.96875, + "learning_rate": 6.867999675225523e-07, + "logits/chosen": -2.706010580062866, + "logits/rejected": -2.6768994331359863, + "logps/chosen": -467.7933044433594, + "logps/rejected": -459.462890625, + "loss": 0.6033, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.1356656551361084, + "rewards/margins": 0.3141850531101227, + "rewards/rejected": -1.4498507976531982, + "step": 2990 + }, + { + "epoch": 0.79, + "grad_norm": 5.4375, + "learning_rate": 6.711484147823663e-07, + "logits/chosen": -2.645399570465088, + "logits/rejected": -2.6416878700256348, + "logps/chosen": -460.01348876953125, + "logps/rejected": -490.2184143066406, + "loss": 0.5815, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.0389206409454346, + "rewards/margins": 0.35546866059303284, + "rewards/rejected": -1.3943893909454346, + "step": 3000 + }, + { + "epoch": 0.79, + "eval_logits/chosen": -2.6419765949249268, + "eval_logits/rejected": -2.608949661254883, + "eval_logps/chosen": -506.4727478027344, + "eval_logps/rejected": -492.1650085449219, + "eval_loss": 0.6090958714485168, + "eval_rewards/accuracies": 0.6570000052452087, + "eval_rewards/chosen": -1.1024789810180664, + "eval_rewards/margins": 0.3023359179496765, + "eval_rewards/rejected": -1.4048149585723877, + "eval_runtime": 347.5981, + "eval_samples_per_second": 5.754, + "eval_steps_per_second": 0.719, + "step": 3000 + }, + { + "epoch": 0.79, + "grad_norm": 3.15625, + "learning_rate": 6.556495706232413e-07, + "logits/chosen": -2.6427597999572754, + "logits/rejected": -2.6378586292266846, + "logps/chosen": -508.255615234375, + "logps/rejected": -498.01300048828125, + "loss": 0.6168, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.076911211013794, + "rewards/margins": 0.3045389652252197, + "rewards/rejected": -1.3814500570297241, + "step": 3010 + }, + { + "epoch": 0.79, + "grad_norm": 3.8125, + "learning_rate": 6.403047291942057e-07, + "logits/chosen": -2.635432720184326, + "logits/rejected": -2.580549716949463, + "logps/chosen": -463.28411865234375, + "logps/rejected": -442.45684814453125, + "loss": 0.5944, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.119048833847046, + "rewards/margins": 0.3251452147960663, + "rewards/rejected": -1.444193959236145, + "step": 3020 + }, + { + "epoch": 0.79, + "grad_norm": 4.40625, + "learning_rate": 6.251151717851023e-07, + "logits/chosen": -2.6761887073516846, + "logits/rejected": -2.6448588371276855, + "logps/chosen": -470.81353759765625, + "logps/rejected": -462.959228515625, + "loss": 0.6316, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.1331965923309326, + "rewards/margins": 0.2631588578224182, + "rewards/rejected": -1.3963555097579956, + "step": 3030 + }, + { + "epoch": 0.8, + "grad_norm": 3.4375, + "learning_rate": 6.100821667196041e-07, + "logits/chosen": -2.747220754623413, + "logits/rejected": -2.6560819149017334, + "logps/chosen": -522.0474853515625, + "logps/rejected": -451.25836181640625, + "loss": 0.5939, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.0959001779556274, + "rewards/margins": 0.3455473482608795, + "rewards/rejected": -1.441447377204895, + "step": 3040 + }, + { + "epoch": 0.8, + "grad_norm": 3.40625, + "learning_rate": 5.952069692493062e-07, + "logits/chosen": -2.619112730026245, + "logits/rejected": -2.6229631900787354, + "logps/chosen": -463.21539306640625, + "logps/rejected": -479.87957763671875, + "loss": 0.5902, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.0741875171661377, + "rewards/margins": 0.34632328152656555, + "rewards/rejected": -1.4205108880996704, + "step": 3050 + }, + { + "epoch": 0.8, + "grad_norm": 4.5625, + "learning_rate": 5.80490821448918e-07, + "logits/chosen": -2.6366074085235596, + "logits/rejected": -2.636824131011963, + "logps/chosen": -507.3749084472656, + "logps/rejected": -560.8580322265625, + "loss": 0.5986, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.0962883234024048, + "rewards/margins": 0.34130847454071045, + "rewards/rejected": -1.4375969171524048, + "step": 3060 + }, + { + "epoch": 0.8, + "grad_norm": 4.84375, + "learning_rate": 5.659349521125459e-07, + "logits/chosen": -2.7327983379364014, + "logits/rejected": -2.7102444171905518, + "logps/chosen": -523.8292846679688, + "logps/rejected": -506.67742919921875, + "loss": 0.6271, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.0699433088302612, + "rewards/margins": 0.2572742998600006, + "rewards/rejected": -1.3272178173065186, + "step": 3070 + }, + { + "epoch": 0.81, + "grad_norm": 3.625, + "learning_rate": 5.5154057665109e-07, + "logits/chosen": -2.7022974491119385, + "logits/rejected": -2.6688647270202637, + "logps/chosen": -505.4044494628906, + "logps/rejected": -490.70867919921875, + "loss": 0.5871, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.1016364097595215, + "rewards/margins": 0.34242209792137146, + "rewards/rejected": -1.4440586566925049, + "step": 3080 + }, + { + "epoch": 0.81, + "grad_norm": 2.828125, + "learning_rate": 5.373088969907586e-07, + "logits/chosen": -2.707888126373291, + "logits/rejected": -2.647016763687134, + "logps/chosen": -517.645751953125, + "logps/rejected": -478.8772888183594, + "loss": 0.5976, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.051059603691101, + "rewards/margins": 0.3384644389152527, + "rewards/rejected": -1.389524221420288, + "step": 3090 + }, + { + "epoch": 0.81, + "grad_norm": 3.234375, + "learning_rate": 5.23241101472709e-07, + "logits/chosen": -2.671968936920166, + "logits/rejected": -2.647467851638794, + "logps/chosen": -513.36669921875, + "logps/rejected": -501.7236328125, + "loss": 0.5885, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.9971181154251099, + "rewards/margins": 0.36711040139198303, + "rewards/rejected": -1.3642284870147705, + "step": 3100 + }, + { + "epoch": 0.81, + "eval_logits/chosen": -2.633734703063965, + "eval_logits/rejected": -2.600095272064209, + "eval_logps/chosen": -505.9959716796875, + "eval_logps/rejected": -491.744384765625, + "eval_loss": 0.6088695526123047, + "eval_rewards/accuracies": 0.659500002861023, + "eval_rewards/chosen": -1.0977121591567993, + "eval_rewards/margins": 0.3028964698314667, + "eval_rewards/rejected": -1.4006085395812988, + "eval_runtime": 347.6519, + "eval_samples_per_second": 5.753, + "eval_steps_per_second": 0.719, + "step": 3100 + }, + { + "epoch": 0.81, + "grad_norm": 4.0625, + "learning_rate": 5.09338364753818e-07, + "logits/chosen": -2.7147414684295654, + "logits/rejected": -2.6615347862243652, + "logps/chosen": -522.6924438476562, + "logps/rejected": -512.82373046875, + "loss": 0.6231, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.9803631901741028, + "rewards/margins": 0.28811120986938477, + "rewards/rejected": -1.2684743404388428, + "step": 3110 + }, + { + "epoch": 0.82, + "grad_norm": 3.828125, + "learning_rate": 4.956018477086005e-07, + "logits/chosen": -2.671032190322876, + "logits/rejected": -2.633485794067383, + "logps/chosen": -516.436767578125, + "logps/rejected": -491.43963623046875, + "loss": 0.6036, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.0946416854858398, + "rewards/margins": 0.33089718222618103, + "rewards/rejected": -1.4255390167236328, + "step": 3120 + }, + { + "epoch": 0.82, + "grad_norm": 3.234375, + "learning_rate": 4.820326973322764e-07, + "logits/chosen": -2.63783860206604, + "logits/rejected": -2.608571767807007, + "logps/chosen": -481.3115234375, + "logps/rejected": -481.80499267578125, + "loss": 0.619, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.1024813652038574, + "rewards/margins": 0.2848733067512512, + "rewards/rejected": -1.3873546123504639, + "step": 3130 + }, + { + "epoch": 0.82, + "grad_norm": 3.75, + "learning_rate": 4.686320466449981e-07, + "logits/chosen": -2.6718032360076904, + "logits/rejected": -2.6056950092315674, + "logps/chosen": -484.45489501953125, + "logps/rejected": -480.09619140625, + "loss": 0.6237, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.1476237773895264, + "rewards/margins": 0.28761929273605347, + "rewards/rejected": -1.4352428913116455, + "step": 3140 + }, + { + "epoch": 0.82, + "grad_norm": 3.53125, + "learning_rate": 4.554010145972418e-07, + "logits/chosen": -2.72918438911438, + "logits/rejected": -2.6762828826904297, + "logps/chosen": -508.759521484375, + "logps/rejected": -497.8362731933594, + "loss": 0.6242, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.1187124252319336, + "rewards/margins": 0.27222010493278503, + "rewards/rejected": -1.3909324407577515, + "step": 3150 + }, + { + "epoch": 0.83, + "grad_norm": 3.4375, + "learning_rate": 4.4234070597637455e-07, + "logits/chosen": -2.6751275062561035, + "logits/rejected": -2.640679121017456, + "logps/chosen": -520.294921875, + "logps/rejected": -521.7930908203125, + "loss": 0.5949, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.0745341777801514, + "rewards/margins": 0.33133482933044434, + "rewards/rejected": -1.4058691263198853, + "step": 3160 + }, + { + "epoch": 0.83, + "grad_norm": 4.0, + "learning_rate": 4.2945221131440783e-07, + "logits/chosen": -2.6148898601531982, + "logits/rejected": -2.5540575981140137, + "logps/chosen": -503.8189392089844, + "logps/rejected": -468.48468017578125, + "loss": 0.6091, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.0721933841705322, + "rewards/margins": 0.2973105311393738, + "rewards/rejected": -1.3695039749145508, + "step": 3170 + }, + { + "epoch": 0.83, + "grad_norm": 3.234375, + "learning_rate": 4.167366067969381e-07, + "logits/chosen": -2.6700875759124756, + "logits/rejected": -2.6666862964630127, + "logps/chosen": -451.3338928222656, + "logps/rejected": -494.00537109375, + "loss": 0.6031, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0849757194519043, + "rewards/margins": 0.3115970492362976, + "rewards/rejected": -1.3965727090835571, + "step": 3180 + }, + { + "epoch": 0.83, + "grad_norm": 3.65625, + "learning_rate": 4.041949541732826e-07, + "logits/chosen": -2.6685242652893066, + "logits/rejected": -2.658634662628174, + "logps/chosen": -498.36871337890625, + "logps/rejected": -510.15118408203125, + "loss": 0.584, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.0659822225570679, + "rewards/margins": 0.3512209951877594, + "rewards/rejected": -1.4172031879425049, + "step": 3190 + }, + { + "epoch": 0.84, + "grad_norm": 2.203125, + "learning_rate": 3.9182830066782614e-07, + "logits/chosen": -2.637341022491455, + "logits/rejected": -2.636918544769287, + "logps/chosen": -501.9124450683594, + "logps/rejected": -526.0079345703125, + "loss": 0.6074, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0901362895965576, + "rewards/margins": 0.30762726068496704, + "rewards/rejected": -1.3977636098861694, + "step": 3200 + }, + { + "epoch": 0.84, + "eval_logits/chosen": -2.6388015747070312, + "eval_logits/rejected": -2.605605363845825, + "eval_logps/chosen": -506.04547119140625, + "eval_logps/rejected": -491.9724426269531, + "eval_loss": 0.6086438894271851, + "eval_rewards/accuracies": 0.6604999899864197, + "eval_rewards/chosen": -1.0982069969177246, + "eval_rewards/margins": 0.3046818971633911, + "eval_rewards/rejected": -1.4028888940811157, + "eval_runtime": 347.8304, + "eval_samples_per_second": 5.75, + "eval_steps_per_second": 0.719, + "step": 3200 + }, + { + "epoch": 0.84, + "grad_norm": 3.28125, + "learning_rate": 3.796376788925771e-07, + "logits/chosen": -2.6419506072998047, + "logits/rejected": -2.6541552543640137, + "logps/chosen": -491.3660583496094, + "logps/rejected": -470.6940002441406, + "loss": 0.6613, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.115142583847046, + "rewards/margins": 0.18211853504180908, + "rewards/rejected": -1.2972612380981445, + "step": 3210 + }, + { + "epoch": 0.84, + "grad_norm": 3.90625, + "learning_rate": 3.676241067609465e-07, + "logits/chosen": -2.672940731048584, + "logits/rejected": -2.6646289825439453, + "logps/chosen": -534.7271728515625, + "logps/rejected": -501.14385986328125, + "loss": 0.6169, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.0392810106277466, + "rewards/margins": 0.2989768981933594, + "rewards/rejected": -1.3382577896118164, + "step": 3220 + }, + { + "epoch": 0.85, + "grad_norm": 4.46875, + "learning_rate": 3.5578858740274976e-07, + "logits/chosen": -2.627161979675293, + "logits/rejected": -2.624307155609131, + "logps/chosen": -498.4679260253906, + "logps/rejected": -492.9081115722656, + "loss": 0.6572, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.0917952060699463, + "rewards/margins": 0.18084125220775604, + "rewards/rejected": -1.2726365327835083, + "step": 3230 + }, + { + "epoch": 0.85, + "grad_norm": 5.125, + "learning_rate": 3.44132109080447e-07, + "logits/chosen": -2.7235939502716064, + "logits/rejected": -2.68570613861084, + "logps/chosen": -511.3997497558594, + "logps/rejected": -466.37591552734375, + "loss": 0.6249, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.0969817638397217, + "rewards/margins": 0.2552695572376251, + "rewards/rejected": -1.3522512912750244, + "step": 3240 + }, + { + "epoch": 0.85, + "grad_norm": 3.234375, + "learning_rate": 3.3265564510662344e-07, + "logits/chosen": -2.7106406688690186, + "logits/rejected": -2.665052890777588, + "logps/chosen": -538.0487060546875, + "logps/rejected": -513.3031005859375, + "loss": 0.5986, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.014005422592163, + "rewards/margins": 0.33192679286003113, + "rewards/rejected": -1.3459322452545166, + "step": 3250 + }, + { + "epoch": 0.85, + "grad_norm": 3.6875, + "learning_rate": 3.213601537627195e-07, + "logits/chosen": -2.6548619270324707, + "logits/rejected": -2.6132593154907227, + "logps/chosen": -503.3099060058594, + "logps/rejected": -498.5037536621094, + "loss": 0.599, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1849998235702515, + "rewards/margins": 0.2957174777984619, + "rewards/rejected": -1.480717420578003, + "step": 3260 + }, + { + "epoch": 0.86, + "grad_norm": 3.421875, + "learning_rate": 3.1024657821901063e-07, + "logits/chosen": -2.6821701526641846, + "logits/rejected": -2.667227268218994, + "logps/chosen": -484.52783203125, + "logps/rejected": -481.2435607910156, + "loss": 0.6298, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.0785605907440186, + "rewards/margins": 0.2731327414512634, + "rewards/rejected": -1.3516933917999268, + "step": 3270 + }, + { + "epoch": 0.86, + "grad_norm": 3.125, + "learning_rate": 2.9931584645585654e-07, + "logits/chosen": -2.6534295082092285, + "logits/rejected": -2.655452013015747, + "logps/chosen": -510.244384765625, + "logps/rejected": -527.35546875, + "loss": 0.639, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.032504677772522, + "rewards/margins": 0.2230747640132904, + "rewards/rejected": -1.2555794715881348, + "step": 3280 + }, + { + "epoch": 0.86, + "grad_norm": 3.078125, + "learning_rate": 2.885688711862136e-07, + "logits/chosen": -2.6963908672332764, + "logits/rejected": -2.7045657634735107, + "logps/chosen": -497.63653564453125, + "logps/rejected": -511.78851318359375, + "loss": 0.6386, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.0954744815826416, + "rewards/margins": 0.259405255317688, + "rewards/rejected": -1.3548799753189087, + "step": 3290 + }, + { + "epoch": 0.86, + "grad_norm": 3.25, + "learning_rate": 2.7800654977942486e-07, + "logits/chosen": -2.6455070972442627, + "logits/rejected": -2.602128505706787, + "logps/chosen": -493.291259765625, + "logps/rejected": -495.4798889160156, + "loss": 0.5981, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.0171977281570435, + "rewards/margins": 0.32562780380249023, + "rewards/rejected": -1.3428254127502441, + "step": 3300 + }, + { + "epoch": 0.86, + "eval_logits/chosen": -2.64424991607666, + "eval_logits/rejected": -2.611661911010742, + "eval_logps/chosen": -504.757080078125, + "eval_logps/rejected": -490.4914855957031, + "eval_loss": 0.6086958050727844, + "eval_rewards/accuracies": 0.6610000133514404, + "eval_rewards/chosen": -1.0853232145309448, + "eval_rewards/margins": 0.30275672674179077, + "eval_rewards/rejected": -1.3880800008773804, + "eval_runtime": 347.7972, + "eval_samples_per_second": 5.75, + "eval_steps_per_second": 0.719, + "step": 3300 + }, + { + "epoch": 0.87, + "grad_norm": 6.28125, + "learning_rate": 2.6762976418628797e-07, + "logits/chosen": -2.68962025642395, + "logits/rejected": -2.662564754486084, + "logps/chosen": -457.9961853027344, + "logps/rejected": -422.3614807128906, + "loss": 0.5896, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.0693327188491821, + "rewards/margins": 0.3496701717376709, + "rewards/rejected": -1.4190027713775635, + "step": 3310 + }, + { + "epoch": 0.87, + "grad_norm": 3.46875, + "learning_rate": 2.5743938086541354e-07, + "logits/chosen": -2.6620020866394043, + "logits/rejected": -2.629638433456421, + "logps/chosen": -499.93896484375, + "logps/rejected": -487.2354431152344, + "loss": 0.6226, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.130860686302185, + "rewards/margins": 0.27780821919441223, + "rewards/rejected": -1.408668875694275, + "step": 3320 + }, + { + "epoch": 0.87, + "grad_norm": 3.984375, + "learning_rate": 2.4743625071087574e-07, + "logits/chosen": -2.7372817993164062, + "logits/rejected": -2.6844122409820557, + "logps/chosen": -516.2169189453125, + "logps/rejected": -501.70721435546875, + "loss": 0.5967, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.0415713787078857, + "rewards/margins": 0.37831583619117737, + "rewards/rejected": -1.4198873043060303, + "step": 3330 + }, + { + "epoch": 0.87, + "grad_norm": 4.125, + "learning_rate": 2.3762120898116498e-07, + "logits/chosen": -2.6761159896850586, + "logits/rejected": -2.651787281036377, + "logps/chosen": -508.19891357421875, + "logps/rejected": -515.2844848632812, + "loss": 0.5993, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.1085197925567627, + "rewards/margins": 0.3225208520889282, + "rewards/rejected": -1.4310405254364014, + "step": 3340 + }, + { + "epoch": 0.88, + "grad_norm": 3.390625, + "learning_rate": 2.2799507522944048e-07, + "logits/chosen": -2.6180145740509033, + "logits/rejected": -2.5854008197784424, + "logps/chosen": -513.4478759765625, + "logps/rejected": -516.1890869140625, + "loss": 0.5651, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.0392944812774658, + "rewards/margins": 0.3910349905490875, + "rewards/rejected": -1.4303295612335205, + "step": 3350 + }, + { + "epoch": 0.88, + "grad_norm": 3.6875, + "learning_rate": 2.1855865323510056e-07, + "logits/chosen": -2.6438541412353516, + "logits/rejected": -2.5801243782043457, + "logps/chosen": -512.1270751953125, + "logps/rejected": -525.7010498046875, + "loss": 0.5627, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.0036083459854126, + "rewards/margins": 0.4596267640590668, + "rewards/rejected": -1.4632351398468018, + "step": 3360 + }, + { + "epoch": 0.88, + "grad_norm": 3.90625, + "learning_rate": 2.0931273093666575e-07, + "logits/chosen": -2.637359380722046, + "logits/rejected": -2.609163761138916, + "logps/chosen": -479.6756896972656, + "logps/rejected": -470.44158935546875, + "loss": 0.5943, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.1648880243301392, + "rewards/margins": 0.31521743535995483, + "rewards/rejected": -1.4801056385040283, + "step": 3370 + }, + { + "epoch": 0.88, + "grad_norm": 4.25, + "learning_rate": 2.002580803659873e-07, + "logits/chosen": -2.655611276626587, + "logits/rejected": -2.6167244911193848, + "logps/chosen": -495.48779296875, + "logps/rejected": -477.42877197265625, + "loss": 0.6252, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.13040292263031, + "rewards/margins": 0.2747969925403595, + "rewards/rejected": -1.4052000045776367, + "step": 3380 + }, + { + "epoch": 0.89, + "grad_norm": 3.328125, + "learning_rate": 1.913954575837826e-07, + "logits/chosen": -2.6717348098754883, + "logits/rejected": -2.594569683074951, + "logps/chosen": -509.77593994140625, + "logps/rejected": -469.570068359375, + "loss": 0.573, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0561670064926147, + "rewards/margins": 0.38789016008377075, + "rewards/rejected": -1.4440572261810303, + "step": 3390 + }, + { + "epoch": 0.89, + "grad_norm": 3.8125, + "learning_rate": 1.827256026165028e-07, + "logits/chosen": -2.6890311241149902, + "logits/rejected": -2.6426968574523926, + "logps/chosen": -555.2242431640625, + "logps/rejected": -503.68292236328125, + "loss": 0.5944, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.9910923838615417, + "rewards/margins": 0.3401171565055847, + "rewards/rejected": -1.331209421157837, + "step": 3400 + }, + { + "epoch": 0.89, + "eval_logits/chosen": -2.6360013484954834, + "eval_logits/rejected": -2.602590560913086, + "eval_logps/chosen": -505.1947021484375, + "eval_logps/rejected": -490.9886779785156, + "eval_loss": 0.6087493300437927, + "eval_rewards/accuracies": 0.6579999923706055, + "eval_rewards/chosen": -1.0896990299224854, + "eval_rewards/margins": 0.3033522665500641, + "eval_rewards/rejected": -1.393051266670227, + "eval_runtime": 347.6769, + "eval_samples_per_second": 5.752, + "eval_steps_per_second": 0.719, + "step": 3400 + }, + { + "epoch": 0.89, + "grad_norm": 4.46875, + "learning_rate": 1.7424923939454274e-07, + "logits/chosen": -2.651811122894287, + "logits/rejected": -2.603327512741089, + "logps/chosen": -529.9959106445312, + "logps/rejected": -487.5215759277344, + "loss": 0.6188, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.1013660430908203, + "rewards/margins": 0.261643648147583, + "rewards/rejected": -1.3630096912384033, + "step": 3410 + }, + { + "epoch": 0.9, + "grad_norm": 4.0, + "learning_rate": 1.6596707569179304e-07, + "logits/chosen": -2.723508596420288, + "logits/rejected": -2.6757960319519043, + "logps/chosen": -535.6593627929688, + "logps/rejected": -493.79840087890625, + "loss": 0.6145, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.080249309539795, + "rewards/margins": 0.29932349920272827, + "rewards/rejected": -1.379572868347168, + "step": 3420 + }, + { + "epoch": 0.9, + "grad_norm": 3.53125, + "learning_rate": 1.578798030665385e-07, + "logits/chosen": -2.687269687652588, + "logits/rejected": -2.629744529724121, + "logps/chosen": -520.5648193359375, + "logps/rejected": -503.39532470703125, + "loss": 0.6145, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.0786540508270264, + "rewards/margins": 0.2842092216014862, + "rewards/rejected": -1.362863302230835, + "step": 3430 + }, + { + "epoch": 0.9, + "grad_norm": 3.8125, + "learning_rate": 1.499880968037165e-07, + "logits/chosen": -2.7056689262390137, + "logits/rejected": -2.6712379455566406, + "logps/chosen": -483.80413818359375, + "logps/rejected": -458.0314025878906, + "loss": 0.5931, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.0653079748153687, + "rewards/margins": 0.31929486989974976, + "rewards/rejected": -1.3846029043197632, + "step": 3440 + }, + { + "epoch": 0.9, + "grad_norm": 3.9375, + "learning_rate": 1.4229261585852805e-07, + "logits/chosen": -2.697467803955078, + "logits/rejected": -2.6769003868103027, + "logps/chosen": -504.36572265625, + "logps/rejected": -491.1905212402344, + "loss": 0.5925, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.0281347036361694, + "rewards/margins": 0.3227378726005554, + "rewards/rejected": -1.3508726358413696, + "step": 3450 + }, + { + "epoch": 0.91, + "grad_norm": 3.4375, + "learning_rate": 1.3479400280141886e-07, + "logits/chosen": -2.667304039001465, + "logits/rejected": -2.652622938156128, + "logps/chosen": -477.559326171875, + "logps/rejected": -501.595703125, + "loss": 0.5803, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.0702049732208252, + "rewards/margins": 0.3717937469482422, + "rewards/rejected": -1.4419987201690674, + "step": 3460 + }, + { + "epoch": 0.91, + "grad_norm": 3.828125, + "learning_rate": 1.2749288376442044e-07, + "logits/chosen": -2.677260398864746, + "logits/rejected": -2.619901180267334, + "logps/chosen": -533.4788818359375, + "logps/rejected": -490.34552001953125, + "loss": 0.5897, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.0250308513641357, + "rewards/margins": 0.38970544934272766, + "rewards/rejected": -1.4147361516952515, + "step": 3470 + }, + { + "epoch": 0.91, + "grad_norm": 3.640625, + "learning_rate": 1.203898683888713e-07, + "logits/chosen": -2.7128968238830566, + "logits/rejected": -2.679882764816284, + "logps/chosen": -489.0921325683594, + "logps/rejected": -481.76470947265625, + "loss": 0.6548, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.1597046852111816, + "rewards/margins": 0.2149733006954193, + "rewards/rejected": -1.3746780157089233, + "step": 3480 + }, + { + "epoch": 0.91, + "grad_norm": 2.953125, + "learning_rate": 1.1348554977451132e-07, + "logits/chosen": -2.703012466430664, + "logits/rejected": -2.672036647796631, + "logps/chosen": -522.70849609375, + "logps/rejected": -502.2613830566406, + "loss": 0.5897, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.0457698106765747, + "rewards/margins": 0.35509949922561646, + "rewards/rejected": -1.4008692502975464, + "step": 3490 + }, + { + "epoch": 0.92, + "grad_norm": 2.96875, + "learning_rate": 1.0678050442995802e-07, + "logits/chosen": -2.6770823001861572, + "logits/rejected": -2.624206066131592, + "logps/chosen": -506.50311279296875, + "logps/rejected": -476.0753479003906, + "loss": 0.5979, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.052063226699829, + "rewards/margins": 0.32971978187561035, + "rewards/rejected": -1.381783127784729, + "step": 3500 + }, + { + "epoch": 0.92, + "eval_logits/chosen": -2.6460154056549072, + "eval_logits/rejected": -2.613610029220581, + "eval_logps/chosen": -505.44384765625, + "eval_logps/rejected": -491.3070068359375, + "eval_loss": 0.6085324883460999, + "eval_rewards/accuracies": 0.659500002861023, + "eval_rewards/chosen": -1.0921905040740967, + "eval_rewards/margins": 0.30404436588287354, + "eval_rewards/rejected": -1.3962348699569702, + "eval_runtime": 347.657, + "eval_samples_per_second": 5.753, + "eval_steps_per_second": 0.719, + "step": 3500 + }, + { + "epoch": 0.92, + "grad_norm": 3.765625, + "learning_rate": 1.0027529222456755e-07, + "logits/chosen": -2.6578614711761475, + "logits/rejected": -2.6162824630737305, + "logps/chosen": -483.9580993652344, + "logps/rejected": -489.7320251464844, + "loss": 0.5651, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.0210082530975342, + "rewards/margins": 0.37885022163391113, + "rewards/rejected": -1.3998584747314453, + "step": 3510 + }, + { + "epoch": 0.92, + "grad_norm": 3.59375, + "learning_rate": 9.397045634168766e-08, + "logits/chosen": -2.7051877975463867, + "logits/rejected": -2.6907284259796143, + "logps/chosen": -506.522216796875, + "logps/rejected": -518.1729736328125, + "loss": 0.586, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9934719204902649, + "rewards/margins": 0.3790653347969055, + "rewards/rejected": -1.3725372552871704, + "step": 3520 + }, + { + "epoch": 0.92, + "grad_norm": 4.03125, + "learning_rate": 8.78665232332998e-08, + "logits/chosen": -2.62373948097229, + "logits/rejected": -2.6049044132232666, + "logps/chosen": -467.76312255859375, + "logps/rejected": -479.63446044921875, + "loss": 0.6085, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.143489122390747, + "rewards/margins": 0.27841717004776, + "rewards/rejected": -1.4219063520431519, + "step": 3530 + }, + { + "epoch": 0.93, + "grad_norm": 4.625, + "learning_rate": 8.196400257606208e-08, + "logits/chosen": -2.706756114959717, + "logits/rejected": -2.6707985401153564, + "logps/chosen": -539.0672607421875, + "logps/rejected": -535.8557739257812, + "loss": 0.609, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.0309526920318604, + "rewards/margins": 0.33775442838668823, + "rewards/rejected": -1.3687069416046143, + "step": 3540 + }, + { + "epoch": 0.93, + "grad_norm": 3.59375, + "learning_rate": 7.626338722875076e-08, + "logits/chosen": -2.6710562705993652, + "logits/rejected": -2.6908836364746094, + "logps/chosen": -492.14251708984375, + "logps/rejected": -494.04534912109375, + "loss": 0.6436, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.0950779914855957, + "rewards/margins": 0.22633162140846252, + "rewards/rejected": -1.3214095830917358, + "step": 3550 + }, + { + "epoch": 0.93, + "grad_norm": 3.453125, + "learning_rate": 7.076515319110688e-08, + "logits/chosen": -2.6932473182678223, + "logits/rejected": -2.676130771636963, + "logps/chosen": -497.710693359375, + "logps/rejected": -469.8008728027344, + "loss": 0.5853, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.0803611278533936, + "rewards/margins": 0.3779391050338745, + "rewards/rejected": -1.458300232887268, + "step": 3560 + }, + { + "epoch": 0.93, + "grad_norm": 5.1875, + "learning_rate": 6.54697595640899e-08, + "logits/chosen": -2.695744752883911, + "logits/rejected": -2.657261610031128, + "logps/chosen": -534.2222900390625, + "logps/rejected": -514.4498291015625, + "loss": 0.6, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.9726902842521667, + "rewards/margins": 0.33556845784187317, + "rewards/rejected": -1.3082587718963623, + "step": 3570 + }, + { + "epoch": 0.94, + "grad_norm": 3.3125, + "learning_rate": 6.037764851154426e-08, + "logits/chosen": -2.645259380340576, + "logits/rejected": -2.626682758331299, + "logps/chosen": -505.6795959472656, + "logps/rejected": -522.542724609375, + "loss": 0.5892, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.0380734205245972, + "rewards/margins": 0.3490239083766937, + "rewards/rejected": -1.3870973587036133, + "step": 3580 + }, + { + "epoch": 0.94, + "grad_norm": 4.3125, + "learning_rate": 5.548924522327748e-08, + "logits/chosen": -2.6519410610198975, + "logits/rejected": -2.6350717544555664, + "logps/chosen": -507.71734619140625, + "logps/rejected": -494.5531311035156, + "loss": 0.6209, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.1069073677062988, + "rewards/margins": 0.278178870677948, + "rewards/rejected": -1.3850862979888916, + "step": 3590 + }, + { + "epoch": 0.94, + "grad_norm": 4.375, + "learning_rate": 5.0804957879556915e-08, + "logits/chosen": -2.623077869415283, + "logits/rejected": -2.615199565887451, + "logps/chosen": -456.0958557128906, + "logps/rejected": -476.017578125, + "loss": 0.6154, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.1336462497711182, + "rewards/margins": 0.26231056451797485, + "rewards/rejected": -1.3959566354751587, + "step": 3600 + }, + { + "epoch": 0.94, + "eval_logits/chosen": -2.639690399169922, + "eval_logits/rejected": -2.6065900325775146, + "eval_logps/chosen": -505.278076171875, + "eval_logps/rejected": -491.1413269042969, + "eval_loss": 0.6085542440414429, + "eval_rewards/accuracies": 0.659500002861023, + "eval_rewards/chosen": -1.0905324220657349, + "eval_rewards/margins": 0.30404558777809143, + "eval_rewards/rejected": -1.394577980041504, + "eval_runtime": 347.9164, + "eval_samples_per_second": 5.749, + "eval_steps_per_second": 0.719, + "step": 3600 + }, + { + "epoch": 0.94, + "grad_norm": 3.140625, + "learning_rate": 4.632517761702815e-08, + "logits/chosen": -2.6383109092712402, + "logits/rejected": -2.6063365936279297, + "logps/chosen": -466.67974853515625, + "logps/rejected": -459.43292236328125, + "loss": 0.6194, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.0989251136779785, + "rewards/margins": 0.2716377377510071, + "rewards/rejected": -1.3705627918243408, + "step": 3610 + }, + { + "epoch": 0.95, + "grad_norm": 4.53125, + "learning_rate": 4.205027849605359e-08, + "logits/chosen": -2.677117347717285, + "logits/rejected": -2.641345500946045, + "logps/chosen": -486.0668029785156, + "logps/rejected": -461.42364501953125, + "loss": 0.6305, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.1190522909164429, + "rewards/margins": 0.26842787861824036, + "rewards/rejected": -1.3874801397323608, + "step": 3620 + }, + { + "epoch": 0.95, + "grad_norm": 3.671875, + "learning_rate": 3.798061746947995e-08, + "logits/chosen": -2.729708433151245, + "logits/rejected": -2.699693202972412, + "logps/chosen": -496.13580322265625, + "logps/rejected": -466.28607177734375, + "loss": 0.6133, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.098503589630127, + "rewards/margins": 0.29642829298973083, + "rewards/rejected": -1.3949320316314697, + "step": 3630 + }, + { + "epoch": 0.95, + "grad_norm": 3.984375, + "learning_rate": 3.411653435283158e-08, + "logits/chosen": -2.66868257522583, + "logits/rejected": -2.611743450164795, + "logps/chosen": -512.9805908203125, + "logps/rejected": -457.05517578125, + "loss": 0.6083, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.0667917728424072, + "rewards/margins": 0.3067484498023987, + "rewards/rejected": -1.3735402822494507, + "step": 3640 + }, + { + "epoch": 0.96, + "grad_norm": 3.171875, + "learning_rate": 3.04583517959367e-08, + "logits/chosen": -2.7076990604400635, + "logits/rejected": -2.6731173992156982, + "logps/chosen": -483.63604736328125, + "logps/rejected": -467.524658203125, + "loss": 0.5848, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.0306214094161987, + "rewards/margins": 0.3499363660812378, + "rewards/rejected": -1.3805577754974365, + "step": 3650 + }, + { + "epoch": 0.96, + "grad_norm": 3.890625, + "learning_rate": 2.7006375255985984e-08, + "logits/chosen": -2.664285182952881, + "logits/rejected": -2.6678249835968018, + "logps/chosen": -514.0250244140625, + "logps/rejected": -520.1316528320312, + "loss": 0.6323, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.1207656860351562, + "rewards/margins": 0.2250840663909912, + "rewards/rejected": -1.3458497524261475, + "step": 3660 + }, + { + "epoch": 0.96, + "grad_norm": 4.4375, + "learning_rate": 2.3760892972027328e-08, + "logits/chosen": -2.7375500202178955, + "logits/rejected": -2.689077377319336, + "logps/chosen": -519.54150390625, + "logps/rejected": -497.95574951171875, + "loss": 0.6163, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.1362625360488892, + "rewards/margins": 0.3190504312515259, + "rewards/rejected": -1.455312967300415, + "step": 3670 + }, + { + "epoch": 0.96, + "grad_norm": 6.71875, + "learning_rate": 2.072217594089765e-08, + "logits/chosen": -2.6551547050476074, + "logits/rejected": -2.6728549003601074, + "logps/chosen": -494.8778381347656, + "logps/rejected": -511.70086669921875, + "loss": 0.5927, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.0726226568222046, + "rewards/margins": 0.3504863679409027, + "rewards/rejected": -1.4231090545654297, + "step": 3680 + }, + { + "epoch": 0.97, + "grad_norm": 3.5, + "learning_rate": 1.789047789459375e-08, + "logits/chosen": -2.6753056049346924, + "logits/rejected": -2.628095865249634, + "logps/chosen": -563.8702392578125, + "logps/rejected": -515.8682861328125, + "loss": 0.5758, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9584245681762695, + "rewards/margins": 0.3854634165763855, + "rewards/rejected": -1.3438880443572998, + "step": 3690 + }, + { + "epoch": 0.97, + "grad_norm": 5.125, + "learning_rate": 1.5266035279088708e-08, + "logits/chosen": -2.586505174636841, + "logits/rejected": -2.5706770420074463, + "logps/chosen": -538.0152587890625, + "logps/rejected": -521.6544189453125, + "loss": 0.6053, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.0640016794204712, + "rewards/margins": 0.30589136481285095, + "rewards/rejected": -1.369892954826355, + "step": 3700 + }, + { + "epoch": 0.97, + "eval_logits/chosen": -2.6422648429870605, + "eval_logits/rejected": -2.609360694885254, + "eval_logps/chosen": -505.29425048828125, + "eval_logps/rejected": -491.1404724121094, + "eval_loss": 0.6086028218269348, + "eval_rewards/accuracies": 0.6549999713897705, + "eval_rewards/chosen": -1.0906946659088135, + "eval_rewards/margins": 0.30387499928474426, + "eval_rewards/rejected": -1.394569754600525, + "eval_runtime": 347.9396, + "eval_samples_per_second": 5.748, + "eval_steps_per_second": 0.719, + "step": 3700 + }, + { + "epoch": 0.97, + "grad_norm": 5.1875, + "learning_rate": 1.2849067234584623e-08, + "logits/chosen": -2.6033217906951904, + "logits/rejected": -2.595217227935791, + "logps/chosen": -475.32647705078125, + "logps/rejected": -481.47039794921875, + "loss": 0.6223, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.0801260471343994, + "rewards/margins": 0.30023181438446045, + "rewards/rejected": -1.3803579807281494, + "step": 3710 + }, + { + "epoch": 0.97, + "grad_norm": 3.59375, + "learning_rate": 1.0639775577218625e-08, + "logits/chosen": -2.6388094425201416, + "logits/rejected": -2.5794878005981445, + "logps/chosen": -490.6612854003906, + "logps/rejected": -458.67999267578125, + "loss": 0.593, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.1297202110290527, + "rewards/margins": 0.3556092083454132, + "rewards/rejected": -1.4853293895721436, + "step": 3720 + }, + { + "epoch": 0.98, + "grad_norm": 3.65625, + "learning_rate": 8.638344782207486e-09, + "logits/chosen": -2.6429569721221924, + "logits/rejected": -2.6291909217834473, + "logps/chosen": -483.68841552734375, + "logps/rejected": -469.73638916015625, + "loss": 0.6087, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.080833077430725, + "rewards/margins": 0.29357820749282837, + "rewards/rejected": -1.3744113445281982, + "step": 3730 + }, + { + "epoch": 0.98, + "grad_norm": 3.015625, + "learning_rate": 6.84494196844715e-09, + "logits/chosen": -2.6733694076538086, + "logits/rejected": -2.645113468170166, + "logps/chosen": -516.4934692382812, + "logps/rejected": -511.3959045410156, + "loss": 0.5696, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.0565123558044434, + "rewards/margins": 0.41344934701919556, + "rewards/rejected": -1.4699615240097046, + "step": 3740 + }, + { + "epoch": 0.98, + "grad_norm": 2.765625, + "learning_rate": 5.259716884556121e-09, + "logits/chosen": -2.6986021995544434, + "logits/rejected": -2.662108898162842, + "logps/chosen": -497.4439392089844, + "logps/rejected": -491.6142578125, + "loss": 0.5763, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.0045363903045654, + "rewards/margins": 0.381572961807251, + "rewards/rejected": -1.3861093521118164, + "step": 3750 + }, + { + "epoch": 0.98, + "grad_norm": 3.234375, + "learning_rate": 3.882801896372967e-09, + "logits/chosen": -2.6970138549804688, + "logits/rejected": -2.6791810989379883, + "logps/chosen": -501.97576904296875, + "logps/rejected": -475.7244567871094, + "loss": 0.6286, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.0755218267440796, + "rewards/margins": 0.2614571452140808, + "rewards/rejected": -1.3369790315628052, + "step": 3760 + }, + { + "epoch": 0.99, + "grad_norm": 3.453125, + "learning_rate": 2.7143119759026614e-09, + "logits/chosen": -2.6607155799865723, + "logits/rejected": -2.617769241333008, + "logps/chosen": -522.4849853515625, + "logps/rejected": -511.22186279296875, + "loss": 0.5748, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.0117288827896118, + "rewards/margins": 0.3699984848499298, + "rewards/rejected": -1.3817272186279297, + "step": 3770 + }, + { + "epoch": 0.99, + "grad_norm": 4.21875, + "learning_rate": 1.754344691717591e-09, + "logits/chosen": -2.6623787879943848, + "logits/rejected": -2.649315357208252, + "logps/chosen": -489.23443603515625, + "logps/rejected": -522.0306396484375, + "loss": 0.6375, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.0721361637115479, + "rewards/margins": 0.22307145595550537, + "rewards/rejected": -1.2952076196670532, + "step": 3780 + }, + { + "epoch": 0.99, + "grad_norm": 3.84375, + "learning_rate": 1.0029802008096335e-09, + "logits/chosen": -2.676607608795166, + "logits/rejected": -2.614905595779419, + "logps/chosen": -527.0256958007812, + "logps/rejected": -511.72943115234375, + "loss": 0.5925, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.1001697778701782, + "rewards/margins": 0.33944058418273926, + "rewards/rejected": -1.4396103620529175, + "step": 3790 + }, + { + "epoch": 0.99, + "grad_norm": 3.40625, + "learning_rate": 4.602812418974534e-10, + "logits/chosen": -2.7049756050109863, + "logits/rejected": -2.668239116668701, + "logps/chosen": -520.1328125, + "logps/rejected": -504.3516540527344, + "loss": 0.602, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.012479305267334, + "rewards/margins": 0.34507402777671814, + "rewards/rejected": -1.357553243637085, + "step": 3800 + }, + { + "epoch": 0.99, + "eval_logits/chosen": -2.642503023147583, + "eval_logits/rejected": -2.609644889831543, + "eval_logps/chosen": -504.9806823730469, + "eval_logps/rejected": -490.8211364746094, + "eval_loss": 0.6085299253463745, + "eval_rewards/accuracies": 0.6579999923706055, + "eval_rewards/chosen": -1.0875595808029175, + "eval_rewards/margins": 0.3038162589073181, + "eval_rewards/rejected": -1.3913757801055908, + "eval_runtime": 347.9755, + "eval_samples_per_second": 5.748, + "eval_steps_per_second": 0.718, + "step": 3800 + }, + { + "epoch": 1.0, + "grad_norm": 4.15625, + "learning_rate": 1.2629313018819312e-10, + "logits/chosen": -2.6654155254364014, + "logits/rejected": -2.625561475753784, + "logps/chosen": -482.1392517089844, + "logps/rejected": -486.5087890625, + "loss": 0.5962, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.0852601528167725, + "rewards/margins": 0.3787585496902466, + "rewards/rejected": -1.464018702507019, + "step": 3810 + }, + { + "epoch": 1.0, + "grad_norm": 3.78125, + "learning_rate": 1.0437535929996855e-12, + "logits/chosen": -2.6726207733154297, + "logits/rejected": -2.636209487915039, + "logps/chosen": -527.1629638671875, + "logps/rejected": -491.42041015625, + "loss": 0.5974, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.100174069404602, + "rewards/margins": 0.3280791640281677, + "rewards/rejected": -1.428253173828125, + "step": 3820 + }, + { + "epoch": 1.0, + "step": 3821, + "total_flos": 0.0, + "train_loss": 0.6288731582999011, + "train_runtime": 37165.2285, + "train_samples_per_second": 1.645, + "train_steps_per_second": 0.103 + } + ], + "logging_steps": 10, + "max_steps": 3821, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}