{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 3821, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 0.9140625, "learning_rate": 1.3054830287206268e-08, "logits/chosen": -3.0349411964416504, "logits/rejected": -2.9776864051818848, "logps/chosen": -456.54913330078125, "logps/rejected": -495.31854248046875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0, "grad_norm": 0.8984375, "learning_rate": 1.3054830287206266e-07, "logits/chosen": -2.916372776031494, "logits/rejected": -2.8596677780151367, "logps/chosen": -410.91595458984375, "logps/rejected": -357.0899353027344, "loss": 0.6933, "rewards/accuracies": 0.4513888955116272, "rewards/chosen": -0.00023399748897645622, "rewards/margins": -0.0003936050634365529, "rewards/rejected": 0.0001596075453562662, "step": 10 }, { "epoch": 0.01, "grad_norm": 1.03125, "learning_rate": 2.610966057441253e-07, "logits/chosen": -2.9189207553863525, "logits/rejected": -2.8314878940582275, "logps/chosen": -452.43377685546875, "logps/rejected": -340.9873046875, "loss": 0.6934, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.0003274443151894957, "rewards/margins": -0.0004972027963958681, "rewards/rejected": 0.00016975855396594852, "step": 20 }, { "epoch": 0.01, "grad_norm": 0.9921875, "learning_rate": 3.9164490861618804e-07, "logits/chosen": -2.922006607055664, "logits/rejected": -2.8864428997039795, "logps/chosen": -397.01446533203125, "logps/rejected": -380.24017333984375, "loss": 0.6932, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": 0.00024825072614476085, "rewards/margins": -0.00015650910791009665, "rewards/rejected": 0.00040475986315868795, "step": 30 }, { "epoch": 0.01, "grad_norm": 1.1171875, "learning_rate": 5.221932114882506e-07, "logits/chosen": -2.840169906616211, "logits/rejected": -2.8260841369628906, "logps/chosen": -335.4841613769531, "logps/rejected": -324.1778259277344, "loss": 0.6928, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.0013349488144740462, "rewards/margins": 0.0006949803791940212, "rewards/rejected": 0.000639968435280025, "step": 40 }, { "epoch": 0.01, "grad_norm": 0.875, "learning_rate": 6.527415143603135e-07, "logits/chosen": -2.9089934825897217, "logits/rejected": -2.8778884410858154, "logps/chosen": -403.4610900878906, "logps/rejected": -344.0279846191406, "loss": 0.693, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.001873858505859971, "rewards/margins": 0.00040313409408554435, "rewards/rejected": 0.0014707243535667658, "step": 50 }, { "epoch": 0.02, "grad_norm": 0.83203125, "learning_rate": 7.832898172323761e-07, "logits/chosen": -2.842329502105713, "logits/rejected": -2.808168888092041, "logps/chosen": -382.2471923828125, "logps/rejected": -331.0003662109375, "loss": 0.693, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 0.0033431951887905598, "rewards/margins": 0.00034264856367371976, "rewards/rejected": 0.0030005467124283314, "step": 60 }, { "epoch": 0.02, "grad_norm": 1.4765625, "learning_rate": 9.138381201044387e-07, "logits/chosen": -2.939213991165161, "logits/rejected": -2.8729026317596436, "logps/chosen": -406.10760498046875, "logps/rejected": -353.65576171875, "loss": 0.6935, "rewards/accuracies": 0.5, "rewards/chosen": 0.003925986122339964, "rewards/margins": -0.0006284945411607623, "rewards/rejected": 0.004554481245577335, "step": 70 }, { "epoch": 0.02, "grad_norm": 1.4453125, "learning_rate": 1.0443864229765013e-06, "logits/chosen": -2.8512609004974365, "logits/rejected": -2.8039116859436035, "logps/chosen": -399.90130615234375, "logps/rejected": -359.86932373046875, "loss": 0.6924, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.005817199591547251, "rewards/margins": 0.0015620887279510498, "rewards/rejected": 0.004255110863596201, "step": 80 }, { "epoch": 0.02, "grad_norm": 0.97265625, "learning_rate": 1.1749347258485642e-06, "logits/chosen": -2.8811564445495605, "logits/rejected": -2.8479971885681152, "logps/chosen": -381.4345703125, "logps/rejected": -349.1830139160156, "loss": 0.692, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.009745048359036446, "rewards/margins": 0.0022289410699158907, "rewards/rejected": 0.007516107521951199, "step": 90 }, { "epoch": 0.03, "grad_norm": 0.90234375, "learning_rate": 1.305483028720627e-06, "logits/chosen": -2.8783085346221924, "logits/rejected": -2.8482470512390137, "logps/chosen": -379.60736083984375, "logps/rejected": -338.56134033203125, "loss": 0.6921, "rewards/accuracies": 0.53125, "rewards/chosen": 0.01279132068157196, "rewards/margins": 0.002047107554972172, "rewards/rejected": 0.010744214989244938, "step": 100 }, { "epoch": 0.03, "eval_logits/chosen": -2.836320638656616, "eval_logits/rejected": -2.784111261367798, "eval_logps/chosen": -394.6286315917969, "eval_logps/rejected": -350.2682800292969, "eval_loss": 0.6922685503959656, "eval_rewards/accuracies": 0.5644999742507935, "eval_rewards/chosen": 0.015961581841111183, "eval_rewards/margins": 0.0018089638324454427, "eval_rewards/rejected": 0.014152619056403637, "eval_runtime": 347.8719, "eval_samples_per_second": 5.749, "eval_steps_per_second": 0.719, "step": 100 }, { "epoch": 0.03, "grad_norm": 0.8359375, "learning_rate": 1.4360313315926894e-06, "logits/chosen": -2.9356420040130615, "logits/rejected": -2.863084554672241, "logps/chosen": -409.4505310058594, "logps/rejected": -340.94085693359375, "loss": 0.6926, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.015902375802397728, "rewards/margins": 0.0012320507084950805, "rewards/rejected": 0.014670324511826038, "step": 110 }, { "epoch": 0.03, "grad_norm": 0.9140625, "learning_rate": 1.5665796344647521e-06, "logits/chosen": -2.9147744178771973, "logits/rejected": -2.8579554557800293, "logps/chosen": -428.4613342285156, "logps/rejected": -384.51239013671875, "loss": 0.6912, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 0.019384171813726425, "rewards/margins": 0.004034861922264099, "rewards/rejected": 0.0153493108227849, "step": 120 }, { "epoch": 0.03, "grad_norm": 1.0390625, "learning_rate": 1.6971279373368146e-06, "logits/chosen": -2.9081578254699707, "logits/rejected": -2.849456787109375, "logps/chosen": -383.9152526855469, "logps/rejected": -361.4400329589844, "loss": 0.6923, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.014263955876231194, "rewards/margins": 0.0018312319880351424, "rewards/rejected": 0.012432724237442017, "step": 130 }, { "epoch": 0.04, "grad_norm": 0.984375, "learning_rate": 1.8276762402088774e-06, "logits/chosen": -2.9242119789123535, "logits/rejected": -2.800161600112915, "logps/chosen": -417.3861389160156, "logps/rejected": -330.3959045410156, "loss": 0.6914, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.016321910545229912, "rewards/margins": 0.0035601272247731686, "rewards/rejected": 0.01276178378611803, "step": 140 }, { "epoch": 0.04, "grad_norm": 1.6875, "learning_rate": 1.9582245430809403e-06, "logits/chosen": -2.943554639816284, "logits/rejected": -2.8792669773101807, "logps/chosen": -413.8592224121094, "logps/rejected": -343.1801452636719, "loss": 0.6911, "rewards/accuracies": 0.5625, "rewards/chosen": 0.018247485160827637, "rewards/margins": 0.0042217145673930645, "rewards/rejected": 0.014025771990418434, "step": 150 }, { "epoch": 0.04, "grad_norm": 0.96484375, "learning_rate": 2.0887728459530026e-06, "logits/chosen": -2.8945369720458984, "logits/rejected": -2.8573861122131348, "logps/chosen": -378.51092529296875, "logps/rejected": -363.04132080078125, "loss": 0.6911, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.019316475838422775, "rewards/margins": 0.004308086819946766, "rewards/rejected": 0.01500838715583086, "step": 160 }, { "epoch": 0.04, "grad_norm": 1.203125, "learning_rate": 2.2193211488250653e-06, "logits/chosen": -2.8808963298797607, "logits/rejected": -2.8329708576202393, "logps/chosen": -337.3949279785156, "logps/rejected": -312.98773193359375, "loss": 0.6898, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.022478580474853516, "rewards/margins": 0.006769159343093634, "rewards/rejected": 0.015709420666098595, "step": 170 }, { "epoch": 0.05, "grad_norm": 0.75, "learning_rate": 2.3498694516971284e-06, "logits/chosen": -2.8937134742736816, "logits/rejected": -2.837139129638672, "logps/chosen": -379.23760986328125, "logps/rejected": -340.9472351074219, "loss": 0.6901, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.02863329090178013, "rewards/margins": 0.006192624568939209, "rewards/rejected": 0.02244066260755062, "step": 180 }, { "epoch": 0.05, "grad_norm": 1.140625, "learning_rate": 2.4804177545691907e-06, "logits/chosen": -2.944746494293213, "logits/rejected": -2.892685651779175, "logps/chosen": -402.75653076171875, "logps/rejected": -342.20172119140625, "loss": 0.6916, "rewards/accuracies": 0.59375, "rewards/chosen": 0.03445928543806076, "rewards/margins": 0.003415898187085986, "rewards/rejected": 0.031043391674757004, "step": 190 }, { "epoch": 0.05, "grad_norm": 1.546875, "learning_rate": 2.610966057441254e-06, "logits/chosen": -2.8978257179260254, "logits/rejected": -2.837639570236206, "logps/chosen": -374.3539733886719, "logps/rejected": -313.32257080078125, "loss": 0.6894, "rewards/accuracies": 0.59375, "rewards/chosen": 0.040689874440431595, "rewards/margins": 0.008053514175117016, "rewards/rejected": 0.032636359333992004, "step": 200 }, { "epoch": 0.05, "eval_logits/chosen": -2.8333005905151367, "eval_logits/rejected": -2.781090021133423, "eval_logps/chosen": -391.8948669433594, "eval_logps/rejected": -348.1495361328125, "eval_loss": 0.6894406080245972, "eval_rewards/accuracies": 0.5920000076293945, "eval_rewards/chosen": 0.04329930990934372, "eval_rewards/margins": 0.007959411479532719, "eval_rewards/rejected": 0.03533989191055298, "eval_runtime": 347.8469, "eval_samples_per_second": 5.75, "eval_steps_per_second": 0.719, "step": 200 }, { "epoch": 0.05, "grad_norm": 0.89453125, "learning_rate": 2.741514360313316e-06, "logits/chosen": -2.9700427055358887, "logits/rejected": -2.8941903114318848, "logps/chosen": -390.8605041503906, "logps/rejected": -332.96624755859375, "loss": 0.6898, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.04209558665752411, "rewards/margins": 0.0070165605284273624, "rewards/rejected": 0.035079024732112885, "step": 210 }, { "epoch": 0.06, "grad_norm": 0.859375, "learning_rate": 2.872062663185379e-06, "logits/chosen": -2.862724542617798, "logits/rejected": -2.8397350311279297, "logps/chosen": -380.3348083496094, "logps/rejected": -332.48809814453125, "loss": 0.6896, "rewards/accuracies": 0.625, "rewards/chosen": 0.04277118295431137, "rewards/margins": 0.007822849787771702, "rewards/rejected": 0.034948334097862244, "step": 220 }, { "epoch": 0.06, "grad_norm": 0.9765625, "learning_rate": 3.0026109660574416e-06, "logits/chosen": -2.980745792388916, "logits/rejected": -2.915937900543213, "logps/chosen": -444.210205078125, "logps/rejected": -392.0636291503906, "loss": 0.6886, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.04879799112677574, "rewards/margins": 0.009785487316548824, "rewards/rejected": 0.039012499153614044, "step": 230 }, { "epoch": 0.06, "grad_norm": 0.9453125, "learning_rate": 3.1331592689295043e-06, "logits/chosen": -2.897602081298828, "logits/rejected": -2.86655592918396, "logps/chosen": -422.80157470703125, "logps/rejected": -391.47808837890625, "loss": 0.6864, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.05084707587957382, "rewards/margins": 0.014665389433503151, "rewards/rejected": 0.03618168458342552, "step": 240 }, { "epoch": 0.07, "grad_norm": 0.8671875, "learning_rate": 3.263707571801567e-06, "logits/chosen": -2.885866165161133, "logits/rejected": -2.8721957206726074, "logps/chosen": -390.76702880859375, "logps/rejected": -342.5933837890625, "loss": 0.6892, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.05445907637476921, "rewards/margins": 0.009773282334208488, "rewards/rejected": 0.044685788452625275, "step": 250 }, { "epoch": 0.07, "grad_norm": 1.0, "learning_rate": 3.3942558746736293e-06, "logits/chosen": -2.9248204231262207, "logits/rejected": -2.853421449661255, "logps/chosen": -404.9168701171875, "logps/rejected": -348.4195861816406, "loss": 0.686, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.0569482147693634, "rewards/margins": 0.0154123455286026, "rewards/rejected": 0.041535865515470505, "step": 260 }, { "epoch": 0.07, "grad_norm": 0.68359375, "learning_rate": 3.524804177545692e-06, "logits/chosen": -2.888120174407959, "logits/rejected": -2.8404757976531982, "logps/chosen": -384.75872802734375, "logps/rejected": -330.0838928222656, "loss": 0.6859, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.05628432705998421, "rewards/margins": 0.016307855024933815, "rewards/rejected": 0.039976466447114944, "step": 270 }, { "epoch": 0.07, "grad_norm": 0.875, "learning_rate": 3.6553524804177547e-06, "logits/chosen": -2.906266212463379, "logits/rejected": -2.832149028778076, "logps/chosen": -387.818115234375, "logps/rejected": -337.62017822265625, "loss": 0.6871, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.060541294515132904, "rewards/margins": 0.013487743213772774, "rewards/rejected": 0.04705354943871498, "step": 280 }, { "epoch": 0.08, "grad_norm": 0.94921875, "learning_rate": 3.7859007832898174e-06, "logits/chosen": -2.888960361480713, "logits/rejected": -2.847351551055908, "logps/chosen": -411.20904541015625, "logps/rejected": -362.61474609375, "loss": 0.6837, "rewards/accuracies": 0.625, "rewards/chosen": 0.07661953568458557, "rewards/margins": 0.022713415324687958, "rewards/rejected": 0.053906120359897614, "step": 290 }, { "epoch": 0.08, "grad_norm": 0.92578125, "learning_rate": 3.9164490861618806e-06, "logits/chosen": -2.868332862854004, "logits/rejected": -2.7791736125946045, "logps/chosen": -369.7691345214844, "logps/rejected": -329.0905456542969, "loss": 0.6815, "rewards/accuracies": 0.625, "rewards/chosen": 0.07754194736480713, "rewards/margins": 0.024631675332784653, "rewards/rejected": 0.05291026830673218, "step": 300 }, { "epoch": 0.08, "eval_logits/chosen": -2.8349289894104004, "eval_logits/rejected": -2.783784866333008, "eval_logps/chosen": -388.1691589355469, "eval_logps/rejected": -345.5898132324219, "eval_loss": 0.6844429969787598, "eval_rewards/accuracies": 0.6025000214576721, "eval_rewards/chosen": 0.08055612444877625, "eval_rewards/margins": 0.019618848338723183, "eval_rewards/rejected": 0.06093727424740791, "eval_runtime": 347.9802, "eval_samples_per_second": 5.747, "eval_steps_per_second": 0.718, "step": 300 }, { "epoch": 0.08, "grad_norm": 1.125, "learning_rate": 4.046997389033943e-06, "logits/chosen": -2.9738924503326416, "logits/rejected": -2.891376495361328, "logps/chosen": -419.47601318359375, "logps/rejected": -330.5395202636719, "loss": 0.6852, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.08353392034769058, "rewards/margins": 0.01769311912357807, "rewards/rejected": 0.06584079563617706, "step": 310 }, { "epoch": 0.08, "grad_norm": 0.83984375, "learning_rate": 4.177545691906005e-06, "logits/chosen": -2.9381539821624756, "logits/rejected": -2.868596076965332, "logps/chosen": -382.90325927734375, "logps/rejected": -343.45361328125, "loss": 0.6849, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.07833738625049591, "rewards/margins": 0.018442410975694656, "rewards/rejected": 0.059894971549510956, "step": 320 }, { "epoch": 0.09, "grad_norm": 0.87109375, "learning_rate": 4.308093994778068e-06, "logits/chosen": -2.8898563385009766, "logits/rejected": -2.863262176513672, "logps/chosen": -379.0876159667969, "logps/rejected": -343.98748779296875, "loss": 0.6769, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.11513115465641022, "rewards/margins": 0.03892900422215462, "rewards/rejected": 0.0762021541595459, "step": 330 }, { "epoch": 0.09, "grad_norm": 0.8984375, "learning_rate": 4.4386422976501306e-06, "logits/chosen": -2.924078941345215, "logits/rejected": -2.884129762649536, "logps/chosen": -412.20428466796875, "logps/rejected": -369.37615966796875, "loss": 0.6837, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.10442419350147247, "rewards/margins": 0.021783817559480667, "rewards/rejected": 0.0826403871178627, "step": 340 }, { "epoch": 0.09, "grad_norm": 0.921875, "learning_rate": 4.569190600522193e-06, "logits/chosen": -2.8822202682495117, "logits/rejected": -2.832418441772461, "logps/chosen": -419.82122802734375, "logps/rejected": -383.4366455078125, "loss": 0.6807, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.1065782904624939, "rewards/margins": 0.02887110412120819, "rewards/rejected": 0.0777071863412857, "step": 350 }, { "epoch": 0.09, "grad_norm": 0.6328125, "learning_rate": 4.699738903394257e-06, "logits/chosen": -2.9021358489990234, "logits/rejected": -2.8665103912353516, "logps/chosen": -353.05645751953125, "logps/rejected": -322.67803955078125, "loss": 0.6916, "rewards/accuracies": 0.5625, "rewards/chosen": 0.07063518464565277, "rewards/margins": 0.0060135251842439175, "rewards/rejected": 0.06462165713310242, "step": 360 }, { "epoch": 0.1, "grad_norm": 1.109375, "learning_rate": 4.8302872062663196e-06, "logits/chosen": -2.914665699005127, "logits/rejected": -2.8427698612213135, "logps/chosen": -405.73809814453125, "logps/rejected": -331.1866149902344, "loss": 0.6722, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.09655407071113586, "rewards/margins": 0.05310269445180893, "rewards/rejected": 0.043451376259326935, "step": 370 }, { "epoch": 0.1, "grad_norm": 0.94140625, "learning_rate": 4.9608355091383814e-06, "logits/chosen": -2.900359869003296, "logits/rejected": -2.81068754196167, "logps/chosen": -427.6316833496094, "logps/rejected": -363.28839111328125, "loss": 0.6717, "rewards/accuracies": 0.59375, "rewards/chosen": 0.10045112669467926, "rewards/margins": 0.053518980741500854, "rewards/rejected": 0.046932149678468704, "step": 380 }, { "epoch": 0.1, "grad_norm": 1.3984375, "learning_rate": 4.9999488562447675e-06, "logits/chosen": -2.8918914794921875, "logits/rejected": -2.845247745513916, "logps/chosen": -412.29071044921875, "logps/rejected": -371.7367248535156, "loss": 0.6796, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.07834922522306442, "rewards/margins": 0.031100135296583176, "rewards/rejected": 0.04724908620119095, "step": 390 }, { "epoch": 0.1, "grad_norm": 0.796875, "learning_rate": 4.999698361256577e-06, "logits/chosen": -2.9121220111846924, "logits/rejected": -2.850386619567871, "logps/chosen": -381.02825927734375, "logps/rejected": -322.06927490234375, "loss": 0.6869, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.08469127863645554, "rewards/margins": 0.020099209621548653, "rewards/rejected": 0.06459207087755203, "step": 400 }, { "epoch": 0.1, "eval_logits/chosen": -2.842348098754883, "eval_logits/rejected": -2.7931265830993652, "eval_logps/chosen": -390.1521911621094, "eval_logps/rejected": -348.9979248046875, "eval_loss": 0.6788274645805359, "eval_rewards/accuracies": 0.612500011920929, "eval_rewards/chosen": 0.0607261136174202, "eval_rewards/margins": 0.0338701568543911, "eval_rewards/rejected": 0.026855960488319397, "eval_runtime": 347.7786, "eval_samples_per_second": 5.751, "eval_steps_per_second": 0.719, "step": 400 }, { "epoch": 0.11, "grad_norm": 1.1328125, "learning_rate": 4.999239142174581e-06, "logits/chosen": -2.8766117095947266, "logits/rejected": -2.8555123805999756, "logps/chosen": -361.97406005859375, "logps/rejected": -350.51287841796875, "loss": 0.6959, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.05293396860361099, "rewards/margins": 0.0049603343941271305, "rewards/rejected": 0.0479736328125, "step": 410 }, { "epoch": 0.11, "grad_norm": 1.0859375, "learning_rate": 4.99857123734344e-06, "logits/chosen": -2.8863377571105957, "logits/rejected": -2.8117470741271973, "logps/chosen": -351.9708251953125, "logps/rejected": -313.9596252441406, "loss": 0.6861, "rewards/accuracies": 0.5625, "rewards/chosen": 0.033623430877923965, "rewards/margins": 0.01676887646317482, "rewards/rejected": 0.016854556277394295, "step": 420 }, { "epoch": 0.11, "grad_norm": 1.0703125, "learning_rate": 4.997694702533016e-06, "logits/chosen": -2.8675060272216797, "logits/rejected": -2.8136143684387207, "logps/chosen": -408.8647155761719, "logps/rejected": -369.70257568359375, "loss": 0.686, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.041368208825588226, "rewards/margins": 0.02153196558356285, "rewards/rejected": 0.019836245104670525, "step": 430 }, { "epoch": 0.12, "grad_norm": 0.78515625, "learning_rate": 4.996609610933713e-06, "logits/chosen": -2.9551219940185547, "logits/rejected": -2.9198532104492188, "logps/chosen": -399.39410400390625, "logps/rejected": -351.37982177734375, "loss": 0.6821, "rewards/accuracies": 0.65625, "rewards/chosen": 0.03481561690568924, "rewards/margins": 0.02588939666748047, "rewards/rejected": 0.00892622210085392, "step": 440 }, { "epoch": 0.12, "grad_norm": 0.85546875, "learning_rate": 4.995316053150366e-06, "logits/chosen": -2.879997968673706, "logits/rejected": -2.857247829437256, "logps/chosen": -394.7357482910156, "logps/rejected": -354.9325256347656, "loss": 0.6736, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.04896073415875435, "rewards/margins": 0.04451023414731026, "rewards/rejected": 0.004450496751815081, "step": 450 }, { "epoch": 0.12, "grad_norm": 1.1796875, "learning_rate": 4.9938141371946815e-06, "logits/chosen": -2.89152455329895, "logits/rejected": -2.8554794788360596, "logps/chosen": -394.02325439453125, "logps/rejected": -363.3119201660156, "loss": 0.6753, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.05705699324607849, "rewards/margins": 0.047898683696985245, "rewards/rejected": 0.009158318862318993, "step": 460 }, { "epoch": 0.12, "grad_norm": 1.7109375, "learning_rate": 4.992103988476206e-06, "logits/chosen": -2.8964645862579346, "logits/rejected": -2.844935417175293, "logps/chosen": -369.896484375, "logps/rejected": -337.79278564453125, "loss": 0.6741, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.05254621058702469, "rewards/margins": 0.04723441228270531, "rewards/rejected": 0.005311795976012945, "step": 470 }, { "epoch": 0.13, "grad_norm": 0.94921875, "learning_rate": 4.990185749791866e-06, "logits/chosen": -2.928675413131714, "logits/rejected": -2.873516798019409, "logps/chosen": -383.92852783203125, "logps/rejected": -358.11566162109375, "loss": 0.6722, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.039998866617679596, "rewards/margins": 0.04847729206085205, "rewards/rejected": -0.008478422649204731, "step": 480 }, { "epoch": 0.13, "grad_norm": 1.078125, "learning_rate": 4.9880595813140395e-06, "logits/chosen": -2.9350996017456055, "logits/rejected": -2.8882603645324707, "logps/chosen": -420.064697265625, "logps/rejected": -359.68072509765625, "loss": 0.6711, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.030888009816408157, "rewards/margins": 0.04867198318243027, "rewards/rejected": -0.01778397336602211, "step": 490 }, { "epoch": 0.13, "grad_norm": 1.03125, "learning_rate": 4.985725660577184e-06, "logits/chosen": -2.916215658187866, "logits/rejected": -2.8541104793548584, "logps/chosen": -411.7395935058594, "logps/rejected": -343.6163330078125, "loss": 0.6744, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.018574411049485207, "rewards/margins": 0.047007013112306595, "rewards/rejected": -0.028432602062821388, "step": 500 }, { "epoch": 0.13, "eval_logits/chosen": -2.8370718955993652, "eval_logits/rejected": -2.7888970375061035, "eval_logps/chosen": -393.79833984375, "eval_logps/rejected": -354.1763610839844, "eval_loss": 0.6723790168762207, "eval_rewards/accuracies": 0.6209999918937683, "eval_rewards/chosen": 0.02426437847316265, "eval_rewards/margins": 0.04919267073273659, "eval_rewards/rejected": -0.024928290396928787, "eval_runtime": 347.9, "eval_samples_per_second": 5.749, "eval_steps_per_second": 0.719, "step": 500 }, { "epoch": 0.13, "grad_norm": 1.1171875, "learning_rate": 4.983184182463009e-06, "logits/chosen": -2.90047025680542, "logits/rejected": -2.8580737113952637, "logps/chosen": -407.69781494140625, "logps/rejected": -361.30267333984375, "loss": 0.6753, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.039911769330501556, "rewards/margins": 0.046458516269922256, "rewards/rejected": -0.0065467446111142635, "step": 510 }, { "epoch": 0.14, "grad_norm": 1.2734375, "learning_rate": 4.980435359184203e-06, "logits/chosen": -2.914069414138794, "logits/rejected": -2.8952417373657227, "logps/chosen": -400.2842712402344, "logps/rejected": -370.97039794921875, "loss": 0.6663, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.015612982213497162, "rewards/margins": 0.06524328887462616, "rewards/rejected": -0.0496302992105484, "step": 520 }, { "epoch": 0.14, "grad_norm": 1.0859375, "learning_rate": 4.9774794202667236e-06, "logits/chosen": -2.8638806343078613, "logits/rejected": -2.8652124404907227, "logps/chosen": -400.7289123535156, "logps/rejected": -399.452880859375, "loss": 0.6814, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.001105122035369277, "rewards/margins": 0.03718414530158043, "rewards/rejected": -0.03828927129507065, "step": 530 }, { "epoch": 0.14, "grad_norm": 1.125, "learning_rate": 4.974316612530615e-06, "logits/chosen": -2.869264841079712, "logits/rejected": -2.8013787269592285, "logps/chosen": -414.38800048828125, "logps/rejected": -343.013671875, "loss": 0.6495, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.005649521015584469, "rewards/margins": 0.10599911212921143, "rewards/rejected": -0.11164864152669907, "step": 540 }, { "epoch": 0.14, "grad_norm": 1.015625, "learning_rate": 4.970947200069416e-06, "logits/chosen": -2.867119073867798, "logits/rejected": -2.8448452949523926, "logps/chosen": -406.0426940917969, "logps/rejected": -376.4588623046875, "loss": 0.666, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.004353336989879608, "rewards/margins": 0.07087867707014084, "rewards/rejected": -0.06652534753084183, "step": 550 }, { "epoch": 0.15, "grad_norm": 1.078125, "learning_rate": 4.967371464228096e-06, "logits/chosen": -2.9276793003082275, "logits/rejected": -2.8994839191436768, "logps/chosen": -393.2682189941406, "logps/rejected": -381.3654479980469, "loss": 0.6694, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.01845262572169304, "rewards/margins": 0.057842254638671875, "rewards/rejected": -0.07629488408565521, "step": 560 }, { "epoch": 0.15, "grad_norm": 1.1328125, "learning_rate": 4.963589703579569e-06, "logits/chosen": -2.9889473915100098, "logits/rejected": -2.9415230751037598, "logps/chosen": -441.64227294921875, "logps/rejected": -392.90789794921875, "loss": 0.6794, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.01707800105214119, "rewards/margins": 0.04300212487578392, "rewards/rejected": -0.06008012965321541, "step": 570 }, { "epoch": 0.15, "grad_norm": 1.2421875, "learning_rate": 4.9596022338997615e-06, "logits/chosen": -2.9623188972473145, "logits/rejected": -2.8731496334075928, "logps/chosen": -446.22662353515625, "logps/rejected": -381.77752685546875, "loss": 0.656, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.03769092634320259, "rewards/margins": 0.09427281469106674, "rewards/rejected": -0.13196374475955963, "step": 580 }, { "epoch": 0.15, "grad_norm": 1.203125, "learning_rate": 4.955409388141243e-06, "logits/chosen": -2.857860565185547, "logits/rejected": -2.8171629905700684, "logps/chosen": -391.1763916015625, "logps/rejected": -357.4033203125, "loss": 0.663, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.09089629352092743, "rewards/margins": 0.07482485473155975, "rewards/rejected": -0.165721133351326, "step": 590 }, { "epoch": 0.16, "grad_norm": 2.46875, "learning_rate": 4.951011516405429e-06, "logits/chosen": -2.895017147064209, "logits/rejected": -2.8843705654144287, "logps/chosen": -395.64947509765625, "logps/rejected": -366.892822265625, "loss": 0.6679, "rewards/accuracies": 0.625, "rewards/chosen": -0.04552796110510826, "rewards/margins": 0.06777463853359222, "rewards/rejected": -0.11330260336399078, "step": 600 }, { "epoch": 0.16, "eval_logits/chosen": -2.817946672439575, "eval_logits/rejected": -2.7708613872528076, "eval_logps/chosen": -401.88262939453125, "eval_logps/rejected": -365.14019775390625, "eval_loss": 0.6624515652656555, "eval_rewards/accuracies": 0.6265000104904175, "eval_rewards/chosen": -0.05657815560698509, "eval_rewards/margins": 0.07798823714256287, "eval_rewards/rejected": -0.13456639647483826, "eval_runtime": 348.0683, "eval_samples_per_second": 5.746, "eval_steps_per_second": 0.718, "step": 600 }, { "epoch": 0.16, "grad_norm": 1.375, "learning_rate": 4.946408985913344e-06, "logits/chosen": -2.8852925300598145, "logits/rejected": -2.841823101043701, "logps/chosen": -383.7059020996094, "logps/rejected": -347.17669677734375, "loss": 0.6691, "rewards/accuracies": 0.5625, "rewards/chosen": -0.05533725023269653, "rewards/margins": 0.0660722628235817, "rewards/rejected": -0.12140952050685883, "step": 610 }, { "epoch": 0.16, "grad_norm": 1.8515625, "learning_rate": 4.941602180974958e-06, "logits/chosen": -2.8929924964904785, "logits/rejected": -2.801697254180908, "logps/chosen": -444.3966369628906, "logps/rejected": -354.19598388671875, "loss": 0.6443, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.15692010521888733, "rewards/margins": 0.11467301845550537, "rewards/rejected": -0.2715931236743927, "step": 620 }, { "epoch": 0.16, "grad_norm": 1.8671875, "learning_rate": 4.936591502957101e-06, "logits/chosen": -2.893341541290283, "logits/rejected": -2.8479011058807373, "logps/chosen": -392.2441711425781, "logps/rejected": -361.94268798828125, "loss": 0.6421, "rewards/accuracies": 0.6875, "rewards/chosen": -0.15643954277038574, "rewards/margins": 0.12407402694225311, "rewards/rejected": -0.28051358461380005, "step": 630 }, { "epoch": 0.17, "grad_norm": 1.90625, "learning_rate": 4.931377370249946e-06, "logits/chosen": -2.8857052326202393, "logits/rejected": -2.811861515045166, "logps/chosen": -418.484619140625, "logps/rejected": -368.8498840332031, "loss": 0.6506, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.21659335494041443, "rewards/margins": 0.10539694130420685, "rewards/rejected": -0.3219902813434601, "step": 640 }, { "epoch": 0.17, "grad_norm": 1.65625, "learning_rate": 4.925960218232073e-06, "logits/chosen": -2.8835432529449463, "logits/rejected": -2.842060089111328, "logps/chosen": -399.4352722167969, "logps/rejected": -387.4463806152344, "loss": 0.6506, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.17046542465686798, "rewards/margins": 0.11528462171554565, "rewards/rejected": -0.28575003147125244, "step": 650 }, { "epoch": 0.17, "grad_norm": 3.625, "learning_rate": 4.920340499234116e-06, "logits/chosen": -2.8580405712127686, "logits/rejected": -2.7751071453094482, "logps/chosen": -410.6004943847656, "logps/rejected": -364.0141296386719, "loss": 0.6449, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.15695621073246002, "rewards/margins": 0.1257764846086502, "rewards/rejected": -0.28273266553878784, "step": 660 }, { "epoch": 0.18, "grad_norm": 3.09375, "learning_rate": 4.914518682500995e-06, "logits/chosen": -2.93772292137146, "logits/rejected": -2.882884979248047, "logps/chosen": -430.3829650878906, "logps/rejected": -389.68328857421875, "loss": 0.6462, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.19247478246688843, "rewards/margins": 0.12777897715568542, "rewards/rejected": -0.32025375962257385, "step": 670 }, { "epoch": 0.18, "grad_norm": 1.6015625, "learning_rate": 4.9084952541527315e-06, "logits/chosen": -2.8791463375091553, "logits/rejected": -2.821669101715088, "logps/chosen": -436.94708251953125, "logps/rejected": -379.05267333984375, "loss": 0.6408, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.30955275893211365, "rewards/margins": 0.13841886818408966, "rewards/rejected": -0.4479715824127197, "step": 680 }, { "epoch": 0.18, "grad_norm": 2.4375, "learning_rate": 4.902270717143858e-06, "logits/chosen": -2.887767791748047, "logits/rejected": -2.8599493503570557, "logps/chosen": -401.97064208984375, "logps/rejected": -403.107421875, "loss": 0.661, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.3961626887321472, "rewards/margins": 0.1000264510512352, "rewards/rejected": -0.4961891770362854, "step": 690 }, { "epoch": 0.18, "grad_norm": 1.78125, "learning_rate": 4.895845591221427e-06, "logits/chosen": -2.8757412433624268, "logits/rejected": -2.86216139793396, "logps/chosen": -415.5210876464844, "logps/rejected": -404.1314697265625, "loss": 0.637, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.2919083535671234, "rewards/margins": 0.1432960480451584, "rewards/rejected": -0.435204416513443, "step": 700 }, { "epoch": 0.18, "eval_logits/chosen": -2.8051064014434814, "eval_logits/rejected": -2.7595512866973877, "eval_logps/chosen": -421.9038391113281, "eval_logps/rejected": -388.2210998535156, "eval_loss": 0.6554521918296814, "eval_rewards/accuracies": 0.6290000081062317, "eval_rewards/chosen": -0.256790429353714, "eval_rewards/margins": 0.10858490318059921, "eval_rewards/rejected": -0.3653753399848938, "eval_runtime": 348.1124, "eval_samples_per_second": 5.745, "eval_steps_per_second": 0.718, "step": 700 }, { "epoch": 0.19, "grad_norm": 2.625, "learning_rate": 4.8892204128816e-06, "logits/chosen": -2.8942840099334717, "logits/rejected": -2.857219934463501, "logps/chosen": -418.62322998046875, "logps/rejected": -394.4143981933594, "loss": 0.6587, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.1779194325208664, "rewards/margins": 0.0996006429195404, "rewards/rejected": -0.2775201201438904, "step": 710 }, { "epoch": 0.19, "grad_norm": 1.6328125, "learning_rate": 4.882395735324864e-06, "logits/chosen": -2.8712573051452637, "logits/rejected": -2.8021373748779297, "logps/chosen": -414.42999267578125, "logps/rejected": -377.20977783203125, "loss": 0.6622, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.1302914321422577, "rewards/margins": 0.09119856357574463, "rewards/rejected": -0.22148998081684113, "step": 720 }, { "epoch": 0.19, "grad_norm": 2.21875, "learning_rate": 4.87537212840983e-06, "logits/chosen": -2.872545003890991, "logits/rejected": -2.815669059753418, "logps/chosen": -410.472412109375, "logps/rejected": -358.6063232421875, "loss": 0.6462, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.07536014169454575, "rewards/margins": 0.11603609472513199, "rewards/rejected": -0.19139623641967773, "step": 730 }, { "epoch": 0.19, "grad_norm": 3.96875, "learning_rate": 4.8681501786056545e-06, "logits/chosen": -2.844766139984131, "logits/rejected": -2.786101818084717, "logps/chosen": -370.5934143066406, "logps/rejected": -329.69647216796875, "loss": 0.6229, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.21907174587249756, "rewards/margins": 0.19244422018527985, "rewards/rejected": -0.41151589155197144, "step": 740 }, { "epoch": 0.2, "grad_norm": 1.5, "learning_rate": 4.860730488943068e-06, "logits/chosen": -2.8021976947784424, "logits/rejected": -2.775529384613037, "logps/chosen": -399.1451110839844, "logps/rejected": -384.2425842285156, "loss": 0.653, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.397163063287735, "rewards/margins": 0.11699549108743668, "rewards/rejected": -0.5141586065292358, "step": 750 }, { "epoch": 0.2, "grad_norm": 2.109375, "learning_rate": 4.853113678964022e-06, "logits/chosen": -2.816556453704834, "logits/rejected": -2.7907676696777344, "logps/chosen": -448.0435485839844, "logps/rejected": -432.1190490722656, "loss": 0.6524, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.4579244554042816, "rewards/margins": 0.1237117201089859, "rewards/rejected": -0.5816361904144287, "step": 760 }, { "epoch": 0.2, "grad_norm": 2.71875, "learning_rate": 4.845300384669958e-06, "logits/chosen": -2.839500665664673, "logits/rejected": -2.799598455429077, "logps/chosen": -418.523193359375, "logps/rejected": -380.72637939453125, "loss": 0.6546, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.3512226939201355, "rewards/margins": 0.12142340838909149, "rewards/rejected": -0.4726460874080658, "step": 770 }, { "epoch": 0.2, "grad_norm": 1.9296875, "learning_rate": 4.837291258468701e-06, "logits/chosen": -2.8736701011657715, "logits/rejected": -2.815397262573242, "logps/chosen": -448.5498046875, "logps/rejected": -413.5223693847656, "loss": 0.6451, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.3000739514827728, "rewards/margins": 0.14568910002708435, "rewards/rejected": -0.4457630515098572, "step": 780 }, { "epoch": 0.21, "grad_norm": 2.453125, "learning_rate": 4.829086969119984e-06, "logits/chosen": -2.8422582149505615, "logits/rejected": -2.8511698246002197, "logps/chosen": -416.14312744140625, "logps/rejected": -407.68487548828125, "loss": 0.6656, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.430819571018219, "rewards/margins": 0.0904776006937027, "rewards/rejected": -0.5212971568107605, "step": 790 }, { "epoch": 0.21, "grad_norm": 2.109375, "learning_rate": 4.820688201679605e-06, "logits/chosen": -2.9085071086883545, "logits/rejected": -2.8099634647369385, "logps/chosen": -441.54296875, "logps/rejected": -363.7981262207031, "loss": 0.6166, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.40520063042640686, "rewards/margins": 0.19839458167552948, "rewards/rejected": -0.6035951972007751, "step": 800 }, { "epoch": 0.21, "eval_logits/chosen": -2.7961480617523193, "eval_logits/rejected": -2.7522692680358887, "eval_logps/chosen": -435.57562255859375, "eval_logps/rejected": -403.91156005859375, "eval_loss": 0.6488239765167236, "eval_rewards/accuracies": 0.6320000290870667, "eval_rewards/chosen": -0.39350807666778564, "eval_rewards/margins": 0.12877221405506134, "eval_rewards/rejected": -0.5222803354263306, "eval_runtime": 347.7457, "eval_samples_per_second": 5.751, "eval_steps_per_second": 0.719, "step": 800 }, { "epoch": 0.21, "grad_norm": 1.7421875, "learning_rate": 4.8120956574422315e-06, "logits/chosen": -2.8743693828582764, "logits/rejected": -2.890942096710205, "logps/chosen": -443.807373046875, "logps/rejected": -438.0557556152344, "loss": 0.6621, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.4286819100379944, "rewards/margins": 0.10433633625507355, "rewards/rejected": -0.5330182313919067, "step": 810 }, { "epoch": 0.21, "grad_norm": 2.421875, "learning_rate": 4.803310053882831e-06, "logits/chosen": -2.844068765640259, "logits/rejected": -2.8668532371520996, "logps/chosen": -392.5022277832031, "logps/rejected": -408.68621826171875, "loss": 0.6549, "rewards/accuracies": 0.59375, "rewards/chosen": -0.4813667833805084, "rewards/margins": 0.12066853046417236, "rewards/rejected": -0.6020352244377136, "step": 820 }, { "epoch": 0.22, "grad_norm": 2.375, "learning_rate": 4.794332124596775e-06, "logits/chosen": -2.8786587715148926, "logits/rejected": -2.8575031757354736, "logps/chosen": -460.0040588378906, "logps/rejected": -449.45977783203125, "loss": 0.6536, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.4967479705810547, "rewards/margins": 0.12318827956914902, "rewards/rejected": -0.6199362874031067, "step": 830 }, { "epoch": 0.22, "grad_norm": 2.671875, "learning_rate": 4.785162619238575e-06, "logits/chosen": -2.8527016639709473, "logits/rejected": -2.792952060699463, "logps/chosen": -423.39801025390625, "logps/rejected": -387.58355712890625, "loss": 0.654, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.43838852643966675, "rewards/margins": 0.12122461944818497, "rewards/rejected": -0.5596131086349487, "step": 840 }, { "epoch": 0.22, "grad_norm": 2.09375, "learning_rate": 4.775802303459288e-06, "logits/chosen": -2.8446707725524902, "logits/rejected": -2.8177928924560547, "logps/chosen": -430.2727966308594, "logps/rejected": -414.4742736816406, "loss": 0.6639, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.44889992475509644, "rewards/margins": 0.10808303207159042, "rewards/rejected": -0.5569829940795898, "step": 850 }, { "epoch": 0.23, "grad_norm": 2.734375, "learning_rate": 4.766251958842589e-06, "logits/chosen": -2.8193821907043457, "logits/rejected": -2.782836675643921, "logps/chosen": -438.9239196777344, "logps/rejected": -410.18048095703125, "loss": 0.6483, "rewards/accuracies": 0.59375, "rewards/chosen": -0.36728745698928833, "rewards/margins": 0.1387586146593094, "rewards/rejected": -0.5060460567474365, "step": 860 }, { "epoch": 0.23, "grad_norm": 3.046875, "learning_rate": 4.7565123828395066e-06, "logits/chosen": -2.7876265048980713, "logits/rejected": -2.7442731857299805, "logps/chosen": -412.59539794921875, "logps/rejected": -410.63653564453125, "loss": 0.655, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.2634727656841278, "rewards/margins": 0.13101360201835632, "rewards/rejected": -0.39448636770248413, "step": 870 }, { "epoch": 0.23, "grad_norm": 1.8828125, "learning_rate": 4.746584388701831e-06, "logits/chosen": -2.830331802368164, "logits/rejected": -2.815990924835205, "logps/chosen": -431.3417053222656, "logps/rejected": -406.23236083984375, "loss": 0.6457, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.308138370513916, "rewards/margins": 0.13623470067977905, "rewards/rejected": -0.44437307119369507, "step": 880 }, { "epoch": 0.23, "grad_norm": 2.765625, "learning_rate": 4.736468805414218e-06, "logits/chosen": -2.8040425777435303, "logits/rejected": -2.8051490783691406, "logps/chosen": -414.428466796875, "logps/rejected": -427.8641052246094, "loss": 0.6559, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.3916015923023224, "rewards/margins": 0.15443366765975952, "rewards/rejected": -0.5460351705551147, "step": 890 }, { "epoch": 0.24, "grad_norm": 2.1875, "learning_rate": 4.7261664776249595e-06, "logits/chosen": -2.7597362995147705, "logits/rejected": -2.7200279235839844, "logps/chosen": -403.2427673339844, "logps/rejected": -386.6829528808594, "loss": 0.6335, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.49025583267211914, "rewards/margins": 0.1762937605381012, "rewards/rejected": -0.666549563407898, "step": 900 }, { "epoch": 0.24, "eval_logits/chosen": -2.7764461040496826, "eval_logits/rejected": -2.7324697971343994, "eval_logps/chosen": -441.37982177734375, "eval_logps/rejected": -412.10833740234375, "eval_loss": 0.6458185911178589, "eval_rewards/accuracies": 0.6380000114440918, "eval_rewards/chosen": -0.4515506625175476, "eval_rewards/margins": 0.1526976376771927, "eval_rewards/rejected": -0.6042482256889343, "eval_runtime": 347.8684, "eval_samples_per_second": 5.749, "eval_steps_per_second": 0.719, "step": 900 }, { "epoch": 0.24, "grad_norm": 3.421875, "learning_rate": 4.715678265575463e-06, "logits/chosen": -2.8672194480895996, "logits/rejected": -2.7797975540161133, "logps/chosen": -462.484375, "logps/rejected": -388.0072326660156, "loss": 0.6518, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.3822649419307709, "rewards/margins": 0.13505886495113373, "rewards/rejected": -0.5173237919807434, "step": 910 }, { "epoch": 0.24, "grad_norm": 2.0, "learning_rate": 4.705005045028415e-06, "logits/chosen": -2.7902486324310303, "logits/rejected": -2.733579158782959, "logps/chosen": -440.94390869140625, "logps/rejected": -411.362060546875, "loss": 0.6272, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4113929867744446, "rewards/margins": 0.18324372172355652, "rewards/rejected": -0.5946367383003235, "step": 920 }, { "epoch": 0.24, "grad_norm": 2.765625, "learning_rate": 4.694147707194659e-06, "logits/chosen": -2.855236768722534, "logits/rejected": -2.8222384452819824, "logps/chosen": -445.28094482421875, "logps/rejected": -420.7135314941406, "loss": 0.6181, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.42793193459510803, "rewards/margins": 0.2128107249736786, "rewards/rejected": -0.6407425999641418, "step": 930 }, { "epoch": 0.25, "grad_norm": 11.5, "learning_rate": 4.683107158658782e-06, "logits/chosen": -2.787015676498413, "logits/rejected": -2.7473626136779785, "logps/chosen": -461.5128479003906, "logps/rejected": -439.97930908203125, "loss": 0.6182, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.39246460795402527, "rewards/margins": 0.2291673719882965, "rewards/rejected": -0.6216319799423218, "step": 940 }, { "epoch": 0.25, "grad_norm": 2.25, "learning_rate": 4.671884321303407e-06, "logits/chosen": -2.8133838176727295, "logits/rejected": -2.7668540477752686, "logps/chosen": -431.5006408691406, "logps/rejected": -404.2099609375, "loss": 0.6353, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.518032431602478, "rewards/margins": 0.17736072838306427, "rewards/rejected": -0.6953932046890259, "step": 950 }, { "epoch": 0.25, "grad_norm": 2.4375, "learning_rate": 4.660480132232224e-06, "logits/chosen": -2.8185486793518066, "logits/rejected": -2.793464183807373, "logps/chosen": -465.65814208984375, "logps/rejected": -436.23931884765625, "loss": 0.6434, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.6132515668869019, "rewards/margins": 0.16154471039772034, "rewards/rejected": -0.7747962474822998, "step": 960 }, { "epoch": 0.25, "grad_norm": 4.3125, "learning_rate": 4.6488955436917414e-06, "logits/chosen": -2.827209234237671, "logits/rejected": -2.7537784576416016, "logps/chosen": -473.32196044921875, "logps/rejected": -427.2911071777344, "loss": 0.5966, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.6051194071769714, "rewards/margins": 0.27736273407936096, "rewards/rejected": -0.8824821710586548, "step": 970 }, { "epoch": 0.26, "grad_norm": 4.34375, "learning_rate": 4.6371315229917644e-06, "logits/chosen": -2.8257224559783936, "logits/rejected": -2.7971818447113037, "logps/chosen": -481.39697265625, "logps/rejected": -462.51416015625, "loss": 0.6356, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.6484049558639526, "rewards/margins": 0.18180248141288757, "rewards/rejected": -0.8302074670791626, "step": 980 }, { "epoch": 0.26, "grad_norm": 2.171875, "learning_rate": 4.625189052424638e-06, "logits/chosen": -2.8246514797210693, "logits/rejected": -2.7782371044158936, "logps/chosen": -443.04925537109375, "logps/rejected": -426.6200256347656, "loss": 0.6097, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7901213765144348, "rewards/margins": 0.251308411359787, "rewards/rejected": -1.0414297580718994, "step": 990 }, { "epoch": 0.26, "grad_norm": 2.1875, "learning_rate": 4.613069129183218e-06, "logits/chosen": -2.8568129539489746, "logits/rejected": -2.7981626987457275, "logps/chosen": -530.5858154296875, "logps/rejected": -487.3711853027344, "loss": 0.6286, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.7988755106925964, "rewards/margins": 0.19343645870685577, "rewards/rejected": -0.9923120737075806, "step": 1000 }, { "epoch": 0.26, "eval_logits/chosen": -2.753098964691162, "eval_logits/rejected": -2.7123119831085205, "eval_logps/chosen": -483.1429443359375, "eval_logps/rejected": -456.1025695800781, "eval_loss": 0.6405959725379944, "eval_rewards/accuracies": 0.625, "eval_rewards/chosen": -0.869181215763092, "eval_rewards/margins": 0.17500866949558258, "eval_rewards/rejected": -1.0441899299621582, "eval_runtime": 347.7912, "eval_samples_per_second": 5.751, "eval_steps_per_second": 0.719, "step": 1000 }, { "epoch": 0.26, "grad_norm": 4.3125, "learning_rate": 4.600772765277607e-06, "logits/chosen": -2.7716658115386963, "logits/rejected": -2.7464160919189453, "logps/chosen": -449.4707946777344, "logps/rejected": -440.67181396484375, "loss": 0.6483, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.828794002532959, "rewards/margins": 0.16667340695858002, "rewards/rejected": -0.995467483997345, "step": 1010 }, { "epoch": 0.27, "grad_norm": 2.15625, "learning_rate": 4.588300987450652e-06, "logits/chosen": -2.82257342338562, "logits/rejected": -2.7864184379577637, "logps/chosen": -431.7757263183594, "logps/rejected": -390.1167907714844, "loss": 0.6416, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.5249059200286865, "rewards/margins": 0.17124707996845245, "rewards/rejected": -0.6961530447006226, "step": 1020 }, { "epoch": 0.27, "grad_norm": 2.265625, "learning_rate": 4.5756548370922136e-06, "logits/chosen": -2.77396821975708, "logits/rejected": -2.745842456817627, "logps/chosen": -399.4544982910156, "logps/rejected": -385.6885070800781, "loss": 0.6561, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.313784658908844, "rewards/margins": 0.1287391483783722, "rewards/rejected": -0.44252386689186096, "step": 1030 }, { "epoch": 0.27, "grad_norm": 3.46875, "learning_rate": 4.562835370152206e-06, "logits/chosen": -2.8060081005096436, "logits/rejected": -2.755823850631714, "logps/chosen": -465.9139709472656, "logps/rejected": -425.3163146972656, "loss": 0.6176, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.2053132951259613, "rewards/margins": 0.21412332355976105, "rewards/rejected": -0.41943663358688354, "step": 1040 }, { "epoch": 0.27, "grad_norm": 2.734375, "learning_rate": 4.54984365705243e-06, "logits/chosen": -2.8254618644714355, "logits/rejected": -2.7887418270111084, "logps/chosen": -431.9395446777344, "logps/rejected": -413.49468994140625, "loss": 0.6406, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.3133324682712555, "rewards/margins": 0.14582258462905884, "rewards/rejected": -0.45915499329566956, "step": 1050 }, { "epoch": 0.28, "grad_norm": 2.3125, "learning_rate": 4.536680782597191e-06, "logits/chosen": -2.744650363922119, "logits/rejected": -2.7300617694854736, "logps/chosen": -397.44451904296875, "logps/rejected": -379.4835205078125, "loss": 0.6429, "rewards/accuracies": 0.625, "rewards/chosen": -0.3276776671409607, "rewards/margins": 0.16546614468097687, "rewards/rejected": -0.49314385652542114, "step": 1060 }, { "epoch": 0.28, "grad_norm": 2.703125, "learning_rate": 4.523347845882718e-06, "logits/chosen": -2.803278684616089, "logits/rejected": -2.7433362007141113, "logps/chosen": -464.9649963378906, "logps/rejected": -400.2354431152344, "loss": 0.6388, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.34644556045532227, "rewards/margins": 0.17325380444526672, "rewards/rejected": -0.5196993947029114, "step": 1070 }, { "epoch": 0.28, "grad_norm": 1.875, "learning_rate": 4.50984596020539e-06, "logits/chosen": -2.730520248413086, "logits/rejected": -2.7134251594543457, "logps/chosen": -448.7145080566406, "logps/rejected": -413.89276123046875, "loss": 0.6438, "rewards/accuracies": 0.625, "rewards/chosen": -0.39723989367485046, "rewards/margins": 0.15517649054527283, "rewards/rejected": -0.5524164438247681, "step": 1080 }, { "epoch": 0.29, "grad_norm": 2.59375, "learning_rate": 4.4961762529687745e-06, "logits/chosen": -2.788811206817627, "logits/rejected": -2.7599310874938965, "logps/chosen": -435.73614501953125, "logps/rejected": -406.4289245605469, "loss": 0.6569, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.4202515482902527, "rewards/margins": 0.11943890154361725, "rewards/rejected": -0.5396904945373535, "step": 1090 }, { "epoch": 0.29, "grad_norm": 3.453125, "learning_rate": 4.482339865589492e-06, "logits/chosen": -2.809835195541382, "logits/rejected": -2.7414846420288086, "logps/chosen": -434.068115234375, "logps/rejected": -373.0021667480469, "loss": 0.669, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.4107758104801178, "rewards/margins": 0.09588425606489182, "rewards/rejected": -0.5066600441932678, "step": 1100 }, { "epoch": 0.29, "eval_logits/chosen": -2.7354044914245605, "eval_logits/rejected": -2.6946027278900146, "eval_logps/chosen": -430.67889404296875, "eval_logps/rejected": -401.52215576171875, "eval_loss": 0.6406324505805969, "eval_rewards/accuracies": 0.6365000009536743, "eval_rewards/chosen": -0.3445412218570709, "eval_rewards/margins": 0.1538446694612503, "eval_rewards/rejected": -0.49838587641716003, "eval_runtime": 347.8119, "eval_samples_per_second": 5.75, "eval_steps_per_second": 0.719, "step": 1100 }, { "epoch": 0.29, "grad_norm": 2.3125, "learning_rate": 4.468337953401909e-06, "logits/chosen": -2.8207192420959473, "logits/rejected": -2.8083157539367676, "logps/chosen": -434.90960693359375, "logps/rejected": -418.67315673828125, "loss": 0.6428, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.29851752519607544, "rewards/margins": 0.14117324352264404, "rewards/rejected": -0.4396907389163971, "step": 1110 }, { "epoch": 0.29, "grad_norm": 3.0625, "learning_rate": 4.45417168556166e-06, "logits/chosen": -2.775874614715576, "logits/rejected": -2.747631788253784, "logps/chosen": -398.10650634765625, "logps/rejected": -393.90191650390625, "loss": 0.6401, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.26665258407592773, "rewards/margins": 0.15709388256072998, "rewards/rejected": -0.4237464964389801, "step": 1120 }, { "epoch": 0.3, "grad_norm": 2.140625, "learning_rate": 4.439842244948036e-06, "logits/chosen": -2.780519723892212, "logits/rejected": -2.726839065551758, "logps/chosen": -421.2362365722656, "logps/rejected": -409.1251525878906, "loss": 0.6628, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.3165889382362366, "rewards/margins": 0.10841169208288193, "rewards/rejected": -0.4250006675720215, "step": 1130 }, { "epoch": 0.3, "grad_norm": 3.484375, "learning_rate": 4.425350828065204e-06, "logits/chosen": -2.8075780868530273, "logits/rejected": -2.7242207527160645, "logps/chosen": -451.6830139160156, "logps/rejected": -388.5299987792969, "loss": 0.6071, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.22056348621845245, "rewards/margins": 0.22768864035606384, "rewards/rejected": -0.4482521116733551, "step": 1140 }, { "epoch": 0.3, "grad_norm": 2.203125, "learning_rate": 4.410698644942303e-06, "logits/chosen": -2.8118491172790527, "logits/rejected": -2.7709052562713623, "logps/chosen": -445.9686584472656, "logps/rejected": -409.93218994140625, "loss": 0.6404, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.3341691493988037, "rewards/margins": 0.16431960463523865, "rewards/rejected": -0.49848875403404236, "step": 1150 }, { "epoch": 0.3, "grad_norm": 2.203125, "learning_rate": 4.395886919032406e-06, "logits/chosen": -2.73759126663208, "logits/rejected": -2.6830625534057617, "logps/chosen": -443.0125427246094, "logps/rejected": -411.4647521972656, "loss": 0.6215, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.43005380034446716, "rewards/margins": 0.20315060019493103, "rewards/rejected": -0.6332044005393982, "step": 1160 }, { "epoch": 0.31, "grad_norm": 1.9921875, "learning_rate": 4.380916887110366e-06, "logits/chosen": -2.8012502193450928, "logits/rejected": -2.733692169189453, "logps/chosen": -441.42498779296875, "logps/rejected": -387.70147705078125, "loss": 0.6355, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.5223753452301025, "rewards/margins": 0.18315255641937256, "rewards/rejected": -0.7055279016494751, "step": 1170 }, { "epoch": 0.31, "grad_norm": 2.9375, "learning_rate": 4.365789799169539e-06, "logits/chosen": -2.682478427886963, "logits/rejected": -2.7226719856262207, "logps/chosen": -434.474365234375, "logps/rejected": -434.96771240234375, "loss": 0.6547, "rewards/accuracies": 0.65625, "rewards/chosen": -0.4838402271270752, "rewards/margins": 0.1491493284702301, "rewards/rejected": -0.6329895257949829, "step": 1180 }, { "epoch": 0.31, "grad_norm": 2.265625, "learning_rate": 4.350506918317416e-06, "logits/chosen": -2.769261121749878, "logits/rejected": -2.7192516326904297, "logps/chosen": -421.7001953125, "logps/rejected": -409.52642822265625, "loss": 0.6475, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.4103321135044098, "rewards/margins": 0.1710616797208786, "rewards/rejected": -0.5813937783241272, "step": 1190 }, { "epoch": 0.31, "grad_norm": 3.328125, "learning_rate": 4.335069520670149e-06, "logits/chosen": -2.7317872047424316, "logits/rejected": -2.693282127380371, "logps/chosen": -409.9453125, "logps/rejected": -401.80670166015625, "loss": 0.6723, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.508698582649231, "rewards/margins": 0.11917855590581894, "rewards/rejected": -0.6278771162033081, "step": 1200 }, { "epoch": 0.31, "eval_logits/chosen": -2.7076668739318848, "eval_logits/rejected": -2.6700782775878906, "eval_logps/chosen": -442.416259765625, "eval_logps/rejected": -415.98406982421875, "eval_loss": 0.6357947587966919, "eval_rewards/accuracies": 0.6424999833106995, "eval_rewards/chosen": -0.4619145095348358, "eval_rewards/margins": 0.18109098076820374, "eval_rewards/rejected": -0.6430054903030396, "eval_runtime": 348.1294, "eval_samples_per_second": 5.745, "eval_steps_per_second": 0.718, "step": 1200 }, { "epoch": 0.32, "grad_norm": 2.796875, "learning_rate": 4.319478895246e-06, "logits/chosen": -2.737431526184082, "logits/rejected": -2.6825287342071533, "logps/chosen": -423.69134521484375, "logps/rejected": -388.00445556640625, "loss": 0.631, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.4585542678833008, "rewards/margins": 0.19313645362854004, "rewards/rejected": -0.6516907811164856, "step": 1210 }, { "epoch": 0.32, "grad_norm": 2.296875, "learning_rate": 4.303736343857704e-06, "logits/chosen": -2.806246519088745, "logits/rejected": -2.7676374912261963, "logps/chosen": -441.5145568847656, "logps/rejected": -449.9791564941406, "loss": 0.6441, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.4756022095680237, "rewards/margins": 0.1646694839000702, "rewards/rejected": -0.6402716636657715, "step": 1220 }, { "epoch": 0.32, "grad_norm": 3.015625, "learning_rate": 4.287843181003772e-06, "logits/chosen": -2.7820241451263428, "logits/rejected": -2.729149341583252, "logps/chosen": -496.11297607421875, "logps/rejected": -433.9457092285156, "loss": 0.6333, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.5403844118118286, "rewards/margins": 0.1901749074459076, "rewards/rejected": -0.7305592894554138, "step": 1230 }, { "epoch": 0.32, "grad_norm": 2.15625, "learning_rate": 4.27180073375873e-06, "logits/chosen": -2.7426555156707764, "logits/rejected": -2.711142063140869, "logps/chosen": -488.27862548828125, "logps/rejected": -438.2783203125, "loss": 0.6226, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.5886915922164917, "rewards/margins": 0.22011339664459229, "rewards/rejected": -0.8088048696517944, "step": 1240 }, { "epoch": 0.33, "grad_norm": 3.0, "learning_rate": 4.255610341662304e-06, "logits/chosen": -2.769096851348877, "logits/rejected": -2.7023282051086426, "logps/chosen": -451.603759765625, "logps/rejected": -415.9313049316406, "loss": 0.6332, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.6167994141578674, "rewards/margins": 0.1871766597032547, "rewards/rejected": -0.8039760589599609, "step": 1250 }, { "epoch": 0.33, "grad_norm": 2.046875, "learning_rate": 4.2392733566075764e-06, "logits/chosen": -2.7416205406188965, "logits/rejected": -2.7040672302246094, "logps/chosen": -444.20526123046875, "logps/rejected": -424.52972412109375, "loss": 0.65, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.5938581228256226, "rewards/margins": 0.1559075564146042, "rewards/rejected": -0.7497657537460327, "step": 1260 }, { "epoch": 0.33, "grad_norm": 2.875, "learning_rate": 4.2227911427280975e-06, "logits/chosen": -2.7380728721618652, "logits/rejected": -2.6828763484954834, "logps/chosen": -430.66961669921875, "logps/rejected": -395.87994384765625, "loss": 0.6311, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.5714975595474243, "rewards/margins": 0.20155255496501923, "rewards/rejected": -0.7730501294136047, "step": 1270 }, { "epoch": 0.33, "grad_norm": 2.90625, "learning_rate": 4.206165076283983e-06, "logits/chosen": -2.7856247425079346, "logits/rejected": -2.7396748065948486, "logps/chosen": -438.529296875, "logps/rejected": -418.1429138183594, "loss": 0.6182, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.6402245759963989, "rewards/margins": 0.21725162863731384, "rewards/rejected": -0.8574762344360352, "step": 1280 }, { "epoch": 0.34, "grad_norm": 2.984375, "learning_rate": 4.189396545546995e-06, "logits/chosen": -2.7590296268463135, "logits/rejected": -2.7307534217834473, "logps/chosen": -455.66064453125, "logps/rejected": -426.7330017089844, "loss": 0.6707, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.744498074054718, "rewards/margins": 0.12376417219638824, "rewards/rejected": -0.8682621717453003, "step": 1290 }, { "epoch": 0.34, "grad_norm": 3.125, "learning_rate": 4.172486950684627e-06, "logits/chosen": -2.773496150970459, "logits/rejected": -2.764591693878174, "logps/chosen": -448.2276306152344, "logps/rejected": -449.07598876953125, "loss": 0.605, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6141419410705566, "rewards/margins": 0.2553822696208954, "rewards/rejected": -0.8695241808891296, "step": 1300 }, { "epoch": 0.34, "eval_logits/chosen": -2.7122175693511963, "eval_logits/rejected": -2.6764349937438965, "eval_logps/chosen": -465.1626892089844, "eval_logps/rejected": -440.71441650390625, "eval_loss": 0.6297281384468079, "eval_rewards/accuracies": 0.6434999704360962, "eval_rewards/chosen": -0.6893790364265442, "eval_rewards/margins": 0.20092952251434326, "eval_rewards/rejected": -0.8903085589408875, "eval_runtime": 348.1512, "eval_samples_per_second": 5.745, "eval_steps_per_second": 0.718, "step": 1300 }, { "epoch": 0.34, "grad_norm": 3.4375, "learning_rate": 4.155437703643182e-06, "logits/chosen": -2.8083739280700684, "logits/rejected": -2.752487897872925, "logps/chosen": -442.2110900878906, "logps/rejected": -403.32525634765625, "loss": 0.6226, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6493631601333618, "rewards/margins": 0.21884135901927948, "rewards/rejected": -0.8682045936584473, "step": 1310 }, { "epoch": 0.35, "grad_norm": 3.09375, "learning_rate": 4.138250228029882e-06, "logits/chosen": -2.7438082695007324, "logits/rejected": -2.7184457778930664, "logps/chosen": -465.0062561035156, "logps/rejected": -466.47674560546875, "loss": 0.6516, "rewards/accuracies": 0.625, "rewards/chosen": -0.7264341711997986, "rewards/margins": 0.1592479646205902, "rewards/rejected": -0.8856821060180664, "step": 1320 }, { "epoch": 0.35, "grad_norm": 2.5, "learning_rate": 4.120925958993994e-06, "logits/chosen": -2.721998453140259, "logits/rejected": -2.7064220905303955, "logps/chosen": -416.6083984375, "logps/rejected": -412.7229919433594, "loss": 0.6416, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.6872702836990356, "rewards/margins": 0.16901233792304993, "rewards/rejected": -0.8562827110290527, "step": 1330 }, { "epoch": 0.35, "grad_norm": 3.84375, "learning_rate": 4.103466343106999e-06, "logits/chosen": -2.7732884883880615, "logits/rejected": -2.7553248405456543, "logps/chosen": -469.11907958984375, "logps/rejected": -438.9508361816406, "loss": 0.6312, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.6161693930625916, "rewards/margins": 0.18880559504032135, "rewards/rejected": -0.8049749135971069, "step": 1340 }, { "epoch": 0.35, "grad_norm": 2.96875, "learning_rate": 4.085872838241797e-06, "logits/chosen": -2.713508129119873, "logits/rejected": -2.668172836303711, "logps/chosen": -456.80804443359375, "logps/rejected": -435.1529846191406, "loss": 0.6418, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.6335210204124451, "rewards/margins": 0.1759893149137497, "rewards/rejected": -0.8095104098320007, "step": 1350 }, { "epoch": 0.36, "grad_norm": 2.734375, "learning_rate": 4.06814691345098e-06, "logits/chosen": -2.730811595916748, "logits/rejected": -2.6753551959991455, "logps/chosen": -447.3460388183594, "logps/rejected": -414.4151306152344, "loss": 0.6275, "rewards/accuracies": 0.625, "rewards/chosen": -0.6381969451904297, "rewards/margins": 0.1885562241077423, "rewards/rejected": -0.8267530202865601, "step": 1360 }, { "epoch": 0.36, "grad_norm": 2.75, "learning_rate": 4.050290048844171e-06, "logits/chosen": -2.7473931312561035, "logits/rejected": -2.7274961471557617, "logps/chosen": -464.76397705078125, "logps/rejected": -457.59674072265625, "loss": 0.6418, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.6140081882476807, "rewards/margins": 0.1607227772474289, "rewards/rejected": -0.7747309803962708, "step": 1370 }, { "epoch": 0.36, "grad_norm": 3.21875, "learning_rate": 4.032303735464422e-06, "logits/chosen": -2.832428455352783, "logits/rejected": -2.760331153869629, "logps/chosen": -478.40185546875, "logps/rejected": -440.13043212890625, "loss": 0.628, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.6477630734443665, "rewards/margins": 0.22839903831481934, "rewards/rejected": -0.876162052154541, "step": 1380 }, { "epoch": 0.36, "grad_norm": 2.953125, "learning_rate": 4.014189475163727e-06, "logits/chosen": -2.720529079437256, "logits/rejected": -2.6880316734313965, "logps/chosen": -449.77996826171875, "logps/rejected": -435.7823791503906, "loss": 0.6226, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6553342938423157, "rewards/margins": 0.21635405719280243, "rewards/rejected": -0.8716884851455688, "step": 1390 }, { "epoch": 0.37, "grad_norm": 2.96875, "learning_rate": 3.995948780477605e-06, "logits/chosen": -2.7746074199676514, "logits/rejected": -2.7223048210144043, "logps/chosen": -469.13836669921875, "logps/rejected": -439.2986755371094, "loss": 0.6361, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.6836882829666138, "rewards/margins": 0.19334295392036438, "rewards/rejected": -0.8770312070846558, "step": 1400 }, { "epoch": 0.37, "eval_logits/chosen": -2.7091026306152344, "eval_logits/rejected": -2.6711199283599854, "eval_logps/chosen": -467.6648254394531, "eval_logps/rejected": -444.7496337890625, "eval_loss": 0.6266594529151917, "eval_rewards/accuracies": 0.6504999995231628, "eval_rewards/chosen": -0.7144004702568054, "eval_rewards/margins": 0.2162601202726364, "eval_rewards/rejected": -0.930660605430603, "eval_runtime": 347.968, "eval_samples_per_second": 5.748, "eval_steps_per_second": 0.718, "step": 1400 }, { "epoch": 0.37, "grad_norm": 2.375, "learning_rate": 3.977583174498816e-06, "logits/chosen": -2.7707862854003906, "logits/rejected": -2.7450509071350098, "logps/chosen": -476.1197204589844, "logps/rejected": -445.98419189453125, "loss": 0.6258, "rewards/accuracies": 0.65625, "rewards/chosen": -0.7250741720199585, "rewards/margins": 0.21456794440746307, "rewards/rejected": -0.9396421313285828, "step": 1410 }, { "epoch": 0.37, "grad_norm": 2.40625, "learning_rate": 3.959094190750172e-06, "logits/chosen": -2.780161142349243, "logits/rejected": -2.7392024993896484, "logps/chosen": -497.08758544921875, "logps/rejected": -469.8622131347656, "loss": 0.6158, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.7258157730102539, "rewards/margins": 0.2515636384487152, "rewards/rejected": -0.9773795008659363, "step": 1420 }, { "epoch": 0.37, "grad_norm": 2.78125, "learning_rate": 3.9404833730564975e-06, "logits/chosen": -2.685615062713623, "logits/rejected": -2.663395404815674, "logps/chosen": -459.1412048339844, "logps/rejected": -449.1641540527344, "loss": 0.627, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.7857618927955627, "rewards/margins": 0.22839538753032684, "rewards/rejected": -1.0141572952270508, "step": 1430 }, { "epoch": 0.38, "grad_norm": 2.484375, "learning_rate": 3.921752275415712e-06, "logits/chosen": -2.772688388824463, "logits/rejected": -2.7538001537323, "logps/chosen": -472.8526916503906, "logps/rejected": -457.0860900878906, "loss": 0.6102, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9769464731216431, "rewards/margins": 0.26345548033714294, "rewards/rejected": -1.2404019832611084, "step": 1440 }, { "epoch": 0.38, "grad_norm": 2.6875, "learning_rate": 3.902902461869079e-06, "logits/chosen": -2.7669315338134766, "logits/rejected": -2.722926378250122, "logps/chosen": -466.6268615722656, "logps/rejected": -452.5586853027344, "loss": 0.6191, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.0403883457183838, "rewards/margins": 0.2461782991886139, "rewards/rejected": -1.2865667343139648, "step": 1450 }, { "epoch": 0.38, "grad_norm": 4.625, "learning_rate": 3.883935506370605e-06, "logits/chosen": -2.698967456817627, "logits/rejected": -2.680738925933838, "logps/chosen": -471.3136291503906, "logps/rejected": -444.6326599121094, "loss": 0.6268, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.9365707635879517, "rewards/margins": 0.22086432576179504, "rewards/rejected": -1.1574350595474243, "step": 1460 }, { "epoch": 0.38, "grad_norm": 2.25, "learning_rate": 3.864852992655617e-06, "logits/chosen": -2.724776268005371, "logits/rejected": -2.6953415870666504, "logps/chosen": -462.2669982910156, "logps/rejected": -459.33428955078125, "loss": 0.591, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.8260868787765503, "rewards/margins": 0.2951758801937103, "rewards/rejected": -1.121262788772583, "step": 1470 }, { "epoch": 0.39, "grad_norm": 2.96875, "learning_rate": 3.845656514108516e-06, "logits/chosen": -2.7450900077819824, "logits/rejected": -2.7022852897644043, "logps/chosen": -477.535888671875, "logps/rejected": -422.33587646484375, "loss": 0.6484, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.9595173597335815, "rewards/margins": 0.19346167147159576, "rewards/rejected": -1.1529791355133057, "step": 1480 }, { "epoch": 0.39, "grad_norm": 2.734375, "learning_rate": 3.826347673629738e-06, "logits/chosen": -2.7165746688842773, "logits/rejected": -2.6513075828552246, "logps/chosen": -472.983642578125, "logps/rejected": -456.36956787109375, "loss": 0.6028, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.9271873235702515, "rewards/margins": 0.28870025277137756, "rewards/rejected": -1.2158875465393066, "step": 1490 }, { "epoch": 0.39, "grad_norm": 4.21875, "learning_rate": 3.8069280835019062e-06, "logits/chosen": -2.6848392486572266, "logits/rejected": -2.6425843238830566, "logps/chosen": -507.7676696777344, "logps/rejected": -480.82501220703125, "loss": 0.6085, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.9639270901679993, "rewards/margins": 0.28983956575393677, "rewards/rejected": -1.253766655921936, "step": 1500 }, { "epoch": 0.39, "eval_logits/chosen": -2.67969012260437, "eval_logits/rejected": -2.643465995788574, "eval_logps/chosen": -501.5469055175781, "eval_logps/rejected": -482.52557373046875, "eval_loss": 0.6213365197181702, "eval_rewards/accuracies": 0.6489999890327454, "eval_rewards/chosen": -1.053221344947815, "eval_rewards/margins": 0.25519895553588867, "eval_rewards/rejected": -1.308420181274414, "eval_runtime": 348.1901, "eval_samples_per_second": 5.744, "eval_steps_per_second": 0.718, "step": 1500 }, { "epoch": 0.4, "grad_norm": 3.921875, "learning_rate": 3.7873993652552077e-06, "logits/chosen": -2.711010456085205, "logits/rejected": -2.683377981185913, "logps/chosen": -460.9742736816406, "logps/rejected": -446.6480407714844, "loss": 0.6928, "rewards/accuracies": 0.5, "rewards/chosen": -1.0935702323913574, "rewards/margins": 0.11124851554632187, "rewards/rejected": -1.2048187255859375, "step": 1510 }, { "epoch": 0.4, "grad_norm": 2.953125, "learning_rate": 3.7677631495319953e-06, "logits/chosen": -2.748260021209717, "logits/rejected": -2.724116802215576, "logps/chosen": -488.9134826660156, "logps/rejected": -478.4491271972656, "loss": 0.6052, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9258454442024231, "rewards/margins": 0.2710776627063751, "rewards/rejected": -1.196923017501831, "step": 1520 }, { "epoch": 0.4, "grad_norm": 2.984375, "learning_rate": 3.748021075950633e-06, "logits/chosen": -2.722660779953003, "logits/rejected": -2.7004356384277344, "logps/chosen": -515.2189331054688, "logps/rejected": -489.676025390625, "loss": 0.6558, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.9935762286186218, "rewards/margins": 0.16242071986198425, "rewards/rejected": -1.1559970378875732, "step": 1530 }, { "epoch": 0.4, "grad_norm": 2.515625, "learning_rate": 3.7281747929685824e-06, "logits/chosen": -2.693944215774536, "logits/rejected": -2.662226438522339, "logps/chosen": -472.21392822265625, "logps/rejected": -458.48651123046875, "loss": 0.6344, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.1790839433670044, "rewards/margins": 0.1845519244670868, "rewards/rejected": -1.363635778427124, "step": 1540 }, { "epoch": 0.41, "grad_norm": 2.75, "learning_rate": 3.7082259577447604e-06, "logits/chosen": -2.7512943744659424, "logits/rejected": -2.7197911739349365, "logps/chosen": -525.9802856445312, "logps/rejected": -505.2586975097656, "loss": 0.6247, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.1584722995758057, "rewards/margins": 0.232163667678833, "rewards/rejected": -1.3906362056732178, "step": 1550 }, { "epoch": 0.41, "grad_norm": 2.859375, "learning_rate": 3.6881762360011688e-06, "logits/chosen": -2.755866050720215, "logits/rejected": -2.70156192779541, "logps/chosen": -549.6534423828125, "logps/rejected": -496.219482421875, "loss": 0.626, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2471288442611694, "rewards/margins": 0.21645119786262512, "rewards/rejected": -1.4635800123214722, "step": 1560 }, { "epoch": 0.41, "grad_norm": 3.4375, "learning_rate": 3.668027301883802e-06, "logits/chosen": -2.751842498779297, "logits/rejected": -2.7206082344055176, "logps/chosen": -509.11871337890625, "logps/rejected": -488.1609802246094, "loss": 0.6118, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.2405083179473877, "rewards/margins": 0.24299952387809753, "rewards/rejected": -1.483507752418518, "step": 1570 }, { "epoch": 0.41, "grad_norm": 2.78125, "learning_rate": 3.64778083782286e-06, "logits/chosen": -2.7066681385040283, "logits/rejected": -2.7152516841888428, "logps/chosen": -505.77618408203125, "logps/rejected": -542.4833374023438, "loss": 0.602, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.2058336734771729, "rewards/margins": 0.2781684398651123, "rewards/rejected": -1.4840023517608643, "step": 1580 }, { "epoch": 0.42, "grad_norm": 2.96875, "learning_rate": 3.627438534392268e-06, "logits/chosen": -2.750699520111084, "logits/rejected": -2.7490906715393066, "logps/chosen": -485.8789978027344, "logps/rejected": -506.06243896484375, "loss": 0.6448, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.244022250175476, "rewards/margins": 0.20155465602874756, "rewards/rejected": -1.4455769062042236, "step": 1590 }, { "epoch": 0.42, "grad_norm": 2.78125, "learning_rate": 3.607002090168506e-06, "logits/chosen": -2.6556179523468018, "logits/rejected": -2.649691343307495, "logps/chosen": -509.494873046875, "logps/rejected": -483.21875, "loss": 0.6317, "rewards/accuracies": 0.65625, "rewards/chosen": -1.0604232549667358, "rewards/margins": 0.21748527884483337, "rewards/rejected": -1.2779085636138916, "step": 1600 }, { "epoch": 0.42, "eval_logits/chosen": -2.6505815982818604, "eval_logits/rejected": -2.617206573486328, "eval_logps/chosen": -508.6858215332031, "eval_logps/rejected": -489.9322814941406, "eval_loss": 0.619657576084137, "eval_rewards/accuracies": 0.6489999890327454, "eval_rewards/chosen": -1.1246099472045898, "eval_rewards/margins": 0.2578776180744171, "eval_rewards/rejected": -1.3824876546859741, "eval_runtime": 347.7803, "eval_samples_per_second": 5.751, "eval_steps_per_second": 0.719, "step": 1600 }, { "epoch": 0.42, "grad_norm": 7.125, "learning_rate": 3.586473211588787e-06, "logits/chosen": -2.710347890853882, "logits/rejected": -2.695310354232788, "logps/chosen": -473.04541015625, "logps/rejected": -494.8163146972656, "loss": 0.5936, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.0344406366348267, "rewards/margins": 0.3075736463069916, "rewards/rejected": -1.342014193534851, "step": 1610 }, { "epoch": 0.42, "grad_norm": 4.5, "learning_rate": 3.5658536128085623e-06, "logits/chosen": -2.7335174083709717, "logits/rejected": -2.673346996307373, "logps/chosen": -509.60595703125, "logps/rejected": -487.2217712402344, "loss": 0.614, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.1273787021636963, "rewards/margins": 0.2849898934364319, "rewards/rejected": -1.4123685359954834, "step": 1620 }, { "epoch": 0.43, "grad_norm": 4.90625, "learning_rate": 3.545145015558399e-06, "logits/chosen": -2.588106155395508, "logits/rejected": -2.6156704425811768, "logps/chosen": -476.9378967285156, "logps/rejected": -474.3050842285156, "loss": 0.6187, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.2659618854522705, "rewards/margins": 0.28414902091026306, "rewards/rejected": -1.5501108169555664, "step": 1630 }, { "epoch": 0.43, "grad_norm": 3.09375, "learning_rate": 3.5243491490002056e-06, "logits/chosen": -2.6407034397125244, "logits/rejected": -2.6249217987060547, "logps/chosen": -523.3114013671875, "logps/rejected": -515.78564453125, "loss": 0.6534, "rewards/accuracies": 0.59375, "rewards/chosen": -1.3326655626296997, "rewards/margins": 0.2217075377702713, "rewards/rejected": -1.554373025894165, "step": 1640 }, { "epoch": 0.43, "grad_norm": 3.0, "learning_rate": 3.503467749582857e-06, "logits/chosen": -2.678744077682495, "logits/rejected": -2.6236231327056885, "logps/chosen": -500.12322998046875, "logps/rejected": -453.09375, "loss": 0.6798, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1778029203414917, "rewards/margins": 0.14696446061134338, "rewards/rejected": -1.3247674703598022, "step": 1650 }, { "epoch": 0.43, "grad_norm": 2.515625, "learning_rate": 3.4825025608971947e-06, "logits/chosen": -2.6644175052642822, "logits/rejected": -2.6675162315368652, "logps/chosen": -451.5030822753906, "logps/rejected": -466.20867919921875, "loss": 0.6371, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.1360504627227783, "rewards/margins": 0.20215868949890137, "rewards/rejected": -1.3382090330123901, "step": 1660 }, { "epoch": 0.44, "grad_norm": 3.421875, "learning_rate": 3.4614553335304407e-06, "logits/chosen": -2.714818239212036, "logits/rejected": -2.6459782123565674, "logps/chosen": -528.015380859375, "logps/rejected": -481.2875061035156, "loss": 0.6269, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.1526925563812256, "rewards/margins": 0.24837598204612732, "rewards/rejected": -1.4010684490203857, "step": 1670 }, { "epoch": 0.44, "grad_norm": 2.8125, "learning_rate": 3.4403278249200222e-06, "logits/chosen": -2.7072434425354004, "logits/rejected": -2.6440227031707764, "logps/chosen": -518.8106689453125, "logps/rejected": -480.94091796875, "loss": 0.5745, "rewards/accuracies": 0.65625, "rewards/chosen": -0.9318562746047974, "rewards/margins": 0.3667041063308716, "rewards/rejected": -1.298560380935669, "step": 1680 }, { "epoch": 0.44, "grad_norm": 3.0625, "learning_rate": 3.4191217992068293e-06, "logits/chosen": -2.739563465118408, "logits/rejected": -2.681410312652588, "logps/chosen": -504.88580322265625, "logps/rejected": -463.4295959472656, "loss": 0.5969, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.9455466270446777, "rewards/margins": 0.3077097535133362, "rewards/rejected": -1.2532564401626587, "step": 1690 }, { "epoch": 0.44, "grad_norm": 4.53125, "learning_rate": 3.3978390270879056e-06, "logits/chosen": -2.6793107986450195, "logits/rejected": -2.6711511611938477, "logps/chosen": -440.7421875, "logps/rejected": -438.96209716796875, "loss": 0.6702, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.124517798423767, "rewards/margins": 0.14334309101104736, "rewards/rejected": -1.267861008644104, "step": 1700 }, { "epoch": 0.44, "eval_logits/chosen": -2.6761555671691895, "eval_logits/rejected": -2.6407337188720703, "eval_logps/chosen": -496.5814514160156, "eval_logps/rejected": -478.1268310546875, "eval_loss": 0.6182043552398682, "eval_rewards/accuracies": 0.652999997138977, "eval_rewards/chosen": -1.0035661458969116, "eval_rewards/margins": 0.26086705923080444, "eval_rewards/rejected": -1.2644333839416504, "eval_runtime": 347.9034, "eval_samples_per_second": 5.749, "eval_steps_per_second": 0.719, "step": 1700 }, { "epoch": 0.45, "grad_norm": 4.0, "learning_rate": 3.3764812856685995e-06, "logits/chosen": -2.7096104621887207, "logits/rejected": -2.717935562133789, "logps/chosen": -451.46923828125, "logps/rejected": -466.99798583984375, "loss": 0.6448, "rewards/accuracies": 0.625, "rewards/chosen": -0.9781481027603149, "rewards/margins": 0.2028496265411377, "rewards/rejected": -1.180997610092163, "step": 1710 }, { "epoch": 0.45, "grad_norm": 3.953125, "learning_rate": 3.3550503583141726e-06, "logits/chosen": -2.759023427963257, "logits/rejected": -2.7278056144714355, "logps/chosen": -501.0248107910156, "logps/rejected": -491.77374267578125, "loss": 0.5937, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.9347749948501587, "rewards/margins": 0.3128504455089569, "rewards/rejected": -1.2476253509521484, "step": 1720 }, { "epoch": 0.45, "grad_norm": 3.40625, "learning_rate": 3.3335480345008907e-06, "logits/chosen": -2.640856981277466, "logits/rejected": -2.624553918838501, "logps/chosen": -483.8340759277344, "logps/rejected": -468.92327880859375, "loss": 0.637, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.9758474230766296, "rewards/margins": 0.2623240053653717, "rewards/rejected": -1.2381714582443237, "step": 1730 }, { "epoch": 0.46, "grad_norm": 3.015625, "learning_rate": 3.3119761096666055e-06, "logits/chosen": -2.6590754985809326, "logits/rejected": -2.620539426803589, "logps/chosen": -497.45587158203125, "logps/rejected": -465.5917053222656, "loss": 0.6199, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.884781002998352, "rewards/margins": 0.24650093913078308, "rewards/rejected": -1.1312817335128784, "step": 1740 }, { "epoch": 0.46, "grad_norm": 3.625, "learning_rate": 3.290336385060832e-06, "logits/chosen": -2.7382700443267822, "logits/rejected": -2.6750712394714355, "logps/chosen": -496.0645446777344, "logps/rejected": -476.931884765625, "loss": 0.6089, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.1136571168899536, "rewards/margins": 0.2823924124240875, "rewards/rejected": -1.3960494995117188, "step": 1750 }, { "epoch": 0.46, "grad_norm": 3.359375, "learning_rate": 3.268630667594348e-06, "logits/chosen": -2.6461880207061768, "logits/rejected": -2.6458840370178223, "logps/chosen": -484.7660217285156, "logps/rejected": -470.3802795410156, "loss": 0.6207, "rewards/accuracies": 0.65625, "rewards/chosen": -1.033327341079712, "rewards/margins": 0.2565310001373291, "rewards/rejected": -1.2898584604263306, "step": 1760 }, { "epoch": 0.46, "grad_norm": 3.21875, "learning_rate": 3.2468607696883147e-06, "logits/chosen": -2.664067268371582, "logits/rejected": -2.641017436981201, "logps/chosen": -490.5843200683594, "logps/rejected": -511.4412536621094, "loss": 0.5835, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0362211465835571, "rewards/margins": 0.35183295607566833, "rewards/rejected": -1.3880541324615479, "step": 1770 }, { "epoch": 0.47, "grad_norm": 3.46875, "learning_rate": 3.225028509122944e-06, "logits/chosen": -2.7072222232818604, "logits/rejected": -2.6669273376464844, "logps/chosen": -479.81292724609375, "logps/rejected": -471.8095703125, "loss": 0.6306, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.1294889450073242, "rewards/margins": 0.23517270386219025, "rewards/rejected": -1.3646615743637085, "step": 1780 }, { "epoch": 0.47, "grad_norm": 5.15625, "learning_rate": 3.2031357088857083e-06, "logits/chosen": -2.6887664794921875, "logits/rejected": -2.672578811645508, "logps/chosen": -530.8712158203125, "logps/rejected": -537.0011596679688, "loss": 0.6336, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.256080985069275, "rewards/margins": 0.2619550824165344, "rewards/rejected": -1.518036127090454, "step": 1790 }, { "epoch": 0.47, "grad_norm": 5.1875, "learning_rate": 3.181184197019127e-06, "logits/chosen": -2.5891640186309814, "logits/rejected": -2.568915843963623, "logps/chosen": -489.94512939453125, "logps/rejected": -538.85986328125, "loss": 0.5658, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.305600881576538, "rewards/margins": 0.43258899450302124, "rewards/rejected": -1.738189935684204, "step": 1800 }, { "epoch": 0.47, "eval_logits/chosen": -2.618206739425659, "eval_logits/rejected": -2.5865581035614014, "eval_logps/chosen": -531.0144653320312, "eval_logps/rejected": -515.1605834960938, "eval_loss": 0.6218886375427246, "eval_rewards/accuracies": 0.6445000171661377, "eval_rewards/chosen": -1.3478968143463135, "eval_rewards/margins": 0.28687387704849243, "eval_rewards/rejected": -1.6347707509994507, "eval_runtime": 347.7653, "eval_samples_per_second": 5.751, "eval_steps_per_second": 0.719, "step": 1800 }, { "epoch": 0.47, "grad_norm": 3.3125, "learning_rate": 3.159175806468126e-06, "logits/chosen": -2.5783843994140625, "logits/rejected": -2.530768871307373, "logps/chosen": -519.810546875, "logps/rejected": -486.0419921875, "loss": 0.6143, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.3787872791290283, "rewards/margins": 0.29414287209510803, "rewards/rejected": -1.672930121421814, "step": 1810 }, { "epoch": 0.48, "grad_norm": 11.125, "learning_rate": 3.1371123749269804e-06, "logits/chosen": -2.6292014122009277, "logits/rejected": -2.6139864921569824, "logps/chosen": -565.1087646484375, "logps/rejected": -546.5718994140625, "loss": 0.6793, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.386059284210205, "rewards/margins": 0.1806623637676239, "rewards/rejected": -1.5667215585708618, "step": 1820 }, { "epoch": 0.48, "grad_norm": 3.515625, "learning_rate": 3.114995744685877e-06, "logits/chosen": -2.6707286834716797, "logits/rejected": -2.669381618499756, "logps/chosen": -514.181640625, "logps/rejected": -503.86663818359375, "loss": 0.6624, "rewards/accuracies": 0.59375, "rewards/chosen": -1.390178918838501, "rewards/margins": 0.18112266063690186, "rewards/rejected": -1.5713016986846924, "step": 1830 }, { "epoch": 0.48, "grad_norm": 4.3125, "learning_rate": 3.0928277624770743e-06, "logits/chosen": -2.7397656440734863, "logits/rejected": -2.692636013031006, "logps/chosen": -547.605224609375, "logps/rejected": -530.4255981445312, "loss": 0.5952, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2703100442886353, "rewards/margins": 0.33880940079689026, "rewards/rejected": -1.6091196537017822, "step": 1840 }, { "epoch": 0.48, "grad_norm": 2.703125, "learning_rate": 3.070610279320708e-06, "logits/chosen": -2.7308197021484375, "logits/rejected": -2.691286087036133, "logps/chosen": -530.3106689453125, "logps/rejected": -512.4490356445312, "loss": 0.5837, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.1016675233840942, "rewards/margins": 0.3237997591495514, "rewards/rejected": -1.4254672527313232, "step": 1850 }, { "epoch": 0.49, "grad_norm": 2.71875, "learning_rate": 3.0483451503702264e-06, "logits/chosen": -2.739076614379883, "logits/rejected": -2.7019972801208496, "logps/chosen": -540.4013671875, "logps/rejected": -528.4253540039062, "loss": 0.6168, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.127809762954712, "rewards/margins": 0.2989148199558258, "rewards/rejected": -1.4267246723175049, "step": 1860 }, { "epoch": 0.49, "grad_norm": 4.03125, "learning_rate": 3.0260342347574916e-06, "logits/chosen": -2.6942806243896484, "logits/rejected": -2.652289628982544, "logps/chosen": -524.1945190429688, "logps/rejected": -504.88848876953125, "loss": 0.5845, "rewards/accuracies": 0.71875, "rewards/chosen": -1.0817714929580688, "rewards/margins": 0.3258208930492401, "rewards/rejected": -1.4075922966003418, "step": 1870 }, { "epoch": 0.49, "grad_norm": 3.78125, "learning_rate": 3.0036793954375358e-06, "logits/chosen": -2.7198615074157715, "logits/rejected": -2.676637649536133, "logps/chosen": -517.9676513671875, "logps/rejected": -481.9654846191406, "loss": 0.5941, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1094682216644287, "rewards/margins": 0.3417368233203888, "rewards/rejected": -1.4512050151824951, "step": 1880 }, { "epoch": 0.49, "grad_norm": 5.0625, "learning_rate": 2.981282499033009e-06, "logits/chosen": -2.70827054977417, "logits/rejected": -2.6766083240509033, "logps/chosen": -514.5819091796875, "logps/rejected": -497.3114318847656, "loss": 0.6435, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.134453535079956, "rewards/margins": 0.22488650679588318, "rewards/rejected": -1.3593400716781616, "step": 1890 }, { "epoch": 0.5, "grad_norm": 2.21875, "learning_rate": 2.9588454156783163e-06, "logits/chosen": -2.6851532459259033, "logits/rejected": -2.6381797790527344, "logps/chosen": -517.7092895507812, "logps/rejected": -501.8164978027344, "loss": 0.6039, "rewards/accuracies": 0.65625, "rewards/chosen": -0.9668029546737671, "rewards/margins": 0.30051741003990173, "rewards/rejected": -1.2673202753067017, "step": 1900 }, { "epoch": 0.5, "eval_logits/chosen": -2.674211025238037, "eval_logits/rejected": -2.6375982761383057, "eval_logps/chosen": -486.3656311035156, "eval_logps/rejected": -468.8457946777344, "eval_loss": 0.6153793931007385, "eval_rewards/accuracies": 0.6629999876022339, "eval_rewards/chosen": -0.9014082551002502, "eval_rewards/margins": 0.2702144682407379, "eval_rewards/rejected": -1.1716225147247314, "eval_runtime": 347.781, "eval_samples_per_second": 5.751, "eval_steps_per_second": 0.719, "step": 1900 }, { "epoch": 0.5, "grad_norm": 3.515625, "learning_rate": 2.9363700188634597e-06, "logits/chosen": -2.7260212898254395, "logits/rejected": -2.7005722522735596, "logps/chosen": -478.63385009765625, "logps/rejected": -446.9085998535156, "loss": 0.6221, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.8971524238586426, "rewards/margins": 0.24045062065124512, "rewards/rejected": -1.1376030445098877, "step": 1910 }, { "epoch": 0.5, "grad_norm": 2.828125, "learning_rate": 2.9138581852776053e-06, "logits/chosen": -2.7050108909606934, "logits/rejected": -2.6705167293548584, "logps/chosen": -476.31854248046875, "logps/rejected": -468.4576110839844, "loss": 0.5804, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.7747586369514465, "rewards/margins": 0.34002387523651123, "rewards/rejected": -1.1147825717926025, "step": 1920 }, { "epoch": 0.51, "grad_norm": 3.0, "learning_rate": 2.8913117946523805e-06, "logits/chosen": -2.6942856311798096, "logits/rejected": -2.6358091831207275, "logps/chosen": -478.71759033203125, "logps/rejected": -443.03668212890625, "loss": 0.5989, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.8447883725166321, "rewards/margins": 0.32821527123451233, "rewards/rejected": -1.1730036735534668, "step": 1930 }, { "epoch": 0.51, "grad_norm": 2.609375, "learning_rate": 2.8687327296049126e-06, "logits/chosen": -2.701122760772705, "logits/rejected": -2.682640790939331, "logps/chosen": -484.0836486816406, "logps/rejected": -488.54425048828125, "loss": 0.611, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.8852831125259399, "rewards/margins": 0.296100914478302, "rewards/rejected": -1.1813839673995972, "step": 1940 }, { "epoch": 0.51, "grad_norm": 3.171875, "learning_rate": 2.8461228754806376e-06, "logits/chosen": -2.728684902191162, "logits/rejected": -2.6673216819763184, "logps/chosen": -502.921875, "logps/rejected": -477.1438903808594, "loss": 0.6062, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9080885052680969, "rewards/margins": 0.27347394824028015, "rewards/rejected": -1.1815625429153442, "step": 1950 }, { "epoch": 0.51, "grad_norm": 2.265625, "learning_rate": 2.823484120195865e-06, "logits/chosen": -2.741576671600342, "logits/rejected": -2.6904985904693604, "logps/chosen": -522.7763671875, "logps/rejected": -488.14080810546875, "loss": 0.5742, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9640465974807739, "rewards/margins": 0.36099857091903687, "rewards/rejected": -1.325045108795166, "step": 1960 }, { "epoch": 0.52, "grad_norm": 4.5625, "learning_rate": 2.8008183540801486e-06, "logits/chosen": -2.70404052734375, "logits/rejected": -2.65217924118042, "logps/chosen": -512.2633666992188, "logps/rejected": -466.35870361328125, "loss": 0.6164, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.0254846811294556, "rewards/margins": 0.28917989134788513, "rewards/rejected": -1.3146644830703735, "step": 1970 }, { "epoch": 0.52, "grad_norm": 2.96875, "learning_rate": 2.7781274697184353e-06, "logits/chosen": -2.6516964435577393, "logits/rejected": -2.666243076324463, "logps/chosen": -464.35076904296875, "logps/rejected": -491.3067932128906, "loss": 0.628, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0909736156463623, "rewards/margins": 0.2263956516981125, "rewards/rejected": -1.3173692226409912, "step": 1980 }, { "epoch": 0.52, "grad_norm": 3.640625, "learning_rate": 2.7554133617930397e-06, "logits/chosen": -2.6758196353912354, "logits/rejected": -2.6307640075683594, "logps/chosen": -480.7010803222656, "logps/rejected": -462.5948791503906, "loss": 0.6001, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9626979827880859, "rewards/margins": 0.29944872856140137, "rewards/rejected": -1.2621467113494873, "step": 1990 }, { "epoch": 0.52, "grad_norm": 3.78125, "learning_rate": 2.7326779269254363e-06, "logits/chosen": -2.732978105545044, "logits/rejected": -2.6916584968566895, "logps/chosen": -537.5288696289062, "logps/rejected": -481.4242248535156, "loss": 0.6173, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.0523701906204224, "rewards/margins": 0.30254489183425903, "rewards/rejected": -1.354914903640747, "step": 2000 }, { "epoch": 0.52, "eval_logits/chosen": -2.6580026149749756, "eval_logits/rejected": -2.6232478618621826, "eval_logps/chosen": -511.5793151855469, "eval_logps/rejected": -496.3810119628906, "eval_loss": 0.6120737791061401, "eval_rewards/accuracies": 0.6575000286102295, "eval_rewards/chosen": -1.1535453796386719, "eval_rewards/margins": 0.29342907667160034, "eval_rewards/rejected": -1.4469746351242065, "eval_runtime": 347.7542, "eval_samples_per_second": 5.751, "eval_steps_per_second": 0.719, "step": 2000 }, { "epoch": 0.53, "grad_norm": 3.171875, "learning_rate": 2.7099230635178954e-06, "logits/chosen": -2.6824750900268555, "logits/rejected": -2.6871109008789062, "logps/chosen": -506.978759765625, "logps/rejected": -510.89569091796875, "loss": 0.6114, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.1684012413024902, "rewards/margins": 0.284812867641449, "rewards/rejected": -1.4532140493392944, "step": 2010 }, { "epoch": 0.53, "grad_norm": 4.03125, "learning_rate": 2.6871506715949608e-06, "logits/chosen": -2.732447624206543, "logits/rejected": -2.692403554916382, "logps/chosen": -490.4368591308594, "logps/rejected": -467.0401306152344, "loss": 0.6204, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.0760316848754883, "rewards/margins": 0.24762101471424103, "rewards/rejected": -1.3236526250839233, "step": 2020 }, { "epoch": 0.53, "grad_norm": 3.65625, "learning_rate": 2.6643626526448063e-06, "logits/chosen": -2.7549805641174316, "logits/rejected": -2.7140159606933594, "logps/chosen": -533.2722778320312, "logps/rejected": -497.967041015625, "loss": 0.5817, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9551935195922852, "rewards/margins": 0.3695451319217682, "rewards/rejected": -1.324738621711731, "step": 2030 }, { "epoch": 0.53, "grad_norm": 2.703125, "learning_rate": 2.6415609094604562e-06, "logits/chosen": -2.7105820178985596, "logits/rejected": -2.7078709602355957, "logps/chosen": -511.4810485839844, "logps/rejected": -498.16705322265625, "loss": 0.6262, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.0229012966156006, "rewards/margins": 0.2677188515663147, "rewards/rejected": -1.290619969367981, "step": 2040 }, { "epoch": 0.54, "grad_norm": 2.984375, "learning_rate": 2.618747345980904e-06, "logits/chosen": -2.7516863346099854, "logits/rejected": -2.6846871376037598, "logps/chosen": -489.28924560546875, "logps/rejected": -436.2731018066406, "loss": 0.6068, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1246410608291626, "rewards/margins": 0.29066139459609985, "rewards/rejected": -1.4153025150299072, "step": 2050 }, { "epoch": 0.54, "grad_norm": 3.8125, "learning_rate": 2.595923867132136e-06, "logits/chosen": -2.744058132171631, "logits/rejected": -2.71818208694458, "logps/chosen": -528.1295166015625, "logps/rejected": -513.4764404296875, "loss": 0.604, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.1548454761505127, "rewards/margins": 0.3176589012145996, "rewards/rejected": -1.4725043773651123, "step": 2060 }, { "epoch": 0.54, "grad_norm": 3.640625, "learning_rate": 2.5730923786680672e-06, "logits/chosen": -2.7094826698303223, "logits/rejected": -2.7104077339172363, "logps/chosen": -496.2884826660156, "logps/rejected": -523.0441284179688, "loss": 0.629, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.2200782299041748, "rewards/margins": 0.2470276653766632, "rewards/rejected": -1.4671061038970947, "step": 2070 }, { "epoch": 0.54, "grad_norm": 2.90625, "learning_rate": 2.5502547870114137e-06, "logits/chosen": -2.7130637168884277, "logits/rejected": -2.6693716049194336, "logps/chosen": -506.1182556152344, "logps/rejected": -476.56866455078125, "loss": 0.6481, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.2350577116012573, "rewards/margins": 0.21940436959266663, "rewards/rejected": -1.4544621706008911, "step": 2080 }, { "epoch": 0.55, "grad_norm": 3.328125, "learning_rate": 2.527412999094507e-06, "logits/chosen": -2.6736958026885986, "logits/rejected": -2.632709264755249, "logps/chosen": -545.9395751953125, "logps/rejected": -542.23583984375, "loss": 0.6028, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.1226942539215088, "rewards/margins": 0.33349329233169556, "rewards/rejected": -1.4561874866485596, "step": 2090 }, { "epoch": 0.55, "grad_norm": 5.34375, "learning_rate": 2.504568922200064e-06, "logits/chosen": -2.6629996299743652, "logits/rejected": -2.6413462162017822, "logps/chosen": -490.80059814453125, "logps/rejected": -472.08319091796875, "loss": 0.62, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.1775743961334229, "rewards/margins": 0.25760284066200256, "rewards/rejected": -1.4351773262023926, "step": 2100 }, { "epoch": 0.55, "eval_logits/chosen": -2.662881851196289, "eval_logits/rejected": -2.627786159515381, "eval_logps/chosen": -512.2247314453125, "eval_logps/rejected": -496.9116516113281, "eval_loss": 0.6116329431533813, "eval_rewards/accuracies": 0.6650000214576721, "eval_rewards/chosen": -1.1599992513656616, "eval_rewards/margins": 0.29228222370147705, "eval_rewards/rejected": -1.4522814750671387, "eval_runtime": 347.7217, "eval_samples_per_second": 5.752, "eval_steps_per_second": 0.719, "step": 2100 }, { "epoch": 0.55, "grad_norm": 3.40625, "learning_rate": 2.4817244638019333e-06, "logits/chosen": -2.703998565673828, "logits/rejected": -2.6619462966918945, "logps/chosen": -514.3597412109375, "logps/rejected": -477.54974365234375, "loss": 0.6243, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.1271086931228638, "rewards/margins": 0.2521311044692993, "rewards/rejected": -1.379239797592163, "step": 2110 }, { "epoch": 0.55, "grad_norm": 3.125, "learning_rate": 2.4588815314058155e-06, "logits/chosen": -2.6903960704803467, "logits/rejected": -2.680983304977417, "logps/chosen": -468.71826171875, "logps/rejected": -445.9656677246094, "loss": 0.6159, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.9441890716552734, "rewards/margins": 0.2759680449962616, "rewards/rejected": -1.2201570272445679, "step": 2120 }, { "epoch": 0.56, "grad_norm": 3.34375, "learning_rate": 2.4360420323899922e-06, "logits/chosen": -2.6948788166046143, "logits/rejected": -2.668149471282959, "logps/chosen": -500.23248291015625, "logps/rejected": -478.42437744140625, "loss": 0.6166, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.892300009727478, "rewards/margins": 0.28213489055633545, "rewards/rejected": -1.174435019493103, "step": 2130 }, { "epoch": 0.56, "grad_norm": 3.109375, "learning_rate": 2.4132078738460585e-06, "logits/chosen": -2.7459912300109863, "logits/rejected": -2.701768398284912, "logps/chosen": -485.0846252441406, "logps/rejected": -448.44140625, "loss": 0.5966, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.9321788549423218, "rewards/margins": 0.30308184027671814, "rewards/rejected": -1.2352608442306519, "step": 2140 }, { "epoch": 0.56, "grad_norm": 3.40625, "learning_rate": 2.3903809624196826e-06, "logits/chosen": -2.7058706283569336, "logits/rejected": -2.6738340854644775, "logps/chosen": -463.795654296875, "logps/rejected": -433.58721923828125, "loss": 0.6387, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.0220860242843628, "rewards/margins": 0.22584767639636993, "rewards/rejected": -1.2479338645935059, "step": 2150 }, { "epoch": 0.57, "grad_norm": 5.0, "learning_rate": 2.3675632041513978e-06, "logits/chosen": -2.757347583770752, "logits/rejected": -2.6829416751861572, "logps/chosen": -521.1004638671875, "logps/rejected": -448.514404296875, "loss": 0.5971, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9433916211128235, "rewards/margins": 0.3374863266944885, "rewards/rejected": -1.2808778285980225, "step": 2160 }, { "epoch": 0.57, "grad_norm": 3.21875, "learning_rate": 2.3447565043174533e-06, "logits/chosen": -2.7074646949768066, "logits/rejected": -2.6540534496307373, "logps/chosen": -489.44866943359375, "logps/rejected": -449.8168029785156, "loss": 0.6124, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.0346986055374146, "rewards/margins": 0.2758663594722748, "rewards/rejected": -1.3105649948120117, "step": 2170 }, { "epoch": 0.57, "grad_norm": 4.5625, "learning_rate": 2.321962767270724e-06, "logits/chosen": -2.69421124458313, "logits/rejected": -2.6716065406799316, "logps/chosen": -481.97833251953125, "logps/rejected": -441.35443115234375, "loss": 0.614, "rewards/accuracies": 0.65625, "rewards/chosen": -1.0301321744918823, "rewards/margins": 0.2506099343299866, "rewards/rejected": -1.2807420492172241, "step": 2180 }, { "epoch": 0.57, "grad_norm": 3.171875, "learning_rate": 2.299183896281692e-06, "logits/chosen": -2.66270112991333, "logits/rejected": -2.640023946762085, "logps/chosen": -484.82354736328125, "logps/rejected": -490.209716796875, "loss": 0.642, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.0577830076217651, "rewards/margins": 0.22141680121421814, "rewards/rejected": -1.2791998386383057, "step": 2190 }, { "epoch": 0.58, "grad_norm": 4.375, "learning_rate": 2.2764217933795297e-06, "logits/chosen": -2.7031655311584473, "logits/rejected": -2.675705909729004, "logps/chosen": -489.45269775390625, "logps/rejected": -480.9483337402344, "loss": 0.5957, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.9436511993408203, "rewards/margins": 0.3099278509616852, "rewards/rejected": -1.2535789012908936, "step": 2200 }, { "epoch": 0.58, "eval_logits/chosen": -2.6674294471740723, "eval_logits/rejected": -2.631662130355835, "eval_logps/chosen": -492.14892578125, "eval_logps/rejected": -475.9957580566406, "eval_loss": 0.6131682991981506, "eval_rewards/accuracies": 0.6654999852180481, "eval_rewards/chosen": -0.959242045879364, "eval_rewards/margins": 0.28388017416000366, "eval_rewards/rejected": -1.2431222200393677, "eval_runtime": 347.6695, "eval_samples_per_second": 5.753, "eval_steps_per_second": 0.719, "step": 2200 }, { "epoch": 0.58, "grad_norm": 3.890625, "learning_rate": 2.2536783591932786e-06, "logits/chosen": -2.764758586883545, "logits/rejected": -2.708240032196045, "logps/chosen": -508.2550354003906, "logps/rejected": -496.92022705078125, "loss": 0.6149, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.9579612612724304, "rewards/margins": 0.28434932231903076, "rewards/rejected": -1.242310643196106, "step": 2210 }, { "epoch": 0.58, "grad_norm": 2.96875, "learning_rate": 2.230955492793149e-06, "logits/chosen": -2.628192901611328, "logits/rejected": -2.6235828399658203, "logps/chosen": -510.7039489746094, "logps/rejected": -509.13592529296875, "loss": 0.6373, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.038236141204834, "rewards/margins": 0.2541654407978058, "rewards/rejected": -1.2924015522003174, "step": 2220 }, { "epoch": 0.58, "grad_norm": 3.65625, "learning_rate": 2.208255091531947e-06, "logits/chosen": -2.65356707572937, "logits/rejected": -2.639221668243408, "logps/chosen": -514.1550903320312, "logps/rejected": -486.2848205566406, "loss": 0.6169, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.0424216985702515, "rewards/margins": 0.28543582558631897, "rewards/rejected": -1.3278576135635376, "step": 2230 }, { "epoch": 0.59, "grad_norm": 3.421875, "learning_rate": 2.1855790508866435e-06, "logits/chosen": -2.676987409591675, "logits/rejected": -2.6617724895477295, "logps/chosen": -531.5819091796875, "logps/rejected": -520.0554809570312, "loss": 0.6348, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.0140860080718994, "rewards/margins": 0.2539765238761902, "rewards/rejected": -1.2680623531341553, "step": 2240 }, { "epoch": 0.59, "grad_norm": 2.71875, "learning_rate": 2.162929264300107e-06, "logits/chosen": -2.7012898921966553, "logits/rejected": -2.684084415435791, "logps/chosen": -495.6434631347656, "logps/rejected": -480.5547790527344, "loss": 0.58, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.9610779881477356, "rewards/margins": 0.3606341779232025, "rewards/rejected": -1.3217121362686157, "step": 2250 }, { "epoch": 0.59, "grad_norm": 4.25, "learning_rate": 2.1403076230230006e-06, "logits/chosen": -2.6837801933288574, "logits/rejected": -2.6432583332061768, "logps/chosen": -504.1858825683594, "logps/rejected": -483.0741271972656, "loss": 0.6411, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.0147348642349243, "rewards/margins": 0.23686587810516357, "rewards/rejected": -1.2516006231307983, "step": 2260 }, { "epoch": 0.59, "grad_norm": 3.78125, "learning_rate": 2.11771601595586e-06, "logits/chosen": -2.679379940032959, "logits/rejected": -2.6332995891571045, "logps/chosen": -515.0089721679688, "logps/rejected": -473.6858825683594, "loss": 0.6223, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.9973223805427551, "rewards/margins": 0.3110753893852234, "rewards/rejected": -1.3083977699279785, "step": 2270 }, { "epoch": 0.6, "grad_norm": 2.953125, "learning_rate": 2.0951563294913737e-06, "logits/chosen": -2.688920497894287, "logits/rejected": -2.629225492477417, "logps/chosen": -494.71026611328125, "logps/rejected": -466.7489318847656, "loss": 0.575, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.9701055288314819, "rewards/margins": 0.36299929022789, "rewards/rejected": -1.3331048488616943, "step": 2280 }, { "epoch": 0.6, "grad_norm": 3.0, "learning_rate": 2.0726304473568693e-06, "logits/chosen": -2.679488182067871, "logits/rejected": -2.660877227783203, "logps/chosen": -480.1832580566406, "logps/rejected": -459.4652404785156, "loss": 0.5981, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.9950377345085144, "rewards/margins": 0.2864169776439667, "rewards/rejected": -1.2814548015594482, "step": 2290 }, { "epoch": 0.6, "grad_norm": 3.828125, "learning_rate": 2.050140250457023e-06, "logits/chosen": -2.7407097816467285, "logits/rejected": -2.6703708171844482, "logps/chosen": -514.2717895507812, "logps/rejected": -493.68157958984375, "loss": 0.6093, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.067603588104248, "rewards/margins": 0.3232964277267456, "rewards/rejected": -1.390899896621704, "step": 2300 }, { "epoch": 0.6, "eval_logits/chosen": -2.661860704421997, "eval_logits/rejected": -2.6282894611358643, "eval_logps/chosen": -505.5737609863281, "eval_logps/rejected": -489.7906494140625, "eval_loss": 0.6137638092041016, "eval_rewards/accuracies": 0.6625000238418579, "eval_rewards/chosen": -1.0934895277023315, "eval_rewards/margins": 0.2875814139842987, "eval_rewards/rejected": -1.3810709714889526, "eval_runtime": 348.0995, "eval_samples_per_second": 5.745, "eval_steps_per_second": 0.718, "step": 2300 }, { "epoch": 0.6, "grad_norm": 3.4375, "learning_rate": 2.0276876167168042e-06, "logits/chosen": -2.611936092376709, "logits/rejected": -2.5979480743408203, "logps/chosen": -461.7267150878906, "logps/rejected": -432.57012939453125, "loss": 0.6371, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.189932107925415, "rewards/margins": 0.21743163466453552, "rewards/rejected": -1.4073638916015625, "step": 2310 }, { "epoch": 0.61, "grad_norm": 4.75, "learning_rate": 2.0052744209246682e-06, "logits/chosen": -2.705918788909912, "logits/rejected": -2.672658920288086, "logps/chosen": -493.9043884277344, "logps/rejected": -467.73699951171875, "loss": 0.629, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.0233014822006226, "rewards/margins": 0.26420658826828003, "rewards/rejected": -1.2875080108642578, "step": 2320 }, { "epoch": 0.61, "grad_norm": 4.8125, "learning_rate": 1.9829025345760127e-06, "logits/chosen": -2.700793743133545, "logits/rejected": -2.6994223594665527, "logps/chosen": -514.3643798828125, "logps/rejected": -513.8361206054688, "loss": 0.6417, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.9839721918106079, "rewards/margins": 0.20725660026073456, "rewards/rejected": -1.1912287473678589, "step": 2330 }, { "epoch": 0.61, "grad_norm": 4.53125, "learning_rate": 1.9605738257169115e-06, "logits/chosen": -2.6701126098632812, "logits/rejected": -2.639960289001465, "logps/chosen": -459.0147399902344, "logps/rejected": -447.525634765625, "loss": 0.6503, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.0298200845718384, "rewards/margins": 0.19326387345790863, "rewards/rejected": -1.2230839729309082, "step": 2340 }, { "epoch": 0.62, "grad_norm": 4.34375, "learning_rate": 1.9382901587881275e-06, "logits/chosen": -2.7172811031341553, "logits/rejected": -2.704822063446045, "logps/chosen": -486.2060546875, "logps/rejected": -455.1549377441406, "loss": 0.6126, "rewards/accuracies": 0.65625, "rewards/chosen": -0.9620019197463989, "rewards/margins": 0.29844018816947937, "rewards/rejected": -1.2604421377182007, "step": 2350 }, { "epoch": 0.62, "grad_norm": 2.84375, "learning_rate": 1.916053394469437e-06, "logits/chosen": -2.7060694694519043, "logits/rejected": -2.650001049041748, "logps/chosen": -501.8260803222656, "logps/rejected": -487.3619079589844, "loss": 0.586, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9851962327957153, "rewards/margins": 0.33760061860084534, "rewards/rejected": -1.3227968215942383, "step": 2360 }, { "epoch": 0.62, "grad_norm": 3.125, "learning_rate": 1.8938653895242604e-06, "logits/chosen": -2.707420825958252, "logits/rejected": -2.6578211784362793, "logps/chosen": -503.98114013671875, "logps/rejected": -489.624755859375, "loss": 0.5859, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.100565791130066, "rewards/margins": 0.3875434398651123, "rewards/rejected": -1.4881092309951782, "step": 2370 }, { "epoch": 0.62, "grad_norm": 5.4375, "learning_rate": 1.8717279966446267e-06, "logits/chosen": -2.6093525886535645, "logits/rejected": -2.603379726409912, "logps/chosen": -481.91998291015625, "logps/rejected": -481.68890380859375, "loss": 0.6485, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.1177568435668945, "rewards/margins": 0.2344493865966797, "rewards/rejected": -1.3522062301635742, "step": 2380 }, { "epoch": 0.63, "grad_norm": 3.234375, "learning_rate": 1.8496430642964698e-06, "logits/chosen": -2.685995101928711, "logits/rejected": -2.6452722549438477, "logps/chosen": -505.78961181640625, "logps/rejected": -489.805419921875, "loss": 0.6215, "rewards/accuracies": 0.65625, "rewards/chosen": -1.0744038820266724, "rewards/margins": 0.29028916358947754, "rewards/rejected": -1.3646929264068604, "step": 2390 }, { "epoch": 0.63, "grad_norm": 3.828125, "learning_rate": 1.827612436565286e-06, "logits/chosen": -2.692858934402466, "logits/rejected": -2.651174306869507, "logps/chosen": -500.21099853515625, "logps/rejected": -488.8199157714844, "loss": 0.6009, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.0547789335250854, "rewards/margins": 0.330585777759552, "rewards/rejected": -1.3853647708892822, "step": 2400 }, { "epoch": 0.63, "eval_logits/chosen": -2.6432461738586426, "eval_logits/rejected": -2.608781337738037, "eval_logps/chosen": -501.4175109863281, "eval_logps/rejected": -486.4694519042969, "eval_loss": 0.6107898950576782, "eval_rewards/accuracies": 0.6610000133514404, "eval_rewards/chosen": -1.051926612854004, "eval_rewards/margins": 0.2959325611591339, "eval_rewards/rejected": -1.3478593826293945, "eval_runtime": 347.9473, "eval_samples_per_second": 5.748, "eval_steps_per_second": 0.718, "step": 2400 }, { "epoch": 0.63, "grad_norm": 4.21875, "learning_rate": 1.8056379530021492e-06, "logits/chosen": -2.7207000255584717, "logits/rejected": -2.6985878944396973, "logps/chosen": -473.2940979003906, "logps/rejected": -458.9827575683594, "loss": 0.6073, "rewards/accuracies": 0.65625, "rewards/chosen": -1.0358033180236816, "rewards/margins": 0.3086177110671997, "rewards/rejected": -1.344421148300171, "step": 2410 }, { "epoch": 0.63, "grad_norm": 5.0, "learning_rate": 1.7837214484701154e-06, "logits/chosen": -2.704127311706543, "logits/rejected": -2.6785435676574707, "logps/chosen": -496.7994689941406, "logps/rejected": -474.5835876464844, "loss": 0.5987, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.0449730157852173, "rewards/margins": 0.3369174897670746, "rewards/rejected": -1.3818905353546143, "step": 2420 }, { "epoch": 0.64, "grad_norm": 5.1875, "learning_rate": 1.7618647529910043e-06, "logits/chosen": -2.6866023540496826, "logits/rejected": -2.659691095352173, "logps/chosen": -503.9330139160156, "logps/rejected": -494.0077209472656, "loss": 0.6131, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.1332998275756836, "rewards/margins": 0.2737593650817871, "rewards/rejected": -1.4070589542388916, "step": 2430 }, { "epoch": 0.64, "grad_norm": 3.390625, "learning_rate": 1.7400696915925996e-06, "logits/chosen": -2.696007251739502, "logits/rejected": -2.6478171348571777, "logps/chosen": -518.2098999023438, "logps/rejected": -465.0953063964844, "loss": 0.6265, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.170287013053894, "rewards/margins": 0.31294775009155273, "rewards/rejected": -1.4832347631454468, "step": 2440 }, { "epoch": 0.64, "grad_norm": 4.625, "learning_rate": 1.718338084156254e-06, "logits/chosen": -2.61708927154541, "logits/rejected": -2.5760369300842285, "logps/chosen": -529.7711181640625, "logps/rejected": -493.506591796875, "loss": 0.6125, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.097019076347351, "rewards/margins": 0.28592607378959656, "rewards/rejected": -1.3829452991485596, "step": 2450 }, { "epoch": 0.64, "grad_norm": 2.765625, "learning_rate": 1.6966717452649372e-06, "logits/chosen": -2.737034320831299, "logits/rejected": -2.706390857696533, "logps/chosen": -520.4344482421875, "logps/rejected": -464.23492431640625, "loss": 0.5894, "rewards/accuracies": 0.6875, "rewards/chosen": -1.011051058769226, "rewards/margins": 0.3496933877468109, "rewards/rejected": -1.360744595527649, "step": 2460 }, { "epoch": 0.65, "grad_norm": 3.25, "learning_rate": 1.6750724840517103e-06, "logits/chosen": -2.6961193084716797, "logits/rejected": -2.677546501159668, "logps/chosen": -491.8929748535156, "logps/rejected": -502.51959228515625, "loss": 0.6301, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.006574273109436, "rewards/margins": 0.23815563321113586, "rewards/rejected": -1.244729995727539, "step": 2470 }, { "epoch": 0.65, "grad_norm": 4.75, "learning_rate": 1.6535421040486686e-06, "logits/chosen": -2.604997158050537, "logits/rejected": -2.5811421871185303, "logps/chosen": -477.0950622558594, "logps/rejected": -455.02410888671875, "loss": 0.5967, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0661613941192627, "rewards/margins": 0.30554550886154175, "rewards/rejected": -1.3717070817947388, "step": 2480 }, { "epoch": 0.65, "grad_norm": 3.234375, "learning_rate": 1.6320824030363458e-06, "logits/chosen": -2.6581666469573975, "logits/rejected": -2.642603874206543, "logps/chosen": -471.28521728515625, "logps/rejected": -457.45794677734375, "loss": 0.6281, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.046363115310669, "rewards/margins": 0.2672274708747864, "rewards/rejected": -1.3135906457901, "step": 2490 }, { "epoch": 0.65, "grad_norm": 4.125, "learning_rate": 1.6106951728936028e-06, "logits/chosen": -2.718479633331299, "logits/rejected": -2.663071870803833, "logps/chosen": -489.6497497558594, "logps/rejected": -492.8302307128906, "loss": 0.5988, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.0412070751190186, "rewards/margins": 0.3211270868778229, "rewards/rejected": -1.3623343706130981, "step": 2500 }, { "epoch": 0.65, "eval_logits/chosen": -2.647676706314087, "eval_logits/rejected": -2.6143462657928467, "eval_logps/chosen": -500.49822998046875, "eval_logps/rejected": -485.87298583984375, "eval_loss": 0.6108289957046509, "eval_rewards/accuracies": 0.6589999794960022, "eval_rewards/chosen": -1.0427342653274536, "eval_rewards/margins": 0.299160897731781, "eval_rewards/rejected": -1.3418951034545898, "eval_runtime": 347.8052, "eval_samples_per_second": 5.75, "eval_steps_per_second": 0.719, "step": 2500 }, { "epoch": 0.66, "grad_norm": 3.59375, "learning_rate": 1.5893821994479996e-06, "logits/chosen": -2.711664915084839, "logits/rejected": -2.6842312812805176, "logps/chosen": -523.2516479492188, "logps/rejected": -486.0882873535156, "loss": 0.6146, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.0051112174987793, "rewards/margins": 0.2787768244743347, "rewards/rejected": -1.2838881015777588, "step": 2510 }, { "epoch": 0.66, "grad_norm": 3.03125, "learning_rate": 1.5681452623266868e-06, "logits/chosen": -2.6525261402130127, "logits/rejected": -2.5945048332214355, "logps/chosen": -518.8192138671875, "logps/rejected": -473.05615234375, "loss": 0.551, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.9474281072616577, "rewards/margins": 0.451382577419281, "rewards/rejected": -1.398810625076294, "step": 2520 }, { "epoch": 0.66, "grad_norm": 5.03125, "learning_rate": 1.5469861348078014e-06, "logits/chosen": -2.678679943084717, "logits/rejected": -2.631782054901123, "logps/chosen": -477.3982849121094, "logps/rejected": -477.5099182128906, "loss": 0.6084, "rewards/accuracies": 0.65625, "rewards/chosen": -1.0620421171188354, "rewards/margins": 0.28218984603881836, "rewards/rejected": -1.3442319631576538, "step": 2530 }, { "epoch": 0.66, "grad_norm": 3.265625, "learning_rate": 1.5259065836724035e-06, "logits/chosen": -2.6359200477600098, "logits/rejected": -2.611725330352783, "logps/chosen": -475.1856994628906, "logps/rejected": -478.42254638671875, "loss": 0.6265, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.0813992023468018, "rewards/margins": 0.27633678913116455, "rewards/rejected": -1.3577358722686768, "step": 2540 }, { "epoch": 0.67, "grad_norm": 5.1875, "learning_rate": 1.5049083690569456e-06, "logits/chosen": -2.6500914096832275, "logits/rejected": -2.6187453269958496, "logps/chosen": -469.1063537597656, "logps/rejected": -479.3811950683594, "loss": 0.6122, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.0489628314971924, "rewards/margins": 0.3200768828392029, "rewards/rejected": -1.36903977394104, "step": 2550 }, { "epoch": 0.67, "grad_norm": 4.46875, "learning_rate": 1.4839932443063057e-06, "logits/chosen": -2.6609740257263184, "logits/rejected": -2.613036632537842, "logps/chosen": -528.0738525390625, "logps/rejected": -479.12847900390625, "loss": 0.5824, "rewards/accuracies": 0.71875, "rewards/chosen": -1.0216680765151978, "rewards/margins": 0.3447554111480713, "rewards/rejected": -1.3664233684539795, "step": 2560 }, { "epoch": 0.67, "grad_norm": 3.875, "learning_rate": 1.4631629558273803e-06, "logits/chosen": -2.6330864429473877, "logits/rejected": -2.6078662872314453, "logps/chosen": -480.5138244628906, "logps/rejected": -466.89813232421875, "loss": 0.6461, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.1947067975997925, "rewards/margins": 0.2124728262424469, "rewards/rejected": -1.4071797132492065, "step": 2570 }, { "epoch": 0.68, "grad_norm": 3.546875, "learning_rate": 1.4424192429432657e-06, "logits/chosen": -2.6624321937561035, "logits/rejected": -2.634147882461548, "logps/chosen": -490.93731689453125, "logps/rejected": -508.0350646972656, "loss": 0.6106, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.0125614404678345, "rewards/margins": 0.31729286909103394, "rewards/rejected": -1.3298542499542236, "step": 2580 }, { "epoch": 0.68, "grad_norm": 4.25, "learning_rate": 1.421763837748016e-06, "logits/chosen": -2.6459295749664307, "logits/rejected": -2.632132053375244, "logps/chosen": -476.1249084472656, "logps/rejected": -480.6978454589844, "loss": 0.601, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0420411825180054, "rewards/margins": 0.32264775037765503, "rewards/rejected": -1.3646891117095947, "step": 2590 }, { "epoch": 0.68, "grad_norm": 4.1875, "learning_rate": 1.401198464962021e-06, "logits/chosen": -2.6568338871002197, "logits/rejected": -2.613276481628418, "logps/chosen": -504.14825439453125, "logps/rejected": -478.09832763671875, "loss": 0.606, "rewards/accuracies": 0.625, "rewards/chosen": -1.0711807012557983, "rewards/margins": 0.307957261800766, "rewards/rejected": -1.3791382312774658, "step": 2600 }, { "epoch": 0.68, "eval_logits/chosen": -2.6304023265838623, "eval_logits/rejected": -2.5973665714263916, "eval_logps/chosen": -498.1077575683594, "eval_logps/rejected": -483.6012878417969, "eval_loss": 0.611174464225769, "eval_rewards/accuracies": 0.6545000076293945, "eval_rewards/chosen": -1.0188294649124146, "eval_rewards/margins": 0.30034834146499634, "eval_rewards/rejected": -1.3191777467727661, "eval_runtime": 347.7844, "eval_samples_per_second": 5.751, "eval_steps_per_second": 0.719, "step": 2600 }, { "epoch": 0.68, "grad_norm": 2.921875, "learning_rate": 1.3807248417879896e-06, "logits/chosen": -2.691493034362793, "logits/rejected": -2.6732335090637207, "logps/chosen": -509.51202392578125, "logps/rejected": -488.71826171875, "loss": 0.5973, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9540036916732788, "rewards/margins": 0.33757534623146057, "rewards/rejected": -1.2915791273117065, "step": 2610 }, { "epoch": 0.69, "grad_norm": 4.8125, "learning_rate": 1.3603446777675665e-06, "logits/chosen": -2.6027607917785645, "logits/rejected": -2.576066255569458, "logps/chosen": -490.2715759277344, "logps/rejected": -477.48992919921875, "loss": 0.6103, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.0267969369888306, "rewards/margins": 0.3205047845840454, "rewards/rejected": -1.347301721572876, "step": 2620 }, { "epoch": 0.69, "grad_norm": 3.15625, "learning_rate": 1.3400596746385817e-06, "logits/chosen": -2.6897635459899902, "logits/rejected": -2.6353797912597656, "logps/chosen": -514.4585571289062, "logps/rejected": -483.48565673828125, "loss": 0.6332, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.041078805923462, "rewards/margins": 0.2664690613746643, "rewards/rejected": -1.3075478076934814, "step": 2630 }, { "epoch": 0.69, "grad_norm": 3.8125, "learning_rate": 1.3198715261929587e-06, "logits/chosen": -2.7139453887939453, "logits/rejected": -2.6701016426086426, "logps/chosen": -471.3026428222656, "logps/rejected": -466.50689697265625, "loss": 0.5969, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.0769729614257812, "rewards/margins": 0.31179046630859375, "rewards/rejected": -1.3887633085250854, "step": 2640 }, { "epoch": 0.69, "grad_norm": 4.15625, "learning_rate": 1.2997819181352823e-06, "logits/chosen": -2.697122573852539, "logits/rejected": -2.6447901725769043, "logps/chosen": -536.4827270507812, "logps/rejected": -519.1363525390625, "loss": 0.6039, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.9090393781661987, "rewards/margins": 0.34489864110946655, "rewards/rejected": -1.25393807888031, "step": 2650 }, { "epoch": 0.7, "grad_norm": 6.28125, "learning_rate": 1.2797925279420454e-06, "logits/chosen": -2.68558931350708, "logits/rejected": -2.6322624683380127, "logps/chosen": -508.85711669921875, "logps/rejected": -504.25762939453125, "loss": 0.5892, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.9868243932723999, "rewards/margins": 0.371315598487854, "rewards/rejected": -1.358140230178833, "step": 2660 }, { "epoch": 0.7, "grad_norm": 4.5625, "learning_rate": 1.2599050247215764e-06, "logits/chosen": -2.6563174724578857, "logits/rejected": -2.6325182914733887, "logps/chosen": -482.51422119140625, "logps/rejected": -471.16632080078125, "loss": 0.5774, "rewards/accuracies": 0.75, "rewards/chosen": -1.011075496673584, "rewards/margins": 0.36485710740089417, "rewards/rejected": -1.3759326934814453, "step": 2670 }, { "epoch": 0.7, "grad_norm": 3.8125, "learning_rate": 1.2401210690746705e-06, "logits/chosen": -2.649975299835205, "logits/rejected": -2.6116244792938232, "logps/chosen": -506.97186279296875, "logps/rejected": -472.00714111328125, "loss": 0.6322, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.0412980318069458, "rewards/margins": 0.2263387143611908, "rewards/rejected": -1.2676366567611694, "step": 2680 }, { "epoch": 0.7, "grad_norm": 4.15625, "learning_rate": 1.2204423129559306e-06, "logits/chosen": -2.704833984375, "logits/rejected": -2.6896462440490723, "logps/chosen": -495.80877685546875, "logps/rejected": -516.6968994140625, "loss": 0.5828, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.0221621990203857, "rewards/margins": 0.38270777463912964, "rewards/rejected": -1.404869794845581, "step": 2690 }, { "epoch": 0.71, "grad_norm": 5.09375, "learning_rate": 1.20087039953583e-06, "logits/chosen": -2.675306558609009, "logits/rejected": -2.650527000427246, "logps/chosen": -492.5806579589844, "logps/rejected": -478.58258056640625, "loss": 0.6118, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0126721858978271, "rewards/margins": 0.3190918564796448, "rewards/rejected": -1.3317642211914062, "step": 2700 }, { "epoch": 0.71, "eval_logits/chosen": -2.627389669418335, "eval_logits/rejected": -2.594527244567871, "eval_logps/chosen": -504.3044738769531, "eval_logps/rejected": -490.2562255859375, "eval_loss": 0.6105741262435913, "eval_rewards/accuracies": 0.659500002861023, "eval_rewards/chosen": -1.0807968378067017, "eval_rewards/margins": 0.3049302399158478, "eval_rewards/rejected": -1.385727047920227, "eval_runtime": 347.7409, "eval_samples_per_second": 5.751, "eval_steps_per_second": 0.719, "step": 2700 }, { "epoch": 0.71, "grad_norm": 3.90625, "learning_rate": 1.181406963063507e-06, "logits/chosen": -2.6303482055664062, "logits/rejected": -2.6323742866516113, "logps/chosen": -503.49786376953125, "logps/rejected": -513.8046875, "loss": 0.6299, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.0185651779174805, "rewards/margins": 0.29084575176239014, "rewards/rejected": -1.3094110488891602, "step": 2710 }, { "epoch": 0.71, "grad_norm": 7.09375, "learning_rate": 1.1620536287303052e-06, "logits/chosen": -2.663734197616577, "logits/rejected": -2.6387317180633545, "logps/chosen": -552.3123779296875, "logps/rejected": -516.0528564453125, "loss": 0.6352, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.1312639713287354, "rewards/margins": 0.2474222630262375, "rewards/rejected": -1.3786863088607788, "step": 2720 }, { "epoch": 0.71, "grad_norm": 3.53125, "learning_rate": 1.1428120125340717e-06, "logits/chosen": -2.66949725151062, "logits/rejected": -2.630190849304199, "logps/chosen": -495.2144470214844, "logps/rejected": -458.618408203125, "loss": 0.5847, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1124293804168701, "rewards/margins": 0.3431801199913025, "rewards/rejected": -1.4556094408035278, "step": 2730 }, { "epoch": 0.72, "grad_norm": 4.75, "learning_rate": 1.123683721144223e-06, "logits/chosen": -2.6718780994415283, "logits/rejected": -2.6413564682006836, "logps/chosen": -537.8629150390625, "logps/rejected": -509.77056884765625, "loss": 0.631, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.1508649587631226, "rewards/margins": 0.2626705765724182, "rewards/rejected": -1.4135355949401855, "step": 2740 }, { "epoch": 0.72, "grad_norm": 4.40625, "learning_rate": 1.1046703517675848e-06, "logits/chosen": -2.6759867668151855, "logits/rejected": -2.66206955909729, "logps/chosen": -496.76458740234375, "logps/rejected": -523.6871948242188, "loss": 0.6281, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.131340503692627, "rewards/margins": 0.26272568106651306, "rewards/rejected": -1.3940664529800415, "step": 2750 }, { "epoch": 0.72, "grad_norm": 5.03125, "learning_rate": 1.085773492015028e-06, "logits/chosen": -2.6411399841308594, "logits/rejected": -2.598507881164551, "logps/chosen": -491.46722412109375, "logps/rejected": -468.1087951660156, "loss": 0.5897, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.1277272701263428, "rewards/margins": 0.36656227707862854, "rewards/rejected": -1.494289517402649, "step": 2760 }, { "epoch": 0.72, "grad_norm": 4.96875, "learning_rate": 1.0669947197689034e-06, "logits/chosen": -2.6684410572052, "logits/rejected": -2.619168996810913, "logps/chosen": -516.2086181640625, "logps/rejected": -494.1477966308594, "loss": 0.608, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.11739182472229, "rewards/margins": 0.31762081384658813, "rewards/rejected": -1.4350125789642334, "step": 2770 }, { "epoch": 0.73, "grad_norm": 3.765625, "learning_rate": 1.048335603051291e-06, "logits/chosen": -2.632700204849243, "logits/rejected": -2.6073365211486816, "logps/chosen": -529.2457885742188, "logps/rejected": -526.523681640625, "loss": 0.5543, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.0624479055404663, "rewards/margins": 0.49154725670814514, "rewards/rejected": -1.5539953708648682, "step": 2780 }, { "epoch": 0.73, "grad_norm": 4.46875, "learning_rate": 1.0297976998930665e-06, "logits/chosen": -2.6516880989074707, "logits/rejected": -2.6374242305755615, "logps/chosen": -508.30126953125, "logps/rejected": -489.03680419921875, "loss": 0.5965, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.1453887224197388, "rewards/margins": 0.35155534744262695, "rewards/rejected": -1.4969440698623657, "step": 2790 }, { "epoch": 0.73, "grad_norm": 4.0, "learning_rate": 1.0113825582038078e-06, "logits/chosen": -2.6636240482330322, "logits/rejected": -2.643951654434204, "logps/chosen": -511.94073486328125, "logps/rejected": -502.1498107910156, "loss": 0.6134, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.1842275857925415, "rewards/margins": 0.28109192848205566, "rewards/rejected": -1.4653196334838867, "step": 2800 }, { "epoch": 0.73, "eval_logits/chosen": -2.6303441524505615, "eval_logits/rejected": -2.5978312492370605, "eval_logps/chosen": -511.7178955078125, "eval_logps/rejected": -498.03662109375, "eval_loss": 0.6096385717391968, "eval_rewards/accuracies": 0.6585000157356262, "eval_rewards/chosen": -1.1549309492111206, "eval_rewards/margins": 0.3085997700691223, "eval_rewards/rejected": -1.4635308980941772, "eval_runtime": 347.7207, "eval_samples_per_second": 5.752, "eval_steps_per_second": 0.719, "step": 2800 }, { "epoch": 0.74, "grad_norm": 3.15625, "learning_rate": 9.930917156425477e-07, "logits/chosen": -2.677860736846924, "logits/rejected": -2.6445260047912598, "logps/chosen": -506.71746826171875, "logps/rejected": -513.6080932617188, "loss": 0.6076, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.1613363027572632, "rewards/margins": 0.32913991808891296, "rewards/rejected": -1.490476369857788, "step": 2810 }, { "epoch": 0.74, "grad_norm": 3.796875, "learning_rate": 9.749266994893756e-07, "logits/chosen": -2.6382384300231934, "logits/rejected": -2.5947837829589844, "logps/chosen": -486.73236083984375, "logps/rejected": -469.90087890625, "loss": 0.6641, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.2020397186279297, "rewards/margins": 0.1584251970052719, "rewards/rejected": -1.3604648113250732, "step": 2820 }, { "epoch": 0.74, "grad_norm": 4.3125, "learning_rate": 9.56889026517913e-07, "logits/chosen": -2.6399216651916504, "logits/rejected": -2.635921001434326, "logps/chosen": -508.64031982421875, "logps/rejected": -471.86297607421875, "loss": 0.6411, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.1394246816635132, "rewards/margins": 0.22725781798362732, "rewards/rejected": -1.3666824102401733, "step": 2830 }, { "epoch": 0.74, "grad_norm": 5.15625, "learning_rate": 9.389802028686617e-07, "logits/chosen": -2.6898536682128906, "logits/rejected": -2.655017137527466, "logps/chosen": -504.71441650390625, "logps/rejected": -477.46258544921875, "loss": 0.6408, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.1481832265853882, "rewards/margins": 0.22253009676933289, "rewards/rejected": -1.370713472366333, "step": 2840 }, { "epoch": 0.75, "grad_norm": 4.125, "learning_rate": 9.212017239232427e-07, "logits/chosen": -2.66972017288208, "logits/rejected": -2.6421947479248047, "logps/chosen": -510.1853942871094, "logps/rejected": -497.7322692871094, "loss": 0.5764, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0694119930267334, "rewards/margins": 0.37935250997543335, "rewards/rejected": -1.448764681816101, "step": 2850 }, { "epoch": 0.75, "grad_norm": 3.171875, "learning_rate": 9.03555074179533e-07, "logits/chosen": -2.651383638381958, "logits/rejected": -2.6579291820526123, "logps/chosen": -488.3523864746094, "logps/rejected": -507.35272216796875, "loss": 0.5909, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.9926462173461914, "rewards/margins": 0.34531423449516296, "rewards/rejected": -1.3379603624343872, "step": 2860 }, { "epoch": 0.75, "grad_norm": 4.25, "learning_rate": 8.860417271277067e-07, "logits/chosen": -2.722695827484131, "logits/rejected": -2.719184398651123, "logps/chosen": -504.15966796875, "logps/rejected": -503.24078369140625, "loss": 0.6191, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.9827505350112915, "rewards/margins": 0.25871509313583374, "rewards/rejected": -1.2414658069610596, "step": 2870 }, { "epoch": 0.75, "grad_norm": 6.25, "learning_rate": 8.686631451272029e-07, "logits/chosen": -2.6997554302215576, "logits/rejected": -2.6661736965179443, "logps/chosen": -492.560791015625, "logps/rejected": -473.57403564453125, "loss": 0.6363, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.149613618850708, "rewards/margins": 0.23957280814647675, "rewards/rejected": -1.3891866207122803, "step": 2880 }, { "epoch": 0.76, "grad_norm": 4.40625, "learning_rate": 8.514207792846168e-07, "logits/chosen": -2.7065916061401367, "logits/rejected": -2.6797971725463867, "logps/chosen": -494.64764404296875, "logps/rejected": -473.16558837890625, "loss": 0.6074, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.1044108867645264, "rewards/margins": 0.31540459394454956, "rewards/rejected": -1.4198153018951416, "step": 2890 }, { "epoch": 0.76, "grad_norm": 3.828125, "learning_rate": 8.343160693325356e-07, "logits/chosen": -2.6506381034851074, "logits/rejected": -2.6390433311462402, "logps/chosen": -493.5978088378906, "logps/rejected": -498.072509765625, "loss": 0.6159, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.0802371501922607, "rewards/margins": 0.29849973320961, "rewards/rejected": -1.378736972808838, "step": 2900 }, { "epoch": 0.76, "eval_logits/chosen": -2.649965763092041, "eval_logits/rejected": -2.6174795627593994, "eval_logps/chosen": -501.72564697265625, "eval_logps/rejected": -486.77386474609375, "eval_loss": 0.6097070574760437, "eval_rewards/accuracies": 0.6585000157356262, "eval_rewards/chosen": -1.0550086498260498, "eval_rewards/margins": 0.2958948612213135, "eval_rewards/rejected": -1.3509035110473633, "eval_runtime": 348.4416, "eval_samples_per_second": 5.74, "eval_steps_per_second": 0.717, "step": 2900 }, { "epoch": 0.76, "grad_norm": 4.0, "learning_rate": 8.173504435093174e-07, "logits/chosen": -2.640998363494873, "logits/rejected": -2.595895290374756, "logps/chosen": -470.3514709472656, "logps/rejected": -444.5486755371094, "loss": 0.5996, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.065406084060669, "rewards/margins": 0.3272332549095154, "rewards/rejected": -1.3926395177841187, "step": 2910 }, { "epoch": 0.76, "grad_norm": 3.96875, "learning_rate": 8.00525318439836e-07, "logits/chosen": -2.6588757038116455, "logits/rejected": -2.636864423751831, "logps/chosen": -520.3292846679688, "logps/rejected": -518.36328125, "loss": 0.6253, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.0216096639633179, "rewards/margins": 0.25970107316970825, "rewards/rejected": -1.281310796737671, "step": 2920 }, { "epoch": 0.77, "grad_norm": 3.484375, "learning_rate": 7.838420990171927e-07, "logits/chosen": -2.7186899185180664, "logits/rejected": -2.671178102493286, "logps/chosen": -507.38653564453125, "logps/rejected": -495.093994140625, "loss": 0.6065, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.029103398323059, "rewards/margins": 0.2967410683631897, "rewards/rejected": -1.325844407081604, "step": 2930 }, { "epoch": 0.77, "grad_norm": 3.5625, "learning_rate": 7.673021782854084e-07, "logits/chosen": -2.6119322776794434, "logits/rejected": -2.5850017070770264, "logps/chosen": -498.72705078125, "logps/rejected": -450.4840393066406, "loss": 0.6209, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.052886724472046, "rewards/margins": 0.27135077118873596, "rewards/rejected": -1.32423734664917, "step": 2940 }, { "epoch": 0.77, "grad_norm": 4.03125, "learning_rate": 7.509069373231039e-07, "logits/chosen": -2.6534907817840576, "logits/rejected": -2.6120922565460205, "logps/chosen": -490.2318420410156, "logps/rejected": -474.75811767578125, "loss": 0.5999, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.0758155584335327, "rewards/margins": 0.31523874402046204, "rewards/rejected": -1.391054391860962, "step": 2950 }, { "epoch": 0.77, "grad_norm": 3.75, "learning_rate": 7.346577451281822e-07, "logits/chosen": -2.648266315460205, "logits/rejected": -2.6395068168640137, "logps/chosen": -505.58984375, "logps/rejected": -491.96356201171875, "loss": 0.595, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.0662881135940552, "rewards/margins": 0.33815911412239075, "rewards/rejected": -1.404447317123413, "step": 2960 }, { "epoch": 0.78, "grad_norm": 5.03125, "learning_rate": 7.185559585035138e-07, "logits/chosen": -2.666665554046631, "logits/rejected": -2.6183128356933594, "logps/chosen": -520.7285766601562, "logps/rejected": -519.2037963867188, "loss": 0.5844, "rewards/accuracies": 0.625, "rewards/chosen": -1.031095266342163, "rewards/margins": 0.3654022812843323, "rewards/rejected": -1.3964974880218506, "step": 2970 }, { "epoch": 0.78, "grad_norm": 3.90625, "learning_rate": 7.026029219436504e-07, "logits/chosen": -2.6764984130859375, "logits/rejected": -2.6465909481048584, "logps/chosen": -487.6134338378906, "logps/rejected": -479.92742919921875, "loss": 0.6011, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.068189263343811, "rewards/margins": 0.2995051443576813, "rewards/rejected": -1.36769437789917, "step": 2980 }, { "epoch": 0.78, "grad_norm": 2.96875, "learning_rate": 6.867999675225523e-07, "logits/chosen": -2.706010580062866, "logits/rejected": -2.6768994331359863, "logps/chosen": -467.7933044433594, "logps/rejected": -459.462890625, "loss": 0.6033, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.1356656551361084, "rewards/margins": 0.3141850531101227, "rewards/rejected": -1.4498507976531982, "step": 2990 }, { "epoch": 0.79, "grad_norm": 5.4375, "learning_rate": 6.711484147823663e-07, "logits/chosen": -2.645399570465088, "logits/rejected": -2.6416878700256348, "logps/chosen": -460.01348876953125, "logps/rejected": -490.2184143066406, "loss": 0.5815, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.0389206409454346, "rewards/margins": 0.35546866059303284, "rewards/rejected": -1.3943893909454346, "step": 3000 }, { "epoch": 0.79, "eval_logits/chosen": -2.6419765949249268, "eval_logits/rejected": -2.608949661254883, "eval_logps/chosen": -506.4727478027344, "eval_logps/rejected": -492.1650085449219, "eval_loss": 0.6090958714485168, "eval_rewards/accuracies": 0.6570000052452087, "eval_rewards/chosen": -1.1024789810180664, "eval_rewards/margins": 0.3023359179496765, "eval_rewards/rejected": -1.4048149585723877, "eval_runtime": 347.5981, "eval_samples_per_second": 5.754, "eval_steps_per_second": 0.719, "step": 3000 }, { "epoch": 0.79, "grad_norm": 3.15625, "learning_rate": 6.556495706232413e-07, "logits/chosen": -2.6427597999572754, "logits/rejected": -2.6378586292266846, "logps/chosen": -508.255615234375, "logps/rejected": -498.01300048828125, "loss": 0.6168, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.076911211013794, "rewards/margins": 0.3045389652252197, "rewards/rejected": -1.3814500570297241, "step": 3010 }, { "epoch": 0.79, "grad_norm": 3.8125, "learning_rate": 6.403047291942057e-07, "logits/chosen": -2.635432720184326, "logits/rejected": -2.580549716949463, "logps/chosen": -463.28411865234375, "logps/rejected": -442.45684814453125, "loss": 0.5944, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.119048833847046, "rewards/margins": 0.3251452147960663, "rewards/rejected": -1.444193959236145, "step": 3020 }, { "epoch": 0.79, "grad_norm": 4.40625, "learning_rate": 6.251151717851023e-07, "logits/chosen": -2.6761887073516846, "logits/rejected": -2.6448588371276855, "logps/chosen": -470.81353759765625, "logps/rejected": -462.959228515625, "loss": 0.6316, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.1331965923309326, "rewards/margins": 0.2631588578224182, "rewards/rejected": -1.3963555097579956, "step": 3030 }, { "epoch": 0.8, "grad_norm": 3.4375, "learning_rate": 6.100821667196041e-07, "logits/chosen": -2.747220754623413, "logits/rejected": -2.6560819149017334, "logps/chosen": -522.0474853515625, "logps/rejected": -451.25836181640625, "loss": 0.5939, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.0959001779556274, "rewards/margins": 0.3455473482608795, "rewards/rejected": -1.441447377204895, "step": 3040 }, { "epoch": 0.8, "grad_norm": 3.40625, "learning_rate": 5.952069692493062e-07, "logits/chosen": -2.619112730026245, "logits/rejected": -2.6229631900787354, "logps/chosen": -463.21539306640625, "logps/rejected": -479.87957763671875, "loss": 0.5902, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0741875171661377, "rewards/margins": 0.34632328152656555, "rewards/rejected": -1.4205108880996704, "step": 3050 }, { "epoch": 0.8, "grad_norm": 4.5625, "learning_rate": 5.80490821448918e-07, "logits/chosen": -2.6366074085235596, "logits/rejected": -2.636824131011963, "logps/chosen": -507.3749084472656, "logps/rejected": -560.8580322265625, "loss": 0.5986, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.0962883234024048, "rewards/margins": 0.34130847454071045, "rewards/rejected": -1.4375969171524048, "step": 3060 }, { "epoch": 0.8, "grad_norm": 4.84375, "learning_rate": 5.659349521125459e-07, "logits/chosen": -2.7327983379364014, "logits/rejected": -2.7102444171905518, "logps/chosen": -523.8292846679688, "logps/rejected": -506.67742919921875, "loss": 0.6271, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.0699433088302612, "rewards/margins": 0.2572742998600006, "rewards/rejected": -1.3272178173065186, "step": 3070 }, { "epoch": 0.81, "grad_norm": 3.625, "learning_rate": 5.5154057665109e-07, "logits/chosen": -2.7022974491119385, "logits/rejected": -2.6688647270202637, "logps/chosen": -505.4044494628906, "logps/rejected": -490.70867919921875, "loss": 0.5871, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.1016364097595215, "rewards/margins": 0.34242209792137146, "rewards/rejected": -1.4440586566925049, "step": 3080 }, { "epoch": 0.81, "grad_norm": 2.828125, "learning_rate": 5.373088969907586e-07, "logits/chosen": -2.707888126373291, "logits/rejected": -2.647016763687134, "logps/chosen": -517.645751953125, "logps/rejected": -478.8772888183594, "loss": 0.5976, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.051059603691101, "rewards/margins": 0.3384644389152527, "rewards/rejected": -1.389524221420288, "step": 3090 }, { "epoch": 0.81, "grad_norm": 3.234375, "learning_rate": 5.23241101472709e-07, "logits/chosen": -2.671968936920166, "logits/rejected": -2.647467851638794, "logps/chosen": -513.36669921875, "logps/rejected": -501.7236328125, "loss": 0.5885, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.9971181154251099, "rewards/margins": 0.36711040139198303, "rewards/rejected": -1.3642284870147705, "step": 3100 }, { "epoch": 0.81, "eval_logits/chosen": -2.633734703063965, "eval_logits/rejected": -2.600095272064209, "eval_logps/chosen": -505.9959716796875, "eval_logps/rejected": -491.744384765625, "eval_loss": 0.6088695526123047, "eval_rewards/accuracies": 0.659500002861023, "eval_rewards/chosen": -1.0977121591567993, "eval_rewards/margins": 0.3028964698314667, "eval_rewards/rejected": -1.4006085395812988, "eval_runtime": 347.6519, "eval_samples_per_second": 5.753, "eval_steps_per_second": 0.719, "step": 3100 }, { "epoch": 0.81, "grad_norm": 4.0625, "learning_rate": 5.09338364753818e-07, "logits/chosen": -2.7147414684295654, "logits/rejected": -2.6615347862243652, "logps/chosen": -522.6924438476562, "logps/rejected": -512.82373046875, "loss": 0.6231, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.9803631901741028, "rewards/margins": 0.28811120986938477, "rewards/rejected": -1.2684743404388428, "step": 3110 }, { "epoch": 0.82, "grad_norm": 3.828125, "learning_rate": 4.956018477086005e-07, "logits/chosen": -2.671032190322876, "logits/rejected": -2.633485794067383, "logps/chosen": -516.436767578125, "logps/rejected": -491.43963623046875, "loss": 0.6036, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.0946416854858398, "rewards/margins": 0.33089718222618103, "rewards/rejected": -1.4255390167236328, "step": 3120 }, { "epoch": 0.82, "grad_norm": 3.234375, "learning_rate": 4.820326973322764e-07, "logits/chosen": -2.63783860206604, "logits/rejected": -2.608571767807007, "logps/chosen": -481.3115234375, "logps/rejected": -481.80499267578125, "loss": 0.619, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.1024813652038574, "rewards/margins": 0.2848733067512512, "rewards/rejected": -1.3873546123504639, "step": 3130 }, { "epoch": 0.82, "grad_norm": 3.75, "learning_rate": 4.686320466449981e-07, "logits/chosen": -2.6718032360076904, "logits/rejected": -2.6056950092315674, "logps/chosen": -484.45489501953125, "logps/rejected": -480.09619140625, "loss": 0.6237, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.1476237773895264, "rewards/margins": 0.28761929273605347, "rewards/rejected": -1.4352428913116455, "step": 3140 }, { "epoch": 0.82, "grad_norm": 3.53125, "learning_rate": 4.554010145972418e-07, "logits/chosen": -2.72918438911438, "logits/rejected": -2.6762828826904297, "logps/chosen": -508.759521484375, "logps/rejected": -497.8362731933594, "loss": 0.6242, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.1187124252319336, "rewards/margins": 0.27222010493278503, "rewards/rejected": -1.3909324407577515, "step": 3150 }, { "epoch": 0.83, "grad_norm": 3.4375, "learning_rate": 4.4234070597637455e-07, "logits/chosen": -2.6751275062561035, "logits/rejected": -2.640679121017456, "logps/chosen": -520.294921875, "logps/rejected": -521.7930908203125, "loss": 0.5949, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.0745341777801514, "rewards/margins": 0.33133482933044434, "rewards/rejected": -1.4058691263198853, "step": 3160 }, { "epoch": 0.83, "grad_norm": 4.0, "learning_rate": 4.2945221131440783e-07, "logits/chosen": -2.6148898601531982, "logits/rejected": -2.5540575981140137, "logps/chosen": -503.8189392089844, "logps/rejected": -468.48468017578125, "loss": 0.6091, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.0721933841705322, "rewards/margins": 0.2973105311393738, "rewards/rejected": -1.3695039749145508, "step": 3170 }, { "epoch": 0.83, "grad_norm": 3.234375, "learning_rate": 4.167366067969381e-07, "logits/chosen": -2.6700875759124756, "logits/rejected": -2.6666862964630127, "logps/chosen": -451.3338928222656, "logps/rejected": -494.00537109375, "loss": 0.6031, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0849757194519043, "rewards/margins": 0.3115970492362976, "rewards/rejected": -1.3965727090835571, "step": 3180 }, { "epoch": 0.83, "grad_norm": 3.65625, "learning_rate": 4.041949541732826e-07, "logits/chosen": -2.6685242652893066, "logits/rejected": -2.658634662628174, "logps/chosen": -498.36871337890625, "logps/rejected": -510.15118408203125, "loss": 0.584, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.0659822225570679, "rewards/margins": 0.3512209951877594, "rewards/rejected": -1.4172031879425049, "step": 3190 }, { "epoch": 0.84, "grad_norm": 2.203125, "learning_rate": 3.9182830066782614e-07, "logits/chosen": -2.637341022491455, "logits/rejected": -2.636918544769287, "logps/chosen": -501.9124450683594, "logps/rejected": -526.0079345703125, "loss": 0.6074, "rewards/accuracies": 0.625, "rewards/chosen": -1.0901362895965576, "rewards/margins": 0.30762726068496704, "rewards/rejected": -1.3977636098861694, "step": 3200 }, { "epoch": 0.84, "eval_logits/chosen": -2.6388015747070312, "eval_logits/rejected": -2.605605363845825, "eval_logps/chosen": -506.04547119140625, "eval_logps/rejected": -491.9724426269531, "eval_loss": 0.6086438894271851, "eval_rewards/accuracies": 0.6604999899864197, "eval_rewards/chosen": -1.0982069969177246, "eval_rewards/margins": 0.3046818971633911, "eval_rewards/rejected": -1.4028888940811157, "eval_runtime": 347.8304, "eval_samples_per_second": 5.75, "eval_steps_per_second": 0.719, "step": 3200 }, { "epoch": 0.84, "grad_norm": 3.28125, "learning_rate": 3.796376788925771e-07, "logits/chosen": -2.6419506072998047, "logits/rejected": -2.6541552543640137, "logps/chosen": -491.3660583496094, "logps/rejected": -470.6940002441406, "loss": 0.6613, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.115142583847046, "rewards/margins": 0.18211853504180908, "rewards/rejected": -1.2972612380981445, "step": 3210 }, { "epoch": 0.84, "grad_norm": 3.90625, "learning_rate": 3.676241067609465e-07, "logits/chosen": -2.672940731048584, "logits/rejected": -2.6646289825439453, "logps/chosen": -534.7271728515625, "logps/rejected": -501.14385986328125, "loss": 0.6169, "rewards/accuracies": 0.65625, "rewards/chosen": -1.0392810106277466, "rewards/margins": 0.2989768981933594, "rewards/rejected": -1.3382577896118164, "step": 3220 }, { "epoch": 0.85, "grad_norm": 4.46875, "learning_rate": 3.5578858740274976e-07, "logits/chosen": -2.627161979675293, "logits/rejected": -2.624307155609131, "logps/chosen": -498.4679260253906, "logps/rejected": -492.9081115722656, "loss": 0.6572, "rewards/accuracies": 0.59375, "rewards/chosen": -1.0917952060699463, "rewards/margins": 0.18084125220775604, "rewards/rejected": -1.2726365327835083, "step": 3230 }, { "epoch": 0.85, "grad_norm": 5.125, "learning_rate": 3.44132109080447e-07, "logits/chosen": -2.7235939502716064, "logits/rejected": -2.68570613861084, "logps/chosen": -511.3997497558594, "logps/rejected": -466.37591552734375, "loss": 0.6249, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.0969817638397217, "rewards/margins": 0.2552695572376251, "rewards/rejected": -1.3522512912750244, "step": 3240 }, { "epoch": 0.85, "grad_norm": 3.234375, "learning_rate": 3.3265564510662344e-07, "logits/chosen": -2.7106406688690186, "logits/rejected": -2.665052890777588, "logps/chosen": -538.0487060546875, "logps/rejected": -513.3031005859375, "loss": 0.5986, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.014005422592163, "rewards/margins": 0.33192679286003113, "rewards/rejected": -1.3459322452545166, "step": 3250 }, { "epoch": 0.85, "grad_norm": 3.6875, "learning_rate": 3.213601537627195e-07, "logits/chosen": -2.6548619270324707, "logits/rejected": -2.6132593154907227, "logps/chosen": -503.3099060058594, "logps/rejected": -498.5037536621094, "loss": 0.599, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1849998235702515, "rewards/margins": 0.2957174777984619, "rewards/rejected": -1.480717420578003, "step": 3260 }, { "epoch": 0.86, "grad_norm": 3.421875, "learning_rate": 3.1024657821901063e-07, "logits/chosen": -2.6821701526641846, "logits/rejected": -2.667227268218994, "logps/chosen": -484.52783203125, "logps/rejected": -481.2435607910156, "loss": 0.6298, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.0785605907440186, "rewards/margins": 0.2731327414512634, "rewards/rejected": -1.3516933917999268, "step": 3270 }, { "epoch": 0.86, "grad_norm": 3.125, "learning_rate": 2.9931584645585654e-07, "logits/chosen": -2.6534295082092285, "logits/rejected": -2.655452013015747, "logps/chosen": -510.244384765625, "logps/rejected": -527.35546875, "loss": 0.639, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.032504677772522, "rewards/margins": 0.2230747640132904, "rewards/rejected": -1.2555794715881348, "step": 3280 }, { "epoch": 0.86, "grad_norm": 3.078125, "learning_rate": 2.885688711862136e-07, "logits/chosen": -2.6963908672332764, "logits/rejected": -2.7045657634735107, "logps/chosen": -497.63653564453125, "logps/rejected": -511.78851318359375, "loss": 0.6386, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.0954744815826416, "rewards/margins": 0.259405255317688, "rewards/rejected": -1.3548799753189087, "step": 3290 }, { "epoch": 0.86, "grad_norm": 3.25, "learning_rate": 2.7800654977942486e-07, "logits/chosen": -2.6455070972442627, "logits/rejected": -2.602128505706787, "logps/chosen": -493.291259765625, "logps/rejected": -495.4798889160156, "loss": 0.5981, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.0171977281570435, "rewards/margins": 0.32562780380249023, "rewards/rejected": -1.3428254127502441, "step": 3300 }, { "epoch": 0.86, "eval_logits/chosen": -2.64424991607666, "eval_logits/rejected": -2.611661911010742, "eval_logps/chosen": -504.757080078125, "eval_logps/rejected": -490.4914855957031, "eval_loss": 0.6086958050727844, "eval_rewards/accuracies": 0.6610000133514404, "eval_rewards/chosen": -1.0853232145309448, "eval_rewards/margins": 0.30275672674179077, "eval_rewards/rejected": -1.3880800008773804, "eval_runtime": 347.7972, "eval_samples_per_second": 5.75, "eval_steps_per_second": 0.719, "step": 3300 }, { "epoch": 0.87, "grad_norm": 6.28125, "learning_rate": 2.6762976418628797e-07, "logits/chosen": -2.68962025642395, "logits/rejected": -2.662564754486084, "logps/chosen": -457.9961853027344, "logps/rejected": -422.3614807128906, "loss": 0.5896, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.0693327188491821, "rewards/margins": 0.3496701717376709, "rewards/rejected": -1.4190027713775635, "step": 3310 }, { "epoch": 0.87, "grad_norm": 3.46875, "learning_rate": 2.5743938086541354e-07, "logits/chosen": -2.6620020866394043, "logits/rejected": -2.629638433456421, "logps/chosen": -499.93896484375, "logps/rejected": -487.2354431152344, "loss": 0.6226, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.130860686302185, "rewards/margins": 0.27780821919441223, "rewards/rejected": -1.408668875694275, "step": 3320 }, { "epoch": 0.87, "grad_norm": 3.984375, "learning_rate": 2.4743625071087574e-07, "logits/chosen": -2.7372817993164062, "logits/rejected": -2.6844122409820557, "logps/chosen": -516.2169189453125, "logps/rejected": -501.70721435546875, "loss": 0.5967, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.0415713787078857, "rewards/margins": 0.37831583619117737, "rewards/rejected": -1.4198873043060303, "step": 3330 }, { "epoch": 0.87, "grad_norm": 4.125, "learning_rate": 2.3762120898116498e-07, "logits/chosen": -2.6761159896850586, "logits/rejected": -2.651787281036377, "logps/chosen": -508.19891357421875, "logps/rejected": -515.2844848632812, "loss": 0.5993, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.1085197925567627, "rewards/margins": 0.3225208520889282, "rewards/rejected": -1.4310405254364014, "step": 3340 }, { "epoch": 0.88, "grad_norm": 3.390625, "learning_rate": 2.2799507522944048e-07, "logits/chosen": -2.6180145740509033, "logits/rejected": -2.5854008197784424, "logps/chosen": -513.4478759765625, "logps/rejected": -516.1890869140625, "loss": 0.5651, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.0392944812774658, "rewards/margins": 0.3910349905490875, "rewards/rejected": -1.4303295612335205, "step": 3350 }, { "epoch": 0.88, "grad_norm": 3.6875, "learning_rate": 2.1855865323510056e-07, "logits/chosen": -2.6438541412353516, "logits/rejected": -2.5801243782043457, "logps/chosen": -512.1270751953125, "logps/rejected": -525.7010498046875, "loss": 0.5627, "rewards/accuracies": 0.71875, "rewards/chosen": -1.0036083459854126, "rewards/margins": 0.4596267640590668, "rewards/rejected": -1.4632351398468018, "step": 3360 }, { "epoch": 0.88, "grad_norm": 3.90625, "learning_rate": 2.0931273093666575e-07, "logits/chosen": -2.637359380722046, "logits/rejected": -2.609163761138916, "logps/chosen": -479.6756896972656, "logps/rejected": -470.44158935546875, "loss": 0.5943, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.1648880243301392, "rewards/margins": 0.31521743535995483, "rewards/rejected": -1.4801056385040283, "step": 3370 }, { "epoch": 0.88, "grad_norm": 4.25, "learning_rate": 2.002580803659873e-07, "logits/chosen": -2.655611276626587, "logits/rejected": -2.6167244911193848, "logps/chosen": -495.48779296875, "logps/rejected": -477.42877197265625, "loss": 0.6252, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.13040292263031, "rewards/margins": 0.2747969925403595, "rewards/rejected": -1.4052000045776367, "step": 3380 }, { "epoch": 0.89, "grad_norm": 3.328125, "learning_rate": 1.913954575837826e-07, "logits/chosen": -2.6717348098754883, "logits/rejected": -2.594569683074951, "logps/chosen": -509.77593994140625, "logps/rejected": -469.570068359375, "loss": 0.573, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0561670064926147, "rewards/margins": 0.38789016008377075, "rewards/rejected": -1.4440572261810303, "step": 3390 }, { "epoch": 0.89, "grad_norm": 3.8125, "learning_rate": 1.827256026165028e-07, "logits/chosen": -2.6890311241149902, "logits/rejected": -2.6426968574523926, "logps/chosen": -555.2242431640625, "logps/rejected": -503.68292236328125, "loss": 0.5944, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.9910923838615417, "rewards/margins": 0.3401171565055847, "rewards/rejected": -1.331209421157837, "step": 3400 }, { "epoch": 0.89, "eval_logits/chosen": -2.6360013484954834, "eval_logits/rejected": -2.602590560913086, "eval_logps/chosen": -505.1947021484375, "eval_logps/rejected": -490.9886779785156, "eval_loss": 0.6087493300437927, "eval_rewards/accuracies": 0.6579999923706055, "eval_rewards/chosen": -1.0896990299224854, "eval_rewards/margins": 0.3033522665500641, "eval_rewards/rejected": -1.393051266670227, "eval_runtime": 347.6769, "eval_samples_per_second": 5.752, "eval_steps_per_second": 0.719, "step": 3400 }, { "epoch": 0.89, "grad_norm": 4.46875, "learning_rate": 1.7424923939454274e-07, "logits/chosen": -2.651811122894287, "logits/rejected": -2.603327512741089, "logps/chosen": -529.9959106445312, "logps/rejected": -487.5215759277344, "loss": 0.6188, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.1013660430908203, "rewards/margins": 0.261643648147583, "rewards/rejected": -1.3630096912384033, "step": 3410 }, { "epoch": 0.9, "grad_norm": 4.0, "learning_rate": 1.6596707569179304e-07, "logits/chosen": -2.723508596420288, "logits/rejected": -2.6757960319519043, "logps/chosen": -535.6593627929688, "logps/rejected": -493.79840087890625, "loss": 0.6145, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.080249309539795, "rewards/margins": 0.29932349920272827, "rewards/rejected": -1.379572868347168, "step": 3420 }, { "epoch": 0.9, "grad_norm": 3.53125, "learning_rate": 1.578798030665385e-07, "logits/chosen": -2.687269687652588, "logits/rejected": -2.629744529724121, "logps/chosen": -520.5648193359375, "logps/rejected": -503.39532470703125, "loss": 0.6145, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.0786540508270264, "rewards/margins": 0.2842092216014862, "rewards/rejected": -1.362863302230835, "step": 3430 }, { "epoch": 0.9, "grad_norm": 3.8125, "learning_rate": 1.499880968037165e-07, "logits/chosen": -2.7056689262390137, "logits/rejected": -2.6712379455566406, "logps/chosen": -483.80413818359375, "logps/rejected": -458.0314025878906, "loss": 0.5931, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.0653079748153687, "rewards/margins": 0.31929486989974976, "rewards/rejected": -1.3846029043197632, "step": 3440 }, { "epoch": 0.9, "grad_norm": 3.9375, "learning_rate": 1.4229261585852805e-07, "logits/chosen": -2.697467803955078, "logits/rejected": -2.6769003868103027, "logps/chosen": -504.36572265625, "logps/rejected": -491.1905212402344, "loss": 0.5925, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.0281347036361694, "rewards/margins": 0.3227378726005554, "rewards/rejected": -1.3508726358413696, "step": 3450 }, { "epoch": 0.91, "grad_norm": 3.4375, "learning_rate": 1.3479400280141886e-07, "logits/chosen": -2.667304039001465, "logits/rejected": -2.652622938156128, "logps/chosen": -477.559326171875, "logps/rejected": -501.595703125, "loss": 0.5803, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0702049732208252, "rewards/margins": 0.3717937469482422, "rewards/rejected": -1.4419987201690674, "step": 3460 }, { "epoch": 0.91, "grad_norm": 3.828125, "learning_rate": 1.2749288376442044e-07, "logits/chosen": -2.677260398864746, "logits/rejected": -2.619901180267334, "logps/chosen": -533.4788818359375, "logps/rejected": -490.34552001953125, "loss": 0.5897, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.0250308513641357, "rewards/margins": 0.38970544934272766, "rewards/rejected": -1.4147361516952515, "step": 3470 }, { "epoch": 0.91, "grad_norm": 3.640625, "learning_rate": 1.203898683888713e-07, "logits/chosen": -2.7128968238830566, "logits/rejected": -2.679882764816284, "logps/chosen": -489.0921325683594, "logps/rejected": -481.76470947265625, "loss": 0.6548, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.1597046852111816, "rewards/margins": 0.2149733006954193, "rewards/rejected": -1.3746780157089233, "step": 3480 }, { "epoch": 0.91, "grad_norm": 2.953125, "learning_rate": 1.1348554977451132e-07, "logits/chosen": -2.703012466430664, "logits/rejected": -2.672036647796631, "logps/chosen": -522.70849609375, "logps/rejected": -502.2613830566406, "loss": 0.5897, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.0457698106765747, "rewards/margins": 0.35509949922561646, "rewards/rejected": -1.4008692502975464, "step": 3490 }, { "epoch": 0.92, "grad_norm": 2.96875, "learning_rate": 1.0678050442995802e-07, "logits/chosen": -2.6770823001861572, "logits/rejected": -2.624206066131592, "logps/chosen": -506.50311279296875, "logps/rejected": -476.0753479003906, "loss": 0.5979, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.052063226699829, "rewards/margins": 0.32971978187561035, "rewards/rejected": -1.381783127784729, "step": 3500 }, { "epoch": 0.92, "eval_logits/chosen": -2.6460154056549072, "eval_logits/rejected": -2.613610029220581, "eval_logps/chosen": -505.44384765625, "eval_logps/rejected": -491.3070068359375, "eval_loss": 0.6085324883460999, "eval_rewards/accuracies": 0.659500002861023, "eval_rewards/chosen": -1.0921905040740967, "eval_rewards/margins": 0.30404436588287354, "eval_rewards/rejected": -1.3962348699569702, "eval_runtime": 347.657, "eval_samples_per_second": 5.753, "eval_steps_per_second": 0.719, "step": 3500 }, { "epoch": 0.92, "grad_norm": 3.765625, "learning_rate": 1.0027529222456755e-07, "logits/chosen": -2.6578614711761475, "logits/rejected": -2.6162824630737305, "logps/chosen": -483.9580993652344, "logps/rejected": -489.7320251464844, "loss": 0.5651, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.0210082530975342, "rewards/margins": 0.37885022163391113, "rewards/rejected": -1.3998584747314453, "step": 3510 }, { "epoch": 0.92, "grad_norm": 3.59375, "learning_rate": 9.397045634168766e-08, "logits/chosen": -2.7051877975463867, "logits/rejected": -2.6907284259796143, "logps/chosen": -506.522216796875, "logps/rejected": -518.1729736328125, "loss": 0.586, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9934719204902649, "rewards/margins": 0.3790653347969055, "rewards/rejected": -1.3725372552871704, "step": 3520 }, { "epoch": 0.92, "grad_norm": 4.03125, "learning_rate": 8.78665232332998e-08, "logits/chosen": -2.62373948097229, "logits/rejected": -2.6049044132232666, "logps/chosen": -467.76312255859375, "logps/rejected": -479.63446044921875, "loss": 0.6085, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.143489122390747, "rewards/margins": 0.27841717004776, "rewards/rejected": -1.4219063520431519, "step": 3530 }, { "epoch": 0.93, "grad_norm": 4.625, "learning_rate": 8.196400257606208e-08, "logits/chosen": -2.706756114959717, "logits/rejected": -2.6707985401153564, "logps/chosen": -539.0672607421875, "logps/rejected": -535.8557739257812, "loss": 0.609, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.0309526920318604, "rewards/margins": 0.33775442838668823, "rewards/rejected": -1.3687069416046143, "step": 3540 }, { "epoch": 0.93, "grad_norm": 3.59375, "learning_rate": 7.626338722875076e-08, "logits/chosen": -2.6710562705993652, "logits/rejected": -2.6908836364746094, "logps/chosen": -492.14251708984375, "logps/rejected": -494.04534912109375, "loss": 0.6436, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.0950779914855957, "rewards/margins": 0.22633162140846252, "rewards/rejected": -1.3214095830917358, "step": 3550 }, { "epoch": 0.93, "grad_norm": 3.453125, "learning_rate": 7.076515319110688e-08, "logits/chosen": -2.6932473182678223, "logits/rejected": -2.676130771636963, "logps/chosen": -497.710693359375, "logps/rejected": -469.8008728027344, "loss": 0.5853, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.0803611278533936, "rewards/margins": 0.3779391050338745, "rewards/rejected": -1.458300232887268, "step": 3560 }, { "epoch": 0.93, "grad_norm": 5.1875, "learning_rate": 6.54697595640899e-08, "logits/chosen": -2.695744752883911, "logits/rejected": -2.657261610031128, "logps/chosen": -534.2222900390625, "logps/rejected": -514.4498291015625, "loss": 0.6, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.9726902842521667, "rewards/margins": 0.33556845784187317, "rewards/rejected": -1.3082587718963623, "step": 3570 }, { "epoch": 0.94, "grad_norm": 3.3125, "learning_rate": 6.037764851154426e-08, "logits/chosen": -2.645259380340576, "logits/rejected": -2.626682758331299, "logps/chosen": -505.6795959472656, "logps/rejected": -522.542724609375, "loss": 0.5892, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.0380734205245972, "rewards/margins": 0.3490239083766937, "rewards/rejected": -1.3870973587036133, "step": 3580 }, { "epoch": 0.94, "grad_norm": 4.3125, "learning_rate": 5.548924522327748e-08, "logits/chosen": -2.6519410610198975, "logits/rejected": -2.6350717544555664, "logps/chosen": -507.71734619140625, "logps/rejected": -494.5531311035156, "loss": 0.6209, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.1069073677062988, "rewards/margins": 0.278178870677948, "rewards/rejected": -1.3850862979888916, "step": 3590 }, { "epoch": 0.94, "grad_norm": 4.375, "learning_rate": 5.0804957879556915e-08, "logits/chosen": -2.623077869415283, "logits/rejected": -2.615199565887451, "logps/chosen": -456.0958557128906, "logps/rejected": -476.017578125, "loss": 0.6154, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.1336462497711182, "rewards/margins": 0.26231056451797485, "rewards/rejected": -1.3959566354751587, "step": 3600 }, { "epoch": 0.94, "eval_logits/chosen": -2.639690399169922, "eval_logits/rejected": -2.6065900325775146, "eval_logps/chosen": -505.278076171875, "eval_logps/rejected": -491.1413269042969, "eval_loss": 0.6085542440414429, "eval_rewards/accuracies": 0.659500002861023, "eval_rewards/chosen": -1.0905324220657349, "eval_rewards/margins": 0.30404558777809143, "eval_rewards/rejected": -1.394577980041504, "eval_runtime": 347.9164, "eval_samples_per_second": 5.749, "eval_steps_per_second": 0.719, "step": 3600 }, { "epoch": 0.94, "grad_norm": 3.140625, "learning_rate": 4.632517761702815e-08, "logits/chosen": -2.6383109092712402, "logits/rejected": -2.6063365936279297, "logps/chosen": -466.67974853515625, "logps/rejected": -459.43292236328125, "loss": 0.6194, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.0989251136779785, "rewards/margins": 0.2716377377510071, "rewards/rejected": -1.3705627918243408, "step": 3610 }, { "epoch": 0.95, "grad_norm": 4.53125, "learning_rate": 4.205027849605359e-08, "logits/chosen": -2.677117347717285, "logits/rejected": -2.641345500946045, "logps/chosen": -486.0668029785156, "logps/rejected": -461.42364501953125, "loss": 0.6305, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.1190522909164429, "rewards/margins": 0.26842787861824036, "rewards/rejected": -1.3874801397323608, "step": 3620 }, { "epoch": 0.95, "grad_norm": 3.671875, "learning_rate": 3.798061746947995e-08, "logits/chosen": -2.729708433151245, "logits/rejected": -2.699693202972412, "logps/chosen": -496.13580322265625, "logps/rejected": -466.28607177734375, "loss": 0.6133, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.098503589630127, "rewards/margins": 0.29642829298973083, "rewards/rejected": -1.3949320316314697, "step": 3630 }, { "epoch": 0.95, "grad_norm": 3.984375, "learning_rate": 3.411653435283158e-08, "logits/chosen": -2.66868257522583, "logits/rejected": -2.611743450164795, "logps/chosen": -512.9805908203125, "logps/rejected": -457.05517578125, "loss": 0.6083, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.0667917728424072, "rewards/margins": 0.3067484498023987, "rewards/rejected": -1.3735402822494507, "step": 3640 }, { "epoch": 0.96, "grad_norm": 3.171875, "learning_rate": 3.04583517959367e-08, "logits/chosen": -2.7076990604400635, "logits/rejected": -2.6731173992156982, "logps/chosen": -483.63604736328125, "logps/rejected": -467.524658203125, "loss": 0.5848, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.0306214094161987, "rewards/margins": 0.3499363660812378, "rewards/rejected": -1.3805577754974365, "step": 3650 }, { "epoch": 0.96, "grad_norm": 3.890625, "learning_rate": 2.7006375255985984e-08, "logits/chosen": -2.664285182952881, "logits/rejected": -2.6678249835968018, "logps/chosen": -514.0250244140625, "logps/rejected": -520.1316528320312, "loss": 0.6323, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1207656860351562, "rewards/margins": 0.2250840663909912, "rewards/rejected": -1.3458497524261475, "step": 3660 }, { "epoch": 0.96, "grad_norm": 4.4375, "learning_rate": 2.3760892972027328e-08, "logits/chosen": -2.7375500202178955, "logits/rejected": -2.689077377319336, "logps/chosen": -519.54150390625, "logps/rejected": -497.95574951171875, "loss": 0.6163, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.1362625360488892, "rewards/margins": 0.3190504312515259, "rewards/rejected": -1.455312967300415, "step": 3670 }, { "epoch": 0.96, "grad_norm": 6.71875, "learning_rate": 2.072217594089765e-08, "logits/chosen": -2.6551547050476074, "logits/rejected": -2.6728549003601074, "logps/chosen": -494.8778381347656, "logps/rejected": -511.70086669921875, "loss": 0.5927, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.0726226568222046, "rewards/margins": 0.3504863679409027, "rewards/rejected": -1.4231090545654297, "step": 3680 }, { "epoch": 0.97, "grad_norm": 3.5, "learning_rate": 1.789047789459375e-08, "logits/chosen": -2.6753056049346924, "logits/rejected": -2.628095865249634, "logps/chosen": -563.8702392578125, "logps/rejected": -515.8682861328125, "loss": 0.5758, "rewards/accuracies": 0.75, "rewards/chosen": -0.9584245681762695, "rewards/margins": 0.3854634165763855, "rewards/rejected": -1.3438880443572998, "step": 3690 }, { "epoch": 0.97, "grad_norm": 5.125, "learning_rate": 1.5266035279088708e-08, "logits/chosen": -2.586505174636841, "logits/rejected": -2.5706770420074463, "logps/chosen": -538.0152587890625, "logps/rejected": -521.6544189453125, "loss": 0.6053, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.0640016794204712, "rewards/margins": 0.30589136481285095, "rewards/rejected": -1.369892954826355, "step": 3700 }, { "epoch": 0.97, "eval_logits/chosen": -2.6422648429870605, "eval_logits/rejected": -2.609360694885254, "eval_logps/chosen": -505.29425048828125, "eval_logps/rejected": -491.1404724121094, "eval_loss": 0.6086028218269348, "eval_rewards/accuracies": 0.6549999713897705, "eval_rewards/chosen": -1.0906946659088135, "eval_rewards/margins": 0.30387499928474426, "eval_rewards/rejected": -1.394569754600525, "eval_runtime": 347.9396, "eval_samples_per_second": 5.748, "eval_steps_per_second": 0.719, "step": 3700 }, { "epoch": 0.97, "grad_norm": 5.1875, "learning_rate": 1.2849067234584623e-08, "logits/chosen": -2.6033217906951904, "logits/rejected": -2.595217227935791, "logps/chosen": -475.32647705078125, "logps/rejected": -481.47039794921875, "loss": 0.6223, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.0801260471343994, "rewards/margins": 0.30023181438446045, "rewards/rejected": -1.3803579807281494, "step": 3710 }, { "epoch": 0.97, "grad_norm": 3.59375, "learning_rate": 1.0639775577218625e-08, "logits/chosen": -2.6388094425201416, "logits/rejected": -2.5794878005981445, "logps/chosen": -490.6612854003906, "logps/rejected": -458.67999267578125, "loss": 0.593, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.1297202110290527, "rewards/margins": 0.3556092083454132, "rewards/rejected": -1.4853293895721436, "step": 3720 }, { "epoch": 0.98, "grad_norm": 3.65625, "learning_rate": 8.638344782207486e-09, "logits/chosen": -2.6429569721221924, "logits/rejected": -2.6291909217834473, "logps/chosen": -483.68841552734375, "logps/rejected": -469.73638916015625, "loss": 0.6087, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.080833077430725, "rewards/margins": 0.29357820749282837, "rewards/rejected": -1.3744113445281982, "step": 3730 }, { "epoch": 0.98, "grad_norm": 3.015625, "learning_rate": 6.84494196844715e-09, "logits/chosen": -2.6733694076538086, "logits/rejected": -2.645113468170166, "logps/chosen": -516.4934692382812, "logps/rejected": -511.3959045410156, "loss": 0.5696, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.0565123558044434, "rewards/margins": 0.41344934701919556, "rewards/rejected": -1.4699615240097046, "step": 3740 }, { "epoch": 0.98, "grad_norm": 2.765625, "learning_rate": 5.259716884556121e-09, "logits/chosen": -2.6986021995544434, "logits/rejected": -2.662108898162842, "logps/chosen": -497.4439392089844, "logps/rejected": -491.6142578125, "loss": 0.5763, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0045363903045654, "rewards/margins": 0.381572961807251, "rewards/rejected": -1.3861093521118164, "step": 3750 }, { "epoch": 0.98, "grad_norm": 3.234375, "learning_rate": 3.882801896372967e-09, "logits/chosen": -2.6970138549804688, "logits/rejected": -2.6791810989379883, "logps/chosen": -501.97576904296875, "logps/rejected": -475.7244567871094, "loss": 0.6286, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.0755218267440796, "rewards/margins": 0.2614571452140808, "rewards/rejected": -1.3369790315628052, "step": 3760 }, { "epoch": 0.99, "grad_norm": 3.453125, "learning_rate": 2.7143119759026614e-09, "logits/chosen": -2.6607155799865723, "logits/rejected": -2.617769241333008, "logps/chosen": -522.4849853515625, "logps/rejected": -511.22186279296875, "loss": 0.5748, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.0117288827896118, "rewards/margins": 0.3699984848499298, "rewards/rejected": -1.3817272186279297, "step": 3770 }, { "epoch": 0.99, "grad_norm": 4.21875, "learning_rate": 1.754344691717591e-09, "logits/chosen": -2.6623787879943848, "logits/rejected": -2.649315357208252, "logps/chosen": -489.23443603515625, "logps/rejected": -522.0306396484375, "loss": 0.6375, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.0721361637115479, "rewards/margins": 0.22307145595550537, "rewards/rejected": -1.2952076196670532, "step": 3780 }, { "epoch": 0.99, "grad_norm": 3.84375, "learning_rate": 1.0029802008096335e-09, "logits/chosen": -2.676607608795166, "logits/rejected": -2.614905595779419, "logps/chosen": -527.0256958007812, "logps/rejected": -511.72943115234375, "loss": 0.5925, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.1001697778701782, "rewards/margins": 0.33944058418273926, "rewards/rejected": -1.4396103620529175, "step": 3790 }, { "epoch": 0.99, "grad_norm": 3.40625, "learning_rate": 4.602812418974534e-10, "logits/chosen": -2.7049756050109863, "logits/rejected": -2.668239116668701, "logps/chosen": -520.1328125, "logps/rejected": -504.3516540527344, "loss": 0.602, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.012479305267334, "rewards/margins": 0.34507402777671814, "rewards/rejected": -1.357553243637085, "step": 3800 }, { "epoch": 0.99, "eval_logits/chosen": -2.642503023147583, "eval_logits/rejected": -2.609644889831543, "eval_logps/chosen": -504.9806823730469, "eval_logps/rejected": -490.8211364746094, "eval_loss": 0.6085299253463745, "eval_rewards/accuracies": 0.6579999923706055, "eval_rewards/chosen": -1.0875595808029175, "eval_rewards/margins": 0.3038162589073181, "eval_rewards/rejected": -1.3913757801055908, "eval_runtime": 347.9755, "eval_samples_per_second": 5.748, "eval_steps_per_second": 0.718, "step": 3800 }, { "epoch": 1.0, "grad_norm": 4.15625, "learning_rate": 1.2629313018819312e-10, "logits/chosen": -2.6654155254364014, "logits/rejected": -2.625561475753784, "logps/chosen": -482.1392517089844, "logps/rejected": -486.5087890625, "loss": 0.5962, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0852601528167725, "rewards/margins": 0.3787585496902466, "rewards/rejected": -1.464018702507019, "step": 3810 }, { "epoch": 1.0, "grad_norm": 3.78125, "learning_rate": 1.0437535929996855e-12, "logits/chosen": -2.6726207733154297, "logits/rejected": -2.636209487915039, "logps/chosen": -527.1629638671875, "logps/rejected": -491.42041015625, "loss": 0.5974, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.100174069404602, "rewards/margins": 0.3280791640281677, "rewards/rejected": -1.428253173828125, "step": 3820 }, { "epoch": 1.0, "step": 3821, "total_flos": 0.0, "train_loss": 0.6288731582999011, "train_runtime": 37165.2285, "train_samples_per_second": 1.645, "train_steps_per_second": 0.103 } ], "logging_steps": 10, "max_steps": 3821, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }