{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.982222222222222, "eval_steps": 1, "global_step": 336, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011851851851851851, "grad_norm": 62.70548815519655, "learning_rate": 1.4705882352941176e-08, "logits/chosen": 0.030916133895516396, "logits/rejected": 0.09742362797260284, "logps/chosen": -40.58351516723633, "logps/rejected": -58.42578887939453, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.023703703703703703, "grad_norm": 67.3907670519946, "learning_rate": 2.941176470588235e-08, "logits/chosen": 0.15014928579330444, "logits/rejected": 0.2673640847206116, "logps/chosen": -31.35921859741211, "logps/rejected": -54.71299743652344, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.035555555555555556, "grad_norm": 73.59075381908265, "learning_rate": 4.411764705882353e-08, "logits/chosen": 0.35403603315353394, "logits/rejected": 0.3630790412425995, "logps/chosen": -30.862504959106445, "logps/rejected": -43.55963897705078, "loss": 0.6991, "rewards/accuracies": 0.5, "rewards/chosen": 0.023167992010712624, "rewards/margins": 0.016683291643857956, "rewards/rejected": 0.00648469990119338, "step": 3 }, { "epoch": 0.047407407407407405, "grad_norm": 58.78689431688176, "learning_rate": 5.88235294117647e-08, "logits/chosen": 0.3042946457862854, "logits/rejected": 0.25474676489830017, "logps/chosen": -34.22315979003906, "logps/rejected": -39.93827438354492, "loss": 0.6941, "rewards/accuracies": 0.5, "rewards/chosen": -0.021718814969062805, "rewards/margins": -0.024322079494595528, "rewards/rejected": 0.0026032691821455956, "step": 4 }, { "epoch": 0.05925925925925926, "grad_norm": 59.402728518681464, "learning_rate": 7.352941176470588e-08, "logits/chosen": 0.20607078075408936, "logits/rejected": 0.27008742094039917, "logps/chosen": -40.86919403076172, "logps/rejected": -51.17314910888672, "loss": 0.6937, "rewards/accuracies": 0.5625, "rewards/chosen": 0.02542915567755699, "rewards/margins": 0.01238556019961834, "rewards/rejected": 0.013043595477938652, "step": 5 }, { "epoch": 0.07111111111111111, "grad_norm": 60.3208791733097, "learning_rate": 8.823529411764706e-08, "logits/chosen": 0.39524325728416443, "logits/rejected": 0.3147166669368744, "logps/chosen": -45.889522552490234, "logps/rejected": -47.271080017089844, "loss": 0.6967, "rewards/accuracies": 0.625, "rewards/chosen": 0.035939525812864304, "rewards/margins": 0.035303808748722076, "rewards/rejected": 0.0006357184611260891, "step": 6 }, { "epoch": 0.08296296296296296, "grad_norm": 65.84343450368385, "learning_rate": 1.0294117647058822e-07, "logits/chosen": 0.19424982368946075, "logits/rejected": 0.36947980523109436, "logps/chosen": -32.91363525390625, "logps/rejected": -43.79743194580078, "loss": 0.7058, "rewards/accuracies": 0.5, "rewards/chosen": 0.012993477284908295, "rewards/margins": -0.015153911896049976, "rewards/rejected": 0.028147388249635696, "step": 7 }, { "epoch": 0.09481481481481481, "grad_norm": 68.52239761198476, "learning_rate": 1.176470588235294e-07, "logits/chosen": 0.1946137249469757, "logits/rejected": 0.28064286708831787, "logps/chosen": -32.246864318847656, "logps/rejected": -41.628746032714844, "loss": 0.6735, "rewards/accuracies": 0.5, "rewards/chosen": -0.010772847570478916, "rewards/margins": 0.011450938880443573, "rewards/rejected": -0.022223783656954765, "step": 8 }, { "epoch": 0.10666666666666667, "grad_norm": 77.93025419789252, "learning_rate": 1.3235294117647057e-07, "logits/chosen": 0.32008448243141174, "logits/rejected": 0.21636219322681427, "logps/chosen": -40.00132369995117, "logps/rejected": -44.613426208496094, "loss": 0.6976, "rewards/accuracies": 0.5625, "rewards/chosen": 0.008494901470839977, "rewards/margins": 0.011597584001719952, "rewards/rejected": -0.003102683462202549, "step": 9 }, { "epoch": 0.11851851851851852, "grad_norm": 60.27023294648464, "learning_rate": 1.4705882352941175e-07, "logits/chosen": 0.011551467701792717, "logits/rejected": 0.1401338428258896, "logps/chosen": -35.68666076660156, "logps/rejected": -47.44255065917969, "loss": 0.6817, "rewards/accuracies": 0.5625, "rewards/chosen": -0.02254348061978817, "rewards/margins": 0.029230808839201927, "rewards/rejected": -0.0517742857336998, "step": 10 }, { "epoch": 0.13037037037037036, "grad_norm": 70.77005341416117, "learning_rate": 1.6176470588235293e-07, "logits/chosen": 0.07894501090049744, "logits/rejected": 0.09966235607862473, "logps/chosen": -30.685501098632812, "logps/rejected": -42.800785064697266, "loss": 0.6773, "rewards/accuracies": 0.6875, "rewards/chosen": -0.006754852831363678, "rewards/margins": 0.031156515702605247, "rewards/rejected": -0.037911366671323776, "step": 11 }, { "epoch": 0.14222222222222222, "grad_norm": 58.87977165275071, "learning_rate": 1.764705882352941e-07, "logits/chosen": 0.23514162003993988, "logits/rejected": 0.2450232207775116, "logps/chosen": -41.01308822631836, "logps/rejected": -52.138641357421875, "loss": 0.6775, "rewards/accuracies": 0.4375, "rewards/chosen": -0.05471659079194069, "rewards/margins": -0.00840845424681902, "rewards/rejected": -0.046308137476444244, "step": 12 }, { "epoch": 0.15407407407407409, "grad_norm": 68.28764496281121, "learning_rate": 1.9117647058823527e-07, "logits/chosen": 0.13845381140708923, "logits/rejected": 0.06714074313640594, "logps/chosen": -36.72666549682617, "logps/rejected": -44.98724365234375, "loss": 0.6698, "rewards/accuracies": 0.5625, "rewards/chosen": -0.027201365679502487, "rewards/margins": 0.0457880012691021, "rewards/rejected": -0.07298936694860458, "step": 13 }, { "epoch": 0.16592592592592592, "grad_norm": 67.0476123108747, "learning_rate": 2.0588235294117645e-07, "logits/chosen": 0.13570713996887207, "logits/rejected": 0.02110590785741806, "logps/chosen": -39.4144287109375, "logps/rejected": -46.626033782958984, "loss": 0.6694, "rewards/accuracies": 0.6875, "rewards/chosen": -0.05015880987048149, "rewards/margins": 0.06351868808269501, "rewards/rejected": -0.1136775016784668, "step": 14 }, { "epoch": 0.17777777777777778, "grad_norm": 56.97224502598152, "learning_rate": 2.2058823529411763e-07, "logits/chosen": 0.14530636370182037, "logits/rejected": 0.22717420756816864, "logps/chosen": -33.9251823425293, "logps/rejected": -47.67527770996094, "loss": 0.6504, "rewards/accuracies": 0.625, "rewards/chosen": -0.07779295742511749, "rewards/margins": 0.12380316108465195, "rewards/rejected": -0.20159611105918884, "step": 15 }, { "epoch": 0.18962962962962962, "grad_norm": 58.43872340752561, "learning_rate": 2.352941176470588e-07, "logits/chosen": 0.1471043974161148, "logits/rejected": 0.26890620589256287, "logps/chosen": -36.48509979248047, "logps/rejected": -53.876888275146484, "loss": 0.6503, "rewards/accuracies": 0.6875, "rewards/chosen": -0.12628403306007385, "rewards/margins": 0.1476879119873047, "rewards/rejected": -0.27397194504737854, "step": 16 }, { "epoch": 0.20148148148148148, "grad_norm": 53.27712198861993, "learning_rate": 2.5e-07, "logits/chosen": 0.3565051555633545, "logits/rejected": 0.25773561000823975, "logps/chosen": -32.79841232299805, "logps/rejected": -36.324119567871094, "loss": 0.6414, "rewards/accuracies": 0.5625, "rewards/chosen": -0.06849764287471771, "rewards/margins": 0.11378694325685501, "rewards/rejected": -0.18228457868099213, "step": 17 }, { "epoch": 0.21333333333333335, "grad_norm": 58.90356354190709, "learning_rate": 2.6470588235294114e-07, "logits/chosen": -0.1638399213552475, "logits/rejected": -0.027556225657463074, "logps/chosen": -35.214149475097656, "logps/rejected": -54.174049377441406, "loss": 0.6328, "rewards/accuracies": 0.8125, "rewards/chosen": -0.15428400039672852, "rewards/margins": 0.18084901571273804, "rewards/rejected": -0.33513307571411133, "step": 18 }, { "epoch": 0.22518518518518518, "grad_norm": 54.674461580476404, "learning_rate": 2.7941176470588235e-07, "logits/chosen": 0.2881383001804352, "logits/rejected": 0.32903048396110535, "logps/chosen": -30.270973205566406, "logps/rejected": -40.31577682495117, "loss": 0.5957, "rewards/accuracies": 0.75, "rewards/chosen": -0.22714650630950928, "rewards/margins": 0.15492179989814758, "rewards/rejected": -0.38206830620765686, "step": 19 }, { "epoch": 0.23703703703703705, "grad_norm": 49.124258720104926, "learning_rate": 2.941176470588235e-07, "logits/chosen": 0.1832781583070755, "logits/rejected": 0.22061079740524292, "logps/chosen": -31.4530029296875, "logps/rejected": -43.574642181396484, "loss": 0.5737, "rewards/accuracies": 0.875, "rewards/chosen": -0.2013498842716217, "rewards/margins": 0.24712926149368286, "rewards/rejected": -0.44847914576530457, "step": 20 }, { "epoch": 0.24888888888888888, "grad_norm": 49.31720284300177, "learning_rate": 3.088235294117647e-07, "logits/chosen": 0.049509014934301376, "logits/rejected": 0.09894949197769165, "logps/chosen": -38.46732711791992, "logps/rejected": -53.03515625, "loss": 0.5683, "rewards/accuracies": 0.8125, "rewards/chosen": -0.25855958461761475, "rewards/margins": 0.35330522060394287, "rewards/rejected": -0.6118648052215576, "step": 21 }, { "epoch": 0.2607407407407407, "grad_norm": 49.41172490919738, "learning_rate": 3.2352941176470586e-07, "logits/chosen": 0.09565885365009308, "logits/rejected": 0.13914039731025696, "logps/chosen": -28.3892765045166, "logps/rejected": -40.65375518798828, "loss": 0.5608, "rewards/accuracies": 0.625, "rewards/chosen": -0.2191687971353531, "rewards/margins": 0.21512456238269806, "rewards/rejected": -0.43429338932037354, "step": 22 }, { "epoch": 0.2725925925925926, "grad_norm": 46.290056641280934, "learning_rate": 3.3823529411764707e-07, "logits/chosen": 0.33468642830848694, "logits/rejected": 0.3287414312362671, "logps/chosen": -41.56681823730469, "logps/rejected": -49.69163131713867, "loss": 0.5531, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6832336187362671, "rewards/margins": 0.1509827822446823, "rewards/rejected": -0.8342164158821106, "step": 23 }, { "epoch": 0.28444444444444444, "grad_norm": 45.27726525885794, "learning_rate": 3.529411764705882e-07, "logits/chosen": 0.29047060012817383, "logits/rejected": 0.25504833459854126, "logps/chosen": -40.559715270996094, "logps/rejected": -43.744140625, "loss": 0.5839, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4380490183830261, "rewards/margins": 0.21297261118888855, "rewards/rejected": -0.6510215997695923, "step": 24 }, { "epoch": 0.2962962962962963, "grad_norm": 45.96606934366563, "learning_rate": 3.6764705882352943e-07, "logits/chosen": 0.19954881072044373, "logits/rejected": 0.24337519705295563, "logps/chosen": -26.71196937561035, "logps/rejected": -45.159339904785156, "loss": 0.5114, "rewards/accuracies": 0.875, "rewards/chosen": -0.4065704047679901, "rewards/margins": 0.6559778451919556, "rewards/rejected": -1.0625481605529785, "step": 25 }, { "epoch": 0.30814814814814817, "grad_norm": 40.51480580129527, "learning_rate": 3.8235294117647053e-07, "logits/chosen": 0.18569591641426086, "logits/rejected": 0.24005870521068573, "logps/chosen": -32.402259826660156, "logps/rejected": -50.5438117980957, "loss": 0.4871, "rewards/accuracies": 0.9375, "rewards/chosen": -0.35379940271377563, "rewards/margins": 0.88099205493927, "rewards/rejected": -1.2347913980484009, "step": 26 }, { "epoch": 0.32, "grad_norm": 42.88615154569158, "learning_rate": 3.9705882352941174e-07, "logits/chosen": 0.28236454725265503, "logits/rejected": 0.25888901948928833, "logps/chosen": -39.2940673828125, "logps/rejected": -53.0938606262207, "loss": 0.4142, "rewards/accuracies": 0.75, "rewards/chosen": -0.6994470953941345, "rewards/margins": 1.0878021717071533, "rewards/rejected": -1.7872494459152222, "step": 27 }, { "epoch": 0.33185185185185184, "grad_norm": 47.29875578100757, "learning_rate": 4.117647058823529e-07, "logits/chosen": 0.3194928467273712, "logits/rejected": 0.32101473212242126, "logps/chosen": -45.893978118896484, "logps/rejected": -52.5146484375, "loss": 0.5076, "rewards/accuracies": 0.625, "rewards/chosen": -1.0526106357574463, "rewards/margins": 0.718239426612854, "rewards/rejected": -1.7708501815795898, "step": 28 }, { "epoch": 0.3437037037037037, "grad_norm": 37.18564900832489, "learning_rate": 4.264705882352941e-07, "logits/chosen": 0.19094619154930115, "logits/rejected": 0.23362189531326294, "logps/chosen": -36.12297439575195, "logps/rejected": -46.864871978759766, "loss": 0.4014, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7660520076751709, "rewards/margins": 1.0089162588119507, "rewards/rejected": -1.774968147277832, "step": 29 }, { "epoch": 0.35555555555555557, "grad_norm": 43.31791813857577, "learning_rate": 4.4117647058823526e-07, "logits/chosen": 0.15151675045490265, "logits/rejected": 0.11997775733470917, "logps/chosen": -38.58032989501953, "logps/rejected": -48.824798583984375, "loss": 0.4572, "rewards/accuracies": 0.875, "rewards/chosen": -1.2670563459396362, "rewards/margins": 0.831606388092041, "rewards/rejected": -2.098662853240967, "step": 30 }, { "epoch": 0.3674074074074074, "grad_norm": 37.063035160208685, "learning_rate": 4.5588235294117646e-07, "logits/chosen": 0.2560815215110779, "logits/rejected": 0.3223097026348114, "logps/chosen": -33.990726470947266, "logps/rejected": -47.65345764160156, "loss": 0.3806, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2836799621582031, "rewards/margins": 1.0427335500717163, "rewards/rejected": -2.326413631439209, "step": 31 }, { "epoch": 0.37925925925925924, "grad_norm": 37.90256850942511, "learning_rate": 4.705882352941176e-07, "logits/chosen": 0.18444910645484924, "logits/rejected": 0.27790239453315735, "logps/chosen": -34.04519271850586, "logps/rejected": -58.64192199707031, "loss": 0.3634, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8921431303024292, "rewards/margins": 1.9067623615264893, "rewards/rejected": -2.798905372619629, "step": 32 }, { "epoch": 0.39111111111111113, "grad_norm": 38.28825801306995, "learning_rate": 4.852941176470588e-07, "logits/chosen": 0.21531561017036438, "logits/rejected": 0.22173307836055756, "logps/chosen": -38.73649978637695, "logps/rejected": -51.891937255859375, "loss": 0.3549, "rewards/accuracies": 0.875, "rewards/chosen": -1.469386339187622, "rewards/margins": 1.3347928524017334, "rewards/rejected": -2.8041794300079346, "step": 33 }, { "epoch": 0.40296296296296297, "grad_norm": 53.57688480965446, "learning_rate": 5e-07, "logits/chosen": -0.14031767845153809, "logits/rejected": -0.009732939302921295, "logps/chosen": -30.219802856445312, "logps/rejected": -48.51620864868164, "loss": 0.4584, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7153716683387756, "rewards/margins": 2.133821487426758, "rewards/rejected": -2.8491930961608887, "step": 34 }, { "epoch": 0.4148148148148148, "grad_norm": 43.6191926386301, "learning_rate": 4.999864732969518e-07, "logits/chosen": 0.2249789983034134, "logits/rejected": 0.2375878542661667, "logps/chosen": -42.989952087402344, "logps/rejected": -60.248451232910156, "loss": 0.307, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3359755277633667, "rewards/margins": 2.923412799835205, "rewards/rejected": -4.259388446807861, "step": 35 }, { "epoch": 0.4266666666666667, "grad_norm": 41.68496640354453, "learning_rate": 4.999458946515807e-07, "logits/chosen": 0.04158564656972885, "logits/rejected": 0.04120251536369324, "logps/chosen": -47.079593658447266, "logps/rejected": -64.6259536743164, "loss": 0.3243, "rewards/accuracies": 0.875, "rewards/chosen": -1.9504570960998535, "rewards/margins": 2.1364314556121826, "rewards/rejected": -4.086888313293457, "step": 36 }, { "epoch": 0.43851851851851853, "grad_norm": 37.998309675066224, "learning_rate": 4.998782684550491e-07, "logits/chosen": 0.15689387917518616, "logits/rejected": 0.22760489583015442, "logps/chosen": -31.412202835083008, "logps/rejected": -57.521270751953125, "loss": 0.3499, "rewards/accuracies": 0.8125, "rewards/chosen": -1.422287940979004, "rewards/margins": 2.10537052154541, "rewards/rejected": -3.527658462524414, "step": 37 }, { "epoch": 0.45037037037037037, "grad_norm": 39.14624538309661, "learning_rate": 4.997836020254328e-07, "logits/chosen": 0.09242415428161621, "logits/rejected": 0.12390726059675217, "logps/chosen": -38.68524932861328, "logps/rejected": -59.322471618652344, "loss": 0.3809, "rewards/accuracies": 0.875, "rewards/chosen": -1.4135229587554932, "rewards/margins": 2.5775866508483887, "rewards/rejected": -3.991109609603882, "step": 38 }, { "epoch": 0.4622222222222222, "grad_norm": 34.10903781264625, "learning_rate": 4.996619056069291e-07, "logits/chosen": 0.15454381704330444, "logits/rejected": 0.16882330179214478, "logps/chosen": -44.294654846191406, "logps/rejected": -66.8642578125, "loss": 0.3106, "rewards/accuracies": 0.9375, "rewards/chosen": -2.2089152336120605, "rewards/margins": 3.4627161026000977, "rewards/rejected": -5.671631336212158, "step": 39 }, { "epoch": 0.4740740740740741, "grad_norm": 41.64494289319624, "learning_rate": 4.995131923687487e-07, "logits/chosen": 0.03869347274303436, "logits/rejected": 0.13989922404289246, "logps/chosen": -48.224884033203125, "logps/rejected": -68.6059341430664, "loss": 0.3563, "rewards/accuracies": 0.6875, "rewards/chosen": -2.3006927967071533, "rewards/margins": 3.536550521850586, "rewards/rejected": -5.837243556976318, "step": 40 }, { "epoch": 0.48592592592592593, "grad_norm": 43.27974364485945, "learning_rate": 4.993374784036901e-07, "logits/chosen": -0.13991862535476685, "logits/rejected": 0.015330532565712929, "logps/chosen": -44.3278923034668, "logps/rejected": -62.52472686767578, "loss": 0.4348, "rewards/accuracies": 0.75, "rewards/chosen": -2.4953625202178955, "rewards/margins": 3.03660249710083, "rewards/rejected": -5.5319647789001465, "step": 41 }, { "epoch": 0.49777777777777776, "grad_norm": 46.320431667154644, "learning_rate": 4.991347827263982e-07, "logits/chosen": -0.051238611340522766, "logits/rejected": -0.0033771172165870667, "logps/chosen": -43.90919876098633, "logps/rejected": -65.11723327636719, "loss": 0.4051, "rewards/accuracies": 0.75, "rewards/chosen": -1.612180471420288, "rewards/margins": 3.3464155197143555, "rewards/rejected": -4.9585957527160645, "step": 42 }, { "epoch": 0.5096296296296297, "grad_norm": 52.045033376790954, "learning_rate": 4.989051272713069e-07, "logits/chosen": -0.10396721214056015, "logits/rejected": 0.10225249826908112, "logps/chosen": -45.72818374633789, "logps/rejected": -77.21394348144531, "loss": 0.3283, "rewards/accuracies": 0.9375, "rewards/chosen": -2.5264527797698975, "rewards/margins": 4.776139736175537, "rewards/rejected": -7.302592754364014, "step": 43 }, { "epoch": 0.5214814814814814, "grad_norm": 40.91521718681392, "learning_rate": 4.986485368902656e-07, "logits/chosen": -0.08990158140659332, "logits/rejected": 0.014565035700798035, "logps/chosen": -38.39900588989258, "logps/rejected": -57.75357437133789, "loss": 0.3584, "rewards/accuracies": 0.875, "rewards/chosen": -2.275327205657959, "rewards/margins": 2.417283058166504, "rewards/rejected": -4.692610263824463, "step": 44 }, { "epoch": 0.5333333333333333, "grad_norm": 50.74127412580387, "learning_rate": 4.983650393498489e-07, "logits/chosen": 0.037050001323223114, "logits/rejected": -0.008276170119643211, "logps/chosen": -50.93760299682617, "logps/rejected": -55.42501449584961, "loss": 0.3717, "rewards/accuracies": 0.75, "rewards/chosen": -2.5346438884735107, "rewards/margins": 1.2831640243530273, "rewards/rejected": -3.8178083896636963, "step": 45 }, { "epoch": 0.5451851851851852, "grad_norm": 38.61624054825256, "learning_rate": 4.980546653283537e-07, "logits/chosen": -0.41439855098724365, "logits/rejected": -0.4113887548446655, "logps/chosen": -41.68315124511719, "logps/rejected": -67.61707305908203, "loss": 0.3181, "rewards/accuracies": 1.0, "rewards/chosen": -1.8887252807617188, "rewards/margins": 4.549580097198486, "rewards/rejected": -6.438305377960205, "step": 46 }, { "epoch": 0.557037037037037, "grad_norm": 38.9863916338003, "learning_rate": 4.977174484124775e-07, "logits/chosen": -0.009788192808628082, "logits/rejected": -0.09325724095106125, "logps/chosen": -46.573936462402344, "logps/rejected": -59.36362838745117, "loss": 0.2595, "rewards/accuracies": 1.0, "rewards/chosen": -2.1944921016693115, "rewards/margins": 4.177679538726807, "rewards/rejected": -6.372171401977539, "step": 47 }, { "epoch": 0.5688888888888889, "grad_norm": 46.90129706189482, "learning_rate": 4.97353425093685e-07, "logits/chosen": -0.1219746470451355, "logits/rejected": -0.10111116617918015, "logps/chosen": -49.24342346191406, "logps/rejected": -65.33878326416016, "loss": 0.3779, "rewards/accuracies": 0.9375, "rewards/chosen": -2.4194295406341553, "rewards/margins": 3.601774215698242, "rewards/rejected": -6.021203994750977, "step": 48 }, { "epoch": 0.5807407407407408, "grad_norm": 54.305491610936826, "learning_rate": 4.96962634764259e-07, "logits/chosen": -0.13463924825191498, "logits/rejected": -0.09778477251529694, "logps/chosen": -50.75926971435547, "logps/rejected": -64.77645111083984, "loss": 0.3942, "rewards/accuracies": 0.75, "rewards/chosen": -2.944636344909668, "rewards/margins": 3.1688618659973145, "rewards/rejected": -6.113498687744141, "step": 49 }, { "epoch": 0.5925925925925926, "grad_norm": 40.79377104202132, "learning_rate": 4.965451197130372e-07, "logits/chosen": -0.0598304346203804, "logits/rejected": 0.03133855387568474, "logps/chosen": -41.48918151855469, "logps/rejected": -72.72964477539062, "loss": 0.3083, "rewards/accuracies": 1.0, "rewards/chosen": -1.6170328855514526, "rewards/margins": 4.565612316131592, "rewards/rejected": -6.182644844055176, "step": 50 }, { "epoch": 0.6044444444444445, "grad_norm": 42.877178470441834, "learning_rate": 4.961009251208367e-07, "logits/chosen": -0.014419106766581535, "logits/rejected": -0.018680818378925323, "logps/chosen": -34.062870025634766, "logps/rejected": -66.85511779785156, "loss": 0.2943, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9279564023017883, "rewards/margins": 6.2386088371276855, "rewards/rejected": -7.166565418243408, "step": 51 }, { "epoch": 0.6162962962962963, "grad_norm": 36.563663666294495, "learning_rate": 4.956300990555643e-07, "logits/chosen": -0.23208701610565186, "logits/rejected": -0.15281593799591064, "logps/chosen": -34.16992950439453, "logps/rejected": -48.25387954711914, "loss": 0.2707, "rewards/accuracies": 0.875, "rewards/chosen": -1.390017032623291, "rewards/margins": 2.9138500690460205, "rewards/rejected": -4.303867340087891, "step": 52 }, { "epoch": 0.6281481481481481, "grad_norm": 48.996542745383714, "learning_rate": 4.951326924670147e-07, "logits/chosen": -0.029582835733890533, "logits/rejected": 0.13870403170585632, "logps/chosen": -46.177825927734375, "logps/rejected": -64.03628540039062, "loss": 0.4606, "rewards/accuracies": 0.8125, "rewards/chosen": -2.360954761505127, "rewards/margins": 2.338740825653076, "rewards/rejected": -4.699695587158203, "step": 53 }, { "epoch": 0.64, "grad_norm": 37.53257312713413, "learning_rate": 4.94608759181358e-07, "logits/chosen": -0.2242709845304489, "logits/rejected": -0.022289041429758072, "logps/chosen": -43.091976165771484, "logps/rejected": -56.94826126098633, "loss": 0.2428, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1584746837615967, "rewards/margins": 2.6602895259857178, "rewards/rejected": -3.8187644481658936, "step": 54 }, { "epoch": 0.6518518518518519, "grad_norm": 38.495616892469634, "learning_rate": 4.940583558953137e-07, "logits/chosen": -0.3163710832595825, "logits/rejected": -0.2686666250228882, "logps/chosen": -41.02560806274414, "logps/rejected": -75.63497924804688, "loss": 0.3113, "rewards/accuracies": 1.0, "rewards/chosen": -1.4144959449768066, "rewards/margins": 5.705674648284912, "rewards/rejected": -7.120170593261719, "step": 55 }, { "epoch": 0.6637037037037037, "grad_norm": 44.72353141808658, "learning_rate": 4.934815421700164e-07, "logits/chosen": -0.28492411971092224, "logits/rejected": -0.2709801495075226, "logps/chosen": -36.71954345703125, "logps/rejected": -55.933624267578125, "loss": 0.3607, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8905187845230103, "rewards/margins": 3.9906599521636963, "rewards/rejected": -4.881179332733154, "step": 56 }, { "epoch": 0.6755555555555556, "grad_norm": 36.35665672904958, "learning_rate": 4.928783804245699e-07, "logits/chosen": 0.2555558681488037, "logits/rejected": 0.18786108493804932, "logps/chosen": -40.80218505859375, "logps/rejected": -54.24163055419922, "loss": 0.2751, "rewards/accuracies": 0.875, "rewards/chosen": -0.8028124570846558, "rewards/margins": 3.192255735397339, "rewards/rejected": -3.995068073272705, "step": 57 }, { "epoch": 0.6874074074074074, "grad_norm": 29.859662847563737, "learning_rate": 4.922489359292927e-07, "logits/chosen": -0.17547199130058289, "logits/rejected": -0.06896121799945831, "logps/chosen": -40.20637893676758, "logps/rejected": -68.45681762695312, "loss": 0.2406, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9151208400726318, "rewards/margins": 4.224143981933594, "rewards/rejected": -5.139265060424805, "step": 58 }, { "epoch": 0.6992592592592592, "grad_norm": 33.69014037830799, "learning_rate": 4.915932767986551e-07, "logits/chosen": -0.2176772654056549, "logits/rejected": -0.14603368937969208, "logps/chosen": -35.77494430541992, "logps/rejected": -56.28825378417969, "loss": 0.2639, "rewards/accuracies": 0.875, "rewards/chosen": -0.7671470642089844, "rewards/margins": 3.049193859100342, "rewards/rejected": -3.816340923309326, "step": 59 }, { "epoch": 0.7111111111111111, "grad_norm": 33.1631924356696, "learning_rate": 4.909114739839079e-07, "logits/chosen": -0.09617012739181519, "logits/rejected": -0.08796259015798569, "logps/chosen": -33.88630294799805, "logps/rejected": -55.623878479003906, "loss": 0.2556, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8068188428878784, "rewards/margins": 3.1127305030822754, "rewards/rejected": -3.9195497035980225, "step": 60 }, { "epoch": 0.7229629629629629, "grad_norm": 40.838783872872355, "learning_rate": 4.902036012654048e-07, "logits/chosen": 0.11093666404485703, "logits/rejected": 0.1355137974023819, "logps/chosen": -34.699256896972656, "logps/rejected": -55.77449035644531, "loss": 0.2753, "rewards/accuracies": 0.875, "rewards/chosen": -1.288212776184082, "rewards/margins": 3.224027633666992, "rewards/rejected": -4.512240409851074, "step": 61 }, { "epoch": 0.7348148148148148, "grad_norm": 35.91873621159586, "learning_rate": 4.894697352446182e-07, "logits/chosen": -0.10412248969078064, "logits/rejected": -0.1209147572517395, "logps/chosen": -34.93061447143555, "logps/rejected": -52.149208068847656, "loss": 0.2958, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7685253620147705, "rewards/margins": 2.5662384033203125, "rewards/rejected": -3.334764003753662, "step": 62 }, { "epoch": 0.7466666666666667, "grad_norm": 28.279332840993643, "learning_rate": 4.887099553358501e-07, "logits/chosen": -0.1916661560535431, "logits/rejected": -0.14164935052394867, "logps/chosen": -40.58860397338867, "logps/rejected": -50.00385284423828, "loss": 0.229, "rewards/accuracies": 0.75, "rewards/chosen": -0.3654404282569885, "rewards/margins": 2.5260181427001953, "rewards/rejected": -2.891458511352539, "step": 63 }, { "epoch": 0.7585185185185185, "grad_norm": 43.562956759714005, "learning_rate": 4.879243437576383e-07, "logits/chosen": -0.09250672161579132, "logits/rejected": -0.06184221804141998, "logps/chosen": -33.61621856689453, "logps/rejected": -48.81718826293945, "loss": 0.286, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7941789031028748, "rewards/margins": 2.6042628288269043, "rewards/rejected": -3.398441791534424, "step": 64 }, { "epoch": 0.7703703703703704, "grad_norm": 43.71274039442652, "learning_rate": 4.871129855238588e-07, "logits/chosen": -0.1322498917579651, "logits/rejected": -0.05121883377432823, "logps/chosen": -40.41231918334961, "logps/rejected": -68.0873031616211, "loss": 0.3236, "rewards/accuracies": 1.0, "rewards/chosen": -0.39779654145240784, "rewards/margins": 3.27742600440979, "rewards/rejected": -3.675222396850586, "step": 65 }, { "epoch": 0.7822222222222223, "grad_norm": 33.295271431641005, "learning_rate": 4.862759684345269e-07, "logits/chosen": -0.35007691383361816, "logits/rejected": -0.3479149341583252, "logps/chosen": -38.88081359863281, "logps/rejected": -49.428382873535156, "loss": 0.245, "rewards/accuracies": 0.9375, "rewards/chosen": -0.42857033014297485, "rewards/margins": 3.455155372619629, "rewards/rejected": -3.883725643157959, "step": 66 }, { "epoch": 0.794074074074074, "grad_norm": 22.368867699781628, "learning_rate": 4.854133830662955e-07, "logits/chosen": -0.26884549856185913, "logits/rejected": -0.27821600437164307, "logps/chosen": -40.386207580566406, "logps/rejected": -55.080604553222656, "loss": 0.1921, "rewards/accuracies": 0.875, "rewards/chosen": -1.1675169467926025, "rewards/margins": 4.228756904602051, "rewards/rejected": -5.396273136138916, "step": 67 }, { "epoch": 0.8059259259259259, "grad_norm": 32.90815737667523, "learning_rate": 4.845253227626536e-07, "logits/chosen": 0.12926915287971497, "logits/rejected": 0.03172997385263443, "logps/chosen": -56.2171516418457, "logps/rejected": -61.29096984863281, "loss": 0.2468, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6303930282592773, "rewards/margins": 2.693972110748291, "rewards/rejected": -3.3243653774261475, "step": 68 }, { "epoch": 0.8177777777777778, "grad_norm": 31.286678704407063, "learning_rate": 4.836118836238252e-07, "logits/chosen": -0.11010141670703888, "logits/rejected": -0.09770654886960983, "logps/chosen": -38.13661193847656, "logps/rejected": -59.40013122558594, "loss": 0.2512, "rewards/accuracies": 1.0, "rewards/chosen": 0.08025422692298889, "rewards/margins": 2.979079246520996, "rewards/rejected": -2.898824691772461, "step": 69 }, { "epoch": 0.8296296296296296, "grad_norm": 27.574620445125923, "learning_rate": 4.826731644963704e-07, "logits/chosen": -0.25498491525650024, "logits/rejected": -0.24903275072574615, "logps/chosen": -32.7315788269043, "logps/rejected": -47.2717399597168, "loss": 0.222, "rewards/accuracies": 0.875, "rewards/chosen": -0.6683451533317566, "rewards/margins": 3.523477554321289, "rewards/rejected": -4.191822528839111, "step": 70 }, { "epoch": 0.8414814814814815, "grad_norm": 35.73348427460115, "learning_rate": 4.817092669624882e-07, "logits/chosen": -0.018255462870001793, "logits/rejected": -0.0020784977823495865, "logps/chosen": -34.584449768066406, "logps/rejected": -54.02947998046875, "loss": 0.3535, "rewards/accuracies": 0.875, "rewards/chosen": -0.04187864065170288, "rewards/margins": 3.698057174682617, "rewards/rejected": -3.739936113357544, "step": 71 }, { "epoch": 0.8533333333333334, "grad_norm": 28.886553550180775, "learning_rate": 4.807202953290243e-07, "logits/chosen": -0.2388785183429718, "logits/rejected": -0.1512915939092636, "logps/chosen": -32.676937103271484, "logps/rejected": -51.861934661865234, "loss": 0.2502, "rewards/accuracies": 1.0, "rewards/chosen": -0.43571335077285767, "rewards/margins": 3.212143898010254, "rewards/rejected": -3.6478569507598877, "step": 72 }, { "epoch": 0.8651851851851852, "grad_norm": 31.329548468083193, "learning_rate": 4.797063566161834e-07, "logits/chosen": -0.016514137387275696, "logits/rejected": -0.02128826081752777, "logps/chosen": -42.21635818481445, "logps/rejected": -53.1080436706543, "loss": 0.2729, "rewards/accuracies": 0.75, "rewards/chosen": -0.650065541267395, "rewards/margins": 1.7332451343536377, "rewards/rejected": -2.383310556411743, "step": 73 }, { "epoch": 0.8770370370370371, "grad_norm": 25.241915116630672, "learning_rate": 4.786675605459487e-07, "logits/chosen": -0.18203724920749664, "logits/rejected": -0.12511923909187317, "logps/chosen": -37.860111236572266, "logps/rejected": -67.00776672363281, "loss": 0.2269, "rewards/accuracies": 0.9375, "rewards/chosen": 0.003457695245742798, "rewards/margins": 4.446203708648682, "rewards/rejected": -4.442746162414551, "step": 74 }, { "epoch": 0.8888888888888888, "grad_norm": 26.546681076158777, "learning_rate": 4.776040195302079e-07, "logits/chosen": -0.20350059866905212, "logits/rejected": -0.16488413512706757, "logps/chosen": -29.37200164794922, "logps/rejected": -53.26748275756836, "loss": 0.2272, "rewards/accuracies": 1.0, "rewards/chosen": 0.028457432985305786, "rewards/margins": 4.081994533538818, "rewards/rejected": -4.053536891937256, "step": 75 }, { "epoch": 0.9007407407407407, "grad_norm": 36.79575801377004, "learning_rate": 4.76515848658589e-07, "logits/chosen": -0.06815146654844284, "logits/rejected": 0.04448368400335312, "logps/chosen": -40.69670867919922, "logps/rejected": -58.95963668823242, "loss": 0.3201, "rewards/accuracies": 0.875, "rewards/chosen": -0.6759670972824097, "rewards/margins": 2.9853808879852295, "rewards/rejected": -3.6613478660583496, "step": 76 }, { "epoch": 0.9125925925925926, "grad_norm": 33.040470283627364, "learning_rate": 4.754031656860059e-07, "logits/chosen": 0.059698522090911865, "logits/rejected": 0.1003262847661972, "logps/chosen": -36.32437515258789, "logps/rejected": -45.55727005004883, "loss": 0.2451, "rewards/accuracies": 0.9375, "rewards/chosen": 0.1681911200284958, "rewards/margins": 3.335202217102051, "rewards/rejected": -3.167011022567749, "step": 77 }, { "epoch": 0.9244444444444444, "grad_norm": 23.533686338243243, "learning_rate": 4.74266091019916e-07, "logits/chosen": -0.03607799857854843, "logits/rejected": -0.07650981843471527, "logps/chosen": -40.76026153564453, "logps/rejected": -51.91889190673828, "loss": 0.1909, "rewards/accuracies": 0.8125, "rewards/chosen": 0.06481152772903442, "rewards/margins": 3.4574201107025146, "rewards/rejected": -3.392608642578125, "step": 78 }, { "epoch": 0.9362962962962963, "grad_norm": 34.00814256216538, "learning_rate": 4.7310474770728996e-07, "logits/chosen": -0.24094080924987793, "logits/rejected": -0.2244417816400528, "logps/chosen": -36.15470504760742, "logps/rejected": -50.766448974609375, "loss": 0.3092, "rewards/accuracies": 0.875, "rewards/chosen": -0.04541383683681488, "rewards/margins": 2.0906879901885986, "rewards/rejected": -2.1361019611358643, "step": 79 }, { "epoch": 0.9481481481481482, "grad_norm": 28.43036399847798, "learning_rate": 4.719192614212969e-07, "logits/chosen": 0.04508206248283386, "logits/rejected": 0.04841914027929306, "logps/chosen": -44.18794631958008, "logps/rejected": -74.9228515625, "loss": 0.1841, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0396180152893066, "rewards/margins": 3.7275805473327637, "rewards/rejected": -4.76719856262207, "step": 80 }, { "epoch": 0.96, "grad_norm": 32.15305104854747, "learning_rate": 4.707097604477045e-07, "logits/chosen": 0.10437710583209991, "logits/rejected": 0.10021185874938965, "logps/chosen": -41.348716735839844, "logps/rejected": -53.517127990722656, "loss": 0.2768, "rewards/accuracies": 0.9375, "rewards/chosen": 0.03640615940093994, "rewards/margins": 3.230294704437256, "rewards/rejected": -3.1938881874084473, "step": 81 }, { "epoch": 0.9718518518518519, "grad_norm": 30.563184515965578, "learning_rate": 4.694763756709967e-07, "logits/chosen": -0.17636063694953918, "logits/rejected": -0.21469731628894806, "logps/chosen": -39.04137420654297, "logps/rejected": -52.79186248779297, "loss": 0.2415, "rewards/accuracies": 0.875, "rewards/chosen": -0.2509702444076538, "rewards/margins": 3.593198776245117, "rewards/rejected": -3.8441689014434814, "step": 82 }, { "epoch": 0.9837037037037037, "grad_norm": 28.891738925831007, "learning_rate": 4.6821924056021053e-07, "logits/chosen": -0.11742343008518219, "logits/rejected": 0.009909386746585369, "logps/chosen": -30.184894561767578, "logps/rejected": -65.191650390625, "loss": 0.2097, "rewards/accuracies": 0.9375, "rewards/chosen": -0.2642693817615509, "rewards/margins": 5.374275207519531, "rewards/rejected": -5.638545036315918, "step": 83 }, { "epoch": 0.9955555555555555, "grad_norm": 34.49435985459448, "learning_rate": 4.669384911544926e-07, "logits/chosen": -0.07497820258140564, "logits/rejected": -0.026325395330786705, "logps/chosen": -33.93386459350586, "logps/rejected": -50.840423583984375, "loss": 0.2826, "rewards/accuracies": 0.75, "rewards/chosen": -0.5045543909072876, "rewards/margins": 2.2327775955200195, "rewards/rejected": -2.7373318672180176, "step": 84 }, { "epoch": 1.0074074074074073, "grad_norm": 25.60399667566007, "learning_rate": 4.6563426604837817e-07, "logits/chosen": -0.07658643275499344, "logits/rejected": -0.06745781004428864, "logps/chosen": -45.01917266845703, "logps/rejected": -59.43114471435547, "loss": 0.1973, "rewards/accuracies": 0.875, "rewards/chosen": -0.29737135767936707, "rewards/margins": 5.139443874359131, "rewards/rejected": -5.4368157386779785, "step": 85 }, { "epoch": 1.0192592592592593, "grad_norm": 21.460741017889536, "learning_rate": 4.6430670637679294e-07, "logits/chosen": -0.2205628752708435, "logits/rejected": -0.0864521712064743, "logps/chosen": -31.902761459350586, "logps/rejected": -51.85260009765625, "loss": 0.1667, "rewards/accuracies": 1.0, "rewards/chosen": -0.194830521941185, "rewards/margins": 3.8075246810913086, "rewards/rejected": -4.002355098724365, "step": 86 }, { "epoch": 1.031111111111111, "grad_norm": 15.350784705262274, "learning_rate": 4.629559557997804e-07, "logits/chosen": -0.12179061770439148, "logits/rejected": -0.10868389904499054, "logps/chosen": -40.21116638183594, "logps/rejected": -62.20214080810547, "loss": 0.1195, "rewards/accuracies": 1.0, "rewards/chosen": -0.19310609996318817, "rewards/margins": 4.397428512573242, "rewards/rejected": -4.590534687042236, "step": 87 }, { "epoch": 1.0429629629629629, "grad_norm": 13.188223717252573, "learning_rate": 4.615821604869563e-07, "logits/chosen": -0.13621510565280914, "logits/rejected": -0.044875748455524445, "logps/chosen": -39.49315643310547, "logps/rejected": -65.40257263183594, "loss": 0.1026, "rewards/accuracies": 1.0, "rewards/chosen": -0.10489638894796371, "rewards/margins": 5.503582000732422, "rewards/rejected": -5.608478546142578, "step": 88 }, { "epoch": 1.0548148148148149, "grad_norm": 27.4435618488601, "learning_rate": 4.6018546910169067e-07, "logits/chosen": -0.2304653376340866, "logits/rejected": -0.30428558588027954, "logps/chosen": -38.354759216308594, "logps/rejected": -57.675498962402344, "loss": 0.1946, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1100564002990723, "rewards/margins": 4.001605033874512, "rewards/rejected": -5.111660957336426, "step": 89 }, { "epoch": 1.0666666666666667, "grad_norm": 18.807247445635458, "learning_rate": 4.5876603278502027e-07, "logits/chosen": -0.12274541705846786, "logits/rejected": 0.009613536298274994, "logps/chosen": -41.16261291503906, "logps/rejected": -73.863037109375, "loss": 0.1405, "rewards/accuracies": 1.0, "rewards/chosen": -0.8482255935668945, "rewards/margins": 5.244417667388916, "rewards/rejected": -6.0926432609558105, "step": 90 }, { "epoch": 1.0785185185185184, "grad_norm": 24.62512250085559, "learning_rate": 4.573240051392935e-07, "logits/chosen": -0.1303870528936386, "logits/rejected": -0.14886482059955597, "logps/chosen": -39.42107009887695, "logps/rejected": -54.32372283935547, "loss": 0.1612, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6151587963104248, "rewards/margins": 3.3702774047851562, "rewards/rejected": -3.98543643951416, "step": 91 }, { "epoch": 1.0903703703703704, "grad_norm": 18.29323289164473, "learning_rate": 4.5585954221154853e-07, "logits/chosen": -0.4573056101799011, "logits/rejected": -0.31245023012161255, "logps/chosen": -32.956050872802734, "logps/rejected": -59.963035583496094, "loss": 0.1649, "rewards/accuracies": 1.0, "rewards/chosen": -0.5122833251953125, "rewards/margins": 4.091685771942139, "rewards/rejected": -4.603969097137451, "step": 92 }, { "epoch": 1.1022222222222222, "grad_norm": 19.321908739095367, "learning_rate": 4.5437280247662646e-07, "logits/chosen": -0.023740939795970917, "logits/rejected": -0.0003247186541557312, "logps/chosen": -39.27166748046875, "logps/rejected": -55.00670623779297, "loss": 0.1251, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7074174880981445, "rewards/margins": 3.3870468139648438, "rewards/rejected": -4.094464302062988, "step": 93 }, { "epoch": 1.114074074074074, "grad_norm": 17.03649474940601, "learning_rate": 4.528639468200226e-07, "logits/chosen": 0.2261081337928772, "logits/rejected": 0.2597027122974396, "logps/chosen": -37.43301773071289, "logps/rejected": -51.47749328613281, "loss": 0.1186, "rewards/accuracies": 1.0, "rewards/chosen": 0.10310641676187515, "rewards/margins": 3.6022109985351562, "rewards/rejected": -3.4991047382354736, "step": 94 }, { "epoch": 1.125925925925926, "grad_norm": 16.559378309982034, "learning_rate": 4.5133313852047613e-07, "logits/chosen": -0.11725334078073502, "logits/rejected": -0.08741730451583862, "logps/chosen": -33.64775085449219, "logps/rejected": -55.1715087890625, "loss": 0.1273, "rewards/accuracies": 0.8125, "rewards/chosen": -0.01142565906047821, "rewards/margins": 3.431326150894165, "rewards/rejected": -3.442751407623291, "step": 95 }, { "epoch": 1.1377777777777778, "grad_norm": 20.611754629262013, "learning_rate": 4.4978054323230144e-07, "logits/chosen": 0.051543403416872025, "logits/rejected": 0.11705614626407623, "logps/chosen": -33.534278869628906, "logps/rejected": -51.728057861328125, "loss": 0.1365, "rewards/accuracies": 0.9375, "rewards/chosen": 0.07938066124916077, "rewards/margins": 3.254304885864258, "rewards/rejected": -3.17492413520813, "step": 96 }, { "epoch": 1.1496296296296296, "grad_norm": 12.616532653018533, "learning_rate": 4.482063289674618e-07, "logits/chosen": -0.0702984482049942, "logits/rejected": 0.004200812429189682, "logps/chosen": -34.993507385253906, "logps/rejected": -57.94596481323242, "loss": 0.0914, "rewards/accuracies": 1.0, "rewards/chosen": 0.19305679202079773, "rewards/margins": 4.31368350982666, "rewards/rejected": -4.120626449584961, "step": 97 }, { "epoch": 1.1614814814814816, "grad_norm": 12.762301209570738, "learning_rate": 4.466106660773884e-07, "logits/chosen": -0.1491287350654602, "logits/rejected": -0.03166097402572632, "logps/chosen": -39.788230895996094, "logps/rejected": -59.54990005493164, "loss": 0.094, "rewards/accuracies": 1.0, "rewards/chosen": -0.40045109391212463, "rewards/margins": 4.702445030212402, "rewards/rejected": -5.102896213531494, "step": 98 }, { "epoch": 1.1733333333333333, "grad_norm": 17.826250546113457, "learning_rate": 4.44993727234546e-07, "logits/chosen": 0.11196614801883698, "logits/rejected": 0.07824762165546417, "logps/chosen": -39.74019241333008, "logps/rejected": -49.98065185546875, "loss": 0.1199, "rewards/accuracies": 0.9375, "rewards/chosen": -0.37030303478240967, "rewards/margins": 2.883664608001709, "rewards/rejected": -3.25396728515625, "step": 99 }, { "epoch": 1.1851851851851851, "grad_norm": 16.340929612227512, "learning_rate": 4.4335568741374695e-07, "logits/chosen": -0.3065292239189148, "logits/rejected": -0.243222177028656, "logps/chosen": -39.87810516357422, "logps/rejected": -49.80472946166992, "loss": 0.1362, "rewards/accuracies": 0.9375, "rewards/chosen": -0.09828188270330429, "rewards/margins": 4.0216569900512695, "rewards/rejected": -4.119938850402832, "step": 100 }, { "epoch": 1.1970370370370371, "grad_norm": 15.511054375166765, "learning_rate": 4.4169672387321735e-07, "logits/chosen": -0.06340894848108292, "logits/rejected": -0.0665612518787384, "logps/chosen": -43.078765869140625, "logps/rejected": -65.42969512939453, "loss": 0.1102, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3077806830406189, "rewards/margins": 5.954348087310791, "rewards/rejected": -6.262128829956055, "step": 101 }, { "epoch": 1.208888888888889, "grad_norm": 17.09598873374456, "learning_rate": 4.4001701613541454e-07, "logits/chosen": 0.09894056618213654, "logits/rejected": 0.11925836652517319, "logps/chosen": -32.494293212890625, "logps/rejected": -51.34983825683594, "loss": 0.128, "rewards/accuracies": 1.0, "rewards/chosen": -0.22729653120040894, "rewards/margins": 4.20453405380249, "rewards/rejected": -4.431830406188965, "step": 102 }, { "epoch": 1.2207407407407407, "grad_norm": 22.603451847311774, "learning_rate": 4.383167459676008e-07, "logits/chosen": -0.09994232654571533, "logits/rejected": -0.03670894354581833, "logps/chosen": -34.782405853271484, "logps/rejected": -57.43785095214844, "loss": 0.1482, "rewards/accuracies": 0.9375, "rewards/chosen": -0.40158677101135254, "rewards/margins": 3.9667258262634277, "rewards/rejected": -4.368312358856201, "step": 103 }, { "epoch": 1.2325925925925927, "grad_norm": 17.176654931396367, "learning_rate": 4.365960973621734e-07, "logits/chosen": -0.3010917007923126, "logits/rejected": -0.2688751220703125, "logps/chosen": -30.779415130615234, "logps/rejected": -59.66587448120117, "loss": 0.1077, "rewards/accuracies": 0.875, "rewards/chosen": -0.5755228996276855, "rewards/margins": 5.500998497009277, "rewards/rejected": -6.076521396636963, "step": 104 }, { "epoch": 1.2444444444444445, "grad_norm": 16.54203727829074, "learning_rate": 4.348552565167542e-07, "logits/chosen": 0.03528839722275734, "logits/rejected": 0.025072041898965836, "logps/chosen": -35.11994171142578, "logps/rejected": -49.86817169189453, "loss": 0.1169, "rewards/accuracies": 1.0, "rewards/chosen": -0.683509349822998, "rewards/margins": 4.702343940734863, "rewards/rejected": -5.385853290557861, "step": 105 }, { "epoch": 1.2562962962962962, "grad_norm": 26.56869923226247, "learning_rate": 4.330944118140406e-07, "logits/chosen": -0.08033540099859238, "logits/rejected": -0.015175499022006989, "logps/chosen": -40.06020736694336, "logps/rejected": -58.90888977050781, "loss": 0.1479, "rewards/accuracies": 0.9375, "rewards/chosen": 0.02627614513039589, "rewards/margins": 4.818339824676514, "rewards/rejected": -4.7920637130737305, "step": 106 }, { "epoch": 1.268148148148148, "grad_norm": 14.298587740255858, "learning_rate": 4.313137538014198e-07, "logits/chosen": -0.08439959585666656, "logits/rejected": -0.15276381373405457, "logps/chosen": -34.021949768066406, "logps/rejected": -43.68275451660156, "loss": 0.0835, "rewards/accuracies": 0.9375, "rewards/chosen": 0.03106861561536789, "rewards/margins": 3.872872829437256, "rewards/rejected": -3.841804265975952, "step": 107 }, { "epoch": 1.28, "grad_norm": 25.029691995336584, "learning_rate": 4.295134751703492e-07, "logits/chosen": 0.039663467556238174, "logits/rejected": 0.03390619903802872, "logps/chosen": -50.46000671386719, "logps/rejected": -61.62244415283203, "loss": 0.1408, "rewards/accuracies": 0.9375, "rewards/chosen": -0.41419780254364014, "rewards/margins": 5.797426223754883, "rewards/rejected": -6.2116241455078125, "step": 108 }, { "epoch": 1.2918518518518518, "grad_norm": 23.688132743134673, "learning_rate": 4.276937707355044e-07, "logits/chosen": -0.017152896150946617, "logits/rejected": -0.053842976689338684, "logps/chosen": -41.905521392822266, "logps/rejected": -65.56050109863281, "loss": 0.1163, "rewards/accuracies": 1.0, "rewards/chosen": -0.573278546333313, "rewards/margins": 6.835091590881348, "rewards/rejected": -7.408369064331055, "step": 109 }, { "epoch": 1.3037037037037038, "grad_norm": 22.26748210189321, "learning_rate": 4.2585483741369755e-07, "logits/chosen": -0.3594672679901123, "logits/rejected": -0.2787400782108307, "logps/chosen": -33.10090255737305, "logps/rejected": -65.98353576660156, "loss": 0.1206, "rewards/accuracies": 1.0, "rewards/chosen": -1.1710546016693115, "rewards/margins": 5.390981197357178, "rewards/rejected": -6.562036037445068, "step": 110 }, { "epoch": 1.3155555555555556, "grad_norm": 12.864266715484298, "learning_rate": 4.239968742025684e-07, "logits/chosen": -0.1566499024629593, "logits/rejected": -0.02022075653076172, "logps/chosen": -31.641050338745117, "logps/rejected": -70.607421875, "loss": 0.0872, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5755561590194702, "rewards/margins": 5.668292999267578, "rewards/rejected": -6.243849277496338, "step": 111 }, { "epoch": 1.3274074074074074, "grad_norm": 11.527896741156843, "learning_rate": 4.2212008215905e-07, "logits/chosen": -0.18084248900413513, "logits/rejected": -0.06485521793365479, "logps/chosen": -32.15618133544922, "logps/rejected": -60.565914154052734, "loss": 0.0793, "rewards/accuracies": 1.0, "rewards/chosen": -0.524111270904541, "rewards/margins": 4.708486557006836, "rewards/rejected": -5.2325968742370605, "step": 112 }, { "epoch": 1.3392592592592591, "grad_norm": 18.981341764098254, "learning_rate": 4.2022466437761154e-07, "logits/chosen": 0.11151312291622162, "logits/rejected": 0.2410213202238083, "logps/chosen": -35.73467254638672, "logps/rejected": -61.45711135864258, "loss": 0.1104, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03812497854232788, "rewards/margins": 3.9962849617004395, "rewards/rejected": -4.034409999847412, "step": 113 }, { "epoch": 1.3511111111111112, "grad_norm": 15.055020570658602, "learning_rate": 4.18310825968281e-07, "logits/chosen": -0.13380703330039978, "logits/rejected": -0.056154295802116394, "logps/chosen": -47.65923309326172, "logps/rejected": -68.50222778320312, "loss": 0.097, "rewards/accuracies": 1.0, "rewards/chosen": -1.0497931241989136, "rewards/margins": 5.255577564239502, "rewards/rejected": -6.305370807647705, "step": 114 }, { "epoch": 1.362962962962963, "grad_norm": 15.94507937001582, "learning_rate": 4.1637877403444923e-07, "logits/chosen": -0.13782085478305817, "logits/rejected": -0.1237938329577446, "logps/chosen": -34.57268142700195, "logps/rejected": -60.96284103393555, "loss": 0.1328, "rewards/accuracies": 1.0, "rewards/chosen": 0.2373143583536148, "rewards/margins": 5.874894142150879, "rewards/rejected": -5.637579917907715, "step": 115 }, { "epoch": 1.374814814814815, "grad_norm": 17.6499216351512, "learning_rate": 4.144287176504582e-07, "logits/chosen": -0.028891492635011673, "logits/rejected": 0.010559901595115662, "logps/chosen": -41.175785064697266, "logps/rejected": -56.148109436035156, "loss": 0.1233, "rewards/accuracies": 0.875, "rewards/chosen": -0.5991055965423584, "rewards/margins": 4.076951503753662, "rewards/rejected": -4.6760573387146, "step": 116 }, { "epoch": 1.3866666666666667, "grad_norm": 17.440881402981706, "learning_rate": 4.1246086783897713e-07, "logits/chosen": -0.07107866555452347, "logits/rejected": -0.05355262756347656, "logps/chosen": -29.337356567382812, "logps/rejected": -58.07263946533203, "loss": 0.107, "rewards/accuracies": 1.0, "rewards/chosen": 0.25651735067367554, "rewards/margins": 5.097599983215332, "rewards/rejected": -4.841082572937012, "step": 117 }, { "epoch": 1.3985185185185185, "grad_norm": 20.485113734375798, "learning_rate": 4.104754375481664e-07, "logits/chosen": -0.027082689106464386, "logits/rejected": 0.0022195279598236084, "logps/chosen": -35.18750762939453, "logps/rejected": -54.038787841796875, "loss": 0.0963, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7344773411750793, "rewards/margins": 4.06456995010376, "rewards/rejected": -4.799046993255615, "step": 118 }, { "epoch": 1.4103703703703703, "grad_norm": 15.73045559795655, "learning_rate": 4.084726416286337e-07, "logits/chosen": -0.2252836376428604, "logits/rejected": -0.1367400735616684, "logps/chosen": -28.606430053710938, "logps/rejected": -54.020267486572266, "loss": 0.0872, "rewards/accuracies": 1.0, "rewards/chosen": 0.016864344477653503, "rewards/margins": 4.223577499389648, "rewards/rejected": -4.2067131996154785, "step": 119 }, { "epoch": 1.4222222222222223, "grad_norm": 18.987468125277015, "learning_rate": 4.0645269681018434e-07, "logits/chosen": -0.16107773780822754, "logits/rejected": -0.030207287520170212, "logps/chosen": -29.666810989379883, "logps/rejected": -61.85116958618164, "loss": 0.119, "rewards/accuracies": 0.9375, "rewards/chosen": -0.07655519992113113, "rewards/margins": 5.755571365356445, "rewards/rejected": -5.832127571105957, "step": 120 }, { "epoch": 1.434074074074074, "grad_norm": 14.503662362117073, "learning_rate": 4.044158216783684e-07, "logits/chosen": -0.515570878982544, "logits/rejected": -0.3124653100967407, "logps/chosen": -37.86850357055664, "logps/rejected": -69.43701934814453, "loss": 0.0985, "rewards/accuracies": 1.0, "rewards/chosen": -0.2003718763589859, "rewards/margins": 7.45107364654541, "rewards/rejected": -7.6514458656311035, "step": 121 }, { "epoch": 1.445925925925926, "grad_norm": 18.569284512541163, "learning_rate": 4.0236223665082605e-07, "logits/chosen": -0.21073125302791595, "logits/rejected": -0.2305406928062439, "logps/chosen": -31.442550659179688, "logps/rejected": -55.32566833496094, "loss": 0.079, "rewards/accuracies": 1.0, "rewards/chosen": -0.6064890027046204, "rewards/margins": 5.698796272277832, "rewards/rejected": -6.3052849769592285, "step": 122 }, { "epoch": 1.4577777777777778, "grad_norm": 17.336812678453057, "learning_rate": 4.0029216395343617e-07, "logits/chosen": -0.06248122453689575, "logits/rejected": -0.05541558563709259, "logps/chosen": -35.73994445800781, "logps/rejected": -60.30720901489258, "loss": 0.1032, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7068940997123718, "rewards/margins": 5.66333532333374, "rewards/rejected": -6.3702287673950195, "step": 123 }, { "epoch": 1.4696296296296296, "grad_norm": 14.094947491287853, "learning_rate": 3.982058275962682e-07, "logits/chosen": -0.21389123797416687, "logits/rejected": -0.16907303035259247, "logps/chosen": -28.326904296875, "logps/rejected": -56.3559455871582, "loss": 0.0846, "rewards/accuracies": 1.0, "rewards/chosen": -0.10696560144424438, "rewards/margins": 4.861944675445557, "rewards/rejected": -4.968909740447998, "step": 124 }, { "epoch": 1.4814814814814814, "grad_norm": 16.97368933164184, "learning_rate": 3.9610345334934094e-07, "logits/chosen": -0.13031917810440063, "logits/rejected": -0.03798413649201393, "logps/chosen": -41.05464172363281, "logps/rejected": -65.29788208007812, "loss": 0.1205, "rewards/accuracies": 1.0, "rewards/chosen": 0.038674697279930115, "rewards/margins": 6.0994062423706055, "rewards/rejected": -6.060731887817383, "step": 125 }, { "epoch": 1.4933333333333334, "grad_norm": 15.284029740961785, "learning_rate": 3.939852687181915e-07, "logits/chosen": -0.14361430704593658, "logits/rejected": -0.10849830508232117, "logps/chosen": -36.10377502441406, "logps/rejected": -67.89326477050781, "loss": 0.0939, "rewards/accuracies": 1.0, "rewards/chosen": -0.7120703458786011, "rewards/margins": 5.921784400939941, "rewards/rejected": -6.633854389190674, "step": 126 }, { "epoch": 1.5051851851851852, "grad_norm": 18.417078359527594, "learning_rate": 3.9185150291925585e-07, "logits/chosen": -0.21707814931869507, "logits/rejected": -0.20940172672271729, "logps/chosen": -36.678348541259766, "logps/rejected": -59.70578384399414, "loss": 0.1107, "rewards/accuracies": 1.0, "rewards/chosen": -1.0581213235855103, "rewards/margins": 5.090505123138428, "rewards/rejected": -6.14862585067749, "step": 127 }, { "epoch": 1.5170370370370372, "grad_norm": 15.713415294775897, "learning_rate": 3.8970238685506486e-07, "logits/chosen": -0.04535888880491257, "logits/rejected": 0.03404983878135681, "logps/chosen": -34.13862609863281, "logps/rejected": -66.47319793701172, "loss": 0.0976, "rewards/accuracies": 0.9375, "rewards/chosen": -0.735641360282898, "rewards/margins": 5.098552227020264, "rewards/rejected": -5.834194183349609, "step": 128 }, { "epoch": 1.528888888888889, "grad_norm": 15.290294429948705, "learning_rate": 3.8753815308925685e-07, "logits/chosen": -0.44447335600852966, "logits/rejected": -0.5022441148757935, "logps/chosen": -34.69385528564453, "logps/rejected": -63.007972717285156, "loss": 0.0908, "rewards/accuracies": 1.0, "rewards/chosen": -0.753921389579773, "rewards/margins": 6.32866907119751, "rewards/rejected": -7.082590579986572, "step": 129 }, { "epoch": 1.5407407407407407, "grad_norm": 22.7399302377654, "learning_rate": 3.8535903582141184e-07, "logits/chosen": -0.3744094669818878, "logits/rejected": -0.2060033231973648, "logps/chosen": -32.91036605834961, "logps/rejected": -61.743282318115234, "loss": 0.1268, "rewards/accuracies": 1.0, "rewards/chosen": -0.4923054873943329, "rewards/margins": 5.070975303649902, "rewards/rejected": -5.5632805824279785, "step": 130 }, { "epoch": 1.5525925925925925, "grad_norm": 19.47533309428857, "learning_rate": 3.8316527086170727e-07, "logits/chosen": -0.1633462905883789, "logits/rejected": -0.07288794964551926, "logps/chosen": -36.646484375, "logps/rejected": -59.22307205200195, "loss": 0.125, "rewards/accuracies": 1.0, "rewards/chosen": -0.2637479305267334, "rewards/margins": 5.560683727264404, "rewards/rejected": -5.824431419372559, "step": 131 }, { "epoch": 1.5644444444444443, "grad_norm": 18.594859296902843, "learning_rate": 3.809570956054003e-07, "logits/chosen": -0.5365747213363647, "logits/rejected": -0.39433979988098145, "logps/chosen": -31.459264755249023, "logps/rejected": -62.012969970703125, "loss": 0.1097, "rewards/accuracies": 0.9375, "rewards/chosen": -1.048090934753418, "rewards/margins": 5.831287860870361, "rewards/rejected": -6.879378318786621, "step": 132 }, { "epoch": 1.5762962962962963, "grad_norm": 14.282275106755108, "learning_rate": 3.787347490071389e-07, "logits/chosen": -0.20027217268943787, "logits/rejected": -0.11141454428434372, "logps/chosen": -39.01911163330078, "logps/rejected": -64.89835357666016, "loss": 0.0991, "rewards/accuracies": 1.0, "rewards/chosen": -0.3214702308177948, "rewards/margins": 5.419382095336914, "rewards/rejected": -5.740852355957031, "step": 133 }, { "epoch": 1.5881481481481483, "grad_norm": 13.42088885786826, "learning_rate": 3.764984715551031e-07, "logits/chosen": -0.12170754373073578, "logits/rejected": -0.046029090881347656, "logps/chosen": -29.234134674072266, "logps/rejected": -60.45512771606445, "loss": 0.0895, "rewards/accuracies": 1.0, "rewards/chosen": -0.1851249635219574, "rewards/margins": 6.193130970001221, "rewards/rejected": -6.378255844116211, "step": 134 }, { "epoch": 1.6, "grad_norm": 17.391251462773063, "learning_rate": 3.7424850524498113e-07, "logits/chosen": -0.18073627352714539, "logits/rejected": -0.05371435731649399, "logps/chosen": -35.178985595703125, "logps/rejected": -62.41261672973633, "loss": 0.112, "rewards/accuracies": 1.0, "rewards/chosen": 0.027589425444602966, "rewards/margins": 5.411984443664551, "rewards/rejected": -5.384395599365234, "step": 135 }, { "epoch": 1.6118518518518519, "grad_norm": 13.216285367583016, "learning_rate": 3.7198509355378207e-07, "logits/chosen": -0.3801528811454773, "logits/rejected": -0.3222590982913971, "logps/chosen": -40.775054931640625, "logps/rejected": -52.93413543701172, "loss": 0.1169, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5116928815841675, "rewards/margins": 3.9808902740478516, "rewards/rejected": -5.49258279800415, "step": 136 }, { "epoch": 1.6237037037037036, "grad_norm": 20.45643460429083, "learning_rate": 3.6970848141348855e-07, "logits/chosen": -0.17568367719650269, "logits/rejected": -0.11519981920719147, "logps/chosen": -39.35060501098633, "logps/rejected": -59.42844009399414, "loss": 0.1259, "rewards/accuracies": 1.0, "rewards/chosen": -0.02011704444885254, "rewards/margins": 5.978552341461182, "rewards/rejected": -5.998669624328613, "step": 137 }, { "epoch": 1.6355555555555554, "grad_norm": 20.16029126070904, "learning_rate": 3.6741891518455146e-07, "logits/chosen": -0.16909295320510864, "logits/rejected": -0.11791606992483139, "logps/chosen": -39.2324333190918, "logps/rejected": -67.33000183105469, "loss": 0.0818, "rewards/accuracies": 1.0, "rewards/chosen": -0.9399681091308594, "rewards/margins": 5.435766220092773, "rewards/rejected": -6.375733375549316, "step": 138 }, { "epoch": 1.6474074074074074, "grad_norm": 16.83558679733193, "learning_rate": 3.6511664262923094e-07, "logits/chosen": -0.2512515187263489, "logits/rejected": -0.12882237136363983, "logps/chosen": -27.65873908996582, "logps/rejected": -61.08441925048828, "loss": 0.0919, "rewards/accuracies": 1.0, "rewards/chosen": -0.27784237265586853, "rewards/margins": 6.476930141448975, "rewards/rejected": -6.754772663116455, "step": 139 }, { "epoch": 1.6592592592592592, "grad_norm": 12.155079766899926, "learning_rate": 3.6280191288478435e-07, "logits/chosen": -0.15503238141536713, "logits/rejected": -0.06224162131547928, "logps/chosen": -34.50273895263672, "logps/rejected": -62.98163604736328, "loss": 0.0725, "rewards/accuracies": 1.0, "rewards/chosen": -0.17935335636138916, "rewards/margins": 5.786401271820068, "rewards/rejected": -5.965755462646484, "step": 140 }, { "epoch": 1.6711111111111112, "grad_norm": 14.838007532934355, "learning_rate": 3.604749764365069e-07, "logits/chosen": -0.2061130404472351, "logits/rejected": -0.11659687012434006, "logps/chosen": -27.322792053222656, "logps/rejected": -62.887542724609375, "loss": 0.0849, "rewards/accuracies": 1.0, "rewards/chosen": -0.2183775156736374, "rewards/margins": 7.025249481201172, "rewards/rejected": -7.243628025054932, "step": 141 }, { "epoch": 1.682962962962963, "grad_norm": 18.816292977620716, "learning_rate": 3.5813608509062526e-07, "logits/chosen": -0.21707522869110107, "logits/rejected": -0.09010873734951019, "logps/chosen": -36.751190185546875, "logps/rejected": -75.8062744140625, "loss": 0.1009, "rewards/accuracies": 1.0, "rewards/chosen": -0.9384999871253967, "rewards/margins": 6.734908580780029, "rewards/rejected": -7.673408508300781, "step": 142 }, { "epoch": 1.6948148148148148, "grad_norm": 19.8821437700751, "learning_rate": 3.557854919470491e-07, "logits/chosen": -0.27580782771110535, "logits/rejected": -0.22370710968971252, "logps/chosen": -44.46840286254883, "logps/rejected": -56.556922912597656, "loss": 0.1165, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3054481744766235, "rewards/margins": 3.6606221199035645, "rewards/rejected": -4.966070175170898, "step": 143 }, { "epoch": 1.7066666666666666, "grad_norm": 16.302116776290895, "learning_rate": 3.5342345137198206e-07, "logits/chosen": -0.12199485301971436, "logits/rejected": -0.08353496342897415, "logps/chosen": -38.30879211425781, "logps/rejected": -51.09149932861328, "loss": 0.112, "rewards/accuracies": 0.8125, "rewards/chosen": -0.1275089532136917, "rewards/margins": 3.7598049640655518, "rewards/rejected": -3.8873140811920166, "step": 144 }, { "epoch": 1.7185185185185186, "grad_norm": 15.133588204814354, "learning_rate": 3.510502189703954e-07, "logits/chosen": -0.10772836208343506, "logits/rejected": -0.06885837763547897, "logps/chosen": -40.77737045288086, "logps/rejected": -69.48592376708984, "loss": 0.0693, "rewards/accuracies": 1.0, "rewards/chosen": -0.8564466834068298, "rewards/margins": 8.315703392028809, "rewards/rejected": -9.172150611877441, "step": 145 }, { "epoch": 1.7303703703703703, "grad_norm": 19.362732647667297, "learning_rate": 3.486660515583691e-07, "logits/chosen": -0.2726586163043976, "logits/rejected": -0.21814075112342834, "logps/chosen": -30.717233657836914, "logps/rejected": -65.25225830078125, "loss": 0.0885, "rewards/accuracies": 1.0, "rewards/chosen": -0.21597516536712646, "rewards/margins": 6.636316776275635, "rewards/rejected": -6.852292060852051, "step": 146 }, { "epoch": 1.7422222222222223, "grad_norm": 11.230644499357242, "learning_rate": 3.4627120713529983e-07, "logits/chosen": -0.2115684449672699, "logits/rejected": -0.11108352243900299, "logps/chosen": -34.891475677490234, "logps/rejected": -77.10374450683594, "loss": 0.0803, "rewards/accuracies": 1.0, "rewards/chosen": -1.222525954246521, "rewards/margins": 9.00995922088623, "rewards/rejected": -10.232484817504883, "step": 147 }, { "epoch": 1.7540740740740741, "grad_norm": 13.137309578079025, "learning_rate": 3.438659448559825e-07, "logits/chosen": -0.13276290893554688, "logits/rejected": -0.10878665745258331, "logps/chosen": -37.001060485839844, "logps/rejected": -73.28951263427734, "loss": 0.067, "rewards/accuracies": 1.0, "rewards/chosen": -0.598874568939209, "rewards/margins": 7.266829013824463, "rewards/rejected": -7.865704536437988, "step": 148 }, { "epoch": 1.765925925925926, "grad_norm": 21.677182436717583, "learning_rate": 3.414505250025659e-07, "logits/chosen": -0.04767221957445145, "logits/rejected": 0.07465275377035141, "logps/chosen": -40.81106185913086, "logps/rejected": -67.47488403320312, "loss": 0.0942, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7800440788269043, "rewards/margins": 5.173450469970703, "rewards/rejected": -5.953495025634766, "step": 149 }, { "epoch": 1.7777777777777777, "grad_norm": 16.148540981867313, "learning_rate": 3.390252089563867e-07, "logits/chosen": -0.18858963251113892, "logits/rejected": -0.20340172946453094, "logps/chosen": -36.109954833984375, "logps/rejected": -55.27710723876953, "loss": 0.0796, "rewards/accuracies": 1.0, "rewards/chosen": 0.0519292950630188, "rewards/margins": 5.729560852050781, "rewards/rejected": -5.677631855010986, "step": 150 }, { "epoch": 1.7896296296296297, "grad_norm": 24.312535239343674, "learning_rate": 3.3659025916968475e-07, "logits/chosen": -0.2818453013896942, "logits/rejected": -0.19897550344467163, "logps/chosen": -38.93151092529297, "logps/rejected": -76.9036636352539, "loss": 0.1537, "rewards/accuracies": 1.0, "rewards/chosen": -0.5907295346260071, "rewards/margins": 6.914373397827148, "rewards/rejected": -7.505102634429932, "step": 151 }, { "epoch": 1.8014814814814815, "grad_norm": 20.831076322049565, "learning_rate": 3.3414593913720155e-07, "logits/chosen": -0.22526244819164276, "logits/rejected": -0.18128839135169983, "logps/chosen": -34.0900764465332, "logps/rejected": -55.26824951171875, "loss": 0.126, "rewards/accuracies": 0.875, "rewards/chosen": -0.0966421514749527, "rewards/margins": 5.949459075927734, "rewards/rejected": -6.046100616455078, "step": 152 }, { "epoch": 1.8133333333333335, "grad_norm": 11.740893074967538, "learning_rate": 3.3169251336766697e-07, "logits/chosen": -0.1995951235294342, "logits/rejected": -0.08064538240432739, "logps/chosen": -34.26948165893555, "logps/rejected": -62.14799499511719, "loss": 0.0776, "rewards/accuracies": 1.0, "rewards/chosen": -1.565490484237671, "rewards/margins": 5.796334266662598, "rewards/rejected": -7.361824989318848, "step": 153 }, { "epoch": 1.8251851851851852, "grad_norm": 16.192262072429642, "learning_rate": 3.2923024735517567e-07, "logits/chosen": -0.3225496709346771, "logits/rejected": -0.23174233734607697, "logps/chosen": -31.906171798706055, "logps/rejected": -58.93853759765625, "loss": 0.0956, "rewards/accuracies": 0.9375, "rewards/chosen": -0.29122716188430786, "rewards/margins": 5.078674793243408, "rewards/rejected": -5.369902610778809, "step": 154 }, { "epoch": 1.837037037037037, "grad_norm": 20.0031978017647, "learning_rate": 3.2675940755045713e-07, "logits/chosen": 0.008214278146624565, "logits/rejected": 0.14055991172790527, "logps/chosen": -46.526527404785156, "logps/rejected": -81.31307983398438, "loss": 0.1503, "rewards/accuracies": 0.875, "rewards/chosen": -0.6333274245262146, "rewards/margins": 6.6069817543029785, "rewards/rejected": -7.240309238433838, "step": 155 }, { "epoch": 1.8488888888888888, "grad_norm": 21.076223385192634, "learning_rate": 3.242802613320418e-07, "logits/chosen": 0.0031320489943027496, "logits/rejected": 0.012668165378272533, "logps/chosen": -37.56488037109375, "logps/rejected": -64.01717376708984, "loss": 0.1549, "rewards/accuracies": 1.0, "rewards/chosen": -0.7534648180007935, "rewards/margins": 6.474049091339111, "rewards/rejected": -7.227513313293457, "step": 156 }, { "epoch": 1.8607407407407406, "grad_norm": 16.652570591848395, "learning_rate": 3.217930769773275e-07, "logits/chosen": -0.35275697708129883, "logits/rejected": -0.24555784463882446, "logps/chosen": -33.54517364501953, "logps/rejected": -62.51717758178711, "loss": 0.0959, "rewards/accuracies": 0.9375, "rewards/chosen": -0.42744529247283936, "rewards/margins": 6.722561836242676, "rewards/rejected": -7.150007247924805, "step": 157 }, { "epoch": 1.8725925925925926, "grad_norm": 12.597802388134623, "learning_rate": 3.1929812363354764e-07, "logits/chosen": -0.2830018103122711, "logits/rejected": -0.1911703646183014, "logps/chosen": -34.85630416870117, "logps/rejected": -65.27281188964844, "loss": 0.0536, "rewards/accuracies": 1.0, "rewards/chosen": -0.6973313689231873, "rewards/margins": 5.912741184234619, "rewards/rejected": -6.610072612762451, "step": 158 }, { "epoch": 1.8844444444444446, "grad_norm": 12.10851684482882, "learning_rate": 3.167956712886463e-07, "logits/chosen": -0.11961568146944046, "logits/rejected": -0.07325749099254608, "logps/chosen": -39.96379470825195, "logps/rejected": -55.56465148925781, "loss": 0.061, "rewards/accuracies": 1.0, "rewards/chosen": -1.3007912635803223, "rewards/margins": 4.756231307983398, "rewards/rejected": -6.0570220947265625, "step": 159 }, { "epoch": 1.8962962962962964, "grad_norm": 20.233134180855856, "learning_rate": 3.142859907420615e-07, "logits/chosen": -0.10496459901332855, "logits/rejected": 0.02405383251607418, "logps/chosen": -33.706703186035156, "logps/rejected": -68.788818359375, "loss": 0.1011, "rewards/accuracies": 1.0, "rewards/chosen": -0.46070706844329834, "rewards/margins": 5.680697917938232, "rewards/rejected": -6.14140510559082, "step": 160 }, { "epoch": 1.9081481481481481, "grad_norm": 19.119673878649753, "learning_rate": 3.117693535754213e-07, "logits/chosen": -0.10953935980796814, "logits/rejected": -0.043600670993328094, "logps/chosen": -32.68844985961914, "logps/rejected": -67.97357177734375, "loss": 0.098, "rewards/accuracies": 0.875, "rewards/chosen": 0.133280411362648, "rewards/margins": 7.29306173324585, "rewards/rejected": -7.159781455993652, "step": 161 }, { "epoch": 1.92, "grad_norm": 20.235536301916813, "learning_rate": 3.092460321231547e-07, "logits/chosen": -0.25959259271621704, "logits/rejected": -0.20939543843269348, "logps/chosen": -35.41835021972656, "logps/rejected": -67.97672271728516, "loss": 0.1272, "rewards/accuracies": 0.9375, "rewards/chosen": -1.018587589263916, "rewards/margins": 7.818422317504883, "rewards/rejected": -8.837010383605957, "step": 162 }, { "epoch": 1.9318518518518517, "grad_norm": 13.909230300778022, "learning_rate": 3.0671629944302164e-07, "logits/chosen": -0.12026870250701904, "logits/rejected": -0.1089366227388382, "logps/chosen": -36.70647048950195, "logps/rejected": -53.47910690307617, "loss": 0.0742, "rewards/accuracies": 1.0, "rewards/chosen": -0.5273434519767761, "rewards/margins": 5.643020153045654, "rewards/rejected": -6.170363903045654, "step": 163 }, { "epoch": 1.9437037037037037, "grad_norm": 12.864589613039078, "learning_rate": 3.0418042928656415e-07, "logits/chosen": -0.19225530326366425, "logits/rejected": -0.10623307526111603, "logps/chosen": -30.59122085571289, "logps/rejected": -59.536502838134766, "loss": 0.0836, "rewards/accuracies": 1.0, "rewards/chosen": -0.13015054166316986, "rewards/margins": 5.6972761154174805, "rewards/rejected": -5.827426910400391, "step": 164 }, { "epoch": 1.9555555555555557, "grad_norm": 15.975537356292229, "learning_rate": 3.016386960694827e-07, "logits/chosen": -0.41196244955062866, "logits/rejected": -0.28604307770729065, "logps/chosen": -39.74413299560547, "logps/rejected": -67.16585540771484, "loss": 0.1075, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3397538661956787, "rewards/margins": 5.61893892288208, "rewards/rejected": -6.958693027496338, "step": 165 }, { "epoch": 1.9674074074074075, "grad_norm": 32.234428734857076, "learning_rate": 2.990913748419411e-07, "logits/chosen": 0.07510136812925339, "logits/rejected": 0.11186552792787552, "logps/chosen": -41.38081741333008, "logps/rejected": -67.78083038330078, "loss": 0.1778, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3183833360671997, "rewards/margins": 5.662228107452393, "rewards/rejected": -5.980611801147461, "step": 166 }, { "epoch": 1.9792592592592593, "grad_norm": 19.969450441352706, "learning_rate": 2.9653874125880167e-07, "logits/chosen": -0.17725233733654022, "logits/rejected": -0.10895150154829025, "logps/chosen": -36.67229461669922, "logps/rejected": -61.394744873046875, "loss": 0.1436, "rewards/accuracies": 1.0, "rewards/chosen": 0.03877316415309906, "rewards/margins": 5.38054895401001, "rewards/rejected": -5.341775894165039, "step": 167 }, { "epoch": 1.991111111111111, "grad_norm": 11.405884683324148, "learning_rate": 2.9398107154979634e-07, "logits/chosen": -0.21582955121994019, "logits/rejected": -0.1547984778881073, "logps/chosen": -33.66050338745117, "logps/rejected": -71.53202056884766, "loss": 0.0568, "rewards/accuracies": 1.0, "rewards/chosen": -0.2750725746154785, "rewards/margins": 6.011247158050537, "rewards/rejected": -6.286320209503174, "step": 168 }, { "epoch": 2.002962962962963, "grad_norm": 10.684152961615103, "learning_rate": 2.9141864248963427e-07, "logits/chosen": -0.43596649169921875, "logits/rejected": -0.3842291235923767, "logps/chosen": -39.145042419433594, "logps/rejected": -52.33580780029297, "loss": 0.0686, "rewards/accuracies": 1.0, "rewards/chosen": -0.2999729812145233, "rewards/margins": 5.18059778213501, "rewards/rejected": -5.4805707931518555, "step": 169 }, { "epoch": 2.0148148148148146, "grad_norm": 7.120772046885603, "learning_rate": 2.8885173136805125e-07, "logits/chosen": -0.0826089009642601, "logits/rejected": 0.052907198667526245, "logps/chosen": -34.04817199707031, "logps/rejected": -73.21731567382812, "loss": 0.0483, "rewards/accuracies": 1.0, "rewards/chosen": -0.0369320884346962, "rewards/margins": 5.923883438110352, "rewards/rejected": -5.960815906524658, "step": 170 }, { "epoch": 2.026666666666667, "grad_norm": 6.919197934532035, "learning_rate": 2.862806159598032e-07, "logits/chosen": -0.45804017782211304, "logits/rejected": -0.408170610666275, "logps/chosen": -35.05866241455078, "logps/rejected": -58.95463562011719, "loss": 0.0433, "rewards/accuracies": 1.0, "rewards/chosen": 0.28513020277023315, "rewards/margins": 6.706875324249268, "rewards/rejected": -6.421745300292969, "step": 171 }, { "epoch": 2.0385185185185186, "grad_norm": 7.6734571457046075, "learning_rate": 2.837055744946072e-07, "logits/chosen": -0.20093482732772827, "logits/rejected": -0.15153169631958008, "logps/chosen": -26.53860092163086, "logps/rejected": -60.75783157348633, "loss": 0.0522, "rewards/accuracies": 1.0, "rewards/chosen": 0.1763535887002945, "rewards/margins": 6.684775352478027, "rewards/rejected": -6.508421421051025, "step": 172 }, { "epoch": 2.0503703703703704, "grad_norm": 7.089127360949598, "learning_rate": 2.811268856270332e-07, "logits/chosen": -0.19816571474075317, "logits/rejected": -0.17036175727844238, "logps/chosen": -30.929105758666992, "logps/rejected": -63.191619873046875, "loss": 0.0436, "rewards/accuracies": 1.0, "rewards/chosen": -0.12159377336502075, "rewards/margins": 6.226848602294922, "rewards/rejected": -6.348442077636719, "step": 173 }, { "epoch": 2.062222222222222, "grad_norm": 6.879444744601849, "learning_rate": 2.7854482840634965e-07, "logits/chosen": -0.3685060143470764, "logits/rejected": -0.2508692145347595, "logps/chosen": -30.003223419189453, "logps/rejected": -66.00552368164062, "loss": 0.0503, "rewards/accuracies": 1.0, "rewards/chosen": -0.6152498126029968, "rewards/margins": 7.741688251495361, "rewards/rejected": -8.356938362121582, "step": 174 }, { "epoch": 2.074074074074074, "grad_norm": 7.35254551338382, "learning_rate": 2.759596822463267e-07, "logits/chosen": -0.23846019804477692, "logits/rejected": -0.21598272025585175, "logps/chosen": -35.864341735839844, "logps/rejected": -60.58774948120117, "loss": 0.0493, "rewards/accuracies": 1.0, "rewards/chosen": -0.5300172567367554, "rewards/margins": 6.528465747833252, "rewards/rejected": -7.0584821701049805, "step": 175 }, { "epoch": 2.0859259259259257, "grad_norm": 6.0224009777317535, "learning_rate": 2.73371726895e-07, "logits/chosen": -0.4624573588371277, "logits/rejected": -0.3582268953323364, "logps/chosen": -38.6595573425293, "logps/rejected": -68.8594970703125, "loss": 0.035, "rewards/accuracies": 1.0, "rewards/chosen": -0.1498335599899292, "rewards/margins": 6.414737224578857, "rewards/rejected": -6.564570903778076, "step": 176 }, { "epoch": 2.097777777777778, "grad_norm": 5.373444495677693, "learning_rate": 2.7078124240439793e-07, "logits/chosen": -0.293597012758255, "logits/rejected": -0.18846404552459717, "logps/chosen": -37.71804428100586, "logps/rejected": -79.11727905273438, "loss": 0.0295, "rewards/accuracies": 1.0, "rewards/chosen": -0.9366405606269836, "rewards/margins": 8.907712936401367, "rewards/rejected": -9.844353675842285, "step": 177 }, { "epoch": 2.1096296296296297, "grad_norm": 6.426554871574788, "learning_rate": 2.68188509100236e-07, "logits/chosen": -0.07860371470451355, "logits/rejected": -0.032592758536338806, "logps/chosen": -36.26288604736328, "logps/rejected": -67.72529602050781, "loss": 0.041, "rewards/accuracies": 1.0, "rewards/chosen": -0.021683871746063232, "rewards/margins": 6.0264506340026855, "rewards/rejected": -6.048134803771973, "step": 178 }, { "epoch": 2.1214814814814815, "grad_norm": 7.4465908422134435, "learning_rate": 2.6559380755158206e-07, "logits/chosen": -0.2125643938779831, "logits/rejected": -0.11938470602035522, "logps/chosen": -41.93673324584961, "logps/rejected": -67.91219329833984, "loss": 0.0526, "rewards/accuracies": 1.0, "rewards/chosen": -0.314428448677063, "rewards/margins": 6.577666759490967, "rewards/rejected": -6.892095565795898, "step": 179 }, { "epoch": 2.1333333333333333, "grad_norm": 8.207906364980415, "learning_rate": 2.629974185404951e-07, "logits/chosen": -0.19172216951847076, "logits/rejected": -0.09138239920139313, "logps/chosen": -33.569427490234375, "logps/rejected": -83.35604858398438, "loss": 0.0522, "rewards/accuracies": 0.9375, "rewards/chosen": -0.1560230553150177, "rewards/margins": 6.664412975311279, "rewards/rejected": -6.820435523986816, "step": 180 }, { "epoch": 2.145185185185185, "grad_norm": 7.056836784079578, "learning_rate": 2.603996230316402e-07, "logits/chosen": 0.04577064514160156, "logits/rejected": 0.027345050126314163, "logps/chosen": -28.81571388244629, "logps/rejected": -50.763343811035156, "loss": 0.0463, "rewards/accuracies": 1.0, "rewards/chosen": -0.12452198565006256, "rewards/margins": 5.144538879394531, "rewards/rejected": -5.269061088562012, "step": 181 }, { "epoch": 2.157037037037037, "grad_norm": 11.894887100183427, "learning_rate": 2.5780070214188474e-07, "logits/chosen": -0.2564762532711029, "logits/rejected": -0.17662659287452698, "logps/chosen": -44.12029266357422, "logps/rejected": -67.84626007080078, "loss": 0.067, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7224655151367188, "rewards/margins": 6.115478038787842, "rewards/rejected": -6.837944030761719, "step": 182 }, { "epoch": 2.168888888888889, "grad_norm": 7.773592692412279, "learning_rate": 2.552009371098778e-07, "logits/chosen": -0.22470326721668243, "logits/rejected": -0.16598555445671082, "logps/chosen": -40.50588607788086, "logps/rejected": -66.48469543457031, "loss": 0.0511, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6875913143157959, "rewards/margins": 6.013528823852539, "rewards/rejected": -6.7011213302612305, "step": 183 }, { "epoch": 2.180740740740741, "grad_norm": 6.760102453494981, "learning_rate": 2.5260060926561604e-07, "logits/chosen": -0.10662063956260681, "logits/rejected": 0.02468992955982685, "logps/chosen": -30.439607620239258, "logps/rejected": -68.47404479980469, "loss": 0.042, "rewards/accuracies": 0.9375, "rewards/chosen": -0.1491369903087616, "rewards/margins": 7.288580894470215, "rewards/rejected": -7.437718391418457, "step": 184 }, { "epoch": 2.1925925925925926, "grad_norm": 8.50819275884591, "learning_rate": 2.5e-07, "logits/chosen": -0.314007043838501, "logits/rejected": -0.2440987378358841, "logps/chosen": -40.95098114013672, "logps/rejected": -58.714630126953125, "loss": 0.061, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0700465440750122, "rewards/margins": 4.418706893920898, "rewards/rejected": -5.488753795623779, "step": 185 }, { "epoch": 2.2044444444444444, "grad_norm": 7.253010502441156, "learning_rate": 2.4739939073438393e-07, "logits/chosen": -0.3739926815032959, "logits/rejected": -0.29570120573043823, "logps/chosen": -46.3231201171875, "logps/rejected": -73.8995361328125, "loss": 0.0448, "rewards/accuracies": 1.0, "rewards/chosen": -0.3167559504508972, "rewards/margins": 5.942593574523926, "rewards/rejected": -6.259350299835205, "step": 186 }, { "epoch": 2.216296296296296, "grad_norm": 10.218903811635839, "learning_rate": 2.4479906289012216e-07, "logits/chosen": -0.23329459130764008, "logits/rejected": -0.16335441172122955, "logps/chosen": -41.237979888916016, "logps/rejected": -61.02803039550781, "loss": 0.0585, "rewards/accuracies": 1.0, "rewards/chosen": 0.29829394817352295, "rewards/margins": 6.824221611022949, "rewards/rejected": -6.525927543640137, "step": 187 }, { "epoch": 2.228148148148148, "grad_norm": 5.706051711082172, "learning_rate": 2.421992978581152e-07, "logits/chosen": -0.20098957419395447, "logits/rejected": -0.1625077724456787, "logps/chosen": -33.27192306518555, "logps/rejected": -63.34418869018555, "loss": 0.0339, "rewards/accuracies": 1.0, "rewards/chosen": -0.16527414321899414, "rewards/margins": 7.369152069091797, "rewards/rejected": -7.534427165985107, "step": 188 }, { "epoch": 2.24, "grad_norm": 5.574632829097292, "learning_rate": 2.3960037696835987e-07, "logits/chosen": -0.15070542693138123, "logits/rejected": -0.09224209934473038, "logps/chosen": -36.12839126586914, "logps/rejected": -78.54652404785156, "loss": 0.0296, "rewards/accuracies": 1.0, "rewards/chosen": -1.1367915868759155, "rewards/margins": 9.016799926757812, "rewards/rejected": -10.15359115600586, "step": 189 }, { "epoch": 2.251851851851852, "grad_norm": 9.222402573182322, "learning_rate": 2.3700258145950493e-07, "logits/chosen": -0.16236115992069244, "logits/rejected": -0.21426168084144592, "logps/chosen": -32.171287536621094, "logps/rejected": -65.74127960205078, "loss": 0.0555, "rewards/accuracies": 1.0, "rewards/chosen": -0.31431764364242554, "rewards/margins": 7.022213459014893, "rewards/rejected": -7.336531162261963, "step": 190 }, { "epoch": 2.2637037037037038, "grad_norm": 7.127001970520486, "learning_rate": 2.3440619244841794e-07, "logits/chosen": -0.2093840390443802, "logits/rejected": -0.20409150421619415, "logps/chosen": -31.866199493408203, "logps/rejected": -57.632957458496094, "loss": 0.0419, "rewards/accuracies": 1.0, "rewards/chosen": -0.5825138092041016, "rewards/margins": 5.033719539642334, "rewards/rejected": -5.616233825683594, "step": 191 }, { "epoch": 2.2755555555555556, "grad_norm": 7.894992729942256, "learning_rate": 2.3181149089976404e-07, "logits/chosen": -0.07013247907161713, "logits/rejected": -0.04877481237053871, "logps/chosen": -33.34114456176758, "logps/rejected": -54.995853424072266, "loss": 0.0496, "rewards/accuracies": 1.0, "rewards/chosen": 0.006869621574878693, "rewards/margins": 6.3635053634643555, "rewards/rejected": -6.356635570526123, "step": 192 }, { "epoch": 2.2874074074074073, "grad_norm": 11.705188167009563, "learning_rate": 2.2921875759560207e-07, "logits/chosen": -0.13075098395347595, "logits/rejected": -0.20485468208789825, "logps/chosen": -47.33964157104492, "logps/rejected": -67.68054962158203, "loss": 0.0641, "rewards/accuracies": 1.0, "rewards/chosen": -0.68022620677948, "rewards/margins": 6.593554973602295, "rewards/rejected": -7.273780822753906, "step": 193 }, { "epoch": 2.299259259259259, "grad_norm": 7.134458814614045, "learning_rate": 2.2662827310499995e-07, "logits/chosen": -0.2494003027677536, "logits/rejected": -0.20588865876197815, "logps/chosen": -36.393733978271484, "logps/rejected": -58.504547119140625, "loss": 0.042, "rewards/accuracies": 1.0, "rewards/chosen": -0.11634793132543564, "rewards/margins": 6.01495885848999, "rewards/rejected": -6.1313066482543945, "step": 194 }, { "epoch": 2.311111111111111, "grad_norm": 12.206709819986875, "learning_rate": 2.2404031775367332e-07, "logits/chosen": -0.29956668615341187, "logits/rejected": -0.21865390241146088, "logps/chosen": -32.450687408447266, "logps/rejected": -69.80120086669922, "loss": 0.0409, "rewards/accuracies": 1.0, "rewards/chosen": 0.12443738430738449, "rewards/margins": 8.257219314575195, "rewards/rejected": -8.132781982421875, "step": 195 }, { "epoch": 2.322962962962963, "grad_norm": 6.1999786310250675, "learning_rate": 2.2145517159365043e-07, "logits/chosen": -0.5200955271720886, "logits/rejected": -0.4296617805957794, "logps/chosen": -36.10881805419922, "logps/rejected": -63.84122848510742, "loss": 0.0278, "rewards/accuracies": 1.0, "rewards/chosen": -0.5274075865745544, "rewards/margins": 6.768378257751465, "rewards/rejected": -7.295785903930664, "step": 196 }, { "epoch": 2.334814814814815, "grad_norm": 12.570655654481419, "learning_rate": 2.1887311437296684e-07, "logits/chosen": -0.31551796197891235, "logits/rejected": -0.281019389629364, "logps/chosen": -29.494091033935547, "logps/rejected": -46.46226501464844, "loss": 0.0761, "rewards/accuracies": 0.9375, "rewards/chosen": 0.2344491183757782, "rewards/margins": 4.557922840118408, "rewards/rejected": -4.3234734535217285, "step": 197 }, { "epoch": 2.3466666666666667, "grad_norm": 7.628823812638006, "learning_rate": 2.162944255053928e-07, "logits/chosen": -0.31029197573661804, "logits/rejected": -0.24161145091056824, "logps/chosen": -29.65079689025879, "logps/rejected": -57.47043228149414, "loss": 0.0435, "rewards/accuracies": 1.0, "rewards/chosen": 0.04058557003736496, "rewards/margins": 6.636114597320557, "rewards/rejected": -6.595529079437256, "step": 198 }, { "epoch": 2.3585185185185185, "grad_norm": 8.85295248102532, "learning_rate": 2.137193840401968e-07, "logits/chosen": -0.3979605734348297, "logits/rejected": -0.32846879959106445, "logps/chosen": -34.3480224609375, "logps/rejected": -58.354896545410156, "loss": 0.0533, "rewards/accuracies": 1.0, "rewards/chosen": 0.3420531153678894, "rewards/margins": 5.625160217285156, "rewards/rejected": -5.283106803894043, "step": 199 }, { "epoch": 2.3703703703703702, "grad_norm": 6.562343692053369, "learning_rate": 2.1114826863194878e-07, "logits/chosen": -0.25852352380752563, "logits/rejected": -0.17369653284549713, "logps/chosen": -35.14963912963867, "logps/rejected": -68.84390258789062, "loss": 0.0393, "rewards/accuracies": 1.0, "rewards/chosen": -0.06890328228473663, "rewards/margins": 7.833587169647217, "rewards/rejected": -7.902491092681885, "step": 200 }, { "epoch": 2.3822222222222225, "grad_norm": 7.374090556871228, "learning_rate": 2.0858135751036568e-07, "logits/chosen": -0.347494900226593, "logits/rejected": -0.366567462682724, "logps/chosen": -48.93431091308594, "logps/rejected": -69.73056030273438, "loss": 0.033, "rewards/accuracies": 1.0, "rewards/chosen": -1.1343318223953247, "rewards/margins": 7.379057884216309, "rewards/rejected": -8.51339054107666, "step": 201 }, { "epoch": 2.3940740740740742, "grad_norm": 7.264335195847015, "learning_rate": 2.060189284502037e-07, "logits/chosen": -0.28864482045173645, "logits/rejected": -0.14989601075649261, "logps/chosen": -36.92799377441406, "logps/rejected": -68.40786743164062, "loss": 0.039, "rewards/accuracies": 1.0, "rewards/chosen": -0.6765543818473816, "rewards/margins": 6.945480823516846, "rewards/rejected": -7.622035503387451, "step": 202 }, { "epoch": 2.405925925925926, "grad_norm": 7.5132642347332155, "learning_rate": 2.0346125874119838e-07, "logits/chosen": -0.35054826736450195, "logits/rejected": -0.35159796476364136, "logps/chosen": -35.19864273071289, "logps/rejected": -67.63153839111328, "loss": 0.0423, "rewards/accuracies": 1.0, "rewards/chosen": -1.2543597221374512, "rewards/margins": 7.7994771003723145, "rewards/rejected": -9.053837776184082, "step": 203 }, { "epoch": 2.417777777777778, "grad_norm": 10.04344534438614, "learning_rate": 2.0090862515805895e-07, "logits/chosen": -0.13007110357284546, "logits/rejected": -0.1110701858997345, "logps/chosen": -43.942832946777344, "logps/rejected": -60.823211669921875, "loss": 0.0693, "rewards/accuracies": 1.0, "rewards/chosen": -1.1511293649673462, "rewards/margins": 6.567122936248779, "rewards/rejected": -7.718252658843994, "step": 204 }, { "epoch": 2.4296296296296296, "grad_norm": 9.49554839896552, "learning_rate": 1.983613039305173e-07, "logits/chosen": -0.4052940905094147, "logits/rejected": -0.23975247144699097, "logps/chosen": -27.93557357788086, "logps/rejected": -65.16735076904297, "loss": 0.0571, "rewards/accuracies": 1.0, "rewards/chosen": -0.3964582085609436, "rewards/margins": 8.21355152130127, "rewards/rejected": -8.610010147094727, "step": 205 }, { "epoch": 2.4414814814814814, "grad_norm": 7.396028421519832, "learning_rate": 1.9581957071343588e-07, "logits/chosen": -0.3185134828090668, "logits/rejected": -0.1753259003162384, "logps/chosen": -45.08576202392578, "logps/rejected": -88.92870330810547, "loss": 0.047, "rewards/accuracies": 1.0, "rewards/chosen": -1.9330806732177734, "rewards/margins": 8.258587837219238, "rewards/rejected": -10.191668510437012, "step": 206 }, { "epoch": 2.453333333333333, "grad_norm": 6.455933266927346, "learning_rate": 1.9328370055697832e-07, "logits/chosen": -0.20377328991889954, "logits/rejected": -0.09884392470121384, "logps/chosen": -31.141347885131836, "logps/rejected": -68.2867431640625, "loss": 0.032, "rewards/accuracies": 1.0, "rewards/chosen": -0.21240955591201782, "rewards/margins": 7.815065860748291, "rewards/rejected": -8.027475357055664, "step": 207 }, { "epoch": 2.4651851851851854, "grad_norm": 9.373669301801096, "learning_rate": 1.907539678768453e-07, "logits/chosen": -0.5164112448692322, "logits/rejected": -0.4492265284061432, "logps/chosen": -31.97795295715332, "logps/rejected": -73.8591079711914, "loss": 0.0554, "rewards/accuracies": 1.0, "rewards/chosen": -0.6608830094337463, "rewards/margins": 7.822652816772461, "rewards/rejected": -8.483535766601562, "step": 208 }, { "epoch": 2.477037037037037, "grad_norm": 8.80264276423414, "learning_rate": 1.8823064642457876e-07, "logits/chosen": -0.19362421333789825, "logits/rejected": -0.09952510893344879, "logps/chosen": -36.87741470336914, "logps/rejected": -76.23078918457031, "loss": 0.0465, "rewards/accuracies": 1.0, "rewards/chosen": -0.9911991357803345, "rewards/margins": 7.124609470367432, "rewards/rejected": -8.115808486938477, "step": 209 }, { "epoch": 2.488888888888889, "grad_norm": 6.872370358967133, "learning_rate": 1.8571400925793852e-07, "logits/chosen": -0.28052157163619995, "logits/rejected": -0.18670235574245453, "logps/chosen": -34.98965835571289, "logps/rejected": -62.01945495605469, "loss": 0.036, "rewards/accuracies": 1.0, "rewards/chosen": 0.1422077715396881, "rewards/margins": 7.268811225891113, "rewards/rejected": -7.126603126525879, "step": 210 }, { "epoch": 2.5007407407407407, "grad_norm": 6.102992921343477, "learning_rate": 1.8320432871135376e-07, "logits/chosen": -0.012273239903151989, "logits/rejected": 0.11726081371307373, "logps/chosen": -41.02720642089844, "logps/rejected": -70.41509246826172, "loss": 0.0326, "rewards/accuracies": 1.0, "rewards/chosen": -0.5696543455123901, "rewards/margins": 6.976672649383545, "rewards/rejected": -7.546327114105225, "step": 211 }, { "epoch": 2.5125925925925925, "grad_norm": 7.523877695462096, "learning_rate": 1.8070187636645237e-07, "logits/chosen": -0.25425052642822266, "logits/rejected": -0.20172733068466187, "logps/chosen": -29.920835494995117, "logps/rejected": -58.127830505371094, "loss": 0.0442, "rewards/accuracies": 1.0, "rewards/chosen": -0.9108222126960754, "rewards/margins": 5.636702060699463, "rewards/rejected": -6.547523498535156, "step": 212 }, { "epoch": 2.5244444444444447, "grad_norm": 8.233825538587578, "learning_rate": 1.782069230226725e-07, "logits/chosen": -0.1111406460404396, "logits/rejected": -0.07830701768398285, "logps/chosen": -36.62953567504883, "logps/rejected": -69.00572204589844, "loss": 0.0489, "rewards/accuracies": 1.0, "rewards/chosen": -0.7046716809272766, "rewards/margins": 8.098491668701172, "rewards/rejected": -8.803162574768066, "step": 213 }, { "epoch": 2.536296296296296, "grad_norm": 7.26503668987421, "learning_rate": 1.7571973866795813e-07, "logits/chosen": -0.3010156750679016, "logits/rejected": -0.14240717887878418, "logps/chosen": -28.9267635345459, "logps/rejected": -62.530731201171875, "loss": 0.0385, "rewards/accuracies": 1.0, "rewards/chosen": -0.38880711793899536, "rewards/margins": 7.034039497375488, "rewards/rejected": -7.422846794128418, "step": 214 }, { "epoch": 2.5481481481481483, "grad_norm": 5.978089867317629, "learning_rate": 1.7324059244954292e-07, "logits/chosen": -0.4227255582809448, "logits/rejected": -0.40319374203681946, "logps/chosen": -33.19075012207031, "logps/rejected": -64.02904510498047, "loss": 0.0325, "rewards/accuracies": 1.0, "rewards/chosen": -0.5749123096466064, "rewards/margins": 9.057025909423828, "rewards/rejected": -9.631938934326172, "step": 215 }, { "epoch": 2.56, "grad_norm": 9.696303087236457, "learning_rate": 1.7076975264482433e-07, "logits/chosen": -0.37839898467063904, "logits/rejected": -0.273608922958374, "logps/chosen": -36.7985954284668, "logps/rejected": -64.79552459716797, "loss": 0.0498, "rewards/accuracies": 0.875, "rewards/chosen": -1.2072323560714722, "rewards/margins": 6.241967678070068, "rewards/rejected": -7.449199676513672, "step": 216 }, { "epoch": 2.571851851851852, "grad_norm": 7.034579383121281, "learning_rate": 1.6830748663233303e-07, "logits/chosen": -0.25258129835128784, "logits/rejected": -0.23720452189445496, "logps/chosen": -31.781192779541016, "logps/rejected": -62.308006286621094, "loss": 0.0353, "rewards/accuracies": 1.0, "rewards/chosen": -0.21115940809249878, "rewards/margins": 7.114487648010254, "rewards/rejected": -7.325647354125977, "step": 217 }, { "epoch": 2.5837037037037036, "grad_norm": 8.754965473386916, "learning_rate": 1.6585406086279846e-07, "logits/chosen": -0.43037766218185425, "logits/rejected": -0.375767320394516, "logps/chosen": -40.90904998779297, "logps/rejected": -76.52076721191406, "loss": 0.0544, "rewards/accuracies": 1.0, "rewards/chosen": -0.7350015044212341, "rewards/margins": 8.066032409667969, "rewards/rejected": -8.801033973693848, "step": 218 }, { "epoch": 2.5955555555555554, "grad_norm": 5.4726077818919565, "learning_rate": 1.6340974083031523e-07, "logits/chosen": -0.29419374465942383, "logits/rejected": -0.28271955251693726, "logps/chosen": -30.226680755615234, "logps/rejected": -55.334014892578125, "loss": 0.0302, "rewards/accuracies": 1.0, "rewards/chosen": 0.04286186397075653, "rewards/margins": 6.613791465759277, "rewards/rejected": -6.570930004119873, "step": 219 }, { "epoch": 2.6074074074074076, "grad_norm": 7.600745751196765, "learning_rate": 1.6097479104361326e-07, "logits/chosen": -0.38003548979759216, "logits/rejected": -0.20779910683631897, "logps/chosen": -26.411277770996094, "logps/rejected": -65.45418548583984, "loss": 0.0318, "rewards/accuracies": 1.0, "rewards/chosen": 0.08172351866960526, "rewards/margins": 7.309746742248535, "rewards/rejected": -7.228023052215576, "step": 220 }, { "epoch": 2.6192592592592594, "grad_norm": 5.529941543826629, "learning_rate": 1.5854947499743413e-07, "logits/chosen": -0.2649455666542053, "logits/rejected": -0.1322605013847351, "logps/chosen": -28.605796813964844, "logps/rejected": -67.70567321777344, "loss": 0.0316, "rewards/accuracies": 1.0, "rewards/chosen": -0.8953390717506409, "rewards/margins": 7.937841892242432, "rewards/rejected": -8.83318042755127, "step": 221 }, { "epoch": 2.631111111111111, "grad_norm": 7.828964734675065, "learning_rate": 1.5613405514401757e-07, "logits/chosen": -0.4999098479747772, "logits/rejected": -0.46459028124809265, "logps/chosen": -32.34528350830078, "logps/rejected": -64.39063262939453, "loss": 0.0417, "rewards/accuracies": 1.0, "rewards/chosen": -1.7131352424621582, "rewards/margins": 7.366223335266113, "rewards/rejected": -9.07935905456543, "step": 222 }, { "epoch": 2.642962962962963, "grad_norm": 6.684903510896331, "learning_rate": 1.537287928647002e-07, "logits/chosen": -0.33326905965805054, "logits/rejected": -0.2772333025932312, "logps/chosen": -33.04732894897461, "logps/rejected": -55.9017448425293, "loss": 0.034, "rewards/accuracies": 1.0, "rewards/chosen": -0.4501342475414276, "rewards/margins": 6.482940196990967, "rewards/rejected": -6.933073997497559, "step": 223 }, { "epoch": 2.6548148148148147, "grad_norm": 7.774671778619875, "learning_rate": 1.513339484416309e-07, "logits/chosen": -0.350558876991272, "logits/rejected": -0.28618156909942627, "logps/chosen": -49.11450958251953, "logps/rejected": -80.40065002441406, "loss": 0.0421, "rewards/accuracies": 1.0, "rewards/chosen": -1.9198662042617798, "rewards/margins": 8.049423217773438, "rewards/rejected": -9.969290733337402, "step": 224 }, { "epoch": 2.6666666666666665, "grad_norm": 6.147372232877328, "learning_rate": 1.489497810296046e-07, "logits/chosen": -0.2636696696281433, "logits/rejected": -0.1582798808813095, "logps/chosen": -33.32222366333008, "logps/rejected": -88.90251159667969, "loss": 0.0347, "rewards/accuracies": 1.0, "rewards/chosen": -0.8391042947769165, "rewards/margins": 9.376372337341309, "rewards/rejected": -10.215476036071777, "step": 225 }, { "epoch": 2.6785185185185183, "grad_norm": 6.058104127594901, "learning_rate": 1.4657654862801797e-07, "logits/chosen": -0.3205685317516327, "logits/rejected": -0.24160242080688477, "logps/chosen": -28.52737045288086, "logps/rejected": -70.28912353515625, "loss": 0.0233, "rewards/accuracies": 1.0, "rewards/chosen": -0.24154254794120789, "rewards/margins": 6.4893083572387695, "rewards/rejected": -6.730850696563721, "step": 226 }, { "epoch": 2.6903703703703705, "grad_norm": 10.148811706386386, "learning_rate": 1.4421450805295082e-07, "logits/chosen": -0.33272168040275574, "logits/rejected": -0.3017561733722687, "logps/chosen": -40.76533889770508, "logps/rejected": -58.748046875, "loss": 0.0433, "rewards/accuracies": 1.0, "rewards/chosen": -0.23764045536518097, "rewards/margins": 5.5487141609191895, "rewards/rejected": -5.7863545417785645, "step": 227 }, { "epoch": 2.7022222222222223, "grad_norm": 7.063093696385122, "learning_rate": 1.418639149093748e-07, "logits/chosen": -0.5206415057182312, "logits/rejected": -0.42945098876953125, "logps/chosen": -35.94019317626953, "logps/rejected": -51.362579345703125, "loss": 0.0385, "rewards/accuracies": 1.0, "rewards/chosen": -0.5331210494041443, "rewards/margins": 5.205883502960205, "rewards/rejected": -5.739004611968994, "step": 228 }, { "epoch": 2.714074074074074, "grad_norm": 7.873488891465637, "learning_rate": 1.3952502356349323e-07, "logits/chosen": -0.2090906947851181, "logits/rejected": -0.116541787981987, "logps/chosen": -36.972110748291016, "logps/rejected": -70.80089569091797, "loss": 0.0445, "rewards/accuracies": 1.0, "rewards/chosen": -0.16982126235961914, "rewards/margins": 8.14144515991211, "rewards/rejected": -8.311266899108887, "step": 229 }, { "epoch": 2.725925925925926, "grad_norm": 6.247579493903217, "learning_rate": 1.371980871152157e-07, "logits/chosen": -0.14566001296043396, "logits/rejected": -0.19308951497077942, "logps/chosen": -41.63805389404297, "logps/rejected": -75.58000183105469, "loss": 0.0357, "rewards/accuracies": 1.0, "rewards/chosen": -0.45496970415115356, "rewards/margins": 8.866785049438477, "rewards/rejected": -9.321755409240723, "step": 230 }, { "epoch": 2.7377777777777776, "grad_norm": 6.023624860692066, "learning_rate": 1.3488335737076911e-07, "logits/chosen": -0.24541382491588593, "logits/rejected": -0.2724686861038208, "logps/chosen": -33.94440841674805, "logps/rejected": -54.19181823730469, "loss": 0.0349, "rewards/accuracies": 1.0, "rewards/chosen": -0.641716718673706, "rewards/margins": 6.787499904632568, "rewards/rejected": -7.4292168617248535, "step": 231 }, { "epoch": 2.74962962962963, "grad_norm": 6.829922502232146, "learning_rate": 1.3258108481544847e-07, "logits/chosen": -0.2750440537929535, "logits/rejected": -0.2242782562971115, "logps/chosen": -47.98163986206055, "logps/rejected": -71.17113494873047, "loss": 0.0381, "rewards/accuracies": 1.0, "rewards/chosen": -1.330321192741394, "rewards/margins": 5.847900390625, "rewards/rejected": -7.178221702575684, "step": 232 }, { "epoch": 2.7614814814814816, "grad_norm": 9.939842224750613, "learning_rate": 1.3029151858651143e-07, "logits/chosen": -0.4768088757991791, "logits/rejected": -0.3678171634674072, "logps/chosen": -28.488832473754883, "logps/rejected": -72.2415542602539, "loss": 0.0563, "rewards/accuracies": 1.0, "rewards/chosen": -0.4457303285598755, "rewards/margins": 7.388967514038086, "rewards/rejected": -7.834697723388672, "step": 233 }, { "epoch": 2.7733333333333334, "grad_norm": 5.772137373566735, "learning_rate": 1.2801490644621788e-07, "logits/chosen": -0.10860705375671387, "logits/rejected": -0.09419623762369156, "logps/chosen": -41.14183807373047, "logps/rejected": -73.03929138183594, "loss": 0.0334, "rewards/accuracies": 1.0, "rewards/chosen": -1.0385979413986206, "rewards/margins": 9.18422794342041, "rewards/rejected": -10.222824096679688, "step": 234 }, { "epoch": 2.785185185185185, "grad_norm": 6.872437338157428, "learning_rate": 1.257514947550189e-07, "logits/chosen": -0.32558369636535645, "logits/rejected": -0.23454414308071136, "logps/chosen": -27.442285537719727, "logps/rejected": -47.84419250488281, "loss": 0.0369, "rewards/accuracies": 1.0, "rewards/chosen": -0.5560548305511475, "rewards/margins": 5.780346870422363, "rewards/rejected": -6.336400985717773, "step": 235 }, { "epoch": 2.797037037037037, "grad_norm": 9.526509052512719, "learning_rate": 1.2350152844489688e-07, "logits/chosen": -0.3666895925998688, "logits/rejected": -0.22046907246112823, "logps/chosen": -38.18906021118164, "logps/rejected": -68.36451721191406, "loss": 0.0521, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1897006034851074, "rewards/margins": 6.763792991638184, "rewards/rejected": -7.953493118286133, "step": 236 }, { "epoch": 2.8088888888888888, "grad_norm": 6.2325582014169205, "learning_rate": 1.2126525099286108e-07, "logits/chosen": -0.3752056956291199, "logits/rejected": -0.29132628440856934, "logps/chosen": -39.4105110168457, "logps/rejected": -70.36639404296875, "loss": 0.0344, "rewards/accuracies": 1.0, "rewards/chosen": -1.190868854522705, "rewards/margins": 7.782485008239746, "rewards/rejected": -8.973353385925293, "step": 237 }, { "epoch": 2.8207407407407405, "grad_norm": 6.813356776563027, "learning_rate": 1.1904290439459971e-07, "logits/chosen": -0.4055876135826111, "logits/rejected": -0.34407296776771545, "logps/chosen": -36.34687805175781, "logps/rejected": -64.90667724609375, "loss": 0.029, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6599448323249817, "rewards/margins": 7.470705509185791, "rewards/rejected": -8.130650520324707, "step": 238 }, { "epoch": 2.8325925925925928, "grad_norm": 8.170401197759237, "learning_rate": 1.1683472913829284e-07, "logits/chosen": -0.20514726638793945, "logits/rejected": -0.11336632817983627, "logps/chosen": -51.73873519897461, "logps/rejected": -71.9918441772461, "loss": 0.0379, "rewards/accuracies": 1.0, "rewards/chosen": -0.8510772585868835, "rewards/margins": 6.703639507293701, "rewards/rejected": -7.554717063903809, "step": 239 }, { "epoch": 2.8444444444444446, "grad_norm": 8.191853793447038, "learning_rate": 1.146409641785882e-07, "logits/chosen": -0.16900832951068878, "logits/rejected": -0.16538989543914795, "logps/chosen": -44.017845153808594, "logps/rejected": -54.396583557128906, "loss": 0.0415, "rewards/accuracies": 0.9375, "rewards/chosen": -1.656957983970642, "rewards/margins": 4.629300117492676, "rewards/rejected": -6.286258697509766, "step": 240 }, { "epoch": 2.8562962962962963, "grad_norm": 7.381077568622568, "learning_rate": 1.1246184691074314e-07, "logits/chosen": -0.24956950545310974, "logits/rejected": -0.24476227164268494, "logps/chosen": -42.77149963378906, "logps/rejected": -79.72845458984375, "loss": 0.0398, "rewards/accuracies": 1.0, "rewards/chosen": -0.6535344123840332, "rewards/margins": 9.295120239257812, "rewards/rejected": -9.948655128479004, "step": 241 }, { "epoch": 2.868148148148148, "grad_norm": 8.160459964737315, "learning_rate": 1.1029761314493518e-07, "logits/chosen": -0.30951178073883057, "logits/rejected": -0.3085937201976776, "logps/chosen": -40.81586456298828, "logps/rejected": -60.63238525390625, "loss": 0.0367, "rewards/accuracies": 1.0, "rewards/chosen": -1.3215899467468262, "rewards/margins": 6.157529830932617, "rewards/rejected": -7.479120254516602, "step": 242 }, { "epoch": 2.88, "grad_norm": 8.037040216975743, "learning_rate": 1.0814849708074414e-07, "logits/chosen": -0.30921998620033264, "logits/rejected": -0.3065870404243469, "logps/chosen": -51.514556884765625, "logps/rejected": -66.55127716064453, "loss": 0.0324, "rewards/accuracies": 1.0, "rewards/chosen": -0.17993119359016418, "rewards/margins": 6.790800094604492, "rewards/rejected": -6.970730781555176, "step": 243 }, { "epoch": 2.891851851851852, "grad_norm": 9.656051990182503, "learning_rate": 1.0601473128180854e-07, "logits/chosen": -0.2805531620979309, "logits/rejected": -0.2533468008041382, "logps/chosen": -46.80023193359375, "logps/rejected": -68.6700668334961, "loss": 0.0383, "rewards/accuracies": 1.0, "rewards/chosen": -0.8430509567260742, "rewards/margins": 8.124579429626465, "rewards/rejected": -8.967630386352539, "step": 244 }, { "epoch": 2.9037037037037035, "grad_norm": 8.425061421071412, "learning_rate": 1.0389654665065908e-07, "logits/chosen": -0.2665305435657501, "logits/rejected": -0.30099448561668396, "logps/chosen": -36.76901626586914, "logps/rejected": -62.380638122558594, "loss": 0.0436, "rewards/accuracies": 1.0, "rewards/chosen": -0.7122980952262878, "rewards/margins": 7.467397689819336, "rewards/rejected": -8.179695129394531, "step": 245 }, { "epoch": 2.9155555555555557, "grad_norm": 7.412242548453604, "learning_rate": 1.0179417240373182e-07, "logits/chosen": -0.32122743129730225, "logits/rejected": -0.22737433016300201, "logps/chosen": -52.4939079284668, "logps/rejected": -86.98854064941406, "loss": 0.0332, "rewards/accuracies": 0.9375, "rewards/chosen": -2.623575210571289, "rewards/margins": 8.667156219482422, "rewards/rejected": -11.290731430053711, "step": 246 }, { "epoch": 2.9274074074074075, "grad_norm": 6.343263474138892, "learning_rate": 9.970783604656383e-08, "logits/chosen": -0.40669649839401245, "logits/rejected": -0.2913494408130646, "logps/chosen": -40.742733001708984, "logps/rejected": -70.4676513671875, "loss": 0.0345, "rewards/accuracies": 1.0, "rewards/chosen": -0.4790029525756836, "rewards/margins": 7.971775054931641, "rewards/rejected": -8.450777053833008, "step": 247 }, { "epoch": 2.9392592592592592, "grad_norm": 8.350168639074793, "learning_rate": 9.763776334917398e-08, "logits/chosen": -0.28063714504241943, "logits/rejected": -0.2614033818244934, "logps/chosen": -39.683170318603516, "logps/rejected": -56.151100158691406, "loss": 0.0442, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2253810167312622, "rewards/margins": 5.419277667999268, "rewards/rejected": -6.644659042358398, "step": 248 }, { "epoch": 2.951111111111111, "grad_norm": 6.1204430667008864, "learning_rate": 9.558417832163162e-08, "logits/chosen": -0.13153290748596191, "logits/rejected": -0.19138801097869873, "logps/chosen": -38.79569625854492, "logps/rejected": -55.324180603027344, "loss": 0.0339, "rewards/accuracies": 1.0, "rewards/chosen": -0.37145036458969116, "rewards/margins": 6.06140661239624, "rewards/rejected": -6.432857513427734, "step": 249 }, { "epoch": 2.962962962962963, "grad_norm": 7.267482062373579, "learning_rate": 9.354730318981561e-08, "logits/chosen": -0.4541955590248108, "logits/rejected": -0.3866829574108124, "logps/chosen": -31.26105308532715, "logps/rejected": -66.00698852539062, "loss": 0.0438, "rewards/accuracies": 1.0, "rewards/chosen": -0.9516152739524841, "rewards/margins": 7.78145170211792, "rewards/rejected": -8.73306655883789, "step": 250 }, { "epoch": 2.974814814814815, "grad_norm": 5.662458264963396, "learning_rate": 9.15273583713663e-08, "logits/chosen": -0.37392503023147583, "logits/rejected": -0.29210343956947327, "logps/chosen": -47.48450469970703, "logps/rejected": -90.92308044433594, "loss": 0.0351, "rewards/accuracies": 1.0, "rewards/chosen": -2.219913959503174, "rewards/margins": 12.272329330444336, "rewards/rejected": -14.492244720458984, "step": 251 }, { "epoch": 2.986666666666667, "grad_norm": 5.97436680259246, "learning_rate": 8.95245624518336e-08, "logits/chosen": -0.3016185760498047, "logits/rejected": -0.2964284420013428, "logps/chosen": -34.3846321105957, "logps/rejected": -68.51025390625, "loss": 0.0343, "rewards/accuracies": 1.0, "rewards/chosen": -0.7737685441970825, "rewards/margins": 7.51839017868042, "rewards/rejected": -8.292159080505371, "step": 252 }, { "epoch": 2.9985185185185186, "grad_norm": 6.809722463225596, "learning_rate": 8.753913216102285e-08, "logits/chosen": -0.26927924156188965, "logits/rejected": -0.09772679954767227, "logps/chosen": -39.219181060791016, "logps/rejected": -83.87496185302734, "loss": 0.0378, "rewards/accuracies": 1.0, "rewards/chosen": -1.874872088432312, "rewards/margins": 7.590030193328857, "rewards/rejected": -9.464902877807617, "step": 253 }, { "epoch": 3.0103703703703704, "grad_norm": 5.7996860529913565, "learning_rate": 8.557128234954189e-08, "logits/chosen": -0.40512141585350037, "logits/rejected": -0.3163852393627167, "logps/chosen": -29.341394424438477, "logps/rejected": -70.40370178222656, "loss": 0.0283, "rewards/accuracies": 1.0, "rewards/chosen": -0.9042503833770752, "rewards/margins": 8.99386978149414, "rewards/rejected": -9.898119926452637, "step": 254 }, { "epoch": 3.022222222222222, "grad_norm": 5.286377326614028, "learning_rate": 8.362122596555088e-08, "logits/chosen": -0.47114288806915283, "logits/rejected": -0.3825288712978363, "logps/chosen": -33.60106658935547, "logps/rejected": -76.53120422363281, "loss": 0.0282, "rewards/accuracies": 1.0, "rewards/chosen": -1.207690954208374, "rewards/margins": 10.646196365356445, "rewards/rejected": -11.853886604309082, "step": 255 }, { "epoch": 3.034074074074074, "grad_norm": 6.253088250116567, "learning_rate": 8.16891740317189e-08, "logits/chosen": -0.3761712908744812, "logits/rejected": -0.35195398330688477, "logps/chosen": -33.14177322387695, "logps/rejected": -55.4194450378418, "loss": 0.0385, "rewards/accuracies": 1.0, "rewards/chosen": -0.6758537888526917, "rewards/margins": 6.090365409851074, "rewards/rejected": -6.766219615936279, "step": 256 }, { "epoch": 3.0459259259259257, "grad_norm": 6.986196206894695, "learning_rate": 7.977533562238838e-08, "logits/chosen": -0.4037611186504364, "logits/rejected": -0.3634166419506073, "logps/chosen": -34.40519332885742, "logps/rejected": -70.6146240234375, "loss": 0.0372, "rewards/accuracies": 1.0, "rewards/chosen": -0.2543308138847351, "rewards/margins": 7.034722328186035, "rewards/rejected": -7.289052963256836, "step": 257 }, { "epoch": 3.057777777777778, "grad_norm": 5.505207911608369, "learning_rate": 7.787991784094999e-08, "logits/chosen": -0.2387389987707138, "logits/rejected": -0.08904880285263062, "logps/chosen": -36.03857421875, "logps/rejected": -89.65563201904297, "loss": 0.0278, "rewards/accuracies": 1.0, "rewards/chosen": -0.6398571133613586, "rewards/margins": 8.411537170410156, "rewards/rejected": -9.051393508911133, "step": 258 }, { "epoch": 3.0696296296296297, "grad_norm": 6.3788891492475415, "learning_rate": 7.60031257974316e-08, "logits/chosen": -0.36758318543434143, "logits/rejected": -0.28655973076820374, "logps/chosen": -34.49348831176758, "logps/rejected": -75.51551818847656, "loss": 0.0368, "rewards/accuracies": 1.0, "rewards/chosen": -1.3124196529388428, "rewards/margins": 9.141406059265137, "rewards/rejected": -10.453825950622559, "step": 259 }, { "epoch": 3.0814814814814815, "grad_norm": 7.213670504047207, "learning_rate": 7.414516258630244e-08, "logits/chosen": -0.2934122681617737, "logits/rejected": -0.2819547653198242, "logps/chosen": -49.67085647583008, "logps/rejected": -82.62693786621094, "loss": 0.0342, "rewards/accuracies": 1.0, "rewards/chosen": -1.0997469425201416, "rewards/margins": 9.53528118133545, "rewards/rejected": -10.635027885437012, "step": 260 }, { "epoch": 3.0933333333333333, "grad_norm": 5.692432592478289, "learning_rate": 7.230622926449564e-08, "logits/chosen": -0.2739347219467163, "logits/rejected": -0.20775115489959717, "logps/chosen": -37.56914520263672, "logps/rejected": -65.73677062988281, "loss": 0.0333, "rewards/accuracies": 1.0, "rewards/chosen": -1.4430259466171265, "rewards/margins": 8.155195236206055, "rewards/rejected": -9.598221778869629, "step": 261 }, { "epoch": 3.105185185185185, "grad_norm": 7.076581419281233, "learning_rate": 7.048652482965078e-08, "logits/chosen": -0.18068230152130127, "logits/rejected": -0.1852649450302124, "logps/chosen": -46.34651184082031, "logps/rejected": -68.5006103515625, "loss": 0.037, "rewards/accuracies": 1.0, "rewards/chosen": -1.1328610181808472, "rewards/margins": 7.682218551635742, "rewards/rejected": -8.815078735351562, "step": 262 }, { "epoch": 3.117037037037037, "grad_norm": 6.338236077273577, "learning_rate": 6.868624619858021e-08, "logits/chosen": -0.2783002257347107, "logits/rejected": -0.3345209062099457, "logps/chosen": -36.772254943847656, "logps/rejected": -83.5664291381836, "loss": 0.0277, "rewards/accuracies": 1.0, "rewards/chosen": -0.4274479150772095, "rewards/margins": 7.226326942443848, "rewards/rejected": -7.653774261474609, "step": 263 }, { "epoch": 3.128888888888889, "grad_norm": 5.495669924723658, "learning_rate": 6.690558818595943e-08, "logits/chosen": -0.34206265211105347, "logits/rejected": -0.19553421437740326, "logps/chosen": -32.38424301147461, "logps/rejected": -85.89747619628906, "loss": 0.0276, "rewards/accuracies": 1.0, "rewards/chosen": -1.1448084115982056, "rewards/margins": 9.975635528564453, "rewards/rejected": -11.120444297790527, "step": 264 }, { "epoch": 3.140740740740741, "grad_norm": 5.0666203766090385, "learning_rate": 6.514474348324581e-08, "logits/chosen": -0.38811901211738586, "logits/rejected": -0.2747833728790283, "logps/chosen": -48.448951721191406, "logps/rejected": -77.10838317871094, "loss": 0.028, "rewards/accuracies": 1.0, "rewards/chosen": -1.6878550052642822, "rewards/margins": 7.391786575317383, "rewards/rejected": -9.079641342163086, "step": 265 }, { "epoch": 3.1525925925925926, "grad_norm": 8.885007920801026, "learning_rate": 6.340390263782655e-08, "logits/chosen": -0.5093058347702026, "logits/rejected": -0.3832343816757202, "logps/chosen": -32.23210144042969, "logps/rejected": -76.0536117553711, "loss": 0.047, "rewards/accuracies": 1.0, "rewards/chosen": -1.038543462753296, "rewards/margins": 9.122876167297363, "rewards/rejected": -10.161420822143555, "step": 266 }, { "epoch": 3.1644444444444444, "grad_norm": 7.984860081690398, "learning_rate": 6.168325403239913e-08, "logits/chosen": -0.4433887004852295, "logits/rejected": -0.3792242109775543, "logps/chosen": -29.287479400634766, "logps/rejected": -58.2768669128418, "loss": 0.0387, "rewards/accuracies": 1.0, "rewards/chosen": -0.46901828050613403, "rewards/margins": 7.602441787719727, "rewards/rejected": -8.071460723876953, "step": 267 }, { "epoch": 3.176296296296296, "grad_norm": 4.958032867566694, "learning_rate": 5.998298386458545e-08, "logits/chosen": -0.22974154353141785, "logits/rejected": -0.19992095232009888, "logps/chosen": -38.70039367675781, "logps/rejected": -72.84990692138672, "loss": 0.0231, "rewards/accuracies": 1.0, "rewards/chosen": -1.0671855211257935, "rewards/margins": 7.9272379875183105, "rewards/rejected": -8.994423866271973, "step": 268 }, { "epoch": 3.188148148148148, "grad_norm": 5.988243698878292, "learning_rate": 5.830327612678265e-08, "logits/chosen": -0.32180365920066833, "logits/rejected": -0.3045603632926941, "logps/chosen": -43.980316162109375, "logps/rejected": -83.64070129394531, "loss": 0.0285, "rewards/accuracies": 1.0, "rewards/chosen": -2.710637331008911, "rewards/margins": 9.31506061553955, "rewards/rejected": -12.025696754455566, "step": 269 }, { "epoch": 3.2, "grad_norm": 5.602823035809897, "learning_rate": 5.6644312586253044e-08, "logits/chosen": -0.0014043133705854416, "logits/rejected": -0.04007536917924881, "logps/chosen": -63.08719253540039, "logps/rejected": -80.3905029296875, "loss": 0.0292, "rewards/accuracies": 1.0, "rewards/chosen": -1.9075740575790405, "rewards/margins": 7.626714706420898, "rewards/rejected": -9.53428840637207, "step": 270 }, { "epoch": 3.211851851851852, "grad_norm": 7.326638872216747, "learning_rate": 5.5006272765454056e-08, "logits/chosen": -0.43287378549575806, "logits/rejected": -0.27151188254356384, "logps/chosen": -33.51972579956055, "logps/rejected": -58.82566452026367, "loss": 0.0425, "rewards/accuracies": 1.0, "rewards/chosen": -0.519016444683075, "rewards/margins": 6.466248512268066, "rewards/rejected": -6.985265731811523, "step": 271 }, { "epoch": 3.2237037037037037, "grad_norm": 4.755215217001098, "learning_rate": 5.338933392261158e-08, "logits/chosen": -0.2298121452331543, "logits/rejected": -0.17249788343906403, "logps/chosen": -37.40292739868164, "logps/rejected": -64.0435562133789, "loss": 0.0245, "rewards/accuracies": 1.0, "rewards/chosen": -0.865021288394928, "rewards/margins": 6.909914016723633, "rewards/rejected": -7.774934768676758, "step": 272 }, { "epoch": 3.2355555555555555, "grad_norm": 7.328735036408531, "learning_rate": 5.1793671032538206e-08, "logits/chosen": -0.5466493368148804, "logits/rejected": -0.49364450573921204, "logps/chosen": -31.749622344970703, "logps/rejected": -76.33462524414062, "loss": 0.0351, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5585615634918213, "rewards/margins": 8.347299575805664, "rewards/rejected": -8.905860900878906, "step": 273 }, { "epoch": 3.2474074074074073, "grad_norm": 4.866161229053689, "learning_rate": 5.021945676769859e-08, "logits/chosen": -0.5478118658065796, "logits/rejected": -0.38123688101768494, "logps/chosen": -26.91775131225586, "logps/rejected": -66.9200668334961, "loss": 0.0256, "rewards/accuracies": 1.0, "rewards/chosen": -0.39309221506118774, "rewards/margins": 8.147336959838867, "rewards/rejected": -8.540430068969727, "step": 274 }, { "epoch": 3.259259259259259, "grad_norm": 5.465123706910723, "learning_rate": 4.866686147952387e-08, "logits/chosen": -0.15793977677822113, "logits/rejected": -0.13396653532981873, "logps/chosen": -38.39078140258789, "logps/rejected": -68.49137115478516, "loss": 0.0332, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5564990639686584, "rewards/margins": 7.465914726257324, "rewards/rejected": -8.022414207458496, "step": 275 }, { "epoch": 3.2711111111111113, "grad_norm": 6.892669002618101, "learning_rate": 4.71360531799774e-08, "logits/chosen": -0.17291544377803802, "logits/rejected": -0.11444761604070663, "logps/chosen": -52.285491943359375, "logps/rejected": -84.1561508178711, "loss": 0.0411, "rewards/accuracies": 1.0, "rewards/chosen": -2.8530375957489014, "rewards/margins": 7.457571983337402, "rewards/rejected": -10.310609817504883, "step": 276 }, { "epoch": 3.282962962962963, "grad_norm": 4.92957758138627, "learning_rate": 4.562719752337349e-08, "logits/chosen": -0.47689568996429443, "logits/rejected": -0.38014689087867737, "logps/chosen": -51.07635498046875, "logps/rejected": -94.97547149658203, "loss": 0.0246, "rewards/accuracies": 1.0, "rewards/chosen": -2.258345603942871, "rewards/margins": 8.710776329040527, "rewards/rejected": -10.969121932983398, "step": 277 }, { "epoch": 3.294814814814815, "grad_norm": 5.454251196984929, "learning_rate": 4.4140457788451434e-08, "logits/chosen": -0.3425113260746002, "logits/rejected": -0.25056517124176025, "logps/chosen": -31.684978485107422, "logps/rejected": -69.71234130859375, "loss": 0.0282, "rewards/accuracies": 1.0, "rewards/chosen": -0.24049748480319977, "rewards/margins": 7.845020771026611, "rewards/rejected": -8.085518836975098, "step": 278 }, { "epoch": 3.3066666666666666, "grad_norm": 5.883268469101933, "learning_rate": 4.267599486070647e-08, "logits/chosen": -0.172508105635643, "logits/rejected": -0.20862886309623718, "logps/chosen": -39.89122772216797, "logps/rejected": -52.314510345458984, "loss": 0.0292, "rewards/accuracies": 1.0, "rewards/chosen": -0.3147379755973816, "rewards/margins": 5.7033185958862305, "rewards/rejected": -6.0180559158325195, "step": 279 }, { "epoch": 3.3185185185185184, "grad_norm": 6.523804321940008, "learning_rate": 4.1233967214979764e-08, "logits/chosen": -0.3144129812717438, "logits/rejected": -0.22518262267112732, "logps/chosen": -42.5799560546875, "logps/rejected": -53.63530349731445, "loss": 0.0332, "rewards/accuracies": 1.0, "rewards/chosen": -1.022313117980957, "rewards/margins": 4.446239471435547, "rewards/rejected": -5.468553066253662, "step": 280 }, { "epoch": 3.33037037037037, "grad_norm": 5.61217781900571, "learning_rate": 3.9814530898309356e-08, "logits/chosen": -0.2805265784263611, "logits/rejected": -0.17468589544296265, "logps/chosen": -36.89150619506836, "logps/rejected": -73.11276245117188, "loss": 0.0278, "rewards/accuracies": 1.0, "rewards/chosen": -0.6258598566055298, "rewards/margins": 8.552447319030762, "rewards/rejected": -9.17830753326416, "step": 281 }, { "epoch": 3.3422222222222224, "grad_norm": 5.921953356785593, "learning_rate": 3.8417839513043646e-08, "logits/chosen": -0.28441232442855835, "logits/rejected": -0.20493623614311218, "logps/chosen": -41.19779586791992, "logps/rejected": -60.01893615722656, "loss": 0.0339, "rewards/accuracies": 1.0, "rewards/chosen": -1.5952982902526855, "rewards/margins": 5.5062713623046875, "rewards/rejected": -7.101569652557373, "step": 282 }, { "epoch": 3.354074074074074, "grad_norm": 5.347638108561962, "learning_rate": 3.704404420021956e-08, "logits/chosen": -0.3048914670944214, "logits/rejected": -0.17332229018211365, "logps/chosen": -33.9517936706543, "logps/rejected": -71.91447448730469, "loss": 0.0351, "rewards/accuracies": 1.0, "rewards/chosen": -0.8167021870613098, "rewards/margins": 8.507079124450684, "rewards/rejected": -9.32378101348877, "step": 283 }, { "epoch": 3.365925925925926, "grad_norm": 5.238374490213889, "learning_rate": 3.569329362320708e-08, "logits/chosen": -0.2728411853313446, "logits/rejected": -0.2584533393383026, "logps/chosen": -30.238557815551758, "logps/rejected": -75.02142333984375, "loss": 0.0269, "rewards/accuracies": 1.0, "rewards/chosen": -0.49139711260795593, "rewards/margins": 9.32203197479248, "rewards/rejected": -9.813429832458496, "step": 284 }, { "epoch": 3.3777777777777778, "grad_norm": 6.525469512599618, "learning_rate": 3.436573395162179e-08, "logits/chosen": -0.3524834215641022, "logits/rejected": -0.30102020502090454, "logps/chosen": -30.73918914794922, "logps/rejected": -59.49354553222656, "loss": 0.0396, "rewards/accuracies": 1.0, "rewards/chosen": -0.09267859905958176, "rewards/margins": 6.891367435455322, "rewards/rejected": -6.984046459197998, "step": 285 }, { "epoch": 3.3896296296296295, "grad_norm": 5.564012623506871, "learning_rate": 3.306150884550732e-08, "logits/chosen": -0.3768519461154938, "logits/rejected": -0.3264191448688507, "logps/chosen": -41.36799240112305, "logps/rejected": -67.31135559082031, "loss": 0.0329, "rewards/accuracies": 1.0, "rewards/chosen": -1.0806761980056763, "rewards/margins": 7.03513240814209, "rewards/rejected": -8.115808486938477, "step": 286 }, { "epoch": 3.4014814814814813, "grad_norm": 5.931359172922728, "learning_rate": 3.17807594397895e-08, "logits/chosen": -0.3008241653442383, "logits/rejected": -0.1854274868965149, "logps/chosen": -35.424800872802734, "logps/rejected": -75.77165222167969, "loss": 0.0242, "rewards/accuracies": 1.0, "rewards/chosen": -1.3784024715423584, "rewards/margins": 9.62315845489502, "rewards/rejected": -11.00156021118164, "step": 287 }, { "epoch": 3.413333333333333, "grad_norm": 5.696220945908767, "learning_rate": 3.052362432900332e-08, "logits/chosen": -0.4203820526599884, "logits/rejected": -0.3877807557582855, "logps/chosen": -37.054630279541016, "logps/rejected": -61.031986236572266, "loss": 0.0281, "rewards/accuracies": 1.0, "rewards/chosen": -0.9897009134292603, "rewards/margins": 7.574882984161377, "rewards/rejected": -8.564583778381348, "step": 288 }, { "epoch": 3.4251851851851853, "grad_norm": 5.040781372610454, "learning_rate": 2.9290239552295538e-08, "logits/chosen": -0.04420602694153786, "logits/rejected": -0.0779787227511406, "logps/chosen": -49.03828811645508, "logps/rejected": -64.86919403076172, "loss": 0.0233, "rewards/accuracies": 1.0, "rewards/chosen": -1.1655118465423584, "rewards/margins": 7.464972496032715, "rewards/rejected": -8.630484580993652, "step": 289 }, { "epoch": 3.437037037037037, "grad_norm": 6.3238698051370825, "learning_rate": 2.8080738578703052e-08, "logits/chosen": -0.2396240085363388, "logits/rejected": -0.18526090681552887, "logps/chosen": -35.554222106933594, "logps/rejected": -80.34398651123047, "loss": 0.0385, "rewards/accuracies": 1.0, "rewards/chosen": -0.15909190475940704, "rewards/margins": 11.187956809997559, "rewards/rejected": -11.347049713134766, "step": 290 }, { "epoch": 3.448888888888889, "grad_norm": 6.096416193339527, "learning_rate": 2.6895252292709974e-08, "logits/chosen": -0.3143896460533142, "logits/rejected": -0.2961388826370239, "logps/chosen": -45.88547897338867, "logps/rejected": -72.7055892944336, "loss": 0.0357, "rewards/accuracies": 1.0, "rewards/chosen": -1.5558788776397705, "rewards/margins": 7.8544135093688965, "rewards/rejected": -9.41029167175293, "step": 291 }, { "epoch": 3.4607407407407407, "grad_norm": 6.109539533433742, "learning_rate": 2.5733908980083984e-08, "logits/chosen": -0.20717650651931763, "logits/rejected": -0.1371382474899292, "logps/chosen": -34.01182556152344, "logps/rejected": -69.25910949707031, "loss": 0.0275, "rewards/accuracies": 1.0, "rewards/chosen": -1.003832221031189, "rewards/margins": 6.742988586425781, "rewards/rejected": -7.74682092666626, "step": 292 }, { "epoch": 3.4725925925925925, "grad_norm": 5.397666624928578, "learning_rate": 2.4596834313994037e-08, "logits/chosen": -0.2090422511100769, "logits/rejected": -0.1905803680419922, "logps/chosen": -37.257659912109375, "logps/rejected": -59.028656005859375, "loss": 0.0267, "rewards/accuracies": 1.0, "rewards/chosen": -0.009922012686729431, "rewards/margins": 7.401028633117676, "rewards/rejected": -7.410951614379883, "step": 293 }, { "epoch": 3.4844444444444447, "grad_norm": 4.277274237028717, "learning_rate": 2.3484151341411018e-08, "logits/chosen": -0.28495097160339355, "logits/rejected": -0.13665924966335297, "logps/chosen": -30.814395904541016, "logps/rejected": -71.38455963134766, "loss": 0.0167, "rewards/accuracies": 1.0, "rewards/chosen": -0.883634090423584, "rewards/margins": 8.096288681030273, "rewards/rejected": -8.979923248291016, "step": 294 }, { "epoch": 3.4962962962962965, "grad_norm": 5.969698336918465, "learning_rate": 2.23959804697921e-08, "logits/chosen": 0.0012427568435668945, "logits/rejected": -0.05258895084261894, "logps/chosen": -43.88676071166992, "logps/rejected": -69.91567993164062, "loss": 0.0243, "rewards/accuracies": 1.0, "rewards/chosen": -1.0892465114593506, "rewards/margins": 8.158609390258789, "rewards/rejected": -9.247856140136719, "step": 295 }, { "epoch": 3.5081481481481482, "grad_norm": 4.0211602970173095, "learning_rate": 2.1332439454051277e-08, "logits/chosen": -0.20002827048301697, "logits/rejected": -0.1429349184036255, "logps/chosen": -34.301002502441406, "logps/rejected": -55.76853561401367, "loss": 0.0213, "rewards/accuracies": 1.0, "rewards/chosen": -0.09878039360046387, "rewards/margins": 6.826313495635986, "rewards/rejected": -6.925093650817871, "step": 296 }, { "epoch": 3.52, "grad_norm": 5.481157431560272, "learning_rate": 2.029364338381656e-08, "logits/chosen": -0.38807621598243713, "logits/rejected": -0.3801242709159851, "logps/chosen": -46.397727966308594, "logps/rejected": -55.0689582824707, "loss": 0.0289, "rewards/accuracies": 1.0, "rewards/chosen": -0.3208569288253784, "rewards/margins": 5.903377056121826, "rewards/rejected": -6.224234580993652, "step": 297 }, { "epoch": 3.531851851851852, "grad_norm": 5.352166370096892, "learning_rate": 1.9279704670975726e-08, "logits/chosen": -0.2355162501335144, "logits/rejected": -0.07986889034509659, "logps/chosen": -34.6320686340332, "logps/rejected": -71.03166198730469, "loss": 0.0233, "rewards/accuracies": 1.0, "rewards/chosen": -0.7058801651000977, "rewards/margins": 7.573249816894531, "rewards/rejected": -8.279129981994629, "step": 298 }, { "epoch": 3.5437037037037036, "grad_norm": 5.786741095008315, "learning_rate": 1.829073303751172e-08, "logits/chosen": -0.3151942193508148, "logits/rejected": -0.33904606103897095, "logps/chosen": -29.33087921142578, "logps/rejected": -61.94608688354492, "loss": 0.0277, "rewards/accuracies": 1.0, "rewards/chosen": -1.2769862413406372, "rewards/margins": 8.623825073242188, "rewards/rejected": -9.900812149047852, "step": 299 }, { "epoch": 3.5555555555555554, "grad_norm": 4.36518877931545, "learning_rate": 1.732683550362954e-08, "logits/chosen": -0.23127204179763794, "logits/rejected": -0.15411251783370972, "logps/chosen": -50.591552734375, "logps/rejected": -77.29461669921875, "loss": 0.0216, "rewards/accuracies": 1.0, "rewards/chosen": -1.3554518222808838, "rewards/margins": 7.547949314117432, "rewards/rejected": -8.903400421142578, "step": 300 }, { "epoch": 3.5674074074074076, "grad_norm": 4.847754834407613, "learning_rate": 1.6388116376174765e-08, "logits/chosen": -0.3548241853713989, "logits/rejected": -0.2721732556819916, "logps/chosen": -38.101295471191406, "logps/rejected": -78.74886322021484, "loss": 0.0229, "rewards/accuracies": 1.0, "rewards/chosen": -1.5179784297943115, "rewards/margins": 8.753091812133789, "rewards/rejected": -10.27107048034668, "step": 301 }, { "epoch": 3.5792592592592594, "grad_norm": 7.781488916643506, "learning_rate": 1.5474677237346468e-08, "logits/chosen": -0.3061152994632721, "logits/rejected": -0.3108983337879181, "logps/chosen": -41.504512786865234, "logps/rejected": -78.45973205566406, "loss": 0.0451, "rewards/accuracies": 1.0, "rewards/chosen": -1.2482175827026367, "rewards/margins": 8.759658813476562, "rewards/rejected": -10.0078763961792, "step": 302 }, { "epoch": 3.591111111111111, "grad_norm": 6.94980194588748, "learning_rate": 1.4586616933704527e-08, "logits/chosen": -0.018258891999721527, "logits/rejected": -0.007322182413190603, "logps/chosen": -54.701812744140625, "logps/rejected": -73.30741882324219, "loss": 0.0412, "rewards/accuracies": 1.0, "rewards/chosen": -1.3399360179901123, "rewards/margins": 6.371276378631592, "rewards/rejected": -7.711213111877441, "step": 303 }, { "epoch": 3.602962962962963, "grad_norm": 6.423577076131564, "learning_rate": 1.372403156547311e-08, "logits/chosen": -0.43270713090896606, "logits/rejected": -0.36236703395843506, "logps/chosen": -33.90000534057617, "logps/rejected": -60.915306091308594, "loss": 0.0339, "rewards/accuracies": 1.0, "rewards/chosen": -1.318174958229065, "rewards/margins": 7.222588062286377, "rewards/rejected": -8.540762901306152, "step": 304 }, { "epoch": 3.6148148148148147, "grad_norm": 4.091095308871669, "learning_rate": 1.2887014476141212e-08, "logits/chosen": -0.30502232909202576, "logits/rejected": -0.36349910497665405, "logps/chosen": -40.1556510925293, "logps/rejected": -68.93075561523438, "loss": 0.0244, "rewards/accuracies": 1.0, "rewards/chosen": -0.034249335527420044, "rewards/margins": 9.958725929260254, "rewards/rejected": -9.992975234985352, "step": 305 }, { "epoch": 3.626666666666667, "grad_norm": 6.177303284695512, "learning_rate": 1.2075656242361732e-08, "logits/chosen": -0.2732085585594177, "logits/rejected": -0.1902616024017334, "logps/chosen": -34.206607818603516, "logps/rejected": -68.56214904785156, "loss": 0.0255, "rewards/accuracies": 1.0, "rewards/chosen": -1.0075308084487915, "rewards/margins": 8.230666160583496, "rewards/rejected": -9.238195419311523, "step": 306 }, { "epoch": 3.6385185185185183, "grad_norm": 4.816450379471901, "learning_rate": 1.1290044664149873e-08, "logits/chosen": -0.11257211118936539, "logits/rejected": -0.13122713565826416, "logps/chosen": -48.688873291015625, "logps/rejected": -74.23355102539062, "loss": 0.021, "rewards/accuracies": 1.0, "rewards/chosen": -1.2951655387878418, "rewards/margins": 8.786190032958984, "rewards/rejected": -10.081355094909668, "step": 307 }, { "epoch": 3.6503703703703705, "grad_norm": 6.851539041964001, "learning_rate": 1.0530264755381824e-08, "logits/chosen": -0.4197385013103485, "logits/rejected": -0.43813198804855347, "logps/chosen": -33.74197769165039, "logps/rejected": -56.243553161621094, "loss": 0.0363, "rewards/accuracies": 1.0, "rewards/chosen": -0.6660629510879517, "rewards/margins": 5.48460054397583, "rewards/rejected": -6.150663375854492, "step": 308 }, { "epoch": 3.6622222222222223, "grad_norm": 4.955677182707875, "learning_rate": 9.796398734595284e-09, "logits/chosen": -0.26788032054901123, "logits/rejected": -0.26230883598327637, "logps/chosen": -28.0145206451416, "logps/rejected": -51.17897033691406, "loss": 0.0243, "rewards/accuracies": 1.0, "rewards/chosen": -0.11930375546216965, "rewards/margins": 5.247139930725098, "rewards/rejected": -5.366443634033203, "step": 309 }, { "epoch": 3.674074074074074, "grad_norm": 5.676351317770164, "learning_rate": 9.088526016092141e-09, "logits/chosen": -0.3300250470638275, "logits/rejected": -0.31215977668762207, "logps/chosen": -33.89133834838867, "logps/rejected": -67.8143539428711, "loss": 0.033, "rewards/accuracies": 1.0, "rewards/chosen": -0.08850759267807007, "rewards/margins": 9.25622272491455, "rewards/rejected": -9.344730377197266, "step": 310 }, { "epoch": 3.685925925925926, "grad_norm": 5.070558881430833, "learning_rate": 8.40672320134489e-09, "logits/chosen": -0.305147260427475, "logits/rejected": -0.17320549488067627, "logps/chosen": -33.947872161865234, "logps/rejected": -74.02790832519531, "loss": 0.0242, "rewards/accuracies": 1.0, "rewards/chosen": 0.0814896896481514, "rewards/margins": 9.62078857421875, "rewards/rejected": -9.539299011230469, "step": 311 }, { "epoch": 3.6977777777777776, "grad_norm": 4.665245148624672, "learning_rate": 7.751064070707247e-09, "logits/chosen": -0.44625863432884216, "logits/rejected": -0.4555772542953491, "logps/chosen": -42.39094161987305, "logps/rejected": -67.60736846923828, "loss": 0.0215, "rewards/accuracies": 1.0, "rewards/chosen": -0.6547796130180359, "rewards/margins": 7.189840793609619, "rewards/rejected": -7.844620227813721, "step": 312 }, { "epoch": 3.70962962962963, "grad_norm": 7.083642027360033, "learning_rate": 7.12161957543006e-09, "logits/chosen": -0.251234769821167, "logits/rejected": -0.13808919489383698, "logps/chosen": -54.0128288269043, "logps/rejected": -92.3724365234375, "loss": 0.035, "rewards/accuracies": 1.0, "rewards/chosen": -1.6177242994308472, "rewards/margins": 8.311200141906738, "rewards/rejected": -9.928923606872559, "step": 313 }, { "epoch": 3.7214814814814816, "grad_norm": 4.713181664530663, "learning_rate": 6.518457829983559e-09, "logits/chosen": -0.3703707158565521, "logits/rejected": -0.2648147940635681, "logps/chosen": -48.93006896972656, "logps/rejected": -66.18186950683594, "loss": 0.0258, "rewards/accuracies": 1.0, "rewards/chosen": -0.7085801362991333, "rewards/margins": 5.302105903625488, "rewards/rejected": -6.010685920715332, "step": 314 }, { "epoch": 3.7333333333333334, "grad_norm": 4.802422252346654, "learning_rate": 5.9416441046862555e-09, "logits/chosen": -0.3589542508125305, "logits/rejected": -0.3287428319454193, "logps/chosen": -27.614389419555664, "logps/rejected": -59.38009262084961, "loss": 0.0301, "rewards/accuracies": 1.0, "rewards/chosen": -0.17799943685531616, "rewards/margins": 7.3723649978637695, "rewards/rejected": -7.550364017486572, "step": 315 }, { "epoch": 3.745185185185185, "grad_norm": 5.504211939139539, "learning_rate": 5.3912408186420064e-09, "logits/chosen": -0.19225972890853882, "logits/rejected": -0.24219948053359985, "logps/chosen": -39.883174896240234, "logps/rejected": -57.19430160522461, "loss": 0.026, "rewards/accuracies": 1.0, "rewards/chosen": -0.7933384776115417, "rewards/margins": 7.422837257385254, "rewards/rejected": -8.21617603302002, "step": 316 }, { "epoch": 3.757037037037037, "grad_norm": 5.560326027847558, "learning_rate": 4.867307532985227e-09, "logits/chosen": -0.48098868131637573, "logits/rejected": -0.3568829894065857, "logps/chosen": -54.420379638671875, "logps/rejected": -85.24766540527344, "loss": 0.0297, "rewards/accuracies": 1.0, "rewards/chosen": -1.9853860139846802, "rewards/margins": 7.260770320892334, "rewards/rejected": -9.246155738830566, "step": 317 }, { "epoch": 3.7688888888888887, "grad_norm": 7.196198216327019, "learning_rate": 4.369900944435734e-09, "logits/chosen": -0.16600579023361206, "logits/rejected": -0.0450252965092659, "logps/chosen": -42.486473083496094, "logps/rejected": -81.37212371826172, "loss": 0.0389, "rewards/accuracies": 1.0, "rewards/chosen": -0.9354689121246338, "rewards/margins": 7.651963233947754, "rewards/rejected": -8.587431907653809, "step": 318 }, { "epoch": 3.7807407407407405, "grad_norm": 6.279239735558416, "learning_rate": 3.899074879163244e-09, "logits/chosen": -0.40112194418907166, "logits/rejected": -0.339353084564209, "logps/chosen": -34.84368133544922, "logps/rejected": -61.614341735839844, "loss": 0.0348, "rewards/accuracies": 1.0, "rewards/chosen": -1.0974161624908447, "rewards/margins": 7.075118064880371, "rewards/rejected": -8.172533988952637, "step": 319 }, { "epoch": 3.7925925925925927, "grad_norm": 6.9892724428640705, "learning_rate": 3.4548802869627804e-09, "logits/chosen": -0.3009003698825836, "logits/rejected": -0.21556389331817627, "logps/chosen": -41.47652816772461, "logps/rejected": -67.13237762451172, "loss": 0.0368, "rewards/accuracies": 1.0, "rewards/chosen": -0.2825208604335785, "rewards/margins": 4.948946952819824, "rewards/rejected": -5.231468200683594, "step": 320 }, { "epoch": 3.8044444444444445, "grad_norm": 6.856578977189586, "learning_rate": 3.037365235741024e-09, "logits/chosen": -0.16049635410308838, "logits/rejected": -0.12033607065677643, "logps/chosen": -32.831031799316406, "logps/rejected": -62.16967010498047, "loss": 0.0433, "rewards/accuracies": 0.9375, "rewards/chosen": -0.41414308547973633, "rewards/margins": 7.311939239501953, "rewards/rejected": -7.726081371307373, "step": 321 }, { "epoch": 3.8162962962962963, "grad_norm": 9.951539288327684, "learning_rate": 2.6465749063149245e-09, "logits/chosen": -0.6789449453353882, "logits/rejected": -0.6332409381866455, "logps/chosen": -36.879947662353516, "logps/rejected": -80.12794494628906, "loss": 0.0538, "rewards/accuracies": 1.0, "rewards/chosen": -1.3466782569885254, "rewards/margins": 9.836599349975586, "rewards/rejected": -11.183279037475586, "step": 322 }, { "epoch": 3.828148148148148, "grad_norm": 6.85068630218398, "learning_rate": 2.282551587522441e-09, "logits/chosen": -0.5462524890899658, "logits/rejected": -0.43087050318717957, "logps/chosen": -32.245567321777344, "logps/rejected": -56.410179138183594, "loss": 0.0455, "rewards/accuracies": 1.0, "rewards/chosen": -1.1382488012313843, "rewards/margins": 6.717283248901367, "rewards/rejected": -7.855532169342041, "step": 323 }, { "epoch": 3.84, "grad_norm": 6.505661948537336, "learning_rate": 1.9453346716462316e-09, "logits/chosen": -0.3759889602661133, "logits/rejected": -0.4021185040473938, "logps/chosen": -36.99543380737305, "logps/rejected": -46.82232666015625, "loss": 0.0393, "rewards/accuracies": 1.0, "rewards/chosen": -0.881350576877594, "rewards/margins": 4.79249382019043, "rewards/rejected": -5.673844337463379, "step": 324 }, { "epoch": 3.851851851851852, "grad_norm": 6.937575318919963, "learning_rate": 1.6349606501509794e-09, "logits/chosen": -0.2532769441604614, "logits/rejected": -0.2831670045852661, "logps/chosen": -41.752403259277344, "logps/rejected": -52.2098388671875, "loss": 0.0363, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7358890771865845, "rewards/margins": 6.947170734405518, "rewards/rejected": -7.6830596923828125, "step": 325 }, { "epoch": 3.863703703703704, "grad_norm": 3.940918067532272, "learning_rate": 1.351463109734441e-09, "logits/chosen": -0.6125096082687378, "logits/rejected": -0.3610725700855255, "logps/chosen": -36.78853225708008, "logps/rejected": -68.35499572753906, "loss": 0.0198, "rewards/accuracies": 1.0, "rewards/chosen": -0.8846864700317383, "rewards/margins": 8.920228004455566, "rewards/rejected": -9.804913520812988, "step": 326 }, { "epoch": 3.8755555555555556, "grad_norm": 5.731819862714682, "learning_rate": 1.0948727286930192e-09, "logits/chosen": -0.07925964891910553, "logits/rejected": -0.04111175611615181, "logps/chosen": -35.71060562133789, "logps/rejected": -57.488651275634766, "loss": 0.0309, "rewards/accuracies": 1.0, "rewards/chosen": -0.09710303694009781, "rewards/margins": 6.214367866516113, "rewards/rejected": -6.311470985412598, "step": 327 }, { "epoch": 3.8874074074074074, "grad_norm": 7.4093470103902845, "learning_rate": 8.652172736017816e-10, "logits/chosen": -0.2285485416650772, "logits/rejected": -0.22180640697479248, "logps/chosen": -45.58677673339844, "logps/rejected": -74.19960021972656, "loss": 0.0412, "rewards/accuracies": 1.0, "rewards/chosen": -0.34262484312057495, "rewards/margins": 6.549678325653076, "rewards/rejected": -6.892302989959717, "step": 328 }, { "epoch": 3.899259259259259, "grad_norm": 5.597791664125433, "learning_rate": 6.625215963098896e-10, "logits/chosen": -0.22412040829658508, "logits/rejected": -0.25815126299858093, "logps/chosen": -39.15158462524414, "logps/rejected": -52.841060638427734, "loss": 0.0284, "rewards/accuracies": 1.0, "rewards/chosen": -1.1085604429244995, "rewards/margins": 5.7370734214782715, "rewards/rejected": -6.845633506774902, "step": 329 }, { "epoch": 3.911111111111111, "grad_norm": 4.652252051429055, "learning_rate": 4.868076312512515e-10, "logits/chosen": -0.3777186870574951, "logits/rejected": -0.3193652629852295, "logps/chosen": -32.07898712158203, "logps/rejected": -64.57249450683594, "loss": 0.0241, "rewards/accuracies": 1.0, "rewards/chosen": -0.5756787657737732, "rewards/margins": 7.689653396606445, "rewards/rejected": -8.265332221984863, "step": 330 }, { "epoch": 3.9229629629629628, "grad_norm": 5.81932820439485, "learning_rate": 3.3809439307086463e-10, "logits/chosen": -0.20917584002017975, "logits/rejected": -0.12255613505840302, "logps/chosen": -31.15184783935547, "logps/rejected": -63.176910400390625, "loss": 0.0335, "rewards/accuracies": 1.0, "rewards/chosen": -0.12342405319213867, "rewards/margins": 6.832948684692383, "rewards/rejected": -6.956372261047363, "step": 331 }, { "epoch": 3.934814814814815, "grad_norm": 6.583884307182356, "learning_rate": 2.1639797456723952e-10, "logits/chosen": -0.3113446831703186, "logits/rejected": -0.34424495697021484, "logps/chosen": -50.414649963378906, "logps/rejected": -66.15986633300781, "loss": 0.0305, "rewards/accuracies": 1.0, "rewards/chosen": -0.8464060425758362, "rewards/margins": 7.252386093139648, "rewards/rejected": -8.098793029785156, "step": 332 }, { "epoch": 3.9466666666666668, "grad_norm": 5.902458355798929, "learning_rate": 1.21731544950876e-10, "logits/chosen": -0.23517660796642303, "logits/rejected": -0.3104686737060547, "logps/chosen": -45.34723663330078, "logps/rejected": -86.45767974853516, "loss": 0.0269, "rewards/accuracies": 1.0, "rewards/chosen": -1.3326948881149292, "rewards/margins": 9.871392250061035, "rewards/rejected": -11.204087257385254, "step": 333 }, { "epoch": 3.9585185185185185, "grad_norm": 5.025564856386988, "learning_rate": 5.4105348419264394e-11, "logits/chosen": -0.6560889482498169, "logits/rejected": -0.6381913423538208, "logps/chosen": -31.789920806884766, "logps/rejected": -57.232948303222656, "loss": 0.0281, "rewards/accuracies": 1.0, "rewards/chosen": -0.43755847215652466, "rewards/margins": 6.743692398071289, "rewards/rejected": -7.181251525878906, "step": 334 }, { "epoch": 3.9703703703703703, "grad_norm": 4.328140885361808, "learning_rate": 1.3526703048216682e-11, "logits/chosen": -0.5039613246917725, "logits/rejected": -0.4086998999118805, "logps/chosen": -34.5649528503418, "logps/rejected": -85.53738403320312, "loss": 0.0214, "rewards/accuracies": 1.0, "rewards/chosen": -0.7040150165557861, "rewards/margins": 11.698722839355469, "rewards/rejected": -12.402737617492676, "step": 335 }, { "epoch": 3.982222222222222, "grad_norm": 5.989193672621692, "learning_rate": 0.0, "logits/chosen": -0.3169300854206085, "logits/rejected": -0.32203078269958496, "logps/chosen": -37.71720886230469, "logps/rejected": -64.16942596435547, "loss": 0.0329, "rewards/accuracies": 1.0, "rewards/chosen": -0.6902213096618652, "rewards/margins": 7.074714183807373, "rewards/rejected": -7.76493501663208, "step": 336 }, { "epoch": 3.982222222222222, "step": 336, "total_flos": 0.0, "train_loss": 0.14947671072912358, "train_runtime": 64472.8667, "train_samples_per_second": 0.67, "train_steps_per_second": 0.005 } ], "logging_steps": 1, "max_steps": 336, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 200, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }