{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 400, "global_step": 17412, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00017229496898690558, "grad_norm": 2.0905284881591797, "learning_rate": 5.74052812858783e-11, "logits/chosen": -2.8080272674560547, "logits/rejected": -2.785019874572754, "logps/chosen": -44.8405876159668, "logps/rejected": -39.36625671386719, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0017229496898690559, "grad_norm": 2.09809947013855, "learning_rate": 5.74052812858783e-10, "logits/chosen": -2.9044275283813477, "logits/rejected": -2.8818445205688477, "logps/chosen": -51.817386627197266, "logps/rejected": -49.23894119262695, "loss": 0.6932, "rewards/accuracies": 0.4513888955116272, "rewards/chosen": -9.387239697389305e-05, "rewards/margins": -6.934934935998172e-05, "rewards/rejected": -2.4523074898752384e-05, "step": 10 }, { "epoch": 0.0034458993797381117, "grad_norm": 2.099186658859253, "learning_rate": 1.148105625717566e-09, "logits/chosen": -2.9467902183532715, "logits/rejected": -2.941981077194214, "logps/chosen": -53.83275604248047, "logps/rejected": -52.87550735473633, "loss": 0.6932, "rewards/accuracies": 0.5, "rewards/chosen": -0.00010858189489226788, "rewards/margins": -0.00010842746996786445, "rewards/rejected": -1.5443511358625983e-07, "step": 20 }, { "epoch": 0.005168849069607168, "grad_norm": 2.2349655628204346, "learning_rate": 1.7221584385763488e-09, "logits/chosen": -2.910006046295166, "logits/rejected": -2.891695261001587, "logps/chosen": -57.67896270751953, "logps/rejected": -57.83086395263672, "loss": 0.6932, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": 4.440903921931749e-06, "rewards/margins": -4.769151928485371e-05, "rewards/rejected": 5.213242911850102e-05, "step": 30 }, { "epoch": 0.006891798759476223, "grad_norm": 1.8455334901809692, "learning_rate": 2.296211251435132e-09, "logits/chosen": -2.9271020889282227, "logits/rejected": -2.9034230709075928, "logps/chosen": -56.056373596191406, "logps/rejected": -50.16437530517578, "loss": 0.693, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.00015602688654325902, "rewards/margins": 0.00022045333753339946, "rewards/rejected": -6.442649464588612e-05, "step": 40 }, { "epoch": 0.00861474844934528, "grad_norm": 1.9800422191619873, "learning_rate": 2.870264064293915e-09, "logits/chosen": -2.9295787811279297, "logits/rejected": -2.918616771697998, "logps/chosen": -53.17473220825195, "logps/rejected": -50.476890563964844, "loss": 0.6931, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -6.461610610131174e-05, "rewards/margins": -8.848131756167277e-07, "rewards/rejected": -6.373132055159658e-05, "step": 50 }, { "epoch": 0.010337698139214336, "grad_norm": 2.3492772579193115, "learning_rate": 3.4443168771526976e-09, "logits/chosen": -2.9493114948272705, "logits/rejected": -2.926198720932007, "logps/chosen": -58.42417526245117, "logps/rejected": -53.90165328979492, "loss": 0.6933, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -0.00026348684332333505, "rewards/margins": -0.00033530019572936, "rewards/rejected": 7.18133378541097e-05, "step": 60 }, { "epoch": 0.012060647829083391, "grad_norm": 2.0364973545074463, "learning_rate": 4.018369690011481e-09, "logits/chosen": -2.9074623584747314, "logits/rejected": -2.895207166671753, "logps/chosen": -54.81498336791992, "logps/rejected": -52.3970832824707, "loss": 0.6931, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.00012572194100357592, "rewards/margins": 7.510065915994346e-05, "rewards/rejected": 5.062127092969604e-05, "step": 70 }, { "epoch": 0.013783597518952447, "grad_norm": 2.25162672996521, "learning_rate": 4.592422502870264e-09, "logits/chosen": -2.9643242359161377, "logits/rejected": -2.942950487136841, "logps/chosen": -60.2001953125, "logps/rejected": -53.245811462402344, "loss": 0.6932, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -2.0969393517589197e-05, "rewards/margins": -5.4584954341407865e-05, "rewards/rejected": 3.361555718583986e-05, "step": 80 }, { "epoch": 0.015506547208821502, "grad_norm": 2.1411986351013184, "learning_rate": 5.166475315729047e-09, "logits/chosen": -2.867920160293579, "logits/rejected": -2.8608827590942383, "logps/chosen": -54.94426345825195, "logps/rejected": -51.807090759277344, "loss": 0.6932, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.00012062456517014652, "rewards/margins": -4.8005091230152175e-05, "rewards/rejected": -7.261949212988839e-05, "step": 90 }, { "epoch": 0.01722949689869056, "grad_norm": 2.2051501274108887, "learning_rate": 5.74052812858783e-09, "logits/chosen": -2.967716693878174, "logits/rejected": -2.919863224029541, "logps/chosen": -57.4222526550293, "logps/rejected": -48.79217529296875, "loss": 0.6931, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.00012718760990537703, "rewards/margins": 0.00012836657697334886, "rewards/rejected": -0.00025555412867106497, "step": 100 }, { "epoch": 0.018952446588559616, "grad_norm": 2.246222496032715, "learning_rate": 6.314580941446612e-09, "logits/chosen": -2.9464218616485596, "logits/rejected": -2.9271929264068604, "logps/chosen": -56.66094970703125, "logps/rejected": -51.9701042175293, "loss": 0.6931, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 0.00010782321623992175, "rewards/margins": 0.00011707199882948771, "rewards/rejected": -9.2488016889547e-06, "step": 110 }, { "epoch": 0.02067539627842867, "grad_norm": 2.312595844268799, "learning_rate": 6.888633754305395e-09, "logits/chosen": -2.885158061981201, "logits/rejected": -2.873599052429199, "logps/chosen": -53.678932189941406, "logps/rejected": -54.923248291015625, "loss": 0.693, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.00021055006072856486, "rewards/margins": 0.0002522490103729069, "rewards/rejected": -4.1698935092426836e-05, "step": 120 }, { "epoch": 0.022398345968297727, "grad_norm": 1.9637584686279297, "learning_rate": 7.462686567164179e-09, "logits/chosen": -2.932377338409424, "logits/rejected": -2.9271585941314697, "logps/chosen": -56.641029357910156, "logps/rejected": -53.10710525512695, "loss": 0.6931, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.00017390199354849756, "rewards/margins": 7.098014066286851e-06, "rewards/rejected": -0.00018099998123943806, "step": 130 }, { "epoch": 0.024121295658166782, "grad_norm": 2.3946006298065186, "learning_rate": 8.036739380022962e-09, "logits/chosen": -2.9414381980895996, "logits/rejected": -2.9314544200897217, "logps/chosen": -54.4839973449707, "logps/rejected": -52.5967903137207, "loss": 0.6932, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -9.17953802854754e-05, "rewards/margins": -9.829508780967444e-05, "rewards/rejected": 6.49970297672553e-06, "step": 140 }, { "epoch": 0.025844245348035838, "grad_norm": 2.067854642868042, "learning_rate": 8.610792192881745e-09, "logits/chosen": -2.8870601654052734, "logits/rejected": -2.874255657196045, "logps/chosen": -53.06481170654297, "logps/rejected": -51.183555603027344, "loss": 0.6933, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -1.808160959626548e-05, "rewards/margins": -0.0002152116794604808, "rewards/rejected": 0.00019713006622623652, "step": 150 }, { "epoch": 0.027567195037904894, "grad_norm": 1.8846144676208496, "learning_rate": 9.184845005740529e-09, "logits/chosen": -2.9312491416931152, "logits/rejected": -2.9175572395324707, "logps/chosen": -54.59100341796875, "logps/rejected": -54.2659912109375, "loss": 0.6931, "rewards/accuracies": 0.5, "rewards/chosen": 5.2008450438734144e-05, "rewards/margins": 8.009701559785753e-05, "rewards/rejected": -2.8088572435081005e-05, "step": 160 }, { "epoch": 0.02929014472777395, "grad_norm": 2.0794315338134766, "learning_rate": 9.758897818599312e-09, "logits/chosen": -2.9155006408691406, "logits/rejected": -2.901543617248535, "logps/chosen": -56.4522705078125, "logps/rejected": -50.75872802734375, "loss": 0.6931, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 1.9222959963371977e-05, "rewards/margins": 0.0001044358141371049, "rewards/rejected": -8.521286508766934e-05, "step": 170 }, { "epoch": 0.031013094417643005, "grad_norm": 2.257279872894287, "learning_rate": 1.0332950631458094e-08, "logits/chosen": -2.9141793251037598, "logits/rejected": -2.898280382156372, "logps/chosen": -57.03478240966797, "logps/rejected": -52.420867919921875, "loss": 0.6931, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 5.353235246730037e-05, "rewards/margins": 9.774983482202515e-05, "rewards/rejected": -4.4217464164830744e-05, "step": 180 }, { "epoch": 0.03273604410751206, "grad_norm": 2.538918972015381, "learning_rate": 1.0907003444316877e-08, "logits/chosen": -2.9484705924987793, "logits/rejected": -2.9152588844299316, "logps/chosen": -59.644805908203125, "logps/rejected": -51.64323043823242, "loss": 0.6933, "rewards/accuracies": 0.46875, "rewards/chosen": -0.000212798360735178, "rewards/margins": -0.00027094813412986696, "rewards/rejected": 5.8149791584583e-05, "step": 190 }, { "epoch": 0.03445899379738112, "grad_norm": 2.2350316047668457, "learning_rate": 1.148105625717566e-08, "logits/chosen": -2.907341241836548, "logits/rejected": -2.897705554962158, "logps/chosen": -54.77196502685547, "logps/rejected": -53.700828552246094, "loss": 0.6931, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -9.717059583636001e-05, "rewards/margins": 5.300307020661421e-05, "rewards/rejected": -0.00015017366968095303, "step": 200 }, { "epoch": 0.03618194348725017, "grad_norm": 2.1636910438537598, "learning_rate": 1.2055109070034444e-08, "logits/chosen": -2.8660168647766113, "logits/rejected": -2.8632164001464844, "logps/chosen": -54.080406188964844, "logps/rejected": -56.40966033935547, "loss": 0.6931, "rewards/accuracies": 0.46875, "rewards/chosen": 8.996956603368744e-05, "rewards/margins": 3.7124355003470555e-05, "rewards/rejected": 5.284523285808973e-05, "step": 210 }, { "epoch": 0.03790489317711923, "grad_norm": 2.0409598350524902, "learning_rate": 1.2629161882893224e-08, "logits/chosen": -2.904695749282837, "logits/rejected": -2.8822054862976074, "logps/chosen": -53.443138122558594, "logps/rejected": -49.99930953979492, "loss": 0.6933, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.00015412228822242469, "rewards/margins": -0.0002943346626125276, "rewards/rejected": 0.0001402123598381877, "step": 220 }, { "epoch": 0.03962784286698828, "grad_norm": 2.233654737472534, "learning_rate": 1.3203214695752007e-08, "logits/chosen": -2.900219202041626, "logits/rejected": -2.8891189098358154, "logps/chosen": -49.81378936767578, "logps/rejected": -49.349815368652344, "loss": 0.6932, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.00014936344814486802, "rewards/margins": -0.00013175979256629944, "rewards/rejected": -1.760370287229307e-05, "step": 230 }, { "epoch": 0.04135079255685734, "grad_norm": 1.9850331544876099, "learning_rate": 1.377726750861079e-08, "logits/chosen": -2.8763766288757324, "logits/rejected": -2.84673810005188, "logps/chosen": -56.71735382080078, "logps/rejected": -51.65503692626953, "loss": 0.6931, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.7611677321838215e-05, "rewards/margins": 0.0001161120380857028, "rewards/rejected": -0.00013372373359743506, "step": 240 }, { "epoch": 0.043073742246726394, "grad_norm": 2.068523406982422, "learning_rate": 1.4351320321469574e-08, "logits/chosen": -2.949462413787842, "logits/rejected": -2.9321811199188232, "logps/chosen": -53.439109802246094, "logps/rejected": -50.085819244384766, "loss": 0.6931, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 3.4617838537087664e-05, "rewards/margins": 1.9717685063369572e-05, "rewards/rejected": 1.490015256422339e-05, "step": 250 }, { "epoch": 0.044796691936595454, "grad_norm": 1.9957587718963623, "learning_rate": 1.4925373134328357e-08, "logits/chosen": -2.9308934211730957, "logits/rejected": -2.9233505725860596, "logps/chosen": -55.7227897644043, "logps/rejected": -55.18525314331055, "loss": 0.693, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.00011331225687172264, "rewards/margins": 0.00031197606585919857, "rewards/rejected": -0.00019866382353939116, "step": 260 }, { "epoch": 0.046519641626464506, "grad_norm": 2.175574541091919, "learning_rate": 1.549942594718714e-08, "logits/chosen": -2.8968732357025146, "logits/rejected": -2.889274835586548, "logps/chosen": -53.792694091796875, "logps/rejected": -53.44324493408203, "loss": 0.6931, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.0001622785785002634, "rewards/margins": 7.451194687746465e-05, "rewards/rejected": -0.00023679053992964327, "step": 270 }, { "epoch": 0.048242591316333565, "grad_norm": 2.0508830547332764, "learning_rate": 1.6073478760045924e-08, "logits/chosen": -2.9564032554626465, "logits/rejected": -2.9346067905426025, "logps/chosen": -58.7863883972168, "logps/rejected": -52.57866287231445, "loss": 0.6932, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.00020536321972031146, "rewards/margins": -7.604634447488934e-05, "rewards/rejected": -0.0001293168606935069, "step": 280 }, { "epoch": 0.04996554100620262, "grad_norm": 1.9876508712768555, "learning_rate": 1.6647531572904707e-08, "logits/chosen": -2.9049034118652344, "logits/rejected": -2.895927667617798, "logps/chosen": -56.95619583129883, "logps/rejected": -53.31736373901367, "loss": 0.6931, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.00022282273857854307, "rewards/margins": 7.085135439410806e-05, "rewards/rejected": -0.00029367406386882067, "step": 290 }, { "epoch": 0.051688490696071676, "grad_norm": 2.060868501663208, "learning_rate": 1.722158438576349e-08, "logits/chosen": -2.853228807449341, "logits/rejected": -2.8549416065216064, "logps/chosen": -54.9939079284668, "logps/rejected": -53.275054931640625, "loss": 0.6931, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -2.7072605007560924e-05, "rewards/margins": 0.00010046066017821431, "rewards/rejected": -0.0001275332469958812, "step": 300 }, { "epoch": 0.05341144038594073, "grad_norm": 2.057678699493408, "learning_rate": 1.7795637198622274e-08, "logits/chosen": -2.9021923542022705, "logits/rejected": -2.9028067588806152, "logps/chosen": -54.8409423828125, "logps/rejected": -52.515342712402344, "loss": 0.6931, "rewards/accuracies": 0.53125, "rewards/chosen": -7.745550101390108e-05, "rewards/margins": 5.208273069001734e-05, "rewards/rejected": -0.0001295382244279608, "step": 310 }, { "epoch": 0.05513439007580979, "grad_norm": 2.3376967906951904, "learning_rate": 1.8369690011481057e-08, "logits/chosen": -2.887517213821411, "logits/rejected": -2.869492769241333, "logps/chosen": -56.575233459472656, "logps/rejected": -48.890525817871094, "loss": 0.6932, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.00010880133777391165, "rewards/margins": -3.789450420299545e-05, "rewards/rejected": -7.09068845026195e-05, "step": 320 }, { "epoch": 0.05685733976567884, "grad_norm": 2.0127511024475098, "learning_rate": 1.894374282433984e-08, "logits/chosen": -2.9153316020965576, "logits/rejected": -2.898012161254883, "logps/chosen": -56.18683624267578, "logps/rejected": -51.056007385253906, "loss": 0.6929, "rewards/accuracies": 0.625, "rewards/chosen": 8.49324424052611e-05, "rewards/margins": 0.00045079676783643663, "rewards/rejected": -0.0003658643108792603, "step": 330 }, { "epoch": 0.0585802894555479, "grad_norm": 1.9972854852676392, "learning_rate": 1.9517795637198624e-08, "logits/chosen": -2.883319139480591, "logits/rejected": -2.8704676628112793, "logps/chosen": -52.8604621887207, "logps/rejected": -51.86638259887695, "loss": 0.6931, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.00010438141180202365, "rewards/margins": 0.0001349625817965716, "rewards/rejected": -0.00023934399359859526, "step": 340 }, { "epoch": 0.06030323914541695, "grad_norm": 2.250826597213745, "learning_rate": 2.0091848450057404e-08, "logits/chosen": -2.8549716472625732, "logits/rejected": -2.82232666015625, "logps/chosen": -57.10233688354492, "logps/rejected": -53.84283447265625, "loss": 0.6931, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 4.836320294998586e-05, "rewards/margins": 0.00014005022239871323, "rewards/rejected": -9.168702672468498e-05, "step": 350 }, { "epoch": 0.06202618883528601, "grad_norm": 2.1809022426605225, "learning_rate": 2.0665901262916187e-08, "logits/chosen": -2.9523656368255615, "logits/rejected": -2.9341259002685547, "logps/chosen": -56.03881072998047, "logps/rejected": -49.349849700927734, "loss": 0.6932, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.00038100595702417195, "rewards/margins": -6.664998181804549e-06, "rewards/rejected": -0.0003743409179151058, "step": 360 }, { "epoch": 0.06374913852515507, "grad_norm": 2.0530834197998047, "learning_rate": 2.123995407577497e-08, "logits/chosen": -2.938208818435669, "logits/rejected": -2.913538694381714, "logps/chosen": -54.34980392456055, "logps/rejected": -50.99808120727539, "loss": 0.693, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.00012077521387254819, "rewards/margins": 0.00036179949529469013, "rewards/rejected": -0.0004825748037546873, "step": 370 }, { "epoch": 0.06547208821502412, "grad_norm": 1.9240037202835083, "learning_rate": 2.1814006888633754e-08, "logits/chosen": -2.9995782375335693, "logits/rejected": -2.9801979064941406, "logps/chosen": -55.090370178222656, "logps/rejected": -51.10495376586914, "loss": 0.693, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0001042260555550456, "rewards/margins": 0.0003570080443751067, "rewards/rejected": -0.0004612340999301523, "step": 380 }, { "epoch": 0.06719503790489317, "grad_norm": 2.231076240539551, "learning_rate": 2.2388059701492537e-08, "logits/chosen": -2.9322702884674072, "logits/rejected": -2.9176416397094727, "logps/chosen": -57.369293212890625, "logps/rejected": -54.06401824951172, "loss": 0.6932, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -0.0004567280411720276, "rewards/margins": -0.00018372261547483504, "rewards/rejected": -0.0002730053965933621, "step": 390 }, { "epoch": 0.06891798759476224, "grad_norm": 1.7321640253067017, "learning_rate": 2.296211251435132e-08, "logits/chosen": -2.919123649597168, "logits/rejected": -2.9075767993927, "logps/chosen": -54.519500732421875, "logps/rejected": -51.47249221801758, "loss": 0.693, "rewards/accuracies": 0.5625, "rewards/chosen": -0.00017625563486944884, "rewards/margins": 0.00038831119309179485, "rewards/rejected": -0.0005645668716169894, "step": 400 }, { "epoch": 0.06891798759476224, "eval_logits/chosen": -2.9726641178131104, "eval_logits/rejected": -2.9690604209899902, "eval_logps/chosen": -58.985782623291016, "eval_logps/rejected": -62.72700500488281, "eval_loss": 0.693112313747406, "eval_rewards/accuracies": 0.5111523866653442, "eval_rewards/chosen": 0.00029688214999623597, "eval_rewards/margins": 7.089837890816852e-05, "eval_rewards/rejected": 0.00022598376381210983, "eval_runtime": 383.0475, "eval_samples_per_second": 11.236, "eval_steps_per_second": 1.405, "step": 400 }, { "epoch": 0.07064093728463129, "grad_norm": 1.944787621498108, "learning_rate": 2.3536165327210104e-08, "logits/chosen": -2.9126317501068115, "logits/rejected": -2.913689136505127, "logps/chosen": -51.16768264770508, "logps/rejected": -54.39495849609375, "loss": 0.6931, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.0004532871244009584, "rewards/margins": 0.00013652675261255354, "rewards/rejected": -0.0005898139206692576, "step": 410 }, { "epoch": 0.07236388697450034, "grad_norm": 2.370176076889038, "learning_rate": 2.4110218140068887e-08, "logits/chosen": -2.899533748626709, "logits/rejected": -2.896271228790283, "logps/chosen": -55.39063262939453, "logps/rejected": -53.54706573486328, "loss": 0.693, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.00029050689772702754, "rewards/margins": 0.00030943285673856735, "rewards/rejected": -0.0005999397253617644, "step": 420 }, { "epoch": 0.0740868366643694, "grad_norm": 2.0748910903930664, "learning_rate": 2.4684270952927668e-08, "logits/chosen": -2.93806529045105, "logits/rejected": -2.927563190460205, "logps/chosen": -54.763023376464844, "logps/rejected": -53.027503967285156, "loss": 0.693, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -4.900352723780088e-05, "rewards/margins": 0.0003839733253698796, "rewards/rejected": -0.00043297684169374406, "step": 430 }, { "epoch": 0.07580978635423846, "grad_norm": 2.3829500675201416, "learning_rate": 2.5258323765786448e-08, "logits/chosen": -2.979527235031128, "logits/rejected": -2.953399896621704, "logps/chosen": -55.02161407470703, "logps/rejected": -52.73406982421875, "loss": 0.6927, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -5.88020084251184e-05, "rewards/margins": 0.0008097798563539982, "rewards/rejected": -0.000868581875693053, "step": 440 }, { "epoch": 0.07753273604410751, "grad_norm": 2.041431427001953, "learning_rate": 2.583237657864523e-08, "logits/chosen": -2.9178643226623535, "logits/rejected": -2.8964293003082275, "logps/chosen": -57.30718231201172, "logps/rejected": -54.427452087402344, "loss": 0.693, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.0002218265290139243, "rewards/margins": 0.00039240572368726134, "rewards/rejected": -0.0006142322672531009, "step": 450 }, { "epoch": 0.07925568573397657, "grad_norm": 1.999345064163208, "learning_rate": 2.6406429391504014e-08, "logits/chosen": -2.8885178565979004, "logits/rejected": -2.877211570739746, "logps/chosen": -57.013694763183594, "logps/rejected": -52.124473571777344, "loss": 0.693, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.00032575431396253407, "rewards/margins": 0.00021151344117242843, "rewards/rejected": -0.0005372677696868777, "step": 460 }, { "epoch": 0.08097863542384562, "grad_norm": 2.046933174133301, "learning_rate": 2.6980482204362798e-08, "logits/chosen": -2.9009499549865723, "logits/rejected": -2.8764655590057373, "logps/chosen": -54.453948974609375, "logps/rejected": -50.358951568603516, "loss": 0.6928, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -4.428888496477157e-05, "rewards/margins": 0.0006898181745782495, "rewards/rejected": -0.0007341071031987667, "step": 470 }, { "epoch": 0.08270158511371468, "grad_norm": 2.3568708896636963, "learning_rate": 2.755453501722158e-08, "logits/chosen": -2.9118103981018066, "logits/rejected": -2.904237747192383, "logps/chosen": -54.8182373046875, "logps/rejected": -58.23457717895508, "loss": 0.6928, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.00026756085571832955, "rewards/margins": 0.000665828469209373, "rewards/rejected": -0.000933389354031533, "step": 480 }, { "epoch": 0.08442453480358374, "grad_norm": 2.236090898513794, "learning_rate": 2.8128587830080364e-08, "logits/chosen": -2.8598079681396484, "logits/rejected": -2.8242173194885254, "logps/chosen": -61.14402389526367, "logps/rejected": -50.303977966308594, "loss": 0.6927, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.0002923067077063024, "rewards/margins": 0.0008667553775012493, "rewards/rejected": -0.0011590620269998908, "step": 490 }, { "epoch": 0.08614748449345279, "grad_norm": 1.9498893022537231, "learning_rate": 2.8702640642939148e-08, "logits/chosen": -2.8858821392059326, "logits/rejected": -2.868650436401367, "logps/chosen": -56.36248016357422, "logps/rejected": -51.5005989074707, "loss": 0.693, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.0007818065350875258, "rewards/margins": 0.00028844509506598115, "rewards/rejected": -0.001070251688361168, "step": 500 }, { "epoch": 0.08787043418332184, "grad_norm": 1.9891581535339355, "learning_rate": 2.927669345579793e-08, "logits/chosen": -2.8594257831573486, "logits/rejected": -2.84855318069458, "logps/chosen": -58.753883361816406, "logps/rejected": -51.88996124267578, "loss": 0.693, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.0007896844181232154, "rewards/margins": 0.00022462100605480373, "rewards/rejected": -0.0010143055114895105, "step": 510 }, { "epoch": 0.08959338387319091, "grad_norm": 1.96133291721344, "learning_rate": 2.9850746268656714e-08, "logits/chosen": -2.9103283882141113, "logits/rejected": -2.893756151199341, "logps/chosen": -56.875282287597656, "logps/rejected": -51.322296142578125, "loss": 0.6928, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.0010125295957550406, "rewards/margins": 0.0006925197667442262, "rewards/rejected": -0.001705049304291606, "step": 520 }, { "epoch": 0.09131633356305996, "grad_norm": 1.8310251235961914, "learning_rate": 3.0424799081515494e-08, "logits/chosen": -2.908512592315674, "logits/rejected": -2.877866744995117, "logps/chosen": -56.7138671875, "logps/rejected": -50.077171325683594, "loss": 0.6926, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.0005267611122690141, "rewards/margins": 0.0011749586556106806, "rewards/rejected": -0.0017017197096720338, "step": 530 }, { "epoch": 0.09303928325292901, "grad_norm": 1.9770342111587524, "learning_rate": 3.099885189437428e-08, "logits/chosen": -2.8999149799346924, "logits/rejected": -2.88718318939209, "logps/chosen": -54.016265869140625, "logps/rejected": -52.41388702392578, "loss": 0.6927, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.0007578878430649638, "rewards/margins": 0.0009833236690610647, "rewards/rejected": -0.0017412115121260285, "step": 540 }, { "epoch": 0.09476223294279806, "grad_norm": 2.0536069869995117, "learning_rate": 3.157290470723307e-08, "logits/chosen": -2.9287686347961426, "logits/rejected": -2.9126522541046143, "logps/chosen": -54.85844802856445, "logps/rejected": -51.18532180786133, "loss": 0.6926, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0011350096901878715, "rewards/margins": 0.0010482745710760355, "rewards/rejected": -0.0021832843776792288, "step": 550 }, { "epoch": 0.09648518263266713, "grad_norm": 2.260427236557007, "learning_rate": 3.214695752009185e-08, "logits/chosen": -2.914752721786499, "logits/rejected": -2.904942035675049, "logps/chosen": -53.504119873046875, "logps/rejected": -54.135826110839844, "loss": 0.6927, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0010297555709257722, "rewards/margins": 0.000813187682069838, "rewards/rejected": -0.001842943369410932, "step": 560 }, { "epoch": 0.09820813232253618, "grad_norm": 2.097816228866577, "learning_rate": 3.2721010332950634e-08, "logits/chosen": -2.8879055976867676, "logits/rejected": -2.883946180343628, "logps/chosen": -52.45808792114258, "logps/rejected": -53.73316192626953, "loss": 0.6926, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.000989855034276843, "rewards/margins": 0.0010882084025070071, "rewards/rejected": -0.0020780630875378847, "step": 570 }, { "epoch": 0.09993108201240523, "grad_norm": 1.696934461593628, "learning_rate": 3.3295063145809414e-08, "logits/chosen": -2.9020581245422363, "logits/rejected": -2.8965868949890137, "logps/chosen": -52.064788818359375, "logps/rejected": -51.971031188964844, "loss": 0.6928, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.0011961839627474546, "rewards/margins": 0.0007127647986635566, "rewards/rejected": -0.0019089489942416549, "step": 580 }, { "epoch": 0.1016540317022743, "grad_norm": 2.097994565963745, "learning_rate": 3.38691159586682e-08, "logits/chosen": -2.899217367172241, "logits/rejected": -2.885105848312378, "logps/chosen": -55.10315704345703, "logps/rejected": -54.668983459472656, "loss": 0.6928, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.0012872371589764953, "rewards/margins": 0.0007017455063760281, "rewards/rejected": -0.0019889825489372015, "step": 590 }, { "epoch": 0.10337698139214335, "grad_norm": 2.353062868118286, "learning_rate": 3.444316877152698e-08, "logits/chosen": -2.901080846786499, "logits/rejected": -2.8811850547790527, "logps/chosen": -55.64324951171875, "logps/rejected": -55.54071044921875, "loss": 0.692, "rewards/accuracies": 0.625, "rewards/chosen": -0.00037466268986463547, "rewards/margins": 0.0022983322851359844, "rewards/rejected": -0.00267299497500062, "step": 600 }, { "epoch": 0.1050999310820124, "grad_norm": 2.102449893951416, "learning_rate": 3.501722158438576e-08, "logits/chosen": -2.8592283725738525, "logits/rejected": -2.8586907386779785, "logps/chosen": -54.448280334472656, "logps/rejected": -53.2332649230957, "loss": 0.6928, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.0012435053940862417, "rewards/margins": 0.0006144942017272115, "rewards/rejected": -0.0018579994793981314, "step": 610 }, { "epoch": 0.10682288077188146, "grad_norm": 2.2198610305786133, "learning_rate": 3.559127439724455e-08, "logits/chosen": -2.9471917152404785, "logits/rejected": -2.9253733158111572, "logps/chosen": -56.49320602416992, "logps/rejected": -52.7685432434082, "loss": 0.6924, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.0014289176324382424, "rewards/margins": 0.0015511448727920651, "rewards/rejected": -0.002980062272399664, "step": 620 }, { "epoch": 0.10854583046175052, "grad_norm": 2.3251566886901855, "learning_rate": 3.616532721010333e-08, "logits/chosen": -2.9427034854888916, "logits/rejected": -2.9193663597106934, "logps/chosen": -55.36882400512695, "logps/rejected": -50.096778869628906, "loss": 0.6925, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0016340886941179633, "rewards/margins": 0.0012800354743376374, "rewards/rejected": -0.0029141241684556007, "step": 630 }, { "epoch": 0.11026878015161957, "grad_norm": 2.282984972000122, "learning_rate": 3.6739380022962115e-08, "logits/chosen": -2.9252994060516357, "logits/rejected": -2.922450065612793, "logps/chosen": -53.620811462402344, "logps/rejected": -53.71160888671875, "loss": 0.6927, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0018658351618796587, "rewards/margins": 0.0008461518445983529, "rewards/rejected": -0.002711986657232046, "step": 640 }, { "epoch": 0.11199172984148863, "grad_norm": 2.365267515182495, "learning_rate": 3.7313432835820895e-08, "logits/chosen": -2.9289305210113525, "logits/rejected": -2.9324915409088135, "logps/chosen": -53.2087287902832, "logps/rejected": -54.3664665222168, "loss": 0.693, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.002466582925990224, "rewards/margins": 0.00023730102111585438, "rewards/rejected": -0.0027038834523409605, "step": 650 }, { "epoch": 0.11371467953135768, "grad_norm": 2.1056787967681885, "learning_rate": 3.788748564867968e-08, "logits/chosen": -2.87402081489563, "logits/rejected": -2.874648094177246, "logps/chosen": -55.8626708984375, "logps/rejected": -52.24543380737305, "loss": 0.6926, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.0017661073943600059, "rewards/margins": 0.0010808532824739814, "rewards/rejected": -0.0028469606768339872, "step": 660 }, { "epoch": 0.11543762922122675, "grad_norm": 2.0044331550598145, "learning_rate": 3.846153846153846e-08, "logits/chosen": -2.8894381523132324, "logits/rejected": -2.8855738639831543, "logps/chosen": -54.400245666503906, "logps/rejected": -57.206260681152344, "loss": 0.6927, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.0014983124565333128, "rewards/margins": 0.0009925318881869316, "rewards/rejected": -0.0024908443447202444, "step": 670 }, { "epoch": 0.1171605789110958, "grad_norm": 2.1866514682769775, "learning_rate": 3.903559127439725e-08, "logits/chosen": -2.8572490215301514, "logits/rejected": -2.834527015686035, "logps/chosen": -55.36687088012695, "logps/rejected": -50.750118255615234, "loss": 0.6925, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.0026890942826867104, "rewards/margins": 0.0012137128505855799, "rewards/rejected": -0.0039028071332722902, "step": 680 }, { "epoch": 0.11888352860096485, "grad_norm": 2.242223024368286, "learning_rate": 3.960964408725603e-08, "logits/chosen": -2.947134494781494, "logits/rejected": -2.923008441925049, "logps/chosen": -60.630027770996094, "logps/rejected": -50.56222915649414, "loss": 0.6923, "rewards/accuracies": 0.59375, "rewards/chosen": -0.001378271379508078, "rewards/margins": 0.0016906490782275796, "rewards/rejected": -0.0030689204577356577, "step": 690 }, { "epoch": 0.1206064782908339, "grad_norm": 2.1509509086608887, "learning_rate": 4.018369690011481e-08, "logits/chosen": -2.9139673709869385, "logits/rejected": -2.890505313873291, "logps/chosen": -56.62127685546875, "logps/rejected": -52.38640213012695, "loss": 0.6921, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.0015380937838926911, "rewards/margins": 0.0020837439224123955, "rewards/rejected": -0.003621837589889765, "step": 700 }, { "epoch": 0.12232942798070297, "grad_norm": 2.0617010593414307, "learning_rate": 4.0757749712973595e-08, "logits/chosen": -2.9065475463867188, "logits/rejected": -2.8897900581359863, "logps/chosen": -55.495445251464844, "logps/rejected": -54.319091796875, "loss": 0.6923, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.0023886431008577347, "rewards/margins": 0.0016835425049066544, "rewards/rejected": -0.004072185140103102, "step": 710 }, { "epoch": 0.12405237767057202, "grad_norm": 2.1775104999542236, "learning_rate": 4.1331802525832375e-08, "logits/chosen": -2.8805248737335205, "logits/rejected": -2.8769776821136475, "logps/chosen": -54.568397521972656, "logps/rejected": -53.88288116455078, "loss": 0.6927, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.002390020526945591, "rewards/margins": 0.0008614700054749846, "rewards/rejected": -0.0032514906488358974, "step": 720 }, { "epoch": 0.12577532736044109, "grad_norm": 2.2033438682556152, "learning_rate": 4.190585533869116e-08, "logits/chosen": -2.9489083290100098, "logits/rejected": -2.9298577308654785, "logps/chosen": -57.70631790161133, "logps/rejected": -52.4577751159668, "loss": 0.6919, "rewards/accuracies": 0.625, "rewards/chosen": -0.002133384346961975, "rewards/margins": 0.002538988832384348, "rewards/rejected": -0.004672372248023748, "step": 730 }, { "epoch": 0.12749827705031014, "grad_norm": 2.2610881328582764, "learning_rate": 4.247990815154994e-08, "logits/chosen": -2.8875927925109863, "logits/rejected": -2.867264747619629, "logps/chosen": -55.90110397338867, "logps/rejected": -53.75722122192383, "loss": 0.6921, "rewards/accuracies": 0.5625, "rewards/chosen": -0.002014304045587778, "rewards/margins": 0.002179005416110158, "rewards/rejected": -0.00419330969452858, "step": 740 }, { "epoch": 0.1292212267401792, "grad_norm": 2.137699842453003, "learning_rate": 4.305396096440873e-08, "logits/chosen": -2.997771978378296, "logits/rejected": -2.9786887168884277, "logps/chosen": -57.344635009765625, "logps/rejected": -53.46533966064453, "loss": 0.6913, "rewards/accuracies": 0.65625, "rewards/chosen": -0.0014387632254511118, "rewards/margins": 0.0038202754221856594, "rewards/rejected": -0.0052590384148061275, "step": 750 }, { "epoch": 0.13094417643004824, "grad_norm": 2.469003200531006, "learning_rate": 4.362801377726751e-08, "logits/chosen": -2.907745838165283, "logits/rejected": -2.879517078399658, "logps/chosen": -55.790306091308594, "logps/rejected": -49.0444450378418, "loss": 0.6912, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.00264460826292634, "rewards/margins": 0.003907923586666584, "rewards/rejected": -0.006552531383931637, "step": 760 }, { "epoch": 0.1326671261199173, "grad_norm": 2.072821617126465, "learning_rate": 4.420206659012629e-08, "logits/chosen": -2.930954694747925, "logits/rejected": -2.917003631591797, "logps/chosen": -54.62306594848633, "logps/rejected": -52.37617874145508, "loss": 0.6914, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.0027048583142459393, "rewards/margins": 0.0036215726286172867, "rewards/rejected": -0.0063264318741858006, "step": 770 }, { "epoch": 0.13439007580978635, "grad_norm": 2.3292832374572754, "learning_rate": 4.4776119402985075e-08, "logits/chosen": -2.929471254348755, "logits/rejected": -2.9057729244232178, "logps/chosen": -55.01372528076172, "logps/rejected": -51.13895797729492, "loss": 0.6912, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.0043188584968447685, "rewards/margins": 0.0038614545483142138, "rewards/rejected": -0.008180314674973488, "step": 780 }, { "epoch": 0.1361130254996554, "grad_norm": 2.2163500785827637, "learning_rate": 4.5350172215843855e-08, "logits/chosen": -2.927473545074463, "logits/rejected": -2.9015581607818604, "logps/chosen": -55.29557418823242, "logps/rejected": -53.673065185546875, "loss": 0.6909, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.0033913864754140377, "rewards/margins": 0.0045530120842158794, "rewards/rejected": -0.007944399490952492, "step": 790 }, { "epoch": 0.13783597518952448, "grad_norm": 2.277212381362915, "learning_rate": 4.592422502870264e-08, "logits/chosen": -2.8656978607177734, "logits/rejected": -2.8462979793548584, "logps/chosen": -56.265106201171875, "logps/rejected": -55.311790466308594, "loss": 0.6923, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.005564314313232899, "rewards/margins": 0.001834776485338807, "rewards/rejected": -0.0073990910314023495, "step": 800 }, { "epoch": 0.13783597518952448, "eval_logits/chosen": -2.970120668411255, "eval_logits/rejected": -2.966658115386963, "eval_logps/chosen": -58.77965545654297, "eval_logps/rejected": -62.625755310058594, "eval_loss": 0.6925970315933228, "eval_rewards/accuracies": 0.5492565035820007, "eval_rewards/chosen": 0.0023581732530146837, "eval_rewards/margins": 0.0011197492713108659, "eval_rewards/rejected": 0.0012384242145344615, "eval_runtime": 382.8666, "eval_samples_per_second": 11.242, "eval_steps_per_second": 1.405, "step": 800 }, { "epoch": 0.13955892487939353, "grad_norm": 2.4686055183410645, "learning_rate": 4.649827784156142e-08, "logits/chosen": -2.8969507217407227, "logits/rejected": -2.8752167224884033, "logps/chosen": -57.342254638671875, "logps/rejected": -55.21208572387695, "loss": 0.6917, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.004046624060720205, "rewards/margins": 0.002936845412477851, "rewards/rejected": -0.006983468774706125, "step": 810 }, { "epoch": 0.14128187456926258, "grad_norm": 2.2558839321136475, "learning_rate": 4.707233065442021e-08, "logits/chosen": -2.9297451972961426, "logits/rejected": -2.9120051860809326, "logps/chosen": -53.40732955932617, "logps/rejected": -51.31257247924805, "loss": 0.6918, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.005798877216875553, "rewards/margins": 0.002780771581456065, "rewards/rejected": -0.008579649031162262, "step": 820 }, { "epoch": 0.14300482425913164, "grad_norm": 2.1874077320098877, "learning_rate": 4.764638346727899e-08, "logits/chosen": -2.918710231781006, "logits/rejected": -2.902024984359741, "logps/chosen": -56.356163024902344, "logps/rejected": -54.185760498046875, "loss": 0.6903, "rewards/accuracies": 0.625, "rewards/chosen": -0.004841519054025412, "rewards/margins": 0.005869493819773197, "rewards/rejected": -0.010711013339459896, "step": 830 }, { "epoch": 0.1447277739490007, "grad_norm": 2.145451784133911, "learning_rate": 4.8220436280137775e-08, "logits/chosen": -2.951972484588623, "logits/rejected": -2.930785894393921, "logps/chosen": -55.628273010253906, "logps/rejected": -50.5267333984375, "loss": 0.6905, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.005067192949354649, "rewards/margins": 0.005340776406228542, "rewards/rejected": -0.010407969355583191, "step": 840 }, { "epoch": 0.14645072363886974, "grad_norm": 2.073517322540283, "learning_rate": 4.8794489092996555e-08, "logits/chosen": -2.890193223953247, "logits/rejected": -2.8816397190093994, "logps/chosen": -52.18341827392578, "logps/rejected": -55.33275604248047, "loss": 0.6918, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.006669786758720875, "rewards/margins": 0.002874907338991761, "rewards/rejected": -0.009544694796204567, "step": 850 }, { "epoch": 0.1481736733287388, "grad_norm": 2.4177420139312744, "learning_rate": 4.9368541905855335e-08, "logits/chosen": -2.915323495864868, "logits/rejected": -2.901231050491333, "logps/chosen": -54.65660858154297, "logps/rejected": -52.22821807861328, "loss": 0.6905, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.007540437392890453, "rewards/margins": 0.005416450090706348, "rewards/rejected": -0.012956887483596802, "step": 860 }, { "epoch": 0.14989662301860784, "grad_norm": 1.7828757762908936, "learning_rate": 4.994259471871412e-08, "logits/chosen": -2.940559148788452, "logits/rejected": -2.9429688453674316, "logps/chosen": -52.750709533691406, "logps/rejected": -54.044281005859375, "loss": 0.6911, "rewards/accuracies": 0.53125, "rewards/chosen": -0.008697355166077614, "rewards/margins": 0.0041740224696695805, "rewards/rejected": -0.012871377170085907, "step": 870 }, { "epoch": 0.15161957270847692, "grad_norm": 1.9108960628509521, "learning_rate": 5.0516647531572895e-08, "logits/chosen": -2.8837485313415527, "logits/rejected": -2.863524913787842, "logps/chosen": -54.0827751159668, "logps/rejected": -51.63903045654297, "loss": 0.6893, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.006638042628765106, "rewards/margins": 0.007870601490139961, "rewards/rejected": -0.014508644118905067, "step": 880 }, { "epoch": 0.15334252239834598, "grad_norm": 2.07565975189209, "learning_rate": 5.109070034443168e-08, "logits/chosen": -2.908128499984741, "logits/rejected": -2.8816587924957275, "logps/chosen": -60.31031036376953, "logps/rejected": -54.7435302734375, "loss": 0.6894, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.0049650766886770725, "rewards/margins": 0.007679118309170008, "rewards/rejected": -0.012644194066524506, "step": 890 }, { "epoch": 0.15506547208821503, "grad_norm": 2.0747740268707275, "learning_rate": 5.166475315729046e-08, "logits/chosen": -2.927508592605591, "logits/rejected": -2.921992540359497, "logps/chosen": -55.90435028076172, "logps/rejected": -52.45256805419922, "loss": 0.691, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.008611300960183144, "rewards/margins": 0.004552994389086962, "rewards/rejected": -0.013164294883608818, "step": 900 }, { "epoch": 0.15678842177808408, "grad_norm": 2.0262808799743652, "learning_rate": 5.223880597014925e-08, "logits/chosen": -2.904489040374756, "logits/rejected": -2.902407646179199, "logps/chosen": -53.1060676574707, "logps/rejected": -53.28779220581055, "loss": 0.6914, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.010441782884299755, "rewards/margins": 0.0036925789900124073, "rewards/rejected": -0.01413436233997345, "step": 910 }, { "epoch": 0.15851137146795313, "grad_norm": 2.245786428451538, "learning_rate": 5.281285878300803e-08, "logits/chosen": -2.928288459777832, "logits/rejected": -2.896723985671997, "logps/chosen": -56.58086013793945, "logps/rejected": -50.05290603637695, "loss": 0.6894, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.010822747834026814, "rewards/margins": 0.007769578602164984, "rewards/rejected": -0.01859232783317566, "step": 920 }, { "epoch": 0.16023432115782218, "grad_norm": 2.340181827545166, "learning_rate": 5.3386911595866815e-08, "logits/chosen": -2.9540724754333496, "logits/rejected": -2.9424567222595215, "logps/chosen": -54.608428955078125, "logps/rejected": -54.735633850097656, "loss": 0.6899, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.01000906340777874, "rewards/margins": 0.006677950266748667, "rewards/rejected": -0.01668701320886612, "step": 930 }, { "epoch": 0.16195727084769124, "grad_norm": 2.0220203399658203, "learning_rate": 5.3960964408725595e-08, "logits/chosen": -2.9646830558776855, "logits/rejected": -2.937711238861084, "logps/chosen": -62.19846725463867, "logps/rejected": -56.09409713745117, "loss": 0.6898, "rewards/accuracies": 0.53125, "rewards/chosen": -0.008925000205636024, "rewards/margins": 0.0069125862792134285, "rewards/rejected": -0.01583758369088173, "step": 940 }, { "epoch": 0.16368022053756032, "grad_norm": 2.1144919395446777, "learning_rate": 5.4535017221584375e-08, "logits/chosen": -2.8292360305786133, "logits/rejected": -2.8140792846679688, "logps/chosen": -57.392913818359375, "logps/rejected": -56.34737014770508, "loss": 0.6908, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.014584523625671864, "rewards/margins": 0.005001784302294254, "rewards/rejected": -0.019586309790611267, "step": 950 }, { "epoch": 0.16540317022742937, "grad_norm": 2.0776162147521973, "learning_rate": 5.510907003444316e-08, "logits/chosen": -2.8197903633117676, "logits/rejected": -2.8229846954345703, "logps/chosen": -53.091064453125, "logps/rejected": -56.24619674682617, "loss": 0.6928, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.015429055318236351, "rewards/margins": 0.0008552552899345756, "rewards/rejected": -0.016284313052892685, "step": 960 }, { "epoch": 0.16712611991729842, "grad_norm": 2.197368621826172, "learning_rate": 5.568312284730194e-08, "logits/chosen": -2.9082279205322266, "logits/rejected": -2.881479501724243, "logps/chosen": -62.6414794921875, "logps/rejected": -52.523406982421875, "loss": 0.6912, "rewards/accuracies": 0.53125, "rewards/chosen": -0.013698337599635124, "rewards/margins": 0.0042149461805820465, "rewards/rejected": -0.01791328564286232, "step": 970 }, { "epoch": 0.16884906960716747, "grad_norm": 2.396287441253662, "learning_rate": 5.625717566016073e-08, "logits/chosen": -2.9761950969696045, "logits/rejected": -2.95814847946167, "logps/chosen": -58.28235626220703, "logps/rejected": -55.292930603027344, "loss": 0.689, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.008706005290150642, "rewards/margins": 0.0086164940148592, "rewards/rejected": -0.01732250303030014, "step": 980 }, { "epoch": 0.17057201929703653, "grad_norm": 2.2035701274871826, "learning_rate": 5.683122847301951e-08, "logits/chosen": -2.897202730178833, "logits/rejected": -2.878493309020996, "logps/chosen": -56.3300666809082, "logps/rejected": -53.285552978515625, "loss": 0.691, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.015088607557117939, "rewards/margins": 0.004525421187281609, "rewards/rejected": -0.019614029675722122, "step": 990 }, { "epoch": 0.17229496898690558, "grad_norm": 1.9301676750183105, "learning_rate": 5.7405281285878295e-08, "logits/chosen": -2.8635621070861816, "logits/rejected": -2.841841220855713, "logps/chosen": -59.418907165527344, "logps/rejected": -52.82847213745117, "loss": 0.6902, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.012757417745888233, "rewards/margins": 0.006062129978090525, "rewards/rejected": -0.018819546326994896, "step": 1000 }, { "epoch": 0.17401791867677463, "grad_norm": 2.214897632598877, "learning_rate": 5.7979334098737075e-08, "logits/chosen": -2.823845863342285, "logits/rejected": -2.8300139904022217, "logps/chosen": -56.06037139892578, "logps/rejected": -58.37239456176758, "loss": 0.6918, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.017535416409373283, "rewards/margins": 0.0029864010866731405, "rewards/rejected": -0.020521817728877068, "step": 1010 }, { "epoch": 0.17574086836664368, "grad_norm": 2.2331719398498535, "learning_rate": 5.855338691159586e-08, "logits/chosen": -2.9262149333953857, "logits/rejected": -2.902949333190918, "logps/chosen": -59.038734436035156, "logps/rejected": -55.38860321044922, "loss": 0.6868, "rewards/accuracies": 0.65625, "rewards/chosen": -0.008785584941506386, "rewards/margins": 0.013060608878731728, "rewards/rejected": -0.021846191957592964, "step": 1020 }, { "epoch": 0.17746381805651276, "grad_norm": 2.039205312728882, "learning_rate": 5.912743972445464e-08, "logits/chosen": -2.9541501998901367, "logits/rejected": -2.934560775756836, "logps/chosen": -56.76179122924805, "logps/rejected": -53.00408935546875, "loss": 0.6899, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.013884862884879112, "rewards/margins": 0.006750473286956549, "rewards/rejected": -0.02063533291220665, "step": 1030 }, { "epoch": 0.17918676774638181, "grad_norm": 2.338857889175415, "learning_rate": 5.970149253731343e-08, "logits/chosen": -2.9137110710144043, "logits/rejected": -2.9044034481048584, "logps/chosen": -55.28099822998047, "logps/rejected": -57.13520050048828, "loss": 0.6923, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.019616015255451202, "rewards/margins": 0.00203714263625443, "rewards/rejected": -0.021653154864907265, "step": 1040 }, { "epoch": 0.18090971743625087, "grad_norm": 2.2119388580322266, "learning_rate": 6.02755453501722e-08, "logits/chosen": -2.856088161468506, "logits/rejected": -2.824749708175659, "logps/chosen": -59.729095458984375, "logps/rejected": -52.37529754638672, "loss": 0.6887, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.01591629534959793, "rewards/margins": 0.009324030950665474, "rewards/rejected": -0.025240326300263405, "step": 1050 }, { "epoch": 0.18263266712611992, "grad_norm": 2.2295567989349365, "learning_rate": 6.084959816303099e-08, "logits/chosen": -2.873307704925537, "logits/rejected": -2.855834484100342, "logps/chosen": -59.4355354309082, "logps/rejected": -56.32249069213867, "loss": 0.6916, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.017835207283496857, "rewards/margins": 0.0034090355038642883, "rewards/rejected": -0.021244242787361145, "step": 1060 }, { "epoch": 0.18435561681598897, "grad_norm": 2.0987818241119385, "learning_rate": 6.142365097588978e-08, "logits/chosen": -2.9532623291015625, "logits/rejected": -2.924928903579712, "logps/chosen": -59.63908767700195, "logps/rejected": -54.68932342529297, "loss": 0.6887, "rewards/accuracies": 0.59375, "rewards/chosen": -0.01841794326901436, "rewards/margins": 0.009393568150699139, "rewards/rejected": -0.027811508625745773, "step": 1070 }, { "epoch": 0.18607856650585802, "grad_norm": 2.119971513748169, "learning_rate": 6.199770378874856e-08, "logits/chosen": -2.9519741535186768, "logits/rejected": -2.933764934539795, "logps/chosen": -59.26136016845703, "logps/rejected": -53.21483612060547, "loss": 0.6895, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.021389838308095932, "rewards/margins": 0.007719496730715036, "rewards/rejected": -0.02910933457314968, "step": 1080 }, { "epoch": 0.18780151619572708, "grad_norm": 2.250812292098999, "learning_rate": 6.257175660160735e-08, "logits/chosen": -2.864903211593628, "logits/rejected": -2.863048553466797, "logps/chosen": -55.70839309692383, "logps/rejected": -54.59096145629883, "loss": 0.6915, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.023310277611017227, "rewards/margins": 0.003674765583127737, "rewards/rejected": -0.026985038071870804, "step": 1090 }, { "epoch": 0.18952446588559613, "grad_norm": 1.9611529111862183, "learning_rate": 6.314580941446614e-08, "logits/chosen": -2.88694429397583, "logits/rejected": -2.8873910903930664, "logps/chosen": -54.11775588989258, "logps/rejected": -57.08624267578125, "loss": 0.6886, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.018750367686152458, "rewards/margins": 0.009554053656756878, "rewards/rejected": -0.02830442227423191, "step": 1100 }, { "epoch": 0.1912474155754652, "grad_norm": 2.4082181453704834, "learning_rate": 6.371986222732492e-08, "logits/chosen": -2.9054970741271973, "logits/rejected": -2.9155497550964355, "logps/chosen": -56.35832595825195, "logps/rejected": -57.75475311279297, "loss": 0.6914, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.025233492255210876, "rewards/margins": 0.004043240565806627, "rewards/rejected": -0.029276732355356216, "step": 1110 }, { "epoch": 0.19297036526533426, "grad_norm": 2.127530574798584, "learning_rate": 6.42939150401837e-08, "logits/chosen": -2.9269912242889404, "logits/rejected": -2.910547971725464, "logps/chosen": -59.35639190673828, "logps/rejected": -55.61716842651367, "loss": 0.6893, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.023090193048119545, "rewards/margins": 0.008246874436736107, "rewards/rejected": -0.03133706375956535, "step": 1120 }, { "epoch": 0.1946933149552033, "grad_norm": 2.0124289989471436, "learning_rate": 6.486796785304248e-08, "logits/chosen": -2.945962429046631, "logits/rejected": -2.9308059215545654, "logps/chosen": -54.93196487426758, "logps/rejected": -56.35882568359375, "loss": 0.6874, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.024388344958424568, "rewards/margins": 0.012135234661400318, "rewards/rejected": -0.03652357682585716, "step": 1130 }, { "epoch": 0.19641626464507236, "grad_norm": 2.5086684226989746, "learning_rate": 6.544202066590127e-08, "logits/chosen": -2.9003541469573975, "logits/rejected": -2.878969669342041, "logps/chosen": -59.991233825683594, "logps/rejected": -55.50764083862305, "loss": 0.688, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.02201252616941929, "rewards/margins": 0.010831797495484352, "rewards/rejected": -0.03284432366490364, "step": 1140 }, { "epoch": 0.19813921433494142, "grad_norm": 2.2725441455841064, "learning_rate": 6.601607347876004e-08, "logits/chosen": -2.8616178035736084, "logits/rejected": -2.8515572547912598, "logps/chosen": -57.003143310546875, "logps/rejected": -56.3914680480957, "loss": 0.6879, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.023792926222085953, "rewards/margins": 0.01105764415115118, "rewards/rejected": -0.034850575029850006, "step": 1150 }, { "epoch": 0.19986216402481047, "grad_norm": 1.9798253774642944, "learning_rate": 6.659012629161883e-08, "logits/chosen": -2.927191734313965, "logits/rejected": -2.920241117477417, "logps/chosen": -57.67218780517578, "logps/rejected": -56.62792205810547, "loss": 0.6874, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.019430024549365044, "rewards/margins": 0.011981760151684284, "rewards/rejected": -0.0314117856323719, "step": 1160 }, { "epoch": 0.20158511371467952, "grad_norm": 1.8830546140670776, "learning_rate": 6.716417910447762e-08, "logits/chosen": -2.8572309017181396, "logits/rejected": -2.848026752471924, "logps/chosen": -55.777008056640625, "logps/rejected": -56.994873046875, "loss": 0.6899, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.028517460450530052, "rewards/margins": 0.0071545811370015144, "rewards/rejected": -0.03567204624414444, "step": 1170 }, { "epoch": 0.2033080634045486, "grad_norm": 2.2484617233276367, "learning_rate": 6.77382319173364e-08, "logits/chosen": -2.835697889328003, "logits/rejected": -2.8139901161193848, "logps/chosen": -56.77092361450195, "logps/rejected": -54.284942626953125, "loss": 0.6851, "rewards/accuracies": 0.625, "rewards/chosen": -0.03087441995739937, "rewards/margins": 0.01692943647503853, "rewards/rejected": -0.0478038564324379, "step": 1180 }, { "epoch": 0.20503101309441765, "grad_norm": 2.282123327255249, "learning_rate": 6.831228473019518e-08, "logits/chosen": -2.944603443145752, "logits/rejected": -2.9229369163513184, "logps/chosen": -61.991363525390625, "logps/rejected": -53.7884521484375, "loss": 0.6854, "rewards/accuracies": 0.5625, "rewards/chosen": -0.029348094016313553, "rewards/margins": 0.01629854366183281, "rewards/rejected": -0.04564663767814636, "step": 1190 }, { "epoch": 0.2067539627842867, "grad_norm": 1.9270927906036377, "learning_rate": 6.888633754305396e-08, "logits/chosen": -2.847102403640747, "logits/rejected": -2.8369410037994385, "logps/chosen": -57.47083282470703, "logps/rejected": -54.379608154296875, "loss": 0.6901, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.03375508263707161, "rewards/margins": 0.006854252424091101, "rewards/rejected": -0.04060933366417885, "step": 1200 }, { "epoch": 0.2067539627842867, "eval_logits/chosen": -2.961275577545166, "eval_logits/rejected": -2.9578850269317627, "eval_logps/chosen": -59.81459426879883, "eval_logps/rejected": -64.08273315429688, "eval_loss": 0.6906687617301941, "eval_rewards/accuracies": 0.5697026252746582, "eval_rewards/chosen": -0.007991200312972069, "eval_rewards/margins": 0.005340192466974258, "eval_rewards/rejected": -0.013331393711268902, "eval_runtime": 382.9771, "eval_samples_per_second": 11.238, "eval_steps_per_second": 1.405, "step": 1200 }, { "epoch": 0.20847691247415576, "grad_norm": 2.3979392051696777, "learning_rate": 6.946039035591275e-08, "logits/chosen": -2.911381959915161, "logits/rejected": -2.8858389854431152, "logps/chosen": -58.13728713989258, "logps/rejected": -57.347434997558594, "loss": 0.6857, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.03157086297869682, "rewards/margins": 0.015606355853378773, "rewards/rejected": -0.04717721790075302, "step": 1210 }, { "epoch": 0.2101998621640248, "grad_norm": 2.074423313140869, "learning_rate": 7.003444316877152e-08, "logits/chosen": -2.9131548404693604, "logits/rejected": -2.8910086154937744, "logps/chosen": -57.07306671142578, "logps/rejected": -56.53594970703125, "loss": 0.687, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.033832598477602005, "rewards/margins": 0.013145034201443195, "rewards/rejected": -0.046977631747722626, "step": 1220 }, { "epoch": 0.21192281185389386, "grad_norm": 2.4180092811584473, "learning_rate": 7.060849598163031e-08, "logits/chosen": -2.979348659515381, "logits/rejected": -2.9486043453216553, "logps/chosen": -60.274391174316406, "logps/rejected": -56.62663650512695, "loss": 0.6835, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.03272354230284691, "rewards/margins": 0.02001432329416275, "rewards/rejected": -0.05273786187171936, "step": 1230 }, { "epoch": 0.2136457615437629, "grad_norm": 2.1716294288635254, "learning_rate": 7.11825487944891e-08, "logits/chosen": -2.8780722618103027, "logits/rejected": -2.868040084838867, "logps/chosen": -57.13683319091797, "logps/rejected": -57.61468505859375, "loss": 0.6873, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.041347816586494446, "rewards/margins": 0.012507520616054535, "rewards/rejected": -0.05385533720254898, "step": 1240 }, { "epoch": 0.21536871123363197, "grad_norm": 2.6842474937438965, "learning_rate": 7.175660160734788e-08, "logits/chosen": -2.9559080600738525, "logits/rejected": -2.925466775894165, "logps/chosen": -59.06212615966797, "logps/rejected": -56.11531448364258, "loss": 0.6855, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.038944486528635025, "rewards/margins": 0.01640475168824196, "rewards/rejected": -0.05534924194216728, "step": 1250 }, { "epoch": 0.21709166092350105, "grad_norm": 2.1675713062286377, "learning_rate": 7.233065442020666e-08, "logits/chosen": -2.844125747680664, "logits/rejected": -2.8245441913604736, "logps/chosen": -58.59064483642578, "logps/rejected": -55.4991569519043, "loss": 0.685, "rewards/accuracies": 0.59375, "rewards/chosen": -0.041406117379665375, "rewards/margins": 0.017199214547872543, "rewards/rejected": -0.05860533565282822, "step": 1260 }, { "epoch": 0.2188146106133701, "grad_norm": 2.3532440662384033, "learning_rate": 7.290470723306544e-08, "logits/chosen": -2.880176067352295, "logits/rejected": -2.8723835945129395, "logps/chosen": -57.689796447753906, "logps/rejected": -59.45355224609375, "loss": 0.6895, "rewards/accuracies": 0.59375, "rewards/chosen": -0.04507274553179741, "rewards/margins": 0.008155112154781818, "rewards/rejected": -0.05322786048054695, "step": 1270 }, { "epoch": 0.22053756030323915, "grad_norm": 2.100616693496704, "learning_rate": 7.347876004592423e-08, "logits/chosen": -2.905412197113037, "logits/rejected": -2.913015365600586, "logps/chosen": -56.19207763671875, "logps/rejected": -64.13707733154297, "loss": 0.6896, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.04303373023867607, "rewards/margins": 0.008113773539662361, "rewards/rejected": -0.05114750191569328, "step": 1280 }, { "epoch": 0.2222605099931082, "grad_norm": 2.2928290367126465, "learning_rate": 7.405281285878302e-08, "logits/chosen": -2.8585543632507324, "logits/rejected": -2.8368325233459473, "logps/chosen": -59.599952697753906, "logps/rejected": -55.80320358276367, "loss": 0.6844, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.03863871842622757, "rewards/margins": 0.018526822328567505, "rewards/rejected": -0.057165540754795074, "step": 1290 }, { "epoch": 0.22398345968297725, "grad_norm": 2.045578718185425, "learning_rate": 7.462686567164179e-08, "logits/chosen": -2.904303550720215, "logits/rejected": -2.8966355323791504, "logps/chosen": -55.30950927734375, "logps/rejected": -59.39851760864258, "loss": 0.6894, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.0525943860411644, "rewards/margins": 0.008315442129969597, "rewards/rejected": -0.060909826308488846, "step": 1300 }, { "epoch": 0.2257064093728463, "grad_norm": 2.3341314792633057, "learning_rate": 7.520091848450058e-08, "logits/chosen": -2.90086030960083, "logits/rejected": -2.877695083618164, "logps/chosen": -58.40484619140625, "logps/rejected": -55.46175003051758, "loss": 0.6821, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.044036369770765305, "rewards/margins": 0.023425104096531868, "rewards/rejected": -0.06746147572994232, "step": 1310 }, { "epoch": 0.22742935906271536, "grad_norm": 2.3799657821655273, "learning_rate": 7.577497129735936e-08, "logits/chosen": -2.9232773780822754, "logits/rejected": -2.9070727825164795, "logps/chosen": -60.878334045410156, "logps/rejected": -56.428741455078125, "loss": 0.6847, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.04339348524808884, "rewards/margins": 0.017966218292713165, "rewards/rejected": -0.061359703540802, "step": 1320 }, { "epoch": 0.22915230875258444, "grad_norm": 2.3074443340301514, "learning_rate": 7.634902411021814e-08, "logits/chosen": -2.9480531215667725, "logits/rejected": -2.924940586090088, "logps/chosen": -60.75572967529297, "logps/rejected": -56.17448806762695, "loss": 0.6844, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.04371759295463562, "rewards/margins": 0.01845361292362213, "rewards/rejected": -0.06217120215296745, "step": 1330 }, { "epoch": 0.2308752584424535, "grad_norm": 2.1518850326538086, "learning_rate": 7.692307692307692e-08, "logits/chosen": -2.841428279876709, "logits/rejected": -2.8257040977478027, "logps/chosen": -60.455482482910156, "logps/rejected": -57.828460693359375, "loss": 0.6864, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.04494331032037735, "rewards/margins": 0.014719230122864246, "rewards/rejected": -0.05966253951191902, "step": 1340 }, { "epoch": 0.23259820813232254, "grad_norm": 2.1697258949279785, "learning_rate": 7.749712973593571e-08, "logits/chosen": -2.8844552040100098, "logits/rejected": -2.867018461227417, "logps/chosen": -60.273338317871094, "logps/rejected": -59.45403289794922, "loss": 0.6899, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.05731966346502304, "rewards/margins": 0.007771348115056753, "rewards/rejected": -0.06509101390838623, "step": 1350 }, { "epoch": 0.2343211578221916, "grad_norm": 2.2805898189544678, "learning_rate": 7.80711825487945e-08, "logits/chosen": -2.8427302837371826, "logits/rejected": -2.831007957458496, "logps/chosen": -61.870033264160156, "logps/rejected": -61.115379333496094, "loss": 0.6899, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.05456281825900078, "rewards/margins": 0.00770781422033906, "rewards/rejected": -0.0622706413269043, "step": 1360 }, { "epoch": 0.23604410751206065, "grad_norm": 2.7378504276275635, "learning_rate": 7.864523536165327e-08, "logits/chosen": -2.927980899810791, "logits/rejected": -2.9119479656219482, "logps/chosen": -59.125083923339844, "logps/rejected": -59.64502716064453, "loss": 0.6837, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.05251600593328476, "rewards/margins": 0.020127257332205772, "rewards/rejected": -0.07264326512813568, "step": 1370 }, { "epoch": 0.2377670572019297, "grad_norm": 2.2284791469573975, "learning_rate": 7.921928817451206e-08, "logits/chosen": -2.870211362838745, "logits/rejected": -2.861656904220581, "logps/chosen": -61.35654830932617, "logps/rejected": -57.8406982421875, "loss": 0.687, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.05080818384885788, "rewards/margins": 0.01347988098859787, "rewards/rejected": -0.06428805738687515, "step": 1380 }, { "epoch": 0.23949000689179875, "grad_norm": 2.3655896186828613, "learning_rate": 7.979334098737084e-08, "logits/chosen": -2.9632561206817627, "logits/rejected": -2.9431121349334717, "logps/chosen": -62.521148681640625, "logps/rejected": -58.51555252075195, "loss": 0.686, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.04506484419107437, "rewards/margins": 0.01579303853213787, "rewards/rejected": -0.06085788086056709, "step": 1390 }, { "epoch": 0.2412129565816678, "grad_norm": 2.1747207641601562, "learning_rate": 8.036739380022962e-08, "logits/chosen": -2.8493809700012207, "logits/rejected": -2.831693410873413, "logps/chosen": -61.77557373046875, "logps/rejected": -58.64817428588867, "loss": 0.6844, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.04902204871177673, "rewards/margins": 0.018787894397974014, "rewards/rejected": -0.06780994683504105, "step": 1400 }, { "epoch": 0.24293590627153688, "grad_norm": 2.0125632286071777, "learning_rate": 8.09414466130884e-08, "logits/chosen": -2.88521671295166, "logits/rejected": -2.8787474632263184, "logps/chosen": -59.67738723754883, "logps/rejected": -59.01367950439453, "loss": 0.6878, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.052916985005140305, "rewards/margins": 0.01177090872079134, "rewards/rejected": -0.06468789279460907, "step": 1410 }, { "epoch": 0.24465885596140594, "grad_norm": 2.345404624938965, "learning_rate": 8.151549942594719e-08, "logits/chosen": -2.836278200149536, "logits/rejected": -2.840115547180176, "logps/chosen": -57.09819412231445, "logps/rejected": -60.34763717651367, "loss": 0.6927, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.05967681482434273, "rewards/margins": 0.002343767322599888, "rewards/rejected": -0.06202058121562004, "step": 1420 }, { "epoch": 0.246381805651275, "grad_norm": 2.3286633491516113, "learning_rate": 8.208955223880598e-08, "logits/chosen": -2.910238742828369, "logits/rejected": -2.9011566638946533, "logps/chosen": -60.003883361816406, "logps/rejected": -61.60279083251953, "loss": 0.687, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.05464918166399002, "rewards/margins": 0.013536456041038036, "rewards/rejected": -0.06818564236164093, "step": 1430 }, { "epoch": 0.24810475534114404, "grad_norm": 2.111067295074463, "learning_rate": 8.266360505166475e-08, "logits/chosen": -2.8533780574798584, "logits/rejected": -2.834259510040283, "logps/chosen": -57.13257598876953, "logps/rejected": -54.819725036621094, "loss": 0.6852, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.05546184256672859, "rewards/margins": 0.01728229410946369, "rewards/rejected": -0.07274413853883743, "step": 1440 }, { "epoch": 0.2498277050310131, "grad_norm": 2.4927828311920166, "learning_rate": 8.323765786452354e-08, "logits/chosen": -2.8578834533691406, "logits/rejected": -2.828831911087036, "logps/chosen": -60.613731384277344, "logps/rejected": -59.09722900390625, "loss": 0.6838, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.049021851271390915, "rewards/margins": 0.02033841609954834, "rewards/rejected": -0.06936026364564896, "step": 1450 }, { "epoch": 0.25155065472088217, "grad_norm": 2.4668631553649902, "learning_rate": 8.381171067738232e-08, "logits/chosen": -2.876624584197998, "logits/rejected": -2.8503682613372803, "logps/chosen": -59.0427360534668, "logps/rejected": -57.6472053527832, "loss": 0.6832, "rewards/accuracies": 0.5625, "rewards/chosen": -0.05816395953297615, "rewards/margins": 0.021529266610741615, "rewards/rejected": -0.07969322055578232, "step": 1460 }, { "epoch": 0.2532736044107512, "grad_norm": 2.469766616821289, "learning_rate": 8.43857634902411e-08, "logits/chosen": -2.9928104877471924, "logits/rejected": -2.9630894660949707, "logps/chosen": -63.2796516418457, "logps/rejected": -61.028541564941406, "loss": 0.6829, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.04897930100560188, "rewards/margins": 0.02178030088543892, "rewards/rejected": -0.0707595944404602, "step": 1470 }, { "epoch": 0.2549965541006203, "grad_norm": 2.3830063343048096, "learning_rate": 8.495981630309988e-08, "logits/chosen": -2.944361448287964, "logits/rejected": -2.914533853530884, "logps/chosen": -57.114402770996094, "logps/rejected": -58.12091064453125, "loss": 0.6848, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.049399979412555695, "rewards/margins": 0.017925377935171127, "rewards/rejected": -0.06732536852359772, "step": 1480 }, { "epoch": 0.2567195037904893, "grad_norm": 2.330536365509033, "learning_rate": 8.553386911595867e-08, "logits/chosen": -2.927044630050659, "logits/rejected": -2.90609073638916, "logps/chosen": -63.553245544433594, "logps/rejected": -59.14959716796875, "loss": 0.6855, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.05048053339123726, "rewards/margins": 0.016581468284130096, "rewards/rejected": -0.06706200540065765, "step": 1490 }, { "epoch": 0.2584424534803584, "grad_norm": 2.8656115531921387, "learning_rate": 8.610792192881746e-08, "logits/chosen": -2.8543686866760254, "logits/rejected": -2.853294610977173, "logps/chosen": -57.01494216918945, "logps/rejected": -59.04634475708008, "loss": 0.6882, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.05923793837428093, "rewards/margins": 0.011348679661750793, "rewards/rejected": -0.07058661431074142, "step": 1500 }, { "epoch": 0.2601654031702274, "grad_norm": 2.5738108158111572, "learning_rate": 8.668197474167623e-08, "logits/chosen": -2.8481411933898926, "logits/rejected": -2.822608470916748, "logps/chosen": -63.966583251953125, "logps/rejected": -60.66066360473633, "loss": 0.6796, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.04969775304198265, "rewards/margins": 0.02851223386824131, "rewards/rejected": -0.07820998132228851, "step": 1510 }, { "epoch": 0.2618883528600965, "grad_norm": 2.3993465900421143, "learning_rate": 8.725602755453502e-08, "logits/chosen": -2.9035634994506836, "logits/rejected": -2.901787519454956, "logps/chosen": -59.19148635864258, "logps/rejected": -57.19902420043945, "loss": 0.6905, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.05953991413116455, "rewards/margins": 0.006711998488754034, "rewards/rejected": -0.06625191122293472, "step": 1520 }, { "epoch": 0.26361130254996556, "grad_norm": 2.638784408569336, "learning_rate": 8.78300803673938e-08, "logits/chosen": -2.8348419666290283, "logits/rejected": -2.822012424468994, "logps/chosen": -58.7889518737793, "logps/rejected": -56.48737716674805, "loss": 0.6879, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.059633769094944, "rewards/margins": 0.012106634676456451, "rewards/rejected": -0.07174040377140045, "step": 1530 }, { "epoch": 0.2653342522398346, "grad_norm": 2.5973594188690186, "learning_rate": 8.840413318025258e-08, "logits/chosen": -2.913525342941284, "logits/rejected": -2.8881072998046875, "logps/chosen": -62.00640106201172, "logps/rejected": -58.09117889404297, "loss": 0.6821, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.05526846647262573, "rewards/margins": 0.023841742426156998, "rewards/rejected": -0.07911021262407303, "step": 1540 }, { "epoch": 0.26705720192970367, "grad_norm": 2.4358787536621094, "learning_rate": 8.897818599311136e-08, "logits/chosen": -2.847963333129883, "logits/rejected": -2.8368639945983887, "logps/chosen": -60.75475311279297, "logps/rejected": -57.487464904785156, "loss": 0.6832, "rewards/accuracies": 0.625, "rewards/chosen": -0.059478919953107834, "rewards/margins": 0.02126147784292698, "rewards/rejected": -0.08074040710926056, "step": 1550 }, { "epoch": 0.2687801516195727, "grad_norm": 2.2986135482788086, "learning_rate": 8.955223880597015e-08, "logits/chosen": -2.8618578910827637, "logits/rejected": -2.8462741374969482, "logps/chosen": -57.96033477783203, "logps/rejected": -57.77971267700195, "loss": 0.6864, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.06349469721317291, "rewards/margins": 0.01494310237467289, "rewards/rejected": -0.07843779027462006, "step": 1560 }, { "epoch": 0.2705031013094418, "grad_norm": 2.231766939163208, "learning_rate": 9.012629161882894e-08, "logits/chosen": -2.901459217071533, "logits/rejected": -2.906397581100464, "logps/chosen": -59.78357696533203, "logps/rejected": -63.125389099121094, "loss": 0.6896, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.06565506756305695, "rewards/margins": 0.008577165193855762, "rewards/rejected": -0.07423223555088043, "step": 1570 }, { "epoch": 0.2722260509993108, "grad_norm": 2.6459128856658936, "learning_rate": 9.070034443168771e-08, "logits/chosen": -2.8522191047668457, "logits/rejected": -2.8468332290649414, "logps/chosen": -59.195945739746094, "logps/rejected": -61.25299072265625, "loss": 0.6854, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.06301302462816238, "rewards/margins": 0.017189020290970802, "rewards/rejected": -0.08020204305648804, "step": 1580 }, { "epoch": 0.2739490006891799, "grad_norm": 2.9546637535095215, "learning_rate": 9.12743972445465e-08, "logits/chosen": -2.8846755027770996, "logits/rejected": -2.8750617504119873, "logps/chosen": -60.621368408203125, "logps/rejected": -62.49888229370117, "loss": 0.684, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.06025485321879387, "rewards/margins": 0.01954682543873787, "rewards/rejected": -0.07980167865753174, "step": 1590 }, { "epoch": 0.27567195037904896, "grad_norm": 2.6051409244537354, "learning_rate": 9.184845005740528e-08, "logits/chosen": -2.8711392879486084, "logits/rejected": -2.8471274375915527, "logps/chosen": -59.82111740112305, "logps/rejected": -59.49725341796875, "loss": 0.6835, "rewards/accuracies": 0.625, "rewards/chosen": -0.06477855890989304, "rewards/margins": 0.02124771662056446, "rewards/rejected": -0.08602626621723175, "step": 1600 }, { "epoch": 0.27567195037904896, "eval_logits/chosen": -2.9442405700683594, "eval_logits/rejected": -2.9410126209259033, "eval_logps/chosen": -62.226593017578125, "eval_logps/rejected": -67.10498046875, "eval_loss": 0.6880334615707397, "eval_rewards/accuracies": 0.5764405131340027, "eval_rewards/chosen": -0.032111138105392456, "eval_rewards/margins": 0.011442671529948711, "eval_rewards/rejected": -0.04355381056666374, "eval_runtime": 382.9622, "eval_samples_per_second": 11.239, "eval_steps_per_second": 1.405, "step": 1600 }, { "epoch": 0.277394900068918, "grad_norm": 2.2450082302093506, "learning_rate": 9.242250287026406e-08, "logits/chosen": -2.891284942626953, "logits/rejected": -2.8724706172943115, "logps/chosen": -63.180908203125, "logps/rejected": -65.12251281738281, "loss": 0.6863, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.07282154262065887, "rewards/margins": 0.015445959754288197, "rewards/rejected": -0.0882674977183342, "step": 1610 }, { "epoch": 0.27911784975878706, "grad_norm": 2.4446041584014893, "learning_rate": 9.299655568312284e-08, "logits/chosen": -2.830038547515869, "logits/rejected": -2.8137710094451904, "logps/chosen": -60.38079833984375, "logps/rejected": -62.97710418701172, "loss": 0.685, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.07806779444217682, "rewards/margins": 0.018283111974596977, "rewards/rejected": -0.09635090827941895, "step": 1620 }, { "epoch": 0.2808407994486561, "grad_norm": 2.6224751472473145, "learning_rate": 9.357060849598163e-08, "logits/chosen": -2.875746250152588, "logits/rejected": -2.8608651161193848, "logps/chosen": -63.53081512451172, "logps/rejected": -59.80907440185547, "loss": 0.6826, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.06455497443675995, "rewards/margins": 0.023133311420679092, "rewards/rejected": -0.08768828958272934, "step": 1630 }, { "epoch": 0.28256374913852517, "grad_norm": 2.5814969539642334, "learning_rate": 9.414466130884042e-08, "logits/chosen": -2.9426326751708984, "logits/rejected": -2.9164793491363525, "logps/chosen": -64.81478881835938, "logps/rejected": -59.52144241333008, "loss": 0.6826, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.06653226912021637, "rewards/margins": 0.023489978164434433, "rewards/rejected": -0.09002223610877991, "step": 1640 }, { "epoch": 0.2842866988283942, "grad_norm": 2.584890604019165, "learning_rate": 9.471871412169919e-08, "logits/chosen": -2.912461519241333, "logits/rejected": -2.8958592414855957, "logps/chosen": -64.74848937988281, "logps/rejected": -63.9894905090332, "loss": 0.6821, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.06611146032810211, "rewards/margins": 0.02439257875084877, "rewards/rejected": -0.09050404280424118, "step": 1650 }, { "epoch": 0.28600964851826327, "grad_norm": 2.513695240020752, "learning_rate": 9.529276693455798e-08, "logits/chosen": -2.90999436378479, "logits/rejected": -2.881359577178955, "logps/chosen": -63.374961853027344, "logps/rejected": -58.766632080078125, "loss": 0.6795, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.07055828720331192, "rewards/margins": 0.029589012265205383, "rewards/rejected": -0.1001473069190979, "step": 1660 }, { "epoch": 0.2877325982081323, "grad_norm": 2.6238155364990234, "learning_rate": 9.586681974741676e-08, "logits/chosen": -2.9074604511260986, "logits/rejected": -2.892113447189331, "logps/chosen": -63.04148483276367, "logps/rejected": -63.07508087158203, "loss": 0.6839, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0675562173128128, "rewards/margins": 0.02040962502360344, "rewards/rejected": -0.08796583116054535, "step": 1670 }, { "epoch": 0.2894555478980014, "grad_norm": 2.5961592197418213, "learning_rate": 9.644087256027555e-08, "logits/chosen": -2.881363868713379, "logits/rejected": -2.8643298149108887, "logps/chosen": -65.00291442871094, "logps/rejected": -63.203407287597656, "loss": 0.6855, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.08005902916193008, "rewards/margins": 0.017578277736902237, "rewards/rejected": -0.09763731062412262, "step": 1680 }, { "epoch": 0.29117849758787046, "grad_norm": 2.4997031688690186, "learning_rate": 9.701492537313432e-08, "logits/chosen": -2.8670291900634766, "logits/rejected": -2.857682466506958, "logps/chosen": -63.41624069213867, "logps/rejected": -59.528472900390625, "loss": 0.6907, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.07418099790811539, "rewards/margins": 0.0065845223143696785, "rewards/rejected": -0.08076552301645279, "step": 1690 }, { "epoch": 0.2929014472777395, "grad_norm": 2.485443592071533, "learning_rate": 9.758897818599311e-08, "logits/chosen": -2.823608636856079, "logits/rejected": -2.8208167552948, "logps/chosen": -58.42906951904297, "logps/rejected": -63.24658203125, "loss": 0.6896, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0760272890329361, "rewards/margins": 0.00900364015251398, "rewards/rejected": -0.0850309282541275, "step": 1700 }, { "epoch": 0.29462439696760856, "grad_norm": 2.994943857192993, "learning_rate": 9.81630309988519e-08, "logits/chosen": -2.8936543464660645, "logits/rejected": -2.863755226135254, "logps/chosen": -65.0849838256836, "logps/rejected": -62.181243896484375, "loss": 0.6805, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.0581812784075737, "rewards/margins": 0.027432983741164207, "rewards/rejected": -0.08561427146196365, "step": 1710 }, { "epoch": 0.2963473466574776, "grad_norm": 2.805964708328247, "learning_rate": 9.873708381171067e-08, "logits/chosen": -2.874479293823242, "logits/rejected": -2.8514676094055176, "logps/chosen": -64.06068420410156, "logps/rejected": -58.78630447387695, "loss": 0.6855, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0691644698381424, "rewards/margins": 0.017094207927584648, "rewards/rejected": -0.08625867962837219, "step": 1720 }, { "epoch": 0.29807029634734666, "grad_norm": 2.502676486968994, "learning_rate": 9.931113662456946e-08, "logits/chosen": -2.903549909591675, "logits/rejected": -2.8940093517303467, "logps/chosen": -62.865089416503906, "logps/rejected": -60.605796813964844, "loss": 0.6892, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.07542423903942108, "rewards/margins": 0.01010823529213667, "rewards/rejected": -0.08553248643875122, "step": 1730 }, { "epoch": 0.2997932460372157, "grad_norm": 2.5997302532196045, "learning_rate": 9.988518943742824e-08, "logits/chosen": -2.8790831565856934, "logits/rejected": -2.8717141151428223, "logps/chosen": -60.32866668701172, "logps/rejected": -63.63134765625, "loss": 0.6829, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.07663007080554962, "rewards/margins": 0.022682690992951393, "rewards/rejected": -0.09931276738643646, "step": 1740 }, { "epoch": 0.30151619572708477, "grad_norm": 2.848219633102417, "learning_rate": 9.999993568953616e-08, "logits/chosen": -2.877021312713623, "logits/rejected": -2.870337724685669, "logps/chosen": -62.94545364379883, "logps/rejected": -64.73450469970703, "loss": 0.6853, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.06983025372028351, "rewards/margins": 0.017571711912751198, "rewards/rejected": -0.08740197122097015, "step": 1750 }, { "epoch": 0.30323914541695385, "grad_norm": 2.517097234725952, "learning_rate": 9.99996744285603e-08, "logits/chosen": -2.8967089653015137, "logits/rejected": -2.87041974067688, "logps/chosen": -64.09073638916016, "logps/rejected": -61.933387756347656, "loss": 0.6799, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.07073361426591873, "rewards/margins": 0.028766438364982605, "rewards/rejected": -0.09950004518032074, "step": 1760 }, { "epoch": 0.3049620951068229, "grad_norm": 2.3977444171905518, "learning_rate": 9.999921219871774e-08, "logits/chosen": -2.8966360092163086, "logits/rejected": -2.8674557209014893, "logps/chosen": -63.9361572265625, "logps/rejected": -58.58233642578125, "loss": 0.6789, "rewards/accuracies": 0.65625, "rewards/chosen": -0.07565224170684814, "rewards/margins": 0.030777927488088608, "rewards/rejected": -0.10643017292022705, "step": 1770 }, { "epoch": 0.30668504479669195, "grad_norm": 3.279787540435791, "learning_rate": 9.99985490018664e-08, "logits/chosen": -2.8627562522888184, "logits/rejected": -2.863924741744995, "logps/chosen": -62.496971130371094, "logps/rejected": -67.43034362792969, "loss": 0.6822, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.07575863599777222, "rewards/margins": 0.024096451699733734, "rewards/rejected": -0.09985508024692535, "step": 1780 }, { "epoch": 0.308407994486561, "grad_norm": 2.9971201419830322, "learning_rate": 9.99976848406719e-08, "logits/chosen": -2.9246413707733154, "logits/rejected": -2.9111831188201904, "logps/chosen": -60.837120056152344, "logps/rejected": -62.72722244262695, "loss": 0.6796, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.07828166335821152, "rewards/margins": 0.029480207711458206, "rewards/rejected": -0.10776188224554062, "step": 1790 }, { "epoch": 0.31013094417643006, "grad_norm": 2.7353081703186035, "learning_rate": 9.999661971860766e-08, "logits/chosen": -2.9237895011901855, "logits/rejected": -2.9061765670776367, "logps/chosen": -61.31614303588867, "logps/rejected": -62.551971435546875, "loss": 0.6838, "rewards/accuracies": 0.59375, "rewards/chosen": -0.09031987190246582, "rewards/margins": 0.020791074261069298, "rewards/rejected": -0.11111094802618027, "step": 1800 }, { "epoch": 0.3118538938662991, "grad_norm": 3.083889961242676, "learning_rate": 9.999535363995486e-08, "logits/chosen": -2.9037399291992188, "logits/rejected": -2.8783864974975586, "logps/chosen": -65.89918518066406, "logps/rejected": -63.4612922668457, "loss": 0.6836, "rewards/accuracies": 0.59375, "rewards/chosen": -0.08302709460258484, "rewards/margins": 0.021477878093719482, "rewards/rejected": -0.10450496524572372, "step": 1810 }, { "epoch": 0.31357684355616816, "grad_norm": 2.7066895961761475, "learning_rate": 9.999388660980235e-08, "logits/chosen": -2.8926279544830322, "logits/rejected": -2.8697822093963623, "logps/chosen": -64.84931945800781, "logps/rejected": -61.872032165527344, "loss": 0.6836, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.0911707729101181, "rewards/margins": 0.021647578105330467, "rewards/rejected": -0.11281834542751312, "step": 1820 }, { "epoch": 0.31529979324603724, "grad_norm": 2.6957552433013916, "learning_rate": 9.999221863404672e-08, "logits/chosen": -2.8292651176452637, "logits/rejected": -2.8238108158111572, "logps/chosen": -64.50049591064453, "logps/rejected": -64.85762786865234, "loss": 0.6837, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.08205254375934601, "rewards/margins": 0.0213328804820776, "rewards/rejected": -0.10338541120290756, "step": 1830 }, { "epoch": 0.31702274293590627, "grad_norm": 2.7012877464294434, "learning_rate": 9.999034971939226e-08, "logits/chosen": -2.9520134925842285, "logits/rejected": -2.947157621383667, "logps/chosen": -66.78407287597656, "logps/rejected": -66.19480895996094, "loss": 0.6873, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.10865718126296997, "rewards/margins": 0.01415513176470995, "rewards/rejected": -0.12281231582164764, "step": 1840 }, { "epoch": 0.31874569262577535, "grad_norm": 3.3216099739074707, "learning_rate": 9.998827987335088e-08, "logits/chosen": -2.8799540996551514, "logits/rejected": -2.8814988136291504, "logps/chosen": -65.33901977539062, "logps/rejected": -65.83836364746094, "loss": 0.6898, "rewards/accuracies": 0.53125, "rewards/chosen": -0.11164311319589615, "rewards/margins": 0.009035361930727959, "rewards/rejected": -0.12067846953868866, "step": 1850 }, { "epoch": 0.32046864231564437, "grad_norm": 2.923114061355591, "learning_rate": 9.998600910424211e-08, "logits/chosen": -2.8394248485565186, "logits/rejected": -2.816847324371338, "logps/chosen": -67.88639068603516, "logps/rejected": -64.48649597167969, "loss": 0.6762, "rewards/accuracies": 0.625, "rewards/chosen": -0.08513197302818298, "rewards/margins": 0.036557335406541824, "rewards/rejected": -0.12168930470943451, "step": 1860 }, { "epoch": 0.32219159200551345, "grad_norm": 3.0153424739837646, "learning_rate": 9.99835374211931e-08, "logits/chosen": -2.8406834602355957, "logits/rejected": -2.8223063945770264, "logps/chosen": -65.89734649658203, "logps/rejected": -64.9059066772461, "loss": 0.6785, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.09063565731048584, "rewards/margins": 0.03220418840646744, "rewards/rejected": -0.12283984571695328, "step": 1870 }, { "epoch": 0.3239145416953825, "grad_norm": 2.781935453414917, "learning_rate": 9.998086483413856e-08, "logits/chosen": -2.8889684677124023, "logits/rejected": -2.86552095413208, "logps/chosen": -63.65220260620117, "logps/rejected": -63.5311393737793, "loss": 0.6768, "rewards/accuracies": 0.625, "rewards/chosen": -0.09271065145730972, "rewards/margins": 0.03604603186249733, "rewards/rejected": -0.12875667214393616, "step": 1880 }, { "epoch": 0.32563749138525155, "grad_norm": 3.0753352642059326, "learning_rate": 9.997799135382066e-08, "logits/chosen": -2.8843913078308105, "logits/rejected": -2.879636287689209, "logps/chosen": -62.85695266723633, "logps/rejected": -64.9014892578125, "loss": 0.6829, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.09137921035289764, "rewards/margins": 0.02333098277449608, "rewards/rejected": -0.11471019685268402, "step": 1890 }, { "epoch": 0.32736044107512063, "grad_norm": 2.8090198040008545, "learning_rate": 9.997491699178911e-08, "logits/chosen": -2.8752949237823486, "logits/rejected": -2.854523181915283, "logps/chosen": -66.43936157226562, "logps/rejected": -62.73388671875, "loss": 0.6775, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.0817694216966629, "rewards/margins": 0.034055549651384354, "rewards/rejected": -0.11582496017217636, "step": 1900 }, { "epoch": 0.32908339076498966, "grad_norm": 3.1820809841156006, "learning_rate": 9.997164176040098e-08, "logits/chosen": -2.796372175216675, "logits/rejected": -2.771918773651123, "logps/chosen": -64.6166763305664, "logps/rejected": -66.13924407958984, "loss": 0.6767, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.09831389784812927, "rewards/margins": 0.03565414994955063, "rewards/rejected": -0.1339680403470993, "step": 1910 }, { "epoch": 0.33080634045485874, "grad_norm": 3.1440021991729736, "learning_rate": 9.996816567282078e-08, "logits/chosen": -2.853527545928955, "logits/rejected": -2.8333311080932617, "logps/chosen": -66.1489028930664, "logps/rejected": -65.1463851928711, "loss": 0.6801, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.0968015193939209, "rewards/margins": 0.02934463880956173, "rewards/rejected": -0.12614616751670837, "step": 1920 }, { "epoch": 0.33252929014472776, "grad_norm": 2.9556357860565186, "learning_rate": 9.996448874302028e-08, "logits/chosen": -2.8163414001464844, "logits/rejected": -2.7914609909057617, "logps/chosen": -66.2785873413086, "logps/rejected": -66.2317123413086, "loss": 0.68, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.10773171484470367, "rewards/margins": 0.02885953150689602, "rewards/rejected": -0.13659124076366425, "step": 1930 }, { "epoch": 0.33425223983459684, "grad_norm": 3.3226914405822754, "learning_rate": 9.996061098577856e-08, "logits/chosen": -2.8367037773132324, "logits/rejected": -2.826648473739624, "logps/chosen": -63.57377243041992, "logps/rejected": -63.56377029418945, "loss": 0.6829, "rewards/accuracies": 0.5625, "rewards/chosen": -0.11578904092311859, "rewards/margins": 0.02378334477543831, "rewards/rejected": -0.1395723819732666, "step": 1940 }, { "epoch": 0.33597518952446587, "grad_norm": 2.77713680267334, "learning_rate": 9.995653241668189e-08, "logits/chosen": -2.861672878265381, "logits/rejected": -2.859971046447754, "logps/chosen": -64.96310424804688, "logps/rejected": -68.37250518798828, "loss": 0.6861, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11934344470500946, "rewards/margins": 0.01718161627650261, "rewards/rejected": -0.13652506470680237, "step": 1950 }, { "epoch": 0.33769813921433495, "grad_norm": 2.9392669200897217, "learning_rate": 9.995225305212369e-08, "logits/chosen": -2.8549649715423584, "logits/rejected": -2.8416645526885986, "logps/chosen": -66.29956817626953, "logps/rejected": -67.89146423339844, "loss": 0.6798, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.11181751638650894, "rewards/margins": 0.02995738759636879, "rewards/rejected": -0.14177490770816803, "step": 1960 }, { "epoch": 0.33942108890420397, "grad_norm": 2.8342604637145996, "learning_rate": 9.994777290930442e-08, "logits/chosen": -2.8708348274230957, "logits/rejected": -2.8521673679351807, "logps/chosen": -68.05824279785156, "logps/rejected": -64.81471252441406, "loss": 0.6819, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.10849817097187042, "rewards/margins": 0.02602517604827881, "rewards/rejected": -0.13452333211898804, "step": 1970 }, { "epoch": 0.34114403859407305, "grad_norm": 3.1742191314697266, "learning_rate": 9.994309200623163e-08, "logits/chosen": -2.814614772796631, "logits/rejected": -2.807882308959961, "logps/chosen": -68.45671081542969, "logps/rejected": -66.14873504638672, "loss": 0.6907, "rewards/accuracies": 0.5625, "rewards/chosen": -0.12103524059057236, "rewards/margins": 0.007765748538076878, "rewards/rejected": -0.12880098819732666, "step": 1980 }, { "epoch": 0.34286698828394213, "grad_norm": 2.822662353515625, "learning_rate": 9.993821036171974e-08, "logits/chosen": -2.8995895385742188, "logits/rejected": -2.8736507892608643, "logps/chosen": -63.92681884765625, "logps/rejected": -63.00006866455078, "loss": 0.6751, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.11917207390069962, "rewards/margins": 0.039555564522743225, "rewards/rejected": -0.15872761607170105, "step": 1990 }, { "epoch": 0.34458993797381116, "grad_norm": 3.4786696434020996, "learning_rate": 9.993312799539004e-08, "logits/chosen": -2.8508896827697754, "logits/rejected": -2.8577001094818115, "logps/chosen": -63.96714401245117, "logps/rejected": -71.07959747314453, "loss": 0.6865, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.13510540127754211, "rewards/margins": 0.016423583030700684, "rewards/rejected": -0.151528999209404, "step": 2000 }, { "epoch": 0.34458993797381116, "eval_logits/chosen": -2.9192111492156982, "eval_logits/rejected": -2.915797233581543, "eval_logps/chosen": -65.91581726074219, "eval_logps/rejected": -71.4878158569336, "eval_loss": 0.685180127620697, "eval_rewards/accuracies": 0.5713289976119995, "eval_rewards/chosen": -0.06900343298912048, "eval_rewards/margins": 0.018378695473074913, "eval_rewards/rejected": -0.08738213777542114, "eval_runtime": 382.8756, "eval_samples_per_second": 11.241, "eval_steps_per_second": 1.405, "step": 2000 }, { "epoch": 0.34631288766368024, "grad_norm": 3.048440456390381, "learning_rate": 9.992784492767061e-08, "logits/chosen": -2.8806567192077637, "logits/rejected": -2.8591017723083496, "logps/chosen": -67.69590759277344, "logps/rejected": -64.71334075927734, "loss": 0.6796, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10972080379724503, "rewards/margins": 0.030075108632445335, "rewards/rejected": -0.1397959142923355, "step": 2010 }, { "epoch": 0.34803583735354926, "grad_norm": 3.0328872203826904, "learning_rate": 9.992236117979623e-08, "logits/chosen": -2.847768545150757, "logits/rejected": -2.834040403366089, "logps/chosen": -62.909156799316406, "logps/rejected": -69.29357147216797, "loss": 0.6768, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.11996938288211823, "rewards/margins": 0.0360104963183403, "rewards/rejected": -0.15597985684871674, "step": 2020 }, { "epoch": 0.34975878704341834, "grad_norm": 5.03533935546875, "learning_rate": 9.991667677380831e-08, "logits/chosen": -2.917463779449463, "logits/rejected": -2.9009978771209717, "logps/chosen": -68.2878189086914, "logps/rejected": -68.7306137084961, "loss": 0.6816, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.10773104429244995, "rewards/margins": 0.026072192937135696, "rewards/rejected": -0.13380321860313416, "step": 2030 }, { "epoch": 0.35148173673328736, "grad_norm": 2.9555203914642334, "learning_rate": 9.991079173255476e-08, "logits/chosen": -2.817652702331543, "logits/rejected": -2.8122856616973877, "logps/chosen": -65.14109802246094, "logps/rejected": -66.4891357421875, "loss": 0.6815, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.10871319472789764, "rewards/margins": 0.02643074095249176, "rewards/rejected": -0.1351439505815506, "step": 2040 }, { "epoch": 0.35320468642315644, "grad_norm": 3.105513572692871, "learning_rate": 9.990470607968994e-08, "logits/chosen": -2.886338233947754, "logits/rejected": -2.8689470291137695, "logps/chosen": -62.5875358581543, "logps/rejected": -66.26753234863281, "loss": 0.6845, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.10509078204631805, "rewards/margins": 0.02006548084318638, "rewards/rejected": -0.1251562535762787, "step": 2050 }, { "epoch": 0.3549276361130255, "grad_norm": 3.6310055255889893, "learning_rate": 9.989841983967456e-08, "logits/chosen": -2.8897366523742676, "logits/rejected": -2.8637399673461914, "logps/chosen": -65.2972183227539, "logps/rejected": -64.59178161621094, "loss": 0.6762, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.09557583928108215, "rewards/margins": 0.0371161587536335, "rewards/rejected": -0.13269200921058655, "step": 2060 }, { "epoch": 0.35665058580289455, "grad_norm": 2.7718076705932617, "learning_rate": 9.989193303777551e-08, "logits/chosen": -2.903705596923828, "logits/rejected": -2.8876516819000244, "logps/chosen": -66.88301086425781, "logps/rejected": -67.48914337158203, "loss": 0.6812, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10493052005767822, "rewards/margins": 0.027421722188591957, "rewards/rejected": -0.13235223293304443, "step": 2070 }, { "epoch": 0.35837353549276363, "grad_norm": 3.4504640102386475, "learning_rate": 9.988524570006591e-08, "logits/chosen": -2.839569091796875, "logits/rejected": -2.8218274116516113, "logps/chosen": -64.51222229003906, "logps/rejected": -65.25495910644531, "loss": 0.6755, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.11703022569417953, "rewards/margins": 0.038812413811683655, "rewards/rejected": -0.1558426320552826, "step": 2080 }, { "epoch": 0.36009648518263265, "grad_norm": 3.131420612335205, "learning_rate": 9.987835785342484e-08, "logits/chosen": -2.870603561401367, "logits/rejected": -2.8721537590026855, "logps/chosen": -67.33354187011719, "logps/rejected": -67.72920989990234, "loss": 0.6841, "rewards/accuracies": 0.59375, "rewards/chosen": -0.11783289909362793, "rewards/margins": 0.020953617990016937, "rewards/rejected": -0.13878652453422546, "step": 2090 }, { "epoch": 0.36181943487250173, "grad_norm": 3.2706260681152344, "learning_rate": 9.987126952553735e-08, "logits/chosen": -2.834599256515503, "logits/rejected": -2.8189969062805176, "logps/chosen": -68.30904388427734, "logps/rejected": -65.57106018066406, "loss": 0.6791, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.12041591107845306, "rewards/margins": 0.03133242577314377, "rewards/rejected": -0.15174834430217743, "step": 2100 }, { "epoch": 0.36354238456237076, "grad_norm": 3.030902862548828, "learning_rate": 9.986398074489428e-08, "logits/chosen": -2.8516478538513184, "logits/rejected": -2.8527791500091553, "logps/chosen": -63.29868698120117, "logps/rejected": -68.52655792236328, "loss": 0.6874, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.1283387690782547, "rewards/margins": 0.015200036577880383, "rewards/rejected": -0.14353878796100616, "step": 2110 }, { "epoch": 0.36526533425223984, "grad_norm": 3.139935255050659, "learning_rate": 9.985649154079221e-08, "logits/chosen": -2.8046183586120605, "logits/rejected": -2.786285400390625, "logps/chosen": -65.4697036743164, "logps/rejected": -64.31707000732422, "loss": 0.6797, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10919544845819473, "rewards/margins": 0.030463799834251404, "rewards/rejected": -0.13965924084186554, "step": 2120 }, { "epoch": 0.3669882839421089, "grad_norm": 3.0774590969085693, "learning_rate": 9.984880194333322e-08, "logits/chosen": -2.808724880218506, "logits/rejected": -2.79413104057312, "logps/chosen": -67.2655258178711, "logps/rejected": -68.5387191772461, "loss": 0.6719, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.12331392616033554, "rewards/margins": 0.04621954262256622, "rewards/rejected": -0.16953346133232117, "step": 2130 }, { "epoch": 0.36871123363197794, "grad_norm": 3.1437113285064697, "learning_rate": 9.984091198342495e-08, "logits/chosen": -2.794424533843994, "logits/rejected": -2.7891032695770264, "logps/chosen": -63.76580047607422, "logps/rejected": -69.71234130859375, "loss": 0.6809, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.14377468824386597, "rewards/margins": 0.02725168690085411, "rewards/rejected": -0.17102637887001038, "step": 2140 }, { "epoch": 0.370434183321847, "grad_norm": 3.1667582988739014, "learning_rate": 9.983282169278032e-08, "logits/chosen": -2.8194727897644043, "logits/rejected": -2.787461042404175, "logps/chosen": -68.21852111816406, "logps/rejected": -64.78960418701172, "loss": 0.6677, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.11504881083965302, "rewards/margins": 0.05498508736491203, "rewards/rejected": -0.17003390192985535, "step": 2150 }, { "epoch": 0.37215713301171605, "grad_norm": 3.9630634784698486, "learning_rate": 9.982453110391746e-08, "logits/chosen": -2.8034210205078125, "logits/rejected": -2.7924370765686035, "logps/chosen": -69.6778335571289, "logps/rejected": -64.76737976074219, "loss": 0.6812, "rewards/accuracies": 0.59375, "rewards/chosen": -0.13194985687732697, "rewards/margins": 0.027023714035749435, "rewards/rejected": -0.1589735895395279, "step": 2160 }, { "epoch": 0.3738800827015851, "grad_norm": 3.4963889122009277, "learning_rate": 9.981604025015961e-08, "logits/chosen": -2.8941502571105957, "logits/rejected": -2.882629871368408, "logps/chosen": -67.54387664794922, "logps/rejected": -67.07762145996094, "loss": 0.6756, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.1255626529455185, "rewards/margins": 0.03818827494978905, "rewards/rejected": -0.16375091671943665, "step": 2170 }, { "epoch": 0.37560303239145415, "grad_norm": 3.656014919281006, "learning_rate": 9.980734916563493e-08, "logits/chosen": -2.9170708656311035, "logits/rejected": -2.892664909362793, "logps/chosen": -73.61463165283203, "logps/rejected": -70.44910430908203, "loss": 0.6746, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.14075477421283722, "rewards/margins": 0.041772354394197464, "rewards/rejected": -0.1825271099805832, "step": 2180 }, { "epoch": 0.37732598208132323, "grad_norm": 3.52813720703125, "learning_rate": 9.97984578852764e-08, "logits/chosen": -2.9582104682922363, "logits/rejected": -2.9556591510772705, "logps/chosen": -66.61128997802734, "logps/rejected": -68.55953979492188, "loss": 0.6844, "rewards/accuracies": 0.5625, "rewards/chosen": -0.1338820457458496, "rewards/margins": 0.020972993224859238, "rewards/rejected": -0.15485504269599915, "step": 2190 }, { "epoch": 0.37904893177119225, "grad_norm": 4.082705497741699, "learning_rate": 9.978936644482165e-08, "logits/chosen": -2.8421432971954346, "logits/rejected": -2.829256534576416, "logps/chosen": -65.4125747680664, "logps/rejected": -70.35450744628906, "loss": 0.6735, "rewards/accuracies": 0.59375, "rewards/chosen": -0.12541845440864563, "rewards/margins": 0.043675925582647324, "rewards/rejected": -0.16909436881542206, "step": 2200 }, { "epoch": 0.38077188146106133, "grad_norm": 3.55794620513916, "learning_rate": 9.978007488081286e-08, "logits/chosen": -2.8825392723083496, "logits/rejected": -2.8723480701446533, "logps/chosen": -68.32068634033203, "logps/rejected": -68.001220703125, "loss": 0.6797, "rewards/accuracies": 0.5625, "rewards/chosen": -0.12466458976268768, "rewards/margins": 0.031077438965439796, "rewards/rejected": -0.15574204921722412, "step": 2210 }, { "epoch": 0.3824948311509304, "grad_norm": 3.548194169998169, "learning_rate": 9.977058323059658e-08, "logits/chosen": -2.763171672821045, "logits/rejected": -2.756930112838745, "logps/chosen": -67.42240142822266, "logps/rejected": -68.9684066772461, "loss": 0.6809, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.12630632519721985, "rewards/margins": 0.0291056577116251, "rewards/rejected": -0.1554119884967804, "step": 2220 }, { "epoch": 0.38421778084079944, "grad_norm": 4.489490509033203, "learning_rate": 9.976089153232354e-08, "logits/chosen": -2.8702690601348877, "logits/rejected": -2.849177598953247, "logps/chosen": -68.55046844482422, "logps/rejected": -66.85404968261719, "loss": 0.6804, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.14111894369125366, "rewards/margins": 0.02869265154004097, "rewards/rejected": -0.16981160640716553, "step": 2230 }, { "epoch": 0.3859407305306685, "grad_norm": 3.8217995166778564, "learning_rate": 9.975099982494864e-08, "logits/chosen": -2.8618292808532715, "logits/rejected": -2.8326847553253174, "logps/chosen": -70.48759460449219, "logps/rejected": -69.0060806274414, "loss": 0.6802, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.13084883987903595, "rewards/margins": 0.02972751297056675, "rewards/rejected": -0.16057637333869934, "step": 2240 }, { "epoch": 0.38766368022053754, "grad_norm": 3.6844048500061035, "learning_rate": 9.974090814823062e-08, "logits/chosen": -2.8011302947998047, "logits/rejected": -2.774014711380005, "logps/chosen": -69.90562438964844, "logps/rejected": -68.50813293457031, "loss": 0.6814, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.13328006863594055, "rewards/margins": 0.026850569993257523, "rewards/rejected": -0.16013064980506897, "step": 2250 }, { "epoch": 0.3893866299104066, "grad_norm": 3.8670570850372314, "learning_rate": 9.9730616542732e-08, "logits/chosen": -2.805306911468506, "logits/rejected": -2.7785191535949707, "logps/chosen": -74.76165771484375, "logps/rejected": -73.67105102539062, "loss": 0.6778, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.13669700920581818, "rewards/margins": 0.034947678446769714, "rewards/rejected": -0.1716446876525879, "step": 2260 }, { "epoch": 0.39110957960027565, "grad_norm": 4.4944047927856445, "learning_rate": 9.972012504981892e-08, "logits/chosen": -2.795133113861084, "logits/rejected": -2.76835560798645, "logps/chosen": -69.20185852050781, "logps/rejected": -68.29955291748047, "loss": 0.6784, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.1428747922182083, "rewards/margins": 0.03427129238843918, "rewards/rejected": -0.1771460920572281, "step": 2270 }, { "epoch": 0.3928325292901447, "grad_norm": 3.459359645843506, "learning_rate": 9.970943371166087e-08, "logits/chosen": -2.8179843425750732, "logits/rejected": -2.8150620460510254, "logps/chosen": -67.28515625, "logps/rejected": -70.26941680908203, "loss": 0.683, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.12593597173690796, "rewards/margins": 0.023327378556132317, "rewards/rejected": -0.14926335215568542, "step": 2280 }, { "epoch": 0.3945554789800138, "grad_norm": 3.351472854614258, "learning_rate": 9.969854257123071e-08, "logits/chosen": -2.7689754962921143, "logits/rejected": -2.751616954803467, "logps/chosen": -66.91730499267578, "logps/rejected": -69.58160400390625, "loss": 0.6754, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.13506877422332764, "rewards/margins": 0.03949803113937378, "rewards/rejected": -0.17456680536270142, "step": 2290 }, { "epoch": 0.39627842866988283, "grad_norm": 3.6811389923095703, "learning_rate": 9.968745167230428e-08, "logits/chosen": -2.881148099899292, "logits/rejected": -2.855869770050049, "logps/chosen": -69.73558807373047, "logps/rejected": -68.98726654052734, "loss": 0.6736, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.11700799316167831, "rewards/margins": 0.043230026960372925, "rewards/rejected": -0.16023802757263184, "step": 2300 }, { "epoch": 0.3980013783597519, "grad_norm": 3.873969078063965, "learning_rate": 9.967616105946042e-08, "logits/chosen": -2.8135409355163574, "logits/rejected": -2.801391124725342, "logps/chosen": -65.8260498046875, "logps/rejected": -67.92823791503906, "loss": 0.6757, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.13845619559288025, "rewards/margins": 0.03907342627644539, "rewards/rejected": -0.17752963304519653, "step": 2310 }, { "epoch": 0.39972432804962094, "grad_norm": 3.8343992233276367, "learning_rate": 9.966467077808063e-08, "logits/chosen": -2.8660356998443604, "logits/rejected": -2.833324432373047, "logps/chosen": -71.3488540649414, "logps/rejected": -69.46772003173828, "loss": 0.6662, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.1275714635848999, "rewards/margins": 0.05821261554956436, "rewards/rejected": -0.18578408658504486, "step": 2320 }, { "epoch": 0.40144727773949, "grad_norm": 3.9437179565429688, "learning_rate": 9.965298087434898e-08, "logits/chosen": -2.8199102878570557, "logits/rejected": -2.8103442192077637, "logps/chosen": -71.7131118774414, "logps/rejected": -71.01388549804688, "loss": 0.6686, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.14351196587085724, "rewards/margins": 0.05403792858123779, "rewards/rejected": -0.19754989445209503, "step": 2330 }, { "epoch": 0.40317022742935904, "grad_norm": 4.509121417999268, "learning_rate": 9.964109139525195e-08, "logits/chosen": -2.8408901691436768, "logits/rejected": -2.834322214126587, "logps/chosen": -69.10721588134766, "logps/rejected": -72.46755981445312, "loss": 0.6859, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.15892067551612854, "rewards/margins": 0.019664833322167397, "rewards/rejected": -0.1785854995250702, "step": 2340 }, { "epoch": 0.4048931771192281, "grad_norm": 3.775643825531006, "learning_rate": 9.962900238857812e-08, "logits/chosen": -2.8256442546844482, "logits/rejected": -2.8138177394866943, "logps/chosen": -71.73432922363281, "logps/rejected": -72.02094268798828, "loss": 0.6713, "rewards/accuracies": 0.65625, "rewards/chosen": -0.12964202463626862, "rewards/margins": 0.04859600216150284, "rewards/rejected": -0.17823801934719086, "step": 2350 }, { "epoch": 0.4066161268090972, "grad_norm": 3.601081132888794, "learning_rate": 9.96167139029181e-08, "logits/chosen": -2.8538124561309814, "logits/rejected": -2.8433425426483154, "logps/chosen": -66.75715637207031, "logps/rejected": -68.6590347290039, "loss": 0.6775, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.14916284382343292, "rewards/margins": 0.03558550402522087, "rewards/rejected": -0.1847483515739441, "step": 2360 }, { "epoch": 0.4083390764989662, "grad_norm": 4.274349689483643, "learning_rate": 9.960422598766427e-08, "logits/chosen": -2.8547940254211426, "logits/rejected": -2.8508224487304688, "logps/chosen": -71.09428405761719, "logps/rejected": -73.29460906982422, "loss": 0.6749, "rewards/accuracies": 0.65625, "rewards/chosen": -0.16236889362335205, "rewards/margins": 0.03996804356575012, "rewards/rejected": -0.20233693718910217, "step": 2370 }, { "epoch": 0.4100620261888353, "grad_norm": 3.6479640007019043, "learning_rate": 9.95915386930106e-08, "logits/chosen": -2.8166608810424805, "logits/rejected": -2.8011667728424072, "logps/chosen": -69.60545349121094, "logps/rejected": -70.36505126953125, "loss": 0.678, "rewards/accuracies": 0.625, "rewards/chosen": -0.14873133599758148, "rewards/margins": 0.03490529954433441, "rewards/rejected": -0.1836366355419159, "step": 2380 }, { "epoch": 0.41178497587870433, "grad_norm": 4.576180934906006, "learning_rate": 9.957865206995243e-08, "logits/chosen": -2.8613834381103516, "logits/rejected": -2.842647075653076, "logps/chosen": -73.18601989746094, "logps/rejected": -72.28538513183594, "loss": 0.6719, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.16160975396633148, "rewards/margins": 0.04711094871163368, "rewards/rejected": -0.20872068405151367, "step": 2390 }, { "epoch": 0.4135079255685734, "grad_norm": 3.625098943710327, "learning_rate": 9.956556617028632e-08, "logits/chosen": -2.8957629203796387, "logits/rejected": -2.885424852371216, "logps/chosen": -68.98573303222656, "logps/rejected": -72.5274429321289, "loss": 0.6767, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.16070397198200226, "rewards/margins": 0.037665095180273056, "rewards/rejected": -0.19836905598640442, "step": 2400 }, { "epoch": 0.4135079255685734, "eval_logits/chosen": -2.8938286304473877, "eval_logits/rejected": -2.890582799911499, "eval_logps/chosen": -69.88031768798828, "eval_logps/rejected": -76.26507568359375, "eval_loss": 0.681674063205719, "eval_rewards/accuracies": 0.5815520286560059, "eval_rewards/chosen": -0.108648382127285, "eval_rewards/margins": 0.026506369933485985, "eval_rewards/rejected": -0.13515476882457733, "eval_runtime": 383.5029, "eval_samples_per_second": 11.223, "eval_steps_per_second": 1.403, "step": 2400 }, { "epoch": 0.41523087525844243, "grad_norm": 3.9140706062316895, "learning_rate": 9.955228104660978e-08, "logits/chosen": -2.860299825668335, "logits/rejected": -2.8347489833831787, "logps/chosen": -71.68931579589844, "logps/rejected": -69.27977752685547, "loss": 0.6715, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.159579336643219, "rewards/margins": 0.048062894493341446, "rewards/rejected": -0.20764222741127014, "step": 2410 }, { "epoch": 0.4169538249483115, "grad_norm": 3.6326847076416016, "learning_rate": 9.953879675232106e-08, "logits/chosen": -2.869041919708252, "logits/rejected": -2.849848747253418, "logps/chosen": -73.65645599365234, "logps/rejected": -74.63279724121094, "loss": 0.6765, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.1667543202638626, "rewards/margins": 0.03787728026509285, "rewards/rejected": -0.20463159680366516, "step": 2420 }, { "epoch": 0.41867677463818054, "grad_norm": 3.5972187519073486, "learning_rate": 9.952511334161901e-08, "logits/chosen": -2.818706750869751, "logits/rejected": -2.802964448928833, "logps/chosen": -72.43135070800781, "logps/rejected": -71.22188568115234, "loss": 0.6795, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.16664886474609375, "rewards/margins": 0.03179159015417099, "rewards/rejected": -0.19844046235084534, "step": 2430 }, { "epoch": 0.4203997243280496, "grad_norm": 6.047811985015869, "learning_rate": 9.951123086950277e-08, "logits/chosen": -2.8392794132232666, "logits/rejected": -2.829869270324707, "logps/chosen": -73.2491683959961, "logps/rejected": -74.15043640136719, "loss": 0.6764, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.16629108786582947, "rewards/margins": 0.03869754448533058, "rewards/rejected": -0.20498862862586975, "step": 2440 }, { "epoch": 0.4221226740179187, "grad_norm": 4.199243545532227, "learning_rate": 9.949714939177159e-08, "logits/chosen": -2.8145751953125, "logits/rejected": -2.797954559326172, "logps/chosen": -72.74483489990234, "logps/rejected": -74.14974212646484, "loss": 0.674, "rewards/accuracies": 0.625, "rewards/chosen": -0.18195998668670654, "rewards/margins": 0.042485881596803665, "rewards/rejected": -0.2244458943605423, "step": 2450 }, { "epoch": 0.4238456237077877, "grad_norm": 3.3683722019195557, "learning_rate": 9.94828689650246e-08, "logits/chosen": -2.817074775695801, "logits/rejected": -2.792792797088623, "logps/chosen": -73.75148010253906, "logps/rejected": -72.57868957519531, "loss": 0.673, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.16400498151779175, "rewards/margins": 0.045658182352781296, "rewards/rejected": -0.20966319739818573, "step": 2460 }, { "epoch": 0.4255685733976568, "grad_norm": 4.756073951721191, "learning_rate": 9.946838964666062e-08, "logits/chosen": -2.8950090408325195, "logits/rejected": -2.8776695728302, "logps/chosen": -72.48672485351562, "logps/rejected": -73.16866302490234, "loss": 0.6726, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.16018885374069214, "rewards/margins": 0.04700163006782532, "rewards/rejected": -0.20719048380851746, "step": 2470 }, { "epoch": 0.4272915230875258, "grad_norm": 4.101457118988037, "learning_rate": 9.945371149487787e-08, "logits/chosen": -2.8252413272857666, "logits/rejected": -2.802150011062622, "logps/chosen": -73.47563171386719, "logps/rejected": -71.92816925048828, "loss": 0.677, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.18649457395076752, "rewards/margins": 0.037734054028987885, "rewards/rejected": -0.2242286503314972, "step": 2480 }, { "epoch": 0.4290144727773949, "grad_norm": 4.334775447845459, "learning_rate": 9.943883456867374e-08, "logits/chosen": -2.8387622833251953, "logits/rejected": -2.8354439735412598, "logps/chosen": -68.5081558227539, "logps/rejected": -74.07096862792969, "loss": 0.6722, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.18232309818267822, "rewards/margins": 0.04770932346582413, "rewards/rejected": -0.23003241419792175, "step": 2490 }, { "epoch": 0.43073742246726393, "grad_norm": 4.520421981811523, "learning_rate": 9.942375892784464e-08, "logits/chosen": -2.8706278800964355, "logits/rejected": -2.852548122406006, "logps/chosen": -77.92839050292969, "logps/rejected": -80.6195068359375, "loss": 0.6772, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.2113899290561676, "rewards/margins": 0.03781526908278465, "rewards/rejected": -0.24920520186424255, "step": 2500 }, { "epoch": 0.432460372157133, "grad_norm": 4.175434112548828, "learning_rate": 9.940848463298563e-08, "logits/chosen": -2.785006046295166, "logits/rejected": -2.7793169021606445, "logps/chosen": -73.74462127685547, "logps/rejected": -76.24616241455078, "loss": 0.6711, "rewards/accuracies": 0.65625, "rewards/chosen": -0.20549897849559784, "rewards/margins": 0.04951024800539017, "rewards/rejected": -0.2550092339515686, "step": 2510 }, { "epoch": 0.4341833218470021, "grad_norm": 4.60468864440918, "learning_rate": 9.939301174549025e-08, "logits/chosen": -2.775952100753784, "logits/rejected": -2.7564125061035156, "logps/chosen": -73.282958984375, "logps/rejected": -74.0230712890625, "loss": 0.6674, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.20053362846374512, "rewards/margins": 0.057301588356494904, "rewards/rejected": -0.25783517956733704, "step": 2520 }, { "epoch": 0.4359062715368711, "grad_norm": 5.754627704620361, "learning_rate": 9.93773403275503e-08, "logits/chosen": -2.8354198932647705, "logits/rejected": -2.8365046977996826, "logps/chosen": -73.2640151977539, "logps/rejected": -76.65324401855469, "loss": 0.6841, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.21130648255348206, "rewards/margins": 0.024294385686516762, "rewards/rejected": -0.23560085892677307, "step": 2530 }, { "epoch": 0.4376292212267402, "grad_norm": 7.0299577713012695, "learning_rate": 9.936147044215552e-08, "logits/chosen": -2.8332438468933105, "logits/rejected": -2.821242570877075, "logps/chosen": -76.29328918457031, "logps/rejected": -79.34749603271484, "loss": 0.6766, "rewards/accuracies": 0.59375, "rewards/chosen": -0.22417590022087097, "rewards/margins": 0.03858811408281326, "rewards/rejected": -0.262764036655426, "step": 2540 }, { "epoch": 0.4393521709166092, "grad_norm": 4.671040058135986, "learning_rate": 9.934540215309342e-08, "logits/chosen": -2.828977584838867, "logits/rejected": -2.8029513359069824, "logps/chosen": -80.964111328125, "logps/rejected": -78.33306121826172, "loss": 0.6766, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.2089128941297531, "rewards/margins": 0.041519373655319214, "rewards/rejected": -0.2504322826862335, "step": 2550 }, { "epoch": 0.4410751206064783, "grad_norm": 4.371002197265625, "learning_rate": 9.932913552494887e-08, "logits/chosen": -2.875154495239258, "logits/rejected": -2.855593204498291, "logps/chosen": -77.19731140136719, "logps/rejected": -78.25651550292969, "loss": 0.6788, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.210457444190979, "rewards/margins": 0.034095216542482376, "rewards/rejected": -0.24455265700817108, "step": 2560 }, { "epoch": 0.4427980702963473, "grad_norm": 4.936331748962402, "learning_rate": 9.931267062310407e-08, "logits/chosen": -2.818891763687134, "logits/rejected": -2.808858871459961, "logps/chosen": -79.46900939941406, "logps/rejected": -78.81519317626953, "loss": 0.6793, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.19554241001605988, "rewards/margins": 0.03420441597700119, "rewards/rejected": -0.22974681854248047, "step": 2570 }, { "epoch": 0.4445210199862164, "grad_norm": 4.78171968460083, "learning_rate": 9.929600751373807e-08, "logits/chosen": -2.839102268218994, "logits/rejected": -2.8269808292388916, "logps/chosen": -74.9248275756836, "logps/rejected": -76.74894714355469, "loss": 0.6785, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.20845679938793182, "rewards/margins": 0.035146139562129974, "rewards/rejected": -0.2436029464006424, "step": 2580 }, { "epoch": 0.4462439696760855, "grad_norm": 6.440813064575195, "learning_rate": 9.927914626382665e-08, "logits/chosen": -2.8103909492492676, "logits/rejected": -2.782987117767334, "logps/chosen": -77.16362762451172, "logps/rejected": -75.17143249511719, "loss": 0.6709, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.2005034238100052, "rewards/margins": 0.04929278790950775, "rewards/rejected": -0.24979619681835175, "step": 2590 }, { "epoch": 0.4479669193659545, "grad_norm": 5.180968761444092, "learning_rate": 9.926208694114196e-08, "logits/chosen": -2.833908796310425, "logits/rejected": -2.8047003746032715, "logps/chosen": -80.12142944335938, "logps/rejected": -74.12831115722656, "loss": 0.675, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.2155274599790573, "rewards/margins": 0.043395183980464935, "rewards/rejected": -0.25892263650894165, "step": 2600 }, { "epoch": 0.4496898690558236, "grad_norm": 5.009751796722412, "learning_rate": 9.924482961425232e-08, "logits/chosen": -2.8132882118225098, "logits/rejected": -2.7851760387420654, "logps/chosen": -79.89849853515625, "logps/rejected": -75.80006408691406, "loss": 0.6793, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.21182087063789368, "rewards/margins": 0.03493381291627884, "rewards/rejected": -0.24675467610359192, "step": 2610 }, { "epoch": 0.4514128187456926, "grad_norm": 4.712821960449219, "learning_rate": 9.922737435252189e-08, "logits/chosen": -2.8382675647735596, "logits/rejected": -2.8158297538757324, "logps/chosen": -71.83258056640625, "logps/rejected": -76.35133361816406, "loss": 0.6626, "rewards/accuracies": 0.65625, "rewards/chosen": -0.19858713448047638, "rewards/margins": 0.06706685572862625, "rewards/rejected": -0.2656540274620056, "step": 2620 }, { "epoch": 0.4531357684355617, "grad_norm": 5.183804988861084, "learning_rate": 9.92097212261104e-08, "logits/chosen": -2.7748849391937256, "logits/rejected": -2.755716562271118, "logps/chosen": -74.21707153320312, "logps/rejected": -78.53225708007812, "loss": 0.6618, "rewards/accuracies": 0.65625, "rewards/chosen": -0.18580952286720276, "rewards/margins": 0.07012667506933212, "rewards/rejected": -0.2559362053871155, "step": 2630 }, { "epoch": 0.4548587181254307, "grad_norm": 6.439208030700684, "learning_rate": 9.919187030597288e-08, "logits/chosen": -2.7924511432647705, "logits/rejected": -2.78043532371521, "logps/chosen": -71.54872131347656, "logps/rejected": -72.90419006347656, "loss": 0.6738, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.20230814814567566, "rewards/margins": 0.04494600370526314, "rewards/rejected": -0.2472541779279709, "step": 2640 }, { "epoch": 0.4565816678152998, "grad_norm": 4.538118362426758, "learning_rate": 9.91738216638594e-08, "logits/chosen": -2.7581429481506348, "logits/rejected": -2.7481188774108887, "logps/chosen": -72.51766204833984, "logps/rejected": -76.71464538574219, "loss": 0.6724, "rewards/accuracies": 0.625, "rewards/chosen": -0.18791751563549042, "rewards/margins": 0.04803906008601189, "rewards/rejected": -0.23595662415027618, "step": 2650 }, { "epoch": 0.4583046175051689, "grad_norm": 4.052431583404541, "learning_rate": 9.915557537231472e-08, "logits/chosen": -2.797879934310913, "logits/rejected": -2.769853115081787, "logps/chosen": -77.92301177978516, "logps/rejected": -77.94435119628906, "loss": 0.6642, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.19476445019245148, "rewards/margins": 0.06543554365634918, "rewards/rejected": -0.26019999384880066, "step": 2660 }, { "epoch": 0.4600275671950379, "grad_norm": 4.2934112548828125, "learning_rate": 9.913713150467805e-08, "logits/chosen": -2.773261308670044, "logits/rejected": -2.753025770187378, "logps/chosen": -77.44517517089844, "logps/rejected": -78.64486694335938, "loss": 0.673, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.22384801506996155, "rewards/margins": 0.04718298092484474, "rewards/rejected": -0.2710309624671936, "step": 2670 }, { "epoch": 0.461750516884907, "grad_norm": 4.471557140350342, "learning_rate": 9.911849013508274e-08, "logits/chosen": -2.8380188941955566, "logits/rejected": -2.814384698867798, "logps/chosen": -81.60688781738281, "logps/rejected": -78.93445587158203, "loss": 0.6756, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.2197762429714203, "rewards/margins": 0.04226057603955269, "rewards/rejected": -0.26203683018684387, "step": 2680 }, { "epoch": 0.463473466574776, "grad_norm": 5.9764814376831055, "learning_rate": 9.9099651338456e-08, "logits/chosen": -2.7970428466796875, "logits/rejected": -2.783587694168091, "logps/chosen": -73.98988342285156, "logps/rejected": -77.5247573852539, "loss": 0.6692, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.22796630859375, "rewards/margins": 0.05491773411631584, "rewards/rejected": -0.28288403153419495, "step": 2690 }, { "epoch": 0.4651964162646451, "grad_norm": 4.950373649597168, "learning_rate": 9.908061519051851e-08, "logits/chosen": -2.802163600921631, "logits/rejected": -2.778865337371826, "logps/chosen": -74.62218475341797, "logps/rejected": -78.43358612060547, "loss": 0.6701, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.20865485072135925, "rewards/margins": 0.05166729539632797, "rewards/rejected": -0.2603221535682678, "step": 2700 }, { "epoch": 0.4669193659545141, "grad_norm": 5.659837245941162, "learning_rate": 9.906138176778426e-08, "logits/chosen": -2.8237104415893555, "logits/rejected": -2.8062551021575928, "logps/chosen": -81.23680877685547, "logps/rejected": -77.97103881835938, "loss": 0.6818, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.20353296399116516, "rewards/margins": 0.02904905006289482, "rewards/rejected": -0.2325820028781891, "step": 2710 }, { "epoch": 0.4686423156443832, "grad_norm": 5.343284606933594, "learning_rate": 9.904195114756013e-08, "logits/chosen": -2.7975616455078125, "logits/rejected": -2.804506301879883, "logps/chosen": -75.00023651123047, "logps/rejected": -79.24310302734375, "loss": 0.6755, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.21659204363822937, "rewards/margins": 0.04194109886884689, "rewards/rejected": -0.25853317975997925, "step": 2720 }, { "epoch": 0.4703652653342522, "grad_norm": 6.924177646636963, "learning_rate": 9.90223234079456e-08, "logits/chosen": -2.8070428371429443, "logits/rejected": -2.7972311973571777, "logps/chosen": -81.18864440917969, "logps/rejected": -79.83781433105469, "loss": 0.6759, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.2139493227005005, "rewards/margins": 0.03996586799621582, "rewards/rejected": -0.2539151906967163, "step": 2730 }, { "epoch": 0.4720882150241213, "grad_norm": 5.208271503448486, "learning_rate": 9.900249862783253e-08, "logits/chosen": -2.7930102348327637, "logits/rejected": -2.7804884910583496, "logps/chosen": -75.43363952636719, "logps/rejected": -72.59468078613281, "loss": 0.6826, "rewards/accuracies": 0.5625, "rewards/chosen": -0.2124830037355423, "rewards/margins": 0.02715051732957363, "rewards/rejected": -0.23963353037834167, "step": 2740 }, { "epoch": 0.4738111647139904, "grad_norm": 6.84669303894043, "learning_rate": 9.898247688690467e-08, "logits/chosen": -2.738506555557251, "logits/rejected": -2.7383079528808594, "logps/chosen": -69.57581329345703, "logps/rejected": -77.10731506347656, "loss": 0.6696, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.19511033594608307, "rewards/margins": 0.053858477622270584, "rewards/rejected": -0.24896883964538574, "step": 2750 }, { "epoch": 0.4755341144038594, "grad_norm": 5.511141300201416, "learning_rate": 9.896225826563748e-08, "logits/chosen": -2.7798218727111816, "logits/rejected": -2.772571325302124, "logps/chosen": -76.56613159179688, "logps/rejected": -80.88214111328125, "loss": 0.6691, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.21049802005290985, "rewards/margins": 0.05490432307124138, "rewards/rejected": -0.26540234684944153, "step": 2760 }, { "epoch": 0.4772570640937285, "grad_norm": 5.58799934387207, "learning_rate": 9.894184284529776e-08, "logits/chosen": -2.8549933433532715, "logits/rejected": -2.82932710647583, "logps/chosen": -76.98066711425781, "logps/rejected": -76.55406188964844, "loss": 0.6775, "rewards/accuracies": 0.625, "rewards/chosen": -0.2074943482875824, "rewards/margins": 0.03717175871133804, "rewards/rejected": -0.24466614425182343, "step": 2770 }, { "epoch": 0.4789800137835975, "grad_norm": 5.142263412475586, "learning_rate": 9.892123070794331e-08, "logits/chosen": -2.738034725189209, "logits/rejected": -2.717013120651245, "logps/chosen": -76.2380599975586, "logps/rejected": -77.95225524902344, "loss": 0.6714, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.23084776103496552, "rewards/margins": 0.05035693198442459, "rewards/rejected": -0.2812047302722931, "step": 2780 }, { "epoch": 0.4807029634734666, "grad_norm": 5.2472734451293945, "learning_rate": 9.890042193642267e-08, "logits/chosen": -2.8136708736419678, "logits/rejected": -2.789565324783325, "logps/chosen": -74.58724212646484, "logps/rejected": -76.51789855957031, "loss": 0.668, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.21276339888572693, "rewards/margins": 0.056669749319553375, "rewards/rejected": -0.2694331407546997, "step": 2790 }, { "epoch": 0.4824259131633356, "grad_norm": 6.30368185043335, "learning_rate": 9.887941661437464e-08, "logits/chosen": -2.8293824195861816, "logits/rejected": -2.807325839996338, "logps/chosen": -84.5650634765625, "logps/rejected": -84.91242980957031, "loss": 0.6726, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.24537885189056396, "rewards/margins": 0.047574784606695175, "rewards/rejected": -0.2929536700248718, "step": 2800 }, { "epoch": 0.4824259131633356, "eval_logits/chosen": -2.8651082515716553, "eval_logits/rejected": -2.861708641052246, "eval_logps/chosen": -75.15972137451172, "eval_logps/rejected": -82.17526245117188, "eval_loss": 0.6792241334915161, "eval_rewards/accuracies": 0.5766728520393372, "eval_rewards/chosen": -0.16144251823425293, "eval_rewards/margins": 0.032814137637615204, "eval_rewards/rejected": -0.19425663352012634, "eval_runtime": 383.2273, "eval_samples_per_second": 11.231, "eval_steps_per_second": 1.404, "step": 2800 }, { "epoch": 0.4841488628532047, "grad_norm": 5.956264019012451, "learning_rate": 9.885821482622812e-08, "logits/chosen": -2.755012035369873, "logits/rejected": -2.734792709350586, "logps/chosen": -79.61177825927734, "logps/rejected": -83.44078826904297, "loss": 0.672, "rewards/accuracies": 0.625, "rewards/chosen": -0.22803381085395813, "rewards/margins": 0.051030099391937256, "rewards/rejected": -0.2790639102458954, "step": 2810 }, { "epoch": 0.48587181254307377, "grad_norm": 5.045444965362549, "learning_rate": 9.883681665720162e-08, "logits/chosen": -2.832822799682617, "logits/rejected": -2.8235433101654053, "logps/chosen": -79.9222640991211, "logps/rejected": -79.30545806884766, "loss": 0.6795, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.24114854633808136, "rewards/margins": 0.03448627144098282, "rewards/rejected": -0.2756348252296448, "step": 2820 }, { "epoch": 0.4875947622329428, "grad_norm": 5.051169395446777, "learning_rate": 9.881522219330303e-08, "logits/chosen": -2.709955930709839, "logits/rejected": -2.697514057159424, "logps/chosen": -79.45323944091797, "logps/rejected": -83.29679107666016, "loss": 0.6666, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.22701826691627502, "rewards/margins": 0.060922276228666306, "rewards/rejected": -0.2879405617713928, "step": 2830 }, { "epoch": 0.48931771192281187, "grad_norm": 5.050863742828369, "learning_rate": 9.879343152132922e-08, "logits/chosen": -2.8177552223205566, "logits/rejected": -2.808845043182373, "logps/chosen": -78.31355285644531, "logps/rejected": -79.11454010009766, "loss": 0.6735, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.24140088260173798, "rewards/margins": 0.045556072145700455, "rewards/rejected": -0.28695693612098694, "step": 2840 }, { "epoch": 0.4910406616126809, "grad_norm": 4.998874187469482, "learning_rate": 9.87714447288657e-08, "logits/chosen": -2.7891416549682617, "logits/rejected": -2.775479555130005, "logps/chosen": -77.19549560546875, "logps/rejected": -84.73179626464844, "loss": 0.6641, "rewards/accuracies": 0.65625, "rewards/chosen": -0.2396540641784668, "rewards/margins": 0.06720131635665894, "rewards/rejected": -0.30685538053512573, "step": 2850 }, { "epoch": 0.49276361130255, "grad_norm": 4.9351301193237305, "learning_rate": 9.874926190428623e-08, "logits/chosen": -2.779432535171509, "logits/rejected": -2.76192569732666, "logps/chosen": -78.51346588134766, "logps/rejected": -80.85316467285156, "loss": 0.6658, "rewards/accuracies": 0.65625, "rewards/chosen": -0.23384924232959747, "rewards/margins": 0.062432099133729935, "rewards/rejected": -0.2962813079357147, "step": 2860 }, { "epoch": 0.494486560992419, "grad_norm": 6.352870941162109, "learning_rate": 9.872688313675258e-08, "logits/chosen": -2.8443655967712402, "logits/rejected": -2.830497980117798, "logps/chosen": -81.04945373535156, "logps/rejected": -81.22298431396484, "loss": 0.6719, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.24311265349388123, "rewards/margins": 0.049511753022670746, "rewards/rejected": -0.29262441396713257, "step": 2870 }, { "epoch": 0.4962095106822881, "grad_norm": 6.229991912841797, "learning_rate": 9.870430851621399e-08, "logits/chosen": -2.859992504119873, "logits/rejected": -2.8380045890808105, "logps/chosen": -80.40449523925781, "logps/rejected": -80.57371520996094, "loss": 0.6644, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.24097883701324463, "rewards/margins": 0.06652016192674637, "rewards/rejected": -0.3074989914894104, "step": 2880 }, { "epoch": 0.49793246037215716, "grad_norm": 5.559286594390869, "learning_rate": 9.8681538133407e-08, "logits/chosen": -2.8639652729034424, "logits/rejected": -2.857152223587036, "logps/chosen": -78.74775695800781, "logps/rejected": -82.1052017211914, "loss": 0.6705, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.24852073192596436, "rewards/margins": 0.05366664007306099, "rewards/rejected": -0.30218738317489624, "step": 2890 }, { "epoch": 0.4996554100620262, "grad_norm": 6.255499362945557, "learning_rate": 9.865857207985499e-08, "logits/chosen": -2.813190221786499, "logits/rejected": -2.807352066040039, "logps/chosen": -76.86582946777344, "logps/rejected": -80.31356811523438, "loss": 0.6684, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.23233918845653534, "rewards/margins": 0.05936668440699577, "rewards/rejected": -0.291705846786499, "step": 2900 }, { "epoch": 0.5013783597518953, "grad_norm": 6.7790398597717285, "learning_rate": 9.863541044786776e-08, "logits/chosen": -2.837186098098755, "logits/rejected": -2.8275084495544434, "logps/chosen": -82.68029022216797, "logps/rejected": -86.81315612792969, "loss": 0.6661, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.2539251446723938, "rewards/margins": 0.0632144957780838, "rewards/rejected": -0.3171396255493164, "step": 2910 }, { "epoch": 0.5031013094417643, "grad_norm": 5.92188835144043, "learning_rate": 9.861205333054126e-08, "logits/chosen": -2.7996227741241455, "logits/rejected": -2.794041872024536, "logps/chosen": -80.66837310791016, "logps/rejected": -87.4694595336914, "loss": 0.6617, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.26344019174575806, "rewards/margins": 0.07321880757808685, "rewards/rejected": -0.3366590142250061, "step": 2920 }, { "epoch": 0.5048242591316333, "grad_norm": 4.815412998199463, "learning_rate": 9.858850082175718e-08, "logits/chosen": -2.7737913131713867, "logits/rejected": -2.7536840438842773, "logps/chosen": -81.28785705566406, "logps/rejected": -83.8777084350586, "loss": 0.6648, "rewards/accuracies": 0.65625, "rewards/chosen": -0.27177712321281433, "rewards/margins": 0.06520970165729523, "rewards/rejected": -0.33698686957359314, "step": 2930 }, { "epoch": 0.5065472088215024, "grad_norm": 5.337031364440918, "learning_rate": 9.856475301618254e-08, "logits/chosen": -2.8222336769104004, "logits/rejected": -2.7931721210479736, "logps/chosen": -79.30021667480469, "logps/rejected": -80.7382583618164, "loss": 0.6781, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.26826828718185425, "rewards/margins": 0.03749905899167061, "rewards/rejected": -0.30576735734939575, "step": 2940 }, { "epoch": 0.5082701585113715, "grad_norm": 6.90012264251709, "learning_rate": 9.854081000926937e-08, "logits/chosen": -2.8115012645721436, "logits/rejected": -2.7984066009521484, "logps/chosen": -81.31830596923828, "logps/rejected": -86.6155014038086, "loss": 0.6643, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.2607055604457855, "rewards/margins": 0.06801251322031021, "rewards/rejected": -0.3287180960178375, "step": 2950 }, { "epoch": 0.5099931082012406, "grad_norm": 7.280730724334717, "learning_rate": 9.851667189725428e-08, "logits/chosen": -2.7949349880218506, "logits/rejected": -2.7746729850769043, "logps/chosen": -81.17364501953125, "logps/rejected": -83.45097351074219, "loss": 0.6695, "rewards/accuracies": 0.59375, "rewards/chosen": -0.2636514902114868, "rewards/margins": 0.05686463788151741, "rewards/rejected": -0.3205161392688751, "step": 2960 }, { "epoch": 0.5117160578911096, "grad_norm": 5.992996692657471, "learning_rate": 9.849233877715805e-08, "logits/chosen": -2.7750847339630127, "logits/rejected": -2.7555854320526123, "logps/chosen": -83.04480743408203, "logps/rejected": -84.0536880493164, "loss": 0.671, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.27440935373306274, "rewards/margins": 0.052616190165281296, "rewards/rejected": -0.32702553272247314, "step": 2970 }, { "epoch": 0.5134390075809786, "grad_norm": 8.407181739807129, "learning_rate": 9.846781074678536e-08, "logits/chosen": -2.733599901199341, "logits/rejected": -2.71382474899292, "logps/chosen": -80.76288604736328, "logps/rejected": -83.85502624511719, "loss": 0.6649, "rewards/accuracies": 0.625, "rewards/chosen": -0.25527653098106384, "rewards/margins": 0.06554384529590607, "rewards/rejected": -0.3208203911781311, "step": 2980 }, { "epoch": 0.5151619572708477, "grad_norm": 6.10256290435791, "learning_rate": 9.844308790472422e-08, "logits/chosen": -2.778179168701172, "logits/rejected": -2.7642033100128174, "logps/chosen": -85.82478332519531, "logps/rejected": -85.78666687011719, "loss": 0.6773, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.28172558546066284, "rewards/margins": 0.04021336883306503, "rewards/rejected": -0.32193902134895325, "step": 2990 }, { "epoch": 0.5168849069607168, "grad_norm": 5.219784259796143, "learning_rate": 9.841817035034571e-08, "logits/chosen": -2.788325786590576, "logits/rejected": -2.782817840576172, "logps/chosen": -78.93931579589844, "logps/rejected": -84.350341796875, "loss": 0.6753, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.26191726326942444, "rewards/margins": 0.044065751135349274, "rewards/rejected": -0.3059830069541931, "step": 3000 }, { "epoch": 0.5186078566505858, "grad_norm": 6.176671981811523, "learning_rate": 9.839305818380355e-08, "logits/chosen": -2.804821491241455, "logits/rejected": -2.786984920501709, "logps/chosen": -83.13731384277344, "logps/rejected": -83.47764587402344, "loss": 0.6774, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.2761631906032562, "rewards/margins": 0.03957538679242134, "rewards/rejected": -0.3157385587692261, "step": 3010 }, { "epoch": 0.5203308063404548, "grad_norm": 7.169635772705078, "learning_rate": 9.836775150603366e-08, "logits/chosen": -2.8510546684265137, "logits/rejected": -2.830547571182251, "logps/chosen": -83.43858337402344, "logps/rejected": -82.36913299560547, "loss": 0.6723, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.2823221683502197, "rewards/margins": 0.05141967535018921, "rewards/rejected": -0.33374184370040894, "step": 3020 }, { "epoch": 0.5220537560303239, "grad_norm": 7.140235424041748, "learning_rate": 9.834225041875381e-08, "logits/chosen": -2.815683126449585, "logits/rejected": -2.799219846725464, "logps/chosen": -83.18513488769531, "logps/rejected": -84.9632339477539, "loss": 0.6781, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.2686838209629059, "rewards/margins": 0.037618495523929596, "rewards/rejected": -0.3063023090362549, "step": 3030 }, { "epoch": 0.523776705720193, "grad_norm": 5.284769535064697, "learning_rate": 9.831655502446314e-08, "logits/chosen": -2.837007522583008, "logits/rejected": -2.8341352939605713, "logps/chosen": -77.17981719970703, "logps/rejected": -83.68660736083984, "loss": 0.6697, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.2541511356830597, "rewards/margins": 0.053926460444927216, "rewards/rejected": -0.3080775737762451, "step": 3040 }, { "epoch": 0.525499655410062, "grad_norm": 6.481640815734863, "learning_rate": 9.829066542644183e-08, "logits/chosen": -2.7752394676208496, "logits/rejected": -2.7717671394348145, "logps/chosen": -79.08528900146484, "logps/rejected": -84.80096435546875, "loss": 0.6735, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.24965326488018036, "rewards/margins": 0.046648941934108734, "rewards/rejected": -0.2963022291660309, "step": 3050 }, { "epoch": 0.5272226050999311, "grad_norm": 6.421154499053955, "learning_rate": 9.826458172875056e-08, "logits/chosen": -2.7950727939605713, "logits/rejected": -2.7809386253356934, "logps/chosen": -80.84529113769531, "logps/rejected": -82.78802490234375, "loss": 0.6762, "rewards/accuracies": 0.59375, "rewards/chosen": -0.2448175847530365, "rewards/margins": 0.043594036251306534, "rewards/rejected": -0.28841158747673035, "step": 3060 }, { "epoch": 0.5289455547898001, "grad_norm": 5.6619720458984375, "learning_rate": 9.823830403623031e-08, "logits/chosen": -2.7709195613861084, "logits/rejected": -2.752265691757202, "logps/chosen": -84.43778991699219, "logps/rejected": -84.05045318603516, "loss": 0.6678, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.25266867876052856, "rewards/margins": 0.058676183223724365, "rewards/rejected": -0.31134486198425293, "step": 3070 }, { "epoch": 0.5306685044796692, "grad_norm": 6.314658164978027, "learning_rate": 9.821183245450169e-08, "logits/chosen": -2.729401111602783, "logits/rejected": -2.7170450687408447, "logps/chosen": -79.04667663574219, "logps/rejected": -85.69407653808594, "loss": 0.6782, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.28165268898010254, "rewards/margins": 0.03992915153503418, "rewards/rejected": -0.3215818405151367, "step": 3080 }, { "epoch": 0.5323914541695383, "grad_norm": 6.192001819610596, "learning_rate": 9.818516708996468e-08, "logits/chosen": -2.764099597930908, "logits/rejected": -2.74798846244812, "logps/chosen": -81.06709289550781, "logps/rejected": -85.3973388671875, "loss": 0.6629, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.2586060166358948, "rewards/margins": 0.06887234002351761, "rewards/rejected": -0.3274783492088318, "step": 3090 }, { "epoch": 0.5341144038594073, "grad_norm": 7.931329727172852, "learning_rate": 9.815830804979814e-08, "logits/chosen": -2.777858257293701, "logits/rejected": -2.7577965259552, "logps/chosen": -81.28909301757812, "logps/rejected": -82.33088684082031, "loss": 0.6688, "rewards/accuracies": 0.625, "rewards/chosen": -0.25381529331207275, "rewards/margins": 0.05860390514135361, "rewards/rejected": -0.31241923570632935, "step": 3100 }, { "epoch": 0.5358373535492763, "grad_norm": 9.322093963623047, "learning_rate": 9.813125544195938e-08, "logits/chosen": -2.753824472427368, "logits/rejected": -2.7594916820526123, "logps/chosen": -80.28367614746094, "logps/rejected": -87.63134765625, "loss": 0.6745, "rewards/accuracies": 0.5625, "rewards/chosen": -0.2879082262516022, "rewards/margins": 0.045723266899585724, "rewards/rejected": -0.3336314857006073, "step": 3110 }, { "epoch": 0.5375603032391454, "grad_norm": 6.275984764099121, "learning_rate": 9.810400937518376e-08, "logits/chosen": -2.7972092628479004, "logits/rejected": -2.778357982635498, "logps/chosen": -82.89414978027344, "logps/rejected": -87.1389389038086, "loss": 0.6637, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.25661715865135193, "rewards/margins": 0.06865452229976654, "rewards/rejected": -0.3252716660499573, "step": 3120 }, { "epoch": 0.5392832529290145, "grad_norm": 8.541313171386719, "learning_rate": 9.807656995898422e-08, "logits/chosen": -2.732752561569214, "logits/rejected": -2.7282118797302246, "logps/chosen": -80.03063201904297, "logps/rejected": -85.75975799560547, "loss": 0.6714, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.2712279260158539, "rewards/margins": 0.054630815982818604, "rewards/rejected": -0.3258587419986725, "step": 3130 }, { "epoch": 0.5410062026188835, "grad_norm": 10.22986888885498, "learning_rate": 9.80489373036508e-08, "logits/chosen": -2.796969413757324, "logits/rejected": -2.7837982177734375, "logps/chosen": -83.42472839355469, "logps/rejected": -89.79510498046875, "loss": 0.6681, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.3091219961643219, "rewards/margins": 0.06103752925992012, "rewards/rejected": -0.3701595067977905, "step": 3140 }, { "epoch": 0.5427291523087526, "grad_norm": 7.790271282196045, "learning_rate": 9.802111152025037e-08, "logits/chosen": -2.814141035079956, "logits/rejected": -2.7935421466827393, "logps/chosen": -85.78771209716797, "logps/rejected": -87.28602600097656, "loss": 0.6746, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.30056244134902954, "rewards/margins": 0.046315573155879974, "rewards/rejected": -0.3468780219554901, "step": 3150 }, { "epoch": 0.5444521019986216, "grad_norm": 6.410969257354736, "learning_rate": 9.799309272062592e-08, "logits/chosen": -2.761298656463623, "logits/rejected": -2.7406160831451416, "logps/chosen": -83.67314910888672, "logps/rejected": -87.19869995117188, "loss": 0.6613, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.28363630175590515, "rewards/margins": 0.07660378515720367, "rewards/rejected": -0.36024007201194763, "step": 3160 }, { "epoch": 0.5461750516884907, "grad_norm": 6.787548065185547, "learning_rate": 9.796488101739633e-08, "logits/chosen": -2.7918598651885986, "logits/rejected": -2.769052028656006, "logps/chosen": -87.31083679199219, "logps/rejected": -85.34876251220703, "loss": 0.6624, "rewards/accuracies": 0.625, "rewards/chosen": -0.3019317090511322, "rewards/margins": 0.07252788543701172, "rewards/rejected": -0.37445956468582153, "step": 3170 }, { "epoch": 0.5478980013783598, "grad_norm": 5.428675174713135, "learning_rate": 9.793647652395582e-08, "logits/chosen": -2.8168251514434814, "logits/rejected": -2.7892673015594482, "logps/chosen": -83.43238830566406, "logps/rejected": -87.53303527832031, "loss": 0.6618, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.2937379479408264, "rewards/margins": 0.07411986589431763, "rewards/rejected": -0.36785784363746643, "step": 3180 }, { "epoch": 0.5496209510682288, "grad_norm": 6.892975807189941, "learning_rate": 9.79078793544735e-08, "logits/chosen": -2.8068017959594727, "logits/rejected": -2.8054256439208984, "logps/chosen": -83.6307601928711, "logps/rejected": -94.66683197021484, "loss": 0.6563, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.3018225133419037, "rewards/margins": 0.08390899747610092, "rewards/rejected": -0.3857315182685852, "step": 3190 }, { "epoch": 0.5513439007580979, "grad_norm": 8.321106910705566, "learning_rate": 9.787908962389295e-08, "logits/chosen": -2.749690055847168, "logits/rejected": -2.7342047691345215, "logps/chosen": -87.84234619140625, "logps/rejected": -89.13800048828125, "loss": 0.6643, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.3047778010368347, "rewards/margins": 0.06937982141971588, "rewards/rejected": -0.3741576373577118, "step": 3200 }, { "epoch": 0.5513439007580979, "eval_logits/chosen": -2.842041015625, "eval_logits/rejected": -2.8386902809143066, "eval_logps/chosen": -84.82251739501953, "eval_logps/rejected": -93.49150848388672, "eval_loss": 0.672879159450531, "eval_rewards/accuracies": 0.5947955250740051, "eval_rewards/chosen": -0.2580704391002655, "eval_rewards/margins": 0.049348585307598114, "eval_rewards/rejected": -0.3074190318584442, "eval_runtime": 382.8841, "eval_samples_per_second": 11.241, "eval_steps_per_second": 1.405, "step": 3200 }, { "epoch": 0.5530668504479669, "grad_norm": 9.703495979309082, "learning_rate": 9.785010744793172e-08, "logits/chosen": -2.6921780109405518, "logits/rejected": -2.672276735305786, "logps/chosen": -89.73343658447266, "logps/rejected": -93.46687316894531, "loss": 0.6672, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.3509143590927124, "rewards/margins": 0.06420746445655823, "rewards/rejected": -0.415121853351593, "step": 3210 }, { "epoch": 0.554789800137836, "grad_norm": 7.2378740310668945, "learning_rate": 9.782093294308085e-08, "logits/chosen": -2.73669695854187, "logits/rejected": -2.7342042922973633, "logps/chosen": -85.49198150634766, "logps/rejected": -90.95780181884766, "loss": 0.6759, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3446595072746277, "rewards/margins": 0.04523409903049469, "rewards/rejected": -0.38989362120628357, "step": 3220 }, { "epoch": 0.556512749827705, "grad_norm": 8.422399520874023, "learning_rate": 9.779156622660444e-08, "logits/chosen": -2.7746098041534424, "logits/rejected": -2.763091564178467, "logps/chosen": -86.66752624511719, "logps/rejected": -93.50808715820312, "loss": 0.6708, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.32212287187576294, "rewards/margins": 0.0574420690536499, "rewards/rejected": -0.37956494092941284, "step": 3230 }, { "epoch": 0.5582356995175741, "grad_norm": 7.248619556427002, "learning_rate": 9.77620074165392e-08, "logits/chosen": -2.8578269481658936, "logits/rejected": -2.8408889770507812, "logps/chosen": -92.19932556152344, "logps/rejected": -91.2228012084961, "loss": 0.6734, "rewards/accuracies": 0.59375, "rewards/chosen": -0.3484489321708679, "rewards/margins": 0.05245286971330643, "rewards/rejected": -0.40090179443359375, "step": 3240 }, { "epoch": 0.5599586492074431, "grad_norm": 7.993571758270264, "learning_rate": 9.77322566316939e-08, "logits/chosen": -2.776008367538452, "logits/rejected": -2.7632105350494385, "logps/chosen": -85.61407470703125, "logps/rejected": -91.9402847290039, "loss": 0.6633, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3181673586368561, "rewards/margins": 0.07308965176343918, "rewards/rejected": -0.3912569582462311, "step": 3250 }, { "epoch": 0.5616815988973122, "grad_norm": 6.735551357269287, "learning_rate": 9.770231399164894e-08, "logits/chosen": -2.7842421531677246, "logits/rejected": -2.7732491493225098, "logps/chosen": -83.95924377441406, "logps/rejected": -89.4464111328125, "loss": 0.6662, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.3159296214580536, "rewards/margins": 0.06474421918392181, "rewards/rejected": -0.3806738257408142, "step": 3260 }, { "epoch": 0.5634045485871813, "grad_norm": 7.405908107757568, "learning_rate": 9.76721796167559e-08, "logits/chosen": -2.8190155029296875, "logits/rejected": -2.8150835037231445, "logps/chosen": -90.64122772216797, "logps/rejected": -97.86971282958984, "loss": 0.6661, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.345743328332901, "rewards/margins": 0.06943206489086151, "rewards/rejected": -0.4151753783226013, "step": 3270 }, { "epoch": 0.5651274982770503, "grad_norm": 9.555808067321777, "learning_rate": 9.764185362813697e-08, "logits/chosen": -2.8297626972198486, "logits/rejected": -2.820317506790161, "logps/chosen": -83.25923156738281, "logps/rejected": -90.16539001464844, "loss": 0.6717, "rewards/accuracies": 0.59375, "rewards/chosen": -0.32972145080566406, "rewards/margins": 0.054350681602954865, "rewards/rejected": -0.38407212495803833, "step": 3280 }, { "epoch": 0.5668504479669194, "grad_norm": 7.138297080993652, "learning_rate": 9.761133614768454e-08, "logits/chosen": -2.860161542892456, "logits/rejected": -2.836207389831543, "logps/chosen": -84.72539520263672, "logps/rejected": -92.99263763427734, "loss": 0.6543, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.30811357498168945, "rewards/margins": 0.09081108123064041, "rewards/rejected": -0.39892467856407166, "step": 3290 }, { "epoch": 0.5685733976567884, "grad_norm": 13.185148239135742, "learning_rate": 9.758062729806067e-08, "logits/chosen": -2.769252061843872, "logits/rejected": -2.7512991428375244, "logps/chosen": -91.40082550048828, "logps/rejected": -95.87007141113281, "loss": 0.6628, "rewards/accuracies": 0.625, "rewards/chosen": -0.3581240773200989, "rewards/margins": 0.07508586347103119, "rewards/rejected": -0.43320995569229126, "step": 3300 }, { "epoch": 0.5702963473466575, "grad_norm": 7.241480827331543, "learning_rate": 9.754972720269664e-08, "logits/chosen": -2.722712993621826, "logits/rejected": -2.697404384613037, "logps/chosen": -88.21076965332031, "logps/rejected": -92.04688262939453, "loss": 0.6605, "rewards/accuracies": 0.65625, "rewards/chosen": -0.33576661348342896, "rewards/margins": 0.081563800573349, "rewards/rejected": -0.41733041405677795, "step": 3310 }, { "epoch": 0.5720192970365265, "grad_norm": 8.014399528503418, "learning_rate": 9.751863598579238e-08, "logits/chosen": -2.750521183013916, "logits/rejected": -2.727144956588745, "logps/chosen": -89.49032592773438, "logps/rejected": -90.572265625, "loss": 0.6641, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.35034722089767456, "rewards/margins": 0.0724496990442276, "rewards/rejected": -0.42279696464538574, "step": 3320 }, { "epoch": 0.5737422467263956, "grad_norm": 9.008350372314453, "learning_rate": 9.748735377231605e-08, "logits/chosen": -2.8291258811950684, "logits/rejected": -2.8091487884521484, "logps/chosen": -87.87861633300781, "logps/rejected": -95.35037231445312, "loss": 0.6536, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.3266400396823883, "rewards/margins": 0.0941314548254013, "rewards/rejected": -0.4207715094089508, "step": 3330 }, { "epoch": 0.5754651964162646, "grad_norm": 8.974753379821777, "learning_rate": 9.745588068800347e-08, "logits/chosen": -2.785029172897339, "logits/rejected": -2.7689337730407715, "logps/chosen": -93.52928161621094, "logps/rejected": -96.78819274902344, "loss": 0.6607, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.3577708601951599, "rewards/margins": 0.08071313798427582, "rewards/rejected": -0.4384840428829193, "step": 3340 }, { "epoch": 0.5771881461061337, "grad_norm": 9.425326347351074, "learning_rate": 9.742421685935769e-08, "logits/chosen": -2.7120304107666016, "logits/rejected": -2.7023465633392334, "logps/chosen": -92.40120697021484, "logps/rejected": -97.11851501464844, "loss": 0.6694, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.35480552911758423, "rewards/margins": 0.060017216950654984, "rewards/rejected": -0.4148227572441101, "step": 3350 }, { "epoch": 0.5789110957960028, "grad_norm": 6.583974361419678, "learning_rate": 9.739236241364839e-08, "logits/chosen": -2.7767224311828613, "logits/rejected": -2.753446340560913, "logps/chosen": -91.22637176513672, "logps/rejected": -94.33473205566406, "loss": 0.6637, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.35612645745277405, "rewards/margins": 0.07528246194124222, "rewards/rejected": -0.4314088821411133, "step": 3360 }, { "epoch": 0.5806340454858718, "grad_norm": 9.38561725616455, "learning_rate": 9.736031747891145e-08, "logits/chosen": -2.7565040588378906, "logits/rejected": -2.7531654834747314, "logps/chosen": -86.56519317626953, "logps/rejected": -96.07508850097656, "loss": 0.6564, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.34099626541137695, "rewards/margins": 0.08686031401157379, "rewards/rejected": -0.42785653471946716, "step": 3370 }, { "epoch": 0.5823569951757409, "grad_norm": 13.254312515258789, "learning_rate": 9.732808218394841e-08, "logits/chosen": -2.808115005493164, "logits/rejected": -2.7841153144836426, "logps/chosen": -90.53816986083984, "logps/rejected": -90.27338409423828, "loss": 0.6664, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.334627240896225, "rewards/margins": 0.0670941025018692, "rewards/rejected": -0.40172138810157776, "step": 3380 }, { "epoch": 0.5840799448656099, "grad_norm": 11.683048248291016, "learning_rate": 9.729565665832591e-08, "logits/chosen": -2.764824390411377, "logits/rejected": -2.7460172176361084, "logps/chosen": -87.72078704833984, "logps/rejected": -89.53865814208984, "loss": 0.6725, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.3330671489238739, "rewards/margins": 0.05587635189294815, "rewards/rejected": -0.38894352316856384, "step": 3390 }, { "epoch": 0.585802894555479, "grad_norm": 8.081195831298828, "learning_rate": 9.726304103237522e-08, "logits/chosen": -2.79416823387146, "logits/rejected": -2.765594482421875, "logps/chosen": -85.45794677734375, "logps/rejected": -91.94193267822266, "loss": 0.6539, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.3184451162815094, "rewards/margins": 0.09561134874820709, "rewards/rejected": -0.4140564799308777, "step": 3400 }, { "epoch": 0.587525844245348, "grad_norm": 7.517322063446045, "learning_rate": 9.723023543719171e-08, "logits/chosen": -2.7181074619293213, "logits/rejected": -2.6986443996429443, "logps/chosen": -80.88175964355469, "logps/rejected": -83.81219482421875, "loss": 0.6645, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.30681008100509644, "rewards/margins": 0.06839577108621597, "rewards/rejected": -0.3752058446407318, "step": 3410 }, { "epoch": 0.5892487939352171, "grad_norm": 8.775835037231445, "learning_rate": 9.719724000463429e-08, "logits/chosen": -2.730861186981201, "logits/rejected": -2.7171430587768555, "logps/chosen": -83.10078430175781, "logps/rejected": -90.54771423339844, "loss": 0.6586, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.2924661636352539, "rewards/margins": 0.08081065118312836, "rewards/rejected": -0.37327679991722107, "step": 3420 }, { "epoch": 0.5909717436250862, "grad_norm": 9.496828079223633, "learning_rate": 9.716405486732494e-08, "logits/chosen": -2.8000082969665527, "logits/rejected": -2.7859044075012207, "logps/chosen": -83.32926940917969, "logps/rejected": -92.90452575683594, "loss": 0.6599, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.31072303652763367, "rewards/margins": 0.07991008460521698, "rewards/rejected": -0.39063313603401184, "step": 3430 }, { "epoch": 0.5926946933149552, "grad_norm": 8.886645317077637, "learning_rate": 9.71306801586481e-08, "logits/chosen": -2.732738494873047, "logits/rejected": -2.719587802886963, "logps/chosen": -89.0845718383789, "logps/rejected": -96.92495727539062, "loss": 0.657, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.36353951692581177, "rewards/margins": 0.08779202401638031, "rewards/rejected": -0.4513315260410309, "step": 3440 }, { "epoch": 0.5944176430048242, "grad_norm": 8.436723709106445, "learning_rate": 9.709711601275018e-08, "logits/chosen": -2.9035556316375732, "logits/rejected": -2.8733808994293213, "logps/chosen": -98.23624420166016, "logps/rejected": -97.32003021240234, "loss": 0.6699, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.3759039342403412, "rewards/margins": 0.061831988394260406, "rewards/rejected": -0.4377359449863434, "step": 3450 }, { "epoch": 0.5961405926946933, "grad_norm": 9.629737854003906, "learning_rate": 9.706336256453906e-08, "logits/chosen": -2.745333433151245, "logits/rejected": -2.7406082153320312, "logps/chosen": -87.07964324951172, "logps/rejected": -94.35381317138672, "loss": 0.6641, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.34154319763183594, "rewards/margins": 0.07244926691055298, "rewards/rejected": -0.4139924645423889, "step": 3460 }, { "epoch": 0.5978635423845624, "grad_norm": 7.333967208862305, "learning_rate": 9.702941994968345e-08, "logits/chosen": -2.7896111011505127, "logits/rejected": -2.7804951667785645, "logps/chosen": -94.11683654785156, "logps/rejected": -97.30807495117188, "loss": 0.6645, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.3686944842338562, "rewards/margins": 0.07138319313526154, "rewards/rejected": -0.44007769227027893, "step": 3470 }, { "epoch": 0.5995864920744314, "grad_norm": 9.837492942810059, "learning_rate": 9.699528830461241e-08, "logits/chosen": -2.76103138923645, "logits/rejected": -2.7402420043945312, "logps/chosen": -94.16011810302734, "logps/rejected": -97.65756225585938, "loss": 0.6618, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.3834958076477051, "rewards/margins": 0.08097387850284576, "rewards/rejected": -0.4644697308540344, "step": 3480 }, { "epoch": 0.6013094417643005, "grad_norm": 12.011924743652344, "learning_rate": 9.69609677665148e-08, "logits/chosen": -2.759596347808838, "logits/rejected": -2.7367639541625977, "logps/chosen": -94.12642669677734, "logps/rejected": -102.76054382324219, "loss": 0.6558, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.39481350779533386, "rewards/margins": 0.09345494955778122, "rewards/rejected": -0.4882684648036957, "step": 3490 }, { "epoch": 0.6030323914541695, "grad_norm": 7.919803142547607, "learning_rate": 9.692645847333871e-08, "logits/chosen": -2.7291781902313232, "logits/rejected": -2.7234292030334473, "logps/chosen": -90.75396728515625, "logps/rejected": -97.35155487060547, "loss": 0.6711, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.380003422498703, "rewards/margins": 0.05795666575431824, "rewards/rejected": -0.43796008825302124, "step": 3500 }, { "epoch": 0.6047553411440386, "grad_norm": 6.984891891479492, "learning_rate": 9.689176056379091e-08, "logits/chosen": -2.6967155933380127, "logits/rejected": -2.6760733127593994, "logps/chosen": -93.64738464355469, "logps/rejected": -96.0606460571289, "loss": 0.6701, "rewards/accuracies": 0.625, "rewards/chosen": -0.39596277475357056, "rewards/margins": 0.06031836196780205, "rewards/rejected": -0.4562811255455017, "step": 3510 }, { "epoch": 0.6064782908339077, "grad_norm": 8.624043464660645, "learning_rate": 9.68568741773363e-08, "logits/chosen": -2.71110463142395, "logits/rejected": -2.6912643909454346, "logps/chosen": -90.24055480957031, "logps/rejected": -95.48713684082031, "loss": 0.6522, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.3449710011482239, "rewards/margins": 0.09635841846466064, "rewards/rejected": -0.44132938981056213, "step": 3520 }, { "epoch": 0.6082012405237767, "grad_norm": 9.641185760498047, "learning_rate": 9.682179945419735e-08, "logits/chosen": -2.834808111190796, "logits/rejected": -2.797631025314331, "logps/chosen": -93.6556625366211, "logps/rejected": -97.22142028808594, "loss": 0.6525, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.3713035583496094, "rewards/margins": 0.0992220938205719, "rewards/rejected": -0.4705256521701813, "step": 3530 }, { "epoch": 0.6099241902136457, "grad_norm": 8.934781074523926, "learning_rate": 9.678653653535353e-08, "logits/chosen": -2.7017455101013184, "logits/rejected": -2.6818392276763916, "logps/chosen": -96.44803619384766, "logps/rejected": -99.64411926269531, "loss": 0.6686, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.41043099761009216, "rewards/margins": 0.06640861928462982, "rewards/rejected": -0.4768396317958832, "step": 3540 }, { "epoch": 0.6116471399035148, "grad_norm": 9.147663116455078, "learning_rate": 9.675108556254073e-08, "logits/chosen": -2.7528347969055176, "logits/rejected": -2.7469847202301025, "logps/chosen": -97.86283111572266, "logps/rejected": -99.20372772216797, "loss": 0.6765, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.42837315797805786, "rewards/margins": 0.04629306495189667, "rewards/rejected": -0.47466620802879333, "step": 3550 }, { "epoch": 0.6133700895933839, "grad_norm": 8.061542510986328, "learning_rate": 9.67154466782507e-08, "logits/chosen": -2.7205893993377686, "logits/rejected": -2.701850414276123, "logps/chosen": -94.24388122558594, "logps/rejected": -96.93658447265625, "loss": 0.6704, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.39585188031196594, "rewards/margins": 0.05933908745646477, "rewards/rejected": -0.4551909565925598, "step": 3560 }, { "epoch": 0.6150930392832529, "grad_norm": 7.042123794555664, "learning_rate": 9.667962002573053e-08, "logits/chosen": -2.8203914165496826, "logits/rejected": -2.795574426651001, "logps/chosen": -97.29734802246094, "logps/rejected": -98.71199798583984, "loss": 0.6682, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.39522475004196167, "rewards/margins": 0.0649743527173996, "rewards/rejected": -0.46019911766052246, "step": 3570 }, { "epoch": 0.616815988973122, "grad_norm": 8.811630249023438, "learning_rate": 9.664360574898196e-08, "logits/chosen": -2.792978286743164, "logits/rejected": -2.7760887145996094, "logps/chosen": -98.10456848144531, "logps/rejected": -102.02606964111328, "loss": 0.668, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.3899100124835968, "rewards/margins": 0.06529586017131805, "rewards/rejected": -0.45520591735839844, "step": 3580 }, { "epoch": 0.618538938662991, "grad_norm": 9.826918601989746, "learning_rate": 9.660740399276092e-08, "logits/chosen": -2.7625770568847656, "logits/rejected": -2.7533459663391113, "logps/chosen": -96.68208312988281, "logps/rejected": -100.1490707397461, "loss": 0.6715, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.4101640284061432, "rewards/margins": 0.062378399074077606, "rewards/rejected": -0.4725424349308014, "step": 3590 }, { "epoch": 0.6202618883528601, "grad_norm": 9.431441307067871, "learning_rate": 9.657101490257689e-08, "logits/chosen": -2.7645809650421143, "logits/rejected": -2.7437069416046143, "logps/chosen": -90.44786071777344, "logps/rejected": -93.78791809082031, "loss": 0.6614, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.34735265374183655, "rewards/margins": 0.07902725040912628, "rewards/rejected": -0.42637985944747925, "step": 3600 }, { "epoch": 0.6202618883528601, "eval_logits/chosen": -2.8143951892852783, "eval_logits/rejected": -2.811262845993042, "eval_logps/chosen": -84.9094467163086, "eval_logps/rejected": -93.341552734375, "eval_loss": 0.6740313172340393, "eval_rewards/accuracies": 0.5903810262680054, "eval_rewards/chosen": -0.2589397132396698, "eval_rewards/margins": 0.04697979614138603, "eval_rewards/rejected": -0.3059195280075073, "eval_runtime": 383.2425, "eval_samples_per_second": 11.23, "eval_steps_per_second": 1.404, "step": 3600 }, { "epoch": 0.6219848380427292, "grad_norm": 9.241872787475586, "learning_rate": 9.653443862469226e-08, "logits/chosen": -2.7348623275756836, "logits/rejected": -2.7261312007904053, "logps/chosen": -92.20159149169922, "logps/rejected": -92.39556884765625, "loss": 0.6797, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.3961929976940155, "rewards/margins": 0.041474197059869766, "rewards/rejected": -0.43766722083091736, "step": 3610 }, { "epoch": 0.6237077877325982, "grad_norm": 9.939810752868652, "learning_rate": 9.64976753061219e-08, "logits/chosen": -2.6818900108337402, "logits/rejected": -2.663252353668213, "logps/chosen": -91.36266326904297, "logps/rejected": -97.33293151855469, "loss": 0.6522, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.34590521454811096, "rewards/margins": 0.09555970132350922, "rewards/rejected": -0.441464900970459, "step": 3620 }, { "epoch": 0.6254307374224672, "grad_norm": 7.1370673179626465, "learning_rate": 9.646072509463239e-08, "logits/chosen": -2.7756075859069824, "logits/rejected": -2.774402379989624, "logps/chosen": -90.3294448852539, "logps/rejected": -103.80897521972656, "loss": 0.6467, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.37961071729660034, "rewards/margins": 0.11030860990285873, "rewards/rejected": -0.4899192750453949, "step": 3630 }, { "epoch": 0.6271536871123363, "grad_norm": 7.43474817276001, "learning_rate": 9.642358813874154e-08, "logits/chosen": -2.7568860054016113, "logits/rejected": -2.747313976287842, "logps/chosen": -93.8477783203125, "logps/rejected": -102.84173583984375, "loss": 0.6506, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.3741520941257477, "rewards/margins": 0.10234732925891876, "rewards/rejected": -0.4764993190765381, "step": 3640 }, { "epoch": 0.6288766368022054, "grad_norm": 9.955434799194336, "learning_rate": 9.638626458771779e-08, "logits/chosen": -2.728194236755371, "logits/rejected": -2.7346928119659424, "logps/chosen": -90.55718231201172, "logps/rejected": -102.41837310791016, "loss": 0.6538, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.38671961426734924, "rewards/margins": 0.09734572470188141, "rewards/rejected": -0.48406535387039185, "step": 3650 }, { "epoch": 0.6305995864920745, "grad_norm": 8.804527282714844, "learning_rate": 9.63487545915795e-08, "logits/chosen": -2.750636577606201, "logits/rejected": -2.7261979579925537, "logps/chosen": -99.72705078125, "logps/rejected": -106.32795715332031, "loss": 0.6507, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4412182867527008, "rewards/margins": 0.10072751343250275, "rewards/rejected": -0.5419458150863647, "step": 3660 }, { "epoch": 0.6323225361819435, "grad_norm": 8.900711059570312, "learning_rate": 9.631105830109454e-08, "logits/chosen": -2.7332570552825928, "logits/rejected": -2.715251922607422, "logps/chosen": -100.47251892089844, "logps/rejected": -103.1319808959961, "loss": 0.677, "rewards/accuracies": 0.5625, "rewards/chosen": -0.465008407831192, "rewards/margins": 0.049564313143491745, "rewards/rejected": -0.5145727396011353, "step": 3670 }, { "epoch": 0.6340454858718125, "grad_norm": 10.140378952026367, "learning_rate": 9.627317586777947e-08, "logits/chosen": -2.7660374641418457, "logits/rejected": -2.736175775527954, "logps/chosen": -101.82589721679688, "logps/rejected": -101.44891357421875, "loss": 0.6696, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4551924169063568, "rewards/margins": 0.06743863224983215, "rewards/rejected": -0.522631049156189, "step": 3680 }, { "epoch": 0.6357684355616816, "grad_norm": 8.872066497802734, "learning_rate": 9.623510744389908e-08, "logits/chosen": -2.709749698638916, "logits/rejected": -2.7123544216156006, "logps/chosen": -94.74894714355469, "logps/rejected": -110.99127197265625, "loss": 0.6495, "rewards/accuracies": 0.625, "rewards/chosen": -0.41778117418289185, "rewards/margins": 0.11049274355173111, "rewards/rejected": -0.52827388048172, "step": 3690 }, { "epoch": 0.6374913852515507, "grad_norm": 12.560961723327637, "learning_rate": 9.619685318246575e-08, "logits/chosen": -2.741328716278076, "logits/rejected": -2.7154643535614014, "logps/chosen": -101.0983657836914, "logps/rejected": -108.60958099365234, "loss": 0.662, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.4249047636985779, "rewards/margins": 0.08166369795799255, "rewards/rejected": -0.5065684914588928, "step": 3700 }, { "epoch": 0.6392143349414197, "grad_norm": 9.318950653076172, "learning_rate": 9.615841323723878e-08, "logits/chosen": -2.7656779289245605, "logits/rejected": -2.7480380535125732, "logps/chosen": -96.64082336425781, "logps/rejected": -97.5372543334961, "loss": 0.6736, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.4158157408237457, "rewards/margins": 0.05774500221014023, "rewards/rejected": -0.4735606610774994, "step": 3710 }, { "epoch": 0.6409372846312887, "grad_norm": 9.425246238708496, "learning_rate": 9.611978776272381e-08, "logits/chosen": -2.7584054470062256, "logits/rejected": -2.7434277534484863, "logps/chosen": -91.47366333007812, "logps/rejected": -102.65911865234375, "loss": 0.6477, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.38570934534072876, "rewards/margins": 0.10816343873739243, "rewards/rejected": -0.493872731924057, "step": 3720 }, { "epoch": 0.6426602343211578, "grad_norm": 8.263059616088867, "learning_rate": 9.608097691417222e-08, "logits/chosen": -2.7512903213500977, "logits/rejected": -2.7288222312927246, "logps/chosen": -92.90215301513672, "logps/rejected": -99.7225570678711, "loss": 0.6431, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.3674978017807007, "rewards/margins": 0.12553486227989197, "rewards/rejected": -0.49303263425827026, "step": 3730 }, { "epoch": 0.6443831840110269, "grad_norm": 9.822864532470703, "learning_rate": 9.604198084758046e-08, "logits/chosen": -2.736889362335205, "logits/rejected": -2.719757556915283, "logps/chosen": -92.25497436523438, "logps/rejected": -103.63455963134766, "loss": 0.654, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.3756372332572937, "rewards/margins": 0.09760434180498123, "rewards/rejected": -0.47324156761169434, "step": 3740 }, { "epoch": 0.646106133700896, "grad_norm": 8.369536399841309, "learning_rate": 9.600279971968947e-08, "logits/chosen": -2.791196346282959, "logits/rejected": -2.7723984718322754, "logps/chosen": -94.96142578125, "logps/rejected": -100.91236877441406, "loss": 0.6624, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.40177279710769653, "rewards/margins": 0.08017977327108383, "rewards/rejected": -0.4819525182247162, "step": 3750 }, { "epoch": 0.647829083390765, "grad_norm": 11.309000968933105, "learning_rate": 9.5963433687984e-08, "logits/chosen": -2.7534823417663574, "logits/rejected": -2.746103525161743, "logps/chosen": -101.5938491821289, "logps/rejected": -104.46724700927734, "loss": 0.6774, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.44744873046875, "rewards/margins": 0.05144646763801575, "rewards/rejected": -0.49889522790908813, "step": 3760 }, { "epoch": 0.649552033080634, "grad_norm": 10.615482330322266, "learning_rate": 9.592388291069204e-08, "logits/chosen": -2.74672794342041, "logits/rejected": -2.731895923614502, "logps/chosen": -98.07127380371094, "logps/rejected": -102.38003540039062, "loss": 0.6761, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.4332544803619385, "rewards/margins": 0.05527599900960922, "rewards/rejected": -0.4885304570198059, "step": 3770 }, { "epoch": 0.6512749827705031, "grad_norm": 14.162087440490723, "learning_rate": 9.588414754678408e-08, "logits/chosen": -2.7618002891540527, "logits/rejected": -2.7311835289001465, "logps/chosen": -96.56783294677734, "logps/rejected": -98.6578369140625, "loss": 0.6607, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.4174883961677551, "rewards/margins": 0.08339640498161316, "rewards/rejected": -0.5008847713470459, "step": 3780 }, { "epoch": 0.6529979324603722, "grad_norm": 11.101541519165039, "learning_rate": 9.584422775597263e-08, "logits/chosen": -2.7361607551574707, "logits/rejected": -2.7133641242980957, "logps/chosen": -96.97541046142578, "logps/rejected": -100.37510681152344, "loss": 0.6607, "rewards/accuracies": 0.59375, "rewards/chosen": -0.42009204626083374, "rewards/margins": 0.08410472422838211, "rewards/rejected": -0.5041967630386353, "step": 3790 }, { "epoch": 0.6547208821502413, "grad_norm": 17.42683219909668, "learning_rate": 9.58041236987114e-08, "logits/chosen": -2.758574962615967, "logits/rejected": -2.738593101501465, "logps/chosen": -100.40462493896484, "logps/rejected": -102.95411682128906, "loss": 0.6594, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.4191233217716217, "rewards/margins": 0.08777258545160294, "rewards/rejected": -0.506895899772644, "step": 3800 }, { "epoch": 0.6564438318401102, "grad_norm": 9.499427795410156, "learning_rate": 9.576383553619479e-08, "logits/chosen": -2.774747610092163, "logits/rejected": -2.7435383796691895, "logps/chosen": -104.61952209472656, "logps/rejected": -106.6491928100586, "loss": 0.6534, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.45622339844703674, "rewards/margins": 0.09755819290876389, "rewards/rejected": -0.5537816286087036, "step": 3810 }, { "epoch": 0.6581667815299793, "grad_norm": 10.711869239807129, "learning_rate": 9.572336343035719e-08, "logits/chosen": -2.7234880924224854, "logits/rejected": -2.7070212364196777, "logps/chosen": -98.84184265136719, "logps/rejected": -104.13154602050781, "loss": 0.661, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.43561094999313354, "rewards/margins": 0.08251044899225235, "rewards/rejected": -0.5181214213371277, "step": 3820 }, { "epoch": 0.6598897312198484, "grad_norm": 10.506270408630371, "learning_rate": 9.56827075438723e-08, "logits/chosen": -2.754150867462158, "logits/rejected": -2.7183361053466797, "logps/chosen": -103.78630065917969, "logps/rejected": -102.62027740478516, "loss": 0.6638, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.4417910575866699, "rewards/margins": 0.07783858478069305, "rewards/rejected": -0.5196296572685242, "step": 3830 }, { "epoch": 0.6616126809097175, "grad_norm": 8.079768180847168, "learning_rate": 9.564186804015257e-08, "logits/chosen": -2.7156500816345215, "logits/rejected": -2.7090981006622314, "logps/chosen": -97.9146728515625, "logps/rejected": -112.15087890625, "loss": 0.6478, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.4257539212703705, "rewards/margins": 0.11482544243335724, "rewards/rejected": -0.5405794382095337, "step": 3840 }, { "epoch": 0.6633356305995864, "grad_norm": 11.12878704071045, "learning_rate": 9.560084508334842e-08, "logits/chosen": -2.7950711250305176, "logits/rejected": -2.783581018447876, "logps/chosen": -100.24549865722656, "logps/rejected": -101.98298645019531, "loss": 0.6671, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.4187573790550232, "rewards/margins": 0.07190145552158356, "rewards/rejected": -0.49065881967544556, "step": 3850 }, { "epoch": 0.6650585802894555, "grad_norm": 11.741887092590332, "learning_rate": 9.555963883834766e-08, "logits/chosen": -2.8084073066711426, "logits/rejected": -2.7839548587799072, "logps/chosen": -100.43121337890625, "logps/rejected": -103.94758605957031, "loss": 0.6682, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.4456968903541565, "rewards/margins": 0.07233995944261551, "rewards/rejected": -0.5180368423461914, "step": 3860 }, { "epoch": 0.6667815299793246, "grad_norm": 11.502065658569336, "learning_rate": 9.551824947077482e-08, "logits/chosen": -2.736194372177124, "logits/rejected": -2.720412254333496, "logps/chosen": -102.61311340332031, "logps/rejected": -108.13037109375, "loss": 0.6503, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.4265097677707672, "rewards/margins": 0.10871385037899017, "rewards/rejected": -0.5352236032485962, "step": 3870 }, { "epoch": 0.6685044796691937, "grad_norm": 10.240920066833496, "learning_rate": 9.54766771469905e-08, "logits/chosen": -2.7539052963256836, "logits/rejected": -2.757045269012451, "logps/chosen": -98.1904067993164, "logps/rejected": -105.439453125, "loss": 0.6696, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.4628649652004242, "rewards/margins": 0.06372271478176117, "rewards/rejected": -0.5265876054763794, "step": 3880 }, { "epoch": 0.6702274293590628, "grad_norm": 11.330344200134277, "learning_rate": 9.54349220340906e-08, "logits/chosen": -2.805917263031006, "logits/rejected": -2.7870750427246094, "logps/chosen": -97.0656509399414, "logps/rejected": -103.60054016113281, "loss": 0.6511, "rewards/accuracies": 0.65625, "rewards/chosen": -0.4208943247795105, "rewards/margins": 0.10386069864034653, "rewards/rejected": -0.5247550010681152, "step": 3890 }, { "epoch": 0.6719503790489317, "grad_norm": 9.428828239440918, "learning_rate": 9.539298429990581e-08, "logits/chosen": -2.782029628753662, "logits/rejected": -2.743447780609131, "logps/chosen": -100.13716125488281, "logps/rejected": -101.36534118652344, "loss": 0.6573, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.41971486806869507, "rewards/margins": 0.09659760445356369, "rewards/rejected": -0.5163124799728394, "step": 3900 }, { "epoch": 0.6736733287388008, "grad_norm": 12.973221778869629, "learning_rate": 9.535086411300076e-08, "logits/chosen": -2.774634838104248, "logits/rejected": -2.7620959281921387, "logps/chosen": -97.77294921875, "logps/rejected": -107.8936996459961, "loss": 0.639, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.41240349411964417, "rewards/margins": 0.13535526394844055, "rewards/rejected": -0.5477586984634399, "step": 3910 }, { "epoch": 0.6753962784286699, "grad_norm": 9.753588676452637, "learning_rate": 9.53085616426735e-08, "logits/chosen": -2.799535036087036, "logits/rejected": -2.766714096069336, "logps/chosen": -98.39482116699219, "logps/rejected": -98.08587646484375, "loss": 0.6535, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.3804094195365906, "rewards/margins": 0.10100064426660538, "rewards/rejected": -0.48141002655029297, "step": 3920 }, { "epoch": 0.677119228118539, "grad_norm": 11.287006378173828, "learning_rate": 9.526607705895473e-08, "logits/chosen": -2.7926058769226074, "logits/rejected": -2.7882137298583984, "logps/chosen": -94.21101379394531, "logps/rejected": -100.4034652709961, "loss": 0.6699, "rewards/accuracies": 0.625, "rewards/chosen": -0.4150943160057068, "rewards/margins": 0.06874460726976395, "rewards/rejected": -0.48383888602256775, "step": 3930 }, { "epoch": 0.6788421778084079, "grad_norm": 9.813089370727539, "learning_rate": 9.522341053260714e-08, "logits/chosen": -2.6760735511779785, "logits/rejected": -2.659942626953125, "logps/chosen": -95.42730712890625, "logps/rejected": -101.63002014160156, "loss": 0.6603, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.4058200716972351, "rewards/margins": 0.08270197361707687, "rewards/rejected": -0.4885219931602478, "step": 3940 }, { "epoch": 0.680565127498277, "grad_norm": 13.817987442016602, "learning_rate": 9.51805622351247e-08, "logits/chosen": -2.6872096061706543, "logits/rejected": -2.6538822650909424, "logps/chosen": -94.05184936523438, "logps/rejected": -99.13981628417969, "loss": 0.6467, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.3803722560405731, "rewards/margins": 0.11201455444097519, "rewards/rejected": -0.49238675832748413, "step": 3950 }, { "epoch": 0.6822880771881461, "grad_norm": 12.471182823181152, "learning_rate": 9.513753233873202e-08, "logits/chosen": -2.7590904235839844, "logits/rejected": -2.7596237659454346, "logps/chosen": -95.321044921875, "logps/rejected": -108.5772933959961, "loss": 0.6568, "rewards/accuracies": 0.65625, "rewards/chosen": -0.4106566905975342, "rewards/margins": 0.09463075548410416, "rewards/rejected": -0.5052874684333801, "step": 3960 }, { "epoch": 0.6840110268780152, "grad_norm": 11.664698600769043, "learning_rate": 9.50943210163836e-08, "logits/chosen": -2.748607635498047, "logits/rejected": -2.7324156761169434, "logps/chosen": -98.6547622680664, "logps/rejected": -105.25838470458984, "loss": 0.6601, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.4345720410346985, "rewards/margins": 0.08905188739299774, "rewards/rejected": -0.5236239433288574, "step": 3970 }, { "epoch": 0.6857339765678843, "grad_norm": 9.298650741577148, "learning_rate": 9.505092844176322e-08, "logits/chosen": -2.687570571899414, "logits/rejected": -2.681464195251465, "logps/chosen": -94.56925201416016, "logps/rejected": -105.93550109863281, "loss": 0.6414, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.405214786529541, "rewards/margins": 0.12826186418533325, "rewards/rejected": -0.5334766507148743, "step": 3980 }, { "epoch": 0.6874569262577532, "grad_norm": 8.55325984954834, "learning_rate": 9.500735478928307e-08, "logits/chosen": -2.7440967559814453, "logits/rejected": -2.7317655086517334, "logps/chosen": -92.8218994140625, "logps/rejected": -101.8056869506836, "loss": 0.6584, "rewards/accuracies": 0.59375, "rewards/chosen": -0.3859381079673767, "rewards/margins": 0.0919785350561142, "rewards/rejected": -0.4779166579246521, "step": 3990 }, { "epoch": 0.6891798759476223, "grad_norm": 10.714520454406738, "learning_rate": 9.496360023408332e-08, "logits/chosen": -2.7911689281463623, "logits/rejected": -2.765573263168335, "logps/chosen": -97.75958251953125, "logps/rejected": -100.08521270751953, "loss": 0.6609, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.4003545641899109, "rewards/margins": 0.08431843668222427, "rewards/rejected": -0.4846729636192322, "step": 4000 }, { "epoch": 0.6891798759476223, "eval_logits/chosen": -2.791173219680786, "eval_logits/rejected": -2.787909984588623, "eval_logps/chosen": -89.10730743408203, "eval_logps/rejected": -98.77848815917969, "eval_loss": 0.6695796251296997, "eval_rewards/accuracies": 0.6052509546279907, "eval_rewards/chosen": -0.300918310880661, "eval_rewards/margins": 0.059370510280132294, "eval_rewards/rejected": -0.3602888882160187, "eval_runtime": 383.5897, "eval_samples_per_second": 11.22, "eval_steps_per_second": 1.403, "step": 4000 }, { "epoch": 0.6909028256374914, "grad_norm": 9.583688735961914, "learning_rate": 9.491966495203114e-08, "logits/chosen": -2.717632293701172, "logits/rejected": -2.7124199867248535, "logps/chosen": -89.47596740722656, "logps/rejected": -104.87911224365234, "loss": 0.6421, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.4020870327949524, "rewards/margins": 0.12617447972297668, "rewards/rejected": -0.5282614231109619, "step": 4010 }, { "epoch": 0.6926257753273605, "grad_norm": 8.319774627685547, "learning_rate": 9.487554911972019e-08, "logits/chosen": -2.72869610786438, "logits/rejected": -2.7254672050476074, "logps/chosen": -92.48283386230469, "logps/rejected": -102.42913818359375, "loss": 0.6523, "rewards/accuracies": 0.625, "rewards/chosen": -0.3974384069442749, "rewards/margins": 0.10052331537008286, "rewards/rejected": -0.49796175956726074, "step": 4020 }, { "epoch": 0.6943487250172296, "grad_norm": 8.953021049499512, "learning_rate": 9.483125291446976e-08, "logits/chosen": -2.7326226234436035, "logits/rejected": -2.7165818214416504, "logps/chosen": -95.19175720214844, "logps/rejected": -102.15061950683594, "loss": 0.6586, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.38019615411758423, "rewards/margins": 0.08821289241313934, "rewards/rejected": -0.46840906143188477, "step": 4030 }, { "epoch": 0.6960716747070985, "grad_norm": 12.53400707244873, "learning_rate": 9.478677651432421e-08, "logits/chosen": -2.779245376586914, "logits/rejected": -2.7705986499786377, "logps/chosen": -98.64287567138672, "logps/rejected": -106.11576080322266, "loss": 0.6537, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.41790658235549927, "rewards/margins": 0.10335121303796768, "rewards/rejected": -0.521257758140564, "step": 4040 }, { "epoch": 0.6977946243969676, "grad_norm": 9.494404792785645, "learning_rate": 9.47421200980521e-08, "logits/chosen": -2.7377359867095947, "logits/rejected": -2.7219014167785645, "logps/chosen": -94.80987548828125, "logps/rejected": -105.46993255615234, "loss": 0.6493, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.4234391748905182, "rewards/margins": 0.11501254886388779, "rewards/rejected": -0.5384517312049866, "step": 4050 }, { "epoch": 0.6995175740868367, "grad_norm": 9.975348472595215, "learning_rate": 9.469728384514561e-08, "logits/chosen": -2.7234580516815186, "logits/rejected": -2.701939105987549, "logps/chosen": -102.89959716796875, "logps/rejected": -105.6712646484375, "loss": 0.6592, "rewards/accuracies": 0.625, "rewards/chosen": -0.456900417804718, "rewards/margins": 0.08754035085439682, "rewards/rejected": -0.544440746307373, "step": 4060 }, { "epoch": 0.7012405237767058, "grad_norm": 10.78412914276123, "learning_rate": 9.465226793581974e-08, "logits/chosen": -2.696413993835449, "logits/rejected": -2.680673599243164, "logps/chosen": -98.06929016113281, "logps/rejected": -109.2234878540039, "loss": 0.6401, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.44293469190597534, "rewards/margins": 0.13048727810382843, "rewards/rejected": -0.5734219551086426, "step": 4070 }, { "epoch": 0.7029634734665747, "grad_norm": 13.74826717376709, "learning_rate": 9.460707255101159e-08, "logits/chosen": -2.7049059867858887, "logits/rejected": -2.696500539779663, "logps/chosen": -99.33048248291016, "logps/rejected": -108.95695495605469, "loss": 0.6559, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.4602390229701996, "rewards/margins": 0.09505314379930496, "rewards/rejected": -0.5552922487258911, "step": 4080 }, { "epoch": 0.7046864231564438, "grad_norm": 11.00015640258789, "learning_rate": 9.456169787237962e-08, "logits/chosen": -2.7260005474090576, "logits/rejected": -2.7050795555114746, "logps/chosen": -103.5158462524414, "logps/rejected": -112.79571533203125, "loss": 0.6457, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.46710270643234253, "rewards/margins": 0.1251661330461502, "rewards/rejected": -0.5922688245773315, "step": 4090 }, { "epoch": 0.7064093728463129, "grad_norm": 14.23911190032959, "learning_rate": 9.451614408230299e-08, "logits/chosen": -2.7269930839538574, "logits/rejected": -2.7070746421813965, "logps/chosen": -105.35322570800781, "logps/rejected": -111.6051025390625, "loss": 0.648, "rewards/accuracies": 0.625, "rewards/chosen": -0.5135026574134827, "rewards/margins": 0.11763473600149155, "rewards/rejected": -0.631137490272522, "step": 4100 }, { "epoch": 0.708132322536182, "grad_norm": 12.999456405639648, "learning_rate": 9.447041136388078e-08, "logits/chosen": -2.686152696609497, "logits/rejected": -2.6782896518707275, "logps/chosen": -114.95936584472656, "logps/rejected": -113.77604675292969, "loss": 0.6839, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.5828672051429749, "rewards/margins": 0.04749060794711113, "rewards/rejected": -0.6303578019142151, "step": 4110 }, { "epoch": 0.709855272226051, "grad_norm": 13.190668106079102, "learning_rate": 9.442449990093124e-08, "logits/chosen": -2.665459156036377, "logits/rejected": -2.672152042388916, "logps/chosen": -106.706787109375, "logps/rejected": -120.7962875366211, "loss": 0.6483, "rewards/accuracies": 0.625, "rewards/chosen": -0.5548613667488098, "rewards/margins": 0.11795143038034439, "rewards/rejected": -0.672812819480896, "step": 4120 }, { "epoch": 0.71157822191592, "grad_norm": 15.538352012634277, "learning_rate": 9.437840987799104e-08, "logits/chosen": -2.7314820289611816, "logits/rejected": -2.7162790298461914, "logps/chosen": -109.08930969238281, "logps/rejected": -115.02335357666016, "loss": 0.6564, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.5342675447463989, "rewards/margins": 0.1000477522611618, "rewards/rejected": -0.6343153715133667, "step": 4130 }, { "epoch": 0.7133011716057891, "grad_norm": 13.29371452331543, "learning_rate": 9.433214148031458e-08, "logits/chosen": -2.738450050354004, "logits/rejected": -2.72159481048584, "logps/chosen": -113.81733703613281, "logps/rejected": -110.34965515136719, "loss": 0.6921, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.566449761390686, "rewards/margins": 0.03347700834274292, "rewards/rejected": -0.599926769733429, "step": 4140 }, { "epoch": 0.7150241212956582, "grad_norm": 12.126227378845215, "learning_rate": 9.428569489387324e-08, "logits/chosen": -2.7552881240844727, "logits/rejected": -2.739154815673828, "logps/chosen": -108.66898345947266, "logps/rejected": -109.2908706665039, "loss": 0.6604, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.49945053458213806, "rewards/margins": 0.08895335346460342, "rewards/rejected": -0.5884039402008057, "step": 4150 }, { "epoch": 0.7167470709855273, "grad_norm": 11.490330696105957, "learning_rate": 9.423907030535459e-08, "logits/chosen": -2.6959240436553955, "logits/rejected": -2.672302722930908, "logps/chosen": -100.10540771484375, "logps/rejected": -106.18426513671875, "loss": 0.6493, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.46855083107948303, "rewards/margins": 0.111080601811409, "rewards/rejected": -0.579631507396698, "step": 4160 }, { "epoch": 0.7184700206753962, "grad_norm": 10.55395793914795, "learning_rate": 9.419226790216164e-08, "logits/chosen": -2.7417430877685547, "logits/rejected": -2.735314130783081, "logps/chosen": -97.56913757324219, "logps/rejected": -108.05329895019531, "loss": 0.6539, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.45069703459739685, "rewards/margins": 0.10149586200714111, "rewards/rejected": -0.5521928668022156, "step": 4170 }, { "epoch": 0.7201929703652653, "grad_norm": 16.450298309326172, "learning_rate": 9.414528787241215e-08, "logits/chosen": -2.7118000984191895, "logits/rejected": -2.6969642639160156, "logps/chosen": -102.15837097167969, "logps/rejected": -113.5875015258789, "loss": 0.6456, "rewards/accuracies": 0.6875, "rewards/chosen": -0.45310235023498535, "rewards/margins": 0.12073644250631332, "rewards/rejected": -0.5738388299942017, "step": 4180 }, { "epoch": 0.7219159200551344, "grad_norm": 12.765628814697266, "learning_rate": 9.409813040493783e-08, "logits/chosen": -2.709646463394165, "logits/rejected": -2.699134111404419, "logps/chosen": -102.05935668945312, "logps/rejected": -113.42268371582031, "loss": 0.6498, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5007042288780212, "rewards/margins": 0.11319047212600708, "rewards/rejected": -0.6138947010040283, "step": 4190 }, { "epoch": 0.7236388697450035, "grad_norm": 10.587715148925781, "learning_rate": 9.405079568928355e-08, "logits/chosen": -2.738844394683838, "logits/rejected": -2.7267041206359863, "logps/chosen": -105.80863952636719, "logps/rejected": -105.4742431640625, "loss": 0.6766, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.48726946115493774, "rewards/margins": 0.053386181592941284, "rewards/rejected": -0.5406556129455566, "step": 4200 }, { "epoch": 0.7253618194348725, "grad_norm": 14.55517292022705, "learning_rate": 9.400328391570665e-08, "logits/chosen": -2.7188029289245605, "logits/rejected": -2.7034573554992676, "logps/chosen": -106.10347747802734, "logps/rejected": -108.0635986328125, "loss": 0.6752, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.5038017630577087, "rewards/margins": 0.060950059443712234, "rewards/rejected": -0.5647518038749695, "step": 4210 }, { "epoch": 0.7270847691247415, "grad_norm": 8.819595336914062, "learning_rate": 9.395559527517611e-08, "logits/chosen": -2.6296181678771973, "logits/rejected": -2.6180548667907715, "logps/chosen": -98.534423828125, "logps/rejected": -107.92878723144531, "loss": 0.6527, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.4547065794467926, "rewards/margins": 0.10374270379543304, "rewards/rejected": -0.5584492683410645, "step": 4220 }, { "epoch": 0.7288077188146106, "grad_norm": 12.428403854370117, "learning_rate": 9.390772995937181e-08, "logits/chosen": -2.767120361328125, "logits/rejected": -2.750506639480591, "logps/chosen": -105.1033935546875, "logps/rejected": -111.66764831542969, "loss": 0.653, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.4818612039089203, "rewards/margins": 0.10422980785369873, "rewards/rejected": -0.5860909819602966, "step": 4230 }, { "epoch": 0.7305306685044797, "grad_norm": 11.653403282165527, "learning_rate": 9.385968816068377e-08, "logits/chosen": -2.6813740730285645, "logits/rejected": -2.6649351119995117, "logps/chosen": -102.3239517211914, "logps/rejected": -112.22544860839844, "loss": 0.6529, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.4676525592803955, "rewards/margins": 0.10822185128927231, "rewards/rejected": -0.575874388217926, "step": 4240 }, { "epoch": 0.7322536181943488, "grad_norm": 12.479633331298828, "learning_rate": 9.381147007221137e-08, "logits/chosen": -2.6995177268981934, "logits/rejected": -2.6860814094543457, "logps/chosen": -100.09844970703125, "logps/rejected": -103.00732421875, "loss": 0.665, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.4368916153907776, "rewards/margins": 0.07490735501050949, "rewards/rejected": -0.5117989778518677, "step": 4250 }, { "epoch": 0.7339765678842178, "grad_norm": 11.699207305908203, "learning_rate": 9.376307588776258e-08, "logits/chosen": -2.6861469745635986, "logits/rejected": -2.671250581741333, "logps/chosen": -99.37552642822266, "logps/rejected": -108.71788024902344, "loss": 0.6508, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.4348589777946472, "rewards/margins": 0.11435357481241226, "rewards/rejected": -0.5492125749588013, "step": 4260 }, { "epoch": 0.7356995175740868, "grad_norm": 13.502720832824707, "learning_rate": 9.371450580185314e-08, "logits/chosen": -2.6920905113220215, "logits/rejected": -2.6746673583984375, "logps/chosen": -92.69775390625, "logps/rejected": -99.46900177001953, "loss": 0.6582, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.39805346727371216, "rewards/margins": 0.09268694370985031, "rewards/rejected": -0.4907403588294983, "step": 4270 }, { "epoch": 0.7374224672639559, "grad_norm": 10.148627281188965, "learning_rate": 9.366576000970581e-08, "logits/chosen": -2.689373016357422, "logits/rejected": -2.674402952194214, "logps/chosen": -96.89897155761719, "logps/rejected": -106.751708984375, "loss": 0.6472, "rewards/accuracies": 0.65625, "rewards/chosen": -0.4402523934841156, "rewards/margins": 0.11912872642278671, "rewards/rejected": -0.5593811273574829, "step": 4280 }, { "epoch": 0.739145416953825, "grad_norm": 13.121967315673828, "learning_rate": 9.36168387072496e-08, "logits/chosen": -2.6916027069091797, "logits/rejected": -2.6764674186706543, "logps/chosen": -104.63089752197266, "logps/rejected": -107.55049896240234, "loss": 0.666, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.4880262017250061, "rewards/margins": 0.08436858654022217, "rewards/rejected": -0.5723947286605835, "step": 4290 }, { "epoch": 0.740868366643694, "grad_norm": 9.410064697265625, "learning_rate": 9.356774209111899e-08, "logits/chosen": -2.713801383972168, "logits/rejected": -2.706458568572998, "logps/chosen": -99.73240661621094, "logps/rejected": -108.1336441040039, "loss": 0.6515, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.45986419916152954, "rewards/margins": 0.10472697019577026, "rewards/rejected": -0.5645912289619446, "step": 4300 }, { "epoch": 0.742591316333563, "grad_norm": 10.330328941345215, "learning_rate": 9.351847035865306e-08, "logits/chosen": -2.6547112464904785, "logits/rejected": -2.6392548084259033, "logps/chosen": -103.56089782714844, "logps/rejected": -110.84979248046875, "loss": 0.6469, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.48212283849716187, "rewards/margins": 0.11652640253305435, "rewards/rejected": -0.598649263381958, "step": 4310 }, { "epoch": 0.7443142660234321, "grad_norm": 14.346282005310059, "learning_rate": 9.346902370789482e-08, "logits/chosen": -2.704272985458374, "logits/rejected": -2.6856577396392822, "logps/chosen": -111.2483901977539, "logps/rejected": -120.86759185791016, "loss": 0.6321, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.5146492719650269, "rewards/margins": 0.1525907665491104, "rewards/rejected": -0.6672400236129761, "step": 4320 }, { "epoch": 0.7460372157133012, "grad_norm": 10.199442863464355, "learning_rate": 9.341940233759028e-08, "logits/chosen": -2.6659274101257324, "logits/rejected": -2.6443660259246826, "logps/chosen": -110.65473937988281, "logps/rejected": -112.12313079833984, "loss": 0.6677, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.5498034954071045, "rewards/margins": 0.08058689534664154, "rewards/rejected": -0.6303903460502625, "step": 4330 }, { "epoch": 0.7477601654031703, "grad_norm": 13.762100219726562, "learning_rate": 9.336960644718777e-08, "logits/chosen": -2.6515679359436035, "logits/rejected": -2.6344714164733887, "logps/chosen": -101.73217010498047, "logps/rejected": -115.3052978515625, "loss": 0.6392, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.5107470750808716, "rewards/margins": 0.14539656043052673, "rewards/rejected": -0.6561435461044312, "step": 4340 }, { "epoch": 0.7494831150930393, "grad_norm": 18.938304901123047, "learning_rate": 9.331963623683704e-08, "logits/chosen": -2.6708264350891113, "logits/rejected": -2.66340970993042, "logps/chosen": -101.90260314941406, "logps/rejected": -113.85909271240234, "loss": 0.6523, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.4957842230796814, "rewards/margins": 0.11152273416519165, "rewards/rejected": -0.6073070168495178, "step": 4350 }, { "epoch": 0.7512060647829083, "grad_norm": 12.086625099182129, "learning_rate": 9.326949190738855e-08, "logits/chosen": -2.7044968605041504, "logits/rejected": -2.689762592315674, "logps/chosen": -112.76841735839844, "logps/rejected": -117.0528335571289, "loss": 0.6693, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.5807456970214844, "rewards/margins": 0.08174094557762146, "rewards/rejected": -0.6624866724014282, "step": 4360 }, { "epoch": 0.7529290144727774, "grad_norm": 13.480952262878418, "learning_rate": 9.32191736603926e-08, "logits/chosen": -2.7097010612487793, "logits/rejected": -2.695061683654785, "logps/chosen": -107.33966064453125, "logps/rejected": -117.2765121459961, "loss": 0.6499, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.5145081281661987, "rewards/margins": 0.11671149730682373, "rewards/rejected": -0.6312196254730225, "step": 4370 }, { "epoch": 0.7546519641626465, "grad_norm": 14.373209953308105, "learning_rate": 9.316868169809851e-08, "logits/chosen": -2.732989549636841, "logits/rejected": -2.7170796394348145, "logps/chosen": -110.73612213134766, "logps/rejected": -111.71165466308594, "loss": 0.6827, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.5497013330459595, "rewards/margins": 0.050543613731861115, "rewards/rejected": -0.6002449989318848, "step": 4380 }, { "epoch": 0.7563749138525155, "grad_norm": 12.738282203674316, "learning_rate": 9.311801622345386e-08, "logits/chosen": -2.6796634197235107, "logits/rejected": -2.6703240871429443, "logps/chosen": -103.5061264038086, "logps/rejected": -116.71205139160156, "loss": 0.6384, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.49565204977989197, "rewards/margins": 0.14099957048892975, "rewards/rejected": -0.6366516351699829, "step": 4390 }, { "epoch": 0.7580978635423845, "grad_norm": 15.85651969909668, "learning_rate": 9.306717744010364e-08, "logits/chosen": -2.699371814727783, "logits/rejected": -2.6866402626037598, "logps/chosen": -111.85618591308594, "logps/rejected": -117.50447082519531, "loss": 0.6562, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.5292888879776001, "rewards/margins": 0.09612816572189331, "rewards/rejected": -0.6254171133041382, "step": 4400 }, { "epoch": 0.7580978635423845, "eval_logits/chosen": -2.7548279762268066, "eval_logits/rejected": -2.751525640487671, "eval_logps/chosen": -99.73304748535156, "eval_logps/rejected": -110.64991760253906, "eval_loss": 0.6667318344116211, "eval_rewards/accuracies": 0.598280668258667, "eval_rewards/chosen": -0.407175749540329, "eval_rewards/margins": 0.07182740420103073, "eval_rewards/rejected": -0.47900310158729553, "eval_runtime": 382.9812, "eval_samples_per_second": 11.238, "eval_steps_per_second": 1.405, "step": 4400 }, { "epoch": 0.7598208132322536, "grad_norm": 14.734321594238281, "learning_rate": 9.301616555238942e-08, "logits/chosen": -2.641871213912964, "logits/rejected": -2.6299562454223633, "logps/chosen": -112.2957534790039, "logps/rejected": -118.21693420410156, "loss": 0.6678, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.5567243099212646, "rewards/margins": 0.08240054547786713, "rewards/rejected": -0.639124870300293, "step": 4410 }, { "epoch": 0.7615437629221227, "grad_norm": 13.55246639251709, "learning_rate": 9.296498076534858e-08, "logits/chosen": -2.7524256706237793, "logits/rejected": -2.717398166656494, "logps/chosen": -111.02726745605469, "logps/rejected": -113.57003021240234, "loss": 0.6631, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5495994687080383, "rewards/margins": 0.08970017731189728, "rewards/rejected": -0.6392996311187744, "step": 4420 }, { "epoch": 0.7632667126119917, "grad_norm": 14.815827369689941, "learning_rate": 9.291362328471341e-08, "logits/chosen": -2.6723484992980957, "logits/rejected": -2.647158622741699, "logps/chosen": -106.09244537353516, "logps/rejected": -112.78398132324219, "loss": 0.6552, "rewards/accuracies": 0.59375, "rewards/chosen": -0.5166307091712952, "rewards/margins": 0.10847940295934677, "rewards/rejected": -0.6251100301742554, "step": 4430 }, { "epoch": 0.7649896623018608, "grad_norm": 13.4649076461792, "learning_rate": 9.286209331691037e-08, "logits/chosen": -2.705343246459961, "logits/rejected": -2.6825528144836426, "logps/chosen": -113.4854736328125, "logps/rejected": -121.1431884765625, "loss": 0.6418, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5344313979148865, "rewards/margins": 0.1367706060409546, "rewards/rejected": -0.6712020039558411, "step": 4440 }, { "epoch": 0.7667126119917298, "grad_norm": 12.157050132751465, "learning_rate": 9.281039106905916e-08, "logits/chosen": -2.6409859657287598, "logits/rejected": -2.6314549446105957, "logps/chosen": -107.29130554199219, "logps/rejected": -113.2594985961914, "loss": 0.6558, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.5019858479499817, "rewards/margins": 0.10079322010278702, "rewards/rejected": -0.6027790307998657, "step": 4450 }, { "epoch": 0.7684355616815989, "grad_norm": 10.632394790649414, "learning_rate": 9.275851674897203e-08, "logits/chosen": -2.711153507232666, "logits/rejected": -2.704730749130249, "logps/chosen": -99.85983276367188, "logps/rejected": -111.9957504272461, "loss": 0.6438, "rewards/accuracies": 0.65625, "rewards/chosen": -0.4664650559425354, "rewards/margins": 0.1344144493341446, "rewards/rejected": -0.6008794903755188, "step": 4460 }, { "epoch": 0.770158511371468, "grad_norm": 11.28720474243164, "learning_rate": 9.270647056515275e-08, "logits/chosen": -2.7313551902770996, "logits/rejected": -2.709290027618408, "logps/chosen": -104.88045501708984, "logps/rejected": -109.1452407836914, "loss": 0.6513, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.4840072989463806, "rewards/margins": 0.11140353977680206, "rewards/rejected": -0.5954108238220215, "step": 4470 }, { "epoch": 0.771881461061337, "grad_norm": 14.331686019897461, "learning_rate": 9.265425272679596e-08, "logits/chosen": -2.7455172538757324, "logits/rejected": -2.7380411624908447, "logps/chosen": -104.9108657836914, "logps/rejected": -113.90400695800781, "loss": 0.659, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.5284455418586731, "rewards/margins": 0.09997548907995224, "rewards/rejected": -0.6284209489822388, "step": 4480 }, { "epoch": 0.7736044107512061, "grad_norm": 12.461320877075195, "learning_rate": 9.260186344378623e-08, "logits/chosen": -2.671069622039795, "logits/rejected": -2.648007869720459, "logps/chosen": -101.6649169921875, "logps/rejected": -108.2196044921875, "loss": 0.6586, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.4936288893222809, "rewards/margins": 0.09367899596691132, "rewards/rejected": -0.5873079299926758, "step": 4490 }, { "epoch": 0.7753273604410751, "grad_norm": 16.6155948638916, "learning_rate": 9.254930292669723e-08, "logits/chosen": -2.6818337440490723, "logits/rejected": -2.6689093112945557, "logps/chosen": -108.67326354980469, "logps/rejected": -112.25102233886719, "loss": 0.6617, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.5086537599563599, "rewards/margins": 0.09293018281459808, "rewards/rejected": -0.6015838384628296, "step": 4500 }, { "epoch": 0.7770503101309442, "grad_norm": 10.590887069702148, "learning_rate": 9.249657138679084e-08, "logits/chosen": -2.7585530281066895, "logits/rejected": -2.7360479831695557, "logps/chosen": -104.07576751708984, "logps/rejected": -118.41740417480469, "loss": 0.6364, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.4872332215309143, "rewards/margins": 0.14657996594905853, "rewards/rejected": -0.633813202381134, "step": 4510 }, { "epoch": 0.7787732598208132, "grad_norm": 11.587627410888672, "learning_rate": 9.244366903601644e-08, "logits/chosen": -2.7139852046966553, "logits/rejected": -2.6992831230163574, "logps/chosen": -108.73707580566406, "logps/rejected": -113.06243896484375, "loss": 0.6659, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.552254855632782, "rewards/margins": 0.0821274071931839, "rewards/rejected": -0.6343822479248047, "step": 4520 }, { "epoch": 0.7804962095106823, "grad_norm": 13.454439163208008, "learning_rate": 9.239059608700992e-08, "logits/chosen": -2.697965145111084, "logits/rejected": -2.689426898956299, "logps/chosen": -105.71885681152344, "logps/rejected": -111.27046203613281, "loss": 0.671, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5099459886550903, "rewards/margins": 0.07720411568880081, "rewards/rejected": -0.5871500968933105, "step": 4530 }, { "epoch": 0.7822191592005513, "grad_norm": 13.542638778686523, "learning_rate": 9.233735275309287e-08, "logits/chosen": -2.6490254402160645, "logits/rejected": -2.635833501815796, "logps/chosen": -102.23767852783203, "logps/rejected": -109.13404846191406, "loss": 0.6522, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.48313722014427185, "rewards/margins": 0.1087016835808754, "rewards/rejected": -0.5918388962745667, "step": 4540 }, { "epoch": 0.7839421088904204, "grad_norm": 17.139244079589844, "learning_rate": 9.228393924827173e-08, "logits/chosen": -2.7150959968566895, "logits/rejected": -2.7003800868988037, "logps/chosen": -107.5324935913086, "logps/rejected": -113.44700622558594, "loss": 0.6557, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.49557629227638245, "rewards/margins": 0.10561362653970718, "rewards/rejected": -0.6011899709701538, "step": 4550 }, { "epoch": 0.7856650585802895, "grad_norm": 13.037927627563477, "learning_rate": 9.223035578723695e-08, "logits/chosen": -2.651930570602417, "logits/rejected": -2.6261329650878906, "logps/chosen": -106.41754150390625, "logps/rejected": -117.70652770996094, "loss": 0.6305, "rewards/accuracies": 0.71875, "rewards/chosen": -0.4938697814941406, "rewards/margins": 0.16125063598155975, "rewards/rejected": -0.6551204323768616, "step": 4560 }, { "epoch": 0.7873880082701585, "grad_norm": 17.923662185668945, "learning_rate": 9.217660258536204e-08, "logits/chosen": -2.6618800163269043, "logits/rejected": -2.64208984375, "logps/chosen": -107.5634994506836, "logps/rejected": -117.63850402832031, "loss": 0.6547, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.5490785837173462, "rewards/margins": 0.11649944633245468, "rewards/rejected": -0.6655780076980591, "step": 4570 }, { "epoch": 0.7891109579600276, "grad_norm": 13.755855560302734, "learning_rate": 9.212267985870285e-08, "logits/chosen": -2.6529924869537354, "logits/rejected": -2.632174015045166, "logps/chosen": -100.74827575683594, "logps/rejected": -108.7590103149414, "loss": 0.6465, "rewards/accuracies": 0.65625, "rewards/chosen": -0.45046645402908325, "rewards/margins": 0.12104056030511856, "rewards/rejected": -0.5715069770812988, "step": 4580 }, { "epoch": 0.7908339076498966, "grad_norm": 17.974578857421875, "learning_rate": 9.206858782399655e-08, "logits/chosen": -2.7094898223876953, "logits/rejected": -2.6892411708831787, "logps/chosen": -111.24505615234375, "logps/rejected": -114.80848693847656, "loss": 0.6729, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.5568535923957825, "rewards/margins": 0.07443558424711227, "rewards/rejected": -0.6312891840934753, "step": 4590 }, { "epoch": 0.7925568573397657, "grad_norm": 11.914977073669434, "learning_rate": 9.201432669866086e-08, "logits/chosen": -2.6332201957702637, "logits/rejected": -2.617691993713379, "logps/chosen": -107.5090560913086, "logps/rejected": -125.3719482421875, "loss": 0.6198, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5328209400177002, "rewards/margins": 0.18362154066562653, "rewards/rejected": -0.7164424657821655, "step": 4600 }, { "epoch": 0.7942798070296347, "grad_norm": 13.614599227905273, "learning_rate": 9.195989670079314e-08, "logits/chosen": -2.654611587524414, "logits/rejected": -2.647090435028076, "logps/chosen": -113.46919250488281, "logps/rejected": -117.45033264160156, "loss": 0.6792, "rewards/accuracies": 0.53125, "rewards/chosen": -0.6137629151344299, "rewards/margins": 0.0618717297911644, "rewards/rejected": -0.6756345629692078, "step": 4610 }, { "epoch": 0.7960027567195038, "grad_norm": 18.413545608520508, "learning_rate": 9.190529804916952e-08, "logits/chosen": -2.7000679969787598, "logits/rejected": -2.679298162460327, "logps/chosen": -110.68448638916016, "logps/rejected": -121.6512680053711, "loss": 0.6427, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5395559072494507, "rewards/margins": 0.12529203295707703, "rewards/rejected": -0.6648479104042053, "step": 4620 }, { "epoch": 0.7977257064093728, "grad_norm": 13.928332328796387, "learning_rate": 9.1850530963244e-08, "logits/chosen": -2.7029757499694824, "logits/rejected": -2.677156686782837, "logps/chosen": -115.1556396484375, "logps/rejected": -125.21661376953125, "loss": 0.6445, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.5706117153167725, "rewards/margins": 0.1268078088760376, "rewards/rejected": -0.6974195241928101, "step": 4630 }, { "epoch": 0.7994486560992419, "grad_norm": 11.631983757019043, "learning_rate": 9.179559566314761e-08, "logits/chosen": -2.693277359008789, "logits/rejected": -2.685077667236328, "logps/chosen": -115.26069641113281, "logps/rejected": -125.75660705566406, "loss": 0.6575, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.6026903390884399, "rewards/margins": 0.10752693563699722, "rewards/rejected": -0.7102171182632446, "step": 4640 }, { "epoch": 0.801171605789111, "grad_norm": 13.562352180480957, "learning_rate": 9.174049236968749e-08, "logits/chosen": -2.6991212368011475, "logits/rejected": -2.676464557647705, "logps/chosen": -111.49564361572266, "logps/rejected": -118.50048828125, "loss": 0.6501, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.5621426701545715, "rewards/margins": 0.11947949975728989, "rewards/rejected": -0.6816221475601196, "step": 4650 }, { "epoch": 0.80289455547898, "grad_norm": 21.77356719970703, "learning_rate": 9.168522130434598e-08, "logits/chosen": -2.6700754165649414, "logits/rejected": -2.6562094688415527, "logps/chosen": -107.92274475097656, "logps/rejected": -114.88655853271484, "loss": 0.6583, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5256450772285461, "rewards/margins": 0.09301182627677917, "rewards/rejected": -0.6186568737030029, "step": 4660 }, { "epoch": 0.8046175051688491, "grad_norm": 13.408563613891602, "learning_rate": 9.162978268927982e-08, "logits/chosen": -2.6990115642547607, "logits/rejected": -2.682617664337158, "logps/chosen": -105.44071197509766, "logps/rejected": -113.16312408447266, "loss": 0.6446, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.5250687599182129, "rewards/margins": 0.12488999217748642, "rewards/rejected": -0.6499587297439575, "step": 4670 }, { "epoch": 0.8063404548587181, "grad_norm": 13.76351547241211, "learning_rate": 9.157417674731917e-08, "logits/chosen": -2.6879332065582275, "logits/rejected": -2.6660237312316895, "logps/chosen": -110.22966003417969, "logps/rejected": -118.12898254394531, "loss": 0.6602, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.5635314583778381, "rewards/margins": 0.10251311212778091, "rewards/rejected": -0.6660445332527161, "step": 4680 }, { "epoch": 0.8080634045485872, "grad_norm": 13.860352516174316, "learning_rate": 9.151840370196677e-08, "logits/chosen": -2.7110648155212402, "logits/rejected": -2.69399356842041, "logps/chosen": -114.106689453125, "logps/rejected": -126.93440246582031, "loss": 0.6373, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.5762235522270203, "rewards/margins": 0.15776102244853973, "rewards/rejected": -0.7339845895767212, "step": 4690 }, { "epoch": 0.8097863542384562, "grad_norm": 16.746814727783203, "learning_rate": 9.146246377739695e-08, "logits/chosen": -2.6945366859436035, "logits/rejected": -2.6857993602752686, "logps/chosen": -112.3055191040039, "logps/rejected": -128.11129760742188, "loss": 0.6441, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.5938907861709595, "rewards/margins": 0.14227768778800964, "rewards/rejected": -0.7361685037612915, "step": 4700 }, { "epoch": 0.8115093039283253, "grad_norm": 15.643275260925293, "learning_rate": 9.140635719845486e-08, "logits/chosen": -2.723626136779785, "logits/rejected": -2.6965630054473877, "logps/chosen": -115.5739974975586, "logps/rejected": -123.12554931640625, "loss": 0.6351, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.5669673681259155, "rewards/margins": 0.14785338938236237, "rewards/rejected": -0.7148207426071167, "step": 4710 }, { "epoch": 0.8132322536181944, "grad_norm": 14.575557708740234, "learning_rate": 9.135008419065549e-08, "logits/chosen": -2.6363587379455566, "logits/rejected": -2.6148293018341064, "logps/chosen": -116.8868408203125, "logps/rejected": -129.30899047851562, "loss": 0.624, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.5912224054336548, "rewards/margins": 0.18092842400074005, "rewards/rejected": -0.7721508145332336, "step": 4720 }, { "epoch": 0.8149552033080634, "grad_norm": 16.16574478149414, "learning_rate": 9.129364498018274e-08, "logits/chosen": -2.6242127418518066, "logits/rejected": -2.604121446609497, "logps/chosen": -119.1768798828125, "logps/rejected": -125.98077392578125, "loss": 0.6594, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.642062783241272, "rewards/margins": 0.11331970989704132, "rewards/rejected": -0.7553825378417969, "step": 4730 }, { "epoch": 0.8166781529979324, "grad_norm": 15.485376358032227, "learning_rate": 9.12370397938886e-08, "logits/chosen": -2.6874778270721436, "logits/rejected": -2.6816353797912598, "logps/chosen": -110.1086196899414, "logps/rejected": -124.9239730834961, "loss": 0.6253, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5432339906692505, "rewards/margins": 0.1717141717672348, "rewards/rejected": -0.7149480581283569, "step": 4740 }, { "epoch": 0.8184011026878015, "grad_norm": 13.701244354248047, "learning_rate": 9.118026885929214e-08, "logits/chosen": -2.6680829524993896, "logits/rejected": -2.6603760719299316, "logps/chosen": -114.32022857666016, "logps/rejected": -121.4356918334961, "loss": 0.6575, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.5776886343955994, "rewards/margins": 0.09786656498908997, "rewards/rejected": -0.6755552887916565, "step": 4750 }, { "epoch": 0.8201240523776706, "grad_norm": 15.894256591796875, "learning_rate": 9.112333240457866e-08, "logits/chosen": -2.6610710620880127, "logits/rejected": -2.6477136611938477, "logps/chosen": -117.54109954833984, "logps/rejected": -126.5459213256836, "loss": 0.6537, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.6480070352554321, "rewards/margins": 0.11084611713886261, "rewards/rejected": -0.7588531374931335, "step": 4760 }, { "epoch": 0.8218470020675396, "grad_norm": 15.977734565734863, "learning_rate": 9.106623065859873e-08, "logits/chosen": -2.703068971633911, "logits/rejected": -2.6900250911712646, "logps/chosen": -128.57907104492188, "logps/rejected": -133.2577362060547, "loss": 0.661, "rewards/accuracies": 0.59375, "rewards/chosen": -0.6808390021324158, "rewards/margins": 0.11281619220972061, "rewards/rejected": -0.793655276298523, "step": 4770 }, { "epoch": 0.8235699517574087, "grad_norm": 15.172703742980957, "learning_rate": 9.100896385086731e-08, "logits/chosen": -2.6161365509033203, "logits/rejected": -2.6057090759277344, "logps/chosen": -117.88389587402344, "logps/rejected": -132.16099548339844, "loss": 0.637, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.6521201729774475, "rewards/margins": 0.14359167218208313, "rewards/rejected": -0.795711874961853, "step": 4780 }, { "epoch": 0.8252929014472777, "grad_norm": 12.677447319030762, "learning_rate": 9.095153221156283e-08, "logits/chosen": -2.668921709060669, "logits/rejected": -2.6519389152526855, "logps/chosen": -127.98905944824219, "logps/rejected": -126.518310546875, "loss": 0.6862, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6938291192054749, "rewards/margins": 0.050543297082185745, "rewards/rejected": -0.7443724274635315, "step": 4790 }, { "epoch": 0.8270158511371468, "grad_norm": 11.560530662536621, "learning_rate": 9.089393597152619e-08, "logits/chosen": -2.6407485008239746, "logits/rejected": -2.633110761642456, "logps/chosen": -114.29498291015625, "logps/rejected": -122.00848388671875, "loss": 0.6569, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.6159349679946899, "rewards/margins": 0.10554057359695435, "rewards/rejected": -0.7214756011962891, "step": 4800 }, { "epoch": 0.8270158511371468, "eval_logits/chosen": -2.731581687927246, "eval_logits/rejected": -2.7282795906066895, "eval_logps/chosen": -108.52731323242188, "eval_logps/rejected": -120.57421875, "eval_loss": 0.6637259721755981, "eval_rewards/accuracies": 0.6059479713439941, "eval_rewards/chosen": -0.49511826038360596, "eval_rewards/margins": 0.08312792330980301, "eval_rewards/rejected": -0.5782462358474731, "eval_runtime": 383.3031, "eval_samples_per_second": 11.229, "eval_steps_per_second": 1.404, "step": 4800 }, { "epoch": 0.8287388008270159, "grad_norm": 12.14091682434082, "learning_rate": 9.083617536225994e-08, "logits/chosen": -2.6535537242889404, "logits/rejected": -2.6268935203552246, "logps/chosen": -118.03663635253906, "logps/rejected": -123.8675765991211, "loss": 0.6429, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5971187353134155, "rewards/margins": 0.13232539594173431, "rewards/rejected": -0.729444146156311, "step": 4810 }, { "epoch": 0.8304617505168849, "grad_norm": 13.891697883605957, "learning_rate": 9.077825061592729e-08, "logits/chosen": -2.6767055988311768, "logits/rejected": -2.670660972595215, "logps/chosen": -114.75425720214844, "logps/rejected": -124.70011138916016, "loss": 0.6553, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.6179844737052917, "rewards/margins": 0.11821053922176361, "rewards/rejected": -0.7361949682235718, "step": 4820 }, { "epoch": 0.832184700206754, "grad_norm": 14.244720458984375, "learning_rate": 9.072016196535112e-08, "logits/chosen": -2.6639866828918457, "logits/rejected": -2.6526284217834473, "logps/chosen": -115.40333557128906, "logps/rejected": -121.971435546875, "loss": 0.664, "rewards/accuracies": 0.59375, "rewards/chosen": -0.6033909320831299, "rewards/margins": 0.0941086858510971, "rewards/rejected": -0.6974996328353882, "step": 4830 }, { "epoch": 0.833907649896623, "grad_norm": 13.105058670043945, "learning_rate": 9.066190964401321e-08, "logits/chosen": -2.6423277854919434, "logits/rejected": -2.619694232940674, "logps/chosen": -121.77622985839844, "logps/rejected": -130.93408203125, "loss": 0.6435, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.6425229907035828, "rewards/margins": 0.1487221121788025, "rewards/rejected": -0.7912451028823853, "step": 4840 }, { "epoch": 0.8356305995864921, "grad_norm": 14.29844856262207, "learning_rate": 9.060349388605313e-08, "logits/chosen": -2.6907999515533447, "logits/rejected": -2.6778297424316406, "logps/chosen": -113.85269927978516, "logps/rejected": -126.59355163574219, "loss": 0.6355, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.5850516557693481, "rewards/margins": 0.1524309366941452, "rewards/rejected": -0.7374826073646545, "step": 4850 }, { "epoch": 0.8373535492763611, "grad_norm": 20.311260223388672, "learning_rate": 9.054491492626736e-08, "logits/chosen": -2.683197021484375, "logits/rejected": -2.656275987625122, "logps/chosen": -124.2918930053711, "logps/rejected": -120.60951232910156, "loss": 0.6815, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.6485036015510559, "rewards/margins": 0.058196693658828735, "rewards/rejected": -0.7067002654075623, "step": 4860 }, { "epoch": 0.8390764989662302, "grad_norm": 16.1400203704834, "learning_rate": 9.048617300010839e-08, "logits/chosen": -2.708627939224243, "logits/rejected": -2.689025402069092, "logps/chosen": -121.8497543334961, "logps/rejected": -129.84942626953125, "loss": 0.6372, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6298328638076782, "rewards/margins": 0.15910618007183075, "rewards/rejected": -0.7889389991760254, "step": 4870 }, { "epoch": 0.8407994486560992, "grad_norm": 16.34994125366211, "learning_rate": 9.042726834368372e-08, "logits/chosen": -2.634161949157715, "logits/rejected": -2.6114230155944824, "logps/chosen": -116.34796142578125, "logps/rejected": -122.90814208984375, "loss": 0.66, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.6260043978691101, "rewards/margins": 0.10967379808425903, "rewards/rejected": -0.7356782555580139, "step": 4880 }, { "epoch": 0.8425223983459683, "grad_norm": 16.96595573425293, "learning_rate": 9.036820119375494e-08, "logits/chosen": -2.6891283988952637, "logits/rejected": -2.6712119579315186, "logps/chosen": -117.00113677978516, "logps/rejected": -132.6031494140625, "loss": 0.6201, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6106215119361877, "rewards/margins": 0.18790537118911743, "rewards/rejected": -0.7985268831253052, "step": 4890 }, { "epoch": 0.8442453480358374, "grad_norm": 17.874977111816406, "learning_rate": 9.030897178773676e-08, "logits/chosen": -2.642887592315674, "logits/rejected": -2.6208043098449707, "logps/chosen": -115.6777572631836, "logps/rejected": -121.92088317871094, "loss": 0.6622, "rewards/accuracies": 0.625, "rewards/chosen": -0.6188615560531616, "rewards/margins": 0.10674705356359482, "rewards/rejected": -0.7256086468696594, "step": 4900 }, { "epoch": 0.8459682977257064, "grad_norm": 17.601911544799805, "learning_rate": 9.024958036369604e-08, "logits/chosen": -2.74424409866333, "logits/rejected": -2.7178235054016113, "logps/chosen": -116.89372253417969, "logps/rejected": -123.71903228759766, "loss": 0.6526, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.5964015126228333, "rewards/margins": 0.124220110476017, "rewards/rejected": -0.7206215262413025, "step": 4910 }, { "epoch": 0.8476912474155754, "grad_norm": 22.39488983154297, "learning_rate": 9.019002716035091e-08, "logits/chosen": -2.6259522438049316, "logits/rejected": -2.612258195877075, "logps/chosen": -112.20829010009766, "logps/rejected": -125.56230163574219, "loss": 0.6375, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.5727871060371399, "rewards/margins": 0.1611333191394806, "rewards/rejected": -0.7339202761650085, "step": 4920 }, { "epoch": 0.8494141971054445, "grad_norm": 17.15035057067871, "learning_rate": 9.013031241706971e-08, "logits/chosen": -2.741135358810425, "logits/rejected": -2.7288200855255127, "logps/chosen": -115.10804748535156, "logps/rejected": -127.43843078613281, "loss": 0.6655, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.6084795594215393, "rewards/margins": 0.09732518345117569, "rewards/rejected": -0.705804705619812, "step": 4930 }, { "epoch": 0.8511371467953136, "grad_norm": 21.132137298583984, "learning_rate": 9.007043637387009e-08, "logits/chosen": -2.705995798110962, "logits/rejected": -2.6840262413024902, "logps/chosen": -112.5478286743164, "logps/rejected": -121.37569427490234, "loss": 0.6437, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.5596413612365723, "rewards/margins": 0.1397586166858673, "rewards/rejected": -0.699400007724762, "step": 4940 }, { "epoch": 0.8528600964851827, "grad_norm": 16.019283294677734, "learning_rate": 9.001039927141802e-08, "logits/chosen": -2.5744900703430176, "logits/rejected": -2.558807134628296, "logps/chosen": -108.00617980957031, "logps/rejected": -119.18704986572266, "loss": 0.6459, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.5191596150398254, "rewards/margins": 0.1343960464000702, "rewards/rejected": -0.6535556316375732, "step": 4950 }, { "epoch": 0.8545830461750517, "grad_norm": 20.813732147216797, "learning_rate": 8.995020135102685e-08, "logits/chosen": -2.6362557411193848, "logits/rejected": -2.641608476638794, "logps/chosen": -108.81939697265625, "logps/rejected": -129.12559509277344, "loss": 0.6246, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.5596833229064941, "rewards/margins": 0.1810264140367508, "rewards/rejected": -0.7407097816467285, "step": 4960 }, { "epoch": 0.8563059958649207, "grad_norm": 14.945688247680664, "learning_rate": 8.988984285465631e-08, "logits/chosen": -2.620194911956787, "logits/rejected": -2.6145381927490234, "logps/chosen": -112.78028869628906, "logps/rejected": -125.85223388671875, "loss": 0.6408, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.600356936454773, "rewards/margins": 0.14802278578281403, "rewards/rejected": -0.7483798265457153, "step": 4970 }, { "epoch": 0.8580289455547898, "grad_norm": 15.188470840454102, "learning_rate": 8.982932402491154e-08, "logits/chosen": -2.659909725189209, "logits/rejected": -2.656132698059082, "logps/chosen": -115.32022857666016, "logps/rejected": -127.0588607788086, "loss": 0.6508, "rewards/accuracies": 0.59375, "rewards/chosen": -0.6081004738807678, "rewards/margins": 0.1243026852607727, "rewards/rejected": -0.7324031591415405, "step": 4980 }, { "epoch": 0.8597518952446589, "grad_norm": 20.90500831604004, "learning_rate": 8.976864510504217e-08, "logits/chosen": -2.6207499504089355, "logits/rejected": -2.615877151489258, "logps/chosen": -115.70672607421875, "logps/rejected": -133.36627197265625, "loss": 0.635, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.631032407283783, "rewards/margins": 0.15860576927661896, "rewards/rejected": -0.7896381616592407, "step": 4990 }, { "epoch": 0.8614748449345279, "grad_norm": 18.265804290771484, "learning_rate": 8.970780633894122e-08, "logits/chosen": -2.6352665424346924, "logits/rejected": -2.6178627014160156, "logps/chosen": -118.26082611083984, "logps/rejected": -130.56179809570312, "loss": 0.6308, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6121785640716553, "rewards/margins": 0.17027755081653595, "rewards/rejected": -0.78245609998703, "step": 5000 }, { "epoch": 0.8631977946243969, "grad_norm": 19.394807815551758, "learning_rate": 8.964680797114426e-08, "logits/chosen": -2.6262760162353516, "logits/rejected": -2.6074700355529785, "logps/chosen": -121.80794525146484, "logps/rejected": -132.30230712890625, "loss": 0.6437, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.6812965869903564, "rewards/margins": 0.1452482044696808, "rewards/rejected": -0.8265447616577148, "step": 5010 }, { "epoch": 0.864920744314266, "grad_norm": 12.27147388458252, "learning_rate": 8.958565024682836e-08, "logits/chosen": -2.6230664253234863, "logits/rejected": -2.6053595542907715, "logps/chosen": -116.36280822753906, "logps/rejected": -130.62164306640625, "loss": 0.6335, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.6199180483818054, "rewards/margins": 0.16995711624622345, "rewards/rejected": -0.7898751497268677, "step": 5020 }, { "epoch": 0.8666436940041351, "grad_norm": 20.50470733642578, "learning_rate": 8.952433341181107e-08, "logits/chosen": -2.6090950965881348, "logits/rejected": -2.6000587940216064, "logps/chosen": -120.79141998291016, "logps/rejected": -133.48959350585938, "loss": 0.6455, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.6697946786880493, "rewards/margins": 0.13562212884426117, "rewards/rejected": -0.8054167628288269, "step": 5030 }, { "epoch": 0.8683666436940042, "grad_norm": 20.943683624267578, "learning_rate": 8.946285771254948e-08, "logits/chosen": -2.735042095184326, "logits/rejected": -2.705932855606079, "logps/chosen": -125.88616943359375, "logps/rejected": -130.8765106201172, "loss": 0.6496, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.6771482229232788, "rewards/margins": 0.13066670298576355, "rewards/rejected": -0.80781489610672, "step": 5040 }, { "epoch": 0.8700895933838731, "grad_norm": 14.9611177444458, "learning_rate": 8.940122339613927e-08, "logits/chosen": -2.649893045425415, "logits/rejected": -2.6332714557647705, "logps/chosen": -124.31199645996094, "logps/rejected": -136.4893035888672, "loss": 0.6442, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.6720517873764038, "rewards/margins": 0.14682091772556305, "rewards/rejected": -0.8188726305961609, "step": 5050 }, { "epoch": 0.8718125430737422, "grad_norm": 17.631919860839844, "learning_rate": 8.933943071031359e-08, "logits/chosen": -2.5778708457946777, "logits/rejected": -2.573026657104492, "logps/chosen": -118.61930084228516, "logps/rejected": -129.90167236328125, "loss": 0.663, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.6789823770523071, "rewards/margins": 0.1023738831281662, "rewards/rejected": -0.7813562154769897, "step": 5060 }, { "epoch": 0.8735354927636113, "grad_norm": 14.673048973083496, "learning_rate": 8.92774799034422e-08, "logits/chosen": -2.619852066040039, "logits/rejected": -2.60081148147583, "logps/chosen": -121.1645278930664, "logps/rejected": -126.4146499633789, "loss": 0.6594, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.6706386208534241, "rewards/margins": 0.12023203074932098, "rewards/rejected": -0.790870726108551, "step": 5070 }, { "epoch": 0.8752584424534804, "grad_norm": 15.462677001953125, "learning_rate": 8.921537122453037e-08, "logits/chosen": -2.784825086593628, "logits/rejected": -2.755561351776123, "logps/chosen": -116.18212890625, "logps/rejected": -121.95218658447266, "loss": 0.6467, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5815830230712891, "rewards/margins": 0.13263188302516937, "rewards/rejected": -0.7142149209976196, "step": 5080 }, { "epoch": 0.8769813921433495, "grad_norm": 16.479412078857422, "learning_rate": 8.915310492321799e-08, "logits/chosen": -2.6742799282073975, "logits/rejected": -2.6519150733947754, "logps/chosen": -112.9342041015625, "logps/rejected": -128.68777465820312, "loss": 0.6148, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.5782161951065063, "rewards/margins": 0.20093123614788055, "rewards/rejected": -0.7791474461555481, "step": 5090 }, { "epoch": 0.8787043418332184, "grad_norm": 28.277772903442383, "learning_rate": 8.909068124977839e-08, "logits/chosen": -2.5798137187957764, "logits/rejected": -2.5520870685577393, "logps/chosen": -119.62324523925781, "logps/rejected": -126.19020080566406, "loss": 0.6606, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.6352814435958862, "rewards/margins": 0.10623638331890106, "rewards/rejected": -0.7415178418159485, "step": 5100 }, { "epoch": 0.8804272915230875, "grad_norm": 16.072378158569336, "learning_rate": 8.902810045511753e-08, "logits/chosen": -2.685297727584839, "logits/rejected": -2.6612815856933594, "logps/chosen": -126.0315933227539, "logps/rejected": -130.48361206054688, "loss": 0.6739, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.6738030314445496, "rewards/margins": 0.09819827973842621, "rewards/rejected": -0.772001326084137, "step": 5110 }, { "epoch": 0.8821502412129566, "grad_norm": 15.368549346923828, "learning_rate": 8.896536279077287e-08, "logits/chosen": -2.690248966217041, "logits/rejected": -2.668186902999878, "logps/chosen": -111.31550598144531, "logps/rejected": -119.72273254394531, "loss": 0.6442, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.5687953233718872, "rewards/margins": 0.1302814781665802, "rewards/rejected": -0.6990768313407898, "step": 5120 }, { "epoch": 0.8838731909028257, "grad_norm": 14.595114707946777, "learning_rate": 8.89024685089124e-08, "logits/chosen": -2.72908091545105, "logits/rejected": -2.7011799812316895, "logps/chosen": -114.46369934082031, "logps/rejected": -120.93522644042969, "loss": 0.6506, "rewards/accuracies": 0.65625, "rewards/chosen": -0.5987579822540283, "rewards/margins": 0.11958907544612885, "rewards/rejected": -0.7183471322059631, "step": 5130 }, { "epoch": 0.8855961405926946, "grad_norm": 14.865375518798828, "learning_rate": 8.883941786233363e-08, "logits/chosen": -2.6741504669189453, "logits/rejected": -2.646867275238037, "logps/chosen": -115.9564208984375, "logps/rejected": -119.01458740234375, "loss": 0.6591, "rewards/accuracies": 0.625, "rewards/chosen": -0.5695517659187317, "rewards/margins": 0.10228168964385986, "rewards/rejected": -0.6718333959579468, "step": 5140 }, { "epoch": 0.8873190902825637, "grad_norm": 17.496746063232422, "learning_rate": 8.877621110446253e-08, "logits/chosen": -2.6742119789123535, "logits/rejected": -2.6619982719421387, "logps/chosen": -110.34440612792969, "logps/rejected": -125.23421478271484, "loss": 0.6378, "rewards/accuracies": 0.65625, "rewards/chosen": -0.562449038028717, "rewards/margins": 0.15165221691131592, "rewards/rejected": -0.7141013145446777, "step": 5150 }, { "epoch": 0.8890420399724328, "grad_norm": 18.376039505004883, "learning_rate": 8.871284848935256e-08, "logits/chosen": -2.6834418773651123, "logits/rejected": -2.6559715270996094, "logps/chosen": -114.18431091308594, "logps/rejected": -120.5025405883789, "loss": 0.6589, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.6160685420036316, "rewards/margins": 0.11094069480895996, "rewards/rejected": -0.7270091772079468, "step": 5160 }, { "epoch": 0.8907649896623019, "grad_norm": 11.63632583618164, "learning_rate": 8.864933027168367e-08, "logits/chosen": -2.6618878841400146, "logits/rejected": -2.6420178413391113, "logps/chosen": -114.88929748535156, "logps/rejected": -129.62692260742188, "loss": 0.6179, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.5751225352287292, "rewards/margins": 0.2080199271440506, "rewards/rejected": -0.7831424474716187, "step": 5170 }, { "epoch": 0.892487939352171, "grad_norm": 18.189315795898438, "learning_rate": 8.858565670676117e-08, "logits/chosen": -2.742037296295166, "logits/rejected": -2.7349371910095215, "logps/chosen": -122.88726806640625, "logps/rejected": -129.7611846923828, "loss": 0.6676, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.6671861410140991, "rewards/margins": 0.0882277563214302, "rewards/rejected": -0.755413830280304, "step": 5180 }, { "epoch": 0.8942108890420399, "grad_norm": 16.983613967895508, "learning_rate": 8.852182805051485e-08, "logits/chosen": -2.6411783695220947, "logits/rejected": -2.6334564685821533, "logps/chosen": -120.18841552734375, "logps/rejected": -129.97769165039062, "loss": 0.6659, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.671235203742981, "rewards/margins": 0.09620723873376846, "rewards/rejected": -0.767442524433136, "step": 5190 }, { "epoch": 0.895933838731909, "grad_norm": 16.245668411254883, "learning_rate": 8.845784455949778e-08, "logits/chosen": -2.712747097015381, "logits/rejected": -2.692870616912842, "logps/chosen": -119.13944244384766, "logps/rejected": -133.7500457763672, "loss": 0.6383, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6625715494155884, "rewards/margins": 0.1500093638896942, "rewards/rejected": -0.812580943107605, "step": 5200 }, { "epoch": 0.895933838731909, "eval_logits/chosen": -2.714919328689575, "eval_logits/rejected": -2.7111740112304688, "eval_logps/chosen": -110.81185913085938, "eval_logps/rejected": -123.86541748046875, "eval_loss": 0.6621139049530029, "eval_rewards/accuracies": 0.6054832935333252, "eval_rewards/chosen": -0.5179638266563416, "eval_rewards/margins": 0.0931943878531456, "eval_rewards/rejected": -0.6111582517623901, "eval_runtime": 383.2873, "eval_samples_per_second": 11.229, "eval_steps_per_second": 1.404, "step": 5200 }, { "epoch": 0.8976567884217781, "grad_norm": 13.328145980834961, "learning_rate": 8.839370649088546e-08, "logits/chosen": -2.6668667793273926, "logits/rejected": -2.65277361869812, "logps/chosen": -119.27299499511719, "logps/rejected": -124.73472595214844, "loss": 0.6707, "rewards/accuracies": 0.59375, "rewards/chosen": -0.6674652099609375, "rewards/margins": 0.08735474944114685, "rewards/rejected": -0.7548199892044067, "step": 5210 }, { "epoch": 0.8993797381116472, "grad_norm": 16.024778366088867, "learning_rate": 8.83294141024747e-08, "logits/chosen": -2.683475971221924, "logits/rejected": -2.6743195056915283, "logps/chosen": -113.74040222167969, "logps/rejected": -123.02177429199219, "loss": 0.6752, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.605230450630188, "rewards/margins": 0.07791049778461456, "rewards/rejected": -0.6831408739089966, "step": 5220 }, { "epoch": 0.9011026878015161, "grad_norm": 15.768523216247559, "learning_rate": 8.826496765268248e-08, "logits/chosen": -2.6493020057678223, "logits/rejected": -2.642354726791382, "logps/chosen": -114.64256286621094, "logps/rejected": -130.42835998535156, "loss": 0.6239, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.5734395980834961, "rewards/margins": 0.18039682507514954, "rewards/rejected": -0.7538365125656128, "step": 5230 }, { "epoch": 0.9028256374913852, "grad_norm": 16.26175308227539, "learning_rate": 8.820036740054516e-08, "logits/chosen": -2.6304409503936768, "logits/rejected": -2.6143264770507812, "logps/chosen": -116.3860092163086, "logps/rejected": -136.6753692626953, "loss": 0.6156, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.6271839141845703, "rewards/margins": 0.2139817178249359, "rewards/rejected": -0.8411655426025391, "step": 5240 }, { "epoch": 0.9045485871812543, "grad_norm": 15.620439529418945, "learning_rate": 8.813561360571715e-08, "logits/chosen": -2.5899417400360107, "logits/rejected": -2.5807578563690186, "logps/chosen": -114.19087219238281, "logps/rejected": -127.84722900390625, "loss": 0.6389, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.6201349496841431, "rewards/margins": 0.1543225795030594, "rewards/rejected": -0.774457573890686, "step": 5250 }, { "epoch": 0.9062715368711234, "grad_norm": 22.585975646972656, "learning_rate": 8.807070652847014e-08, "logits/chosen": -2.6384902000427246, "logits/rejected": -2.607151508331299, "logps/chosen": -118.38133239746094, "logps/rejected": -128.50912475585938, "loss": 0.6337, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.6117981672286987, "rewards/margins": 0.16677974164485931, "rewards/rejected": -0.778577983379364, "step": 5260 }, { "epoch": 0.9079944865609925, "grad_norm": 15.792411804199219, "learning_rate": 8.800564642969182e-08, "logits/chosen": -2.722902297973633, "logits/rejected": -2.7080540657043457, "logps/chosen": -117.5638656616211, "logps/rejected": -130.32357788085938, "loss": 0.6436, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.6469516754150391, "rewards/margins": 0.15124547481536865, "rewards/rejected": -0.7981971502304077, "step": 5270 }, { "epoch": 0.9097174362508614, "grad_norm": 14.785796165466309, "learning_rate": 8.794043357088501e-08, "logits/chosen": -2.6592485904693604, "logits/rejected": -2.636044979095459, "logps/chosen": -121.824951171875, "logps/rejected": -129.5763702392578, "loss": 0.6482, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.6678295135498047, "rewards/margins": 0.13105496764183044, "rewards/rejected": -0.7988845109939575, "step": 5280 }, { "epoch": 0.9114403859407305, "grad_norm": 20.555374145507812, "learning_rate": 8.787506821416648e-08, "logits/chosen": -2.6226296424865723, "logits/rejected": -2.5907247066497803, "logps/chosen": -119.1449966430664, "logps/rejected": -126.7865219116211, "loss": 0.6474, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.621123194694519, "rewards/margins": 0.1367107331752777, "rewards/rejected": -0.7578339576721191, "step": 5290 }, { "epoch": 0.9131633356305996, "grad_norm": 16.926069259643555, "learning_rate": 8.780955062226598e-08, "logits/chosen": -2.634366750717163, "logits/rejected": -2.625247001647949, "logps/chosen": -115.128662109375, "logps/rejected": -128.33938598632812, "loss": 0.6332, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.5826433897018433, "rewards/margins": 0.15657159686088562, "rewards/rejected": -0.7392150163650513, "step": 5300 }, { "epoch": 0.9148862853204687, "grad_norm": 19.48094367980957, "learning_rate": 8.774388105852517e-08, "logits/chosen": -2.7098991870880127, "logits/rejected": -2.7008213996887207, "logps/chosen": -123.44694519042969, "logps/rejected": -125.55406188964844, "loss": 0.6657, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.6424458622932434, "rewards/margins": 0.09618823230266571, "rewards/rejected": -0.7386341094970703, "step": 5310 }, { "epoch": 0.9166092350103378, "grad_norm": 17.43693733215332, "learning_rate": 8.767805978689651e-08, "logits/chosen": -2.655910015106201, "logits/rejected": -2.611663818359375, "logps/chosen": -121.01139068603516, "logps/rejected": -127.2607192993164, "loss": 0.633, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.621241569519043, "rewards/margins": 0.16151300072669983, "rewards/rejected": -0.7827545404434204, "step": 5320 }, { "epoch": 0.9183321847002067, "grad_norm": 15.040596961975098, "learning_rate": 8.761208707194223e-08, "logits/chosen": -2.6152052879333496, "logits/rejected": -2.6168980598449707, "logps/chosen": -116.71577453613281, "logps/rejected": -136.49380493164062, "loss": 0.6233, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6542342305183411, "rewards/margins": 0.19011548161506653, "rewards/rejected": -0.84434974193573, "step": 5330 }, { "epoch": 0.9200551343900758, "grad_norm": 20.719085693359375, "learning_rate": 8.754596317883332e-08, "logits/chosen": -2.6348819732666016, "logits/rejected": -2.5992424488067627, "logps/chosen": -123.36546325683594, "logps/rejected": -130.93771362304688, "loss": 0.6367, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.6524416208267212, "rewards/margins": 0.14736978709697723, "rewards/rejected": -0.7998114824295044, "step": 5340 }, { "epoch": 0.9217780840799449, "grad_norm": 13.634549140930176, "learning_rate": 8.747968837334837e-08, "logits/chosen": -2.6150898933410645, "logits/rejected": -2.590125322341919, "logps/chosen": -120.84356689453125, "logps/rejected": -132.3267364501953, "loss": 0.6436, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.6730222702026367, "rewards/margins": 0.14201988279819489, "rewards/rejected": -0.8150421380996704, "step": 5350 }, { "epoch": 0.923501033769814, "grad_norm": 19.82248306274414, "learning_rate": 8.741326292187257e-08, "logits/chosen": -2.666996479034424, "logits/rejected": -2.6632487773895264, "logps/chosen": -118.6470947265625, "logps/rejected": -140.34963989257812, "loss": 0.6188, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.6400326490402222, "rewards/margins": 0.20354370772838593, "rewards/rejected": -0.8435763120651245, "step": 5360 }, { "epoch": 0.9252239834596829, "grad_norm": 21.43902587890625, "learning_rate": 8.734668709139663e-08, "logits/chosen": -2.610877752304077, "logits/rejected": -2.6005136966705322, "logps/chosen": -122.8233413696289, "logps/rejected": -130.63795471191406, "loss": 0.6712, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.6907214522361755, "rewards/margins": 0.08820674568414688, "rewards/rejected": -0.7789281606674194, "step": 5370 }, { "epoch": 0.926946933149552, "grad_norm": 18.302011489868164, "learning_rate": 8.727996114951566e-08, "logits/chosen": -2.683851957321167, "logits/rejected": -2.658590316772461, "logps/chosen": -130.3781280517578, "logps/rejected": -135.687744140625, "loss": 0.6489, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.7127219438552856, "rewards/margins": 0.13830821216106415, "rewards/rejected": -0.851030170917511, "step": 5380 }, { "epoch": 0.9286698828394211, "grad_norm": 18.31463050842285, "learning_rate": 8.721308536442814e-08, "logits/chosen": -2.6187527179718018, "logits/rejected": -2.5875821113586426, "logps/chosen": -130.81814575195312, "logps/rejected": -134.22793579101562, "loss": 0.6592, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.7203988432884216, "rewards/margins": 0.1155281662940979, "rewards/rejected": -0.8359270095825195, "step": 5390 }, { "epoch": 0.9303928325292902, "grad_norm": 17.170841217041016, "learning_rate": 8.714606000493482e-08, "logits/chosen": -2.634840250015259, "logits/rejected": -2.6232903003692627, "logps/chosen": -115.47178649902344, "logps/rejected": -137.46221923828125, "loss": 0.6212, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.6360267400741577, "rewards/margins": 0.20517496764659882, "rewards/rejected": -0.8412017822265625, "step": 5400 }, { "epoch": 0.9321157822191593, "grad_norm": 29.442955017089844, "learning_rate": 8.707888534043772e-08, "logits/chosen": -2.6460509300231934, "logits/rejected": -2.6388349533081055, "logps/chosen": -129.9828643798828, "logps/rejected": -134.42715454101562, "loss": 0.674, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.7190936803817749, "rewards/margins": 0.08293198049068451, "rewards/rejected": -0.8020256161689758, "step": 5410 }, { "epoch": 0.9338387319090282, "grad_norm": 22.263671875, "learning_rate": 8.701156164093888e-08, "logits/chosen": -2.6946921348571777, "logits/rejected": -2.6782920360565186, "logps/chosen": -123.95640563964844, "logps/rejected": -132.80471801757812, "loss": 0.6583, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.6873513460159302, "rewards/margins": 0.1127699613571167, "rewards/rejected": -0.8001214265823364, "step": 5420 }, { "epoch": 0.9355616815988973, "grad_norm": 18.80925178527832, "learning_rate": 8.694408917703942e-08, "logits/chosen": -2.6444718837738037, "logits/rejected": -2.636350631713867, "logps/chosen": -126.26493072509766, "logps/rejected": -136.88291931152344, "loss": 0.6464, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.720744252204895, "rewards/margins": 0.14062415063381195, "rewards/rejected": -0.8613685369491577, "step": 5430 }, { "epoch": 0.9372846312887664, "grad_norm": 21.596277236938477, "learning_rate": 8.68764682199384e-08, "logits/chosen": -2.649989604949951, "logits/rejected": -2.637254476547241, "logps/chosen": -127.28349304199219, "logps/rejected": -138.6378631591797, "loss": 0.6493, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.7071490287780762, "rewards/margins": 0.13525404036045074, "rewards/rejected": -0.8424030542373657, "step": 5440 }, { "epoch": 0.9390075809786355, "grad_norm": 19.736589431762695, "learning_rate": 8.680869904143172e-08, "logits/chosen": -2.606205463409424, "logits/rejected": -2.5994503498077393, "logps/chosen": -126.2559585571289, "logps/rejected": -139.34756469726562, "loss": 0.633, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6941956877708435, "rewards/margins": 0.17159244418144226, "rewards/rejected": -0.8657881617546082, "step": 5450 }, { "epoch": 0.9407305306685044, "grad_norm": 16.4277400970459, "learning_rate": 8.674078191391108e-08, "logits/chosen": -2.6205191612243652, "logits/rejected": -2.6084847450256348, "logps/chosen": -122.77671813964844, "logps/rejected": -127.8461685180664, "loss": 0.671, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.6828287243843079, "rewards/margins": 0.08287617564201355, "rewards/rejected": -0.7657049298286438, "step": 5460 }, { "epoch": 0.9424534803583735, "grad_norm": 15.285874366760254, "learning_rate": 8.66727171103628e-08, "logits/chosen": -2.6129372119903564, "logits/rejected": -2.6003642082214355, "logps/chosen": -121.86590576171875, "logps/rejected": -127.73139953613281, "loss": 0.6835, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.6971479654312134, "rewards/margins": 0.0788634642958641, "rewards/rejected": -0.7760114073753357, "step": 5470 }, { "epoch": 0.9441764300482426, "grad_norm": 17.484487533569336, "learning_rate": 8.66045049043668e-08, "logits/chosen": -2.6272430419921875, "logits/rejected": -2.60965633392334, "logps/chosen": -120.57098388671875, "logps/rejected": -134.08132934570312, "loss": 0.6295, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6604935526847839, "rewards/margins": 0.17122098803520203, "rewards/rejected": -0.8317145109176636, "step": 5480 }, { "epoch": 0.9458993797381117, "grad_norm": 15.28332233428955, "learning_rate": 8.653614557009546e-08, "logits/chosen": -2.6022276878356934, "logits/rejected": -2.581787109375, "logps/chosen": -122.64559173583984, "logps/rejected": -127.6083984375, "loss": 0.6602, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.6580213308334351, "rewards/margins": 0.1082652360200882, "rewards/rejected": -0.7662865519523621, "step": 5490 }, { "epoch": 0.9476223294279807, "grad_norm": 18.69580078125, "learning_rate": 8.646763938231252e-08, "logits/chosen": -2.6790108680725098, "logits/rejected": -2.6685914993286133, "logps/chosen": -118.9625473022461, "logps/rejected": -138.61312866210938, "loss": 0.6201, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.6429628133773804, "rewards/margins": 0.20192137360572815, "rewards/rejected": -0.8448840975761414, "step": 5500 }, { "epoch": 0.9493452791178497, "grad_norm": 23.761962890625, "learning_rate": 8.6398986616372e-08, "logits/chosen": -2.6038689613342285, "logits/rejected": -2.593151569366455, "logps/chosen": -124.92496490478516, "logps/rejected": -134.55838012695312, "loss": 0.6578, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.7127937078475952, "rewards/margins": 0.10803844779729843, "rewards/rejected": -0.8208320736885071, "step": 5510 }, { "epoch": 0.9510682288077188, "grad_norm": 18.504318237304688, "learning_rate": 8.633018754821704e-08, "logits/chosen": -2.595757007598877, "logits/rejected": -2.5816268920898438, "logps/chosen": -124.0481948852539, "logps/rejected": -135.4131317138672, "loss": 0.6265, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6618342399597168, "rewards/margins": 0.18380990624427795, "rewards/rejected": -0.8456441760063171, "step": 5520 }, { "epoch": 0.9527911784975879, "grad_norm": 15.531015396118164, "learning_rate": 8.62612424543789e-08, "logits/chosen": -2.640845775604248, "logits/rejected": -2.612797498703003, "logps/chosen": -131.6069793701172, "logps/rejected": -133.2763671875, "loss": 0.6602, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.7045750617980957, "rewards/margins": 0.11494274437427521, "rewards/rejected": -0.8195177912712097, "step": 5530 }, { "epoch": 0.954514128187457, "grad_norm": 19.734302520751953, "learning_rate": 8.61921516119757e-08, "logits/chosen": -2.624042510986328, "logits/rejected": -2.6247589588165283, "logps/chosen": -121.94276428222656, "logps/rejected": -139.32431030273438, "loss": 0.6441, "rewards/accuracies": 0.625, "rewards/chosen": -0.6817771792411804, "rewards/margins": 0.15398737788200378, "rewards/rejected": -0.8357645869255066, "step": 5540 }, { "epoch": 0.956237077877326, "grad_norm": 16.820507049560547, "learning_rate": 8.612291529871146e-08, "logits/chosen": -2.542262554168701, "logits/rejected": -2.530670642852783, "logps/chosen": -124.15339660644531, "logps/rejected": -132.162109375, "loss": 0.6824, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.7120052576065063, "rewards/margins": 0.07154157012701035, "rewards/rejected": -0.7835467457771301, "step": 5550 }, { "epoch": 0.957960027567195, "grad_norm": 19.347867965698242, "learning_rate": 8.605353379287478e-08, "logits/chosen": -2.6067452430725098, "logits/rejected": -2.5873658657073975, "logps/chosen": -117.42878723144531, "logps/rejected": -125.9688720703125, "loss": 0.6513, "rewards/accuracies": 0.625, "rewards/chosen": -0.6437867879867554, "rewards/margins": 0.1278226375579834, "rewards/rejected": -0.7716094255447388, "step": 5560 }, { "epoch": 0.9596829772570641, "grad_norm": 15.14845085144043, "learning_rate": 8.5984007373338e-08, "logits/chosen": -2.6609604358673096, "logits/rejected": -2.655918836593628, "logps/chosen": -116.06624603271484, "logps/rejected": -131.27557373046875, "loss": 0.6581, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.6413980722427368, "rewards/margins": 0.1107332855463028, "rewards/rejected": -0.7521313428878784, "step": 5570 }, { "epoch": 0.9614059269469332, "grad_norm": 18.013757705688477, "learning_rate": 8.591433631955582e-08, "logits/chosen": -2.582545757293701, "logits/rejected": -2.574622392654419, "logps/chosen": -118.67476654052734, "logps/rejected": -134.64376831054688, "loss": 0.6315, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.645140528678894, "rewards/margins": 0.16778312623500824, "rewards/rejected": -0.8129236102104187, "step": 5580 }, { "epoch": 0.9631288766368022, "grad_norm": 15.750480651855469, "learning_rate": 8.584452091156432e-08, "logits/chosen": -2.675096035003662, "logits/rejected": -2.6426644325256348, "logps/chosen": -120.98304748535156, "logps/rejected": -134.9139404296875, "loss": 0.6418, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6380943059921265, "rewards/margins": 0.1646120846271515, "rewards/rejected": -0.8027063608169556, "step": 5590 }, { "epoch": 0.9648518263266712, "grad_norm": 21.686899185180664, "learning_rate": 8.57745614299798e-08, "logits/chosen": -2.6570706367492676, "logits/rejected": -2.6563992500305176, "logps/chosen": -120.1393051147461, "logps/rejected": -138.4915008544922, "loss": 0.6411, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.6697486042976379, "rewards/margins": 0.1515139788389206, "rewards/rejected": -0.8212626576423645, "step": 5600 }, { "epoch": 0.9648518263266712, "eval_logits/chosen": -2.6910486221313477, "eval_logits/rejected": -2.686899423599243, "eval_logps/chosen": -111.2964859008789, "eval_logps/rejected": -124.09293365478516, "eval_loss": 0.6622869372367859, "eval_rewards/accuracies": 0.6054832935333252, "eval_rewards/chosen": -0.5228100419044495, "eval_rewards/margins": 0.09062344580888748, "eval_rewards/rejected": -0.6134334802627563, "eval_runtime": 383.39, "eval_samples_per_second": 11.226, "eval_steps_per_second": 1.403, "step": 5600 }, { "epoch": 0.9665747760165403, "grad_norm": 18.368070602416992, "learning_rate": 8.570445815599767e-08, "logits/chosen": -2.6654767990112305, "logits/rejected": -2.654480218887329, "logps/chosen": -121.57368469238281, "logps/rejected": -145.0277557373047, "loss": 0.6176, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6873144507408142, "rewards/margins": 0.208953857421875, "rewards/rejected": -0.8962682485580444, "step": 5610 }, { "epoch": 0.9682977257064094, "grad_norm": 26.18334197998047, "learning_rate": 8.563421137139123e-08, "logits/chosen": -2.59879994392395, "logits/rejected": -2.5828347206115723, "logps/chosen": -129.18121337890625, "logps/rejected": -138.1641082763672, "loss": 0.6486, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.7375103831291199, "rewards/margins": 0.14180725812911987, "rewards/rejected": -0.8793177604675293, "step": 5620 }, { "epoch": 0.9700206753962785, "grad_norm": 18.84343719482422, "learning_rate": 8.556382135851068e-08, "logits/chosen": -2.6542553901672363, "logits/rejected": -2.6300244331359863, "logps/chosen": -127.53597259521484, "logps/rejected": -133.26052856445312, "loss": 0.6593, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.7077816724777222, "rewards/margins": 0.119412362575531, "rewards/rejected": -0.8271940350532532, "step": 5630 }, { "epoch": 0.9717436250861475, "grad_norm": 21.3780574798584, "learning_rate": 8.549328840028187e-08, "logits/chosen": -2.6287741661071777, "logits/rejected": -2.618954658508301, "logps/chosen": -120.96806335449219, "logps/rejected": -137.76504516601562, "loss": 0.641, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.6567273139953613, "rewards/margins": 0.16713783144950867, "rewards/rejected": -0.8238651156425476, "step": 5640 }, { "epoch": 0.9734665747760165, "grad_norm": 19.235515594482422, "learning_rate": 8.542261278020524e-08, "logits/chosen": -2.560713291168213, "logits/rejected": -2.5502090454101562, "logps/chosen": -119.46761322021484, "logps/rejected": -136.87493896484375, "loss": 0.6323, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.6769036054611206, "rewards/margins": 0.17919696867465973, "rewards/rejected": -0.8561005592346191, "step": 5650 }, { "epoch": 0.9751895244658856, "grad_norm": 19.590194702148438, "learning_rate": 8.535179478235461e-08, "logits/chosen": -2.5745973587036133, "logits/rejected": -2.5703866481781006, "logps/chosen": -124.3803939819336, "logps/rejected": -138.00247192382812, "loss": 0.6457, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.7225823998451233, "rewards/margins": 0.15808936953544617, "rewards/rejected": -0.8806716799736023, "step": 5660 }, { "epoch": 0.9769124741557547, "grad_norm": 16.50753402709961, "learning_rate": 8.52808346913761e-08, "logits/chosen": -2.600623846054077, "logits/rejected": -2.590681552886963, "logps/chosen": -126.36222839355469, "logps/rejected": -136.4040985107422, "loss": 0.6496, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.7098848819732666, "rewards/margins": 0.1333651840686798, "rewards/rejected": -0.8432499766349792, "step": 5670 }, { "epoch": 0.9786354238456237, "grad_norm": 22.130084991455078, "learning_rate": 8.520973279248694e-08, "logits/chosen": -2.6281137466430664, "logits/rejected": -2.5988149642944336, "logps/chosen": -130.99139404296875, "logps/rejected": -144.82325744628906, "loss": 0.6252, "rewards/accuracies": 0.71875, "rewards/chosen": -0.7589556574821472, "rewards/margins": 0.17990081012248993, "rewards/rejected": -0.9388564825057983, "step": 5680 }, { "epoch": 0.9803583735354927, "grad_norm": 17.92255210876465, "learning_rate": 8.513848937147434e-08, "logits/chosen": -2.651932954788208, "logits/rejected": -2.62499737739563, "logps/chosen": -135.4591064453125, "logps/rejected": -145.86044311523438, "loss": 0.6232, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.754391074180603, "rewards/margins": 0.19621720910072327, "rewards/rejected": -0.9506082534790039, "step": 5690 }, { "epoch": 0.9820813232253618, "grad_norm": 31.532100677490234, "learning_rate": 8.506710471469438e-08, "logits/chosen": -2.5525314807891846, "logits/rejected": -2.5413818359375, "logps/chosen": -135.53036499023438, "logps/rejected": -146.36544799804688, "loss": 0.6457, "rewards/accuracies": 0.65625, "rewards/chosen": -0.774732232093811, "rewards/margins": 0.15803535282611847, "rewards/rejected": -0.9327676892280579, "step": 5700 }, { "epoch": 0.9838042729152309, "grad_norm": 17.127050399780273, "learning_rate": 8.499557910907078e-08, "logits/chosen": -2.657351493835449, "logits/rejected": -2.638766050338745, "logps/chosen": -135.3360137939453, "logps/rejected": -150.8716278076172, "loss": 0.6277, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.7916986346244812, "rewards/margins": 0.19900405406951904, "rewards/rejected": -0.9907026290893555, "step": 5710 }, { "epoch": 0.9855272226051, "grad_norm": 21.06365394592285, "learning_rate": 8.492391284209383e-08, "logits/chosen": -2.6116607189178467, "logits/rejected": -2.5935304164886475, "logps/chosen": -131.7152862548828, "logps/rejected": -146.989013671875, "loss": 0.6208, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.7815282344818115, "rewards/margins": 0.20366664230823517, "rewards/rejected": -0.9851948618888855, "step": 5720 }, { "epoch": 0.987250172294969, "grad_norm": 18.155738830566406, "learning_rate": 8.485210620181915e-08, "logits/chosen": -2.674208402633667, "logits/rejected": -2.6654229164123535, "logps/chosen": -129.9379425048828, "logps/rejected": -144.26626586914062, "loss": 0.6416, "rewards/accuracies": 0.625, "rewards/chosen": -0.7705144882202148, "rewards/margins": 0.16067850589752197, "rewards/rejected": -0.9311929941177368, "step": 5730 }, { "epoch": 0.988973121984838, "grad_norm": 19.837358474731445, "learning_rate": 8.478015947686664e-08, "logits/chosen": -2.5953633785247803, "logits/rejected": -2.572862148284912, "logps/chosen": -142.89976501464844, "logps/rejected": -153.91061401367188, "loss": 0.6368, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.8325818777084351, "rewards/margins": 0.17331066727638245, "rewards/rejected": -1.0058925151824951, "step": 5740 }, { "epoch": 0.9906960716747071, "grad_norm": 28.6536808013916, "learning_rate": 8.470807295641917e-08, "logits/chosen": -2.668067455291748, "logits/rejected": -2.651562213897705, "logps/chosen": -136.50344848632812, "logps/rejected": -136.56271362304688, "loss": 0.6732, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.7828587293624878, "rewards/margins": 0.08392515778541565, "rewards/rejected": -0.8667839169502258, "step": 5750 }, { "epoch": 0.9924190213645762, "grad_norm": 16.913461685180664, "learning_rate": 8.463584693022156e-08, "logits/chosen": -2.6280453205108643, "logits/rejected": -2.607534408569336, "logps/chosen": -129.51055908203125, "logps/rejected": -139.5177001953125, "loss": 0.6549, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.7246989011764526, "rewards/margins": 0.12306500971317291, "rewards/rejected": -0.8477638959884644, "step": 5760 }, { "epoch": 0.9941419710544452, "grad_norm": 18.49822998046875, "learning_rate": 8.45634816885794e-08, "logits/chosen": -2.5809426307678223, "logits/rejected": -2.5648703575134277, "logps/chosen": -118.6301040649414, "logps/rejected": -136.79217529296875, "loss": 0.6287, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.659207820892334, "rewards/margins": 0.17542117834091187, "rewards/rejected": -0.8346290588378906, "step": 5770 }, { "epoch": 0.9958649207443143, "grad_norm": 24.4649600982666, "learning_rate": 8.449097752235776e-08, "logits/chosen": -2.5920777320861816, "logits/rejected": -2.577484607696533, "logps/chosen": -127.2874984741211, "logps/rejected": -141.57028198242188, "loss": 0.649, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.7384024858474731, "rewards/margins": 0.1452442705631256, "rewards/rejected": -0.8836467862129211, "step": 5780 }, { "epoch": 0.9975878704341833, "grad_norm": 15.391559600830078, "learning_rate": 8.441833472298014e-08, "logits/chosen": -2.556666612625122, "logits/rejected": -2.527010440826416, "logps/chosen": -114.5416259765625, "logps/rejected": -134.868408203125, "loss": 0.6269, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.6321970224380493, "rewards/margins": 0.17795422673225403, "rewards/rejected": -0.8101511001586914, "step": 5790 }, { "epoch": 0.9993108201240524, "grad_norm": 18.151533126831055, "learning_rate": 8.434555358242728e-08, "logits/chosen": -2.6377694606781006, "logits/rejected": -2.6138923168182373, "logps/chosen": -118.05354309082031, "logps/rejected": -138.7180633544922, "loss": 0.6149, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.6627640128135681, "rewards/margins": 0.2130320817232132, "rewards/rejected": -0.8757961392402649, "step": 5800 }, { "epoch": 1.0010337698139213, "grad_norm": 19.002288818359375, "learning_rate": 8.427263439323593e-08, "logits/chosen": -2.645638942718506, "logits/rejected": -2.6332497596740723, "logps/chosen": -130.27548217773438, "logps/rejected": -143.0097198486328, "loss": 0.6485, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.7482485771179199, "rewards/margins": 0.15110786259174347, "rewards/rejected": -0.8993565440177917, "step": 5810 }, { "epoch": 1.0027567195037905, "grad_norm": 17.686214447021484, "learning_rate": 8.419957744849773e-08, "logits/chosen": -2.644108533859253, "logits/rejected": -2.6132802963256836, "logps/chosen": -132.90487670898438, "logps/rejected": -155.9635009765625, "loss": 0.5902, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.7895268797874451, "rewards/margins": 0.28397274017333984, "rewards/rejected": -1.0734995603561401, "step": 5820 }, { "epoch": 1.0044796691936595, "grad_norm": 26.239715576171875, "learning_rate": 8.412638304185805e-08, "logits/chosen": -2.5749363899230957, "logits/rejected": -2.5539708137512207, "logps/chosen": -139.7153778076172, "logps/rejected": -152.18222045898438, "loss": 0.6493, "rewards/accuracies": 0.65625, "rewards/chosen": -0.8502417802810669, "rewards/margins": 0.15593525767326355, "rewards/rejected": -1.0061770677566528, "step": 5830 }, { "epoch": 1.0062026188835287, "grad_norm": 17.349905014038086, "learning_rate": 8.405305146751472e-08, "logits/chosen": -2.599428653717041, "logits/rejected": -2.5850729942321777, "logps/chosen": -136.01486206054688, "logps/rejected": -159.19105529785156, "loss": 0.6208, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.8267250061035156, "rewards/margins": 0.2275165319442749, "rewards/rejected": -1.0542415380477905, "step": 5840 }, { "epoch": 1.0079255685733977, "grad_norm": 22.872417449951172, "learning_rate": 8.397958302021695e-08, "logits/chosen": -2.632000684738159, "logits/rejected": -2.614732265472412, "logps/chosen": -128.5465087890625, "logps/rejected": -157.53292846679688, "loss": 0.5957, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.7695121765136719, "rewards/margins": 0.2701513171195984, "rewards/rejected": -1.039663553237915, "step": 5850 }, { "epoch": 1.0096485182632666, "grad_norm": 20.183704376220703, "learning_rate": 8.390597799526404e-08, "logits/chosen": -2.5346813201904297, "logits/rejected": -2.5250442028045654, "logps/chosen": -139.59866333007812, "logps/rejected": -159.0569610595703, "loss": 0.6271, "rewards/accuracies": 0.65625, "rewards/chosen": -0.8646712303161621, "rewards/margins": 0.20986256003379822, "rewards/rejected": -1.0745337009429932, "step": 5860 }, { "epoch": 1.0113714679531358, "grad_norm": 19.863487243652344, "learning_rate": 8.383223668850433e-08, "logits/chosen": -2.57783579826355, "logits/rejected": -2.560011386871338, "logps/chosen": -142.1617431640625, "logps/rejected": -161.0272216796875, "loss": 0.6222, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.8560919761657715, "rewards/margins": 0.21619930863380432, "rewards/rejected": -1.072291374206543, "step": 5870 }, { "epoch": 1.0130944176430048, "grad_norm": 22.493450164794922, "learning_rate": 8.375835939633384e-08, "logits/chosen": -2.620809555053711, "logits/rejected": -2.608182668685913, "logps/chosen": -133.28724670410156, "logps/rejected": -148.15118408203125, "loss": 0.629, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.7764428853988647, "rewards/margins": 0.18140621483325958, "rewards/rejected": -0.9578492045402527, "step": 5880 }, { "epoch": 1.014817367332874, "grad_norm": 33.157630920410156, "learning_rate": 8.368434641569524e-08, "logits/chosen": -2.6399178504943848, "logits/rejected": -2.6295580863952637, "logps/chosen": -138.3361053466797, "logps/rejected": -154.23565673828125, "loss": 0.6492, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.8195241093635559, "rewards/margins": 0.15121665596961975, "rewards/rejected": -0.970740795135498, "step": 5890 }, { "epoch": 1.016540317022743, "grad_norm": 20.49550437927246, "learning_rate": 8.361019804407657e-08, "logits/chosen": -2.5564870834350586, "logits/rejected": -2.540733575820923, "logps/chosen": -143.91696166992188, "logps/rejected": -165.53414916992188, "loss": 0.6051, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.8727455139160156, "rewards/margins": 0.24948236346244812, "rewards/rejected": -1.1222279071807861, "step": 5900 }, { "epoch": 1.018263266712612, "grad_norm": 27.403352737426758, "learning_rate": 8.353591457951005e-08, "logits/chosen": -2.5638561248779297, "logits/rejected": -2.566239356994629, "logps/chosen": -138.6494598388672, "logps/rejected": -158.37081909179688, "loss": 0.6432, "rewards/accuracies": 0.65625, "rewards/chosen": -0.8563209772109985, "rewards/margins": 0.16037517786026, "rewards/rejected": -1.0166962146759033, "step": 5910 }, { "epoch": 1.019986216402481, "grad_norm": 21.737804412841797, "learning_rate": 8.346149632057089e-08, "logits/chosen": -2.5654804706573486, "logits/rejected": -2.547624111175537, "logps/chosen": -138.55712890625, "logps/rejected": -150.98326110839844, "loss": 0.6675, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.8675752878189087, "rewards/margins": 0.11592147499322891, "rewards/rejected": -0.9834968447685242, "step": 5920 }, { "epoch": 1.02170916609235, "grad_norm": 22.046180725097656, "learning_rate": 8.338694356637612e-08, "logits/chosen": -2.6013948917388916, "logits/rejected": -2.5942418575286865, "logps/chosen": -139.40040588378906, "logps/rejected": -154.91806030273438, "loss": 0.6538, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.8879048228263855, "rewards/margins": 0.15720801055431366, "rewards/rejected": -1.0451128482818604, "step": 5930 }, { "epoch": 1.0234321157822193, "grad_norm": 18.51112174987793, "learning_rate": 8.331225661658331e-08, "logits/chosen": -2.581714153289795, "logits/rejected": -2.55427622795105, "logps/chosen": -130.7750244140625, "logps/rejected": -152.53250122070312, "loss": 0.6093, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.7571039199829102, "rewards/margins": 0.24412468075752258, "rewards/rejected": -1.0012285709381104, "step": 5940 }, { "epoch": 1.0251550654720882, "grad_norm": 18.122440338134766, "learning_rate": 8.323743577138949e-08, "logits/chosen": -2.5277466773986816, "logits/rejected": -2.5242531299591064, "logps/chosen": -133.4421844482422, "logps/rejected": -144.9862518310547, "loss": 0.6494, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.7824922204017639, "rewards/margins": 0.1383362114429474, "rewards/rejected": -0.9208283424377441, "step": 5950 }, { "epoch": 1.0268780151619572, "grad_norm": 17.398357391357422, "learning_rate": 8.316248133152979e-08, "logits/chosen": -2.550398588180542, "logits/rejected": -2.5142664909362793, "logps/chosen": -136.10369873046875, "logps/rejected": -142.1478271484375, "loss": 0.6522, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.7740185260772705, "rewards/margins": 0.14028708636760712, "rewards/rejected": -0.914305567741394, "step": 5960 }, { "epoch": 1.0286009648518264, "grad_norm": 17.406795501708984, "learning_rate": 8.308739359827636e-08, "logits/chosen": -2.5689642429351807, "logits/rejected": -2.5538182258605957, "logps/chosen": -126.88874816894531, "logps/rejected": -144.57144165039062, "loss": 0.6172, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.738309383392334, "rewards/margins": 0.21361498534679413, "rewards/rejected": -0.9519243240356445, "step": 5970 }, { "epoch": 1.0303239145416954, "grad_norm": 18.077293395996094, "learning_rate": 8.301217287343709e-08, "logits/chosen": -2.5557403564453125, "logits/rejected": -2.555007219314575, "logps/chosen": -123.3507308959961, "logps/rejected": -148.35134887695312, "loss": 0.6131, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.6959012150764465, "rewards/margins": 0.22886237502098083, "rewards/rejected": -0.924763560295105, "step": 5980 }, { "epoch": 1.0320468642315643, "grad_norm": 18.56474494934082, "learning_rate": 8.293681945935445e-08, "logits/chosen": -2.6191112995147705, "logits/rejected": -2.592783212661743, "logps/chosen": -124.36668395996094, "logps/rejected": -140.59884643554688, "loss": 0.6171, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.6880021691322327, "rewards/margins": 0.2067534476518631, "rewards/rejected": -0.8947556614875793, "step": 5990 }, { "epoch": 1.0337698139214335, "grad_norm": 22.80888557434082, "learning_rate": 8.286133365890421e-08, "logits/chosen": -2.588536500930786, "logits/rejected": -2.578328847885132, "logps/chosen": -128.1990509033203, "logps/rejected": -146.2074432373047, "loss": 0.6293, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7584198117256165, "rewards/margins": 0.19320937991142273, "rewards/rejected": -0.9516291618347168, "step": 6000 }, { "epoch": 1.0337698139214335, "eval_logits/chosen": -2.657290458679199, "eval_logits/rejected": -2.652585506439209, "eval_logps/chosen": -121.11921691894531, "eval_logps/rejected": -135.34634399414062, "eval_loss": 0.6617882251739502, "eval_rewards/accuracies": 0.6064126491546631, "eval_rewards/chosen": -0.6210372447967529, "eval_rewards/margins": 0.10493012517690659, "eval_rewards/rejected": -0.7259674072265625, "eval_runtime": 383.1898, "eval_samples_per_second": 11.232, "eval_steps_per_second": 1.404, "step": 6000 }, { "epoch": 1.0354927636113025, "grad_norm": 28.314002990722656, "learning_rate": 8.278571577549425e-08, "logits/chosen": -2.5861663818359375, "logits/rejected": -2.574615478515625, "logps/chosen": -134.91171264648438, "logps/rejected": -141.68955993652344, "loss": 0.6776, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.8106623888015747, "rewards/margins": 0.10338765382766724, "rewards/rejected": -0.9140501022338867, "step": 6010 }, { "epoch": 1.0372157133011717, "grad_norm": 17.736967086791992, "learning_rate": 8.270996611306335e-08, "logits/chosen": -2.6942198276519775, "logits/rejected": -2.6716208457946777, "logps/chosen": -135.91213989257812, "logps/rejected": -139.5281982421875, "loss": 0.6722, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.8005961179733276, "rewards/margins": 0.10625004768371582, "rewards/rejected": -0.9068462252616882, "step": 6020 }, { "epoch": 1.0389386629910407, "grad_norm": 16.85118865966797, "learning_rate": 8.263408497607998e-08, "logits/chosen": -2.481321334838867, "logits/rejected": -2.4672279357910156, "logps/chosen": -124.10848236083984, "logps/rejected": -137.87637329101562, "loss": 0.6532, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.7385944128036499, "rewards/margins": 0.14067426323890686, "rewards/rejected": -0.8792687654495239, "step": 6030 }, { "epoch": 1.0406616126809096, "grad_norm": 20.27057647705078, "learning_rate": 8.255807266954104e-08, "logits/chosen": -2.610898733139038, "logits/rejected": -2.5935518741607666, "logps/chosen": -119.01715087890625, "logps/rejected": -129.5271453857422, "loss": 0.6462, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.6490130424499512, "rewards/margins": 0.14013883471488953, "rewards/rejected": -0.7891519069671631, "step": 6040 }, { "epoch": 1.0423845623707788, "grad_norm": 19.23181915283203, "learning_rate": 8.248192949897068e-08, "logits/chosen": -2.5512688159942627, "logits/rejected": -2.530177593231201, "logps/chosen": -131.2162628173828, "logps/rejected": -142.27493286132812, "loss": 0.6241, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.7078680992126465, "rewards/margins": 0.1947457492351532, "rewards/rejected": -0.9026137590408325, "step": 6050 }, { "epoch": 1.0441075120606478, "grad_norm": 18.816614151000977, "learning_rate": 8.2405655770419e-08, "logits/chosen": -2.581057071685791, "logits/rejected": -2.5660622119903564, "logps/chosen": -126.4916763305664, "logps/rejected": -138.13027954101562, "loss": 0.6497, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.724175751209259, "rewards/margins": 0.14416101574897766, "rewards/rejected": -0.8683366775512695, "step": 6060 }, { "epoch": 1.045830461750517, "grad_norm": 20.96912956237793, "learning_rate": 8.232925179046092e-08, "logits/chosen": -2.58642578125, "logits/rejected": -2.5690746307373047, "logps/chosen": -124.3506851196289, "logps/rejected": -137.90533447265625, "loss": 0.6304, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.6937921047210693, "rewards/margins": 0.17554596066474915, "rewards/rejected": -0.8693380355834961, "step": 6070 }, { "epoch": 1.047553411440386, "grad_norm": 14.86324405670166, "learning_rate": 8.225271786619485e-08, "logits/chosen": -2.5744924545288086, "logits/rejected": -2.5600318908691406, "logps/chosen": -129.59201049804688, "logps/rejected": -138.97303771972656, "loss": 0.6301, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.6922917366027832, "rewards/margins": 0.17595288157463074, "rewards/rejected": -0.8682445287704468, "step": 6080 }, { "epoch": 1.049276361130255, "grad_norm": 21.273021697998047, "learning_rate": 8.217605430524151e-08, "logits/chosen": -2.6182470321655273, "logits/rejected": -2.5997517108917236, "logps/chosen": -120.73951721191406, "logps/rejected": -140.35650634765625, "loss": 0.6114, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6846445798873901, "rewards/margins": 0.21238479018211365, "rewards/rejected": -0.8970292806625366, "step": 6090 }, { "epoch": 1.050999310820124, "grad_norm": 17.90481185913086, "learning_rate": 8.209926141574268e-08, "logits/chosen": -2.595726728439331, "logits/rejected": -2.5896270275115967, "logps/chosen": -132.58290100097656, "logps/rejected": -152.82553100585938, "loss": 0.6048, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.7439484000205994, "rewards/margins": 0.2570769190788269, "rewards/rejected": -1.0010253190994263, "step": 6100 }, { "epoch": 1.052722260509993, "grad_norm": 23.34183120727539, "learning_rate": 8.202233950635999e-08, "logits/chosen": -2.579566240310669, "logits/rejected": -2.5681982040405273, "logps/chosen": -130.9108428955078, "logps/rejected": -156.07015991210938, "loss": 0.6099, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.7837319374084473, "rewards/margins": 0.24528345465660095, "rewards/rejected": -1.029015302658081, "step": 6110 }, { "epoch": 1.0544452101998623, "grad_norm": 22.64227294921875, "learning_rate": 8.194528888627361e-08, "logits/chosen": -2.6670823097229004, "logits/rejected": -2.6301536560058594, "logps/chosen": -136.83065795898438, "logps/rejected": -163.07357788085938, "loss": 0.5872, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.8139989972114563, "rewards/margins": 0.2943005859851837, "rewards/rejected": -1.1082994937896729, "step": 6120 }, { "epoch": 1.0561681598897312, "grad_norm": 20.53219985961914, "learning_rate": 8.186810986518112e-08, "logits/chosen": -2.5834572315216064, "logits/rejected": -2.566066265106201, "logps/chosen": -145.5009307861328, "logps/rejected": -160.21743774414062, "loss": 0.6424, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.9014961123466492, "rewards/margins": 0.16748717427253723, "rewards/rejected": -1.0689833164215088, "step": 6130 }, { "epoch": 1.0578911095796002, "grad_norm": 30.76763153076172, "learning_rate": 8.179080275329606e-08, "logits/chosen": -2.594026803970337, "logits/rejected": -2.583535671234131, "logps/chosen": -141.31610107421875, "logps/rejected": -161.57565307617188, "loss": 0.626, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.8761361837387085, "rewards/margins": 0.223777174949646, "rewards/rejected": -1.0999133586883545, "step": 6140 }, { "epoch": 1.0596140592694694, "grad_norm": 20.423755645751953, "learning_rate": 8.171336786134699e-08, "logits/chosen": -2.5477170944213867, "logits/rejected": -2.5375678539276123, "logps/chosen": -145.0775146484375, "logps/rejected": -158.58587646484375, "loss": 0.6448, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.9156156778335571, "rewards/margins": 0.18095484375953674, "rewards/rejected": -1.096570611000061, "step": 6150 }, { "epoch": 1.0613370089593384, "grad_norm": 24.544588088989258, "learning_rate": 8.163580550057596e-08, "logits/chosen": -2.5236897468566895, "logits/rejected": -2.5145726203918457, "logps/chosen": -146.05165100097656, "logps/rejected": -162.24951171875, "loss": 0.6453, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.9362128973007202, "rewards/margins": 0.16781552135944366, "rewards/rejected": -1.104028582572937, "step": 6160 }, { "epoch": 1.0630599586492075, "grad_norm": 25.148283004760742, "learning_rate": 8.155811598273737e-08, "logits/chosen": -2.6110711097717285, "logits/rejected": -2.5971312522888184, "logps/chosen": -160.1621551513672, "logps/rejected": -181.1752166748047, "loss": 0.6112, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0028265714645386, "rewards/margins": 0.24775870144367218, "rewards/rejected": -1.2505853176116943, "step": 6170 }, { "epoch": 1.0647829083390765, "grad_norm": 25.57758903503418, "learning_rate": 8.148029962009677e-08, "logits/chosen": -2.6012930870056152, "logits/rejected": -2.585240125656128, "logps/chosen": -161.4896697998047, "logps/rejected": -173.24032592773438, "loss": 0.6366, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.0221115350723267, "rewards/margins": 0.1880497932434082, "rewards/rejected": -1.2101614475250244, "step": 6180 }, { "epoch": 1.0665058580289455, "grad_norm": 26.79003143310547, "learning_rate": 8.140235672542951e-08, "logits/chosen": -2.5914082527160645, "logits/rejected": -2.5720105171203613, "logps/chosen": -165.79928588867188, "logps/rejected": -176.74075317382812, "loss": 0.6417, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.0590981245040894, "rewards/margins": 0.18660911917686462, "rewards/rejected": -1.2457071542739868, "step": 6190 }, { "epoch": 1.0682288077188147, "grad_norm": 33.11368942260742, "learning_rate": 8.132428761201953e-08, "logits/chosen": -2.5066819190979004, "logits/rejected": -2.489210605621338, "logps/chosen": -157.1233673095703, "logps/rejected": -180.30471801757812, "loss": 0.618, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.0171107053756714, "rewards/margins": 0.24292024970054626, "rewards/rejected": -1.26003098487854, "step": 6200 }, { "epoch": 1.0699517574086836, "grad_norm": 27.38452911376953, "learning_rate": 8.124609259365812e-08, "logits/chosen": -2.5813941955566406, "logits/rejected": -2.5623717308044434, "logps/chosen": -157.97499084472656, "logps/rejected": -175.90878295898438, "loss": 0.615, "rewards/accuracies": 0.71875, "rewards/chosen": -1.016340970993042, "rewards/margins": 0.21486850082874298, "rewards/rejected": -1.231209397315979, "step": 6210 }, { "epoch": 1.0716747070985528, "grad_norm": 28.536115646362305, "learning_rate": 8.116777198464257e-08, "logits/chosen": -2.5717034339904785, "logits/rejected": -2.557497024536133, "logps/chosen": -155.28977966308594, "logps/rejected": -172.4293975830078, "loss": 0.6368, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.0214192867279053, "rewards/margins": 0.1867234855890274, "rewards/rejected": -1.208142876625061, "step": 6220 }, { "epoch": 1.0733976567884218, "grad_norm": 20.48895263671875, "learning_rate": 8.108932609977504e-08, "logits/chosen": -2.661949872970581, "logits/rejected": -2.6440796852111816, "logps/chosen": -153.58737182617188, "logps/rejected": -176.33309936523438, "loss": 0.6006, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.9294829368591309, "rewards/margins": 0.27870726585388184, "rewards/rejected": -1.2081902027130127, "step": 6230 }, { "epoch": 1.0751206064782908, "grad_norm": 25.738983154296875, "learning_rate": 8.101075525436121e-08, "logits/chosen": -2.5315258502960205, "logits/rejected": -2.5130081176757812, "logps/chosen": -155.04676818847656, "logps/rejected": -167.66976928710938, "loss": 0.6498, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.9809181094169617, "rewards/margins": 0.1655711978673935, "rewards/rejected": -1.1464893817901611, "step": 6240 }, { "epoch": 1.07684355616816, "grad_norm": 51.1860237121582, "learning_rate": 8.093205976420896e-08, "logits/chosen": -2.6003103256225586, "logits/rejected": -2.580641269683838, "logps/chosen": -155.04019165039062, "logps/rejected": -162.6709747314453, "loss": 0.6694, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.9813059568405151, "rewards/margins": 0.11345939338207245, "rewards/rejected": -1.0947654247283936, "step": 6250 }, { "epoch": 1.078566505858029, "grad_norm": 19.37555503845215, "learning_rate": 8.085323994562727e-08, "logits/chosen": -2.5086302757263184, "logits/rejected": -2.489610195159912, "logps/chosen": -148.66253662109375, "logps/rejected": -165.6138916015625, "loss": 0.6275, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.9338725805282593, "rewards/margins": 0.216073676943779, "rewards/rejected": -1.1499463319778442, "step": 6260 }, { "epoch": 1.080289455547898, "grad_norm": 15.658135414123535, "learning_rate": 8.077429611542476e-08, "logits/chosen": -2.7011168003082275, "logits/rejected": -2.7036242485046387, "logps/chosen": -140.63345336914062, "logps/rejected": -156.34402465820312, "loss": 0.6509, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.8695812225341797, "rewards/margins": 0.15376465022563934, "rewards/rejected": -1.023345947265625, "step": 6270 }, { "epoch": 1.082012405237767, "grad_norm": 23.98619270324707, "learning_rate": 8.069522859090856e-08, "logits/chosen": -2.4844882488250732, "logits/rejected": -2.4635424613952637, "logps/chosen": -136.20181274414062, "logps/rejected": -150.969482421875, "loss": 0.6376, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.8131068348884583, "rewards/margins": 0.1791612058877945, "rewards/rejected": -0.9922679662704468, "step": 6280 }, { "epoch": 1.083735354927636, "grad_norm": 19.53284454345703, "learning_rate": 8.061603768988294e-08, "logits/chosen": -2.5444564819335938, "logits/rejected": -2.523756742477417, "logps/chosen": -130.36697387695312, "logps/rejected": -151.30662536621094, "loss": 0.6153, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.781546950340271, "rewards/margins": 0.21957603096961975, "rewards/rejected": -1.001123070716858, "step": 6290 }, { "epoch": 1.0854583046175053, "grad_norm": 21.767004013061523, "learning_rate": 8.053672373064811e-08, "logits/chosen": -2.5796427726745605, "logits/rejected": -2.563560962677002, "logps/chosen": -141.37403869628906, "logps/rejected": -155.7559356689453, "loss": 0.6416, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.8766446113586426, "rewards/margins": 0.17183735966682434, "rewards/rejected": -1.0484821796417236, "step": 6300 }, { "epoch": 1.0871812543073742, "grad_norm": 26.944143295288086, "learning_rate": 8.045728703199885e-08, "logits/chosen": -2.5742251873016357, "logits/rejected": -2.551163673400879, "logps/chosen": -132.3316650390625, "logps/rejected": -145.6800994873047, "loss": 0.6559, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.8050101399421692, "rewards/margins": 0.14063867926597595, "rewards/rejected": -0.9456488490104675, "step": 6310 }, { "epoch": 1.0889042039972432, "grad_norm": 20.394813537597656, "learning_rate": 8.037772791322331e-08, "logits/chosen": -2.528676748275757, "logits/rejected": -2.513557195663452, "logps/chosen": -134.52194213867188, "logps/rejected": -151.3898468017578, "loss": 0.63, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.7903624773025513, "rewards/margins": 0.19733569025993347, "rewards/rejected": -0.9876980781555176, "step": 6320 }, { "epoch": 1.0906271536871124, "grad_norm": 20.387157440185547, "learning_rate": 8.029804669410171e-08, "logits/chosen": -2.5528550148010254, "logits/rejected": -2.5368423461914062, "logps/chosen": -132.63211059570312, "logps/rejected": -158.43017578125, "loss": 0.5921, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.777782678604126, "rewards/margins": 0.2880386710166931, "rewards/rejected": -1.0658212900161743, "step": 6330 }, { "epoch": 1.0923501033769814, "grad_norm": 17.525043487548828, "learning_rate": 8.0218243694905e-08, "logits/chosen": -2.5942163467407227, "logits/rejected": -2.576354503631592, "logps/chosen": -134.7543182373047, "logps/rejected": -147.21018981933594, "loss": 0.6359, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.7760751843452454, "rewards/margins": 0.17111438512802124, "rewards/rejected": -0.9471896886825562, "step": 6340 }, { "epoch": 1.0940730530668505, "grad_norm": 30.56574058532715, "learning_rate": 8.013831923639363e-08, "logits/chosen": -2.551419734954834, "logits/rejected": -2.545924663543701, "logps/chosen": -134.21063232421875, "logps/rejected": -154.470703125, "loss": 0.6198, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.7749854922294617, "rewards/margins": 0.21509429812431335, "rewards/rejected": -0.9900798797607422, "step": 6350 }, { "epoch": 1.0957960027567195, "grad_norm": 23.940534591674805, "learning_rate": 8.005827363981626e-08, "logits/chosen": -2.536100149154663, "logits/rejected": -2.53417706489563, "logps/chosen": -134.65310668945312, "logps/rejected": -154.59707641601562, "loss": 0.6298, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.7997391819953918, "rewards/margins": 0.2075682133436203, "rewards/rejected": -1.0073074102401733, "step": 6360 }, { "epoch": 1.0975189524465885, "grad_norm": 29.463321685791016, "learning_rate": 7.997810722690845e-08, "logits/chosen": -2.559302806854248, "logits/rejected": -2.555941104888916, "logps/chosen": -141.52078247070312, "logps/rejected": -153.58714294433594, "loss": 0.6646, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.8632817268371582, "rewards/margins": 0.13526658713817596, "rewards/rejected": -0.9985483288764954, "step": 6370 }, { "epoch": 1.0992419021364577, "grad_norm": 33.703369140625, "learning_rate": 7.989782031989135e-08, "logits/chosen": -2.589667320251465, "logits/rejected": -2.5763516426086426, "logps/chosen": -147.19015502929688, "logps/rejected": -170.93853759765625, "loss": 0.6199, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9342149496078491, "rewards/margins": 0.21943017840385437, "rewards/rejected": -1.1536452770233154, "step": 6380 }, { "epoch": 1.1009648518263266, "grad_norm": 23.249961853027344, "learning_rate": 7.981741324147043e-08, "logits/chosen": -2.6110785007476807, "logits/rejected": -2.5831587314605713, "logps/chosen": -142.32308959960938, "logps/rejected": -159.36021423339844, "loss": 0.6117, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.84880131483078, "rewards/margins": 0.2458713948726654, "rewards/rejected": -1.0946727991104126, "step": 6390 }, { "epoch": 1.1026878015161956, "grad_norm": 23.310190200805664, "learning_rate": 7.973688631483421e-08, "logits/chosen": -2.5864102840423584, "logits/rejected": -2.566074848175049, "logps/chosen": -140.55307006835938, "logps/rejected": -158.64735412597656, "loss": 0.6247, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.8174555897712708, "rewards/margins": 0.21133661270141602, "rewards/rejected": -1.028792142868042, "step": 6400 }, { "epoch": 1.1026878015161956, "eval_logits/chosen": -2.625418186187744, "eval_logits/rejected": -2.6201014518737793, "eval_logps/chosen": -129.89840698242188, "eval_logps/rejected": -145.43096923828125, "eval_loss": 0.6587028503417969, "eval_rewards/accuracies": 0.5989776849746704, "eval_rewards/chosen": -0.7088292837142944, "eval_rewards/margins": 0.1179843619465828, "eval_rewards/rejected": -0.8268135786056519, "eval_runtime": 383.4934, "eval_samples_per_second": 11.223, "eval_steps_per_second": 1.403, "step": 6400 }, { "epoch": 1.1044107512060648, "grad_norm": 30.746761322021484, "learning_rate": 7.965623986365286e-08, "logits/chosen": -2.632479190826416, "logits/rejected": -2.614830493927002, "logps/chosen": -147.27743530273438, "logps/rejected": -161.5043487548828, "loss": 0.6384, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.9128227233886719, "rewards/margins": 0.19528654217720032, "rewards/rejected": -1.1081092357635498, "step": 6410 }, { "epoch": 1.1061337008959338, "grad_norm": 24.96663475036621, "learning_rate": 7.957547421207705e-08, "logits/chosen": -2.5945236682891846, "logits/rejected": -2.5781524181365967, "logps/chosen": -148.02255249023438, "logps/rejected": -160.6229248046875, "loss": 0.655, "rewards/accuracies": 0.59375, "rewards/chosen": -0.9230918884277344, "rewards/margins": 0.15126900374889374, "rewards/rejected": -1.0743608474731445, "step": 6420 }, { "epoch": 1.107856650585803, "grad_norm": 18.77534294128418, "learning_rate": 7.949458968473649e-08, "logits/chosen": -2.5306360721588135, "logits/rejected": -2.5238285064697266, "logps/chosen": -137.1156463623047, "logps/rejected": -143.7222442626953, "loss": 0.6782, "rewards/accuracies": 0.53125, "rewards/chosen": -0.86149662733078, "rewards/margins": 0.09484690427780151, "rewards/rejected": -0.9563434720039368, "step": 6430 }, { "epoch": 1.109579600275672, "grad_norm": 19.45603370666504, "learning_rate": 7.941358660673876e-08, "logits/chosen": -2.561483860015869, "logits/rejected": -2.5457534790039062, "logps/chosen": -141.83096313476562, "logps/rejected": -156.63902282714844, "loss": 0.644, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.8634850382804871, "rewards/margins": 0.1716698855161667, "rewards/rejected": -1.035154938697815, "step": 6440 }, { "epoch": 1.111302549965541, "grad_norm": 19.846593856811523, "learning_rate": 7.933246530366788e-08, "logits/chosen": -2.5645689964294434, "logits/rejected": -2.540552854537964, "logps/chosen": -139.06712341308594, "logps/rejected": -154.57325744628906, "loss": 0.6241, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.8170877695083618, "rewards/margins": 0.2099314033985138, "rewards/rejected": -1.0270192623138428, "step": 6450 }, { "epoch": 1.11302549965541, "grad_norm": 19.40055274963379, "learning_rate": 7.925122610158315e-08, "logits/chosen": -2.513282537460327, "logits/rejected": -2.5152535438537598, "logps/chosen": -131.2607421875, "logps/rejected": -158.49551391601562, "loss": 0.6154, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.7849147915840149, "rewards/margins": 0.2336963713169098, "rewards/rejected": -1.018611192703247, "step": 6460 }, { "epoch": 1.114748449345279, "grad_norm": 22.3400821685791, "learning_rate": 7.916986932701766e-08, "logits/chosen": -2.485816717147827, "logits/rejected": -2.4699718952178955, "logps/chosen": -136.15518188476562, "logps/rejected": -151.83595275878906, "loss": 0.6355, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.8198622465133667, "rewards/margins": 0.18899795413017273, "rewards/rejected": -1.0088602304458618, "step": 6470 }, { "epoch": 1.1164713990351482, "grad_norm": 24.075334548950195, "learning_rate": 7.908839530697713e-08, "logits/chosen": -2.558938503265381, "logits/rejected": -2.5304675102233887, "logps/chosen": -135.81005859375, "logps/rejected": -146.69322204589844, "loss": 0.6372, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.7854007482528687, "rewards/margins": 0.17798066139221191, "rewards/rejected": -0.9633814096450806, "step": 6480 }, { "epoch": 1.1181943487250172, "grad_norm": 20.839139938354492, "learning_rate": 7.900680436893852e-08, "logits/chosen": -2.6834394931793213, "logits/rejected": -2.674525022506714, "logps/chosen": -141.89013671875, "logps/rejected": -154.45413208007812, "loss": 0.6588, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.8717700839042664, "rewards/margins": 0.13743311166763306, "rewards/rejected": -1.009203314781189, "step": 6490 }, { "epoch": 1.1199172984148862, "grad_norm": 17.810470581054688, "learning_rate": 7.892509684084874e-08, "logits/chosen": -2.5747642517089844, "logits/rejected": -2.5670723915100098, "logps/chosen": -143.84841918945312, "logps/rejected": -151.48602294921875, "loss": 0.6572, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.864587128162384, "rewards/margins": 0.1298464834690094, "rewards/rejected": -0.9944335222244263, "step": 6500 }, { "epoch": 1.1216402481047554, "grad_norm": 26.317533493041992, "learning_rate": 7.884327305112332e-08, "logits/chosen": -2.566572904586792, "logits/rejected": -2.5307557582855225, "logps/chosen": -137.45692443847656, "logps/rejected": -152.34506225585938, "loss": 0.6282, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.8011199831962585, "rewards/margins": 0.2003363072872162, "rewards/rejected": -1.0014562606811523, "step": 6510 }, { "epoch": 1.1233631977946243, "grad_norm": 22.78864288330078, "learning_rate": 7.876133332864505e-08, "logits/chosen": -2.5616345405578613, "logits/rejected": -2.543246030807495, "logps/chosen": -129.95115661621094, "logps/rejected": -144.4560546875, "loss": 0.6324, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.7529776096343994, "rewards/margins": 0.18292686343193054, "rewards/rejected": -0.9359043836593628, "step": 6520 }, { "epoch": 1.1250861474844935, "grad_norm": 16.22922706604004, "learning_rate": 7.86792780027628e-08, "logits/chosen": -2.528315544128418, "logits/rejected": -2.506397008895874, "logps/chosen": -135.58074951171875, "logps/rejected": -153.8533935546875, "loss": 0.6157, "rewards/accuracies": 0.65625, "rewards/chosen": -0.8108615875244141, "rewards/margins": 0.2233131229877472, "rewards/rejected": -1.0341746807098389, "step": 6530 }, { "epoch": 1.1268090971743625, "grad_norm": 25.679231643676758, "learning_rate": 7.859710740328998e-08, "logits/chosen": -2.5504448413848877, "logits/rejected": -2.5254950523376465, "logps/chosen": -146.25262451171875, "logps/rejected": -159.8805389404297, "loss": 0.6526, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.8821076154708862, "rewards/margins": 0.1563064604997635, "rewards/rejected": -1.0384140014648438, "step": 6540 }, { "epoch": 1.1285320468642315, "grad_norm": 31.117042541503906, "learning_rate": 7.85148218605034e-08, "logits/chosen": -2.5004849433898926, "logits/rejected": -2.4817347526550293, "logps/chosen": -138.51187133789062, "logps/rejected": -155.68972778320312, "loss": 0.6447, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.8690078854560852, "rewards/margins": 0.17679616808891296, "rewards/rejected": -1.0458040237426758, "step": 6550 }, { "epoch": 1.1302549965541007, "grad_norm": 27.82855796813965, "learning_rate": 7.843242170514187e-08, "logits/chosen": -2.5705184936523438, "logits/rejected": -2.540701389312744, "logps/chosen": -141.11805725097656, "logps/rejected": -158.58779907226562, "loss": 0.6132, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8434591293334961, "rewards/margins": 0.23432815074920654, "rewards/rejected": -1.077787160873413, "step": 6560 }, { "epoch": 1.1319779462439696, "grad_norm": 23.152801513671875, "learning_rate": 7.834990726840485e-08, "logits/chosen": -2.5601956844329834, "logits/rejected": -2.5374298095703125, "logps/chosen": -139.59695434570312, "logps/rejected": -154.17977905273438, "loss": 0.6302, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.8412817716598511, "rewards/margins": 0.18191157281398773, "rewards/rejected": -1.023193359375, "step": 6570 }, { "epoch": 1.1337008959338388, "grad_norm": 20.162002563476562, "learning_rate": 7.826727888195118e-08, "logits/chosen": -2.5681393146514893, "logits/rejected": -2.5380806922912598, "logps/chosen": -143.95455932617188, "logps/rejected": -153.251953125, "loss": 0.6509, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.8760701417922974, "rewards/margins": 0.1618967354297638, "rewards/rejected": -1.0379668474197388, "step": 6580 }, { "epoch": 1.1354238456237078, "grad_norm": 21.368446350097656, "learning_rate": 7.818453687789766e-08, "logits/chosen": -2.5328991413116455, "logits/rejected": -2.5141239166259766, "logps/chosen": -137.56971740722656, "logps/rejected": -156.55111694335938, "loss": 0.6281, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8404220342636108, "rewards/margins": 0.19537512958049774, "rewards/rejected": -1.035797357559204, "step": 6590 }, { "epoch": 1.1371467953135768, "grad_norm": 26.21380043029785, "learning_rate": 7.81016815888178e-08, "logits/chosen": -2.591376543045044, "logits/rejected": -2.5790627002716064, "logps/chosen": -141.25106811523438, "logps/rejected": -156.2598419189453, "loss": 0.6354, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.8445740938186646, "rewards/margins": 0.18649375438690186, "rewards/rejected": -1.031067967414856, "step": 6600 }, { "epoch": 1.138869745003446, "grad_norm": 21.277738571166992, "learning_rate": 7.801871334774045e-08, "logits/chosen": -2.5717265605926514, "logits/rejected": -2.560377597808838, "logps/chosen": -136.41635131835938, "logps/rejected": -154.43130493164062, "loss": 0.6244, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.8334992527961731, "rewards/margins": 0.20126286149024963, "rewards/rejected": -1.0347621440887451, "step": 6610 }, { "epoch": 1.140592694693315, "grad_norm": 23.049354553222656, "learning_rate": 7.793563248814843e-08, "logits/chosen": -2.515204906463623, "logits/rejected": -2.496919870376587, "logps/chosen": -144.44114685058594, "logps/rejected": -157.16624450683594, "loss": 0.6591, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.9124046564102173, "rewards/margins": 0.146372988820076, "rewards/rejected": -1.0587775707244873, "step": 6620 }, { "epoch": 1.1423156443831841, "grad_norm": 21.18852996826172, "learning_rate": 7.785243934397725e-08, "logits/chosen": -2.518746852874756, "logits/rejected": -2.5005240440368652, "logps/chosen": -132.6031036376953, "logps/rejected": -140.0795135498047, "loss": 0.6539, "rewards/accuracies": 0.59375, "rewards/chosen": -0.7796574831008911, "rewards/margins": 0.13536986708641052, "rewards/rejected": -0.9150273203849792, "step": 6630 }, { "epoch": 1.144038594073053, "grad_norm": 28.055564880371094, "learning_rate": 7.776913424961374e-08, "logits/chosen": -2.5686328411102295, "logits/rejected": -2.54179310798645, "logps/chosen": -135.70887756347656, "logps/rejected": -143.9758758544922, "loss": 0.6526, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.8114780187606812, "rewards/margins": 0.14228160679340363, "rewards/rejected": -0.9537595510482788, "step": 6640 }, { "epoch": 1.145761543762922, "grad_norm": 31.527973175048828, "learning_rate": 7.768571753989465e-08, "logits/chosen": -2.5954716205596924, "logits/rejected": -2.5796637535095215, "logps/chosen": -131.65628051757812, "logps/rejected": -151.76107788085938, "loss": 0.6297, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.785808801651001, "rewards/margins": 0.20410504937171936, "rewards/rejected": -0.989913821220398, "step": 6650 }, { "epoch": 1.1474844934527912, "grad_norm": 24.025869369506836, "learning_rate": 7.760218955010542e-08, "logits/chosen": -2.622337818145752, "logits/rejected": -2.613774061203003, "logps/chosen": -128.61228942871094, "logps/rejected": -146.10922241210938, "loss": 0.6369, "rewards/accuracies": 0.625, "rewards/chosen": -0.757174015045166, "rewards/margins": 0.17353567481040955, "rewards/rejected": -0.9307096600532532, "step": 6660 }, { "epoch": 1.1492074431426602, "grad_norm": 21.362499237060547, "learning_rate": 7.751855061597875e-08, "logits/chosen": -2.522671937942505, "logits/rejected": -2.524449348449707, "logps/chosen": -130.78817749023438, "logps/rejected": -161.01031494140625, "loss": 0.6017, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.800433337688446, "rewards/margins": 0.2665384113788605, "rewards/rejected": -1.066971778869629, "step": 6670 }, { "epoch": 1.1509303928325294, "grad_norm": 25.16261863708496, "learning_rate": 7.743480107369324e-08, "logits/chosen": -2.521477222442627, "logits/rejected": -2.5011825561523438, "logps/chosen": -134.3588104248047, "logps/rejected": -146.83877563476562, "loss": 0.6455, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.8043110966682434, "rewards/margins": 0.15868356823921204, "rewards/rejected": -0.9629947543144226, "step": 6680 }, { "epoch": 1.1526533425223984, "grad_norm": 20.07200050354004, "learning_rate": 7.735094125987214e-08, "logits/chosen": -2.5675048828125, "logits/rejected": -2.5460128784179688, "logps/chosen": -132.4228057861328, "logps/rejected": -152.199951171875, "loss": 0.6141, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.743118405342102, "rewards/margins": 0.2335709035396576, "rewards/rejected": -0.9766892194747925, "step": 6690 }, { "epoch": 1.1543762922122673, "grad_norm": 29.890438079833984, "learning_rate": 7.726697151158183e-08, "logits/chosen": -2.5477867126464844, "logits/rejected": -2.539823055267334, "logps/chosen": -134.93380737304688, "logps/rejected": -154.91146850585938, "loss": 0.6193, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.7691043615341187, "rewards/margins": 0.22836211323738098, "rewards/rejected": -0.9974665641784668, "step": 6700 }, { "epoch": 1.1560992419021365, "grad_norm": 19.754623413085938, "learning_rate": 7.718289216633063e-08, "logits/chosen": -2.5491116046905518, "logits/rejected": -2.5243980884552, "logps/chosen": -135.69090270996094, "logps/rejected": -153.7061309814453, "loss": 0.6247, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7767504453659058, "rewards/margins": 0.23671141266822815, "rewards/rejected": -1.0134618282318115, "step": 6710 }, { "epoch": 1.1578221915920055, "grad_norm": 20.112178802490234, "learning_rate": 7.709870356206736e-08, "logits/chosen": -2.537121534347534, "logits/rejected": -2.520521879196167, "logps/chosen": -129.10336303710938, "logps/rejected": -148.66500854492188, "loss": 0.6274, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.7644118070602417, "rewards/margins": 0.2091284692287445, "rewards/rejected": -0.9735404253005981, "step": 6720 }, { "epoch": 1.1595451412818747, "grad_norm": 20.700651168823242, "learning_rate": 7.701440603718e-08, "logits/chosen": -2.510880947113037, "logits/rejected": -2.4959218502044678, "logps/chosen": -140.76002502441406, "logps/rejected": -150.38844299316406, "loss": 0.6665, "rewards/accuracies": 0.59375, "rewards/chosen": -0.8319160342216492, "rewards/margins": 0.1302139014005661, "rewards/rejected": -0.9621298909187317, "step": 6730 }, { "epoch": 1.1612680909717437, "grad_norm": 25.546266555786133, "learning_rate": 7.692999993049429e-08, "logits/chosen": -2.5427424907684326, "logits/rejected": -2.535956621170044, "logps/chosen": -135.4132537841797, "logps/rejected": -155.85977172851562, "loss": 0.6182, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.800602912902832, "rewards/margins": 0.21048641204833984, "rewards/rejected": -1.0110893249511719, "step": 6740 }, { "epoch": 1.1629910406616126, "grad_norm": 29.764677047729492, "learning_rate": 7.684548558127247e-08, "logits/chosen": -2.5787529945373535, "logits/rejected": -2.5668704509735107, "logps/chosen": -140.60609436035156, "logps/rejected": -163.8345947265625, "loss": 0.618, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.871651828289032, "rewards/margins": 0.2363211214542389, "rewards/rejected": -1.1079729795455933, "step": 6750 }, { "epoch": 1.1647139903514818, "grad_norm": 25.136219024658203, "learning_rate": 7.676086332921176e-08, "logits/chosen": -2.5359222888946533, "logits/rejected": -2.515162944793701, "logps/chosen": -136.893310546875, "logps/rejected": -152.5428009033203, "loss": 0.6302, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.8350076675415039, "rewards/margins": 0.20780706405639648, "rewards/rejected": -1.0428146123886108, "step": 6760 }, { "epoch": 1.1664369400413508, "grad_norm": 29.839658737182617, "learning_rate": 7.667613351444318e-08, "logits/chosen": -2.556365489959717, "logits/rejected": -2.548959732055664, "logps/chosen": -141.61341857910156, "logps/rejected": -161.94760131835938, "loss": 0.6314, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.8632542490959167, "rewards/margins": 0.20508262515068054, "rewards/rejected": -1.068337082862854, "step": 6770 }, { "epoch": 1.1681598897312198, "grad_norm": 29.74138069152832, "learning_rate": 7.659129647753002e-08, "logits/chosen": -2.5341110229492188, "logits/rejected": -2.511051654815674, "logps/chosen": -147.91268920898438, "logps/rejected": -160.97518920898438, "loss": 0.6452, "rewards/accuracies": 0.625, "rewards/chosen": -0.9066078066825867, "rewards/margins": 0.17172715067863464, "rewards/rejected": -1.078334927558899, "step": 6780 }, { "epoch": 1.169882839421089, "grad_norm": 26.996421813964844, "learning_rate": 7.650635255946658e-08, "logits/chosen": -2.5564591884613037, "logits/rejected": -2.541588306427002, "logps/chosen": -138.55068969726562, "logps/rejected": -167.80386352539062, "loss": 0.5865, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8645964860916138, "rewards/margins": 0.29569393396377563, "rewards/rejected": -1.1602903604507446, "step": 6790 }, { "epoch": 1.171605789110958, "grad_norm": 20.826135635375977, "learning_rate": 7.642130210167673e-08, "logits/chosen": -2.4886882305145264, "logits/rejected": -2.4650111198425293, "logps/chosen": -145.65353393554688, "logps/rejected": -165.99551391601562, "loss": 0.6194, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.9237545132637024, "rewards/margins": 0.2404859960079193, "rewards/rejected": -1.1642405986785889, "step": 6800 }, { "epoch": 1.171605789110958, "eval_logits/chosen": -2.591191291809082, "eval_logits/rejected": -2.585751533508301, "eval_logps/chosen": -138.56918334960938, "eval_logps/rejected": -154.65988159179688, "eval_loss": 0.6580451130867004, "eval_rewards/accuracies": 0.5980483293533325, "eval_rewards/chosen": -0.7955370545387268, "eval_rewards/margins": 0.1235656887292862, "eval_rewards/rejected": -0.9191027879714966, "eval_runtime": 382.8732, "eval_samples_per_second": 11.241, "eval_steps_per_second": 1.405, "step": 6800 }, { "epoch": 1.173328738800827, "grad_norm": 23.784109115600586, "learning_rate": 7.633614544601257e-08, "logits/chosen": -2.5083320140838623, "logits/rejected": -2.4918289184570312, "logps/chosen": -154.79818725585938, "logps/rejected": -169.489501953125, "loss": 0.6279, "rewards/accuracies": 0.625, "rewards/chosen": -0.9800655245780945, "rewards/margins": 0.20611277222633362, "rewards/rejected": -1.18617844581604, "step": 6810 }, { "epoch": 1.175051688490696, "grad_norm": 22.796245574951172, "learning_rate": 7.625088293475308e-08, "logits/chosen": -2.5985751152038574, "logits/rejected": -2.5754246711730957, "logps/chosen": -153.35060119628906, "logps/rejected": -170.70590209960938, "loss": 0.626, "rewards/accuracies": 0.65625, "rewards/chosen": -0.9690364599227905, "rewards/margins": 0.2199009358882904, "rewards/rejected": -1.1889374256134033, "step": 6820 }, { "epoch": 1.176774638180565, "grad_norm": 17.840660095214844, "learning_rate": 7.61655149106027e-08, "logits/chosen": -2.5838685035705566, "logits/rejected": -2.5834288597106934, "logps/chosen": -149.63232421875, "logps/rejected": -167.7073211669922, "loss": 0.6402, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.9502065777778625, "rewards/margins": 0.20394647121429443, "rewards/rejected": -1.1541529893875122, "step": 6830 }, { "epoch": 1.1784975878704342, "grad_norm": 21.295618057250977, "learning_rate": 7.608004171668994e-08, "logits/chosen": -2.567112684249878, "logits/rejected": -2.5470948219299316, "logps/chosen": -154.1027374267578, "logps/rejected": -169.5994415283203, "loss": 0.6509, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.9616118669509888, "rewards/margins": 0.18688850104808807, "rewards/rejected": -1.1485002040863037, "step": 6840 }, { "epoch": 1.1802205375603032, "grad_norm": 27.233596801757812, "learning_rate": 7.599446369656608e-08, "logits/chosen": -2.4759891033172607, "logits/rejected": -2.4508557319641113, "logps/chosen": -147.71963500976562, "logps/rejected": -165.37037658691406, "loss": 0.6399, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.923791229724884, "rewards/margins": 0.20322775840759277, "rewards/rejected": -1.127018928527832, "step": 6850 }, { "epoch": 1.1819434872501722, "grad_norm": 21.57741928100586, "learning_rate": 7.59087811942037e-08, "logits/chosen": -2.5545706748962402, "logits/rejected": -2.5255355834960938, "logps/chosen": -153.35557556152344, "logps/rejected": -164.24728393554688, "loss": 0.627, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.9340866804122925, "rewards/margins": 0.19009587168693542, "rewards/rejected": -1.1241824626922607, "step": 6860 }, { "epoch": 1.1836664369400414, "grad_norm": 25.994808197021484, "learning_rate": 7.582299455399536e-08, "logits/chosen": -2.476388454437256, "logits/rejected": -2.4694392681121826, "logps/chosen": -140.9687957763672, "logps/rejected": -158.087158203125, "loss": 0.6374, "rewards/accuracies": 0.65625, "rewards/chosen": -0.8894684910774231, "rewards/margins": 0.17998628318309784, "rewards/rejected": -1.0694547891616821, "step": 6870 }, { "epoch": 1.1853893866299103, "grad_norm": 21.931888580322266, "learning_rate": 7.573710412075218e-08, "logits/chosen": -2.561197280883789, "logits/rejected": -2.5365850925445557, "logps/chosen": -141.51438903808594, "logps/rejected": -157.063720703125, "loss": 0.6289, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.8552343249320984, "rewards/margins": 0.20935793220996857, "rewards/rejected": -1.0645922422409058, "step": 6880 }, { "epoch": 1.1871123363197795, "grad_norm": 18.84161376953125, "learning_rate": 7.565111023970246e-08, "logits/chosen": -2.5000433921813965, "logits/rejected": -2.4791622161865234, "logps/chosen": -133.22152709960938, "logps/rejected": -156.64381408691406, "loss": 0.6162, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.7999873757362366, "rewards/margins": 0.24644342064857483, "rewards/rejected": -1.0464308261871338, "step": 6890 }, { "epoch": 1.1888352860096485, "grad_norm": 27.533275604248047, "learning_rate": 7.556501325649031e-08, "logits/chosen": -2.547579288482666, "logits/rejected": -2.5297350883483887, "logps/chosen": -138.42526245117188, "logps/rejected": -156.234375, "loss": 0.6339, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.8409918546676636, "rewards/margins": 0.19497635960578918, "rewards/rejected": -1.0359681844711304, "step": 6900 }, { "epoch": 1.1905582356995175, "grad_norm": 22.588600158691406, "learning_rate": 7.547881351717425e-08, "logits/chosen": -2.5680556297302246, "logits/rejected": -2.5507588386535645, "logps/chosen": -142.8251953125, "logps/rejected": -162.5928192138672, "loss": 0.6169, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.8501895070075989, "rewards/margins": 0.2276049107313156, "rewards/rejected": -1.0777945518493652, "step": 6910 }, { "epoch": 1.1922811853893867, "grad_norm": 27.605674743652344, "learning_rate": 7.539251136822582e-08, "logits/chosen": -2.6103172302246094, "logits/rejected": -2.5834383964538574, "logps/chosen": -148.08206176757812, "logps/rejected": -162.20498657226562, "loss": 0.6456, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.9144037961959839, "rewards/margins": 0.1720445156097412, "rewards/rejected": -1.086448311805725, "step": 6920 }, { "epoch": 1.1940041350792556, "grad_norm": 24.64449119567871, "learning_rate": 7.530610715652816e-08, "logits/chosen": -2.507188320159912, "logits/rejected": -2.478717088699341, "logps/chosen": -139.48257446289062, "logps/rejected": -160.83448791503906, "loss": 0.6039, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.8062465786933899, "rewards/margins": 0.2692093253135681, "rewards/rejected": -1.0754557847976685, "step": 6930 }, { "epoch": 1.1957270847691248, "grad_norm": 25.173749923706055, "learning_rate": 7.521960122937469e-08, "logits/chosen": -2.4724910259246826, "logits/rejected": -2.4382050037384033, "logps/chosen": -141.81011962890625, "logps/rejected": -157.2001495361328, "loss": 0.6054, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.8174602389335632, "rewards/margins": 0.2571207880973816, "rewards/rejected": -1.0745811462402344, "step": 6940 }, { "epoch": 1.1974500344589938, "grad_norm": 24.128093719482422, "learning_rate": 7.513299393446761e-08, "logits/chosen": -2.5183544158935547, "logits/rejected": -2.5030174255371094, "logps/chosen": -143.48355102539062, "logps/rejected": -169.42686462402344, "loss": 0.6007, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.8923398852348328, "rewards/margins": 0.27517181634902954, "rewards/rejected": -1.1675117015838623, "step": 6950 }, { "epoch": 1.1991729841488628, "grad_norm": 31.939889907836914, "learning_rate": 7.504628561991661e-08, "logits/chosen": -2.626190662384033, "logits/rejected": -2.5996010303497314, "logps/chosen": -160.25259399414062, "logps/rejected": -166.00714111328125, "loss": 0.6555, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.0419825315475464, "rewards/margins": 0.14706826210021973, "rewards/rejected": -1.1890507936477661, "step": 6960 }, { "epoch": 1.200895933838732, "grad_norm": 24.720861434936523, "learning_rate": 7.495947663423736e-08, "logits/chosen": -2.5689542293548584, "logits/rejected": -2.548269271850586, "logps/chosen": -145.82464599609375, "logps/rejected": -160.35841369628906, "loss": 0.6278, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.9043046236038208, "rewards/margins": 0.1990642249584198, "rewards/rejected": -1.103368878364563, "step": 6970 }, { "epoch": 1.202618883528601, "grad_norm": 18.39963722229004, "learning_rate": 7.487256732635024e-08, "logits/chosen": -2.5041344165802, "logits/rejected": -2.4827022552490234, "logps/chosen": -144.06503295898438, "logps/rejected": -163.56167602539062, "loss": 0.6172, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.8870753049850464, "rewards/margins": 0.2288040816783905, "rewards/rejected": -1.1158792972564697, "step": 6980 }, { "epoch": 1.20434183321847, "grad_norm": 19.67237663269043, "learning_rate": 7.478555804557881e-08, "logits/chosen": -2.4596590995788574, "logits/rejected": -2.449921131134033, "logps/chosen": -147.87271118164062, "logps/rejected": -159.21084594726562, "loss": 0.6508, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.8981518745422363, "rewards/margins": 0.15594662725925446, "rewards/rejected": -1.05409836769104, "step": 6990 }, { "epoch": 1.206064782908339, "grad_norm": 22.669862747192383, "learning_rate": 7.469844914164847e-08, "logits/chosen": -2.665203809738159, "logits/rejected": -2.6429755687713623, "logps/chosen": -146.79232788085938, "logps/rejected": -164.08370971679688, "loss": 0.6235, "rewards/accuracies": 0.65625, "rewards/chosen": -0.8958548307418823, "rewards/margins": 0.20432892441749573, "rewards/rejected": -1.1001837253570557, "step": 7000 }, { "epoch": 1.207787732598208, "grad_norm": 18.724201202392578, "learning_rate": 7.461124096468505e-08, "logits/chosen": -2.545606851577759, "logits/rejected": -2.5247297286987305, "logps/chosen": -139.4956817626953, "logps/rejected": -154.31460571289062, "loss": 0.6278, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.8235694766044617, "rewards/margins": 0.19837680459022522, "rewards/rejected": -1.0219463109970093, "step": 7010 }, { "epoch": 1.2095106822880772, "grad_norm": 24.118534088134766, "learning_rate": 7.45239338652134e-08, "logits/chosen": -2.516458034515381, "logits/rejected": -2.494795322418213, "logps/chosen": -131.16363525390625, "logps/rejected": -150.270263671875, "loss": 0.6322, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.7914215326309204, "rewards/margins": 0.1959744691848755, "rewards/rejected": -0.9873960614204407, "step": 7020 }, { "epoch": 1.2112336319779462, "grad_norm": 21.872209548950195, "learning_rate": 7.443652819415603e-08, "logits/chosen": -2.569121837615967, "logits/rejected": -2.5502028465270996, "logps/chosen": -136.0243682861328, "logps/rejected": -152.87554931640625, "loss": 0.6412, "rewards/accuracies": 0.625, "rewards/chosen": -0.8177220225334167, "rewards/margins": 0.1662174016237259, "rewards/rejected": -0.9839394688606262, "step": 7030 }, { "epoch": 1.2129565816678154, "grad_norm": 38.59823989868164, "learning_rate": 7.434902430283154e-08, "logits/chosen": -2.5355591773986816, "logits/rejected": -2.516512870788574, "logps/chosen": -137.9992218017578, "logps/rejected": -156.84129333496094, "loss": 0.631, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.8045310974121094, "rewards/margins": 0.20467019081115723, "rewards/rejected": -1.0092014074325562, "step": 7040 }, { "epoch": 1.2146795313576844, "grad_norm": 24.6149959564209, "learning_rate": 7.426142254295343e-08, "logits/chosen": -2.507087230682373, "logits/rejected": -2.4920742511749268, "logps/chosen": -130.8026123046875, "logps/rejected": -148.0153350830078, "loss": 0.6415, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.7819786071777344, "rewards/margins": 0.1737910807132721, "rewards/rejected": -0.9557696580886841, "step": 7050 }, { "epoch": 1.2164024810475533, "grad_norm": 22.87141990661621, "learning_rate": 7.417372326662845e-08, "logits/chosen": -2.562243700027466, "logits/rejected": -2.5538878440856934, "logps/chosen": -135.90615844726562, "logps/rejected": -150.73922729492188, "loss": 0.6425, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.8219138979911804, "rewards/margins": 0.16739928722381592, "rewards/rejected": -0.9893131256103516, "step": 7060 }, { "epoch": 1.2181254307374225, "grad_norm": 23.218414306640625, "learning_rate": 7.408592682635546e-08, "logits/chosen": -2.5308501720428467, "logits/rejected": -2.513751745223999, "logps/chosen": -138.0851593017578, "logps/rejected": -144.14117431640625, "loss": 0.6813, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.8276640176773071, "rewards/margins": 0.08705426007509232, "rewards/rejected": -0.9147183299064636, "step": 7070 }, { "epoch": 1.2198483804272915, "grad_norm": 25.701818466186523, "learning_rate": 7.399803357502372e-08, "logits/chosen": -2.5900769233703613, "logits/rejected": -2.566382884979248, "logps/chosen": -128.85079956054688, "logps/rejected": -145.69757080078125, "loss": 0.6356, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7443950176239014, "rewards/margins": 0.17536406219005585, "rewards/rejected": -0.9197589755058289, "step": 7080 }, { "epoch": 1.2215713301171607, "grad_norm": 42.72899627685547, "learning_rate": 7.391004386591171e-08, "logits/chosen": -2.6033012866973877, "logits/rejected": -2.595266342163086, "logps/chosen": -128.8573455810547, "logps/rejected": -147.2779541015625, "loss": 0.6188, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.7513642311096191, "rewards/margins": 0.21458382904529572, "rewards/rejected": -0.9659481048583984, "step": 7090 }, { "epoch": 1.2232942798070296, "grad_norm": 20.889738082885742, "learning_rate": 7.382195805268555e-08, "logits/chosen": -2.518144369125366, "logits/rejected": -2.4989988803863525, "logps/chosen": -133.90628051757812, "logps/rejected": -151.20870971679688, "loss": 0.6344, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.7652236223220825, "rewards/margins": 0.18906177580356598, "rewards/rejected": -0.9542854428291321, "step": 7100 }, { "epoch": 1.2250172294968986, "grad_norm": 18.80124855041504, "learning_rate": 7.373377648939768e-08, "logits/chosen": -2.5407471656799316, "logits/rejected": -2.517430305480957, "logps/chosen": -130.78244018554688, "logps/rejected": -139.30172729492188, "loss": 0.6569, "rewards/accuracies": 0.59375, "rewards/chosen": -0.7649413347244263, "rewards/margins": 0.1254729926586151, "rewards/rejected": -0.8904143571853638, "step": 7110 }, { "epoch": 1.2267401791867678, "grad_norm": 29.442237854003906, "learning_rate": 7.364549953048537e-08, "logits/chosen": -2.533996820449829, "logits/rejected": -2.500549793243408, "logps/chosen": -136.3294677734375, "logps/rejected": -143.03616333007812, "loss": 0.6355, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.7312331199645996, "rewards/margins": 0.1811157912015915, "rewards/rejected": -0.9123488664627075, "step": 7120 }, { "epoch": 1.2284631288766368, "grad_norm": 25.558725357055664, "learning_rate": 7.355712753076936e-08, "logits/chosen": -2.4735474586486816, "logits/rejected": -2.4533772468566895, "logps/chosen": -129.5522918701172, "logps/rejected": -144.97238159179688, "loss": 0.6257, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.7247132062911987, "rewards/margins": 0.1992671936750412, "rewards/rejected": -0.9239804148674011, "step": 7130 }, { "epoch": 1.230186078566506, "grad_norm": 19.20479393005371, "learning_rate": 7.346866084545236e-08, "logits/chosen": -2.5496201515197754, "logits/rejected": -2.5420947074890137, "logps/chosen": -123.5148696899414, "logps/rejected": -141.8740692138672, "loss": 0.6282, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.7120001912117004, "rewards/margins": 0.19337035715579987, "rewards/rejected": -0.9053705334663391, "step": 7140 }, { "epoch": 1.231909028256375, "grad_norm": 33.56254959106445, "learning_rate": 7.338009983011769e-08, "logits/chosen": -2.5616440773010254, "logits/rejected": -2.546262741088867, "logps/chosen": -144.4009552001953, "logps/rejected": -155.1837921142578, "loss": 0.6583, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.8757875561714172, "rewards/margins": 0.13266423344612122, "rewards/rejected": -1.0084518194198608, "step": 7150 }, { "epoch": 1.233631977946244, "grad_norm": 34.856624603271484, "learning_rate": 7.329144484072778e-08, "logits/chosen": -2.5242040157318115, "logits/rejected": -2.4948160648345947, "logps/chosen": -130.3624267578125, "logps/rejected": -152.0260009765625, "loss": 0.6117, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7619328498840332, "rewards/margins": 0.23107370734214783, "rewards/rejected": -0.9930065274238586, "step": 7160 }, { "epoch": 1.235354927636113, "grad_norm": 32.98360061645508, "learning_rate": 7.320269623362282e-08, "logits/chosen": -2.5037379264831543, "logits/rejected": -2.4790263175964355, "logps/chosen": -133.53384399414062, "logps/rejected": -153.2665252685547, "loss": 0.6222, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.7748786211013794, "rewards/margins": 0.2275443971157074, "rewards/rejected": -1.0024230480194092, "step": 7170 }, { "epoch": 1.237077877325982, "grad_norm": 22.934099197387695, "learning_rate": 7.311385436551928e-08, "logits/chosen": -2.6020641326904297, "logits/rejected": -2.590449810028076, "logps/chosen": -131.7655487060547, "logps/rejected": -147.60829162597656, "loss": 0.6302, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.7536253333091736, "rewards/margins": 0.18533241748809814, "rewards/rejected": -0.9389578104019165, "step": 7180 }, { "epoch": 1.2388008270158513, "grad_norm": 21.707487106323242, "learning_rate": 7.302491959350846e-08, "logits/chosen": -2.4640331268310547, "logits/rejected": -2.442941904067993, "logps/chosen": -128.1914825439453, "logps/rejected": -151.66429138183594, "loss": 0.6129, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.7514085173606873, "rewards/margins": 0.23872356116771698, "rewards/rejected": -0.9901320338249207, "step": 7190 }, { "epoch": 1.2405237767057202, "grad_norm": 19.718080520629883, "learning_rate": 7.293589227505511e-08, "logits/chosen": -2.508129835128784, "logits/rejected": -2.4907355308532715, "logps/chosen": -133.34864807128906, "logps/rejected": -158.2075958251953, "loss": 0.6127, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7946994304656982, "rewards/margins": 0.237708181142807, "rewards/rejected": -1.032407522201538, "step": 7200 }, { "epoch": 1.2405237767057202, "eval_logits/chosen": -2.5876901149749756, "eval_logits/rejected": -2.582226276397705, "eval_logps/chosen": -125.13570404052734, "eval_logps/rejected": -140.89547729492188, "eval_loss": 0.6558043956756592, "eval_rewards/accuracies": 0.6038568615913391, "eval_rewards/chosen": -0.6612022519111633, "eval_rewards/margins": 0.12025635689496994, "eval_rewards/rejected": -0.7814586162567139, "eval_runtime": 383.022, "eval_samples_per_second": 11.237, "eval_steps_per_second": 1.405, "step": 7200 }, { "epoch": 1.2422467263955892, "grad_norm": 26.754358291625977, "learning_rate": 7.284677276799593e-08, "logits/chosen": -2.5734477043151855, "logits/rejected": -2.5553011894226074, "logps/chosen": -143.27554321289062, "logps/rejected": -146.91378784179688, "loss": 0.6803, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.852822482585907, "rewards/margins": 0.0901159793138504, "rewards/rejected": -0.9429384469985962, "step": 7210 }, { "epoch": 1.2439696760854584, "grad_norm": 32.42961883544922, "learning_rate": 7.275756143053821e-08, "logits/chosen": -2.4831254482269287, "logits/rejected": -2.45436954498291, "logps/chosen": -136.29603576660156, "logps/rejected": -150.64312744140625, "loss": 0.6357, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.8118859529495239, "rewards/margins": 0.19399450719356537, "rewards/rejected": -1.00588059425354, "step": 7220 }, { "epoch": 1.2456926257753274, "grad_norm": 22.189268112182617, "learning_rate": 7.266825862125827e-08, "logits/chosen": -2.486410617828369, "logits/rejected": -2.4737281799316406, "logps/chosen": -137.525634765625, "logps/rejected": -149.06851196289062, "loss": 0.6451, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.8232401609420776, "rewards/margins": 0.1511302888393402, "rewards/rejected": -0.9743705987930298, "step": 7230 }, { "epoch": 1.2474155754651963, "grad_norm": 21.52886390686035, "learning_rate": 7.257886469910018e-08, "logits/chosen": -2.5281074047088623, "logits/rejected": -2.5180110931396484, "logps/chosen": -140.8453826904297, "logps/rejected": -158.28399658203125, "loss": 0.6219, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.8273015022277832, "rewards/margins": 0.20986250042915344, "rewards/rejected": -1.0371639728546143, "step": 7240 }, { "epoch": 1.2491385251550655, "grad_norm": 21.729318618774414, "learning_rate": 7.248938002337412e-08, "logits/chosen": -2.537574291229248, "logits/rejected": -2.5159497261047363, "logps/chosen": -139.05413818359375, "logps/rejected": -150.28652954101562, "loss": 0.6384, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.833267867565155, "rewards/margins": 0.1575065553188324, "rewards/rejected": -0.9907743334770203, "step": 7250 }, { "epoch": 1.2508614748449345, "grad_norm": 26.113866806030273, "learning_rate": 7.239980495375518e-08, "logits/chosen": -2.546105146408081, "logits/rejected": -2.523954153060913, "logps/chosen": -137.6354522705078, "logps/rejected": -154.47494506835938, "loss": 0.6156, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.8253766894340515, "rewards/margins": 0.2103661596775055, "rewards/rejected": -1.0357427597045898, "step": 7260 }, { "epoch": 1.2525844245348035, "grad_norm": 26.12915802001953, "learning_rate": 7.231013985028168e-08, "logits/chosen": -2.560234546661377, "logits/rejected": -2.5334925651550293, "logps/chosen": -131.84494018554688, "logps/rejected": -148.89859008789062, "loss": 0.62, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.7643373608589172, "rewards/margins": 0.2046651542186737, "rewards/rejected": -0.9690025448799133, "step": 7270 }, { "epoch": 1.2543073742246726, "grad_norm": 19.570632934570312, "learning_rate": 7.222038507335384e-08, "logits/chosen": -2.5897583961486816, "logits/rejected": -2.55975079536438, "logps/chosen": -137.03005981445312, "logps/rejected": -153.3487091064453, "loss": 0.6194, "rewards/accuracies": 0.65625, "rewards/chosen": -0.7909359931945801, "rewards/margins": 0.22452524304389954, "rewards/rejected": -1.0154612064361572, "step": 7280 }, { "epoch": 1.2560303239145416, "grad_norm": 26.04621696472168, "learning_rate": 7.213054098373232e-08, "logits/chosen": -2.451444625854492, "logits/rejected": -2.4375545978546143, "logps/chosen": -141.99853515625, "logps/rejected": -157.99383544921875, "loss": 0.6407, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.8617357015609741, "rewards/margins": 0.16678765416145325, "rewards/rejected": -1.028523325920105, "step": 7290 }, { "epoch": 1.2577532736044108, "grad_norm": 22.285764694213867, "learning_rate": 7.204060794253679e-08, "logits/chosen": -2.420621395111084, "logits/rejected": -2.403648853302002, "logps/chosen": -132.68133544921875, "logps/rejected": -153.64938354492188, "loss": 0.634, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.8204687237739563, "rewards/margins": 0.19447723031044006, "rewards/rejected": -1.0149458646774292, "step": 7300 }, { "epoch": 1.2594762232942798, "grad_norm": 22.316200256347656, "learning_rate": 7.195058631124443e-08, "logits/chosen": -2.5474612712860107, "logits/rejected": -2.527472972869873, "logps/chosen": -145.7023468017578, "logps/rejected": -164.7689666748047, "loss": 0.6177, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.9006555676460266, "rewards/margins": 0.23807115852832794, "rewards/rejected": -1.138726830482483, "step": 7310 }, { "epoch": 1.2611991729841487, "grad_norm": 23.586435317993164, "learning_rate": 7.186047645168849e-08, "logits/chosen": -2.54943585395813, "logits/rejected": -2.5296919345855713, "logps/chosen": -138.24465942382812, "logps/rejected": -153.2172393798828, "loss": 0.6292, "rewards/accuracies": 0.59375, "rewards/chosen": -0.8254194259643555, "rewards/margins": 0.20264151692390442, "rewards/rejected": -1.0280609130859375, "step": 7320 }, { "epoch": 1.262922122674018, "grad_norm": 36.1264533996582, "learning_rate": 7.177027872605686e-08, "logits/chosen": -2.4627273082733154, "logits/rejected": -2.4443767070770264, "logps/chosen": -139.52914428710938, "logps/rejected": -166.06155395507812, "loss": 0.5893, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.8535804748535156, "rewards/margins": 0.3056808114051819, "rewards/rejected": -1.1592612266540527, "step": 7330 }, { "epoch": 1.264645072363887, "grad_norm": 27.05265998840332, "learning_rate": 7.167999349689062e-08, "logits/chosen": -2.525515079498291, "logits/rejected": -2.502408742904663, "logps/chosen": -147.11248779296875, "logps/rejected": -160.80465698242188, "loss": 0.6513, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.9110174179077148, "rewards/margins": 0.16634492576122284, "rewards/rejected": -1.0773624181747437, "step": 7340 }, { "epoch": 1.266368022053756, "grad_norm": 29.149703979492188, "learning_rate": 7.158962112708247e-08, "logits/chosen": -2.575409412384033, "logits/rejected": -2.553063154220581, "logps/chosen": -140.3431854248047, "logps/rejected": -154.044189453125, "loss": 0.6291, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.819327175617218, "rewards/margins": 0.21409860253334045, "rewards/rejected": -1.0334258079528809, "step": 7350 }, { "epoch": 1.268090971743625, "grad_norm": 21.219430923461914, "learning_rate": 7.14991619798755e-08, "logits/chosen": -2.4919114112854004, "logits/rejected": -2.477019786834717, "logps/chosen": -141.4704132080078, "logps/rejected": -157.49951171875, "loss": 0.6308, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.8595463633537292, "rewards/margins": 0.20486697554588318, "rewards/rejected": -1.0644134283065796, "step": 7360 }, { "epoch": 1.269813921433494, "grad_norm": 32.99021530151367, "learning_rate": 7.140861641886148e-08, "logits/chosen": -2.4564788341522217, "logits/rejected": -2.439401626586914, "logps/chosen": -141.84283447265625, "logps/rejected": -154.24302673339844, "loss": 0.6476, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.8768693804740906, "rewards/margins": 0.16116423904895782, "rewards/rejected": -1.0380337238311768, "step": 7370 }, { "epoch": 1.2715368711233632, "grad_norm": 22.127519607543945, "learning_rate": 7.131798480797957e-08, "logits/chosen": -2.5054032802581787, "logits/rejected": -2.488107204437256, "logps/chosen": -138.72915649414062, "logps/rejected": -162.11215209960938, "loss": 0.6188, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.852269172668457, "rewards/margins": 0.23749974370002747, "rewards/rejected": -1.0897690057754517, "step": 7380 }, { "epoch": 1.2732598208132322, "grad_norm": 38.59953689575195, "learning_rate": 7.12272675115148e-08, "logits/chosen": -2.4843626022338867, "logits/rejected": -2.470242738723755, "logps/chosen": -137.43231201171875, "logps/rejected": -158.0469970703125, "loss": 0.6284, "rewards/accuracies": 0.65625, "rewards/chosen": -0.8447599411010742, "rewards/margins": 0.21666069328784943, "rewards/rejected": -1.0614207983016968, "step": 7390 }, { "epoch": 1.2749827705031014, "grad_norm": 22.354713439941406, "learning_rate": 7.113646489409654e-08, "logits/chosen": -2.479954957962036, "logits/rejected": -2.449371337890625, "logps/chosen": -150.95327758789062, "logps/rejected": -165.73294067382812, "loss": 0.6249, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.9459403157234192, "rewards/margins": 0.2137012928724289, "rewards/rejected": -1.1596416234970093, "step": 7400 }, { "epoch": 1.2767057201929704, "grad_norm": 24.540973663330078, "learning_rate": 7.104557732069722e-08, "logits/chosen": -2.4785265922546387, "logits/rejected": -2.446809768676758, "logps/chosen": -142.2882537841797, "logps/rejected": -164.82403564453125, "loss": 0.6116, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.8778212666511536, "rewards/margins": 0.25136417150497437, "rewards/rejected": -1.129185438156128, "step": 7410 }, { "epoch": 1.2784286698828393, "grad_norm": 23.970643997192383, "learning_rate": 7.09546051566306e-08, "logits/chosen": -2.4319984912872314, "logits/rejected": -2.4142327308654785, "logps/chosen": -147.84335327148438, "logps/rejected": -164.019775390625, "loss": 0.6262, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.9034894704818726, "rewards/margins": 0.20930960774421692, "rewards/rejected": -1.1127991676330566, "step": 7420 }, { "epoch": 1.2801516195727085, "grad_norm": 28.692846298217773, "learning_rate": 7.086354876755058e-08, "logits/chosen": -2.4339330196380615, "logits/rejected": -2.4105517864227295, "logps/chosen": -156.17025756835938, "logps/rejected": -180.38577270507812, "loss": 0.6097, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.0276180505752563, "rewards/margins": 0.2644575238227844, "rewards/rejected": -1.292075514793396, "step": 7430 }, { "epoch": 1.2818745692625775, "grad_norm": 27.577713012695312, "learning_rate": 7.07724085194495e-08, "logits/chosen": -2.508514881134033, "logits/rejected": -2.4860053062438965, "logps/chosen": -162.51748657226562, "logps/rejected": -180.8822479248047, "loss": 0.6349, "rewards/accuracies": 0.65625, "rewards/chosen": -1.0603501796722412, "rewards/margins": 0.22516348958015442, "rewards/rejected": -1.2855136394500732, "step": 7440 }, { "epoch": 1.2835975189524467, "grad_norm": 33.16237258911133, "learning_rate": 7.068118477865677e-08, "logits/chosen": -2.582409620285034, "logits/rejected": -2.559049606323242, "logps/chosen": -156.26156616210938, "logps/rejected": -167.25140380859375, "loss": 0.6504, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.9874914884567261, "rewards/margins": 0.16984400153160095, "rewards/rejected": -1.1573354005813599, "step": 7450 }, { "epoch": 1.2853204686423156, "grad_norm": 25.894336700439453, "learning_rate": 7.058987791183744e-08, "logits/chosen": -2.451228618621826, "logits/rejected": -2.448843240737915, "logps/chosen": -147.57638549804688, "logps/rejected": -170.20025634765625, "loss": 0.632, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.965354323387146, "rewards/margins": 0.21499767899513245, "rewards/rejected": -1.180351972579956, "step": 7460 }, { "epoch": 1.2870434183321846, "grad_norm": 26.20570945739746, "learning_rate": 7.049848828599064e-08, "logits/chosen": -2.514191150665283, "logits/rejected": -2.5005524158477783, "logps/chosen": -152.73623657226562, "logps/rejected": -164.35342407226562, "loss": 0.6567, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.9762645959854126, "rewards/margins": 0.16078788042068481, "rewards/rejected": -1.1370524168014526, "step": 7470 }, { "epoch": 1.2887663680220538, "grad_norm": 23.704248428344727, "learning_rate": 7.040701626844819e-08, "logits/chosen": -2.465456485748291, "logits/rejected": -2.442073345184326, "logps/chosen": -149.51150512695312, "logps/rejected": -159.86837768554688, "loss": 0.6446, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.9404380917549133, "rewards/margins": 0.17175881564617157, "rewards/rejected": -1.112196922302246, "step": 7480 }, { "epoch": 1.2904893177119228, "grad_norm": 26.433223724365234, "learning_rate": 7.031546222687296e-08, "logits/chosen": -2.4424567222595215, "logits/rejected": -2.4317264556884766, "logps/chosen": -154.74525451660156, "logps/rejected": -174.90280151367188, "loss": 0.6298, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.0154364109039307, "rewards/margins": 0.20561465620994568, "rewards/rejected": -1.2210509777069092, "step": 7490 }, { "epoch": 1.292212267401792, "grad_norm": 34.223243713378906, "learning_rate": 7.022382652925766e-08, "logits/chosen": -2.464292049407959, "logits/rejected": -2.4453139305114746, "logps/chosen": -152.37490844726562, "logps/rejected": -168.61769104003906, "loss": 0.6606, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.0072271823883057, "rewards/margins": 0.16684751212596893, "rewards/rejected": -1.174074649810791, "step": 7500 }, { "epoch": 1.293935217091661, "grad_norm": 32.43608474731445, "learning_rate": 7.01321095439231e-08, "logits/chosen": -2.471198558807373, "logits/rejected": -2.453827381134033, "logps/chosen": -154.67567443847656, "logps/rejected": -164.9767303466797, "loss": 0.6459, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.9621097445487976, "rewards/margins": 0.1783999502658844, "rewards/rejected": -1.1405094861984253, "step": 7510 }, { "epoch": 1.29565816678153, "grad_norm": 29.828994750976562, "learning_rate": 7.004031163951686e-08, "logits/chosen": -2.478649377822876, "logits/rejected": -2.4622559547424316, "logps/chosen": -145.52989196777344, "logps/rejected": -162.56576538085938, "loss": 0.6354, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.910129725933075, "rewards/margins": 0.1902073472738266, "rewards/rejected": -1.100337028503418, "step": 7520 }, { "epoch": 1.297381116471399, "grad_norm": 25.16864585876465, "learning_rate": 6.994843318501175e-08, "logits/chosen": -2.434582471847534, "logits/rejected": -2.4288063049316406, "logps/chosen": -144.28469848632812, "logps/rejected": -161.70635986328125, "loss": 0.6463, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.9157862663269043, "rewards/margins": 0.17825372517108917, "rewards/rejected": -1.0940399169921875, "step": 7530 }, { "epoch": 1.299104066161268, "grad_norm": 17.154449462890625, "learning_rate": 6.985647454970436e-08, "logits/chosen": -2.5492055416107178, "logits/rejected": -2.5423622131347656, "logps/chosen": -130.44430541992188, "logps/rejected": -154.4150848388672, "loss": 0.6089, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.778977632522583, "rewards/margins": 0.24291332066059113, "rewards/rejected": -1.0218908786773682, "step": 7540 }, { "epoch": 1.3008270158511372, "grad_norm": 23.36393165588379, "learning_rate": 6.976443610321355e-08, "logits/chosen": -2.4938864707946777, "logits/rejected": -2.47977876663208, "logps/chosen": -135.61158752441406, "logps/rejected": -156.0998077392578, "loss": 0.62, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.824744701385498, "rewards/margins": 0.22780942916870117, "rewards/rejected": -1.0525541305541992, "step": 7550 }, { "epoch": 1.3025499655410062, "grad_norm": 23.027864456176758, "learning_rate": 6.9672318215479e-08, "logits/chosen": -2.5449604988098145, "logits/rejected": -2.527798652648926, "logps/chosen": -131.2892608642578, "logps/rejected": -160.10562133789062, "loss": 0.598, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.7776525020599365, "rewards/margins": 0.2734105587005615, "rewards/rejected": -1.0510631799697876, "step": 7560 }, { "epoch": 1.3042729152308752, "grad_norm": 21.192401885986328, "learning_rate": 6.958012125675961e-08, "logits/chosen": -2.5487101078033447, "logits/rejected": -2.531736135482788, "logps/chosen": -147.08360290527344, "logps/rejected": -165.15524291992188, "loss": 0.6196, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.9009296298027039, "rewards/margins": 0.23346951603889465, "rewards/rejected": -1.134399175643921, "step": 7570 }, { "epoch": 1.3059958649207444, "grad_norm": 27.005096435546875, "learning_rate": 6.948784559763221e-08, "logits/chosen": -2.5103249549865723, "logits/rejected": -2.494823932647705, "logps/chosen": -144.07765197753906, "logps/rejected": -160.0559844970703, "loss": 0.6343, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.868603527545929, "rewards/margins": 0.19318082928657532, "rewards/rejected": -1.0617843866348267, "step": 7580 }, { "epoch": 1.3077188146106133, "grad_norm": 30.346166610717773, "learning_rate": 6.93954916089899e-08, "logits/chosen": -2.519892454147339, "logits/rejected": -2.4853830337524414, "logps/chosen": -153.8564910888672, "logps/rejected": -169.29183959960938, "loss": 0.6149, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9491060376167297, "rewards/margins": 0.24793191254138947, "rewards/rejected": -1.197037935256958, "step": 7590 }, { "epoch": 1.3094417643004825, "grad_norm": 31.32855224609375, "learning_rate": 6.930305966204059e-08, "logits/chosen": -2.4879536628723145, "logits/rejected": -2.4646151065826416, "logps/chosen": -147.8095245361328, "logps/rejected": -157.54727172851562, "loss": 0.6531, "rewards/accuracies": 0.59375, "rewards/chosen": -0.9458245038986206, "rewards/margins": 0.15245869755744934, "rewards/rejected": -1.098283290863037, "step": 7600 }, { "epoch": 1.3094417643004825, "eval_logits/chosen": -2.556385040283203, "eval_logits/rejected": -2.550178289413452, "eval_logps/chosen": -133.6133270263672, "eval_logps/rejected": -150.78623962402344, "eval_loss": 0.6534062027931213, "eval_rewards/accuracies": 0.6040892004966736, "eval_rewards/chosen": -0.7459785342216492, "eval_rewards/margins": 0.13438780605793, "eval_rewards/rejected": -0.880366325378418, "eval_runtime": 383.0675, "eval_samples_per_second": 11.236, "eval_steps_per_second": 1.404, "step": 7600 }, { "epoch": 1.3111647139903515, "grad_norm": 24.08529281616211, "learning_rate": 6.921055012830563e-08, "logits/chosen": -2.4477341175079346, "logits/rejected": -2.423722743988037, "logps/chosen": -145.34474182128906, "logps/rejected": -161.76712036132812, "loss": 0.6323, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.9105755090713501, "rewards/margins": 0.21531391143798828, "rewards/rejected": -1.125889539718628, "step": 7610 }, { "epoch": 1.3128876636802205, "grad_norm": 19.797929763793945, "learning_rate": 6.911796337961813e-08, "logits/chosen": -2.4861130714416504, "logits/rejected": -2.4657671451568604, "logps/chosen": -142.93849182128906, "logps/rejected": -161.22935485839844, "loss": 0.6132, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.8410897254943848, "rewards/margins": 0.23532815277576447, "rewards/rejected": -1.0764179229736328, "step": 7620 }, { "epoch": 1.3146106133700897, "grad_norm": 27.883426666259766, "learning_rate": 6.902529978812159e-08, "logits/chosen": -2.4483742713928223, "logits/rejected": -2.4535410404205322, "logps/chosen": -137.55007934570312, "logps/rejected": -162.91024780273438, "loss": 0.6167, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8679767847061157, "rewards/margins": 0.2256569117307663, "rewards/rejected": -1.093633770942688, "step": 7630 }, { "epoch": 1.3163335630599586, "grad_norm": 32.171966552734375, "learning_rate": 6.893255972626838e-08, "logits/chosen": -2.44758939743042, "logits/rejected": -2.425205707550049, "logps/chosen": -154.0350341796875, "logps/rejected": -170.94781494140625, "loss": 0.6284, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.955964207649231, "rewards/margins": 0.23719045519828796, "rewards/rejected": -1.1931545734405518, "step": 7640 }, { "epoch": 1.3180565127498278, "grad_norm": 21.268714904785156, "learning_rate": 6.883974356681823e-08, "logits/chosen": -2.5042102336883545, "logits/rejected": -2.486968517303467, "logps/chosen": -161.19512939453125, "logps/rejected": -175.97557067871094, "loss": 0.6452, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.0189523696899414, "rewards/margins": 0.1943272054195404, "rewards/rejected": -1.2132797241210938, "step": 7650 }, { "epoch": 1.3197794624396968, "grad_norm": 20.272716522216797, "learning_rate": 6.874685168283675e-08, "logits/chosen": -2.512941837310791, "logits/rejected": -2.487058639526367, "logps/chosen": -150.97122192382812, "logps/rejected": -172.14968872070312, "loss": 0.611, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9367518424987793, "rewards/margins": 0.24415257573127747, "rewards/rejected": -1.1809046268463135, "step": 7660 }, { "epoch": 1.3215024121295658, "grad_norm": 29.262372970581055, "learning_rate": 6.865388444769388e-08, "logits/chosen": -2.465951919555664, "logits/rejected": -2.447601795196533, "logps/chosen": -148.7430419921875, "logps/rejected": -161.8044891357422, "loss": 0.6379, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.90485680103302, "rewards/margins": 0.1881795972585678, "rewards/rejected": -1.093036413192749, "step": 7670 }, { "epoch": 1.323225361819435, "grad_norm": 23.13471031188965, "learning_rate": 6.856084223506247e-08, "logits/chosen": -2.500370740890503, "logits/rejected": -2.484502077102661, "logps/chosen": -143.19085693359375, "logps/rejected": -163.03858947753906, "loss": 0.6177, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.9128525853157043, "rewards/margins": 0.22799701988697052, "rewards/rejected": -1.1408497095108032, "step": 7680 }, { "epoch": 1.324948311509304, "grad_norm": 24.28408432006836, "learning_rate": 6.84677254189167e-08, "logits/chosen": -2.586789131164551, "logits/rejected": -2.5469651222229004, "logps/chosen": -140.514892578125, "logps/rejected": -157.67715454101562, "loss": 0.6139, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.8414154052734375, "rewards/margins": 0.2492334395647049, "rewards/rejected": -1.090648889541626, "step": 7690 }, { "epoch": 1.3266712611991731, "grad_norm": 20.994827270507812, "learning_rate": 6.837453437353064e-08, "logits/chosen": -2.4863336086273193, "logits/rejected": -2.456991672515869, "logps/chosen": -143.09527587890625, "logps/rejected": -162.80410766601562, "loss": 0.6266, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.884503960609436, "rewards/margins": 0.2061157524585724, "rewards/rejected": -1.090619683265686, "step": 7700 }, { "epoch": 1.328394210889042, "grad_norm": 28.234207153320312, "learning_rate": 6.82812694734767e-08, "logits/chosen": -2.493734121322632, "logits/rejected": -2.4809954166412354, "logps/chosen": -149.52813720703125, "logps/rejected": -163.19261169433594, "loss": 0.6498, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.9637618064880371, "rewards/margins": 0.1601286083459854, "rewards/rejected": -1.1238903999328613, "step": 7710 }, { "epoch": 1.330117160578911, "grad_norm": 26.1023006439209, "learning_rate": 6.818793109362416e-08, "logits/chosen": -2.5035171508789062, "logits/rejected": -2.4748847484588623, "logps/chosen": -146.76425170898438, "logps/rejected": -160.68130493164062, "loss": 0.621, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8933634757995605, "rewards/margins": 0.2182648628950119, "rewards/rejected": -1.1116282939910889, "step": 7720 }, { "epoch": 1.33184011026878, "grad_norm": 32.87727737426758, "learning_rate": 6.80945196091376e-08, "logits/chosen": -2.438372850418091, "logits/rejected": -2.416724681854248, "logps/chosen": -134.09129333496094, "logps/rejected": -156.75234985351562, "loss": 0.5997, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.7908092141151428, "rewards/margins": 0.26274845004081726, "rewards/rejected": -1.0535576343536377, "step": 7730 }, { "epoch": 1.3335630599586492, "grad_norm": 22.23272705078125, "learning_rate": 6.800103539547548e-08, "logits/chosen": -2.4662704467773438, "logits/rejected": -2.450282096862793, "logps/chosen": -145.58279418945312, "logps/rejected": -168.5486297607422, "loss": 0.622, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.9237093925476074, "rewards/margins": 0.23403067886829376, "rewards/rejected": -1.1577401161193848, "step": 7740 }, { "epoch": 1.3352860096485184, "grad_norm": 20.547780990600586, "learning_rate": 6.790747882838859e-08, "logits/chosen": -2.4921231269836426, "logits/rejected": -2.4648630619049072, "logps/chosen": -151.70297241210938, "logps/rejected": -171.81393432617188, "loss": 0.6228, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.9648507833480835, "rewards/margins": 0.24369268119335175, "rewards/rejected": -1.2085435390472412, "step": 7750 }, { "epoch": 1.3370089593383874, "grad_norm": 34.338104248046875, "learning_rate": 6.781385028391851e-08, "logits/chosen": -2.384896755218506, "logits/rejected": -2.3702826499938965, "logps/chosen": -136.65414428710938, "logps/rejected": -162.7095184326172, "loss": 0.6044, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.8515266180038452, "rewards/margins": 0.2675195038318634, "rewards/rejected": -1.1190460920333862, "step": 7760 }, { "epoch": 1.3387319090282563, "grad_norm": 25.451663970947266, "learning_rate": 6.772015013839616e-08, "logits/chosen": -2.4470276832580566, "logits/rejected": -2.428715944290161, "logps/chosen": -143.4765625, "logps/rejected": -165.12799072265625, "loss": 0.6191, "rewards/accuracies": 0.65625, "rewards/chosen": -0.9022136926651001, "rewards/margins": 0.23084497451782227, "rewards/rejected": -1.1330586671829224, "step": 7770 }, { "epoch": 1.3404548587181253, "grad_norm": 25.063844680786133, "learning_rate": 6.762637876844021e-08, "logits/chosen": -2.529904365539551, "logits/rejected": -2.5203545093536377, "logps/chosen": -145.97256469726562, "logps/rejected": -167.47885131835938, "loss": 0.6327, "rewards/accuracies": 0.59375, "rewards/chosen": -0.9347503781318665, "rewards/margins": 0.21358363330364227, "rewards/rejected": -1.1483341455459595, "step": 7780 }, { "epoch": 1.3421778084079945, "grad_norm": 31.19562530517578, "learning_rate": 6.753253655095565e-08, "logits/chosen": -2.5071768760681152, "logits/rejected": -2.4931302070617676, "logps/chosen": -145.25839233398438, "logps/rejected": -168.6995849609375, "loss": 0.6203, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9317231178283691, "rewards/margins": 0.24524815380573273, "rewards/rejected": -1.176971197128296, "step": 7790 }, { "epoch": 1.3439007580978635, "grad_norm": 21.104005813598633, "learning_rate": 6.743862386313219e-08, "logits/chosen": -2.512084722518921, "logits/rejected": -2.501608371734619, "logps/chosen": -147.72988891601562, "logps/rejected": -174.69314575195312, "loss": 0.6068, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9235385656356812, "rewards/margins": 0.2817572057247162, "rewards/rejected": -1.2052958011627197, "step": 7800 }, { "epoch": 1.3456237077877327, "grad_norm": 29.805986404418945, "learning_rate": 6.734464108244285e-08, "logits/chosen": -2.531437635421753, "logits/rejected": -2.5037777423858643, "logps/chosen": -148.46267700195312, "logps/rejected": -169.79966735839844, "loss": 0.6042, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.8897361755371094, "rewards/margins": 0.2666037976741791, "rewards/rejected": -1.1563400030136108, "step": 7810 }, { "epoch": 1.3473466574776016, "grad_norm": 25.536258697509766, "learning_rate": 6.725058858664234e-08, "logits/chosen": -2.5017189979553223, "logits/rejected": -2.4743571281433105, "logps/chosen": -144.98760986328125, "logps/rejected": -172.6921844482422, "loss": 0.5926, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.8739396929740906, "rewards/margins": 0.31180593371391296, "rewards/rejected": -1.1857458353042603, "step": 7820 }, { "epoch": 1.3490696071674706, "grad_norm": 29.31256103515625, "learning_rate": 6.715646675376557e-08, "logits/chosen": -2.4239916801452637, "logits/rejected": -2.411015272140503, "logps/chosen": -151.56417846679688, "logps/rejected": -178.06365966796875, "loss": 0.6215, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0077168941497803, "rewards/margins": 0.24843649566173553, "rewards/rejected": -1.2561534643173218, "step": 7830 }, { "epoch": 1.3507925568573398, "grad_norm": 23.53630256652832, "learning_rate": 6.70622759621262e-08, "logits/chosen": -2.4109740257263184, "logits/rejected": -2.394785165786743, "logps/chosen": -150.35752868652344, "logps/rejected": -170.80088806152344, "loss": 0.6364, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.9575635194778442, "rewards/margins": 0.21887817978858948, "rewards/rejected": -1.1764415502548218, "step": 7840 }, { "epoch": 1.3525155065472088, "grad_norm": 20.69610595703125, "learning_rate": 6.6968016590315e-08, "logits/chosen": -2.4047062397003174, "logits/rejected": -2.372786521911621, "logps/chosen": -155.9188232421875, "logps/rejected": -165.7627716064453, "loss": 0.6492, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.9968074560165405, "rewards/margins": 0.1855669915676117, "rewards/rejected": -1.1823744773864746, "step": 7850 }, { "epoch": 1.354238456237078, "grad_norm": 26.409324645996094, "learning_rate": 6.687368901719843e-08, "logits/chosen": -2.4540700912475586, "logits/rejected": -2.4210312366485596, "logps/chosen": -146.4640350341797, "logps/rejected": -176.3026123046875, "loss": 0.5928, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9319847822189331, "rewards/margins": 0.3151303827762604, "rewards/rejected": -1.2471152544021606, "step": 7860 }, { "epoch": 1.355961405926947, "grad_norm": 28.181825637817383, "learning_rate": 6.677929362191708e-08, "logits/chosen": -2.460646629333496, "logits/rejected": -2.4472787380218506, "logps/chosen": -156.83065795898438, "logps/rejected": -175.25631713867188, "loss": 0.6348, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.000558614730835, "rewards/margins": 0.2184421271085739, "rewards/rejected": -1.2190005779266357, "step": 7870 }, { "epoch": 1.3576843556168159, "grad_norm": 34.54668045043945, "learning_rate": 6.668483078388411e-08, "logits/chosen": -2.4971394538879395, "logits/rejected": -2.4821231365203857, "logps/chosen": -148.3890380859375, "logps/rejected": -163.4290771484375, "loss": 0.6518, "rewards/accuracies": 0.625, "rewards/chosen": -0.9440007209777832, "rewards/margins": 0.1740240752696991, "rewards/rejected": -1.1180247068405151, "step": 7880 }, { "epoch": 1.359407305306685, "grad_norm": 30.347705841064453, "learning_rate": 6.659030088278378e-08, "logits/chosen": -2.4705967903137207, "logits/rejected": -2.451388359069824, "logps/chosen": -139.71258544921875, "logps/rejected": -162.09963989257812, "loss": 0.6221, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.8485860824584961, "rewards/margins": 0.22423677146434784, "rewards/rejected": -1.0728228092193604, "step": 7890 }, { "epoch": 1.361130254996554, "grad_norm": 26.510684967041016, "learning_rate": 6.649570429856992e-08, "logits/chosen": -2.4827654361724854, "logits/rejected": -2.4759905338287354, "logps/chosen": -142.5802764892578, "logps/rejected": -162.55567932128906, "loss": 0.6289, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.8945484161376953, "rewards/margins": 0.20854821801185608, "rewards/rejected": -1.103096604347229, "step": 7900 }, { "epoch": 1.3628532046864232, "grad_norm": 23.563236236572266, "learning_rate": 6.640104141146439e-08, "logits/chosen": -2.4833462238311768, "logits/rejected": -2.463135242462158, "logps/chosen": -148.6671142578125, "logps/rejected": -164.3062286376953, "loss": 0.6431, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.9139005541801453, "rewards/margins": 0.19988103210926056, "rewards/rejected": -1.1137816905975342, "step": 7910 }, { "epoch": 1.3645761543762922, "grad_norm": 21.265867233276367, "learning_rate": 6.630631260195548e-08, "logits/chosen": -2.463007688522339, "logits/rejected": -2.4445767402648926, "logps/chosen": -143.3325653076172, "logps/rejected": -161.53256225585938, "loss": 0.6136, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.8710179328918457, "rewards/margins": 0.23537349700927734, "rewards/rejected": -1.106391191482544, "step": 7920 }, { "epoch": 1.3662991040661612, "grad_norm": 27.329938888549805, "learning_rate": 6.621151825079657e-08, "logits/chosen": -2.506049633026123, "logits/rejected": -2.484340190887451, "logps/chosen": -150.452880859375, "logps/rejected": -163.8821258544922, "loss": 0.6313, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.9138700366020203, "rewards/margins": 0.2064877450466156, "rewards/rejected": -1.1203577518463135, "step": 7930 }, { "epoch": 1.3680220537560304, "grad_norm": 34.85219955444336, "learning_rate": 6.611665873900434e-08, "logits/chosen": -2.4367105960845947, "logits/rejected": -2.4192934036254883, "logps/chosen": -151.02890014648438, "logps/rejected": -175.16510009765625, "loss": 0.6079, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9187140464782715, "rewards/margins": 0.28845953941345215, "rewards/rejected": -1.2071735858917236, "step": 7940 }, { "epoch": 1.3697450034458993, "grad_norm": 24.84377670288086, "learning_rate": 6.602173444785747e-08, "logits/chosen": -2.4143567085266113, "logits/rejected": -2.4058680534362793, "logps/chosen": -139.19375610351562, "logps/rejected": -167.15652465820312, "loss": 0.6078, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.8710354566574097, "rewards/margins": 0.2791138291358948, "rewards/rejected": -1.1501493453979492, "step": 7950 }, { "epoch": 1.3714679531357685, "grad_norm": 21.30821990966797, "learning_rate": 6.5926745758895e-08, "logits/chosen": -2.4210612773895264, "logits/rejected": -2.398871898651123, "logps/chosen": -142.59088134765625, "logps/rejected": -163.7798614501953, "loss": 0.6309, "rewards/accuracies": 0.65625, "rewards/chosen": -0.9171046018600464, "rewards/margins": 0.20536521077156067, "rewards/rejected": -1.1224697828292847, "step": 7960 }, { "epoch": 1.3731909028256375, "grad_norm": 21.328529357910156, "learning_rate": 6.583169305391479e-08, "logits/chosen": -2.4894626140594482, "logits/rejected": -2.469831705093384, "logps/chosen": -148.52047729492188, "logps/rejected": -162.56149291992188, "loss": 0.64, "rewards/accuracies": 0.625, "rewards/chosen": -0.9207156896591187, "rewards/margins": 0.18888339400291443, "rewards/rejected": -1.109598994255066, "step": 7970 }, { "epoch": 1.3749138525155065, "grad_norm": 27.226303100585938, "learning_rate": 6.5736576714972e-08, "logits/chosen": -2.518310070037842, "logits/rejected": -2.5061028003692627, "logps/chosen": -148.75880432128906, "logps/rejected": -169.51956176757812, "loss": 0.6238, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.9074954986572266, "rewards/margins": 0.23433642089366913, "rewards/rejected": -1.1418317556381226, "step": 7980 }, { "epoch": 1.3766368022053757, "grad_norm": 23.61194610595703, "learning_rate": 6.564139712437761e-08, "logits/chosen": -2.51324200630188, "logits/rejected": -2.495260715484619, "logps/chosen": -144.06536865234375, "logps/rejected": -171.9273681640625, "loss": 0.6009, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9267575144767761, "rewards/margins": 0.2783433496952057, "rewards/rejected": -1.2051007747650146, "step": 7990 }, { "epoch": 1.3783597518952446, "grad_norm": 30.414278030395508, "learning_rate": 6.554615466469677e-08, "logits/chosen": -2.413956880569458, "logits/rejected": -2.3987700939178467, "logps/chosen": -148.2495574951172, "logps/rejected": -178.20701599121094, "loss": 0.5995, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.9225319027900696, "rewards/margins": 0.308528870344162, "rewards/rejected": -1.2310607433319092, "step": 8000 }, { "epoch": 1.3783597518952446, "eval_logits/chosen": -2.526700496673584, "eval_logits/rejected": -2.5195157527923584, "eval_logps/chosen": -140.2942352294922, "eval_logps/rejected": -158.2948455810547, "eval_loss": 0.6527594923973083, "eval_rewards/accuracies": 0.6006041169166565, "eval_rewards/chosen": -0.8127875924110413, "eval_rewards/margins": 0.14266452193260193, "eval_rewards/rejected": -0.9554521441459656, "eval_runtime": 382.8699, "eval_samples_per_second": 11.241, "eval_steps_per_second": 1.405, "step": 8000 }, { "epoch": 1.3800827015851138, "grad_norm": 30.691783905029297, "learning_rate": 6.545084971874738e-08, "logits/chosen": -2.4314675331115723, "logits/rejected": -2.4045021533966064, "logps/chosen": -157.57028198242188, "logps/rejected": -173.6315155029297, "loss": 0.6301, "rewards/accuracies": 0.625, "rewards/chosen": -0.9951564073562622, "rewards/margins": 0.22977761924266815, "rewards/rejected": -1.2249339818954468, "step": 8010 }, { "epoch": 1.3818056512749828, "grad_norm": 25.586700439453125, "learning_rate": 6.535548266959845e-08, "logits/chosen": -2.4431333541870117, "logits/rejected": -2.4124550819396973, "logps/chosen": -168.57510375976562, "logps/rejected": -186.40036010742188, "loss": 0.6201, "rewards/accuracies": 0.625, "rewards/chosen": -1.0784189701080322, "rewards/margins": 0.24669373035430908, "rewards/rejected": -1.3251125812530518, "step": 8020 }, { "epoch": 1.3835286009648518, "grad_norm": 29.118562698364258, "learning_rate": 6.526005390056863e-08, "logits/chosen": -2.4242732524871826, "logits/rejected": -2.4101269245147705, "logps/chosen": -152.4160919189453, "logps/rejected": -176.9132537841797, "loss": 0.6286, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.0079786777496338, "rewards/margins": 0.2344062775373459, "rewards/rejected": -1.242384910583496, "step": 8030 }, { "epoch": 1.385251550654721, "grad_norm": 25.053556442260742, "learning_rate": 6.516456379522468e-08, "logits/chosen": -2.419296979904175, "logits/rejected": -2.3870930671691895, "logps/chosen": -165.99789428710938, "logps/rejected": -185.56979370117188, "loss": 0.6365, "rewards/accuracies": 0.625, "rewards/chosen": -1.1034375429153442, "rewards/margins": 0.24526703357696533, "rewards/rejected": -1.3487045764923096, "step": 8040 }, { "epoch": 1.38697450034459, "grad_norm": 24.719209671020508, "learning_rate": 6.506901273737985e-08, "logits/chosen": -2.449517011642456, "logits/rejected": -2.43290376663208, "logps/chosen": -154.53453063964844, "logps/rejected": -183.0343780517578, "loss": 0.6069, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.9876915216445923, "rewards/margins": 0.2905445396900177, "rewards/rejected": -1.2782361507415771, "step": 8050 }, { "epoch": 1.388697450034459, "grad_norm": 28.948888778686523, "learning_rate": 6.497340111109239e-08, "logits/chosen": -2.4893579483032227, "logits/rejected": -2.4637856483459473, "logps/chosen": -163.29798889160156, "logps/rejected": -177.02935791015625, "loss": 0.6403, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.068241834640503, "rewards/margins": 0.20585057139396667, "rewards/rejected": -1.274092435836792, "step": 8060 }, { "epoch": 1.390420399724328, "grad_norm": 32.282264709472656, "learning_rate": 6.4877729300664e-08, "logits/chosen": -2.411339521408081, "logits/rejected": -2.3874354362487793, "logps/chosen": -154.8437957763672, "logps/rejected": -171.14035034179688, "loss": 0.6303, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.0067319869995117, "rewards/margins": 0.2154075801372528, "rewards/rejected": -1.222139596939087, "step": 8070 }, { "epoch": 1.392143349414197, "grad_norm": 26.234561920166016, "learning_rate": 6.478199769063833e-08, "logits/chosen": -2.4240946769714355, "logits/rejected": -2.415292501449585, "logps/chosen": -145.56069946289062, "logps/rejected": -180.4043731689453, "loss": 0.5852, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.9170554876327515, "rewards/margins": 0.3155246675014496, "rewards/rejected": -1.2325801849365234, "step": 8080 }, { "epoch": 1.3938662991040662, "grad_norm": 27.18834686279297, "learning_rate": 6.468620666579927e-08, "logits/chosen": -2.421595335006714, "logits/rejected": -2.397613048553467, "logps/chosen": -151.18453979492188, "logps/rejected": -174.41067504882812, "loss": 0.6138, "rewards/accuracies": 0.6875, "rewards/chosen": -0.969436764717102, "rewards/margins": 0.27077406644821167, "rewards/rejected": -1.2402108907699585, "step": 8090 }, { "epoch": 1.3955892487939352, "grad_norm": 22.773412704467773, "learning_rate": 6.459035661116967e-08, "logits/chosen": -2.4691033363342285, "logits/rejected": -2.463841676712036, "logps/chosen": -149.60824584960938, "logps/rejected": -170.5471954345703, "loss": 0.6377, "rewards/accuracies": 0.625, "rewards/chosen": -0.94111168384552, "rewards/margins": 0.19068074226379395, "rewards/rejected": -1.1317923069000244, "step": 8100 }, { "epoch": 1.3973121984838044, "grad_norm": 28.7106990814209, "learning_rate": 6.449444791200956e-08, "logits/chosen": -2.449850559234619, "logits/rejected": -2.4193978309631348, "logps/chosen": -158.18594360351562, "logps/rejected": -175.22109985351562, "loss": 0.6249, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.0139230489730835, "rewards/margins": 0.23840279877185822, "rewards/rejected": -1.2523258924484253, "step": 8110 }, { "epoch": 1.3990351481736734, "grad_norm": 27.975040435791016, "learning_rate": 6.43984809538147e-08, "logits/chosen": -2.441727876663208, "logits/rejected": -2.4223945140838623, "logps/chosen": -155.35897827148438, "logps/rejected": -171.23580932617188, "loss": 0.643, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.9953263401985168, "rewards/margins": 0.19734260439872742, "rewards/rejected": -1.1926690340042114, "step": 8120 }, { "epoch": 1.4007580978635423, "grad_norm": 31.7548770904541, "learning_rate": 6.430245612231501e-08, "logits/chosen": -2.4581613540649414, "logits/rejected": -2.4431262016296387, "logps/chosen": -147.70947265625, "logps/rejected": -166.51431274414062, "loss": 0.6152, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9203088879585266, "rewards/margins": 0.2477300465106964, "rewards/rejected": -1.1680388450622559, "step": 8130 }, { "epoch": 1.4024810475534115, "grad_norm": 30.289751052856445, "learning_rate": 6.420637380347304e-08, "logits/chosen": -2.43922758102417, "logits/rejected": -2.4155101776123047, "logps/chosen": -148.98468017578125, "logps/rejected": -173.9259033203125, "loss": 0.6175, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.952150821685791, "rewards/margins": 0.26444974541664124, "rewards/rejected": -1.2166005373001099, "step": 8140 }, { "epoch": 1.4042039972432805, "grad_norm": 22.625505447387695, "learning_rate": 6.41102343834824e-08, "logits/chosen": -2.4767231941223145, "logits/rejected": -2.456955671310425, "logps/chosen": -148.2836151123047, "logps/rejected": -172.3328857421875, "loss": 0.6314, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9487040638923645, "rewards/margins": 0.2521008551120758, "rewards/rejected": -1.2008049488067627, "step": 8150 }, { "epoch": 1.4059269469331497, "grad_norm": 23.49427032470703, "learning_rate": 6.40140382487662e-08, "logits/chosen": -2.431381940841675, "logits/rejected": -2.41220760345459, "logps/chosen": -149.7254180908203, "logps/rejected": -173.80136108398438, "loss": 0.5963, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.9332088232040405, "rewards/margins": 0.2952547073364258, "rewards/rejected": -1.2284636497497559, "step": 8160 }, { "epoch": 1.4076498966230186, "grad_norm": 28.60329246520996, "learning_rate": 6.391778578597555e-08, "logits/chosen": -2.480720043182373, "logits/rejected": -2.4537298679351807, "logps/chosen": -147.88601684570312, "logps/rejected": -161.3191680908203, "loss": 0.6234, "rewards/accuracies": 0.625, "rewards/chosen": -0.8782271146774292, "rewards/margins": 0.223358154296875, "rewards/rejected": -1.1015852689743042, "step": 8170 }, { "epoch": 1.4093728463128876, "grad_norm": 24.527114868164062, "learning_rate": 6.38214773819879e-08, "logits/chosen": -2.4858410358428955, "logits/rejected": -2.467200756072998, "logps/chosen": -154.21408081054688, "logps/rejected": -171.1106414794922, "loss": 0.6389, "rewards/accuracies": 0.65625, "rewards/chosen": -0.9663408398628235, "rewards/margins": 0.1989414542913437, "rewards/rejected": -1.1652823686599731, "step": 8180 }, { "epoch": 1.4110957960027566, "grad_norm": 28.459131240844727, "learning_rate": 6.37251134239056e-08, "logits/chosen": -2.4083938598632812, "logits/rejected": -2.3872787952423096, "logps/chosen": -158.9376220703125, "logps/rejected": -175.55117797851562, "loss": 0.6371, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.9954689741134644, "rewards/margins": 0.22491955757141113, "rewards/rejected": -1.2203885316848755, "step": 8190 }, { "epoch": 1.4128187456926258, "grad_norm": 33.668880462646484, "learning_rate": 6.362869429905431e-08, "logits/chosen": -2.4564146995544434, "logits/rejected": -2.435072422027588, "logps/chosen": -157.72775268554688, "logps/rejected": -171.80508422851562, "loss": 0.6479, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.018909215927124, "rewards/margins": 0.18646292388439178, "rewards/rejected": -1.2053722143173218, "step": 8200 }, { "epoch": 1.414541695382495, "grad_norm": 23.163822174072266, "learning_rate": 6.353222039498136e-08, "logits/chosen": -2.3753769397735596, "logits/rejected": -2.3579654693603516, "logps/chosen": -150.57046508789062, "logps/rejected": -170.6091766357422, "loss": 0.6411, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.9436851739883423, "rewards/margins": 0.20695781707763672, "rewards/rejected": -1.150642991065979, "step": 8210 }, { "epoch": 1.416264645072364, "grad_norm": 25.262638092041016, "learning_rate": 6.343569209945431e-08, "logits/chosen": -2.460125684738159, "logits/rejected": -2.4342360496520996, "logps/chosen": -138.68099975585938, "logps/rejected": -163.5703582763672, "loss": 0.6106, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.85523521900177, "rewards/margins": 0.2580004334449768, "rewards/rejected": -1.1132357120513916, "step": 8220 }, { "epoch": 1.417987594762233, "grad_norm": 25.81752586364746, "learning_rate": 6.333910980045932e-08, "logits/chosen": -2.445486307144165, "logits/rejected": -2.4349093437194824, "logps/chosen": -149.00405883789062, "logps/rejected": -158.01571655273438, "loss": 0.6639, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.9231271743774414, "rewards/margins": 0.15333291888237, "rewards/rejected": -1.0764600038528442, "step": 8230 }, { "epoch": 1.4197105444521019, "grad_norm": 39.4451904296875, "learning_rate": 6.324247388619967e-08, "logits/chosen": -2.5211682319641113, "logits/rejected": -2.495532512664795, "logps/chosen": -144.02603149414062, "logps/rejected": -161.476318359375, "loss": 0.6187, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.8706077337265015, "rewards/margins": 0.2339431345462799, "rewards/rejected": -1.104550838470459, "step": 8240 }, { "epoch": 1.421433494141971, "grad_norm": 25.24150276184082, "learning_rate": 6.314578474509403e-08, "logits/chosen": -2.475867748260498, "logits/rejected": -2.4594616889953613, "logps/chosen": -144.00283813476562, "logps/rejected": -164.76052856445312, "loss": 0.6127, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.8457595109939575, "rewards/margins": 0.2605190873146057, "rewards/rejected": -1.106278657913208, "step": 8250 }, { "epoch": 1.42315644383184, "grad_norm": 19.108230590820312, "learning_rate": 6.30490427657751e-08, "logits/chosen": -2.5130741596221924, "logits/rejected": -2.495227336883545, "logps/chosen": -150.24356079101562, "logps/rejected": -173.76339721679688, "loss": 0.6116, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.9215757250785828, "rewards/margins": 0.26414090394973755, "rewards/rejected": -1.1857167482376099, "step": 8260 }, { "epoch": 1.4248793935217092, "grad_norm": 26.77340316772461, "learning_rate": 6.295224833708792e-08, "logits/chosen": -2.50467586517334, "logits/rejected": -2.493666887283325, "logps/chosen": -150.25022888183594, "logps/rejected": -172.16513061523438, "loss": 0.6383, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.9621537923812866, "rewards/margins": 0.21912629902362823, "rewards/rejected": -1.1812803745269775, "step": 8270 }, { "epoch": 1.4266023432115782, "grad_norm": 23.759750366210938, "learning_rate": 6.285540184808836e-08, "logits/chosen": -2.434004783630371, "logits/rejected": -2.422375202178955, "logps/chosen": -147.2050323486328, "logps/rejected": -164.6346893310547, "loss": 0.6462, "rewards/accuracies": 0.625, "rewards/chosen": -0.9418224096298218, "rewards/margins": 0.1856471747159958, "rewards/rejected": -1.127469539642334, "step": 8280 }, { "epoch": 1.4283252929014472, "grad_norm": 22.19275665283203, "learning_rate": 6.275850368804156e-08, "logits/chosen": -2.4233717918395996, "logits/rejected": -2.3867719173431396, "logps/chosen": -143.0716094970703, "logps/rejected": -151.45523071289062, "loss": 0.6448, "rewards/accuracies": 0.65625, "rewards/chosen": -0.8239733576774597, "rewards/margins": 0.15603697299957275, "rewards/rejected": -0.9800102114677429, "step": 8290 }, { "epoch": 1.4300482425913164, "grad_norm": 20.175020217895508, "learning_rate": 6.26615542464203e-08, "logits/chosen": -2.5537476539611816, "logits/rejected": -2.5354702472686768, "logps/chosen": -147.383056640625, "logps/rejected": -162.55567932128906, "loss": 0.6372, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.8805820345878601, "rewards/margins": 0.21152591705322266, "rewards/rejected": -1.0921080112457275, "step": 8300 }, { "epoch": 1.4317711922811853, "grad_norm": 21.76934814453125, "learning_rate": 6.256455391290352e-08, "logits/chosen": -2.4161832332611084, "logits/rejected": -2.391357183456421, "logps/chosen": -136.17333984375, "logps/rejected": -149.758544921875, "loss": 0.6289, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.7870678305625916, "rewards/margins": 0.20766834914684296, "rewards/rejected": -0.9947363138198853, "step": 8310 }, { "epoch": 1.4334941419710545, "grad_norm": 28.782739639282227, "learning_rate": 6.246750307737468e-08, "logits/chosen": -2.4312474727630615, "logits/rejected": -2.4197757244110107, "logps/chosen": -135.2398223876953, "logps/rejected": -160.67611694335938, "loss": 0.6091, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.81584632396698, "rewards/margins": 0.25617408752441406, "rewards/rejected": -1.0720202922821045, "step": 8320 }, { "epoch": 1.4352170916609235, "grad_norm": 25.49635887145996, "learning_rate": 6.237040212992028e-08, "logits/chosen": -2.467252016067505, "logits/rejected": -2.4579386711120605, "logps/chosen": -139.45643615722656, "logps/rejected": -163.50772094726562, "loss": 0.6373, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.8992892503738403, "rewards/margins": 0.2082190215587616, "rewards/rejected": -1.1075081825256348, "step": 8330 }, { "epoch": 1.4369400413507925, "grad_norm": 25.415372848510742, "learning_rate": 6.227325146082817e-08, "logits/chosen": -2.511993169784546, "logits/rejected": -2.4966964721679688, "logps/chosen": -140.6236114501953, "logps/rejected": -160.0564422607422, "loss": 0.6209, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.8478447794914246, "rewards/margins": 0.23764362931251526, "rewards/rejected": -1.0854883193969727, "step": 8340 }, { "epoch": 1.4386629910406616, "grad_norm": 22.237586975097656, "learning_rate": 6.217605146058612e-08, "logits/chosen": -2.4000821113586426, "logits/rejected": -2.3829004764556885, "logps/chosen": -143.29495239257812, "logps/rejected": -154.86904907226562, "loss": 0.6592, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.8857830762863159, "rewards/margins": 0.14205804467201233, "rewards/rejected": -1.0278412103652954, "step": 8350 }, { "epoch": 1.4403859407305306, "grad_norm": 32.56266403198242, "learning_rate": 6.207880251988014e-08, "logits/chosen": -2.383216381072998, "logits/rejected": -2.355720281600952, "logps/chosen": -142.70553588867188, "logps/rejected": -159.14981079101562, "loss": 0.6316, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.8659551739692688, "rewards/margins": 0.21056893467903137, "rewards/rejected": -1.0765241384506226, "step": 8360 }, { "epoch": 1.4421088904203998, "grad_norm": 30.852476119995117, "learning_rate": 6.198150502959296e-08, "logits/chosen": -2.438936710357666, "logits/rejected": -2.4242122173309326, "logps/chosen": -138.79904174804688, "logps/rejected": -161.7018280029297, "loss": 0.6295, "rewards/accuracies": 0.625, "rewards/chosen": -0.867357075214386, "rewards/margins": 0.2242121398448944, "rewards/rejected": -1.0915693044662476, "step": 8370 }, { "epoch": 1.4438318401102688, "grad_norm": 25.91733169555664, "learning_rate": 6.188415938080246e-08, "logits/chosen": -2.506133794784546, "logits/rejected": -2.4882261753082275, "logps/chosen": -135.34335327148438, "logps/rejected": -158.02169799804688, "loss": 0.626, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.8259119987487793, "rewards/margins": 0.22101373970508575, "rewards/rejected": -1.0469257831573486, "step": 8380 }, { "epoch": 1.4455547898001377, "grad_norm": 23.551240921020508, "learning_rate": 6.178676596478007e-08, "logits/chosen": -2.497065782546997, "logits/rejected": -2.4661946296691895, "logps/chosen": -137.8984832763672, "logps/rejected": -167.7418670654297, "loss": 0.5869, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8056305050849915, "rewards/margins": 0.318527489900589, "rewards/rejected": -1.1241579055786133, "step": 8390 }, { "epoch": 1.447277739490007, "grad_norm": 35.88808822631836, "learning_rate": 6.168932517298927e-08, "logits/chosen": -2.4765398502349854, "logits/rejected": -2.457763671875, "logps/chosen": -136.0047607421875, "logps/rejected": -160.081787109375, "loss": 0.61, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.8506819009780884, "rewards/margins": 0.24718789756298065, "rewards/rejected": -1.097869873046875, "step": 8400 }, { "epoch": 1.447277739490007, "eval_logits/chosen": -2.5268397331237793, "eval_logits/rejected": -2.51981520652771, "eval_logps/chosen": -132.1185302734375, "eval_logps/rejected": -148.7821044921875, "eval_loss": 0.6540122628211975, "eval_rewards/accuracies": 0.5980483293533325, "eval_rewards/chosen": -0.7310304641723633, "eval_rewards/margins": 0.12929461896419525, "eval_rewards/rejected": -0.8603251576423645, "eval_runtime": 383.2338, "eval_samples_per_second": 11.231, "eval_steps_per_second": 1.404, "step": 8400 }, { "epoch": 1.449000689179876, "grad_norm": 24.67258071899414, "learning_rate": 6.159183739708386e-08, "logits/chosen": -2.4517154693603516, "logits/rejected": -2.423412561416626, "logps/chosen": -147.011962890625, "logps/rejected": -169.0835418701172, "loss": 0.5903, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.8814491033554077, "rewards/margins": 0.30713149905204773, "rewards/rejected": -1.1885805130004883, "step": 8410 }, { "epoch": 1.450723638869745, "grad_norm": 23.347091674804688, "learning_rate": 6.149430302890658e-08, "logits/chosen": -2.3571114540100098, "logits/rejected": -2.3440260887145996, "logps/chosen": -145.65292358398438, "logps/rejected": -164.59690856933594, "loss": 0.6285, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.9319342374801636, "rewards/margins": 0.20812121033668518, "rewards/rejected": -1.140055537223816, "step": 8420 }, { "epoch": 1.452446588559614, "grad_norm": 24.38920021057129, "learning_rate": 6.139672246048741e-08, "logits/chosen": -2.43851900100708, "logits/rejected": -2.4240710735321045, "logps/chosen": -146.89157104492188, "logps/rejected": -171.93997192382812, "loss": 0.6267, "rewards/accuracies": 0.625, "rewards/chosen": -0.9251266717910767, "rewards/margins": 0.2314465492963791, "rewards/rejected": -1.1565730571746826, "step": 8430 }, { "epoch": 1.454169538249483, "grad_norm": 28.91775894165039, "learning_rate": 6.129909608404203e-08, "logits/chosen": -2.4394712448120117, "logits/rejected": -2.426893472671509, "logps/chosen": -156.36746215820312, "logps/rejected": -170.6226043701172, "loss": 0.6387, "rewards/accuracies": 0.65625, "rewards/chosen": -1.0020545721054077, "rewards/margins": 0.2015179693698883, "rewards/rejected": -1.2035726308822632, "step": 8440 }, { "epoch": 1.4558924879393522, "grad_norm": 34.99538040161133, "learning_rate": 6.120142429197024e-08, "logits/chosen": -2.3521504402160645, "logits/rejected": -2.3453006744384766, "logps/chosen": -151.5529327392578, "logps/rejected": -178.8933563232422, "loss": 0.62, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.0283129215240479, "rewards/margins": 0.23831066489219666, "rewards/rejected": -1.266623616218567, "step": 8450 }, { "epoch": 1.4576154376292212, "grad_norm": 30.90590476989746, "learning_rate": 6.110370747685437e-08, "logits/chosen": -2.4394407272338867, "logits/rejected": -2.4174625873565674, "logps/chosen": -161.3643035888672, "logps/rejected": -182.9066925048828, "loss": 0.6383, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.0784192085266113, "rewards/margins": 0.22087764739990234, "rewards/rejected": -1.2992968559265137, "step": 8460 }, { "epoch": 1.4593383873190904, "grad_norm": 27.465330123901367, "learning_rate": 6.100594603145774e-08, "logits/chosen": -2.446089267730713, "logits/rejected": -2.4204440116882324, "logps/chosen": -157.5586395263672, "logps/rejected": -176.43116760253906, "loss": 0.6287, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.0111454725265503, "rewards/margins": 0.23245224356651306, "rewards/rejected": -1.2435976266860962, "step": 8470 }, { "epoch": 1.4610613370089593, "grad_norm": 33.98664474487305, "learning_rate": 6.090814034872306e-08, "logits/chosen": -2.4160518646240234, "logits/rejected": -2.3934266567230225, "logps/chosen": -155.5721435546875, "logps/rejected": -176.04550170898438, "loss": 0.6278, "rewards/accuracies": 0.65625, "rewards/chosen": -1.0139497518539429, "rewards/margins": 0.23836950957775116, "rewards/rejected": -1.2523192167282104, "step": 8480 }, { "epoch": 1.4627842866988283, "grad_norm": 24.236244201660156, "learning_rate": 6.08102908217708e-08, "logits/chosen": -2.4765357971191406, "logits/rejected": -2.4670848846435547, "logps/chosen": -147.55117797851562, "logps/rejected": -174.74295043945312, "loss": 0.6063, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9235634803771973, "rewards/margins": 0.2622116208076477, "rewards/rejected": -1.1857750415802002, "step": 8490 }, { "epoch": 1.4645072363886975, "grad_norm": 36.001224517822266, "learning_rate": 6.071239784389773e-08, "logits/chosen": -2.3951425552368164, "logits/rejected": -2.380840301513672, "logps/chosen": -148.68881225585938, "logps/rejected": -171.8776397705078, "loss": 0.6088, "rewards/accuracies": 0.65625, "rewards/chosen": -0.9332743883132935, "rewards/margins": 0.2613178789615631, "rewards/rejected": -1.1945923566818237, "step": 8500 }, { "epoch": 1.4662301860785665, "grad_norm": 30.147855758666992, "learning_rate": 6.061446180857521e-08, "logits/chosen": -2.414111852645874, "logits/rejected": -2.3871891498565674, "logps/chosen": -158.39291381835938, "logps/rejected": -178.720947265625, "loss": 0.6044, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.9931279420852661, "rewards/margins": 0.28494516015052795, "rewards/rejected": -1.2780730724334717, "step": 8510 }, { "epoch": 1.4679531357684357, "grad_norm": 24.601787567138672, "learning_rate": 6.051648310944766e-08, "logits/chosen": -2.3954672813415527, "logits/rejected": -2.3746702671051025, "logps/chosen": -156.74166870117188, "logps/rejected": -172.85977172851562, "loss": 0.6243, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.0202082395553589, "rewards/margins": 0.22919857501983643, "rewards/rejected": -1.2494069337844849, "step": 8520 }, { "epoch": 1.4696760854583046, "grad_norm": 26.90444564819336, "learning_rate": 6.041846214033103e-08, "logits/chosen": -2.386838912963867, "logits/rejected": -2.369894504547119, "logps/chosen": -155.7630157470703, "logps/rejected": -168.96263122558594, "loss": 0.6497, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.0040968656539917, "rewards/margins": 0.18276354670524597, "rewards/rejected": -1.18686044216156, "step": 8530 }, { "epoch": 1.4713990351481736, "grad_norm": 25.653934478759766, "learning_rate": 6.032039929521118e-08, "logits/chosen": -2.5071861743927, "logits/rejected": -2.493710994720459, "logps/chosen": -149.86094665527344, "logps/rejected": -161.53927612304688, "loss": 0.6623, "rewards/accuracies": 0.5625, "rewards/chosen": -0.9416524171829224, "rewards/margins": 0.13262325525283813, "rewards/rejected": -1.0742757320404053, "step": 8540 }, { "epoch": 1.4731219848380428, "grad_norm": 25.425033569335938, "learning_rate": 6.02222949682422e-08, "logits/chosen": -2.4378743171691895, "logits/rejected": -2.4243133068084717, "logps/chosen": -144.62643432617188, "logps/rejected": -172.61434936523438, "loss": 0.5975, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.8925802111625671, "rewards/margins": 0.2852075695991516, "rewards/rejected": -1.1777875423431396, "step": 8550 }, { "epoch": 1.4748449345279118, "grad_norm": 30.290773391723633, "learning_rate": 6.0124149553745e-08, "logits/chosen": -2.5087571144104004, "logits/rejected": -2.484095335006714, "logps/chosen": -148.76278686523438, "logps/rejected": -177.43338012695312, "loss": 0.5976, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9393836259841919, "rewards/margins": 0.3176620304584503, "rewards/rejected": -1.2570455074310303, "step": 8560 }, { "epoch": 1.476567884217781, "grad_norm": 29.89841651916504, "learning_rate": 6.002596344620556e-08, "logits/chosen": -2.3959784507751465, "logits/rejected": -2.3787484169006348, "logps/chosen": -155.08749389648438, "logps/rejected": -177.78707885742188, "loss": 0.6126, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.0001827478408813, "rewards/margins": 0.2656208276748657, "rewards/rejected": -1.265803575515747, "step": 8570 }, { "epoch": 1.47829083390765, "grad_norm": 24.696767807006836, "learning_rate": 5.992773704027354e-08, "logits/chosen": -2.453443765640259, "logits/rejected": -2.430490493774414, "logps/chosen": -158.04920959472656, "logps/rejected": -188.0573272705078, "loss": 0.5873, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.02683424949646, "rewards/margins": 0.31399792432785034, "rewards/rejected": -1.340832233428955, "step": 8580 }, { "epoch": 1.480013783597519, "grad_norm": 28.04570198059082, "learning_rate": 5.982947073076041e-08, "logits/chosen": -2.449530839920044, "logits/rejected": -2.4251859188079834, "logps/chosen": -159.4496612548828, "logps/rejected": -177.74026489257812, "loss": 0.6136, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.008068323135376, "rewards/margins": 0.24308781325817108, "rewards/rejected": -1.251155972480774, "step": 8590 }, { "epoch": 1.481736733287388, "grad_norm": 28.41834831237793, "learning_rate": 5.973116491263818e-08, "logits/chosen": -2.4075114727020264, "logits/rejected": -2.3850674629211426, "logps/chosen": -160.99905395507812, "logps/rejected": -175.75933837890625, "loss": 0.6547, "rewards/accuracies": 0.625, "rewards/chosen": -1.0687962770462036, "rewards/margins": 0.17903414368629456, "rewards/rejected": -1.2478303909301758, "step": 8600 }, { "epoch": 1.483459682977257, "grad_norm": 33.26375961303711, "learning_rate": 5.963281998103759e-08, "logits/chosen": -2.420893907546997, "logits/rejected": -2.4020278453826904, "logps/chosen": -160.13504028320312, "logps/rejected": -179.3316650390625, "loss": 0.6322, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.0473644733428955, "rewards/margins": 0.2434740513563156, "rewards/rejected": -1.290838599205017, "step": 8610 }, { "epoch": 1.4851826326671262, "grad_norm": 24.831239700317383, "learning_rate": 5.953443633124658e-08, "logits/chosen": -2.371798038482666, "logits/rejected": -2.3636550903320312, "logps/chosen": -154.66403198242188, "logps/rejected": -166.26895141601562, "loss": 0.6485, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.9878658056259155, "rewards/margins": 0.17269843816757202, "rewards/rejected": -1.1605643033981323, "step": 8620 }, { "epoch": 1.4869055823569952, "grad_norm": 30.51129913330078, "learning_rate": 5.9436014358708787e-08, "logits/chosen": -2.361635208129883, "logits/rejected": -2.3477437496185303, "logps/chosen": -143.97293090820312, "logps/rejected": -171.8150177001953, "loss": 0.5939, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.9073087573051453, "rewards/margins": 0.29389339685440063, "rewards/rejected": -1.2012020349502563, "step": 8630 }, { "epoch": 1.4886285320468642, "grad_norm": 25.27128028869629, "learning_rate": 5.933755445902177e-08, "logits/chosen": -2.4640278816223145, "logits/rejected": -2.4400534629821777, "logps/chosen": -155.98472595214844, "logps/rejected": -171.5356903076172, "loss": 0.6424, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.9910284876823425, "rewards/margins": 0.20574815571308136, "rewards/rejected": -1.1967766284942627, "step": 8640 }, { "epoch": 1.4903514817367332, "grad_norm": 30.846521377563477, "learning_rate": 5.9239057027935637e-08, "logits/chosen": -2.397857666015625, "logits/rejected": -2.375941753387451, "logps/chosen": -155.69415283203125, "logps/rejected": -176.6514434814453, "loss": 0.624, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.9969285130500793, "rewards/margins": 0.2363603562116623, "rewards/rejected": -1.2332890033721924, "step": 8650 }, { "epoch": 1.4920744314266023, "grad_norm": 40.82417297363281, "learning_rate": 5.914052246135127e-08, "logits/chosen": -2.400925874710083, "logits/rejected": -2.382791757583618, "logps/chosen": -153.85623168945312, "logps/rejected": -180.74575805664062, "loss": 0.6067, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.9923663139343262, "rewards/margins": 0.30433157086372375, "rewards/rejected": -1.2966978549957275, "step": 8660 }, { "epoch": 1.4937973811164715, "grad_norm": 28.784740447998047, "learning_rate": 5.904195115531892e-08, "logits/chosen": -2.4620003700256348, "logits/rejected": -2.441227436065674, "logps/chosen": -160.8101806640625, "logps/rejected": -186.73927307128906, "loss": 0.6119, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.079491376876831, "rewards/margins": 0.26982641220092773, "rewards/rejected": -1.3493177890777588, "step": 8670 }, { "epoch": 1.4955203308063405, "grad_norm": 33.21119689941406, "learning_rate": 5.894334350603637e-08, "logits/chosen": -2.3955516815185547, "logits/rejected": -2.391489028930664, "logps/chosen": -158.9721221923828, "logps/rejected": -177.6050262451172, "loss": 0.6511, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.0687752962112427, "rewards/margins": 0.17915226519107819, "rewards/rejected": -1.2479274272918701, "step": 8680 }, { "epoch": 1.4972432804962095, "grad_norm": 26.478548049926758, "learning_rate": 5.8844699909847576e-08, "logits/chosen": -2.412294626235962, "logits/rejected": -2.3877341747283936, "logps/chosen": -159.53952026367188, "logps/rejected": -169.1040802001953, "loss": 0.6595, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.0341455936431885, "rewards/margins": 0.18306097388267517, "rewards/rejected": -1.2172067165374756, "step": 8690 }, { "epoch": 1.4989662301860784, "grad_norm": 28.747560501098633, "learning_rate": 5.8746020763240956e-08, "logits/chosen": -2.4410312175750732, "logits/rejected": -2.415844440460205, "logps/chosen": -154.33767700195312, "logps/rejected": -167.69979858398438, "loss": 0.6399, "rewards/accuracies": 0.625, "rewards/chosen": -0.9719335436820984, "rewards/margins": 0.1852661371231079, "rewards/rejected": -1.1571996212005615, "step": 8700 }, { "epoch": 1.5006891798759476, "grad_norm": 23.79633331298828, "learning_rate": 5.8647306462847814e-08, "logits/chosen": -2.3959717750549316, "logits/rejected": -2.377711772918701, "logps/chosen": -151.95407104492188, "logps/rejected": -167.41001892089844, "loss": 0.6573, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.9795981645584106, "rewards/margins": 0.16346515715122223, "rewards/rejected": -1.1430633068084717, "step": 8710 }, { "epoch": 1.5024121295658168, "grad_norm": 25.051179885864258, "learning_rate": 5.854855740544078e-08, "logits/chosen": -2.4349405765533447, "logits/rejected": -2.408961772918701, "logps/chosen": -142.64926147460938, "logps/rejected": -165.88307189941406, "loss": 0.6049, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.8556321859359741, "rewards/margins": 0.26391908526420593, "rewards/rejected": -1.1195513010025024, "step": 8720 }, { "epoch": 1.5041350792556858, "grad_norm": 22.280866622924805, "learning_rate": 5.844977398793211e-08, "logits/chosen": -2.453397274017334, "logits/rejected": -2.421761989593506, "logps/chosen": -145.08370971679688, "logps/rejected": -171.266845703125, "loss": 0.6036, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.9160129427909851, "rewards/margins": 0.28230738639831543, "rewards/rejected": -1.1983201503753662, "step": 8730 }, { "epoch": 1.5058580289455548, "grad_norm": 29.50322151184082, "learning_rate": 5.8350956607372284e-08, "logits/chosen": -2.4410319328308105, "logits/rejected": -2.433777093887329, "logps/chosen": -153.44320678710938, "logps/rejected": -175.6660614013672, "loss": 0.6224, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.0005269050598145, "rewards/margins": 0.23446373641490936, "rewards/rejected": -1.2349905967712402, "step": 8740 }, { "epoch": 1.5075809786354237, "grad_norm": 27.271137237548828, "learning_rate": 5.825210566094817e-08, "logits/chosen": -2.4460768699645996, "logits/rejected": -2.424647808074951, "logps/chosen": -148.30789184570312, "logps/rejected": -178.0007781982422, "loss": 0.5884, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.9374672770500183, "rewards/margins": 0.32129615545272827, "rewards/rejected": -1.2587635517120361, "step": 8750 }, { "epoch": 1.509303928325293, "grad_norm": 23.55537986755371, "learning_rate": 5.8153221545981634e-08, "logits/chosen": -2.4017863273620605, "logits/rejected": -2.393065929412842, "logps/chosen": -151.10546875, "logps/rejected": -180.36007690429688, "loss": 0.6065, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.9841243624687195, "rewards/margins": 0.27674761414527893, "rewards/rejected": -1.2608718872070312, "step": 8760 }, { "epoch": 1.5110268780151621, "grad_norm": 24.040985107421875, "learning_rate": 5.805430465992783e-08, "logits/chosen": -2.4128012657165527, "logits/rejected": -2.393354892730713, "logps/chosen": -163.99383544921875, "logps/rejected": -184.14706420898438, "loss": 0.618, "rewards/accuracies": 0.6875, "rewards/chosen": -1.081097960472107, "rewards/margins": 0.2662922739982605, "rewards/rejected": -1.3473902940750122, "step": 8770 }, { "epoch": 1.512749827705031, "grad_norm": 30.036476135253906, "learning_rate": 5.795535540037364e-08, "logits/chosen": -2.4553608894348145, "logits/rejected": -2.4505133628845215, "logps/chosen": -163.72341918945312, "logps/rejected": -187.991943359375, "loss": 0.6352, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.1354390382766724, "rewards/margins": 0.2194855660200119, "rewards/rejected": -1.3549244403839111, "step": 8780 }, { "epoch": 1.5144727773949, "grad_norm": 39.66596221923828, "learning_rate": 5.785637416503607e-08, "logits/chosen": -2.444953680038452, "logits/rejected": -2.420752763748169, "logps/chosen": -165.5264434814453, "logps/rejected": -186.53871154785156, "loss": 0.6197, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.0931392908096313, "rewards/margins": 0.26103657484054565, "rewards/rejected": -1.3541758060455322, "step": 8790 }, { "epoch": 1.516195727084769, "grad_norm": 29.25849151611328, "learning_rate": 5.7757361351760625e-08, "logits/chosen": -2.4005560874938965, "logits/rejected": -2.3760859966278076, "logps/chosen": -162.47555541992188, "logps/rejected": -174.39950561523438, "loss": 0.6575, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.061780571937561, "rewards/margins": 0.16641394793987274, "rewards/rejected": -1.2281947135925293, "step": 8800 }, { "epoch": 1.516195727084769, "eval_logits/chosen": -2.5021514892578125, "eval_logits/rejected": -2.4947457313537598, "eval_logps/chosen": -142.70245361328125, "eval_logps/rejected": -160.39002990722656, "eval_loss": 0.6526638269424438, "eval_rewards/accuracies": 0.5996747016906738, "eval_rewards/chosen": -0.8368697166442871, "eval_rewards/margins": 0.1395346075296402, "eval_rewards/rejected": -0.9764042496681213, "eval_runtime": 383.021, "eval_samples_per_second": 11.237, "eval_steps_per_second": 1.405, "step": 8800 }, { "epoch": 1.5179186767746382, "grad_norm": 36.97319412231445, "learning_rate": 5.765831735851978e-08, "logits/chosen": -2.452981472015381, "logits/rejected": -2.4250099658966064, "logps/chosen": -153.0067596435547, "logps/rejected": -181.1830291748047, "loss": 0.6056, "rewards/accuracies": 0.71875, "rewards/chosen": -1.0083194971084595, "rewards/margins": 0.28701165318489075, "rewards/rejected": -1.2953310012817383, "step": 8810 }, { "epoch": 1.5196416264645074, "grad_norm": 29.794078826904297, "learning_rate": 5.7559242583411284e-08, "logits/chosen": -2.4661405086517334, "logits/rejected": -2.444133996963501, "logps/chosen": -151.85812377929688, "logps/rejected": -175.8976593017578, "loss": 0.6124, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.002595067024231, "rewards/margins": 0.26715952157974243, "rewards/rejected": -1.2697546482086182, "step": 8820 }, { "epoch": 1.5213645761543764, "grad_norm": 31.617746353149414, "learning_rate": 5.746013742465665e-08, "logits/chosen": -2.327963352203369, "logits/rejected": -2.3066611289978027, "logps/chosen": -159.5944061279297, "logps/rejected": -183.40618896484375, "loss": 0.6145, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.0419352054595947, "rewards/margins": 0.2699545621871948, "rewards/rejected": -1.3118897676467896, "step": 8830 }, { "epoch": 1.5230875258442453, "grad_norm": 26.673795700073242, "learning_rate": 5.7361002280599503e-08, "logits/chosen": -2.377643585205078, "logits/rejected": -2.3630967140197754, "logps/chosen": -148.97561645507812, "logps/rejected": -180.88720703125, "loss": 0.5877, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.9541751146316528, "rewards/margins": 0.32628554105758667, "rewards/rejected": -1.2804607152938843, "step": 8840 }, { "epoch": 1.5248104755341143, "grad_norm": 42.92271041870117, "learning_rate": 5.726183754970397e-08, "logits/chosen": -2.470742702484131, "logits/rejected": -2.450875759124756, "logps/chosen": -151.8878936767578, "logps/rejected": -185.0004119873047, "loss": 0.5973, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.9842556715011597, "rewards/margins": 0.3271600902080536, "rewards/rejected": -1.3114157915115356, "step": 8850 }, { "epoch": 1.5265334252239835, "grad_norm": 32.4549446105957, "learning_rate": 5.716264363055314e-08, "logits/chosen": -2.3918213844299316, "logits/rejected": -2.3717854022979736, "logps/chosen": -167.3634490966797, "logps/rejected": -192.9376678466797, "loss": 0.6065, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.1311490535736084, "rewards/margins": 0.2796618938446045, "rewards/rejected": -1.4108108282089233, "step": 8860 }, { "epoch": 1.5282563749138525, "grad_norm": 33.520606994628906, "learning_rate": 5.706342092184739e-08, "logits/chosen": -2.502729892730713, "logits/rejected": -2.4715514183044434, "logps/chosen": -169.19027709960938, "logps/rejected": -197.01089477539062, "loss": 0.5957, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.112261414527893, "rewards/margins": 0.3132651150226593, "rewards/rejected": -1.42552649974823, "step": 8870 }, { "epoch": 1.5299793246037217, "grad_norm": 40.5619010925293, "learning_rate": 5.696416982240282e-08, "logits/chosen": -2.336522102355957, "logits/rejected": -2.3161420822143555, "logps/chosen": -182.067626953125, "logps/rejected": -199.307861328125, "loss": 0.6547, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.2775065898895264, "rewards/margins": 0.2100718468427658, "rewards/rejected": -1.4875786304473877, "step": 8880 }, { "epoch": 1.5317022742935906, "grad_norm": 48.231964111328125, "learning_rate": 5.686489073114965e-08, "logits/chosen": -2.3405094146728516, "logits/rejected": -2.315372943878174, "logps/chosen": -174.9233856201172, "logps/rejected": -196.65234375, "loss": 0.6183, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.1903566122055054, "rewards/margins": 0.2761090397834778, "rewards/rejected": -1.4664658308029175, "step": 8890 }, { "epoch": 1.5334252239834596, "grad_norm": 35.84044647216797, "learning_rate": 5.676558404713061e-08, "logits/chosen": -2.4168288707733154, "logits/rejected": -2.3919520378112793, "logps/chosen": -173.74485778808594, "logps/rejected": -192.24745178222656, "loss": 0.6406, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.1773998737335205, "rewards/margins": 0.2269444763660431, "rewards/rejected": -1.4043442010879517, "step": 8900 }, { "epoch": 1.5351481736733288, "grad_norm": 30.220081329345703, "learning_rate": 5.666625016949933e-08, "logits/chosen": -2.4153027534484863, "logits/rejected": -2.4012513160705566, "logps/chosen": -168.22994995117188, "logps/rejected": -187.8097686767578, "loss": 0.6312, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.1117278337478638, "rewards/margins": 0.2432313859462738, "rewards/rejected": -1.35495924949646, "step": 8910 }, { "epoch": 1.5368711233631978, "grad_norm": 28.499393463134766, "learning_rate": 5.656688949751875e-08, "logits/chosen": -2.4883599281311035, "logits/rejected": -2.4606895446777344, "logps/chosen": -161.80191040039062, "logps/rejected": -186.3115692138672, "loss": 0.5991, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.0300297737121582, "rewards/margins": 0.3197111189365387, "rewards/rejected": -1.349740982055664, "step": 8920 }, { "epoch": 1.538594073053067, "grad_norm": 25.9266414642334, "learning_rate": 5.64675024305595e-08, "logits/chosen": -2.4324049949645996, "logits/rejected": -2.4044463634490967, "logps/chosen": -152.42242431640625, "logps/rejected": -172.3201904296875, "loss": 0.6181, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.9726540446281433, "rewards/margins": 0.25222453474998474, "rewards/rejected": -1.2248785495758057, "step": 8930 }, { "epoch": 1.540317022742936, "grad_norm": 25.37604331970215, "learning_rate": 5.6368089368098315e-08, "logits/chosen": -2.440627098083496, "logits/rejected": -2.4221489429473877, "logps/chosen": -148.80087280273438, "logps/rejected": -168.09170532226562, "loss": 0.6287, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.9170831441879272, "rewards/margins": 0.21031467616558075, "rewards/rejected": -1.1273977756500244, "step": 8940 }, { "epoch": 1.5420399724328049, "grad_norm": 28.829172134399414, "learning_rate": 5.626865070971638e-08, "logits/chosen": -2.376267910003662, "logits/rejected": -2.3775124549865723, "logps/chosen": -148.6018829345703, "logps/rejected": -168.5276336669922, "loss": 0.6395, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.9364569783210754, "rewards/margins": 0.1876915991306305, "rewards/rejected": -1.1241486072540283, "step": 8950 }, { "epoch": 1.5437629221226739, "grad_norm": 44.60697555541992, "learning_rate": 5.616918685509783e-08, "logits/chosen": -2.4391655921936035, "logits/rejected": -2.409994125366211, "logps/chosen": -166.36790466308594, "logps/rejected": -190.92630004882812, "loss": 0.6093, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.1098852157592773, "rewards/margins": 0.2932312488555908, "rewards/rejected": -1.4031165838241577, "step": 8960 }, { "epoch": 1.545485871812543, "grad_norm": 42.026702880859375, "learning_rate": 5.606969820402797e-08, "logits/chosen": -2.4033069610595703, "logits/rejected": -2.3744869232177734, "logps/chosen": -165.89268493652344, "logps/rejected": -186.732177734375, "loss": 0.6134, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1160341501235962, "rewards/margins": 0.2519269287586212, "rewards/rejected": -1.3679611682891846, "step": 8970 }, { "epoch": 1.5472088215024122, "grad_norm": 30.750925064086914, "learning_rate": 5.597018515639189e-08, "logits/chosen": -2.467057943344116, "logits/rejected": -2.4468226432800293, "logps/chosen": -165.82440185546875, "logps/rejected": -179.36331176757812, "loss": 0.6693, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.1067142486572266, "rewards/margins": 0.16217520833015442, "rewards/rejected": -1.2688895463943481, "step": 8980 }, { "epoch": 1.5489317711922812, "grad_norm": 28.24340057373047, "learning_rate": 5.587064811217266e-08, "logits/chosen": -2.3896968364715576, "logits/rejected": -2.3712821006774902, "logps/chosen": -153.08383178710938, "logps/rejected": -173.04818725585938, "loss": 0.616, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.9606486558914185, "rewards/margins": 0.23374095559120178, "rewards/rejected": -1.1943897008895874, "step": 8990 }, { "epoch": 1.5506547208821502, "grad_norm": 34.09727096557617, "learning_rate": 5.577108747144983e-08, "logits/chosen": -2.448643922805786, "logits/rejected": -2.426971197128296, "logps/chosen": -164.97779846191406, "logps/rejected": -180.38192749023438, "loss": 0.6403, "rewards/accuracies": 0.59375, "rewards/chosen": -1.0645620822906494, "rewards/margins": 0.22409923374652863, "rewards/rejected": -1.288661241531372, "step": 9000 }, { "epoch": 1.5523776705720191, "grad_norm": 20.49359893798828, "learning_rate": 5.567150363439779e-08, "logits/chosen": -2.418567657470703, "logits/rejected": -2.398942708969116, "logps/chosen": -156.52621459960938, "logps/rejected": -174.5842742919922, "loss": 0.6264, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.9978251457214355, "rewards/margins": 0.2272067368030548, "rewards/rejected": -1.225031852722168, "step": 9010 }, { "epoch": 1.5541006202618883, "grad_norm": 32.422637939453125, "learning_rate": 5.557189700128414e-08, "logits/chosen": -2.356786012649536, "logits/rejected": -2.342362880706787, "logps/chosen": -153.6685791015625, "logps/rejected": -175.55043029785156, "loss": 0.6103, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9793645739555359, "rewards/margins": 0.2766626179218292, "rewards/rejected": -1.256027340888977, "step": 9020 }, { "epoch": 1.5558235699517575, "grad_norm": 34.36164855957031, "learning_rate": 5.547226797246817e-08, "logits/chosen": -2.3909685611724854, "logits/rejected": -2.3950893878936768, "logps/chosen": -148.6129913330078, "logps/rejected": -172.05209350585938, "loss": 0.6279, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.9618427157402039, "rewards/margins": 0.23005004227161407, "rewards/rejected": -1.1918928623199463, "step": 9030 }, { "epoch": 1.5575465196416265, "grad_norm": 25.065876007080078, "learning_rate": 5.53726169483991e-08, "logits/chosen": -2.3976728916168213, "logits/rejected": -2.382448673248291, "logps/chosen": -152.49664306640625, "logps/rejected": -173.43907165527344, "loss": 0.6379, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.011561632156372, "rewards/margins": 0.20885252952575684, "rewards/rejected": -1.2204139232635498, "step": 9040 }, { "epoch": 1.5592694693314955, "grad_norm": 29.87236785888672, "learning_rate": 5.5272944329614656e-08, "logits/chosen": -2.448866367340088, "logits/rejected": -2.4280974864959717, "logps/chosen": -157.89830017089844, "logps/rejected": -178.96958923339844, "loss": 0.6355, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.0388907194137573, "rewards/margins": 0.2381664514541626, "rewards/rejected": -1.27705717086792, "step": 9050 }, { "epoch": 1.5609924190213644, "grad_norm": 28.082740783691406, "learning_rate": 5.517325051673928e-08, "logits/chosen": -2.4414734840393066, "logits/rejected": -2.423208713531494, "logps/chosen": -156.8459014892578, "logps/rejected": -171.1651153564453, "loss": 0.6481, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.0130650997161865, "rewards/margins": 0.17299702763557434, "rewards/rejected": -1.186062216758728, "step": 9060 }, { "epoch": 1.5627153687112336, "grad_norm": 30.63709259033203, "learning_rate": 5.5073535910482625e-08, "logits/chosen": -2.4263854026794434, "logits/rejected": -2.407336711883545, "logps/chosen": -144.11001586914062, "logps/rejected": -172.8216094970703, "loss": 0.5891, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.8767523765563965, "rewards/margins": 0.31083789467811584, "rewards/rejected": -1.18759024143219, "step": 9070 }, { "epoch": 1.5644383184011028, "grad_norm": 28.939895629882812, "learning_rate": 5.4973800911637966e-08, "logits/chosen": -2.4199304580688477, "logits/rejected": -2.4097867012023926, "logps/chosen": -144.50460815429688, "logps/rejected": -168.3485565185547, "loss": 0.6292, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9143769145011902, "rewards/margins": 0.2185221016407013, "rewards/rejected": -1.1328990459442139, "step": 9080 }, { "epoch": 1.5661612680909718, "grad_norm": 26.135000228881836, "learning_rate": 5.487404592108047e-08, "logits/chosen": -2.395542621612549, "logits/rejected": -2.3650925159454346, "logps/chosen": -153.5354461669922, "logps/rejected": -171.31285095214844, "loss": 0.6163, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9741457104682922, "rewards/margins": 0.23896925151348114, "rewards/rejected": -1.2131149768829346, "step": 9090 }, { "epoch": 1.5678842177808407, "grad_norm": 21.869953155517578, "learning_rate": 5.477427133976573e-08, "logits/chosen": -2.4462099075317383, "logits/rejected": -2.4201271533966064, "logps/chosen": -160.56097412109375, "logps/rejected": -169.7161865234375, "loss": 0.665, "rewards/accuracies": 0.625, "rewards/chosen": -1.0276155471801758, "rewards/margins": 0.1408327966928482, "rewards/rejected": -1.1684482097625732, "step": 9100 }, { "epoch": 1.5696071674707097, "grad_norm": 23.752208709716797, "learning_rate": 5.467447756872802e-08, "logits/chosen": -2.414022922515869, "logits/rejected": -2.388249158859253, "logps/chosen": -151.4723663330078, "logps/rejected": -174.31192016601562, "loss": 0.6124, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.947963535785675, "rewards/margins": 0.27398091554641724, "rewards/rejected": -1.2219444513320923, "step": 9110 }, { "epoch": 1.571330117160579, "grad_norm": 24.905351638793945, "learning_rate": 5.457466500907877e-08, "logits/chosen": -2.452876091003418, "logits/rejected": -2.429744005203247, "logps/chosen": -157.91603088378906, "logps/rejected": -170.0849609375, "loss": 0.6383, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.9618592262268066, "rewards/margins": 0.18914249539375305, "rewards/rejected": -1.1510016918182373, "step": 9120 }, { "epoch": 1.573053066850448, "grad_norm": 26.952442169189453, "learning_rate": 5.447483406200496e-08, "logits/chosen": -2.4045395851135254, "logits/rejected": -2.3840603828430176, "logps/chosen": -156.55654907226562, "logps/rejected": -176.99368286132812, "loss": 0.6282, "rewards/accuracies": 0.65625, "rewards/chosen": -1.0117504596710205, "rewards/margins": 0.2167857438325882, "rewards/rejected": -1.2285362482070923, "step": 9130 }, { "epoch": 1.574776016540317, "grad_norm": 26.6584529876709, "learning_rate": 5.437498512876741e-08, "logits/chosen": -2.44810152053833, "logits/rejected": -2.4075706005096436, "logps/chosen": -158.2299041748047, "logps/rejected": -178.40318298339844, "loss": 0.5943, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.9947940111160278, "rewards/margins": 0.30434533953666687, "rewards/rejected": -1.2991392612457275, "step": 9140 }, { "epoch": 1.576498966230186, "grad_norm": 35.36668395996094, "learning_rate": 5.427511861069932e-08, "logits/chosen": -2.4356255531311035, "logits/rejected": -2.407078266143799, "logps/chosen": -166.36285400390625, "logps/rejected": -191.15957641601562, "loss": 0.6044, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0679361820220947, "rewards/margins": 0.30740636587142944, "rewards/rejected": -1.375342607498169, "step": 9150 }, { "epoch": 1.578221915920055, "grad_norm": 30.753293991088867, "learning_rate": 5.417523490920448e-08, "logits/chosen": -2.404432773590088, "logits/rejected": -2.399017095565796, "logps/chosen": -153.8131103515625, "logps/rejected": -181.61129760742188, "loss": 0.6141, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.0386629104614258, "rewards/margins": 0.2596225142478943, "rewards/rejected": -1.2982854843139648, "step": 9160 }, { "epoch": 1.5799448656099242, "grad_norm": 24.764421463012695, "learning_rate": 5.4075334425755824e-08, "logits/chosen": -2.4555180072784424, "logits/rejected": -2.4226462841033936, "logps/chosen": -161.15907287597656, "logps/rejected": -186.55917358398438, "loss": 0.5987, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.055022954940796, "rewards/margins": 0.32597872614860535, "rewards/rejected": -1.3810017108917236, "step": 9170 }, { "epoch": 1.5816678152997934, "grad_norm": 23.54519271850586, "learning_rate": 5.397541756189369e-08, "logits/chosen": -2.4232265949249268, "logits/rejected": -2.4142425060272217, "logps/chosen": -166.00881958007812, "logps/rejected": -180.232177734375, "loss": 0.6474, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.0988941192626953, "rewards/margins": 0.18343313038349152, "rewards/rejected": -1.2823272943496704, "step": 9180 }, { "epoch": 1.5833907649896624, "grad_norm": 22.17339515686035, "learning_rate": 5.387548471922425e-08, "logits/chosen": -2.509873628616333, "logits/rejected": -2.5046327114105225, "logps/chosen": -158.27267456054688, "logps/rejected": -186.60572814941406, "loss": 0.5993, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0305545330047607, "rewards/margins": 0.27656883001327515, "rewards/rejected": -1.3071234226226807, "step": 9190 }, { "epoch": 1.5851137146795313, "grad_norm": 32.23719024658203, "learning_rate": 5.3775536299417957e-08, "logits/chosen": -2.4358341693878174, "logits/rejected": -2.418269395828247, "logps/chosen": -163.1688995361328, "logps/rejected": -189.51535034179688, "loss": 0.5969, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0882149934768677, "rewards/margins": 0.301506370306015, "rewards/rejected": -1.389721393585205, "step": 9200 }, { "epoch": 1.5851137146795313, "eval_logits/chosen": -2.4746241569519043, "eval_logits/rejected": -2.466055154800415, "eval_logps/chosen": -148.2314910888672, "eval_logps/rejected": -166.408935546875, "eval_loss": 0.6516256332397461, "eval_rewards/accuracies": 0.6101301312446594, "eval_rewards/chosen": -0.8921600580215454, "eval_rewards/margins": 0.14443333446979523, "eval_rewards/rejected": -1.0365933179855347, "eval_runtime": 382.9379, "eval_samples_per_second": 11.239, "eval_steps_per_second": 1.405, "step": 9200 }, { "epoch": 1.5868366643694003, "grad_norm": 39.60490036010742, "learning_rate": 5.3675572704207826e-08, "logits/chosen": -2.3610215187072754, "logits/rejected": -2.3336758613586426, "logps/chosen": -170.14991760253906, "logps/rejected": -187.44143676757812, "loss": 0.6331, "rewards/accuracies": 0.625, "rewards/chosen": -1.1286404132843018, "rewards/margins": 0.23823793232440948, "rewards/rejected": -1.3668782711029053, "step": 9210 }, { "epoch": 1.5885596140592695, "grad_norm": 27.61195182800293, "learning_rate": 5.3575594335387876e-08, "logits/chosen": -2.404003381729126, "logits/rejected": -2.3874430656433105, "logps/chosen": -159.68882751464844, "logps/rejected": -183.86068725585938, "loss": 0.6165, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.0844800472259521, "rewards/margins": 0.26251906156539917, "rewards/rejected": -1.346998929977417, "step": 9220 }, { "epoch": 1.5902825637491387, "grad_norm": 27.68160629272461, "learning_rate": 5.347560159481153e-08, "logits/chosen": -2.3483078479766846, "logits/rejected": -2.3337655067443848, "logps/chosen": -156.78604125976562, "logps/rejected": -187.99929809570312, "loss": 0.5975, "rewards/accuracies": 0.71875, "rewards/chosen": -1.0658495426177979, "rewards/margins": 0.28840649127960205, "rewards/rejected": -1.3542559146881104, "step": 9230 }, { "epoch": 1.5920055134390076, "grad_norm": 33.5504035949707, "learning_rate": 5.337559488438994e-08, "logits/chosen": -2.3742103576660156, "logits/rejected": -2.364173173904419, "logps/chosen": -171.8185272216797, "logps/rejected": -199.3497314453125, "loss": 0.6049, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.143182635307312, "rewards/margins": 0.3130360543727875, "rewards/rejected": -1.4562186002731323, "step": 9240 }, { "epoch": 1.5937284631288766, "grad_norm": 24.97473907470703, "learning_rate": 5.327557460609043e-08, "logits/chosen": -2.362847328186035, "logits/rejected": -2.339869499206543, "logps/chosen": -164.04771423339844, "logps/rejected": -186.96725463867188, "loss": 0.627, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.077368974685669, "rewards/margins": 0.2501301169395447, "rewards/rejected": -1.3274990320205688, "step": 9250 }, { "epoch": 1.5954514128187456, "grad_norm": 27.14133071899414, "learning_rate": 5.317554116193488e-08, "logits/chosen": -2.3598151206970215, "logits/rejected": -2.3444392681121826, "logps/chosen": -168.51766967773438, "logps/rejected": -191.16806030273438, "loss": 0.6361, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.1996451616287231, "rewards/margins": 0.23302045464515686, "rewards/rejected": -1.4326655864715576, "step": 9260 }, { "epoch": 1.5971743625086148, "grad_norm": 24.20354652404785, "learning_rate": 5.307549495399804e-08, "logits/chosen": -2.428321123123169, "logits/rejected": -2.4023356437683105, "logps/chosen": -171.56423950195312, "logps/rejected": -188.52224731445312, "loss": 0.6295, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.1535447835922241, "rewards/margins": 0.22041280567646027, "rewards/rejected": -1.373957633972168, "step": 9270 }, { "epoch": 1.598897312198484, "grad_norm": 37.25349807739258, "learning_rate": 5.2975436384406e-08, "logits/chosen": -2.421985387802124, "logits/rejected": -2.403449296951294, "logps/chosen": -157.58123779296875, "logps/rejected": -184.67398071289062, "loss": 0.6101, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.0259568691253662, "rewards/margins": 0.2917240858078003, "rewards/rejected": -1.3176809549331665, "step": 9280 }, { "epoch": 1.600620261888353, "grad_norm": 25.58997917175293, "learning_rate": 5.287536585533453e-08, "logits/chosen": -2.3553547859191895, "logits/rejected": -2.3330671787261963, "logps/chosen": -152.5822296142578, "logps/rejected": -170.2565155029297, "loss": 0.627, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.9716187715530396, "rewards/margins": 0.23068185150623322, "rewards/rejected": -1.2023006677627563, "step": 9290 }, { "epoch": 1.602343211578222, "grad_norm": 34.46986770629883, "learning_rate": 5.2775283769007464e-08, "logits/chosen": -2.402198314666748, "logits/rejected": -2.39088773727417, "logps/chosen": -156.928955078125, "logps/rejected": -186.67454528808594, "loss": 0.6059, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.019683599472046, "rewards/margins": 0.2989785075187683, "rewards/rejected": -1.318662166595459, "step": 9300 }, { "epoch": 1.6040661612680909, "grad_norm": 30.7072696685791, "learning_rate": 5.267519052769507e-08, "logits/chosen": -2.3956387042999268, "logits/rejected": -2.3802638053894043, "logps/chosen": -164.15245056152344, "logps/rejected": -180.8313446044922, "loss": 0.6306, "rewards/accuracies": 0.65625, "rewards/chosen": -1.0937596559524536, "rewards/margins": 0.21840448677539825, "rewards/rejected": -1.312164068222046, "step": 9310 }, { "epoch": 1.60578911095796, "grad_norm": 27.55194664001465, "learning_rate": 5.257508653371252e-08, "logits/chosen": -2.456815242767334, "logits/rejected": -2.4346182346343994, "logps/chosen": -155.98793029785156, "logps/rejected": -186.184326171875, "loss": 0.6075, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.0377084016799927, "rewards/margins": 0.29293209314346313, "rewards/rejected": -1.330640435218811, "step": 9320 }, { "epoch": 1.607512060647829, "grad_norm": 31.598167419433594, "learning_rate": 5.2474972189418096e-08, "logits/chosen": -2.4160571098327637, "logits/rejected": -2.3925347328186035, "logps/chosen": -164.6399688720703, "logps/rejected": -189.77328491210938, "loss": 0.6075, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.061995267868042, "rewards/margins": 0.2920532822608948, "rewards/rejected": -1.354048490524292, "step": 9330 }, { "epoch": 1.6092350103376982, "grad_norm": 25.802043914794922, "learning_rate": 5.237484789721178e-08, "logits/chosen": -2.3657262325286865, "logits/rejected": -2.349848747253418, "logps/chosen": -158.73101806640625, "logps/rejected": -189.5824737548828, "loss": 0.5898, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0642492771148682, "rewards/margins": 0.327562153339386, "rewards/rejected": -1.3918113708496094, "step": 9340 }, { "epoch": 1.6109579600275672, "grad_norm": 44.81790542602539, "learning_rate": 5.227471405953352e-08, "logits/chosen": -2.3814568519592285, "logits/rejected": -2.3522324562072754, "logps/chosen": -156.79432678222656, "logps/rejected": -183.58204650878906, "loss": 0.5998, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.0345754623413086, "rewards/margins": 0.29350045323371887, "rewards/rejected": -1.328075885772705, "step": 9350 }, { "epoch": 1.6126809097174362, "grad_norm": 29.344892501831055, "learning_rate": 5.217457107886159e-08, "logits/chosen": -2.438901901245117, "logits/rejected": -2.406013011932373, "logps/chosen": -173.286376953125, "logps/rejected": -199.4598846435547, "loss": 0.6025, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.1483434438705444, "rewards/margins": 0.32900819182395935, "rewards/rejected": -1.4773516654968262, "step": 9360 }, { "epoch": 1.6144038594073054, "grad_norm": 42.48180389404297, "learning_rate": 5.207441935771104e-08, "logits/chosen": -2.430676221847534, "logits/rejected": -2.3925089836120605, "logps/chosen": -171.08541870117188, "logps/rejected": -199.71249389648438, "loss": 0.5897, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.1605144739151, "rewards/margins": 0.3269973695278168, "rewards/rejected": -1.4875118732452393, "step": 9370 }, { "epoch": 1.6161268090971743, "grad_norm": 34.442657470703125, "learning_rate": 5.197425929863204e-08, "logits/chosen": -2.4080941677093506, "logits/rejected": -2.3933424949645996, "logps/chosen": -172.6584014892578, "logps/rejected": -193.78787231445312, "loss": 0.6481, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.1749662160873413, "rewards/margins": 0.22580060362815857, "rewards/rejected": -1.4007668495178223, "step": 9380 }, { "epoch": 1.6178497587870435, "grad_norm": 27.828460693359375, "learning_rate": 5.1874091304208314e-08, "logits/chosen": -2.3001291751861572, "logits/rejected": -2.2786030769348145, "logps/chosen": -168.0280303955078, "logps/rejected": -202.01181030273438, "loss": 0.5829, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.141455888748169, "rewards/margins": 0.340496689081192, "rewards/rejected": -1.481952428817749, "step": 9390 }, { "epoch": 1.6195727084769125, "grad_norm": 23.844345092773438, "learning_rate": 5.17739157770554e-08, "logits/chosen": -2.3625731468200684, "logits/rejected": -2.3439574241638184, "logps/chosen": -166.90896606445312, "logps/rejected": -188.41632080078125, "loss": 0.6401, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.112298607826233, "rewards/margins": 0.23041932284832, "rewards/rejected": -1.3427180051803589, "step": 9400 }, { "epoch": 1.6212956581667815, "grad_norm": 31.775917053222656, "learning_rate": 5.167373311981922e-08, "logits/chosen": -2.359849452972412, "logits/rejected": -2.3399579524993896, "logps/chosen": -170.50335693359375, "logps/rejected": -194.62030029296875, "loss": 0.6208, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.1581764221191406, "rewards/margins": 0.27720901370048523, "rewards/rejected": -1.4353853464126587, "step": 9410 }, { "epoch": 1.6230186078566504, "grad_norm": 60.274497985839844, "learning_rate": 5.157354373517425e-08, "logits/chosen": -2.380483388900757, "logits/rejected": -2.3661274909973145, "logps/chosen": -184.9801483154297, "logps/rejected": -195.0530242919922, "loss": 0.6833, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.2929103374481201, "rewards/margins": 0.14616604149341583, "rewards/rejected": -1.4390761852264404, "step": 9420 }, { "epoch": 1.6247415575465196, "grad_norm": 28.30097198486328, "learning_rate": 5.147334802582208e-08, "logits/chosen": -2.361516237258911, "logits/rejected": -2.341750383377075, "logps/chosen": -170.9657440185547, "logps/rejected": -186.64126586914062, "loss": 0.6554, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1667261123657227, "rewards/margins": 0.22141973674297333, "rewards/rejected": -1.3881456851959229, "step": 9430 }, { "epoch": 1.6264645072363888, "grad_norm": 39.51325225830078, "learning_rate": 5.1373146394489706e-08, "logits/chosen": -2.3611459732055664, "logits/rejected": -2.3512301445007324, "logps/chosen": -155.964599609375, "logps/rejected": -182.50051879882812, "loss": 0.6236, "rewards/accuracies": 0.59375, "rewards/chosen": -1.0606611967086792, "rewards/margins": 0.2499229609966278, "rewards/rejected": -1.310584306716919, "step": 9440 }, { "epoch": 1.6281874569262578, "grad_norm": 49.02311706542969, "learning_rate": 5.127293924392787e-08, "logits/chosen": -2.474292039871216, "logits/rejected": -2.460292100906372, "logps/chosen": -170.9194793701172, "logps/rejected": -184.9275360107422, "loss": 0.6621, "rewards/accuracies": 0.59375, "rewards/chosen": -1.168145775794983, "rewards/margins": 0.18423514068126678, "rewards/rejected": -1.3523808717727661, "step": 9450 }, { "epoch": 1.6299104066161267, "grad_norm": 28.96719741821289, "learning_rate": 5.117272697690961e-08, "logits/chosen": -2.386169672012329, "logits/rejected": -2.3760600090026855, "logps/chosen": -152.43272399902344, "logps/rejected": -196.67221069335938, "loss": 0.5304, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.9874264001846313, "rewards/margins": 0.4499265253543854, "rewards/rejected": -1.4373528957366943, "step": 9460 }, { "epoch": 1.6316333563059957, "grad_norm": 31.411161422729492, "learning_rate": 5.10725099962284e-08, "logits/chosen": -2.2730417251586914, "logits/rejected": -2.2473788261413574, "logps/chosen": -161.56280517578125, "logps/rejected": -181.9588623046875, "loss": 0.6425, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.0859378576278687, "rewards/margins": 0.2371862232685089, "rewards/rejected": -1.3231239318847656, "step": 9470 }, { "epoch": 1.633356305995865, "grad_norm": 30.01712417602539, "learning_rate": 5.0972288704696764e-08, "logits/chosen": -2.3781850337982178, "logits/rejected": -2.3473598957061768, "logps/chosen": -173.58242797851562, "logps/rejected": -196.97030639648438, "loss": 0.6137, "rewards/accuracies": 0.65625, "rewards/chosen": -1.188126802444458, "rewards/margins": 0.27098771929740906, "rewards/rejected": -1.4591143131256104, "step": 9480 }, { "epoch": 1.635079255685734, "grad_norm": 31.82417869567871, "learning_rate": 5.0872063505144494e-08, "logits/chosen": -2.3453164100646973, "logits/rejected": -2.3215858936309814, "logps/chosen": -176.57174682617188, "logps/rejected": -201.98179626464844, "loss": 0.6095, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.2095147371292114, "rewards/margins": 0.29533299803733826, "rewards/rejected": -1.504847764968872, "step": 9490 }, { "epoch": 1.636802205375603, "grad_norm": 29.027536392211914, "learning_rate": 5.077183480041711e-08, "logits/chosen": -2.3935375213623047, "logits/rejected": -2.378740072250366, "logps/chosen": -171.82821655273438, "logps/rejected": -199.50689697265625, "loss": 0.6035, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.169775366783142, "rewards/margins": 0.29034778475761414, "rewards/rejected": -1.4601233005523682, "step": 9500 }, { "epoch": 1.638525155065472, "grad_norm": 36.75428009033203, "learning_rate": 5.067160299337423e-08, "logits/chosen": -2.3084945678710938, "logits/rejected": -2.2897262573242188, "logps/chosen": -177.8446502685547, "logps/rejected": -214.07833862304688, "loss": 0.5932, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.2567390203475952, "rewards/margins": 0.35475316643714905, "rewards/rejected": -1.6114921569824219, "step": 9510 }, { "epoch": 1.640248104755341, "grad_norm": 28.859760284423828, "learning_rate": 5.0571368486887913e-08, "logits/chosen": -2.4560189247131348, "logits/rejected": -2.4452311992645264, "logps/chosen": -187.6722869873047, "logps/rejected": -218.9191436767578, "loss": 0.6331, "rewards/accuracies": 0.65625, "rewards/chosen": -1.3441131114959717, "rewards/margins": 0.2731940746307373, "rewards/rejected": -1.6173073053359985, "step": 9520 }, { "epoch": 1.6419710544452102, "grad_norm": 30.882644653320312, "learning_rate": 5.047113168384112e-08, "logits/chosen": -2.3941006660461426, "logits/rejected": -2.3643088340759277, "logps/chosen": -185.5674591064453, "logps/rejected": -214.42153930664062, "loss": 0.6052, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.311025619506836, "rewards/margins": 0.3395896553993225, "rewards/rejected": -1.6506150960922241, "step": 9530 }, { "epoch": 1.6436940041350794, "grad_norm": 26.167566299438477, "learning_rate": 5.037089298712597e-08, "logits/chosen": -2.3372859954833984, "logits/rejected": -2.311713933944702, "logps/chosen": -179.64578247070312, "logps/rejected": -211.289794921875, "loss": 0.6028, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.246484637260437, "rewards/margins": 0.34223586320877075, "rewards/rejected": -1.5887203216552734, "step": 9540 }, { "epoch": 1.6454169538249483, "grad_norm": 30.182079315185547, "learning_rate": 5.027065279964226e-08, "logits/chosen": -2.3960721492767334, "logits/rejected": -2.393296003341675, "logps/chosen": -173.44830322265625, "logps/rejected": -198.02462768554688, "loss": 0.6382, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.1929553747177124, "rewards/margins": 0.23101527988910675, "rewards/rejected": -1.4239708185195923, "step": 9550 }, { "epoch": 1.6471399035148173, "grad_norm": 45.042503356933594, "learning_rate": 5.017041152429572e-08, "logits/chosen": -2.4475998878479004, "logits/rejected": -2.438934803009033, "logps/chosen": -168.21823120117188, "logps/rejected": -186.36083984375, "loss": 0.6511, "rewards/accuracies": 0.625, "rewards/chosen": -1.1365259885787964, "rewards/margins": 0.20526358485221863, "rewards/rejected": -1.341789722442627, "step": 9560 }, { "epoch": 1.6488628532046863, "grad_norm": 23.81475257873535, "learning_rate": 5.00701695639965e-08, "logits/chosen": -2.3587894439697266, "logits/rejected": -2.3438403606414795, "logps/chosen": -161.5234832763672, "logps/rejected": -186.20321655273438, "loss": 0.6293, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.0926761627197266, "rewards/margins": 0.26515525579452515, "rewards/rejected": -1.3578314781188965, "step": 9570 }, { "epoch": 1.6505858028945555, "grad_norm": 33.94333267211914, "learning_rate": 4.99699273216575e-08, "logits/chosen": -2.4317760467529297, "logits/rejected": -2.414454460144043, "logps/chosen": -159.59573364257812, "logps/rejected": -178.28103637695312, "loss": 0.65, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.0440845489501953, "rewards/margins": 0.17422762513160706, "rewards/rejected": -1.2183122634887695, "step": 9580 }, { "epoch": 1.6523087525844247, "grad_norm": 24.0312442779541, "learning_rate": 4.986968520019272e-08, "logits/chosen": -2.5148632526397705, "logits/rejected": -2.495476245880127, "logps/chosen": -157.68704223632812, "logps/rejected": -177.65390014648438, "loss": 0.6358, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.0029971599578857, "rewards/margins": 0.21555516123771667, "rewards/rejected": -1.2185523509979248, "step": 9590 }, { "epoch": 1.6540317022742936, "grad_norm": 28.904621124267578, "learning_rate": 4.9769443602515724e-08, "logits/chosen": -2.390204668045044, "logits/rejected": -2.3624749183654785, "logps/chosen": -157.79043579101562, "logps/rejected": -180.45880126953125, "loss": 0.6211, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.0174524784088135, "rewards/margins": 0.2713993489742279, "rewards/rejected": -1.2888518571853638, "step": 9600 }, { "epoch": 1.6540317022742936, "eval_logits/chosen": -2.480398416519165, "eval_logits/rejected": -2.4724621772766113, "eval_logps/chosen": -137.76979064941406, "eval_logps/rejected": -155.23399353027344, "eval_loss": 0.6525858640670776, "eval_rewards/accuracies": 0.609433114528656, "eval_rewards/chosen": -0.7875431180000305, "eval_rewards/margins": 0.13730084896087646, "eval_rewards/rejected": -0.9248440265655518, "eval_runtime": 383.5461, "eval_samples_per_second": 11.222, "eval_steps_per_second": 1.403, "step": 9600 }, { "epoch": 1.6557546519641626, "grad_norm": 27.6368408203125, "learning_rate": 4.9669202931537895e-08, "logits/chosen": -2.393742799758911, "logits/rejected": -2.379835605621338, "logps/chosen": -147.69479370117188, "logps/rejected": -173.48165893554688, "loss": 0.5951, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.8961979746818542, "rewards/margins": 0.2986195981502533, "rewards/rejected": -1.1948175430297852, "step": 9610 }, { "epoch": 1.6574776016540316, "grad_norm": 27.73402214050293, "learning_rate": 4.956896359016698e-08, "logits/chosen": -2.472517251968384, "logits/rejected": -2.4543066024780273, "logps/chosen": -151.52615356445312, "logps/rejected": -170.35833740234375, "loss": 0.6451, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.9573991894721985, "rewards/margins": 0.1973901093006134, "rewards/rejected": -1.1547894477844238, "step": 9620 }, { "epoch": 1.6592005513439008, "grad_norm": 34.30169677734375, "learning_rate": 4.946872598130531e-08, "logits/chosen": -2.41727352142334, "logits/rejected": -2.3968358039855957, "logps/chosen": -156.9556884765625, "logps/rejected": -179.29806518554688, "loss": 0.6254, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.012807846069336, "rewards/margins": 0.2528161108493805, "rewards/rejected": -1.265623927116394, "step": 9630 }, { "epoch": 1.66092350103377, "grad_norm": 30.467981338500977, "learning_rate": 4.9368490507848285e-08, "logits/chosen": -2.4359209537506104, "logits/rejected": -2.4101357460021973, "logps/chosen": -153.869873046875, "logps/rejected": -172.2772979736328, "loss": 0.6071, "rewards/accuracies": 0.625, "rewards/chosen": -0.9647769927978516, "rewards/margins": 0.26178082823753357, "rewards/rejected": -1.2265578508377075, "step": 9640 }, { "epoch": 1.662646450723639, "grad_norm": 35.35158920288086, "learning_rate": 4.926825757268276e-08, "logits/chosen": -2.3688926696777344, "logits/rejected": -2.3474221229553223, "logps/chosen": -154.3858184814453, "logps/rejected": -171.7613067626953, "loss": 0.6398, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.9814737439155579, "rewards/margins": 0.1964212954044342, "rewards/rejected": -1.1778948307037354, "step": 9650 }, { "epoch": 1.664369400413508, "grad_norm": 28.72515296936035, "learning_rate": 4.916802757868529e-08, "logits/chosen": -2.364741802215576, "logits/rejected": -2.3553318977355957, "logps/chosen": -146.39309692382812, "logps/rejected": -177.4051513671875, "loss": 0.5995, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9462183713912964, "rewards/margins": 0.2969276010990143, "rewards/rejected": -1.2431461811065674, "step": 9660 }, { "epoch": 1.6660923501033769, "grad_norm": 26.201595306396484, "learning_rate": 4.906780092872069e-08, "logits/chosen": -2.45100474357605, "logits/rejected": -2.4262044429779053, "logps/chosen": -157.6940460205078, "logps/rejected": -181.42202758789062, "loss": 0.6079, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.9933892488479614, "rewards/margins": 0.29405349493026733, "rewards/rejected": -1.2874428033828735, "step": 9670 }, { "epoch": 1.667815299793246, "grad_norm": 24.7778263092041, "learning_rate": 4.89675780256403e-08, "logits/chosen": -2.3970322608947754, "logits/rejected": -2.3876750469207764, "logps/chosen": -158.8399658203125, "logps/rejected": -172.13357543945312, "loss": 0.6546, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.0322105884552002, "rewards/margins": 0.17332449555397034, "rewards/rejected": -1.2055351734161377, "step": 9680 }, { "epoch": 1.6695382494831152, "grad_norm": 38.30397033691406, "learning_rate": 4.886735927228044e-08, "logits/chosen": -2.33786940574646, "logits/rejected": -2.3223893642425537, "logps/chosen": -158.79647827148438, "logps/rejected": -175.82296752929688, "loss": 0.6322, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.023982286453247, "rewards/margins": 0.2030734121799469, "rewards/rejected": -1.227055549621582, "step": 9690 }, { "epoch": 1.6712611991729842, "grad_norm": 63.075931549072266, "learning_rate": 4.876714507146066e-08, "logits/chosen": -2.3727738857269287, "logits/rejected": -2.3485240936279297, "logps/chosen": -151.89422607421875, "logps/rejected": -170.76956176757812, "loss": 0.6475, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.9858682751655579, "rewards/margins": 0.21038445830345154, "rewards/rejected": -1.1962525844573975, "step": 9700 }, { "epoch": 1.6729841488628532, "grad_norm": 34.793338775634766, "learning_rate": 4.86669358259823e-08, "logits/chosen": -2.299872875213623, "logits/rejected": -2.292018413543701, "logps/chosen": -147.75247192382812, "logps/rejected": -167.35336303710938, "loss": 0.6316, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.9372814297676086, "rewards/margins": 0.22653648257255554, "rewards/rejected": -1.163818120956421, "step": 9710 }, { "epoch": 1.6747070985527222, "grad_norm": 28.445850372314453, "learning_rate": 4.856673193862677e-08, "logits/chosen": -2.4229607582092285, "logits/rejected": -2.409935235977173, "logps/chosen": -147.39321899414062, "logps/rejected": -161.4310302734375, "loss": 0.6444, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.923819899559021, "rewards/margins": 0.18175753951072693, "rewards/rejected": -1.1055775880813599, "step": 9720 }, { "epoch": 1.6764300482425913, "grad_norm": 25.407543182373047, "learning_rate": 4.846653381215391e-08, "logits/chosen": -2.4299368858337402, "logits/rejected": -2.4185292720794678, "logps/chosen": -141.1247100830078, "logps/rejected": -164.10366821289062, "loss": 0.6213, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.8609121441841125, "rewards/margins": 0.2429952174425125, "rewards/rejected": -1.1039073467254639, "step": 9730 }, { "epoch": 1.6781529979324605, "grad_norm": 27.56557273864746, "learning_rate": 4.836634184930043e-08, "logits/chosen": -2.4252548217773438, "logits/rejected": -2.410492420196533, "logps/chosen": -141.70298767089844, "logps/rejected": -158.42745971679688, "loss": 0.636, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.861044704914093, "rewards/margins": 0.1996491253376007, "rewards/rejected": -1.0606937408447266, "step": 9740 }, { "epoch": 1.6798759476223295, "grad_norm": 24.753618240356445, "learning_rate": 4.826615645277823e-08, "logits/chosen": -2.436265230178833, "logits/rejected": -2.3932459354400635, "logps/chosen": -149.7669219970703, "logps/rejected": -166.84420776367188, "loss": 0.6266, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.9381783604621887, "rewards/margins": 0.24333791434764862, "rewards/rejected": -1.181516170501709, "step": 9750 }, { "epoch": 1.6815988973121985, "grad_norm": 27.361852645874023, "learning_rate": 4.8165978025272865e-08, "logits/chosen": -2.4168701171875, "logits/rejected": -2.3895132541656494, "logps/chosen": -144.3466796875, "logps/rejected": -165.06533813476562, "loss": 0.6197, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.9014531970024109, "rewards/margins": 0.24613547325134277, "rewards/rejected": -1.1475884914398193, "step": 9760 }, { "epoch": 1.6833218470020674, "grad_norm": 27.26271629333496, "learning_rate": 4.806580696944186e-08, "logits/chosen": -2.3578128814697266, "logits/rejected": -2.338855266571045, "logps/chosen": -147.1383819580078, "logps/rejected": -170.07638549804688, "loss": 0.6291, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.9200402498245239, "rewards/margins": 0.24137751758098602, "rewards/rejected": -1.1614177227020264, "step": 9770 }, { "epoch": 1.6850447966919366, "grad_norm": 30.34505271911621, "learning_rate": 4.796564368791311e-08, "logits/chosen": -2.393284559249878, "logits/rejected": -2.350308895111084, "logps/chosen": -159.84327697753906, "logps/rejected": -179.5383758544922, "loss": 0.6137, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.9826564788818359, "rewards/margins": 0.3017580807209015, "rewards/rejected": -1.284414529800415, "step": 9780 }, { "epoch": 1.6867677463818056, "grad_norm": 26.540788650512695, "learning_rate": 4.786548858328325e-08, "logits/chosen": -2.403374195098877, "logits/rejected": -2.395364999771118, "logps/chosen": -151.01055908203125, "logps/rejected": -186.31092834472656, "loss": 0.5993, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.9662774801254272, "rewards/margins": 0.33149346709251404, "rewards/rejected": -1.2977708578109741, "step": 9790 }, { "epoch": 1.6884906960716748, "grad_norm": 32.02655029296875, "learning_rate": 4.7765342058116057e-08, "logits/chosen": -2.432471513748169, "logits/rejected": -2.403170347213745, "logps/chosen": -157.26419067382812, "logps/rejected": -178.321044921875, "loss": 0.6271, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.986743152141571, "rewards/margins": 0.26471954584121704, "rewards/rejected": -1.251462697982788, "step": 9800 }, { "epoch": 1.6902136457615438, "grad_norm": 26.325098037719727, "learning_rate": 4.766520451494082e-08, "logits/chosen": -2.39579176902771, "logits/rejected": -2.3640379905700684, "logps/chosen": -156.1612091064453, "logps/rejected": -179.22048950195312, "loss": 0.6284, "rewards/accuracies": 0.65625, "rewards/chosen": -0.9938696622848511, "rewards/margins": 0.27399882674217224, "rewards/rejected": -1.2678686380386353, "step": 9810 }, { "epoch": 1.6919365954514127, "grad_norm": 28.532209396362305, "learning_rate": 4.756507635625075e-08, "logits/chosen": -2.3948326110839844, "logits/rejected": -2.368168830871582, "logps/chosen": -151.87503051757812, "logps/rejected": -185.233154296875, "loss": 0.5835, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.9678211212158203, "rewards/margins": 0.35295677185058594, "rewards/rejected": -1.3207777738571167, "step": 9820 }, { "epoch": 1.693659545141282, "grad_norm": 25.736608505249023, "learning_rate": 4.7464957984501324e-08, "logits/chosen": -2.4210550785064697, "logits/rejected": -2.412477970123291, "logps/chosen": -163.955078125, "logps/rejected": -183.9481964111328, "loss": 0.6479, "rewards/accuracies": 0.53125, "rewards/chosen": -1.0461199283599854, "rewards/margins": 0.23021705448627472, "rewards/rejected": -1.2763371467590332, "step": 9830 }, { "epoch": 1.6953824948311509, "grad_norm": 36.30265426635742, "learning_rate": 4.736484980210865e-08, "logits/chosen": -2.382336139678955, "logits/rejected": -2.355818510055542, "logps/chosen": -155.4736328125, "logps/rejected": -182.4906463623047, "loss": 0.6175, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.9920843839645386, "rewards/margins": 0.30391108989715576, "rewards/rejected": -1.2959954738616943, "step": 9840 }, { "epoch": 1.69710544452102, "grad_norm": 26.407812118530273, "learning_rate": 4.726475221144791e-08, "logits/chosen": -2.3951416015625, "logits/rejected": -2.3838553428649902, "logps/chosen": -149.45062255859375, "logps/rejected": -173.43222045898438, "loss": 0.5941, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.930752158164978, "rewards/margins": 0.292227566242218, "rewards/rejected": -1.2229797840118408, "step": 9850 }, { "epoch": 1.698828394210889, "grad_norm": 32.601341247558594, "learning_rate": 4.7164665614851735e-08, "logits/chosen": -2.4249911308288574, "logits/rejected": -2.4174537658691406, "logps/chosen": -166.8317413330078, "logps/rejected": -176.670166015625, "loss": 0.675, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.0959227085113525, "rewards/margins": 0.1376759260892868, "rewards/rejected": -1.2335984706878662, "step": 9860 }, { "epoch": 1.700551343900758, "grad_norm": 25.09123992919922, "learning_rate": 4.706459041460853e-08, "logits/chosen": -2.398160934448242, "logits/rejected": -2.3736088275909424, "logps/chosen": -155.81951904296875, "logps/rejected": -177.24488830566406, "loss": 0.6168, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.021740198135376, "rewards/margins": 0.25436100363731384, "rewards/rejected": -1.2761012315750122, "step": 9870 }, { "epoch": 1.7022742935906272, "grad_norm": 28.56049346923828, "learning_rate": 4.69645270129609e-08, "logits/chosen": -2.3422672748565674, "logits/rejected": -2.3361213207244873, "logps/chosen": -157.16632080078125, "logps/rejected": -182.27499389648438, "loss": 0.6296, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.0430997610092163, "rewards/margins": 0.23366820812225342, "rewards/rejected": -1.2767678499221802, "step": 9880 }, { "epoch": 1.7039972432804962, "grad_norm": 27.852136611938477, "learning_rate": 4.686447581210404e-08, "logits/chosen": -2.3364086151123047, "logits/rejected": -2.3273282051086426, "logps/chosen": -157.48739624023438, "logps/rejected": -184.2283477783203, "loss": 0.592, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.9968830347061157, "rewards/margins": 0.3168890178203583, "rewards/rejected": -1.313772201538086, "step": 9890 }, { "epoch": 1.7057201929703654, "grad_norm": 22.739540100097656, "learning_rate": 4.676443721418408e-08, "logits/chosen": -2.3864567279815674, "logits/rejected": -2.3565258979797363, "logps/chosen": -149.53790283203125, "logps/rejected": -188.0727081298828, "loss": 0.553, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.9652314186096191, "rewards/margins": 0.42683133482933044, "rewards/rejected": -1.3920629024505615, "step": 9900 }, { "epoch": 1.7074431426602343, "grad_norm": 28.658220291137695, "learning_rate": 4.666441162129653e-08, "logits/chosen": -2.425554037094116, "logits/rejected": -2.3846659660339355, "logps/chosen": -166.473876953125, "logps/rejected": -183.8915252685547, "loss": 0.6209, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.0830899477005005, "rewards/margins": 0.25999629497528076, "rewards/rejected": -1.3430863618850708, "step": 9910 }, { "epoch": 1.7091660923501033, "grad_norm": 37.005409240722656, "learning_rate": 4.6564399435484616e-08, "logits/chosen": -2.4445948600769043, "logits/rejected": -2.421433448791504, "logps/chosen": -164.7544403076172, "logps/rejected": -185.24612426757812, "loss": 0.6192, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1069726943969727, "rewards/margins": 0.2556740641593933, "rewards/rejected": -1.3626466989517212, "step": 9920 }, { "epoch": 1.7108890420399723, "grad_norm": 29.145448684692383, "learning_rate": 4.646440105873764e-08, "logits/chosen": -2.3813118934631348, "logits/rejected": -2.378369092941284, "logps/chosen": -158.937744140625, "logps/rejected": -188.14231872558594, "loss": 0.6131, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.056016206741333, "rewards/margins": 0.28049999475479126, "rewards/rejected": -1.3365163803100586, "step": 9930 }, { "epoch": 1.7126119917298415, "grad_norm": 24.187950134277344, "learning_rate": 4.636441689298945e-08, "logits/chosen": -2.4301557540893555, "logits/rejected": -2.422311305999756, "logps/chosen": -161.22079467773438, "logps/rejected": -188.21884155273438, "loss": 0.6328, "rewards/accuracies": 0.65625, "rewards/chosen": -1.0536234378814697, "rewards/margins": 0.22920770943164825, "rewards/rejected": -1.282831072807312, "step": 9940 }, { "epoch": 1.7143349414197107, "grad_norm": 41.217735290527344, "learning_rate": 4.626444734011674e-08, "logits/chosen": -2.3997209072113037, "logits/rejected": -2.367011308670044, "logps/chosen": -166.6905059814453, "logps/rejected": -196.6645965576172, "loss": 0.5842, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1190987825393677, "rewards/margins": 0.3476200997829437, "rewards/rejected": -1.4667189121246338, "step": 9950 }, { "epoch": 1.7160578911095796, "grad_norm": 40.069129943847656, "learning_rate": 4.6164492801937516e-08, "logits/chosen": -2.428872585296631, "logits/rejected": -2.413508892059326, "logps/chosen": -171.06387329101562, "logps/rejected": -193.9467010498047, "loss": 0.6359, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.1686700582504272, "rewards/margins": 0.2308274209499359, "rewards/rejected": -1.399497628211975, "step": 9960 }, { "epoch": 1.7177808407994486, "grad_norm": 53.755035400390625, "learning_rate": 4.606455368020934e-08, "logits/chosen": -2.419379711151123, "logits/rejected": -2.4056499004364014, "logps/chosen": -164.5887451171875, "logps/rejected": -188.0942840576172, "loss": 0.6267, "rewards/accuracies": 0.625, "rewards/chosen": -1.1215044260025024, "rewards/margins": 0.2510332465171814, "rewards/rejected": -1.3725377321243286, "step": 9970 }, { "epoch": 1.7195037904893176, "grad_norm": 36.47673416137695, "learning_rate": 4.59646303766279e-08, "logits/chosen": -2.3453407287597656, "logits/rejected": -2.328829050064087, "logps/chosen": -163.23776245117188, "logps/rejected": -195.60076904296875, "loss": 0.5966, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.1016603708267212, "rewards/margins": 0.3368353545665741, "rewards/rejected": -1.4384956359863281, "step": 9980 }, { "epoch": 1.7212267401791868, "grad_norm": 39.14576721191406, "learning_rate": 4.586472329282529e-08, "logits/chosen": -2.4195375442504883, "logits/rejected": -2.3854496479034424, "logps/chosen": -161.21798706054688, "logps/rejected": -180.47915649414062, "loss": 0.6193, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.0634045600891113, "rewards/margins": 0.2559213936328888, "rewards/rejected": -1.3193260431289673, "step": 9990 }, { "epoch": 1.722949689869056, "grad_norm": 30.87076759338379, "learning_rate": 4.576483283036835e-08, "logits/chosen": -2.41550350189209, "logits/rejected": -2.3969714641571045, "logps/chosen": -161.6389923095703, "logps/rejected": -189.10501098632812, "loss": 0.6011, "rewards/accuracies": 0.65625, "rewards/chosen": -1.047298789024353, "rewards/margins": 0.2876473367214203, "rewards/rejected": -1.3349462747573853, "step": 10000 }, { "epoch": 1.722949689869056, "eval_logits/chosen": -2.4489173889160156, "eval_logits/rejected": -2.4395930767059326, "eval_logps/chosen": -148.1359405517578, "eval_logps/rejected": -166.54095458984375, "eval_loss": 0.6517484784126282, "eval_rewards/accuracies": 0.609897792339325, "eval_rewards/chosen": -0.8912045955657959, "eval_rewards/margins": 0.14670883119106293, "eval_rewards/rejected": -1.03791344165802, "eval_runtime": 383.0999, "eval_samples_per_second": 11.235, "eval_steps_per_second": 1.404, "step": 10000 }, { "epoch": 1.724672639558925, "grad_norm": 26.357059478759766, "learning_rate": 4.566495939075722e-08, "logits/chosen": -2.4122986793518066, "logits/rejected": -2.391216278076172, "logps/chosen": -158.59512329101562, "logps/rejected": -191.3281707763672, "loss": 0.5861, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.0556198358535767, "rewards/margins": 0.33377084136009216, "rewards/rejected": -1.3893907070159912, "step": 10010 }, { "epoch": 1.7263955892487939, "grad_norm": 40.59810256958008, "learning_rate": 4.5565103375423466e-08, "logits/chosen": -2.356652021408081, "logits/rejected": -2.3264429569244385, "logps/chosen": -166.05789184570312, "logps/rejected": -190.78692626953125, "loss": 0.6051, "rewards/accuracies": 0.71875, "rewards/chosen": -1.1111173629760742, "rewards/margins": 0.2901898920536041, "rewards/rejected": -1.4013073444366455, "step": 10020 }, { "epoch": 1.7281185389386629, "grad_norm": 35.417301177978516, "learning_rate": 4.546526518572878e-08, "logits/chosen": -2.358099937438965, "logits/rejected": -2.3320469856262207, "logps/chosen": -171.77035522460938, "logps/rejected": -183.01419067382812, "loss": 0.655, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.1348702907562256, "rewards/margins": 0.18133307993412018, "rewards/rejected": -1.3162034749984741, "step": 10030 }, { "epoch": 1.729841488628532, "grad_norm": 24.3604736328125, "learning_rate": 4.5365445222963096e-08, "logits/chosen": -2.4755070209503174, "logits/rejected": -2.458996534347534, "logps/chosen": -166.288330078125, "logps/rejected": -194.246337890625, "loss": 0.6074, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.1001951694488525, "rewards/margins": 0.30437546968460083, "rewards/rejected": -1.4045706987380981, "step": 10040 }, { "epoch": 1.7315644383184012, "grad_norm": 30.295141220092773, "learning_rate": 4.5265643888343146e-08, "logits/chosen": -2.391026735305786, "logits/rejected": -2.39094614982605, "logps/chosen": -164.63150024414062, "logps/rejected": -180.1056671142578, "loss": 0.6687, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.0971840620040894, "rewards/margins": 0.1522771418094635, "rewards/rejected": -1.2494614124298096, "step": 10050 }, { "epoch": 1.7332873880082702, "grad_norm": 40.7687873840332, "learning_rate": 4.516586158301074e-08, "logits/chosen": -2.3803515434265137, "logits/rejected": -2.3743226528167725, "logps/chosen": -152.97393798828125, "logps/rejected": -176.10609436035156, "loss": 0.6433, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.0329920053482056, "rewards/margins": 0.20597794651985168, "rewards/rejected": -1.2389700412750244, "step": 10060 }, { "epoch": 1.7350103376981392, "grad_norm": 26.10736846923828, "learning_rate": 4.506609870803122e-08, "logits/chosen": -2.3440957069396973, "logits/rejected": -2.3347837924957275, "logps/chosen": -144.05136108398438, "logps/rejected": -176.00039672851562, "loss": 0.5923, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.9066449403762817, "rewards/margins": 0.3289529085159302, "rewards/rejected": -1.235597848892212, "step": 10070 }, { "epoch": 1.7367332873880081, "grad_norm": 38.685455322265625, "learning_rate": 4.4966355664391856e-08, "logits/chosen": -2.416966676712036, "logits/rejected": -2.3993144035339355, "logps/chosen": -150.76454162597656, "logps/rejected": -170.0929718017578, "loss": 0.6309, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.9308739900588989, "rewards/margins": 0.2240399420261383, "rewards/rejected": -1.1549139022827148, "step": 10080 }, { "epoch": 1.7384562370778773, "grad_norm": 29.607315063476562, "learning_rate": 4.486663285300019e-08, "logits/chosen": -2.469937324523926, "logits/rejected": -2.447589635848999, "logps/chosen": -142.93031311035156, "logps/rejected": -175.0164031982422, "loss": 0.5962, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8724008798599243, "rewards/margins": 0.3072863221168518, "rewards/rejected": -1.179687261581421, "step": 10090 }, { "epoch": 1.7401791867677465, "grad_norm": 34.174522399902344, "learning_rate": 4.4766930674682446e-08, "logits/chosen": -2.4112601280212402, "logits/rejected": -2.3972017765045166, "logps/chosen": -152.28091430664062, "logps/rejected": -177.61593627929688, "loss": 0.6097, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9579811096191406, "rewards/margins": 0.28297972679138184, "rewards/rejected": -1.2409610748291016, "step": 10100 }, { "epoch": 1.7419021364576155, "grad_norm": 30.270769119262695, "learning_rate": 4.4667249530181866e-08, "logits/chosen": -2.4306607246398926, "logits/rejected": -2.427654981613159, "logps/chosen": -157.984130859375, "logps/rejected": -193.0686798095703, "loss": 0.5951, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.0728533267974854, "rewards/margins": 0.32097524404525757, "rewards/rejected": -1.3938283920288086, "step": 10110 }, { "epoch": 1.7436250861474845, "grad_norm": 34.1330451965332, "learning_rate": 4.456758982015724e-08, "logits/chosen": -2.393573045730591, "logits/rejected": -2.3679986000061035, "logps/chosen": -168.1916961669922, "logps/rejected": -188.971923828125, "loss": 0.6195, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.1371654272079468, "rewards/margins": 0.2622528374195099, "rewards/rejected": -1.3994182348251343, "step": 10120 }, { "epoch": 1.7453480358373534, "grad_norm": 31.97394371032715, "learning_rate": 4.446795194518113e-08, "logits/chosen": -2.4118752479553223, "logits/rejected": -2.3918135166168213, "logps/chosen": -158.6249542236328, "logps/rejected": -191.27536010742188, "loss": 0.5906, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.039110541343689, "rewards/margins": 0.34039172530174255, "rewards/rejected": -1.379502296447754, "step": 10130 }, { "epoch": 1.7470709855272226, "grad_norm": 31.597654342651367, "learning_rate": 4.436833630573837e-08, "logits/chosen": -2.389570951461792, "logits/rejected": -2.3521625995635986, "logps/chosen": -176.93966674804688, "logps/rejected": -198.84036254882812, "loss": 0.6193, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.1528842449188232, "rewards/margins": 0.29966822266578674, "rewards/rejected": -1.4525524377822876, "step": 10140 }, { "epoch": 1.7487939352170918, "grad_norm": 29.19540786743164, "learning_rate": 4.4268743302224405e-08, "logits/chosen": -2.3549156188964844, "logits/rejected": -2.3334720134735107, "logps/chosen": -168.19863891601562, "logps/rejected": -202.30337524414062, "loss": 0.6053, "rewards/accuracies": 0.625, "rewards/chosen": -1.1256818771362305, "rewards/margins": 0.34564125537872314, "rewards/rejected": -1.471323013305664, "step": 10150 }, { "epoch": 1.7505168849069608, "grad_norm": 29.48337173461914, "learning_rate": 4.416917333494369e-08, "logits/chosen": -2.362104892730713, "logits/rejected": -2.3388655185699463, "logps/chosen": -164.33436584472656, "logps/rejected": -200.4606475830078, "loss": 0.5722, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.1089446544647217, "rewards/margins": 0.3804648816585541, "rewards/rejected": -1.4894095659255981, "step": 10160 }, { "epoch": 1.7522398345968297, "grad_norm": 29.898658752441406, "learning_rate": 4.406962680410812e-08, "logits/chosen": -2.338603973388672, "logits/rejected": -2.3241982460021973, "logps/chosen": -176.6895294189453, "logps/rejected": -209.5714569091797, "loss": 0.5924, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.2172462940216064, "rewards/margins": 0.34438905119895935, "rewards/rejected": -1.5616354942321777, "step": 10170 }, { "epoch": 1.7539627842866987, "grad_norm": 25.77763557434082, "learning_rate": 4.3970104109835374e-08, "logits/chosen": -2.3041019439697266, "logits/rejected": -2.2791824340820312, "logps/chosen": -179.14761352539062, "logps/rejected": -217.32827758789062, "loss": 0.5787, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.2611709833145142, "rewards/margins": 0.3846471607685089, "rewards/rejected": -1.6458181142807007, "step": 10180 }, { "epoch": 1.755685733976568, "grad_norm": 31.118616104125977, "learning_rate": 4.387060565214732e-08, "logits/chosen": -2.3026082515716553, "logits/rejected": -2.278801202774048, "logps/chosen": -174.61709594726562, "logps/rejected": -214.9127960205078, "loss": 0.5687, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.2221773862838745, "rewards/margins": 0.4176904261112213, "rewards/rejected": -1.6398680210113525, "step": 10190 }, { "epoch": 1.757408683666437, "grad_norm": 30.557296752929688, "learning_rate": 4.3771131830968386e-08, "logits/chosen": -2.3609557151794434, "logits/rejected": -2.3385212421417236, "logps/chosen": -182.12051391601562, "logps/rejected": -215.4132537841797, "loss": 0.5951, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.2877042293548584, "rewards/margins": 0.3666505813598633, "rewards/rejected": -1.6543548107147217, "step": 10200 }, { "epoch": 1.759131633356306, "grad_norm": 31.58578109741211, "learning_rate": 4.367168304612399e-08, "logits/chosen": -2.343324899673462, "logits/rejected": -2.330151081085205, "logps/chosen": -192.5848846435547, "logps/rejected": -229.7525177001953, "loss": 0.5968, "rewards/accuracies": 0.65625, "rewards/chosen": -1.3690006732940674, "rewards/margins": 0.37162265181541443, "rewards/rejected": -1.7406232357025146, "step": 10210 }, { "epoch": 1.760854583046175, "grad_norm": 30.243844985961914, "learning_rate": 4.3572259697338966e-08, "logits/chosen": -2.3129944801330566, "logits/rejected": -2.296821117401123, "logps/chosen": -175.6407928466797, "logps/rejected": -207.247314453125, "loss": 0.6053, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.261752724647522, "rewards/margins": 0.308516263961792, "rewards/rejected": -1.5702688694000244, "step": 10220 }, { "epoch": 1.762577532736044, "grad_norm": 33.17268753051758, "learning_rate": 4.347286218423585e-08, "logits/chosen": -2.2974283695220947, "logits/rejected": -2.2750775814056396, "logps/chosen": -179.20205688476562, "logps/rejected": -197.1276092529297, "loss": 0.6486, "rewards/accuracies": 0.625, "rewards/chosen": -1.247676134109497, "rewards/margins": 0.21209442615509033, "rewards/rejected": -1.4597707986831665, "step": 10230 }, { "epoch": 1.7643004824259132, "grad_norm": 31.049867630004883, "learning_rate": 4.337349090633335e-08, "logits/chosen": -2.3183791637420654, "logits/rejected": -2.2915894985198975, "logps/chosen": -174.53001403808594, "logps/rejected": -216.4149627685547, "loss": 0.571, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.2244672775268555, "rewards/margins": 0.42240673303604126, "rewards/rejected": -1.6468738317489624, "step": 10240 }, { "epoch": 1.7660234321157822, "grad_norm": 33.40523910522461, "learning_rate": 4.327414626304473e-08, "logits/chosen": -2.3828606605529785, "logits/rejected": -2.345085859298706, "logps/chosen": -175.66213989257812, "logps/rejected": -203.55291748046875, "loss": 0.5824, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1799122095108032, "rewards/margins": 0.3656123876571655, "rewards/rejected": -1.5455245971679688, "step": 10250 }, { "epoch": 1.7677463818056514, "grad_norm": 28.497297286987305, "learning_rate": 4.317482865367619e-08, "logits/chosen": -2.3601796627044678, "logits/rejected": -2.3613171577453613, "logps/chosen": -165.05870056152344, "logps/rejected": -197.8215789794922, "loss": 0.602, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.109321117401123, "rewards/margins": 0.30615848302841187, "rewards/rejected": -1.4154794216156006, "step": 10260 }, { "epoch": 1.7694693314955203, "grad_norm": 37.638763427734375, "learning_rate": 4.3075538477425296e-08, "logits/chosen": -2.334505796432495, "logits/rejected": -2.318866491317749, "logps/chosen": -172.26190185546875, "logps/rejected": -201.54942321777344, "loss": 0.6051, "rewards/accuracies": 0.625, "rewards/chosen": -1.1580668687820435, "rewards/margins": 0.30520960688591003, "rewards/rejected": -1.4632765054702759, "step": 10270 }, { "epoch": 1.7711922811853893, "grad_norm": 31.05406379699707, "learning_rate": 4.2976276133379336e-08, "logits/chosen": -2.3457462787628174, "logits/rejected": -2.3291707038879395, "logps/chosen": -174.33682250976562, "logps/rejected": -188.51695251464844, "loss": 0.664, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.192948579788208, "rewards/margins": 0.16929271817207336, "rewards/rejected": -1.362241506576538, "step": 10280 }, { "epoch": 1.7729152308752585, "grad_norm": 32.82965850830078, "learning_rate": 4.2877042020513696e-08, "logits/chosen": -2.317387580871582, "logits/rejected": -2.296901226043701, "logps/chosen": -158.64199829101562, "logps/rejected": -196.9527130126953, "loss": 0.6009, "rewards/accuracies": 0.65625, "rewards/chosen": -1.0650023221969604, "rewards/margins": 0.3558632731437683, "rewards/rejected": -1.420865774154663, "step": 10290 }, { "epoch": 1.7746381805651275, "grad_norm": 44.43830871582031, "learning_rate": 4.2777836537690336e-08, "logits/chosen": -2.372666358947754, "logits/rejected": -2.3581936359405518, "logps/chosen": -182.65878295898438, "logps/rejected": -198.89340209960938, "loss": 0.6712, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.2641236782073975, "rewards/margins": 0.18516698479652405, "rewards/rejected": -1.4492907524108887, "step": 10300 }, { "epoch": 1.7763611302549966, "grad_norm": 27.348831176757812, "learning_rate": 4.26786600836561e-08, "logits/chosen": -2.3039889335632324, "logits/rejected": -2.282872200012207, "logps/chosen": -179.7623291015625, "logps/rejected": -204.12564086914062, "loss": 0.5981, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.22052001953125, "rewards/margins": 0.3195902407169342, "rewards/rejected": -1.5401101112365723, "step": 10310 }, { "epoch": 1.7780840799448656, "grad_norm": 29.10089683532715, "learning_rate": 4.2579513057041225e-08, "logits/chosen": -2.3530561923980713, "logits/rejected": -2.3259224891662598, "logps/chosen": -178.1007843017578, "logps/rejected": -199.89125061035156, "loss": 0.6372, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.1998882293701172, "rewards/margins": 0.2680136561393738, "rewards/rejected": -1.4679019451141357, "step": 10320 }, { "epoch": 1.7798070296347346, "grad_norm": 39.05019760131836, "learning_rate": 4.248039585635756e-08, "logits/chosen": -2.3624677658081055, "logits/rejected": -2.341622829437256, "logps/chosen": -167.41891479492188, "logps/rejected": -196.73715209960938, "loss": 0.604, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.1432722806930542, "rewards/margins": 0.2914575934410095, "rewards/rejected": -1.434729814529419, "step": 10330 }, { "epoch": 1.7815299793246038, "grad_norm": 38.29509353637695, "learning_rate": 4.238130887999716e-08, "logits/chosen": -2.4009697437286377, "logits/rejected": -2.379375457763672, "logps/chosen": -160.23373413085938, "logps/rejected": -186.0135955810547, "loss": 0.6137, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.0323786735534668, "rewards/margins": 0.2908129394054413, "rewards/rejected": -1.3231916427612305, "step": 10340 }, { "epoch": 1.7832529290144727, "grad_norm": 38.13880157470703, "learning_rate": 4.228225252623055e-08, "logits/chosen": -2.432330846786499, "logits/rejected": -2.408937692642212, "logps/chosen": -158.62673950195312, "logps/rejected": -179.88282775878906, "loss": 0.621, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.0324501991271973, "rewards/margins": 0.2672559320926666, "rewards/rejected": -1.299706220626831, "step": 10350 }, { "epoch": 1.784975878704342, "grad_norm": 23.965024948120117, "learning_rate": 4.218322719320519e-08, "logits/chosen": -2.365845203399658, "logits/rejected": -2.339709758758545, "logps/chosen": -159.224853515625, "logps/rejected": -174.7471466064453, "loss": 0.6421, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.0221807956695557, "rewards/margins": 0.20925001800060272, "rewards/rejected": -1.2314307689666748, "step": 10360 }, { "epoch": 1.786698828394211, "grad_norm": 34.28385543823242, "learning_rate": 4.208423327894387e-08, "logits/chosen": -2.2426235675811768, "logits/rejected": -2.216484546661377, "logps/chosen": -153.36285400390625, "logps/rejected": -181.20248413085938, "loss": 0.5993, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.9984307289123535, "rewards/margins": 0.29507023096084595, "rewards/rejected": -1.2935011386871338, "step": 10370 }, { "epoch": 1.7884217780840799, "grad_norm": 33.906288146972656, "learning_rate": 4.1985271181343056e-08, "logits/chosen": -2.350764751434326, "logits/rejected": -2.337337017059326, "logps/chosen": -163.58370971679688, "logps/rejected": -175.857421875, "loss": 0.689, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.1080776453018188, "rewards/margins": 0.13066032528877258, "rewards/rejected": -1.2387378215789795, "step": 10380 }, { "epoch": 1.7901447277739488, "grad_norm": 27.24697494506836, "learning_rate": 4.188634129817135e-08, "logits/chosen": -2.3855433464050293, "logits/rejected": -2.3613743782043457, "logps/chosen": -153.98219299316406, "logps/rejected": -177.3594512939453, "loss": 0.6202, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.9690343141555786, "rewards/margins": 0.27303311228752136, "rewards/rejected": -1.2420674562454224, "step": 10390 }, { "epoch": 1.791867677463818, "grad_norm": 27.075759887695312, "learning_rate": 4.178744402706788e-08, "logits/chosen": -2.415712833404541, "logits/rejected": -2.400829792022705, "logps/chosen": -151.54632568359375, "logps/rejected": -190.5732879638672, "loss": 0.571, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.9909058809280396, "rewards/margins": 0.3732060194015503, "rewards/rejected": -1.3641119003295898, "step": 10400 }, { "epoch": 1.791867677463818, "eval_logits/chosen": -2.4488589763641357, "eval_logits/rejected": -2.4401326179504395, "eval_logps/chosen": -141.355712890625, "eval_logps/rejected": -159.2781982421875, "eval_loss": 0.651395857334137, "eval_rewards/accuracies": 0.6122211813926697, "eval_rewards/chosen": -0.8234025239944458, "eval_rewards/margins": 0.14188359677791595, "eval_rewards/rejected": -0.9652861952781677, "eval_runtime": 383.1182, "eval_samples_per_second": 11.234, "eval_steps_per_second": 1.404, "step": 10400 }, { "epoch": 1.7935906271536872, "grad_norm": 30.668682098388672, "learning_rate": 4.168857976554067e-08, "logits/chosen": -2.3561768531799316, "logits/rejected": -2.3268985748291016, "logps/chosen": -160.31906127929688, "logps/rejected": -179.61732482910156, "loss": 0.623, "rewards/accuracies": 0.65625, "rewards/chosen": -1.0286571979522705, "rewards/margins": 0.2432531863451004, "rewards/rejected": -1.2719104290008545, "step": 10410 }, { "epoch": 1.7953135768435562, "grad_norm": 26.499998092651367, "learning_rate": 4.1589748910965104e-08, "logits/chosen": -2.3826560974121094, "logits/rejected": -2.3583431243896484, "logps/chosen": -157.2075958251953, "logps/rejected": -185.43310546875, "loss": 0.6181, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.0241360664367676, "rewards/margins": 0.2945292592048645, "rewards/rejected": -1.3186652660369873, "step": 10420 }, { "epoch": 1.7970365265334252, "grad_norm": 24.107423782348633, "learning_rate": 4.1490951860582243e-08, "logits/chosen": -2.4138405323028564, "logits/rejected": -2.393023729324341, "logps/chosen": -157.1988525390625, "logps/rejected": -177.00567626953125, "loss": 0.6355, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.0148719549179077, "rewards/margins": 0.23055486381053925, "rewards/rejected": -1.2454270124435425, "step": 10430 }, { "epoch": 1.7987594762232941, "grad_norm": 29.232955932617188, "learning_rate": 4.139218901149731e-08, "logits/chosen": -2.4301648139953613, "logits/rejected": -2.4212965965270996, "logps/chosen": -172.70167541503906, "logps/rejected": -184.5668487548828, "loss": 0.6641, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.1334761381149292, "rewards/margins": 0.1488693505525589, "rewards/rejected": -1.2823455333709717, "step": 10440 }, { "epoch": 1.8004824259131633, "grad_norm": 31.3802490234375, "learning_rate": 4.129346076067802e-08, "logits/chosen": -2.4010443687438965, "logits/rejected": -2.389110565185547, "logps/chosen": -158.57315063476562, "logps/rejected": -193.9378662109375, "loss": 0.579, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.03981614112854, "rewards/margins": 0.3365381360054016, "rewards/rejected": -1.3763542175292969, "step": 10450 }, { "epoch": 1.8022053756030325, "grad_norm": 34.95161819458008, "learning_rate": 4.119476750495312e-08, "logits/chosen": -2.385775327682495, "logits/rejected": -2.356813907623291, "logps/chosen": -162.02548217773438, "logps/rejected": -187.3418426513672, "loss": 0.6009, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.0644725561141968, "rewards/margins": 0.28172314167022705, "rewards/rejected": -1.3461956977844238, "step": 10460 }, { "epoch": 1.8039283252929015, "grad_norm": 29.766733169555664, "learning_rate": 4.109610964101054e-08, "logits/chosen": -2.2834041118621826, "logits/rejected": -2.2619574069976807, "logps/chosen": -164.19271850585938, "logps/rejected": -191.8291015625, "loss": 0.6049, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.1161973476409912, "rewards/margins": 0.304057240486145, "rewards/rejected": -1.4202544689178467, "step": 10470 }, { "epoch": 1.8056512749827704, "grad_norm": 35.490474700927734, "learning_rate": 4.099748756539609e-08, "logits/chosen": -2.372213840484619, "logits/rejected": -2.3380684852600098, "logps/chosen": -171.201904296875, "logps/rejected": -206.79092407226562, "loss": 0.5709, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.1381561756134033, "rewards/margins": 0.41463032364845276, "rewards/rejected": -1.5527865886688232, "step": 10480 }, { "epoch": 1.8073742246726394, "grad_norm": 26.32746124267578, "learning_rate": 4.089890167451169e-08, "logits/chosen": -2.3567185401916504, "logits/rejected": -2.3334903717041016, "logps/chosen": -167.50057983398438, "logps/rejected": -192.45741271972656, "loss": 0.6156, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.1219137907028198, "rewards/margins": 0.2818216383457184, "rewards/rejected": -1.4037355184555054, "step": 10490 }, { "epoch": 1.8090971743625086, "grad_norm": 35.64101791381836, "learning_rate": 4.08003523646138e-08, "logits/chosen": -2.3531155586242676, "logits/rejected": -2.327070713043213, "logps/chosen": -172.28802490234375, "logps/rejected": -210.2729949951172, "loss": 0.5829, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.1850135326385498, "rewards/margins": 0.37663546204566956, "rewards/rejected": -1.5616488456726074, "step": 10500 }, { "epoch": 1.8108201240523778, "grad_norm": 39.186458587646484, "learning_rate": 4.070184003181189e-08, "logits/chosen": -2.3595175743103027, "logits/rejected": -2.326897144317627, "logps/chosen": -179.2899627685547, "logps/rejected": -208.84072875976562, "loss": 0.5982, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.231400728225708, "rewards/margins": 0.3356294631958008, "rewards/rejected": -1.5670301914215088, "step": 10510 }, { "epoch": 1.8125430737422468, "grad_norm": 42.04875183105469, "learning_rate": 4.060336507206673e-08, "logits/chosen": -2.378934383392334, "logits/rejected": -2.367832899093628, "logps/chosen": -178.14813232421875, "logps/rejected": -213.73025512695312, "loss": 0.6138, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.2611725330352783, "rewards/margins": 0.3409961760044098, "rewards/rejected": -1.6021686792373657, "step": 10520 }, { "epoch": 1.8142660234321157, "grad_norm": 38.262855529785156, "learning_rate": 4.0504927881188946e-08, "logits/chosen": -2.319676399230957, "logits/rejected": -2.29738187789917, "logps/chosen": -180.23928833007812, "logps/rejected": -201.06912231445312, "loss": 0.6395, "rewards/accuracies": 0.625, "rewards/chosen": -1.2533966302871704, "rewards/margins": 0.24199731647968292, "rewards/rejected": -1.4953938722610474, "step": 10530 }, { "epoch": 1.8159889731219847, "grad_norm": 41.87706756591797, "learning_rate": 4.040652885483733e-08, "logits/chosen": -2.2700414657592773, "logits/rejected": -2.2463417053222656, "logps/chosen": -171.6498565673828, "logps/rejected": -195.3922882080078, "loss": 0.6154, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.1847819089889526, "rewards/margins": 0.2865777015686035, "rewards/rejected": -1.4713597297668457, "step": 10540 }, { "epoch": 1.817711922811854, "grad_norm": 46.120147705078125, "learning_rate": 4.0308168388517284e-08, "logits/chosen": -2.4297916889190674, "logits/rejected": -2.4190146923065186, "logps/chosen": -179.94268798828125, "logps/rejected": -208.4828338623047, "loss": 0.6259, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.2590245008468628, "rewards/margins": 0.27799472212791443, "rewards/rejected": -1.5370192527770996, "step": 10550 }, { "epoch": 1.819434872501723, "grad_norm": 36.81279754638672, "learning_rate": 4.020984687757918e-08, "logits/chosen": -2.3187246322631836, "logits/rejected": -2.29058837890625, "logps/chosen": -177.59408569335938, "logps/rejected": -212.2113800048828, "loss": 0.5926, "rewards/accuracies": 0.75, "rewards/chosen": -1.208798885345459, "rewards/margins": 0.3812781274318695, "rewards/rejected": -1.5900771617889404, "step": 10560 }, { "epoch": 1.821157822191592, "grad_norm": 37.20123291015625, "learning_rate": 4.0111564717216845e-08, "logits/chosen": -2.3557188510894775, "logits/rejected": -2.3360562324523926, "logps/chosen": -179.9247589111328, "logps/rejected": -215.9092254638672, "loss": 0.5865, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2344462871551514, "rewards/margins": 0.3715837001800537, "rewards/rejected": -1.6060298681259155, "step": 10570 }, { "epoch": 1.822880771881461, "grad_norm": 31.85076904296875, "learning_rate": 4.001332230246597e-08, "logits/chosen": -2.3521053791046143, "logits/rejected": -2.327073574066162, "logps/chosen": -173.4813995361328, "logps/rejected": -206.79641723632812, "loss": 0.586, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.1977475881576538, "rewards/margins": 0.34784871339797974, "rewards/rejected": -1.5455963611602783, "step": 10580 }, { "epoch": 1.82460372157133, "grad_norm": 30.5350284576416, "learning_rate": 3.9915120028202434e-08, "logits/chosen": -2.3166115283966064, "logits/rejected": -2.2853188514709473, "logps/chosen": -181.58470153808594, "logps/rejected": -203.5180206298828, "loss": 0.6113, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.2494252920150757, "rewards/margins": 0.2973533272743225, "rewards/rejected": -1.546778678894043, "step": 10590 }, { "epoch": 1.8263266712611992, "grad_norm": 26.205764770507812, "learning_rate": 3.9816958289140836e-08, "logits/chosen": -2.386885404586792, "logits/rejected": -2.3785994052886963, "logps/chosen": -169.0087890625, "logps/rejected": -195.49234008789062, "loss": 0.6327, "rewards/accuracies": 0.6875, "rewards/chosen": -1.156529188156128, "rewards/margins": 0.2596370577812195, "rewards/rejected": -1.4161661863327026, "step": 10600 }, { "epoch": 1.8280496209510684, "grad_norm": 40.030616760253906, "learning_rate": 3.971883747983278e-08, "logits/chosen": -2.3261427879333496, "logits/rejected": -2.316303014755249, "logps/chosen": -176.147705078125, "logps/rejected": -204.08290100097656, "loss": 0.6211, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2405706644058228, "rewards/margins": 0.28227168321609497, "rewards/rejected": -1.522842288017273, "step": 10610 }, { "epoch": 1.8297725706409373, "grad_norm": 32.2052116394043, "learning_rate": 3.9620757994665383e-08, "logits/chosen": -2.2571442127227783, "logits/rejected": -2.2298996448516846, "logps/chosen": -172.920654296875, "logps/rejected": -204.323974609375, "loss": 0.5915, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.1804695129394531, "rewards/margins": 0.3475439250469208, "rewards/rejected": -1.5280134677886963, "step": 10620 }, { "epoch": 1.8314955203308063, "grad_norm": 32.063472747802734, "learning_rate": 3.952272022785971e-08, "logits/chosen": -2.347079038619995, "logits/rejected": -2.3194215297698975, "logps/chosen": -169.39932250976562, "logps/rejected": -202.79177856445312, "loss": 0.5969, "rewards/accuracies": 0.6875, "rewards/chosen": -1.138787031173706, "rewards/margins": 0.3578266203403473, "rewards/rejected": -1.4966135025024414, "step": 10630 }, { "epoch": 1.8332184700206753, "grad_norm": 61.603126525878906, "learning_rate": 3.9424724573469094e-08, "logits/chosen": -2.374408483505249, "logits/rejected": -2.3449018001556396, "logps/chosen": -172.2761993408203, "logps/rejected": -200.76254272460938, "loss": 0.6058, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.1696155071258545, "rewards/margins": 0.32298147678375244, "rewards/rejected": -1.4925968647003174, "step": 10640 }, { "epoch": 1.8349414197105445, "grad_norm": 31.435705184936523, "learning_rate": 3.9326771425377586e-08, "logits/chosen": -2.364908218383789, "logits/rejected": -2.3407516479492188, "logps/chosen": -182.14035034179688, "logps/rejected": -222.3705291748047, "loss": 0.5796, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.2733443975448608, "rewards/margins": 0.422320693731308, "rewards/rejected": -1.6956650018692017, "step": 10650 }, { "epoch": 1.8366643694004137, "grad_norm": 35.018035888671875, "learning_rate": 3.9228861177298434e-08, "logits/chosen": -2.2953758239746094, "logits/rejected": -2.2792255878448486, "logps/chosen": -186.21810913085938, "logps/rejected": -211.9817657470703, "loss": 0.6269, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.3122917413711548, "rewards/margins": 0.26357850432395935, "rewards/rejected": -1.5758702754974365, "step": 10660 }, { "epoch": 1.8383873190902826, "grad_norm": 26.195331573486328, "learning_rate": 3.913099422277242e-08, "logits/chosen": -2.326352596282959, "logits/rejected": -2.3013761043548584, "logps/chosen": -184.91539001464844, "logps/rejected": -218.487060546875, "loss": 0.5974, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.3113325834274292, "rewards/margins": 0.3626203238964081, "rewards/rejected": -1.6739528179168701, "step": 10670 }, { "epoch": 1.8401102687801516, "grad_norm": 31.851932525634766, "learning_rate": 3.903317095516634e-08, "logits/chosen": -2.346625804901123, "logits/rejected": -2.3183517456054688, "logps/chosen": -183.88546752929688, "logps/rejected": -205.15170288085938, "loss": 0.6126, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.263718843460083, "rewards/margins": 0.29013028740882874, "rewards/rejected": -1.5538489818572998, "step": 10680 }, { "epoch": 1.8418332184700206, "grad_norm": 40.034358978271484, "learning_rate": 3.893539176767138e-08, "logits/chosen": -2.309453010559082, "logits/rejected": -2.2947006225585938, "logps/chosen": -185.39358520507812, "logps/rejected": -224.47067260742188, "loss": 0.5827, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.3045397996902466, "rewards/margins": 0.3803395628929138, "rewards/rejected": -1.6848793029785156, "step": 10690 }, { "epoch": 1.8435561681598898, "grad_norm": 50.86007308959961, "learning_rate": 3.8837657053301533e-08, "logits/chosen": -2.3584494590759277, "logits/rejected": -2.3197760581970215, "logps/chosen": -180.21823120117188, "logps/rejected": -205.2100830078125, "loss": 0.5955, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.23667311668396, "rewards/margins": 0.3189604878425598, "rewards/rejected": -1.555633783340454, "step": 10700 }, { "epoch": 1.8452791178497587, "grad_norm": 30.94768524169922, "learning_rate": 3.873996720489205e-08, "logits/chosen": -2.308336019515991, "logits/rejected": -2.280362606048584, "logps/chosen": -176.10208129882812, "logps/rejected": -202.3173065185547, "loss": 0.5951, "rewards/accuracies": 0.625, "rewards/chosen": -1.2233304977416992, "rewards/margins": 0.31216123700141907, "rewards/rejected": -1.535491704940796, "step": 10710 }, { "epoch": 1.847002067539628, "grad_norm": 35.156558990478516, "learning_rate": 3.864232261509787e-08, "logits/chosen": -2.302485942840576, "logits/rejected": -2.2754483222961426, "logps/chosen": -184.95298767089844, "logps/rejected": -211.0207061767578, "loss": 0.6317, "rewards/accuracies": 0.65625, "rewards/chosen": -1.316679835319519, "rewards/margins": 0.2793746590614319, "rewards/rejected": -1.5960544347763062, "step": 10720 }, { "epoch": 1.848725017229497, "grad_norm": 29.506084442138672, "learning_rate": 3.8544723676392e-08, "logits/chosen": -2.4065823554992676, "logits/rejected": -2.3675832748413086, "logps/chosen": -179.6064453125, "logps/rejected": -211.91336059570312, "loss": 0.5788, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.2562201023101807, "rewards/margins": 0.36423414945602417, "rewards/rejected": -1.62045419216156, "step": 10730 }, { "epoch": 1.8504479669193659, "grad_norm": 42.23896789550781, "learning_rate": 3.844717078106394e-08, "logits/chosen": -2.29411244392395, "logits/rejected": -2.273477077484131, "logps/chosen": -187.98069763183594, "logps/rejected": -210.1696319580078, "loss": 0.6447, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.3325438499450684, "rewards/margins": 0.22396263480186462, "rewards/rejected": -1.5565065145492554, "step": 10740 }, { "epoch": 1.852170916609235, "grad_norm": 33.77505874633789, "learning_rate": 3.8349664321218135e-08, "logits/chosen": -2.294753313064575, "logits/rejected": -2.2612767219543457, "logps/chosen": -169.0750274658203, "logps/rejected": -208.6610107421875, "loss": 0.5765, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1630382537841797, "rewards/margins": 0.40251216292381287, "rewards/rejected": -1.565550446510315, "step": 10750 }, { "epoch": 1.853893866299104, "grad_norm": 38.18898010253906, "learning_rate": 3.82522046887724e-08, "logits/chosen": -2.3010048866271973, "logits/rejected": -2.277493715286255, "logps/chosen": -179.2954559326172, "logps/rejected": -204.9003143310547, "loss": 0.623, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.2467565536499023, "rewards/margins": 0.2810923159122467, "rewards/rejected": -1.5278488397598267, "step": 10760 }, { "epoch": 1.8556168159889732, "grad_norm": 33.77649688720703, "learning_rate": 3.815479227545633e-08, "logits/chosen": -2.295396327972412, "logits/rejected": -2.279035806655884, "logps/chosen": -181.14846801757812, "logps/rejected": -210.6282958984375, "loss": 0.598, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.2278902530670166, "rewards/margins": 0.3429439067840576, "rewards/rejected": -1.5708341598510742, "step": 10770 }, { "epoch": 1.8573397656788422, "grad_norm": 31.69585418701172, "learning_rate": 3.8057427472809736e-08, "logits/chosen": -2.4099316596984863, "logits/rejected": -2.3927342891693115, "logps/chosen": -173.10000610351562, "logps/rejected": -204.47640991210938, "loss": 0.6153, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1872986555099487, "rewards/margins": 0.30703839659690857, "rewards/rejected": -1.4943370819091797, "step": 10780 }, { "epoch": 1.8590627153687111, "grad_norm": 37.59237289428711, "learning_rate": 3.796011067218101e-08, "logits/chosen": -2.436169147491455, "logits/rejected": -2.4094064235687256, "logps/chosen": -167.60398864746094, "logps/rejected": -195.60458374023438, "loss": 0.6161, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.142968773841858, "rewards/margins": 0.29017987847328186, "rewards/rejected": -1.4331486225128174, "step": 10790 }, { "epoch": 1.8607856650585803, "grad_norm": 29.355329513549805, "learning_rate": 3.786284226472565e-08, "logits/chosen": -2.3927254676818848, "logits/rejected": -2.368919849395752, "logps/chosen": -168.04922485351562, "logps/rejected": -204.6519317626953, "loss": 0.5889, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.1452404260635376, "rewards/margins": 0.37039321660995483, "rewards/rejected": -1.5156338214874268, "step": 10800 }, { "epoch": 1.8607856650585803, "eval_logits/chosen": -2.403923511505127, "eval_logits/rejected": -2.393235445022583, "eval_logps/chosen": -160.73316955566406, "eval_logps/rejected": -180.2567596435547, "eval_loss": 0.6505710482597351, "eval_rewards/accuracies": 0.6054832935333252, "eval_rewards/chosen": -1.017176866531372, "eval_rewards/margins": 0.15789496898651123, "eval_rewards/rejected": -1.1750717163085938, "eval_runtime": 384.6021, "eval_samples_per_second": 11.191, "eval_steps_per_second": 1.399, "step": 10800 }, { "epoch": 1.8625086147484493, "grad_norm": 28.493104934692383, "learning_rate": 3.776562264140464e-08, "logits/chosen": -2.365048408508301, "logits/rejected": -2.3278393745422363, "logps/chosen": -180.26234436035156, "logps/rejected": -199.35757446289062, "loss": 0.6182, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.215912103652954, "rewards/margins": 0.285257488489151, "rewards/rejected": -1.5011695623397827, "step": 10810 }, { "epoch": 1.8642315644383185, "grad_norm": 33.84832763671875, "learning_rate": 3.766845219298291e-08, "logits/chosen": -2.3179924488067627, "logits/rejected": -2.294447422027588, "logps/chosen": -167.72108459472656, "logps/rejected": -197.6214141845703, "loss": 0.6103, "rewards/accuracies": 0.625, "rewards/chosen": -1.1377894878387451, "rewards/margins": 0.3336928188800812, "rewards/rejected": -1.471482515335083, "step": 10820 }, { "epoch": 1.8659545141281875, "grad_norm": 32.825592041015625, "learning_rate": 3.757133131002764e-08, "logits/chosen": -2.337118148803711, "logits/rejected": -2.3134689331054688, "logps/chosen": -176.63125610351562, "logps/rejected": -202.58164978027344, "loss": 0.6203, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.2352012395858765, "rewards/margins": 0.26735469698905945, "rewards/rejected": -1.5025558471679688, "step": 10830 }, { "epoch": 1.8676774638180564, "grad_norm": 25.85580062866211, "learning_rate": 3.747426038290689e-08, "logits/chosen": -2.346402645111084, "logits/rejected": -2.325465440750122, "logps/chosen": -167.85498046875, "logps/rejected": -191.32839965820312, "loss": 0.6291, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.1382768154144287, "rewards/margins": 0.26838475465774536, "rewards/rejected": -1.4066616296768188, "step": 10840 }, { "epoch": 1.8694004135079254, "grad_norm": 30.227493286132812, "learning_rate": 3.737723980178786e-08, "logits/chosen": -2.3299098014831543, "logits/rejected": -2.309762477874756, "logps/chosen": -159.28550720214844, "logps/rejected": -191.73582458496094, "loss": 0.5993, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.0782078504562378, "rewards/margins": 0.2990468144416809, "rewards/rejected": -1.377254605293274, "step": 10850 }, { "epoch": 1.8711233631977946, "grad_norm": 34.90668487548828, "learning_rate": 3.7280269956635414e-08, "logits/chosen": -2.394364595413208, "logits/rejected": -2.3578202724456787, "logps/chosen": -168.866943359375, "logps/rejected": -199.4810333251953, "loss": 0.5919, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.1270270347595215, "rewards/margins": 0.3460743725299835, "rewards/rejected": -1.4731013774871826, "step": 10860 }, { "epoch": 1.8728463128876638, "grad_norm": 25.8541259765625, "learning_rate": 3.718335123721054e-08, "logits/chosen": -2.2831482887268066, "logits/rejected": -2.2722556591033936, "logps/chosen": -161.58595275878906, "logps/rejected": -201.04360961914062, "loss": 0.5987, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.107114553451538, "rewards/margins": 0.34930944442749023, "rewards/rejected": -1.4564241170883179, "step": 10870 }, { "epoch": 1.8745692625775328, "grad_norm": 34.01238250732422, "learning_rate": 3.708648403306859e-08, "logits/chosen": -2.3505282402038574, "logits/rejected": -2.3237061500549316, "logps/chosen": -172.83595275878906, "logps/rejected": -201.0712127685547, "loss": 0.5968, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1779439449310303, "rewards/margins": 0.3129883408546448, "rewards/rejected": -1.4909324645996094, "step": 10880 }, { "epoch": 1.8762922122674017, "grad_norm": 38.18024444580078, "learning_rate": 3.698966873355802e-08, "logits/chosen": -2.374328374862671, "logits/rejected": -2.3627209663391113, "logps/chosen": -175.97042846679688, "logps/rejected": -203.05239868164062, "loss": 0.6211, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.2167003154754639, "rewards/margins": 0.27208462357521057, "rewards/rejected": -1.4887850284576416, "step": 10890 }, { "epoch": 1.8780151619572707, "grad_norm": 31.21358299255371, "learning_rate": 3.6892905727818544e-08, "logits/chosen": -2.4051666259765625, "logits/rejected": -2.3722360134124756, "logps/chosen": -170.09390258789062, "logps/rejected": -193.1257781982422, "loss": 0.6076, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.1315958499908447, "rewards/margins": 0.2980080544948578, "rewards/rejected": -1.429603934288025, "step": 10900 }, { "epoch": 1.8797381116471399, "grad_norm": 38.000186920166016, "learning_rate": 3.679619540477975e-08, "logits/chosen": -2.3199026584625244, "logits/rejected": -2.2889769077301025, "logps/chosen": -169.51327514648438, "logps/rejected": -195.25967407226562, "loss": 0.6133, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.1240684986114502, "rewards/margins": 0.288606733083725, "rewards/rejected": -1.412675142288208, "step": 10910 }, { "epoch": 1.881461061337009, "grad_norm": 34.62694549560547, "learning_rate": 3.669953815315943e-08, "logits/chosen": -2.3156676292419434, "logits/rejected": -2.292684555053711, "logps/chosen": -174.4659881591797, "logps/rejected": -199.62318420410156, "loss": 0.6055, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.180438756942749, "rewards/margins": 0.2978615164756775, "rewards/rejected": -1.4783003330230713, "step": 10920 }, { "epoch": 1.883184011026878, "grad_norm": 34.97676086425781, "learning_rate": 3.6602934361462065e-08, "logits/chosen": -2.2863287925720215, "logits/rejected": -2.2612805366516113, "logps/chosen": -173.59727478027344, "logps/rejected": -189.28097534179688, "loss": 0.6444, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.1842314004898071, "rewards/margins": 0.19635871052742004, "rewards/rejected": -1.3805900812149048, "step": 10930 }, { "epoch": 1.884906960716747, "grad_norm": 36.55752944946289, "learning_rate": 3.6506384417977314e-08, "logits/chosen": -2.2828993797302246, "logits/rejected": -2.265763282775879, "logps/chosen": -173.64022827148438, "logps/rejected": -195.41806030273438, "loss": 0.6228, "rewards/accuracies": 0.625, "rewards/chosen": -1.1666899919509888, "rewards/margins": 0.24730460345745087, "rewards/rejected": -1.413994550704956, "step": 10940 }, { "epoch": 1.886629910406616, "grad_norm": 33.55683135986328, "learning_rate": 3.6409888710778344e-08, "logits/chosen": -2.331960439682007, "logits/rejected": -2.317816972732544, "logps/chosen": -167.28970336914062, "logps/rejected": -188.5340576171875, "loss": 0.6268, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.1197372674942017, "rewards/margins": 0.22455672919750214, "rewards/rejected": -1.3442939519882202, "step": 10950 }, { "epoch": 1.8883528600964852, "grad_norm": 36.97398376464844, "learning_rate": 3.631344762772034e-08, "logits/chosen": -2.3534226417541504, "logits/rejected": -2.3306939601898193, "logps/chosen": -163.12069702148438, "logps/rejected": -192.68130493164062, "loss": 0.6123, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0892016887664795, "rewards/margins": 0.3190400302410126, "rewards/rejected": -1.4082419872283936, "step": 10960 }, { "epoch": 1.8900758097863544, "grad_norm": 30.5578556060791, "learning_rate": 3.621706155643891e-08, "logits/chosen": -2.372744083404541, "logits/rejected": -2.340273380279541, "logps/chosen": -158.9438018798828, "logps/rejected": -191.1975555419922, "loss": 0.5876, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.013116478919983, "rewards/margins": 0.36311542987823486, "rewards/rejected": -1.3762319087982178, "step": 10970 }, { "epoch": 1.8917987594762233, "grad_norm": 50.900856018066406, "learning_rate": 3.612073088434858e-08, "logits/chosen": -2.3769946098327637, "logits/rejected": -2.3548004627227783, "logps/chosen": -167.50221252441406, "logps/rejected": -200.83953857421875, "loss": 0.6065, "rewards/accuracies": 0.65625, "rewards/chosen": -1.1152487993240356, "rewards/margins": 0.34097960591316223, "rewards/rejected": -1.456228494644165, "step": 10980 }, { "epoch": 1.8935217091660923, "grad_norm": 27.747989654541016, "learning_rate": 3.6024455998641206e-08, "logits/chosen": -2.3134467601776123, "logits/rejected": -2.298866033554077, "logps/chosen": -158.34649658203125, "logps/rejected": -185.16175842285156, "loss": 0.6092, "rewards/accuracies": 0.65625, "rewards/chosen": -1.0483105182647705, "rewards/margins": 0.278871089220047, "rewards/rejected": -1.3271814584732056, "step": 10990 }, { "epoch": 1.8952446588559613, "grad_norm": 30.30034637451172, "learning_rate": 3.592823728628439e-08, "logits/chosen": -2.4606642723083496, "logits/rejected": -2.453303575515747, "logps/chosen": -156.36085510253906, "logps/rejected": -186.90420532226562, "loss": 0.618, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.0328586101531982, "rewards/margins": 0.2768515944480896, "rewards/rejected": -1.309709906578064, "step": 11000 }, { "epoch": 1.8969676085458305, "grad_norm": 28.235002517700195, "learning_rate": 3.5832075134019955e-08, "logits/chosen": -2.380627155303955, "logits/rejected": -2.348646640777588, "logps/chosen": -154.5124053955078, "logps/rejected": -189.811279296875, "loss": 0.5734, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.005467414855957, "rewards/margins": 0.38841429352760315, "rewards/rejected": -1.3938816785812378, "step": 11010 }, { "epoch": 1.8986905582356997, "grad_norm": 32.709320068359375, "learning_rate": 3.573596992836239e-08, "logits/chosen": -2.406069278717041, "logits/rejected": -2.386014223098755, "logps/chosen": -160.51290893554688, "logps/rejected": -179.46875, "loss": 0.6359, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.0448853969573975, "rewards/margins": 0.22973811626434326, "rewards/rejected": -1.2746235132217407, "step": 11020 }, { "epoch": 1.9004135079255686, "grad_norm": 34.195552825927734, "learning_rate": 3.5639922055597306e-08, "logits/chosen": -2.4023020267486572, "logits/rejected": -2.3860769271850586, "logps/chosen": -170.53150939941406, "logps/rejected": -193.5362548828125, "loss": 0.6419, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.1451653242111206, "rewards/margins": 0.25867563486099243, "rewards/rejected": -1.4038410186767578, "step": 11030 }, { "epoch": 1.9021364576154376, "grad_norm": 29.43927001953125, "learning_rate": 3.5543931901779855e-08, "logits/chosen": -2.412224531173706, "logits/rejected": -2.393313407897949, "logps/chosen": -164.941650390625, "logps/rejected": -190.35043334960938, "loss": 0.6026, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.0708945989608765, "rewards/margins": 0.28530409932136536, "rewards/rejected": -1.3561986684799194, "step": 11040 }, { "epoch": 1.9038594073053066, "grad_norm": 32.702964782714844, "learning_rate": 3.544799985273321e-08, "logits/chosen": -2.363358974456787, "logits/rejected": -2.34291672706604, "logps/chosen": -150.8433837890625, "logps/rejected": -184.0169219970703, "loss": 0.5946, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.9849146008491516, "rewards/margins": 0.3147234320640564, "rewards/rejected": -1.2996381521224976, "step": 11050 }, { "epoch": 1.9055823569951758, "grad_norm": 32.199180603027344, "learning_rate": 3.535212629404697e-08, "logits/chosen": -2.3609559535980225, "logits/rejected": -2.3134467601776123, "logps/chosen": -164.59950256347656, "logps/rejected": -197.69320678710938, "loss": 0.5852, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.0920616388320923, "rewards/margins": 0.37384092807769775, "rewards/rejected": -1.46590256690979, "step": 11060 }, { "epoch": 1.907305306685045, "grad_norm": 32.38897705078125, "learning_rate": 3.525631161107564e-08, "logits/chosen": -2.3948540687561035, "logits/rejected": -2.350449562072754, "logps/chosen": -169.53561401367188, "logps/rejected": -205.30337524414062, "loss": 0.5558, "rewards/accuracies": 0.71875, "rewards/chosen": -1.148709774017334, "rewards/margins": 0.4157761037349701, "rewards/rejected": -1.5644859075546265, "step": 11070 }, { "epoch": 1.909028256374914, "grad_norm": 25.04302978515625, "learning_rate": 3.516055618893712e-08, "logits/chosen": -2.3484442234039307, "logits/rejected": -2.3237853050231934, "logps/chosen": -187.7406005859375, "logps/rejected": -216.2569122314453, "loss": 0.5911, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.306531548500061, "rewards/margins": 0.3401932120323181, "rewards/rejected": -1.6467249393463135, "step": 11080 }, { "epoch": 1.9107512060647829, "grad_norm": 27.777585983276367, "learning_rate": 3.50648604125111e-08, "logits/chosen": -2.333702564239502, "logits/rejected": -2.307685136795044, "logps/chosen": -190.0394287109375, "logps/rejected": -215.70840454101562, "loss": 0.6033, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.319045066833496, "rewards/margins": 0.31661438941955566, "rewards/rejected": -1.6356592178344727, "step": 11090 }, { "epoch": 1.9124741557546519, "grad_norm": 31.222261428833008, "learning_rate": 3.496922466643748e-08, "logits/chosen": -2.2713112831115723, "logits/rejected": -2.255934238433838, "logps/chosen": -176.26388549804688, "logps/rejected": -202.74990844726562, "loss": 0.6314, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2430853843688965, "rewards/margins": 0.2581595778465271, "rewards/rejected": -1.5012450218200684, "step": 11100 }, { "epoch": 1.914197105444521, "grad_norm": 29.14847183227539, "learning_rate": 3.487364933511494e-08, "logits/chosen": -2.318223476409912, "logits/rejected": -2.2866435050964355, "logps/chosen": -188.31454467773438, "logps/rejected": -228.31185913085938, "loss": 0.5999, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.3372859954833984, "rewards/margins": 0.4245668947696686, "rewards/rejected": -1.7618528604507446, "step": 11110 }, { "epoch": 1.9159200551343902, "grad_norm": 28.7227783203125, "learning_rate": 3.4778134802699274e-08, "logits/chosen": -2.401853084564209, "logits/rejected": -2.3755974769592285, "logps/chosen": -185.71531677246094, "logps/rejected": -211.0960693359375, "loss": 0.6019, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.251940131187439, "rewards/margins": 0.33058732748031616, "rewards/rejected": -1.5825273990631104, "step": 11120 }, { "epoch": 1.9176430048242592, "grad_norm": 38.03221893310547, "learning_rate": 3.4682681453101966e-08, "logits/chosen": -2.3060457706451416, "logits/rejected": -2.27876615524292, "logps/chosen": -185.47006225585938, "logps/rejected": -210.9840087890625, "loss": 0.6077, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.2905194759368896, "rewards/margins": 0.29180076718330383, "rewards/rejected": -1.5823204517364502, "step": 11130 }, { "epoch": 1.9193659545141282, "grad_norm": 36.275672912597656, "learning_rate": 3.458728966998853e-08, "logits/chosen": -2.2617485523223877, "logits/rejected": -2.241072416305542, "logps/chosen": -178.1785430908203, "logps/rejected": -203.6383819580078, "loss": 0.6146, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.2262318134307861, "rewards/margins": 0.2985805869102478, "rewards/rejected": -1.5248124599456787, "step": 11140 }, { "epoch": 1.9210889042039971, "grad_norm": 27.28822135925293, "learning_rate": 3.4491959836777025e-08, "logits/chosen": -2.3436708450317383, "logits/rejected": -2.3321614265441895, "logps/chosen": -181.6899871826172, "logps/rejected": -201.81768798828125, "loss": 0.6403, "rewards/accuracies": 0.65625, "rewards/chosen": -1.2293510437011719, "rewards/margins": 0.23338110744953156, "rewards/rejected": -1.4627320766448975, "step": 11150 }, { "epoch": 1.9228118538938663, "grad_norm": 35.84111785888672, "learning_rate": 3.439669233663651e-08, "logits/chosen": -2.3769030570983887, "logits/rejected": -2.3559274673461914, "logps/chosen": -171.7718963623047, "logps/rejected": -197.13270568847656, "loss": 0.6136, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.184159517288208, "rewards/margins": 0.280154824256897, "rewards/rejected": -1.464314579963684, "step": 11160 }, { "epoch": 1.9245348035837355, "grad_norm": 41.14706802368164, "learning_rate": 3.430148755248552e-08, "logits/chosen": -2.2820653915405273, "logits/rejected": -2.2608208656311035, "logps/chosen": -178.73739624023438, "logps/rejected": -189.0192108154297, "loss": 0.6456, "rewards/accuracies": 0.625, "rewards/chosen": -1.2014598846435547, "rewards/margins": 0.20202994346618652, "rewards/rejected": -1.4034898281097412, "step": 11170 }, { "epoch": 1.9262577532736045, "grad_norm": 30.83884048461914, "learning_rate": 3.4206345866990535e-08, "logits/chosen": -2.3826446533203125, "logits/rejected": -2.3630967140197754, "logps/chosen": -181.05758666992188, "logps/rejected": -206.005615234375, "loss": 0.6197, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.2589973211288452, "rewards/margins": 0.29420050978660583, "rewards/rejected": -1.553197979927063, "step": 11180 }, { "epoch": 1.9279807029634735, "grad_norm": 33.7467041015625, "learning_rate": 3.41112676625643e-08, "logits/chosen": -2.380197048187256, "logits/rejected": -2.3536598682403564, "logps/chosen": -167.689697265625, "logps/rejected": -200.7379608154297, "loss": 0.5648, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.117105484008789, "rewards/margins": 0.3807833790779114, "rewards/rejected": -1.4978888034820557, "step": 11190 }, { "epoch": 1.9297036526533424, "grad_norm": 38.58169937133789, "learning_rate": 3.401625332136455e-08, "logits/chosen": -2.3757309913635254, "logits/rejected": -2.3479130268096924, "logps/chosen": -165.47581481933594, "logps/rejected": -200.92559814453125, "loss": 0.5685, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1087088584899902, "rewards/margins": 0.36637088656425476, "rewards/rejected": -1.4750797748565674, "step": 11200 }, { "epoch": 1.9297036526533424, "eval_logits/chosen": -2.3992388248443604, "eval_logits/rejected": -2.3886518478393555, "eval_logps/chosen": -161.57830810546875, "eval_logps/rejected": -181.8199920654297, "eval_loss": 0.6485751867294312, "eval_rewards/accuracies": 0.5992100238800049, "eval_rewards/chosen": -1.0256284475326538, "eval_rewards/margins": 0.16507543623447418, "eval_rewards/rejected": -1.1907037496566772, "eval_runtime": 384.7254, "eval_samples_per_second": 11.187, "eval_steps_per_second": 1.398, "step": 11200 }, { "epoch": 1.9314266023432116, "grad_norm": 31.150392532348633, "learning_rate": 3.3921303225292226e-08, "logits/chosen": -2.2678802013397217, "logits/rejected": -2.243900775909424, "logps/chosen": -170.72007751464844, "logps/rejected": -208.6854705810547, "loss": 0.5868, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.1942079067230225, "rewards/margins": 0.3671010732650757, "rewards/rejected": -1.5613089799880981, "step": 11210 }, { "epoch": 1.9331495520330806, "grad_norm": 44.891056060791016, "learning_rate": 3.382641775599008e-08, "logits/chosen": -2.316336154937744, "logits/rejected": -2.302032947540283, "logps/chosen": -177.46009826660156, "logps/rejected": -207.8301239013672, "loss": 0.6361, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.2496541738510132, "rewards/margins": 0.2785920202732086, "rewards/rejected": -1.5282460451126099, "step": 11220 }, { "epoch": 1.9348725017229498, "grad_norm": 30.4825496673584, "learning_rate": 3.373159729484113e-08, "logits/chosen": -2.2911577224731445, "logits/rejected": -2.276029348373413, "logps/chosen": -193.60427856445312, "logps/rejected": -210.3373565673828, "loss": 0.6456, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.3614342212677002, "rewards/margins": 0.2419278621673584, "rewards/rejected": -1.6033620834350586, "step": 11230 }, { "epoch": 1.9365954514128187, "grad_norm": 27.854116439819336, "learning_rate": 3.363684222296704e-08, "logits/chosen": -2.3137216567993164, "logits/rejected": -2.2912824153900146, "logps/chosen": -181.1165771484375, "logps/rejected": -202.65650939941406, "loss": 0.6348, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.2578670978546143, "rewards/margins": 0.24567310512065887, "rewards/rejected": -1.503540277481079, "step": 11240 }, { "epoch": 1.9383184011026877, "grad_norm": 36.37520980834961, "learning_rate": 3.3542152921226686e-08, "logits/chosen": -2.33618426322937, "logits/rejected": -2.308847188949585, "logps/chosen": -171.78488159179688, "logps/rejected": -205.62368774414062, "loss": 0.5829, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1678929328918457, "rewards/margins": 0.3575848937034607, "rewards/rejected": -1.5254778861999512, "step": 11250 }, { "epoch": 1.940041350792557, "grad_norm": 30.956172943115234, "learning_rate": 3.3447529770214565e-08, "logits/chosen": -2.29649019241333, "logits/rejected": -2.265418291091919, "logps/chosen": -179.08489990234375, "logps/rejected": -201.74864196777344, "loss": 0.6165, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2031055688858032, "rewards/margins": 0.3095191419124603, "rewards/rejected": -1.5126248598098755, "step": 11260 }, { "epoch": 1.9417643004824259, "grad_norm": 29.66795539855957, "learning_rate": 3.335297315025935e-08, "logits/chosen": -2.284562110900879, "logits/rejected": -2.2554664611816406, "logps/chosen": -176.65817260742188, "logps/rejected": -206.97415161132812, "loss": 0.5787, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.2032575607299805, "rewards/margins": 0.36220937967300415, "rewards/rejected": -1.5654670000076294, "step": 11270 }, { "epoch": 1.943487250172295, "grad_norm": 31.585372924804688, "learning_rate": 3.325848344142219e-08, "logits/chosen": -2.338066816329956, "logits/rejected": -2.3050124645233154, "logps/chosen": -183.82411193847656, "logps/rejected": -204.71006774902344, "loss": 0.6312, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.2679367065429688, "rewards/margins": 0.26346999406814575, "rewards/rejected": -1.5314067602157593, "step": 11280 }, { "epoch": 1.945210199862164, "grad_norm": 35.755943298339844, "learning_rate": 3.3164061023495385e-08, "logits/chosen": -2.318530559539795, "logits/rejected": -2.295261859893799, "logps/chosen": -185.0235137939453, "logps/rejected": -217.89462280273438, "loss": 0.5831, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.2432845830917358, "rewards/margins": 0.38053396344184875, "rewards/rejected": -1.6238186359405518, "step": 11290 }, { "epoch": 1.946933149552033, "grad_norm": 37.60653305053711, "learning_rate": 3.306970627600073e-08, "logits/chosen": -2.2636165618896484, "logits/rejected": -2.240103006362915, "logps/chosen": -199.27163696289062, "logps/rejected": -211.94677734375, "loss": 0.662, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.392655611038208, "rewards/margins": 0.1966453492641449, "rewards/rejected": -1.5893008708953857, "step": 11300 }, { "epoch": 1.948656099241902, "grad_norm": 35.965511322021484, "learning_rate": 3.297541957818801e-08, "logits/chosen": -2.362086772918701, "logits/rejected": -2.3415915966033936, "logps/chosen": -180.53945922851562, "logps/rejected": -220.4781951904297, "loss": 0.57, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.2809550762176514, "rewards/margins": 0.4184480607509613, "rewards/rejected": -1.699403166770935, "step": 11310 }, { "epoch": 1.9503790489317712, "grad_norm": 26.968000411987305, "learning_rate": 3.2881201309033555e-08, "logits/chosen": -2.318852663040161, "logits/rejected": -2.293492555618286, "logps/chosen": -178.1951904296875, "logps/rejected": -210.78713989257812, "loss": 0.5892, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2191166877746582, "rewards/margins": 0.3563511371612549, "rewards/rejected": -1.575467824935913, "step": 11320 }, { "epoch": 1.9521019986216404, "grad_norm": 55.6558952331543, "learning_rate": 3.278705184723856e-08, "logits/chosen": -2.308856964111328, "logits/rejected": -2.28068470954895, "logps/chosen": -193.82093811035156, "logps/rejected": -221.250244140625, "loss": 0.6047, "rewards/accuracies": 0.75, "rewards/chosen": -1.396980881690979, "rewards/margins": 0.33789733052253723, "rewards/rejected": -1.7348783016204834, "step": 11330 }, { "epoch": 1.9538249483115093, "grad_norm": 42.39641571044922, "learning_rate": 3.2692971571227705e-08, "logits/chosen": -2.269031286239624, "logits/rejected": -2.2357709407806396, "logps/chosen": -187.25526428222656, "logps/rejected": -226.68356323242188, "loss": 0.5747, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.30891752243042, "rewards/margins": 0.41980305314064026, "rewards/rejected": -1.7287204265594482, "step": 11340 }, { "epoch": 1.9555478980013783, "grad_norm": 40.96769332885742, "learning_rate": 3.25989608591476e-08, "logits/chosen": -2.3529152870178223, "logits/rejected": -2.3288815021514893, "logps/chosen": -194.79385375976562, "logps/rejected": -220.18704223632812, "loss": 0.6385, "rewards/accuracies": 0.625, "rewards/chosen": -1.3637688159942627, "rewards/margins": 0.2891305983066559, "rewards/rejected": -1.652899146080017, "step": 11350 }, { "epoch": 1.9572708476912473, "grad_norm": 37.165679931640625, "learning_rate": 3.250502008886524e-08, "logits/chosen": -2.311553716659546, "logits/rejected": -2.287487506866455, "logps/chosen": -188.62142944335938, "logps/rejected": -216.90078735351562, "loss": 0.599, "rewards/accuracies": 0.65625, "rewards/chosen": -1.3451268672943115, "rewards/margins": 0.34311169385910034, "rewards/rejected": -1.688238501548767, "step": 11360 }, { "epoch": 1.9589937973811165, "grad_norm": 29.16134262084961, "learning_rate": 3.241114963796646e-08, "logits/chosen": -2.303574562072754, "logits/rejected": -2.284886360168457, "logps/chosen": -185.53944396972656, "logps/rejected": -211.68630981445312, "loss": 0.6121, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.287890076637268, "rewards/margins": 0.3026021122932434, "rewards/rejected": -1.5904921293258667, "step": 11370 }, { "epoch": 1.9607167470709856, "grad_norm": 36.85075378417969, "learning_rate": 3.231734988375447e-08, "logits/chosen": -2.2683825492858887, "logits/rejected": -2.2413578033447266, "logps/chosen": -181.06224060058594, "logps/rejected": -215.37014770507812, "loss": 0.605, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2876007556915283, "rewards/margins": 0.3665514588356018, "rewards/rejected": -1.6541521549224854, "step": 11380 }, { "epoch": 1.9624396967608546, "grad_norm": 30.369298934936523, "learning_rate": 3.222362120324837e-08, "logits/chosen": -2.3610501289367676, "logits/rejected": -2.331861972808838, "logps/chosen": -169.66964721679688, "logps/rejected": -206.345703125, "loss": 0.6025, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.16221022605896, "rewards/margins": 0.3576747179031372, "rewards/rejected": -1.5198849439620972, "step": 11390 }, { "epoch": 1.9641626464507236, "grad_norm": 30.62260627746582, "learning_rate": 3.2129963973181526e-08, "logits/chosen": -2.341277599334717, "logits/rejected": -2.3116092681884766, "logps/chosen": -168.75254821777344, "logps/rejected": -199.89877319335938, "loss": 0.5973, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.1201581954956055, "rewards/margins": 0.357011616230011, "rewards/rejected": -1.4771697521209717, "step": 11400 }, { "epoch": 1.9658855961405926, "grad_norm": 34.26206588745117, "learning_rate": 3.2036378570000146e-08, "logits/chosen": -2.3451850414276123, "logits/rejected": -2.3163180351257324, "logps/chosen": -168.078125, "logps/rejected": -200.7991485595703, "loss": 0.5808, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.1173222064971924, "rewards/margins": 0.35028529167175293, "rewards/rejected": -1.4676073789596558, "step": 11410 }, { "epoch": 1.9676085458304617, "grad_norm": 31.360225677490234, "learning_rate": 3.1942865369861704e-08, "logits/chosen": -2.313110113143921, "logits/rejected": -2.287832736968994, "logps/chosen": -178.32369995117188, "logps/rejected": -197.38841247558594, "loss": 0.6376, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2378849983215332, "rewards/margins": 0.2414713352918625, "rewards/rejected": -1.4793564081192017, "step": 11420 }, { "epoch": 1.969331495520331, "grad_norm": 35.559757232666016, "learning_rate": 3.18494247486335e-08, "logits/chosen": -2.277827739715576, "logits/rejected": -2.254122257232666, "logps/chosen": -170.04818725585938, "logps/rejected": -196.9369354248047, "loss": 0.607, "rewards/accuracies": 0.65625, "rewards/chosen": -1.1452734470367432, "rewards/margins": 0.3120071291923523, "rewards/rejected": -1.4572807550430298, "step": 11430 }, { "epoch": 1.9710544452102, "grad_norm": 28.967737197875977, "learning_rate": 3.1756057081891104e-08, "logits/chosen": -2.31591796875, "logits/rejected": -2.3056747913360596, "logps/chosen": -167.77761840820312, "logps/rejected": -187.48861694335938, "loss": 0.6362, "rewards/accuracies": 0.625, "rewards/chosen": -1.1490370035171509, "rewards/margins": 0.20246684551239014, "rewards/rejected": -1.3515037298202515, "step": 11440 }, { "epoch": 1.9727773949000689, "grad_norm": 33.354129791259766, "learning_rate": 3.166276274491684e-08, "logits/chosen": -2.3166346549987793, "logits/rejected": -2.2880496978759766, "logps/chosen": -165.89306640625, "logps/rejected": -200.16159057617188, "loss": 0.5776, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.1227996349334717, "rewards/margins": 0.36727264523506165, "rewards/rejected": -1.490072250366211, "step": 11450 }, { "epoch": 1.9745003445899378, "grad_norm": 27.8592529296875, "learning_rate": 3.156954211269828e-08, "logits/chosen": -2.3116652965545654, "logits/rejected": -2.2855889797210693, "logps/chosen": -167.8182373046875, "logps/rejected": -199.59274291992188, "loss": 0.593, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.1173088550567627, "rewards/margins": 0.34456413984298706, "rewards/rejected": -1.4618730545043945, "step": 11460 }, { "epoch": 1.976223294279807, "grad_norm": 33.00469207763672, "learning_rate": 3.147639555992677e-08, "logits/chosen": -2.3325459957122803, "logits/rejected": -2.2897353172302246, "logps/chosen": -187.8080596923828, "logps/rejected": -216.45242309570312, "loss": 0.5906, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.2975906133651733, "rewards/margins": 0.36165517568588257, "rewards/rejected": -1.6592458486557007, "step": 11470 }, { "epoch": 1.9779462439696762, "grad_norm": 35.083377838134766, "learning_rate": 3.138332346099587e-08, "logits/chosen": -2.3586764335632324, "logits/rejected": -2.320845603942871, "logps/chosen": -170.81422424316406, "logps/rejected": -194.9658966064453, "loss": 0.6084, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.132930040359497, "rewards/margins": 0.3183945119380951, "rewards/rejected": -1.4513245820999146, "step": 11480 }, { "epoch": 1.9796691936595452, "grad_norm": 31.931344985961914, "learning_rate": 3.129032618999994e-08, "logits/chosen": -2.3237130641937256, "logits/rejected": -2.2973456382751465, "logps/chosen": -183.87399291992188, "logps/rejected": -200.5390625, "loss": 0.6421, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.2792707681655884, "rewards/margins": 0.219514399766922, "rewards/rejected": -1.4987852573394775, "step": 11490 }, { "epoch": 1.9813921433494142, "grad_norm": 33.240440368652344, "learning_rate": 3.119740412073252e-08, "logits/chosen": -2.346156597137451, "logits/rejected": -2.3272767066955566, "logps/chosen": -165.01187133789062, "logps/rejected": -182.28871154785156, "loss": 0.6523, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.1343519687652588, "rewards/margins": 0.1887514442205429, "rewards/rejected": -1.3231033086776733, "step": 11500 }, { "epoch": 1.9831150930392831, "grad_norm": 49.612754821777344, "learning_rate": 3.1104557626684884e-08, "logits/chosen": -2.302736282348633, "logits/rejected": -2.2873153686523438, "logps/chosen": -168.75994873046875, "logps/rejected": -200.77427673339844, "loss": 0.6094, "rewards/accuracies": 0.65625, "rewards/chosen": -1.1248788833618164, "rewards/margins": 0.3630138337612152, "rewards/rejected": -1.487892746925354, "step": 11510 }, { "epoch": 1.9848380427291523, "grad_norm": 24.72926902770996, "learning_rate": 3.101178708104456e-08, "logits/chosen": -2.3570656776428223, "logits/rejected": -2.3030755519866943, "logps/chosen": -162.44204711914062, "logps/rejected": -187.81796264648438, "loss": 0.5762, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.0133875608444214, "rewards/margins": 0.3679368793964386, "rewards/rejected": -1.3813245296478271, "step": 11520 }, { "epoch": 1.9865609924190215, "grad_norm": 27.86030387878418, "learning_rate": 3.091909285669383e-08, "logits/chosen": -2.326167583465576, "logits/rejected": -2.2939858436584473, "logps/chosen": -157.47518920898438, "logps/rejected": -177.9588623046875, "loss": 0.6127, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0260807275772095, "rewards/margins": 0.27049964666366577, "rewards/rejected": -1.29658043384552, "step": 11530 }, { "epoch": 1.9882839421088905, "grad_norm": 30.32522964477539, "learning_rate": 3.082647532620817e-08, "logits/chosen": -2.359875440597534, "logits/rejected": -2.3462164402008057, "logps/chosen": -158.7584686279297, "logps/rejected": -202.20672607421875, "loss": 0.5737, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.0689572095870972, "rewards/margins": 0.4044581353664398, "rewards/rejected": -1.4734153747558594, "step": 11540 }, { "epoch": 1.9900068917987594, "grad_norm": 32.972084045410156, "learning_rate": 3.0733934861854794e-08, "logits/chosen": -2.3069303035736084, "logits/rejected": -2.2892849445343018, "logps/chosen": -162.90286254882812, "logps/rejected": -193.65733337402344, "loss": 0.6039, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.1045806407928467, "rewards/margins": 0.3092039227485657, "rewards/rejected": -1.4137846231460571, "step": 11550 }, { "epoch": 1.9917298414886284, "grad_norm": 31.083885192871094, "learning_rate": 3.0641471835591184e-08, "logits/chosen": -2.3341760635375977, "logits/rejected": -2.3034563064575195, "logps/chosen": -171.6559600830078, "logps/rejected": -197.1893768310547, "loss": 0.6091, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.1536730527877808, "rewards/margins": 0.3134633004665375, "rewards/rejected": -1.467136263847351, "step": 11560 }, { "epoch": 1.9934527911784976, "grad_norm": 28.85565185546875, "learning_rate": 3.054908661906353e-08, "logits/chosen": -2.350050210952759, "logits/rejected": -2.329953193664551, "logps/chosen": -171.4935760498047, "logps/rejected": -201.5003662109375, "loss": 0.6215, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.1677134037017822, "rewards/margins": 0.3004491329193115, "rewards/rejected": -1.4681622982025146, "step": 11570 }, { "epoch": 1.9951757408683668, "grad_norm": 29.845497131347656, "learning_rate": 3.045677958360532e-08, "logits/chosen": -2.3807244300842285, "logits/rejected": -2.356279134750366, "logps/chosen": -170.1651611328125, "logps/rejected": -197.697265625, "loss": 0.6111, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.1297045946121216, "rewards/margins": 0.30230602622032166, "rewards/rejected": -1.432010531425476, "step": 11580 }, { "epoch": 1.9968986905582358, "grad_norm": 38.318115234375, "learning_rate": 3.0364551100235795e-08, "logits/chosen": -2.295039415359497, "logits/rejected": -2.261448860168457, "logps/chosen": -164.27969360351562, "logps/rejected": -175.9013214111328, "loss": 0.6335, "rewards/accuracies": 0.65625, "rewards/chosen": -1.0416390895843506, "rewards/margins": 0.21523399651050568, "rewards/rejected": -1.2568730115890503, "step": 11590 }, { "epoch": 1.9986216402481047, "grad_norm": 23.599227905273438, "learning_rate": 3.027240153965839e-08, "logits/chosen": -2.360468626022339, "logits/rejected": -2.345468759536743, "logps/chosen": -149.54684448242188, "logps/rejected": -173.2725372314453, "loss": 0.63, "rewards/accuracies": 0.59375, "rewards/chosen": -0.981712818145752, "rewards/margins": 0.2234046459197998, "rewards/rejected": -1.2051174640655518, "step": 11600 }, { "epoch": 1.9986216402481047, "eval_logits/chosen": -2.410827159881592, "eval_logits/rejected": -2.401189088821411, "eval_logps/chosen": -147.70538330078125, "eval_logps/rejected": -166.54612731933594, "eval_loss": 0.6502494812011719, "eval_rewards/accuracies": 0.6003717184066772, "eval_rewards/chosen": -0.886898934841156, "eval_rewards/margins": 0.15106609463691711, "eval_rewards/rejected": -1.0379650592803955, "eval_runtime": 384.8158, "eval_samples_per_second": 11.185, "eval_steps_per_second": 1.398, "step": 11600 }, { "epoch": 2.0003445899379737, "grad_norm": 33.16267776489258, "learning_rate": 3.0180331272259404e-08, "logits/chosen": -2.310610771179199, "logits/rejected": -2.2826197147369385, "logps/chosen": -155.88720703125, "logps/rejected": -188.38926696777344, "loss": 0.5894, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.0316044092178345, "rewards/margins": 0.3397893011569977, "rewards/rejected": -1.3713937997817993, "step": 11610 }, { "epoch": 2.0020675396278427, "grad_norm": 32.0125617980957, "learning_rate": 3.0088340668106376e-08, "logits/chosen": -2.345252275466919, "logits/rejected": -2.328648805618286, "logps/chosen": -167.07174682617188, "logps/rejected": -193.3257598876953, "loss": 0.621, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.098212480545044, "rewards/margins": 0.2750307023525238, "rewards/rejected": -1.3732430934906006, "step": 11620 }, { "epoch": 2.003790489317712, "grad_norm": 29.57107925415039, "learning_rate": 2.999643009694671e-08, "logits/chosen": -2.3526864051818848, "logits/rejected": -2.3216300010681152, "logps/chosen": -166.13430786132812, "logps/rejected": -191.68673706054688, "loss": 0.6018, "rewards/accuracies": 0.6875, "rewards/chosen": -1.074463129043579, "rewards/margins": 0.3187711238861084, "rewards/rejected": -1.393234372138977, "step": 11630 }, { "epoch": 2.005513439007581, "grad_norm": 28.913124084472656, "learning_rate": 2.990459992820601e-08, "logits/chosen": -2.365588426589966, "logits/rejected": -2.34390926361084, "logps/chosen": -157.5725555419922, "logps/rejected": -182.8646697998047, "loss": 0.6065, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.0403567552566528, "rewards/margins": 0.29022300243377686, "rewards/rejected": -1.3305796384811401, "step": 11640 }, { "epoch": 2.00723638869745, "grad_norm": 26.71157455444336, "learning_rate": 2.981285053098682e-08, "logits/chosen": -2.2904579639434814, "logits/rejected": -2.2564196586608887, "logps/chosen": -150.6053924560547, "logps/rejected": -181.81773376464844, "loss": 0.5712, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.9526816606521606, "rewards/margins": 0.35900914669036865, "rewards/rejected": -1.3116909265518188, "step": 11650 }, { "epoch": 2.008959338387319, "grad_norm": 26.587501525878906, "learning_rate": 2.972118227406698e-08, "logits/chosen": -2.367176055908203, "logits/rejected": -2.3259403705596924, "logps/chosen": -171.75442504882812, "logps/rejected": -195.70761108398438, "loss": 0.6051, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.1537991762161255, "rewards/margins": 0.31325364112854004, "rewards/rejected": -1.467052698135376, "step": 11660 }, { "epoch": 2.010682288077188, "grad_norm": 39.934181213378906, "learning_rate": 2.9629595525898188e-08, "logits/chosen": -2.3322441577911377, "logits/rejected": -2.292595624923706, "logps/chosen": -169.62432861328125, "logps/rejected": -215.9363250732422, "loss": 0.5333, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.1304153203964233, "rewards/margins": 0.5008441805839539, "rewards/rejected": -1.6312596797943115, "step": 11670 }, { "epoch": 2.0124052377670574, "grad_norm": 34.38392639160156, "learning_rate": 2.9538090654604596e-08, "logits/chosen": -2.2712364196777344, "logits/rejected": -2.2498137950897217, "logps/chosen": -174.3423614501953, "logps/rejected": -211.03689575195312, "loss": 0.5887, "rewards/accuracies": 0.71875, "rewards/chosen": -1.167636513710022, "rewards/margins": 0.39166468381881714, "rewards/rejected": -1.5593011379241943, "step": 11680 }, { "epoch": 2.0141281874569263, "grad_norm": 31.447341918945312, "learning_rate": 2.9446668027981127e-08, "logits/chosen": -2.3560667037963867, "logits/rejected": -2.3193726539611816, "logps/chosen": -185.02783203125, "logps/rejected": -213.715576171875, "loss": 0.5913, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2407398223876953, "rewards/margins": 0.3654331564903259, "rewards/rejected": -1.606172800064087, "step": 11690 }, { "epoch": 2.0158511371467953, "grad_norm": 24.823726654052734, "learning_rate": 2.9355328013492255e-08, "logits/chosen": -2.406175136566162, "logits/rejected": -2.3746700286865234, "logps/chosen": -162.5546112060547, "logps/rejected": -196.85719299316406, "loss": 0.5813, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.05254328250885, "rewards/margins": 0.3555363118648529, "rewards/rejected": -1.4080796241760254, "step": 11700 }, { "epoch": 2.0175740868366643, "grad_norm": 41.32663345336914, "learning_rate": 2.926407097827034e-08, "logits/chosen": -2.3307394981384277, "logits/rejected": -2.298651933670044, "logps/chosen": -172.9115753173828, "logps/rejected": -200.8367919921875, "loss": 0.5956, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.1703336238861084, "rewards/margins": 0.3152288496494293, "rewards/rejected": -1.4855625629425049, "step": 11710 }, { "epoch": 2.0192970365265333, "grad_norm": 34.57819747924805, "learning_rate": 2.917289728911424e-08, "logits/chosen": -2.358710527420044, "logits/rejected": -2.3419289588928223, "logps/chosen": -177.53518676757812, "logps/rejected": -204.48703002929688, "loss": 0.627, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.223528265953064, "rewards/margins": 0.2960992753505707, "rewards/rejected": -1.5196274518966675, "step": 11720 }, { "epoch": 2.0210199862164027, "grad_norm": 27.626047134399414, "learning_rate": 2.90818073124878e-08, "logits/chosen": -2.306004047393799, "logits/rejected": -2.2871932983398438, "logps/chosen": -179.81082153320312, "logps/rejected": -212.8517303466797, "loss": 0.6091, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2425450086593628, "rewards/margins": 0.3176325261592865, "rewards/rejected": -1.5601775646209717, "step": 11730 }, { "epoch": 2.0227429359062716, "grad_norm": 31.49341583251953, "learning_rate": 2.899080141451836e-08, "logits/chosen": -2.345893621444702, "logits/rejected": -2.32354474067688, "logps/chosen": -174.951171875, "logps/rejected": -205.5146484375, "loss": 0.612, "rewards/accuracies": 0.65625, "rewards/chosen": -1.2136671543121338, "rewards/margins": 0.3185957670211792, "rewards/rejected": -1.5322628021240234, "step": 11740 }, { "epoch": 2.0244658855961406, "grad_norm": 29.300275802612305, "learning_rate": 2.8899879960995376e-08, "logits/chosen": -2.3585095405578613, "logits/rejected": -2.34792160987854, "logps/chosen": -163.48052978515625, "logps/rejected": -211.85824584960938, "loss": 0.564, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.141412377357483, "rewards/margins": 0.4344809651374817, "rewards/rejected": -1.5758932828903198, "step": 11750 }, { "epoch": 2.0261888352860096, "grad_norm": 33.636085510253906, "learning_rate": 2.8809043317368876e-08, "logits/chosen": -2.3376007080078125, "logits/rejected": -2.3140745162963867, "logps/chosen": -181.65231323242188, "logps/rejected": -219.00009155273438, "loss": 0.5894, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.2589913606643677, "rewards/margins": 0.40537405014038086, "rewards/rejected": -1.664365530014038, "step": 11760 }, { "epoch": 2.0279117849758785, "grad_norm": 25.824485778808594, "learning_rate": 2.871829184874795e-08, "logits/chosen": -2.285428524017334, "logits/rejected": -2.2535977363586426, "logps/chosen": -181.88848876953125, "logps/rejected": -210.92041015625, "loss": 0.5936, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.2358392477035522, "rewards/margins": 0.36059147119522095, "rewards/rejected": -1.596430778503418, "step": 11770 }, { "epoch": 2.029634734665748, "grad_norm": 38.72450637817383, "learning_rate": 2.8627625919899363e-08, "logits/chosen": -2.2584919929504395, "logits/rejected": -2.238612413406372, "logps/chosen": -175.74777221679688, "logps/rejected": -206.83676147460938, "loss": 0.613, "rewards/accuracies": 0.625, "rewards/chosen": -1.2549865245819092, "rewards/margins": 0.2964690625667572, "rewards/rejected": -1.5514557361602783, "step": 11780 }, { "epoch": 2.031357684355617, "grad_norm": 31.6374454498291, "learning_rate": 2.8537045895246103e-08, "logits/chosen": -2.285067081451416, "logits/rejected": -2.274216413497925, "logps/chosen": -170.0199737548828, "logps/rejected": -225.0682830810547, "loss": 0.5148, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.195155382156372, "rewards/margins": 0.5274872779846191, "rewards/rejected": -1.7226425409317017, "step": 11790 }, { "epoch": 2.033080634045486, "grad_norm": 40.261634826660156, "learning_rate": 2.8446552138865797e-08, "logits/chosen": -2.297799587249756, "logits/rejected": -2.2750699520111084, "logps/chosen": -197.58456420898438, "logps/rejected": -222.3473663330078, "loss": 0.6118, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.3904147148132324, "rewards/margins": 0.30489349365234375, "rewards/rejected": -1.6953080892562866, "step": 11800 }, { "epoch": 2.034803583735355, "grad_norm": 37.03939437866211, "learning_rate": 2.8356145014489408e-08, "logits/chosen": -2.2964396476745605, "logits/rejected": -2.2717151641845703, "logps/chosen": -200.8223114013672, "logps/rejected": -228.19686889648438, "loss": 0.6273, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.4382585287094116, "rewards/margins": 0.3232322633266449, "rewards/rejected": -1.761490821838379, "step": 11810 }, { "epoch": 2.036526533425224, "grad_norm": 30.490917205810547, "learning_rate": 2.8265824885499605e-08, "logits/chosen": -2.3136515617370605, "logits/rejected": -2.2980868816375732, "logps/chosen": -187.7859649658203, "logps/rejected": -209.8665313720703, "loss": 0.6629, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.3637282848358154, "rewards/margins": 0.2283482849597931, "rewards/rejected": -1.5920765399932861, "step": 11820 }, { "epoch": 2.0382494831150932, "grad_norm": 44.555973052978516, "learning_rate": 2.817559211492948e-08, "logits/chosen": -2.2768752574920654, "logits/rejected": -2.265839099884033, "logps/chosen": -178.7625732421875, "logps/rejected": -213.99478149414062, "loss": 0.6033, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2605791091918945, "rewards/margins": 0.3402387499809265, "rewards/rejected": -1.6008179187774658, "step": 11830 }, { "epoch": 2.039972432804962, "grad_norm": 28.042152404785156, "learning_rate": 2.80854470654609e-08, "logits/chosen": -2.2855865955352783, "logits/rejected": -2.257840871810913, "logps/chosen": -186.37057495117188, "logps/rejected": -222.32763671875, "loss": 0.585, "rewards/accuracies": 0.71875, "rewards/chosen": -1.3317365646362305, "rewards/margins": 0.37816449999809265, "rewards/rejected": -1.7099010944366455, "step": 11840 }, { "epoch": 2.041695382494831, "grad_norm": 32.66047668457031, "learning_rate": 2.7995390099423217e-08, "logits/chosen": -2.252861261367798, "logits/rejected": -2.216279983520508, "logps/chosen": -189.672607421875, "logps/rejected": -216.9271697998047, "loss": 0.6055, "rewards/accuracies": 0.65625, "rewards/chosen": -1.3427212238311768, "rewards/margins": 0.34207189083099365, "rewards/rejected": -1.68479323387146, "step": 11850 }, { "epoch": 2.0434183321847, "grad_norm": 51.951820373535156, "learning_rate": 2.7905421578791754e-08, "logits/chosen": -2.326028347015381, "logits/rejected": -2.3146190643310547, "logps/chosen": -199.56216430664062, "logps/rejected": -230.6979522705078, "loss": 0.6341, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.4476042985916138, "rewards/margins": 0.28900259733200073, "rewards/rejected": -1.7366068363189697, "step": 11860 }, { "epoch": 2.045141281874569, "grad_norm": 57.09029006958008, "learning_rate": 2.7815541865186215e-08, "logits/chosen": -2.2642533779144287, "logits/rejected": -2.2513880729675293, "logps/chosen": -184.52235412597656, "logps/rejected": -224.866943359375, "loss": 0.585, "rewards/accuracies": 0.71875, "rewards/chosen": -1.3491449356079102, "rewards/margins": 0.36590269207954407, "rewards/rejected": -1.7150475978851318, "step": 11870 }, { "epoch": 2.0468642315644385, "grad_norm": 35.73862075805664, "learning_rate": 2.7725751319869485e-08, "logits/chosen": -2.284669876098633, "logits/rejected": -2.2507481575012207, "logps/chosen": -190.849365234375, "logps/rejected": -236.66421508789062, "loss": 0.5279, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.3258202075958252, "rewards/margins": 0.49654918909072876, "rewards/rejected": -1.8223693370819092, "step": 11880 }, { "epoch": 2.0485871812543075, "grad_norm": 39.445770263671875, "learning_rate": 2.7636050303746004e-08, "logits/chosen": -2.3135440349578857, "logits/rejected": -2.288027763366699, "logps/chosen": -203.68576049804688, "logps/rejected": -235.73757934570312, "loss": 0.5891, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.456667423248291, "rewards/margins": 0.382424533367157, "rewards/rejected": -1.8390918970108032, "step": 11890 }, { "epoch": 2.0503101309441765, "grad_norm": 54.75608825683594, "learning_rate": 2.7546439177360336e-08, "logits/chosen": -2.2159829139709473, "logits/rejected": -2.188356637954712, "logps/chosen": -197.61387634277344, "logps/rejected": -231.9712677001953, "loss": 0.6042, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.436052680015564, "rewards/margins": 0.37783369421958923, "rewards/rejected": -1.8138864040374756, "step": 11900 }, { "epoch": 2.0520330806340454, "grad_norm": 31.257429122924805, "learning_rate": 2.7456918300895748e-08, "logits/chosen": -2.351876735687256, "logits/rejected": -2.3544445037841797, "logps/chosen": -193.8394012451172, "logps/rejected": -233.034423828125, "loss": 0.6153, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.404994249343872, "rewards/margins": 0.352558970451355, "rewards/rejected": -1.7575533390045166, "step": 11910 }, { "epoch": 2.0537560303239144, "grad_norm": 56.795692443847656, "learning_rate": 2.736748803417277e-08, "logits/chosen": -2.3139095306396484, "logits/rejected": -2.2963128089904785, "logps/chosen": -198.30126953125, "logps/rejected": -228.29336547851562, "loss": 0.6077, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.3945560455322266, "rewards/margins": 0.32481223344802856, "rewards/rejected": -1.7193682193756104, "step": 11920 }, { "epoch": 2.055478980013784, "grad_norm": 32.29816818237305, "learning_rate": 2.7278148736647748e-08, "logits/chosen": -2.3049678802490234, "logits/rejected": -2.289769411087036, "logps/chosen": -184.04885864257812, "logps/rejected": -221.4799346923828, "loss": 0.5865, "rewards/accuracies": 0.65625, "rewards/chosen": -1.3049168586730957, "rewards/margins": 0.36031457781791687, "rewards/rejected": -1.665231466293335, "step": 11930 }, { "epoch": 2.057201929703653, "grad_norm": 32.54777145385742, "learning_rate": 2.7188900767411338e-08, "logits/chosen": -2.2739245891571045, "logits/rejected": -2.247042179107666, "logps/chosen": -177.62771606445312, "logps/rejected": -216.1691131591797, "loss": 0.5644, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.2208917140960693, "rewards/margins": 0.41405215859413147, "rewards/rejected": -1.6349437236785889, "step": 11940 }, { "epoch": 2.0589248793935218, "grad_norm": 28.15619468688965, "learning_rate": 2.709974448518718e-08, "logits/chosen": -2.345357656478882, "logits/rejected": -2.3193631172180176, "logps/chosen": -193.7064666748047, "logps/rejected": -215.07058715820312, "loss": 0.6425, "rewards/accuracies": 0.625, "rewards/chosen": -1.3472740650177002, "rewards/margins": 0.24748286604881287, "rewards/rejected": -1.594757080078125, "step": 11950 }, { "epoch": 2.0606478290833907, "grad_norm": 33.60971450805664, "learning_rate": 2.7010680248330307e-08, "logits/chosen": -2.2373039722442627, "logits/rejected": -2.2140448093414307, "logps/chosen": -184.1251220703125, "logps/rejected": -227.1833953857422, "loss": 0.5664, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.306465744972229, "rewards/margins": 0.4474376142024994, "rewards/rejected": -1.7539036273956299, "step": 11960 }, { "epoch": 2.0623707787732597, "grad_norm": 33.096187591552734, "learning_rate": 2.6921708414825857e-08, "logits/chosen": -2.3217430114746094, "logits/rejected": -2.297858715057373, "logps/chosen": -187.7861328125, "logps/rejected": -216.83102416992188, "loss": 0.6186, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.306571125984192, "rewards/margins": 0.33881863951683044, "rewards/rejected": -1.6453897953033447, "step": 11970 }, { "epoch": 2.0640937284631287, "grad_norm": 24.070341110229492, "learning_rate": 2.6832829342287488e-08, "logits/chosen": -2.308072090148926, "logits/rejected": -2.2790815830230713, "logps/chosen": -180.55453491210938, "logps/rejected": -217.0457763671875, "loss": 0.5715, "rewards/accuracies": 0.71875, "rewards/chosen": -1.24399733543396, "rewards/margins": 0.39089125394821167, "rewards/rejected": -1.6348886489868164, "step": 11980 }, { "epoch": 2.065816678152998, "grad_norm": 39.581634521484375, "learning_rate": 2.674404338795611e-08, "logits/chosen": -2.369114637374878, "logits/rejected": -2.329042434692383, "logps/chosen": -181.17039489746094, "logps/rejected": -206.80126953125, "loss": 0.5977, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.2240442037582397, "rewards/margins": 0.3359059989452362, "rewards/rejected": -1.5599501132965088, "step": 11990 }, { "epoch": 2.067539627842867, "grad_norm": 26.552762985229492, "learning_rate": 2.665535090869827e-08, "logits/chosen": -2.284182071685791, "logits/rejected": -2.271190881729126, "logps/chosen": -173.64321899414062, "logps/rejected": -212.55203247070312, "loss": 0.5891, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.2180447578430176, "rewards/margins": 0.3685550391674042, "rewards/rejected": -1.5865997076034546, "step": 12000 }, { "epoch": 2.067539627842867, "eval_logits/chosen": -2.382530927658081, "eval_logits/rejected": -2.37129282951355, "eval_logps/chosen": -163.54176330566406, "eval_logps/rejected": -183.97140502929688, "eval_loss": 0.6490315794944763, "eval_rewards/accuracies": 0.6045538783073425, "eval_rewards/chosen": -1.0452629327774048, "eval_rewards/margins": 0.16695521771907806, "eval_rewards/rejected": -1.2122180461883545, "eval_runtime": 384.8046, "eval_samples_per_second": 11.185, "eval_steps_per_second": 1.398, "step": 12000 }, { "epoch": 2.069262577532736, "grad_norm": 42.818328857421875, "learning_rate": 2.656675226100481e-08, "logits/chosen": -2.3415584564208984, "logits/rejected": -2.309140920639038, "logps/chosen": -180.33029174804688, "logps/rejected": -211.2962646484375, "loss": 0.605, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.2456438541412354, "rewards/margins": 0.3233585059642792, "rewards/rejected": -1.569002389907837, "step": 12010 }, { "epoch": 2.070985527222605, "grad_norm": 31.882787704467773, "learning_rate": 2.6478247800989474e-08, "logits/chosen": -2.3419384956359863, "logits/rejected": -2.3203039169311523, "logps/chosen": -169.97256469726562, "logps/rejected": -198.49661254882812, "loss": 0.6165, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.167227029800415, "rewards/margins": 0.29829588532447815, "rewards/rejected": -1.4655230045318604, "step": 12020 }, { "epoch": 2.072708476912474, "grad_norm": 42.521568298339844, "learning_rate": 2.63898378843874e-08, "logits/chosen": -2.399444103240967, "logits/rejected": -2.383762836456299, "logps/chosen": -160.01272583007812, "logps/rejected": -182.17120361328125, "loss": 0.6475, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.098426103591919, "rewards/margins": 0.21871300041675568, "rewards/rejected": -1.3171392679214478, "step": 12030 }, { "epoch": 2.0744314266023434, "grad_norm": 27.004297256469727, "learning_rate": 2.6301522866553714e-08, "logits/chosen": -2.3278212547302246, "logits/rejected": -2.2990002632141113, "logps/chosen": -170.15162658691406, "logps/rejected": -200.32017517089844, "loss": 0.5964, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.1340413093566895, "rewards/margins": 0.35810285806655884, "rewards/rejected": -1.4921441078186035, "step": 12040 }, { "epoch": 2.0761543762922123, "grad_norm": 34.77950668334961, "learning_rate": 2.621330310246208e-08, "logits/chosen": -2.37144136428833, "logits/rejected": -2.3289756774902344, "logps/chosen": -166.37918090820312, "logps/rejected": -203.81527709960938, "loss": 0.5591, "rewards/accuracies": 0.71875, "rewards/chosen": -1.1094896793365479, "rewards/margins": 0.4241317808628082, "rewards/rejected": -1.5336215496063232, "step": 12050 }, { "epoch": 2.0778773259820813, "grad_norm": 30.318689346313477, "learning_rate": 2.6125178946703352e-08, "logits/chosen": -2.334500312805176, "logits/rejected": -2.3106274604797363, "logps/chosen": -171.7869415283203, "logps/rejected": -195.88204956054688, "loss": 0.6165, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.151228904724121, "rewards/margins": 0.27953606843948364, "rewards/rejected": -1.43076491355896, "step": 12060 }, { "epoch": 2.0796002756719503, "grad_norm": 26.932645797729492, "learning_rate": 2.6037150753484082e-08, "logits/chosen": -2.3272311687469482, "logits/rejected": -2.286487102508545, "logps/chosen": -170.1809539794922, "logps/rejected": -203.5109100341797, "loss": 0.5733, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.1393085718154907, "rewards/margins": 0.4047151505947113, "rewards/rejected": -1.5440236330032349, "step": 12070 }, { "epoch": 2.0813232253618192, "grad_norm": 33.24617385864258, "learning_rate": 2.594921887662509e-08, "logits/chosen": -2.2792840003967285, "logits/rejected": -2.2516608238220215, "logps/chosen": -165.46157836914062, "logps/rejected": -200.6604461669922, "loss": 0.5774, "rewards/accuracies": 0.71875, "rewards/chosen": -1.0833677053451538, "rewards/margins": 0.38120564818382263, "rewards/rejected": -1.4645735025405884, "step": 12080 }, { "epoch": 2.0830461750516887, "grad_norm": 34.20355987548828, "learning_rate": 2.5861383669560045e-08, "logits/chosen": -2.314237594604492, "logits/rejected": -2.2838430404663086, "logps/chosen": -178.33566284179688, "logps/rejected": -203.56239318847656, "loss": 0.613, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.2032502889633179, "rewards/margins": 0.3110768496990204, "rewards/rejected": -1.514327049255371, "step": 12090 }, { "epoch": 2.0847691247415576, "grad_norm": 30.14881134033203, "learning_rate": 2.5773645485334122e-08, "logits/chosen": -2.303605318069458, "logits/rejected": -2.282360076904297, "logps/chosen": -169.51351928710938, "logps/rejected": -194.35842895507812, "loss": 0.5954, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1090033054351807, "rewards/margins": 0.3162084221839905, "rewards/rejected": -1.4252115488052368, "step": 12100 }, { "epoch": 2.0864920744314266, "grad_norm": 31.299373626708984, "learning_rate": 2.568600467660245e-08, "logits/chosen": -2.326810836791992, "logits/rejected": -2.2981925010681152, "logps/chosen": -177.06642150878906, "logps/rejected": -201.6583709716797, "loss": 0.6135, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1888201236724854, "rewards/margins": 0.29908499121665955, "rewards/rejected": -1.4879052639007568, "step": 12110 }, { "epoch": 2.0882150241212956, "grad_norm": 37.546592712402344, "learning_rate": 2.5598461595628827e-08, "logits/chosen": -2.306678295135498, "logits/rejected": -2.2846360206604004, "logps/chosen": -178.78823852539062, "logps/rejected": -204.18438720703125, "loss": 0.6238, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.2205679416656494, "rewards/margins": 0.3085453510284424, "rewards/rejected": -1.5291130542755127, "step": 12120 }, { "epoch": 2.0899379738111645, "grad_norm": 29.08656883239746, "learning_rate": 2.5511016594284236e-08, "logits/chosen": -2.310135841369629, "logits/rejected": -2.276185989379883, "logps/chosen": -166.3862762451172, "logps/rejected": -207.1776123046875, "loss": 0.5526, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.1164814233779907, "rewards/margins": 0.4281376004219055, "rewards/rejected": -1.5446192026138306, "step": 12130 }, { "epoch": 2.091660923501034, "grad_norm": 36.57272720336914, "learning_rate": 2.5423670024045397e-08, "logits/chosen": -2.3395144939422607, "logits/rejected": -2.320728302001953, "logps/chosen": -169.90667724609375, "logps/rejected": -198.46682739257812, "loss": 0.6227, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.1688696146011353, "rewards/margins": 0.2796100974082947, "rewards/rejected": -1.4484796524047852, "step": 12140 }, { "epoch": 2.093383873190903, "grad_norm": 58.29833984375, "learning_rate": 2.5336422235993403e-08, "logits/chosen": -2.3326809406280518, "logits/rejected": -2.3147945404052734, "logps/chosen": -169.92568969726562, "logps/rejected": -201.376708984375, "loss": 0.6059, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.1544063091278076, "rewards/margins": 0.33958542346954346, "rewards/rejected": -1.4939919710159302, "step": 12150 }, { "epoch": 2.095106822880772, "grad_norm": 60.696189880371094, "learning_rate": 2.5249273580812346e-08, "logits/chosen": -2.2743961811065674, "logits/rejected": -2.2431156635284424, "logps/chosen": -170.11212158203125, "logps/rejected": -200.20468139648438, "loss": 0.5955, "rewards/accuracies": 0.71875, "rewards/chosen": -1.1341893672943115, "rewards/margins": 0.3392500877380371, "rewards/rejected": -1.4734394550323486, "step": 12160 }, { "epoch": 2.096829772570641, "grad_norm": 35.26255798339844, "learning_rate": 2.5162224408787874e-08, "logits/chosen": -2.2966880798339844, "logits/rejected": -2.274604320526123, "logps/chosen": -179.924560546875, "logps/rejected": -213.06930541992188, "loss": 0.6118, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.2429232597351074, "rewards/margins": 0.3358061611652374, "rewards/rejected": -1.5787293910980225, "step": 12170 }, { "epoch": 2.09855272226051, "grad_norm": 27.878921508789062, "learning_rate": 2.5075275069805646e-08, "logits/chosen": -2.232279062271118, "logits/rejected": -2.2205493450164795, "logps/chosen": -175.306884765625, "logps/rejected": -222.40975952148438, "loss": 0.5734, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.2602647542953491, "rewards/margins": 0.3941713571548462, "rewards/rejected": -1.6544361114501953, "step": 12180 }, { "epoch": 2.1002756719503792, "grad_norm": 34.53485870361328, "learning_rate": 2.4988425913350192e-08, "logits/chosen": -2.2715885639190674, "logits/rejected": -2.239790439605713, "logps/chosen": -181.70046997070312, "logps/rejected": -204.890869140625, "loss": 0.6273, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.2391859292984009, "rewards/margins": 0.28382736444473267, "rewards/rejected": -1.5230133533477783, "step": 12190 }, { "epoch": 2.101998621640248, "grad_norm": 38.71670913696289, "learning_rate": 2.4901677288503326e-08, "logits/chosen": -2.347888469696045, "logits/rejected": -2.324063777923584, "logps/chosen": -184.92343139648438, "logps/rejected": -215.3417510986328, "loss": 0.596, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.2747955322265625, "rewards/margins": 0.3419981598854065, "rewards/rejected": -1.6167936325073242, "step": 12200 }, { "epoch": 2.103721571330117, "grad_norm": 30.0412540435791, "learning_rate": 2.4815029543942735e-08, "logits/chosen": -2.303335189819336, "logits/rejected": -2.2876079082489014, "logps/chosen": -183.39291381835938, "logps/rejected": -210.96267700195312, "loss": 0.6119, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.284551978111267, "rewards/margins": 0.3175322413444519, "rewards/rejected": -1.6020843982696533, "step": 12210 }, { "epoch": 2.105444521019986, "grad_norm": 41.832977294921875, "learning_rate": 2.4728483027940715e-08, "logits/chosen": -2.237399101257324, "logits/rejected": -2.2207062244415283, "logps/chosen": -185.9912109375, "logps/rejected": -210.3903045654297, "loss": 0.6351, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.323387622833252, "rewards/margins": 0.25132933259010315, "rewards/rejected": -1.5747170448303223, "step": 12220 }, { "epoch": 2.107167470709855, "grad_norm": 28.826099395751953, "learning_rate": 2.4642038088362595e-08, "logits/chosen": -2.3004941940307617, "logits/rejected": -2.2839975357055664, "logps/chosen": -180.2684326171875, "logps/rejected": -212.25131225585938, "loss": 0.6004, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.275651216506958, "rewards/margins": 0.3466276526451111, "rewards/rejected": -1.6222788095474243, "step": 12230 }, { "epoch": 2.1088904203997245, "grad_norm": 34.61656951904297, "learning_rate": 2.4555695072665494e-08, "logits/chosen": -2.231196403503418, "logits/rejected": -2.204634189605713, "logps/chosen": -178.1566619873047, "logps/rejected": -210.77017211914062, "loss": 0.5946, "rewards/accuracies": 0.71875, "rewards/chosen": -1.2443420886993408, "rewards/margins": 0.34749799966812134, "rewards/rejected": -1.5918400287628174, "step": 12240 }, { "epoch": 2.1106133700895935, "grad_norm": 41.588321685791016, "learning_rate": 2.446945432789681e-08, "logits/chosen": -2.287522315979004, "logits/rejected": -2.264913558959961, "logps/chosen": -176.79226684570312, "logps/rejected": -197.31668090820312, "loss": 0.6499, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.2354786396026611, "rewards/margins": 0.23657634854316711, "rewards/rejected": -1.4720548391342163, "step": 12250 }, { "epoch": 2.1123363197794625, "grad_norm": 37.527713775634766, "learning_rate": 2.4383316200692928e-08, "logits/chosen": -2.2929043769836426, "logits/rejected": -2.2686684131622314, "logps/chosen": -169.481689453125, "logps/rejected": -199.8762969970703, "loss": 0.6155, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.1590509414672852, "rewards/margins": 0.29075926542282104, "rewards/rejected": -1.449810266494751, "step": 12260 }, { "epoch": 2.1140592694693314, "grad_norm": 31.186466217041016, "learning_rate": 2.4297281037277694e-08, "logits/chosen": -2.366913318634033, "logits/rejected": -2.339359760284424, "logps/chosen": -176.31210327148438, "logps/rejected": -207.9870147705078, "loss": 0.6041, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.203351378440857, "rewards/margins": 0.34722915291786194, "rewards/rejected": -1.550580382347107, "step": 12270 }, { "epoch": 2.1157822191592004, "grad_norm": 27.913915634155273, "learning_rate": 2.4211349183461195e-08, "logits/chosen": -2.3033981323242188, "logits/rejected": -2.2841343879699707, "logps/chosen": -177.34950256347656, "logps/rejected": -204.71127319335938, "loss": 0.6015, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.1983354091644287, "rewards/margins": 0.3280332684516907, "rewards/rejected": -1.5263686180114746, "step": 12280 }, { "epoch": 2.11750516884907, "grad_norm": 34.77854537963867, "learning_rate": 2.4125520984638177e-08, "logits/chosen": -2.2566428184509277, "logits/rejected": -2.228959083557129, "logps/chosen": -173.9934539794922, "logps/rejected": -200.5246124267578, "loss": 0.6157, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.1927902698516846, "rewards/margins": 0.2847309112548828, "rewards/rejected": -1.4775211811065674, "step": 12290 }, { "epoch": 2.1192281185389388, "grad_norm": 34.15695571899414, "learning_rate": 2.4039796785786827e-08, "logits/chosen": -2.311659574508667, "logits/rejected": -2.281036853790283, "logps/chosen": -180.02626037597656, "logps/rejected": -207.08236694335938, "loss": 0.6036, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.2438709735870361, "rewards/margins": 0.30703234672546387, "rewards/rejected": -1.5509033203125, "step": 12300 }, { "epoch": 2.1209510682288077, "grad_norm": 25.6216983795166, "learning_rate": 2.3954176931467323e-08, "logits/chosen": -2.286026954650879, "logits/rejected": -2.2513835430145264, "logps/chosen": -173.94776916503906, "logps/rejected": -204.9058380126953, "loss": 0.5921, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.1738064289093018, "rewards/margins": 0.35794585943222046, "rewards/rejected": -1.531752347946167, "step": 12310 }, { "epoch": 2.1226740179186767, "grad_norm": 37.73752212524414, "learning_rate": 2.3868661765820346e-08, "logits/chosen": -2.2825002670288086, "logits/rejected": -2.252878189086914, "logps/chosen": -170.19056701660156, "logps/rejected": -213.87472534179688, "loss": 0.5703, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.144214391708374, "rewards/margins": 0.44874677062034607, "rewards/rejected": -1.592961311340332, "step": 12320 }, { "epoch": 2.1243969676085457, "grad_norm": 26.66788101196289, "learning_rate": 2.3783251632565875e-08, "logits/chosen": -2.3073744773864746, "logits/rejected": -2.2946484088897705, "logps/chosen": -176.7238006591797, "logps/rejected": -203.3588409423828, "loss": 0.6013, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.1875582933425903, "rewards/margins": 0.3035629093647003, "rewards/rejected": -1.4911211729049683, "step": 12330 }, { "epoch": 2.126119917298415, "grad_norm": 32.90800476074219, "learning_rate": 2.3697946875001725e-08, "logits/chosen": -2.3109140396118164, "logits/rejected": -2.280402898788452, "logps/chosen": -181.58554077148438, "logps/rejected": -217.5074005126953, "loss": 0.5762, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.265123963356018, "rewards/margins": 0.37893742322921753, "rewards/rejected": -1.6440613269805908, "step": 12340 }, { "epoch": 2.127842866988284, "grad_norm": 30.612865447998047, "learning_rate": 2.3612747836002116e-08, "logits/chosen": -2.231553316116333, "logits/rejected": -2.1880717277526855, "logps/chosen": -187.61593627929688, "logps/rejected": -221.2359619140625, "loss": 0.58, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.3059110641479492, "rewards/margins": 0.400498628616333, "rewards/rejected": -1.7064098119735718, "step": 12350 }, { "epoch": 2.129565816678153, "grad_norm": 33.540550231933594, "learning_rate": 2.352765485801635e-08, "logits/chosen": -2.3030381202697754, "logits/rejected": -2.283235549926758, "logps/chosen": -172.56826782226562, "logps/rejected": -210.2171630859375, "loss": 0.5728, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1839771270751953, "rewards/margins": 0.40479379892349243, "rewards/rejected": -1.5887707471847534, "step": 12360 }, { "epoch": 2.131288766368022, "grad_norm": 32.70058059692383, "learning_rate": 2.3442668283067453e-08, "logits/chosen": -2.292982578277588, "logits/rejected": -2.2578206062316895, "logps/chosen": -180.56468200683594, "logps/rejected": -218.25924682617188, "loss": 0.5707, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.2692445516586304, "rewards/margins": 0.4212293028831482, "rewards/rejected": -1.6904739141464233, "step": 12370 }, { "epoch": 2.133011716057891, "grad_norm": 54.63309097290039, "learning_rate": 2.335778845275079e-08, "logits/chosen": -2.2785980701446533, "logits/rejected": -2.274125576019287, "logps/chosen": -180.288818359375, "logps/rejected": -222.59780883789062, "loss": 0.5769, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.2888532876968384, "rewards/margins": 0.3815760314464569, "rewards/rejected": -1.6704292297363281, "step": 12380 }, { "epoch": 2.13473466574776, "grad_norm": 38.83860397338867, "learning_rate": 2.32730157082326e-08, "logits/chosen": -2.3655247688293457, "logits/rejected": -2.349914073944092, "logps/chosen": -183.9013214111328, "logps/rejected": -226.04238891601562, "loss": 0.5715, "rewards/accuracies": 0.71875, "rewards/chosen": -1.296132206916809, "rewards/margins": 0.42857784032821655, "rewards/rejected": -1.7247101068496704, "step": 12390 }, { "epoch": 2.1364576154376294, "grad_norm": 28.777034759521484, "learning_rate": 2.3188350390248796e-08, "logits/chosen": -2.28861665725708, "logits/rejected": -2.2760109901428223, "logps/chosen": -195.12936401367188, "logps/rejected": -231.7972869873047, "loss": 0.5808, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.390777826309204, "rewards/margins": 0.3694989085197449, "rewards/rejected": -1.7602765560150146, "step": 12400 }, { "epoch": 2.1364576154376294, "eval_logits/chosen": -2.350759744644165, "eval_logits/rejected": -2.3382339477539062, "eval_logps/chosen": -178.0778350830078, "eval_logps/rejected": -199.925537109375, "eval_loss": 0.6489568948745728, "eval_rewards/accuracies": 0.6038568615913391, "eval_rewards/chosen": -1.1906236410140991, "eval_rewards/margins": 0.1811356097459793, "eval_rewards/rejected": -1.3717591762542725, "eval_runtime": 384.8916, "eval_samples_per_second": 11.182, "eval_steps_per_second": 1.398, "step": 12400 }, { "epoch": 2.1381805651274983, "grad_norm": 38.66655349731445, "learning_rate": 2.310379283910343e-08, "logits/chosen": -2.267749309539795, "logits/rejected": -2.2331278324127197, "logps/chosen": -186.0257568359375, "logps/rejected": -220.5785675048828, "loss": 0.5827, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.3094804286956787, "rewards/margins": 0.37783220410346985, "rewards/rejected": -1.6873127222061157, "step": 12410 }, { "epoch": 2.1399035148173673, "grad_norm": 56.3077392578125, "learning_rate": 2.30193433946674e-08, "logits/chosen": -2.2084813117980957, "logits/rejected": -2.182887554168701, "logps/chosen": -195.47129821777344, "logps/rejected": -227.36257934570312, "loss": 0.6132, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.3920471668243408, "rewards/margins": 0.3612838089466095, "rewards/rejected": -1.753330945968628, "step": 12420 }, { "epoch": 2.1416264645072363, "grad_norm": 29.597278594970703, "learning_rate": 2.2935002396377128e-08, "logits/chosen": -2.2616899013519287, "logits/rejected": -2.2430830001831055, "logps/chosen": -196.7481231689453, "logps/rejected": -226.3641815185547, "loss": 0.6206, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.4068971872329712, "rewards/margins": 0.32016709446907043, "rewards/rejected": -1.7270641326904297, "step": 12430 }, { "epoch": 2.1433494141971057, "grad_norm": 33.88653564453125, "learning_rate": 2.2850770183233125e-08, "logits/chosen": -2.235628843307495, "logits/rejected": -2.2164924144744873, "logps/chosen": -185.83493041992188, "logps/rejected": -216.488037109375, "loss": 0.605, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.3134291172027588, "rewards/margins": 0.33438539505004883, "rewards/rejected": -1.6478145122528076, "step": 12440 }, { "epoch": 2.1450723638869746, "grad_norm": 36.81434631347656, "learning_rate": 2.276664709379863e-08, "logits/chosen": -2.273482084274292, "logits/rejected": -2.2568745613098145, "logps/chosen": -182.1161651611328, "logps/rejected": -213.73391723632812, "loss": 0.6151, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.305662989616394, "rewards/margins": 0.3310778737068176, "rewards/rejected": -1.6367409229278564, "step": 12450 }, { "epoch": 2.1467953135768436, "grad_norm": 39.7001953125, "learning_rate": 2.2682633466198263e-08, "logits/chosen": -2.3082103729248047, "logits/rejected": -2.2860910892486572, "logps/chosen": -190.76364135742188, "logps/rejected": -221.93704223632812, "loss": 0.5958, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.3485095500946045, "rewards/margins": 0.35299503803253174, "rewards/rejected": -1.7015047073364258, "step": 12460 }, { "epoch": 2.1485182632667126, "grad_norm": 44.36844253540039, "learning_rate": 2.259872963811672e-08, "logits/chosen": -2.3637163639068604, "logits/rejected": -2.3345274925231934, "logps/chosen": -190.12307739257812, "logps/rejected": -237.3924560546875, "loss": 0.5544, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.3406221866607666, "rewards/margins": 0.48035627603530884, "rewards/rejected": -1.8209785223007202, "step": 12470 }, { "epoch": 2.1502412129565815, "grad_norm": 32.28123092651367, "learning_rate": 2.2514935946797347e-08, "logits/chosen": -2.3527333736419678, "logits/rejected": -2.334380626678467, "logps/chosen": -186.5384521484375, "logps/rejected": -218.5380859375, "loss": 0.5923, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.308751106262207, "rewards/margins": 0.33788079023361206, "rewards/rejected": -1.6466318368911743, "step": 12480 }, { "epoch": 2.1519641626464505, "grad_norm": 45.380977630615234, "learning_rate": 2.2431252729040796e-08, "logits/chosen": -2.248897075653076, "logits/rejected": -2.225717067718506, "logps/chosen": -190.13870239257812, "logps/rejected": -228.26266479492188, "loss": 0.5985, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.3422235250473022, "rewards/margins": 0.4081028997898102, "rewards/rejected": -1.7503265142440796, "step": 12490 }, { "epoch": 2.15368711233632, "grad_norm": 44.201690673828125, "learning_rate": 2.2347680321203655e-08, "logits/chosen": -2.2898340225219727, "logits/rejected": -2.266667366027832, "logps/chosen": -193.89730834960938, "logps/rejected": -227.7718963623047, "loss": 0.6083, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.3940218687057495, "rewards/margins": 0.3586365580558777, "rewards/rejected": -1.7526586055755615, "step": 12500 }, { "epoch": 2.155410062026189, "grad_norm": 30.75022315979004, "learning_rate": 2.2264219059197174e-08, "logits/chosen": -2.3246655464172363, "logits/rejected": -2.3050475120544434, "logps/chosen": -183.81161499023438, "logps/rejected": -209.09164428710938, "loss": 0.6141, "rewards/accuracies": 0.65625, "rewards/chosen": -1.2971190214157104, "rewards/margins": 0.28229445219039917, "rewards/rejected": -1.5794134140014648, "step": 12510 }, { "epoch": 2.157133011716058, "grad_norm": 37.722503662109375, "learning_rate": 2.218086927848587e-08, "logits/chosen": -2.2650203704833984, "logits/rejected": -2.2433574199676514, "logps/chosen": -184.23312377929688, "logps/rejected": -210.4525909423828, "loss": 0.6266, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.3043887615203857, "rewards/margins": 0.26472169160842896, "rewards/rejected": -1.5691105127334595, "step": 12520 }, { "epoch": 2.158855961405927, "grad_norm": 34.342227935791016, "learning_rate": 2.2097631314086112e-08, "logits/chosen": -2.2892611026763916, "logits/rejected": -2.2736310958862305, "logps/chosen": -183.8441925048828, "logps/rejected": -217.6746826171875, "loss": 0.594, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.287831425666809, "rewards/margins": 0.3383808732032776, "rewards/rejected": -1.6262121200561523, "step": 12530 }, { "epoch": 2.160578911095796, "grad_norm": 42.98286056518555, "learning_rate": 2.201450550056486e-08, "logits/chosen": -2.2905638217926025, "logits/rejected": -2.264129161834717, "logps/chosen": -180.79383850097656, "logps/rejected": -211.2161865234375, "loss": 0.6188, "rewards/accuracies": 0.65625, "rewards/chosen": -1.2625057697296143, "rewards/margins": 0.31720057129859924, "rewards/rejected": -1.5797064304351807, "step": 12540 }, { "epoch": 2.162301860785665, "grad_norm": 52.744972229003906, "learning_rate": 2.193149217203833e-08, "logits/chosen": -2.370565891265869, "logits/rejected": -2.3550562858581543, "logps/chosen": -172.3019561767578, "logps/rejected": -199.21258544921875, "loss": 0.6206, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.1986303329467773, "rewards/margins": 0.2771567702293396, "rewards/rejected": -1.4757869243621826, "step": 12550 }, { "epoch": 2.164024810475534, "grad_norm": 31.21704864501953, "learning_rate": 2.1848591662170546e-08, "logits/chosen": -2.317023515701294, "logits/rejected": -2.283731460571289, "logps/chosen": -181.77279663085938, "logps/rejected": -203.51612854003906, "loss": 0.6246, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.2345434427261353, "rewards/margins": 0.3114972710609436, "rewards/rejected": -1.5460407733917236, "step": 12560 }, { "epoch": 2.165747760165403, "grad_norm": 38.83090591430664, "learning_rate": 2.1765804304172137e-08, "logits/chosen": -2.290691375732422, "logits/rejected": -2.2567405700683594, "logps/chosen": -168.33099365234375, "logps/rejected": -199.4054412841797, "loss": 0.5832, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1442289352416992, "rewards/margins": 0.34920820593833923, "rewards/rejected": -1.4934370517730713, "step": 12570 }, { "epoch": 2.167470709855272, "grad_norm": 33.23080825805664, "learning_rate": 2.1683130430798907e-08, "logits/chosen": -2.3136255741119385, "logits/rejected": -2.272890567779541, "logps/chosen": -178.71817016601562, "logps/rejected": -206.2785186767578, "loss": 0.5737, "rewards/accuracies": 0.75, "rewards/chosen": -1.1674859523773193, "rewards/margins": 0.3780251145362854, "rewards/rejected": -1.54551100730896, "step": 12580 }, { "epoch": 2.169193659545141, "grad_norm": 37.45336151123047, "learning_rate": 2.16005703743505e-08, "logits/chosen": -2.2911033630371094, "logits/rejected": -2.2649600505828857, "logps/chosen": -172.32638549804688, "logps/rejected": -202.3101043701172, "loss": 0.5868, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1516131162643433, "rewards/margins": 0.34974223375320435, "rewards/rejected": -1.5013554096221924, "step": 12590 }, { "epoch": 2.1709166092350105, "grad_norm": 31.608232498168945, "learning_rate": 2.151812446666908e-08, "logits/chosen": -2.2943625450134277, "logits/rejected": -2.2727608680725098, "logps/chosen": -184.67996215820312, "logps/rejected": -205.7906951904297, "loss": 0.6199, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.2513759136199951, "rewards/margins": 0.284992516040802, "rewards/rejected": -1.5363683700561523, "step": 12600 }, { "epoch": 2.1726395589248795, "grad_norm": 26.701438903808594, "learning_rate": 2.1435793039138035e-08, "logits/chosen": -2.3679873943328857, "logits/rejected": -2.358001232147217, "logps/chosen": -173.2816619873047, "logps/rejected": -207.9984588623047, "loss": 0.5833, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1841771602630615, "rewards/margins": 0.359028160572052, "rewards/rejected": -1.5432054996490479, "step": 12610 }, { "epoch": 2.1743625086147484, "grad_norm": 42.2379035949707, "learning_rate": 2.135357642268062e-08, "logits/chosen": -2.402472734451294, "logits/rejected": -2.3818836212158203, "logps/chosen": -188.36465454101562, "logps/rejected": -212.6293182373047, "loss": 0.6238, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.3277316093444824, "rewards/margins": 0.27842921018600464, "rewards/rejected": -1.6061607599258423, "step": 12620 }, { "epoch": 2.1760854583046174, "grad_norm": 35.1007194519043, "learning_rate": 2.1271474947758533e-08, "logits/chosen": -2.351163864135742, "logits/rejected": -2.3398239612579346, "logps/chosen": -172.6517333984375, "logps/rejected": -201.7603759765625, "loss": 0.6291, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.2183504104614258, "rewards/margins": 0.28705310821533203, "rewards/rejected": -1.5054035186767578, "step": 12630 }, { "epoch": 2.1778084079944864, "grad_norm": 38.27680587768555, "learning_rate": 2.1189488944370753e-08, "logits/chosen": -2.348966598510742, "logits/rejected": -2.323958158493042, "logps/chosen": -168.53770446777344, "logps/rejected": -202.28419494628906, "loss": 0.5809, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.131299614906311, "rewards/margins": 0.3630850613117218, "rewards/rejected": -1.4943846464157104, "step": 12640 }, { "epoch": 2.179531357684356, "grad_norm": 50.590946197509766, "learning_rate": 2.110761874205214e-08, "logits/chosen": -2.30483078956604, "logits/rejected": -2.2819137573242188, "logps/chosen": -166.08322143554688, "logps/rejected": -191.8925018310547, "loss": 0.6195, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.1224647760391235, "rewards/margins": 0.28845423460006714, "rewards/rejected": -1.4109190702438354, "step": 12650 }, { "epoch": 2.1812543073742248, "grad_norm": 34.77700424194336, "learning_rate": 2.1025864669872028e-08, "logits/chosen": -2.2371766567230225, "logits/rejected": -2.2177200317382812, "logps/chosen": -180.53785705566406, "logps/rejected": -207.4517822265625, "loss": 0.6437, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.2753950357437134, "rewards/margins": 0.27624499797821045, "rewards/rejected": -1.5516400337219238, "step": 12660 }, { "epoch": 2.1829772570640937, "grad_norm": 34.8372688293457, "learning_rate": 2.0944227056433062e-08, "logits/chosen": -2.4396395683288574, "logits/rejected": -2.395169496536255, "logps/chosen": -169.12814331054688, "logps/rejected": -204.72259521484375, "loss": 0.5752, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.1491328477859497, "rewards/margins": 0.3878241777420044, "rewards/rejected": -1.5369569063186646, "step": 12670 }, { "epoch": 2.1847002067539627, "grad_norm": 30.4854679107666, "learning_rate": 2.0862706229869716e-08, "logits/chosen": -2.276838541030884, "logits/rejected": -2.249408006668091, "logps/chosen": -174.7004852294922, "logps/rejected": -213.88442993164062, "loss": 0.5698, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.2216650247573853, "rewards/margins": 0.407600075006485, "rewards/rejected": -1.6292650699615479, "step": 12680 }, { "epoch": 2.1864231564438317, "grad_norm": 33.259063720703125, "learning_rate": 2.0781302517847115e-08, "logits/chosen": -2.2541377544403076, "logits/rejected": -2.2349231243133545, "logps/chosen": -174.60008239746094, "logps/rejected": -207.94424438476562, "loss": 0.6085, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.2174088954925537, "rewards/margins": 0.3410707116127014, "rewards/rejected": -1.5584796667099, "step": 12690 }, { "epoch": 2.188146106133701, "grad_norm": 30.73868179321289, "learning_rate": 2.0700016247559592e-08, "logits/chosen": -2.2638649940490723, "logits/rejected": -2.23850679397583, "logps/chosen": -183.5129852294922, "logps/rejected": -213.8329620361328, "loss": 0.5905, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.2510440349578857, "rewards/margins": 0.35103440284729004, "rewards/rejected": -1.6020784378051758, "step": 12700 }, { "epoch": 2.18986905582357, "grad_norm": 32.50075149536133, "learning_rate": 2.0618847745729506e-08, "logits/chosen": -2.3324215412139893, "logits/rejected": -2.3214612007141113, "logps/chosen": -179.8836669921875, "logps/rejected": -221.8270263671875, "loss": 0.5777, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.2533272504806519, "rewards/margins": 0.3927740454673767, "rewards/rejected": -1.6461012363433838, "step": 12710 }, { "epoch": 2.191592005513439, "grad_norm": 30.249420166015625, "learning_rate": 2.05377973386058e-08, "logits/chosen": -2.3358154296875, "logits/rejected": -2.295535087585449, "logps/chosen": -180.45872497558594, "logps/rejected": -215.7207489013672, "loss": 0.5581, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.2281229496002197, "rewards/margins": 0.424754798412323, "rewards/rejected": -1.6528778076171875, "step": 12720 }, { "epoch": 2.193314955203308, "grad_norm": 38.64509963989258, "learning_rate": 2.0456865351962742e-08, "logits/chosen": -2.259183883666992, "logits/rejected": -2.230043411254883, "logps/chosen": -179.16160583496094, "logps/rejected": -213.3286895751953, "loss": 0.5972, "rewards/accuracies": 0.71875, "rewards/chosen": -1.2795768976211548, "rewards/margins": 0.3571908175945282, "rewards/rejected": -1.6367677450180054, "step": 12730 }, { "epoch": 2.195037904893177, "grad_norm": 34.17341995239258, "learning_rate": 2.037605211109866e-08, "logits/chosen": -2.2671656608581543, "logits/rejected": -2.246932029724121, "logps/chosen": -193.5773468017578, "logps/rejected": -230.35049438476562, "loss": 0.5922, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.408772349357605, "rewards/margins": 0.38163310289382935, "rewards/rejected": -1.790405511856079, "step": 12740 }, { "epoch": 2.1967608545830464, "grad_norm": 30.60337257385254, "learning_rate": 2.0295357940834605e-08, "logits/chosen": -2.26202392578125, "logits/rejected": -2.232573986053467, "logps/chosen": -182.06118774414062, "logps/rejected": -218.7804412841797, "loss": 0.5819, "rewards/accuracies": 0.65625, "rewards/chosen": -1.2679184675216675, "rewards/margins": 0.38135385513305664, "rewards/rejected": -1.6492723226547241, "step": 12750 }, { "epoch": 2.1984838042729153, "grad_norm": 42.23899841308594, "learning_rate": 2.0214783165512984e-08, "logits/chosen": -2.2580082416534424, "logits/rejected": -2.240325450897217, "logps/chosen": -186.8460693359375, "logps/rejected": -219.8138427734375, "loss": 0.6256, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.3406201601028442, "rewards/margins": 0.3337910771369934, "rewards/rejected": -1.6744110584259033, "step": 12760 }, { "epoch": 2.2002067539627843, "grad_norm": 40.34876251220703, "learning_rate": 2.0134328108996308e-08, "logits/chosen": -2.3470633029937744, "logits/rejected": -2.312948226928711, "logps/chosen": -186.7422637939453, "logps/rejected": -210.3963623046875, "loss": 0.6185, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.2813963890075684, "rewards/margins": 0.30719074606895447, "rewards/rejected": -1.5885872840881348, "step": 12770 }, { "epoch": 2.2019297036526533, "grad_norm": 30.002656936645508, "learning_rate": 2.0053993094665937e-08, "logits/chosen": -2.3167033195495605, "logits/rejected": -2.2913691997528076, "logps/chosen": -187.4027557373047, "logps/rejected": -219.14956665039062, "loss": 0.6183, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.3514249324798584, "rewards/margins": 0.31111449003219604, "rewards/rejected": -1.6625392436981201, "step": 12780 }, { "epoch": 2.2036526533425222, "grad_norm": 37.465084075927734, "learning_rate": 1.9973778445420732e-08, "logits/chosen": -2.2503247261047363, "logits/rejected": -2.2288479804992676, "logps/chosen": -192.74766540527344, "logps/rejected": -226.4638214111328, "loss": 0.5781, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.3739817142486572, "rewards/margins": 0.36986181139945984, "rewards/rejected": -1.74384343624115, "step": 12790 }, { "epoch": 2.205375603032391, "grad_norm": 40.986148834228516, "learning_rate": 1.9893684483675706e-08, "logits/chosen": -2.309469223022461, "logits/rejected": -2.2865958213806152, "logps/chosen": -182.12448120117188, "logps/rejected": -213.16604614257812, "loss": 0.6051, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.2606197595596313, "rewards/margins": 0.3188409209251404, "rewards/rejected": -1.5794607400894165, "step": 12800 }, { "epoch": 2.205375603032391, "eval_logits/chosen": -2.365828037261963, "eval_logits/rejected": -2.354224681854248, "eval_logps/chosen": -168.60400390625, "eval_logps/rejected": -189.23008728027344, "eval_loss": 0.6495684385299683, "eval_rewards/accuracies": 0.6052509546279907, "eval_rewards/chosen": -1.0958852767944336, "eval_rewards/margins": 0.16891968250274658, "eval_rewards/rejected": -1.2648048400878906, "eval_runtime": 384.7328, "eval_samples_per_second": 11.187, "eval_steps_per_second": 1.398, "step": 12800 }, { "epoch": 2.2070985527222606, "grad_norm": 39.08464431762695, "learning_rate": 1.98137115313608e-08, "logits/chosen": -2.3342087268829346, "logits/rejected": -2.3211050033569336, "logps/chosen": -181.80323791503906, "logps/rejected": -210.1888427734375, "loss": 0.6366, "rewards/accuracies": 0.65625, "rewards/chosen": -1.306378960609436, "rewards/margins": 0.2542952299118042, "rewards/rejected": -1.5606739521026611, "step": 12810 }, { "epoch": 2.2088215024121296, "grad_norm": 48.59690856933594, "learning_rate": 1.9733859909919593e-08, "logits/chosen": -2.241438627243042, "logits/rejected": -2.230583906173706, "logps/chosen": -178.1573486328125, "logps/rejected": -205.2084503173828, "loss": 0.6242, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.2335833311080933, "rewards/margins": 0.2759568989276886, "rewards/rejected": -1.509540319442749, "step": 12820 }, { "epoch": 2.2105444521019986, "grad_norm": 34.11722946166992, "learning_rate": 1.9654129940307994e-08, "logits/chosen": -2.2492871284484863, "logits/rejected": -2.230903148651123, "logps/chosen": -181.12081909179688, "logps/rejected": -205.6280059814453, "loss": 0.6323, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.297506332397461, "rewards/margins": 0.24382254481315613, "rewards/rejected": -1.54132878780365, "step": 12830 }, { "epoch": 2.2122674017918675, "grad_norm": 35.33450698852539, "learning_rate": 1.9574521942992884e-08, "logits/chosen": -2.348066806793213, "logits/rejected": -2.331679582595825, "logps/chosen": -180.0199432373047, "logps/rejected": -214.4061279296875, "loss": 0.5888, "rewards/accuracies": 0.71875, "rewards/chosen": -1.2399193048477173, "rewards/margins": 0.3646962642669678, "rewards/rejected": -1.604615569114685, "step": 12840 }, { "epoch": 2.213990351481737, "grad_norm": 30.569448471069336, "learning_rate": 1.9495036237950956e-08, "logits/chosen": -2.342040538787842, "logits/rejected": -2.307283401489258, "logps/chosen": -178.75552368164062, "logps/rejected": -211.24728393554688, "loss": 0.5804, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.2484490871429443, "rewards/margins": 0.3602822721004486, "rewards/rejected": -1.6087315082550049, "step": 12850 }, { "epoch": 2.215713301171606, "grad_norm": 29.184051513671875, "learning_rate": 1.9415673144667326e-08, "logits/chosen": -2.2426769733428955, "logits/rejected": -2.216895580291748, "logps/chosen": -186.26821899414062, "logps/rejected": -218.3754425048828, "loss": 0.5877, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.2924940586090088, "rewards/margins": 0.3836742341518402, "rewards/rejected": -1.676168441772461, "step": 12860 }, { "epoch": 2.217436250861475, "grad_norm": 40.87593078613281, "learning_rate": 1.9336432982134266e-08, "logits/chosen": -2.2999205589294434, "logits/rejected": -2.2765324115753174, "logps/chosen": -176.50772094726562, "logps/rejected": -205.4868927001953, "loss": 0.6181, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.2287260293960571, "rewards/margins": 0.32602426409721375, "rewards/rejected": -1.5547503232955933, "step": 12870 }, { "epoch": 2.219159200551344, "grad_norm": 40.17512893676758, "learning_rate": 1.925731606884998e-08, "logits/chosen": -2.3098721504211426, "logits/rejected": -2.290262222290039, "logps/chosen": -175.09259033203125, "logps/rejected": -207.22021484375, "loss": 0.5977, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1981713771820068, "rewards/margins": 0.3191829323768616, "rewards/rejected": -1.5173542499542236, "step": 12880 }, { "epoch": 2.220882150241213, "grad_norm": 35.549747467041016, "learning_rate": 1.9178322722817288e-08, "logits/chosen": -2.3461060523986816, "logits/rejected": -2.328920602798462, "logps/chosen": -179.7633056640625, "logps/rejected": -200.14736938476562, "loss": 0.6199, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.2108832597732544, "rewards/margins": 0.2605508267879486, "rewards/rejected": -1.4714341163635254, "step": 12890 }, { "epoch": 2.222605099931082, "grad_norm": 35.350547790527344, "learning_rate": 1.9099453261542297e-08, "logits/chosen": -2.3254337310791016, "logits/rejected": -2.2910609245300293, "logps/chosen": -179.11000061035156, "logps/rejected": -216.47085571289062, "loss": 0.5762, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.2380391359329224, "rewards/margins": 0.3910645842552185, "rewards/rejected": -1.629103660583496, "step": 12900 }, { "epoch": 2.224328049620951, "grad_norm": 36.8025016784668, "learning_rate": 1.9020708002033182e-08, "logits/chosen": -2.3482630252838135, "logits/rejected": -2.33329439163208, "logps/chosen": -171.5440216064453, "logps/rejected": -199.91836547851562, "loss": 0.6273, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.196648120880127, "rewards/margins": 0.25569668412208557, "rewards/rejected": -1.4523446559906006, "step": 12910 }, { "epoch": 2.22605099931082, "grad_norm": 27.304306030273438, "learning_rate": 1.8942087260798933e-08, "logits/chosen": -2.3011980056762695, "logits/rejected": -2.278656482696533, "logps/chosen": -168.2295379638672, "logps/rejected": -209.52566528320312, "loss": 0.5754, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1450660228729248, "rewards/margins": 0.4235553741455078, "rewards/rejected": -1.5686213970184326, "step": 12920 }, { "epoch": 2.227773949000689, "grad_norm": 33.451446533203125, "learning_rate": 1.886359135384805e-08, "logits/chosen": -2.3403878211975098, "logits/rejected": -2.325918436050415, "logps/chosen": -163.4576873779297, "logps/rejected": -194.09799194335938, "loss": 0.6186, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.1322734355926514, "rewards/margins": 0.2874290347099304, "rewards/rejected": -1.4197025299072266, "step": 12930 }, { "epoch": 2.229496898690558, "grad_norm": 35.901519775390625, "learning_rate": 1.8785220596687244e-08, "logits/chosen": -2.2995190620422363, "logits/rejected": -2.26780366897583, "logps/chosen": -173.3542022705078, "logps/rejected": -198.0942840576172, "loss": 0.6165, "rewards/accuracies": 0.625, "rewards/chosen": -1.200137972831726, "rewards/margins": 0.29804515838623047, "rewards/rejected": -1.498183250427246, "step": 12940 }, { "epoch": 2.231219848380427, "grad_norm": 64.96479034423828, "learning_rate": 1.870697530432019e-08, "logits/chosen": -2.2867963314056396, "logits/rejected": -2.264647960662842, "logps/chosen": -173.05960083007812, "logps/rejected": -212.035888671875, "loss": 0.5627, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.1870536804199219, "rewards/margins": 0.40245503187179565, "rewards/rejected": -1.5895087718963623, "step": 12950 }, { "epoch": 2.2329427980702965, "grad_norm": 26.18922996520996, "learning_rate": 1.8628855791246323e-08, "logits/chosen": -2.25118088722229, "logits/rejected": -2.2259633541107178, "logps/chosen": -183.46018981933594, "logps/rejected": -200.4232635498047, "loss": 0.6394, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.2426159381866455, "rewards/margins": 0.23426619172096252, "rewards/rejected": -1.4768823385238647, "step": 12960 }, { "epoch": 2.2346657477601655, "grad_norm": 26.887893676757812, "learning_rate": 1.8550862371459457e-08, "logits/chosen": -2.2446653842926025, "logits/rejected": -2.211754322052002, "logps/chosen": -174.5418701171875, "logps/rejected": -206.7992401123047, "loss": 0.5685, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2038919925689697, "rewards/margins": 0.3909718990325928, "rewards/rejected": -1.594863772392273, "step": 12970 }, { "epoch": 2.2363886974500344, "grad_norm": 30.763185501098633, "learning_rate": 1.8472995358446646e-08, "logits/chosen": -2.2639660835266113, "logits/rejected": -2.248469114303589, "logps/chosen": -171.14822387695312, "logps/rejected": -208.5131072998047, "loss": 0.5737, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.2012474536895752, "rewards/margins": 0.3625752329826355, "rewards/rejected": -1.5638227462768555, "step": 12980 }, { "epoch": 2.2381116471399034, "grad_norm": 34.40770721435547, "learning_rate": 1.8395255065186804e-08, "logits/chosen": -2.3413078784942627, "logits/rejected": -2.3061013221740723, "logps/chosen": -182.0183563232422, "logps/rejected": -212.195068359375, "loss": 0.5928, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2216604948043823, "rewards/margins": 0.37895292043685913, "rewards/rejected": -1.6006133556365967, "step": 12990 }, { "epoch": 2.2398345968297724, "grad_norm": 38.62473678588867, "learning_rate": 1.8317641804149575e-08, "logits/chosen": -2.2863516807556152, "logits/rejected": -2.259194850921631, "logps/chosen": -183.0014190673828, "logps/rejected": -215.2476348876953, "loss": 0.5887, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2855231761932373, "rewards/margins": 0.3565601110458374, "rewards/rejected": -1.6420834064483643, "step": 13000 }, { "epoch": 2.241557546519642, "grad_norm": 34.64742660522461, "learning_rate": 1.8240155887293938e-08, "logits/chosen": -2.2462267875671387, "logits/rejected": -2.230980396270752, "logps/chosen": -179.2667694091797, "logps/rejected": -206.8971710205078, "loss": 0.6077, "rewards/accuracies": 0.65625, "rewards/chosen": -1.2816540002822876, "rewards/margins": 0.29531174898147583, "rewards/rejected": -1.5769659280776978, "step": 13010 }, { "epoch": 2.2432804962095108, "grad_norm": 25.726648330688477, "learning_rate": 1.8162797626067072e-08, "logits/chosen": -2.2447333335876465, "logits/rejected": -2.2197303771972656, "logps/chosen": -179.4647979736328, "logps/rejected": -208.9609832763672, "loss": 0.6071, "rewards/accuracies": 0.65625, "rewards/chosen": -1.2703464031219482, "rewards/margins": 0.3287833333015442, "rewards/rejected": -1.5991299152374268, "step": 13020 }, { "epoch": 2.2450034458993797, "grad_norm": 44.14027404785156, "learning_rate": 1.808556733140306e-08, "logits/chosen": -2.313990831375122, "logits/rejected": -2.2862823009490967, "logps/chosen": -183.66726684570312, "logps/rejected": -209.05703735351562, "loss": 0.6153, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.281714677810669, "rewards/margins": 0.29137855768203735, "rewards/rejected": -1.5730931758880615, "step": 13030 }, { "epoch": 2.2467263955892487, "grad_norm": 41.62800979614258, "learning_rate": 1.800846531372161e-08, "logits/chosen": -2.3211021423339844, "logits/rejected": -2.297131061553955, "logps/chosen": -184.17926025390625, "logps/rejected": -222.2518310546875, "loss": 0.5852, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.2904921770095825, "rewards/margins": 0.3817223608493805, "rewards/rejected": -1.6722145080566406, "step": 13040 }, { "epoch": 2.2484493452791177, "grad_norm": 37.87771224975586, "learning_rate": 1.7931491882926813e-08, "logits/chosen": -2.302258253097534, "logits/rejected": -2.286159038543701, "logps/chosen": -183.59597778320312, "logps/rejected": -221.5272979736328, "loss": 0.5966, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.310205340385437, "rewards/margins": 0.37293821573257446, "rewards/rejected": -1.6831436157226562, "step": 13050 }, { "epoch": 2.250172294968987, "grad_norm": 35.3230094909668, "learning_rate": 1.7854647348405993e-08, "logits/chosen": -2.3094897270202637, "logits/rejected": -2.2902464866638184, "logps/chosen": -182.91275024414062, "logps/rejected": -219.88931274414062, "loss": 0.5999, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.2874906063079834, "rewards/margins": 0.37509453296661377, "rewards/rejected": -1.6625852584838867, "step": 13060 }, { "epoch": 2.251895244658856, "grad_norm": 38.81179428100586, "learning_rate": 1.7777932019028314e-08, "logits/chosen": -2.254732847213745, "logits/rejected": -2.225248336791992, "logps/chosen": -178.22152709960938, "logps/rejected": -203.3961639404297, "loss": 0.6193, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2079050540924072, "rewards/margins": 0.30005380511283875, "rewards/rejected": -1.5079588890075684, "step": 13070 }, { "epoch": 2.253618194348725, "grad_norm": 31.214067459106445, "learning_rate": 1.770134620314363e-08, "logits/chosen": -2.2343270778656006, "logits/rejected": -2.218808174133301, "logps/chosen": -180.11376953125, "logps/rejected": -211.5517120361328, "loss": 0.6029, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2803877592086792, "rewards/margins": 0.3238711953163147, "rewards/rejected": -1.6042588949203491, "step": 13080 }, { "epoch": 2.255341144038594, "grad_norm": 50.3981819152832, "learning_rate": 1.762489020858125e-08, "logits/chosen": -2.3294975757598877, "logits/rejected": -2.2978947162628174, "logps/chosen": -183.16334533691406, "logps/rejected": -210.7142791748047, "loss": 0.6255, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.2859891653060913, "rewards/margins": 0.3077470362186432, "rewards/rejected": -1.593736171722412, "step": 13090 }, { "epoch": 2.257064093728463, "grad_norm": 29.65924644470215, "learning_rate": 1.754856434264869e-08, "logits/chosen": -2.3811585903167725, "logits/rejected": -2.3489012718200684, "logps/chosen": -179.81199645996094, "logps/rejected": -212.436279296875, "loss": 0.5774, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.2426912784576416, "rewards/margins": 0.3862963318824768, "rewards/rejected": -1.6289876699447632, "step": 13100 }, { "epoch": 2.2587870434183324, "grad_norm": 30.150890350341797, "learning_rate": 1.7472368912130365e-08, "logits/chosen": -2.2300591468811035, "logits/rejected": -2.199148416519165, "logps/chosen": -182.3174285888672, "logps/rejected": -218.63070678710938, "loss": 0.5946, "rewards/accuracies": 0.625, "rewards/chosen": -1.273079752922058, "rewards/margins": 0.40432214736938477, "rewards/rejected": -1.677402138710022, "step": 13110 }, { "epoch": 2.2605099931082013, "grad_norm": 26.72989273071289, "learning_rate": 1.7396304223286484e-08, "logits/chosen": -2.344831705093384, "logits/rejected": -2.330641508102417, "logps/chosen": -178.80715942382812, "logps/rejected": -210.7775421142578, "loss": 0.6014, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2342865467071533, "rewards/margins": 0.3504003584384918, "rewards/rejected": -1.5846867561340332, "step": 13120 }, { "epoch": 2.2622329427980703, "grad_norm": 29.89188575744629, "learning_rate": 1.73203705818517e-08, "logits/chosen": -2.384573221206665, "logits/rejected": -2.355776309967041, "logps/chosen": -172.6627960205078, "logps/rejected": -217.6291046142578, "loss": 0.5639, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.1754767894744873, "rewards/margins": 0.4282703399658203, "rewards/rejected": -1.603747010231018, "step": 13130 }, { "epoch": 2.2639558924879393, "grad_norm": 29.023595809936523, "learning_rate": 1.724456829303399e-08, "logits/chosen": -2.2623140811920166, "logits/rejected": -2.2416133880615234, "logps/chosen": -171.00674438476562, "logps/rejected": -203.25985717773438, "loss": 0.5916, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1680914163589478, "rewards/margins": 0.3259120285511017, "rewards/rejected": -1.494003415107727, "step": 13140 }, { "epoch": 2.2656788421778082, "grad_norm": 38.09290313720703, "learning_rate": 1.71688976615133e-08, "logits/chosen": -2.2763171195983887, "logits/rejected": -2.2502236366271973, "logps/chosen": -178.1101531982422, "logps/rejected": -217.0858612060547, "loss": 0.5706, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.2172980308532715, "rewards/margins": 0.42803043127059937, "rewards/rejected": -1.6453285217285156, "step": 13150 }, { "epoch": 2.2674017918676777, "grad_norm": 41.28289031982422, "learning_rate": 1.7093358991440466e-08, "logits/chosen": -2.257770538330078, "logits/rejected": -2.22393798828125, "logps/chosen": -187.81979370117188, "logps/rejected": -221.6351776123047, "loss": 0.5819, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.3062386512756348, "rewards/margins": 0.3810449242591858, "rewards/rejected": -1.6872835159301758, "step": 13160 }, { "epoch": 2.2691247415575466, "grad_norm": 38.38431167602539, "learning_rate": 1.7017952586435874e-08, "logits/chosen": -2.292358875274658, "logits/rejected": -2.27616286277771, "logps/chosen": -183.13650512695312, "logps/rejected": -208.0890655517578, "loss": 0.6288, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2799513339996338, "rewards/margins": 0.28485870361328125, "rewards/rejected": -1.564810037612915, "step": 13170 }, { "epoch": 2.2708476912474156, "grad_norm": 36.38185501098633, "learning_rate": 1.6942678749588263e-08, "logits/chosen": -2.2948360443115234, "logits/rejected": -2.2646539211273193, "logps/chosen": -179.03570556640625, "logps/rejected": -212.93115234375, "loss": 0.5728, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.2423360347747803, "rewards/margins": 0.385326623916626, "rewards/rejected": -1.6276626586914062, "step": 13180 }, { "epoch": 2.2725706409372846, "grad_norm": 35.919898986816406, "learning_rate": 1.686753778345359e-08, "logits/chosen": -2.2914533615112305, "logits/rejected": -2.263596534729004, "logps/chosen": -180.45431518554688, "logps/rejected": -207.91995239257812, "loss": 0.6145, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2361723184585571, "rewards/margins": 0.32427722215652466, "rewards/rejected": -1.5604493618011475, "step": 13190 }, { "epoch": 2.2742935906271535, "grad_norm": 32.69383239746094, "learning_rate": 1.6792529990053715e-08, "logits/chosen": -2.278026819229126, "logits/rejected": -2.2545487880706787, "logps/chosen": -179.6051483154297, "logps/rejected": -201.932861328125, "loss": 0.6223, "rewards/accuracies": 0.625, "rewards/chosen": -1.2343283891677856, "rewards/margins": 0.2624986469745636, "rewards/rejected": -1.4968270063400269, "step": 13200 }, { "epoch": 2.2742935906271535, "eval_logits/chosen": -2.357876777648926, "eval_logits/rejected": -2.3459715843200684, "eval_logps/chosen": -167.66600036621094, "eval_logps/rejected": -188.626708984375, "eval_loss": 0.6502256989479065, "eval_rewards/accuracies": 0.606877326965332, "eval_rewards/chosen": -1.0865050554275513, "eval_rewards/margins": 0.17226597666740417, "eval_rewards/rejected": -1.2587710618972778, "eval_runtime": 384.8668, "eval_samples_per_second": 11.183, "eval_steps_per_second": 1.398, "step": 13200 }, { "epoch": 2.2760165403170225, "grad_norm": 42.5054931640625, "learning_rate": 1.671765567087523e-08, "logits/chosen": -2.3471245765686035, "logits/rejected": -2.338646650314331, "logps/chosen": -172.4659881591797, "logps/rejected": -201.0215606689453, "loss": 0.6181, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.1909699440002441, "rewards/margins": 0.29259103536605835, "rewards/rejected": -1.4835610389709473, "step": 13210 }, { "epoch": 2.277739490006892, "grad_norm": 27.458099365234375, "learning_rate": 1.6642915126868203e-08, "logits/chosen": -2.3157057762145996, "logits/rejected": -2.2974138259887695, "logps/chosen": -178.68539428710938, "logps/rejected": -205.52627563476562, "loss": 0.6127, "rewards/accuracies": 0.65625, "rewards/chosen": -1.22714364528656, "rewards/margins": 0.27688199281692505, "rewards/rejected": -1.5040256977081299, "step": 13220 }, { "epoch": 2.279462439696761, "grad_norm": 26.042802810668945, "learning_rate": 1.6568308658445064e-08, "logits/chosen": -2.2837634086608887, "logits/rejected": -2.2617480754852295, "logps/chosen": -168.11532592773438, "logps/rejected": -207.41650390625, "loss": 0.5715, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1576555967330933, "rewards/margins": 0.39275139570236206, "rewards/rejected": -1.5504071712493896, "step": 13230 }, { "epoch": 2.28118538938663, "grad_norm": 37.14672088623047, "learning_rate": 1.6493836565479324e-08, "logits/chosen": -2.3002994060516357, "logits/rejected": -2.2840652465820312, "logps/chosen": -178.58566284179688, "logps/rejected": -211.67404174804688, "loss": 0.5895, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2686278820037842, "rewards/margins": 0.3217323422431946, "rewards/rejected": -1.5903600454330444, "step": 13240 }, { "epoch": 2.282908339076499, "grad_norm": 29.27492904663086, "learning_rate": 1.6419499147304366e-08, "logits/chosen": -2.264556646347046, "logits/rejected": -2.253521203994751, "logps/chosen": -177.2767791748047, "logps/rejected": -214.045166015625, "loss": 0.5916, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2435022592544556, "rewards/margins": 0.34912917017936707, "rewards/rejected": -1.5926315784454346, "step": 13250 }, { "epoch": 2.2846312887663682, "grad_norm": 29.972997665405273, "learning_rate": 1.634529670271224e-08, "logits/chosen": -2.3521368503570557, "logits/rejected": -2.3302011489868164, "logps/chosen": -179.6995391845703, "logps/rejected": -224.6154327392578, "loss": 0.5766, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3029943704605103, "rewards/margins": 0.42312970757484436, "rewards/rejected": -1.7261241674423218, "step": 13260 }, { "epoch": 2.286354238456237, "grad_norm": 31.261409759521484, "learning_rate": 1.6271229529952563e-08, "logits/chosen": -2.232389211654663, "logits/rejected": -2.217360258102417, "logps/chosen": -186.85389709472656, "logps/rejected": -220.95077514648438, "loss": 0.5922, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.3120219707489014, "rewards/margins": 0.3629273474216461, "rewards/rejected": -1.674949288368225, "step": 13270 }, { "epoch": 2.288077188146106, "grad_norm": 34.21379089355469, "learning_rate": 1.619729792673114e-08, "logits/chosen": -2.2794029712677, "logits/rejected": -2.2498767375946045, "logps/chosen": -177.8300018310547, "logps/rejected": -207.9958953857422, "loss": 0.6091, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.228564739227295, "rewards/margins": 0.34510141611099243, "rewards/rejected": -1.5736663341522217, "step": 13280 }, { "epoch": 2.289800137835975, "grad_norm": 36.05464172363281, "learning_rate": 1.6123502190208944e-08, "logits/chosen": -2.2673323154449463, "logits/rejected": -2.2458250522613525, "logps/chosen": -180.65086364746094, "logps/rejected": -213.90859985351562, "loss": 0.5841, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2555345296859741, "rewards/margins": 0.35977333784103394, "rewards/rejected": -1.6153080463409424, "step": 13290 }, { "epoch": 2.291523087525844, "grad_norm": 28.938241958618164, "learning_rate": 1.6049842617000826e-08, "logits/chosen": -2.285695791244507, "logits/rejected": -2.2649500370025635, "logps/chosen": -190.43856811523438, "logps/rejected": -215.55447387695312, "loss": 0.6485, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.3869019746780396, "rewards/margins": 0.2633451819419861, "rewards/rejected": -1.65024733543396, "step": 13300 }, { "epoch": 2.293246037215713, "grad_norm": 36.23344421386719, "learning_rate": 1.5976319503174313e-08, "logits/chosen": -2.284667491912842, "logits/rejected": -2.2592196464538574, "logps/chosen": -192.6880645751953, "logps/rejected": -225.78768920898438, "loss": 0.5965, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.3499374389648438, "rewards/margins": 0.37441927194595337, "rewards/rejected": -1.7243566513061523, "step": 13310 }, { "epoch": 2.2949689869055825, "grad_norm": 52.024208068847656, "learning_rate": 1.590293314424846e-08, "logits/chosen": -2.314713716506958, "logits/rejected": -2.2793240547180176, "logps/chosen": -192.59432983398438, "logps/rejected": -208.95361328125, "loss": 0.6508, "rewards/accuracies": 0.59375, "rewards/chosen": -1.3572700023651123, "rewards/margins": 0.25697070360183716, "rewards/rejected": -1.6142408847808838, "step": 13320 }, { "epoch": 2.2966919365954515, "grad_norm": 45.16383743286133, "learning_rate": 1.582968383519267e-08, "logits/chosen": -2.242969036102295, "logits/rejected": -2.2187442779541016, "logps/chosen": -181.7742919921875, "logps/rejected": -214.463134765625, "loss": 0.592, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.2493605613708496, "rewards/margins": 0.40000343322753906, "rewards/rejected": -1.6493641138076782, "step": 13330 }, { "epoch": 2.2984148862853204, "grad_norm": 40.29065704345703, "learning_rate": 1.5756571870425485e-08, "logits/chosen": -2.341768264770508, "logits/rejected": -2.325599193572998, "logps/chosen": -188.314697265625, "logps/rejected": -216.07229614257812, "loss": 0.6231, "rewards/accuracies": 0.625, "rewards/chosen": -1.3263964653015137, "rewards/margins": 0.30297255516052246, "rewards/rejected": -1.6293690204620361, "step": 13340 }, { "epoch": 2.3001378359751894, "grad_norm": 39.64885711669922, "learning_rate": 1.568359754381337e-08, "logits/chosen": -2.3034918308258057, "logits/rejected": -2.2614338397979736, "logps/chosen": -176.49429321289062, "logps/rejected": -216.81161499023438, "loss": 0.563, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.1907312870025635, "rewards/margins": 0.46979132294654846, "rewards/rejected": -1.6605224609375, "step": 13350 }, { "epoch": 2.301860785665059, "grad_norm": 40.885826110839844, "learning_rate": 1.5610761148669588e-08, "logits/chosen": -2.2991368770599365, "logits/rejected": -2.2754249572753906, "logps/chosen": -188.7205810546875, "logps/rejected": -222.1409912109375, "loss": 0.5961, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.3075730800628662, "rewards/margins": 0.3844447731971741, "rewards/rejected": -1.6920177936553955, "step": 13360 }, { "epoch": 2.3035837353549278, "grad_norm": 47.9317512512207, "learning_rate": 1.5538062977753007e-08, "logits/chosen": -2.265645742416382, "logits/rejected": -2.2291998863220215, "logps/chosen": -186.75180053710938, "logps/rejected": -219.0263671875, "loss": 0.6174, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.3111344575881958, "rewards/margins": 0.3566862940788269, "rewards/rejected": -1.667820692062378, "step": 13370 }, { "epoch": 2.3053066850447967, "grad_norm": 32.88581466674805, "learning_rate": 1.5465503323266933e-08, "logits/chosen": -2.2227225303649902, "logits/rejected": -2.1929125785827637, "logps/chosen": -188.45529174804688, "logps/rejected": -215.32308959960938, "loss": 0.6093, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.3009912967681885, "rewards/margins": 0.3205062448978424, "rewards/rejected": -1.6214977502822876, "step": 13380 }, { "epoch": 2.3070296347346657, "grad_norm": 46.64186477661133, "learning_rate": 1.539308247685787e-08, "logits/chosen": -2.255293607711792, "logits/rejected": -2.2119433879852295, "logps/chosen": -175.30612182617188, "logps/rejected": -201.49118041992188, "loss": 0.6095, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.1783514022827148, "rewards/margins": 0.32855120301246643, "rewards/rejected": -1.5069026947021484, "step": 13390 }, { "epoch": 2.3087525844245347, "grad_norm": 31.650501251220703, "learning_rate": 1.532080072961442e-08, "logits/chosen": -2.2712759971618652, "logits/rejected": -2.240250825881958, "logps/chosen": -171.73333740234375, "logps/rejected": -209.0310516357422, "loss": 0.5619, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.1551803350448608, "rewards/margins": 0.4344462454319, "rewards/rejected": -1.5896265506744385, "step": 13400 }, { "epoch": 2.3104755341144037, "grad_norm": 31.78446388244629, "learning_rate": 1.5248658372066107e-08, "logits/chosen": -2.301515817642212, "logits/rejected": -2.2714321613311768, "logps/chosen": -182.88613891601562, "logps/rejected": -215.9427947998047, "loss": 0.5949, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.2810585498809814, "rewards/margins": 0.37699708342552185, "rewards/rejected": -1.6580556631088257, "step": 13410 }, { "epoch": 2.312198483804273, "grad_norm": 42.18976974487305, "learning_rate": 1.5176655694182156e-08, "logits/chosen": -2.290903329849243, "logits/rejected": -2.2751002311706543, "logps/chosen": -177.2351531982422, "logps/rejected": -218.5797119140625, "loss": 0.5751, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2314550876617432, "rewards/margins": 0.40304645895957947, "rewards/rejected": -1.6345014572143555, "step": 13420 }, { "epoch": 2.313921433494142, "grad_norm": 36.520469665527344, "learning_rate": 1.5104792985370406e-08, "logits/chosen": -2.360677480697632, "logits/rejected": -2.332749605178833, "logps/chosen": -178.2684326171875, "logps/rejected": -212.31723022460938, "loss": 0.6023, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.2227405309677124, "rewards/margins": 0.3327510952949524, "rewards/rejected": -1.5554919242858887, "step": 13430 }, { "epoch": 2.315644383184011, "grad_norm": 34.495445251464844, "learning_rate": 1.5033070534476055e-08, "logits/chosen": -2.2259278297424316, "logits/rejected": -2.2103095054626465, "logps/chosen": -172.03411865234375, "logps/rejected": -199.85397338867188, "loss": 0.6245, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.1800358295440674, "rewards/margins": 0.302860826253891, "rewards/rejected": -1.4828965663909912, "step": 13440 }, { "epoch": 2.31736733287388, "grad_norm": 28.928640365600586, "learning_rate": 1.4961488629780604e-08, "logits/chosen": -2.2545201778411865, "logits/rejected": -2.230071544647217, "logps/chosen": -171.535400390625, "logps/rejected": -207.2544403076172, "loss": 0.5966, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2039422988891602, "rewards/margins": 0.3546209931373596, "rewards/rejected": -1.558563470840454, "step": 13450 }, { "epoch": 2.3190902825637494, "grad_norm": 21.376005172729492, "learning_rate": 1.489004755900058e-08, "logits/chosen": -2.2931625843048096, "logits/rejected": -2.260887384414673, "logps/chosen": -175.7183074951172, "logps/rejected": -213.6696014404297, "loss": 0.5651, "rewards/accuracies": 0.71875, "rewards/chosen": -1.1990082263946533, "rewards/margins": 0.44000688195228577, "rewards/rejected": -1.6390151977539062, "step": 13460 }, { "epoch": 2.3208132322536184, "grad_norm": 34.317535400390625, "learning_rate": 1.4818747609286486e-08, "logits/chosen": -2.214077949523926, "logits/rejected": -2.1879045963287354, "logps/chosen": -185.1648406982422, "logps/rejected": -211.9553680419922, "loss": 0.6136, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.293401837348938, "rewards/margins": 0.32934027910232544, "rewards/rejected": -1.622741937637329, "step": 13470 }, { "epoch": 2.3225361819434873, "grad_norm": 43.287086486816406, "learning_rate": 1.4747589067221627e-08, "logits/chosen": -2.3157906532287598, "logits/rejected": -2.275646686553955, "logps/chosen": -175.83084106445312, "logps/rejected": -211.6923065185547, "loss": 0.572, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.216455101966858, "rewards/margins": 0.3852939009666443, "rewards/rejected": -1.6017488241195679, "step": 13480 }, { "epoch": 2.3242591316333563, "grad_norm": 33.92958450317383, "learning_rate": 1.4676572218820831e-08, "logits/chosen": -2.2876946926116943, "logits/rejected": -2.2540640830993652, "logps/chosen": -191.86166381835938, "logps/rejected": -230.7425537109375, "loss": 0.5832, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.3764475584030151, "rewards/margins": 0.4016246795654297, "rewards/rejected": -1.7780723571777344, "step": 13490 }, { "epoch": 2.3259820813232253, "grad_norm": 35.81848907470703, "learning_rate": 1.4605697349529494e-08, "logits/chosen": -2.2865166664123535, "logits/rejected": -2.25510573387146, "logps/chosen": -187.5164337158203, "logps/rejected": -223.8311004638672, "loss": 0.5741, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3080507516860962, "rewards/margins": 0.41828426718711853, "rewards/rejected": -1.7263351678848267, "step": 13500 }, { "epoch": 2.3277050310130942, "grad_norm": 30.200706481933594, "learning_rate": 1.4534964744222339e-08, "logits/chosen": -2.218099594116211, "logits/rejected": -2.2067201137542725, "logps/chosen": -178.73098754882812, "logps/rejected": -213.6283416748047, "loss": 0.6113, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.28030526638031, "rewards/margins": 0.31757602095603943, "rewards/rejected": -1.5978811979293823, "step": 13510 }, { "epoch": 2.3294279807029636, "grad_norm": 37.271827697753906, "learning_rate": 1.4464374687202224e-08, "logits/chosen": -2.188913583755493, "logits/rejected": -2.1714272499084473, "logps/chosen": -185.5612335205078, "logps/rejected": -218.77609252929688, "loss": 0.6041, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.3394790887832642, "rewards/margins": 0.34229475259780884, "rewards/rejected": -1.6817739009857178, "step": 13520 }, { "epoch": 2.3311509303928326, "grad_norm": 39.160675048828125, "learning_rate": 1.4393927462199062e-08, "logits/chosen": -2.2882041931152344, "logits/rejected": -2.2603230476379395, "logps/chosen": -184.82806396484375, "logps/rejected": -202.9477996826172, "loss": 0.6465, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.29298996925354, "rewards/margins": 0.22285866737365723, "rewards/rejected": -1.5158485174179077, "step": 13530 }, { "epoch": 2.3328738800827016, "grad_norm": 39.556976318359375, "learning_rate": 1.4323623352368691e-08, "logits/chosen": -2.2315313816070557, "logits/rejected": -2.1995949745178223, "logps/chosen": -184.9795684814453, "logps/rejected": -216.235595703125, "loss": 0.5828, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.285422682762146, "rewards/margins": 0.38336047530174255, "rewards/rejected": -1.668783187866211, "step": 13540 }, { "epoch": 2.3345968297725705, "grad_norm": 31.444944381713867, "learning_rate": 1.4253462640291708e-08, "logits/chosen": -2.2623448371887207, "logits/rejected": -2.2332801818847656, "logps/chosen": -185.16944885253906, "logps/rejected": -216.3262176513672, "loss": 0.6145, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.3362573385238647, "rewards/margins": 0.3372352421283722, "rewards/rejected": -1.6734927892684937, "step": 13550 }, { "epoch": 2.3363197794624395, "grad_norm": 40.11167526245117, "learning_rate": 1.4183445607972299e-08, "logits/chosen": -2.2807250022888184, "logits/rejected": -2.2724642753601074, "logps/chosen": -179.11654663085938, "logps/rejected": -201.07957458496094, "loss": 0.6562, "rewards/accuracies": 0.625, "rewards/chosen": -1.2554104328155518, "rewards/margins": 0.2322661429643631, "rewards/rejected": -1.4876763820648193, "step": 13560 }, { "epoch": 2.338042729152309, "grad_norm": 35.94459915161133, "learning_rate": 1.4113572536837192e-08, "logits/chosen": -2.2167115211486816, "logits/rejected": -2.1958794593811035, "logps/chosen": -184.91183471679688, "logps/rejected": -224.0436248779297, "loss": 0.5753, "rewards/accuracies": 0.71875, "rewards/chosen": -1.325109601020813, "rewards/margins": 0.4127611219882965, "rewards/rejected": -1.7378708124160767, "step": 13570 }, { "epoch": 2.339765678842178, "grad_norm": 31.295074462890625, "learning_rate": 1.4043843707734448e-08, "logits/chosen": -2.2628185749053955, "logits/rejected": -2.2513587474823, "logps/chosen": -179.8581085205078, "logps/rejected": -209.72250366210938, "loss": 0.6121, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.2665109634399414, "rewards/margins": 0.3041861653327942, "rewards/rejected": -1.5706971883773804, "step": 13580 }, { "epoch": 2.341488628532047, "grad_norm": 54.577552795410156, "learning_rate": 1.3974259400932348e-08, "logits/chosen": -2.2290074825286865, "logits/rejected": -2.2191624641418457, "logps/chosen": -180.3831329345703, "logps/rejected": -216.0695343017578, "loss": 0.5961, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.2730530500411987, "rewards/margins": 0.35534295439720154, "rewards/rejected": -1.6283960342407227, "step": 13590 }, { "epoch": 2.343211578221916, "grad_norm": 37.03809356689453, "learning_rate": 1.3904819896118314e-08, "logits/chosen": -2.283867597579956, "logits/rejected": -2.2657036781311035, "logps/chosen": -189.1884307861328, "logps/rejected": -212.0028076171875, "loss": 0.6245, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.2924267053604126, "rewards/margins": 0.2556527256965637, "rewards/rejected": -1.5480793714523315, "step": 13600 }, { "epoch": 2.343211578221916, "eval_logits/chosen": -2.3582911491394043, "eval_logits/rejected": -2.346229076385498, "eval_logps/chosen": -167.07147216796875, "eval_logps/rejected": -188.04969787597656, "eval_loss": 0.6505503058433533, "eval_rewards/accuracies": 0.598280668258667, "eval_rewards/chosen": -1.0805600881576538, "eval_rewards/margins": 0.172440767288208, "eval_rewards/rejected": -1.2530008554458618, "eval_runtime": 384.874, "eval_samples_per_second": 11.183, "eval_steps_per_second": 1.398, "step": 13600 }, { "epoch": 2.344934527911785, "grad_norm": 36.64026641845703, "learning_rate": 1.3835525472397747e-08, "logits/chosen": -2.4126977920532227, "logits/rejected": -2.387202262878418, "logps/chosen": -173.5795135498047, "logps/rejected": -202.99668884277344, "loss": 0.625, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.1968533992767334, "rewards/margins": 0.3015924096107483, "rewards/rejected": -1.498445749282837, "step": 13610 }, { "epoch": 2.346657477601654, "grad_norm": 30.19744110107422, "learning_rate": 1.376637640829289e-08, "logits/chosen": -2.2988693714141846, "logits/rejected": -2.2542026042938232, "logps/chosen": -177.09573364257812, "logps/rejected": -212.8185272216797, "loss": 0.5693, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.1949437856674194, "rewards/margins": 0.42264899611473083, "rewards/rejected": -1.6175928115844727, "step": 13620 }, { "epoch": 2.348380427291523, "grad_norm": 35.095787048339844, "learning_rate": 1.3697372981741707e-08, "logits/chosen": -2.236766815185547, "logits/rejected": -2.196708917617798, "logps/chosen": -181.1944122314453, "logps/rejected": -215.4459991455078, "loss": 0.5747, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.2550218105316162, "rewards/margins": 0.4060831665992737, "rewards/rejected": -1.6611049175262451, "step": 13630 }, { "epoch": 2.350103376981392, "grad_norm": 30.182931900024414, "learning_rate": 1.362851547009684e-08, "logits/chosen": -2.2480995655059814, "logits/rejected": -2.221525192260742, "logps/chosen": -175.38925170898438, "logps/rejected": -218.19741821289062, "loss": 0.5489, "rewards/accuracies": 0.78125, "rewards/chosen": -1.1962645053863525, "rewards/margins": 0.45355424284935, "rewards/rejected": -1.649818778038025, "step": 13640 }, { "epoch": 2.351826326671261, "grad_norm": 33.286869049072266, "learning_rate": 1.3559804150124421e-08, "logits/chosen": -2.3198165893554688, "logits/rejected": -2.2934446334838867, "logps/chosen": -176.68719482421875, "logps/rejected": -210.19088745117188, "loss": 0.5757, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2155816555023193, "rewards/margins": 0.3787309527397156, "rewards/rejected": -1.5943125486373901, "step": 13650 }, { "epoch": 2.35354927636113, "grad_norm": 38.136558532714844, "learning_rate": 1.3491239298002954e-08, "logits/chosen": -2.2096331119537354, "logits/rejected": -2.186317205429077, "logps/chosen": -180.8411102294922, "logps/rejected": -211.2050323486328, "loss": 0.6122, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2719953060150146, "rewards/margins": 0.30958497524261475, "rewards/rejected": -1.581580400466919, "step": 13660 }, { "epoch": 2.3552722260509995, "grad_norm": 30.543697357177734, "learning_rate": 1.3422821189322231e-08, "logits/chosen": -2.297574520111084, "logits/rejected": -2.273895740509033, "logps/chosen": -183.28347778320312, "logps/rejected": -207.8615264892578, "loss": 0.6355, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2745354175567627, "rewards/margins": 0.2656957507133484, "rewards/rejected": -1.5402309894561768, "step": 13670 }, { "epoch": 2.3569951757408685, "grad_norm": 33.59134292602539, "learning_rate": 1.3354550099082256e-08, "logits/chosen": -2.309947967529297, "logits/rejected": -2.279665470123291, "logps/chosen": -179.7958984375, "logps/rejected": -203.94461059570312, "loss": 0.6041, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.2364991903305054, "rewards/margins": 0.2978908121585846, "rewards/rejected": -1.5343900918960571, "step": 13680 }, { "epoch": 2.3587181254307374, "grad_norm": 45.42689895629883, "learning_rate": 1.3286426301692105e-08, "logits/chosen": -2.2824180126190186, "logits/rejected": -2.250120162963867, "logps/chosen": -177.89151000976562, "logps/rejected": -214.27743530273438, "loss": 0.5862, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.2323997020721436, "rewards/margins": 0.3764374852180481, "rewards/rejected": -1.6088371276855469, "step": 13690 }, { "epoch": 2.3604410751206064, "grad_norm": 46.692657470703125, "learning_rate": 1.321845007096879e-08, "logits/chosen": -2.278366804122925, "logits/rejected": -2.248579502105713, "logps/chosen": -186.04518127441406, "logps/rejected": -209.16122436523438, "loss": 0.6153, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.263258695602417, "rewards/margins": 0.30537840723991394, "rewards/rejected": -1.5686371326446533, "step": 13700 }, { "epoch": 2.3621640248104754, "grad_norm": 40.69031524658203, "learning_rate": 1.3150621680136197e-08, "logits/chosen": -2.233503818511963, "logits/rejected": -2.1990857124328613, "logps/chosen": -173.60745239257812, "logps/rejected": -212.70382690429688, "loss": 0.5709, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1895692348480225, "rewards/margins": 0.4038330912590027, "rewards/rejected": -1.5934025049209595, "step": 13710 }, { "epoch": 2.3638869745003444, "grad_norm": 37.574806213378906, "learning_rate": 1.3082941401824027e-08, "logits/chosen": -2.2590606212615967, "logits/rejected": -2.2263731956481934, "logps/chosen": -168.90953063964844, "logps/rejected": -199.29470825195312, "loss": 0.597, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.1364753246307373, "rewards/margins": 0.3397006094455719, "rewards/rejected": -1.4761759042739868, "step": 13720 }, { "epoch": 2.3656099241902138, "grad_norm": 59.643516540527344, "learning_rate": 1.30154095080666e-08, "logits/chosen": -2.369276523590088, "logits/rejected": -2.3432652950286865, "logps/chosen": -175.1021270751953, "logps/rejected": -216.71896362304688, "loss": 0.5713, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.1881558895111084, "rewards/margins": 0.4090685248374939, "rewards/rejected": -1.597224473953247, "step": 13730 }, { "epoch": 2.3673328738800827, "grad_norm": 30.843334197998047, "learning_rate": 1.2948026270301853e-08, "logits/chosen": -2.348867654800415, "logits/rejected": -2.317417621612549, "logps/chosen": -179.7681427001953, "logps/rejected": -213.34017944335938, "loss": 0.5824, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.2141062021255493, "rewards/margins": 0.39131397008895874, "rewards/rejected": -1.6054203510284424, "step": 13740 }, { "epoch": 2.3690558235699517, "grad_norm": 39.30093002319336, "learning_rate": 1.2880791959370235e-08, "logits/chosen": -2.307525157928467, "logits/rejected": -2.279491424560547, "logps/chosen": -183.9978485107422, "logps/rejected": -220.3440399169922, "loss": 0.591, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.274763822555542, "rewards/margins": 0.39177602529525757, "rewards/rejected": -1.6665397882461548, "step": 13750 }, { "epoch": 2.3707787732598207, "grad_norm": 46.24401092529297, "learning_rate": 1.2813706845513556e-08, "logits/chosen": -2.359976291656494, "logits/rejected": -2.327162504196167, "logps/chosen": -171.99435424804688, "logps/rejected": -200.48883056640625, "loss": 0.6053, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.1655842065811157, "rewards/margins": 0.3405342102050781, "rewards/rejected": -1.5061182975769043, "step": 13760 }, { "epoch": 2.37250172294969, "grad_norm": 43.86614227294922, "learning_rate": 1.274677119837393e-08, "logits/chosen": -2.3412115573883057, "logits/rejected": -2.3285574913024902, "logps/chosen": -178.02932739257812, "logps/rejected": -214.586181640625, "loss": 0.5976, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.265352725982666, "rewards/margins": 0.34253057837486267, "rewards/rejected": -1.6078834533691406, "step": 13770 }, { "epoch": 2.374224672639559, "grad_norm": 31.504146575927734, "learning_rate": 1.2679985286992762e-08, "logits/chosen": -2.3894755840301514, "logits/rejected": -2.3525168895721436, "logps/chosen": -188.07192993164062, "logps/rejected": -205.6201171875, "loss": 0.6331, "rewards/accuracies": 0.65625, "rewards/chosen": -1.280479073524475, "rewards/margins": 0.26705050468444824, "rewards/rejected": -1.5475298166275024, "step": 13780 }, { "epoch": 2.375947622329428, "grad_norm": 40.06073760986328, "learning_rate": 1.2613349379809596e-08, "logits/chosen": -2.299266815185547, "logits/rejected": -2.271803140640259, "logps/chosen": -181.13267517089844, "logps/rejected": -212.23593139648438, "loss": 0.5869, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.2333463430404663, "rewards/margins": 0.36941683292388916, "rewards/rejected": -1.6027634143829346, "step": 13790 }, { "epoch": 2.377670572019297, "grad_norm": 31.72646141052246, "learning_rate": 1.2546863744660975e-08, "logits/chosen": -2.3663368225097656, "logits/rejected": -2.3349125385284424, "logps/chosen": -174.5872802734375, "logps/rejected": -204.26718139648438, "loss": 0.581, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.1611812114715576, "rewards/margins": 0.3709130883216858, "rewards/rejected": -1.5320942401885986, "step": 13800 }, { "epoch": 2.379393521709166, "grad_norm": 39.007442474365234, "learning_rate": 1.2480528648779532e-08, "logits/chosen": -2.2920098304748535, "logits/rejected": -2.2649521827697754, "logps/chosen": -164.62759399414062, "logps/rejected": -201.13555908203125, "loss": 0.5844, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.10493803024292, "rewards/margins": 0.37880510091781616, "rewards/rejected": -1.4837430715560913, "step": 13810 }, { "epoch": 2.381116471399035, "grad_norm": 32.280784606933594, "learning_rate": 1.2414344358792784e-08, "logits/chosen": -2.339385747909546, "logits/rejected": -2.3074727058410645, "logps/chosen": -177.5450439453125, "logps/rejected": -200.82669067382812, "loss": 0.6071, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.166850209236145, "rewards/margins": 0.3151569664478302, "rewards/rejected": -1.4820071458816528, "step": 13820 }, { "epoch": 2.3828394210889043, "grad_norm": 30.069440841674805, "learning_rate": 1.2348311140722079e-08, "logits/chosen": -2.3644509315490723, "logits/rejected": -2.3493714332580566, "logps/chosen": -170.24691772460938, "logps/rejected": -197.38812255859375, "loss": 0.5992, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1383792161941528, "rewards/margins": 0.3191404938697815, "rewards/rejected": -1.45751953125, "step": 13830 }, { "epoch": 2.3845623707787733, "grad_norm": 41.18797302246094, "learning_rate": 1.2282429259981597e-08, "logits/chosen": -2.3379101753234863, "logits/rejected": -2.30659556388855, "logps/chosen": -175.2816162109375, "logps/rejected": -193.45132446289062, "loss": 0.6339, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.177595853805542, "rewards/margins": 0.24431820213794708, "rewards/rejected": -1.4219141006469727, "step": 13840 }, { "epoch": 2.3862853204686423, "grad_norm": 36.08599090576172, "learning_rate": 1.221669898137716e-08, "logits/chosen": -2.2660117149353027, "logits/rejected": -2.2420566082000732, "logps/chosen": -171.4685516357422, "logps/rejected": -193.0071258544922, "loss": 0.6317, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.16433584690094, "rewards/margins": 0.2799225449562073, "rewards/rejected": -1.4442580938339233, "step": 13850 }, { "epoch": 2.3880082701585112, "grad_norm": 31.845666885375977, "learning_rate": 1.2151120569105316e-08, "logits/chosen": -2.2724967002868652, "logits/rejected": -2.2546486854553223, "logps/chosen": -175.4547882080078, "logps/rejected": -204.92172241210938, "loss": 0.6033, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2325869798660278, "rewards/margins": 0.310187965631485, "rewards/rejected": -1.5427749156951904, "step": 13860 }, { "epoch": 2.3897312198483807, "grad_norm": 35.77281951904297, "learning_rate": 1.208569428675214e-08, "logits/chosen": -2.310751438140869, "logits/rejected": -2.2911205291748047, "logps/chosen": -185.12376403808594, "logps/rejected": -212.27761840820312, "loss": 0.6249, "rewards/accuracies": 0.625, "rewards/chosen": -1.2908276319503784, "rewards/margins": 0.3130032420158386, "rewards/rejected": -1.6038309335708618, "step": 13870 }, { "epoch": 2.3914541695382496, "grad_norm": 27.441791534423828, "learning_rate": 1.2020420397292285e-08, "logits/chosen": -2.2574360370635986, "logits/rejected": -2.2178995609283447, "logps/chosen": -170.9420166015625, "logps/rejected": -205.03140258789062, "loss": 0.5863, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.1549043655395508, "rewards/margins": 0.37761181592941284, "rewards/rejected": -1.5325162410736084, "step": 13880 }, { "epoch": 2.3931771192281186, "grad_norm": 25.524450302124023, "learning_rate": 1.1955299163087818e-08, "logits/chosen": -2.333522081375122, "logits/rejected": -2.3122828006744385, "logps/chosen": -172.7584991455078, "logps/rejected": -199.32009887695312, "loss": 0.6054, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.1644203662872314, "rewards/margins": 0.3049108386039734, "rewards/rejected": -1.46933114528656, "step": 13890 }, { "epoch": 2.3949000689179876, "grad_norm": 33.01766586303711, "learning_rate": 1.1890330845887292e-08, "logits/chosen": -2.256608724594116, "logits/rejected": -2.2248635292053223, "logps/chosen": -166.27073669433594, "logps/rejected": -197.65635681152344, "loss": 0.5851, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1234867572784424, "rewards/margins": 0.36823463439941406, "rewards/rejected": -1.4917213916778564, "step": 13900 }, { "epoch": 2.3966230186078565, "grad_norm": 35.33543395996094, "learning_rate": 1.1825515706824563e-08, "logits/chosen": -2.284083127975464, "logits/rejected": -2.260112762451172, "logps/chosen": -166.86380004882812, "logps/rejected": -190.37025451660156, "loss": 0.6052, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0944288969039917, "rewards/margins": 0.29038792848587036, "rewards/rejected": -1.3848168849945068, "step": 13910 }, { "epoch": 2.3983459682977255, "grad_norm": 43.38460922241211, "learning_rate": 1.1760854006417848e-08, "logits/chosen": -2.335136890411377, "logits/rejected": -2.299079418182373, "logps/chosen": -177.9640350341797, "logps/rejected": -202.99925231933594, "loss": 0.5887, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.1683671474456787, "rewards/margins": 0.34926632046699524, "rewards/rejected": -1.5176336765289307, "step": 13920 }, { "epoch": 2.400068917987595, "grad_norm": 32.38801574707031, "learning_rate": 1.1696346004568597e-08, "logits/chosen": -2.2909762859344482, "logits/rejected": -2.268258571624756, "logps/chosen": -163.80203247070312, "logps/rejected": -187.72994995117188, "loss": 0.6266, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.1165748834609985, "rewards/margins": 0.25797998905181885, "rewards/rejected": -1.3745548725128174, "step": 13930 }, { "epoch": 2.401791867677464, "grad_norm": 28.899768829345703, "learning_rate": 1.1631991960560494e-08, "logits/chosen": -2.264949083328247, "logits/rejected": -2.2347309589385986, "logps/chosen": -162.36122131347656, "logps/rejected": -207.45498657226562, "loss": 0.5682, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.0848444700241089, "rewards/margins": 0.43307724595069885, "rewards/rejected": -1.5179216861724854, "step": 13940 }, { "epoch": 2.403514817367333, "grad_norm": 36.21199035644531, "learning_rate": 1.1567792133058418e-08, "logits/chosen": -2.3258869647979736, "logits/rejected": -2.300102949142456, "logps/chosen": -171.20603942871094, "logps/rejected": -201.0448455810547, "loss": 0.618, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.1686848402023315, "rewards/margins": 0.3188968300819397, "rewards/rejected": -1.487581729888916, "step": 13950 }, { "epoch": 2.405237767057202, "grad_norm": 32.96730041503906, "learning_rate": 1.1503746780107394e-08, "logits/chosen": -2.192915439605713, "logits/rejected": -2.1882834434509277, "logps/chosen": -172.82211303710938, "logps/rejected": -193.9451141357422, "loss": 0.6298, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.1832388639450073, "rewards/margins": 0.23356863856315613, "rewards/rejected": -1.4168074131011963, "step": 13960 }, { "epoch": 2.406960716747071, "grad_norm": 28.457719802856445, "learning_rate": 1.1439856159131528e-08, "logits/chosen": -2.2863781452178955, "logits/rejected": -2.2556209564208984, "logps/chosen": -173.32937622070312, "logps/rejected": -203.6952667236328, "loss": 0.5837, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.1314260959625244, "rewards/margins": 0.3794175982475281, "rewards/rejected": -1.5108436346054077, "step": 13970 }, { "epoch": 2.40868366643694, "grad_norm": 30.23308753967285, "learning_rate": 1.1376120526932987e-08, "logits/chosen": -2.259094476699829, "logits/rejected": -2.244642734527588, "logps/chosen": -175.16502380371094, "logps/rejected": -211.3958740234375, "loss": 0.5767, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.1950197219848633, "rewards/margins": 0.38072091341018677, "rewards/rejected": -1.5757405757904053, "step": 13980 }, { "epoch": 2.410406616126809, "grad_norm": 41.41474914550781, "learning_rate": 1.1312540139691012e-08, "logits/chosen": -2.288893222808838, "logits/rejected": -2.2532081604003906, "logps/chosen": -172.82003784179688, "logps/rejected": -202.6908721923828, "loss": 0.5953, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1435893774032593, "rewards/margins": 0.3480374217033386, "rewards/rejected": -1.4916269779205322, "step": 13990 }, { "epoch": 2.412129565816678, "grad_norm": 39.095706939697266, "learning_rate": 1.1249115252960845e-08, "logits/chosen": -2.2197909355163574, "logits/rejected": -2.1884961128234863, "logps/chosen": -172.7142333984375, "logps/rejected": -213.0373992919922, "loss": 0.5716, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2112494707107544, "rewards/margins": 0.39283236861228943, "rewards/rejected": -1.6040818691253662, "step": 14000 }, { "epoch": 2.412129565816678, "eval_logits/chosen": -2.3650660514831543, "eval_logits/rejected": -2.353348970413208, "eval_logps/chosen": -162.07864379882812, "eval_logps/rejected": -182.53675842285156, "eval_loss": 0.6510778069496155, "eval_rewards/accuracies": 0.5940985083580017, "eval_rewards/chosen": -1.0306316614151, "eval_rewards/margins": 0.16723977029323578, "eval_rewards/rejected": -1.197871446609497, "eval_runtime": 384.5746, "eval_samples_per_second": 11.192, "eval_steps_per_second": 1.399, "step": 14000 }, { "epoch": 2.413852515506547, "grad_norm": 31.06534767150879, "learning_rate": 1.1185846121672677e-08, "logits/chosen": -2.2538845539093018, "logits/rejected": -2.2423095703125, "logps/chosen": -177.99278259277344, "logps/rejected": -201.70272827148438, "loss": 0.6277, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.2160732746124268, "rewards/margins": 0.27514421939849854, "rewards/rejected": -1.4912176132202148, "step": 14010 }, { "epoch": 2.415575465196416, "grad_norm": 32.03325271606445, "learning_rate": 1.1122733000130697e-08, "logits/chosen": -2.268951892852783, "logits/rejected": -2.257387638092041, "logps/chosen": -176.60130310058594, "logps/rejected": -198.1162567138672, "loss": 0.6351, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.2103415727615356, "rewards/margins": 0.26508891582489014, "rewards/rejected": -1.4754304885864258, "step": 14020 }, { "epoch": 2.4172984148862855, "grad_norm": 37.38322448730469, "learning_rate": 1.1059776142011995e-08, "logits/chosen": -2.2546093463897705, "logits/rejected": -2.2319176197052, "logps/chosen": -181.75502014160156, "logps/rejected": -207.01553344726562, "loss": 0.6094, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2745532989501953, "rewards/margins": 0.3088175356388092, "rewards/rejected": -1.583370566368103, "step": 14030 }, { "epoch": 2.4190213645761545, "grad_norm": 36.754966735839844, "learning_rate": 1.0996975800365577e-08, "logits/chosen": -2.2602710723876953, "logits/rejected": -2.2106664180755615, "logps/chosen": -176.01315307617188, "logps/rejected": -209.2595672607422, "loss": 0.5703, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.1861064434051514, "rewards/margins": 0.4158700406551361, "rewards/rejected": -1.6019766330718994, "step": 14040 }, { "epoch": 2.4207443142660234, "grad_norm": 31.584936141967773, "learning_rate": 1.0934332227611365e-08, "logits/chosen": -2.2989883422851562, "logits/rejected": -2.2649173736572266, "logps/chosen": -166.5214385986328, "logps/rejected": -189.67869567871094, "loss": 0.6205, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1098644733428955, "rewards/margins": 0.28757235407829285, "rewards/rejected": -1.3974368572235107, "step": 14050 }, { "epoch": 2.4224672639558924, "grad_norm": 26.96416664123535, "learning_rate": 1.0871845675539166e-08, "logits/chosen": -2.318040609359741, "logits/rejected": -2.2748923301696777, "logps/chosen": -171.8230438232422, "logps/rejected": -212.2820281982422, "loss": 0.5522, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.1478931903839111, "rewards/margins": 0.4663833677768707, "rewards/rejected": -1.6142765283584595, "step": 14060 }, { "epoch": 2.4241902136457614, "grad_norm": 33.14677429199219, "learning_rate": 1.0809516395307644e-08, "logits/chosen": -2.2845077514648438, "logits/rejected": -2.2546377182006836, "logps/chosen": -177.21847534179688, "logps/rejected": -213.17330932617188, "loss": 0.5825, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.2126785516738892, "rewards/margins": 0.3783145546913147, "rewards/rejected": -1.5909929275512695, "step": 14070 }, { "epoch": 2.425913163335631, "grad_norm": 27.923873901367188, "learning_rate": 1.07473446374433e-08, "logits/chosen": -2.2292556762695312, "logits/rejected": -2.1985204219818115, "logps/chosen": -191.96128845214844, "logps/rejected": -218.0422821044922, "loss": 0.6239, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.3302682638168335, "rewards/margins": 0.32979243993759155, "rewards/rejected": -1.6600606441497803, "step": 14080 }, { "epoch": 2.4276361130254998, "grad_norm": 34.443580627441406, "learning_rate": 1.0685330651839542e-08, "logits/chosen": -2.264598846435547, "logits/rejected": -2.2283425331115723, "logps/chosen": -173.8609161376953, "logps/rejected": -199.7482147216797, "loss": 0.6073, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1643470525741577, "rewards/margins": 0.32152336835861206, "rewards/rejected": -1.485870361328125, "step": 14090 }, { "epoch": 2.4293590627153687, "grad_norm": 34.7966423034668, "learning_rate": 1.0623474687755607e-08, "logits/chosen": -2.3133583068847656, "logits/rejected": -2.2820286750793457, "logps/chosen": -178.41860961914062, "logps/rejected": -213.8348388671875, "loss": 0.5781, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.1884939670562744, "rewards/margins": 0.4087247848510742, "rewards/rejected": -1.5972187519073486, "step": 14100 }, { "epoch": 2.4310820124052377, "grad_norm": 36.908451080322266, "learning_rate": 1.0561776993815563e-08, "logits/chosen": -2.309270143508911, "logits/rejected": -2.295470714569092, "logps/chosen": -174.90457153320312, "logps/rejected": -204.77542114257812, "loss": 0.6114, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2425603866577148, "rewards/margins": 0.30024009943008423, "rewards/rejected": -1.5428004264831543, "step": 14110 }, { "epoch": 2.4328049620951067, "grad_norm": 47.489341735839844, "learning_rate": 1.0500237818007318e-08, "logits/chosen": -2.2908339500427246, "logits/rejected": -2.26305890083313, "logps/chosen": -173.8285369873047, "logps/rejected": -197.50125122070312, "loss": 0.6355, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.2086008787155151, "rewards/margins": 0.24783003330230713, "rewards/rejected": -1.4564310312271118, "step": 14120 }, { "epoch": 2.4345279117849756, "grad_norm": 43.21564483642578, "learning_rate": 1.0438857407681683e-08, "logits/chosen": -2.3160407543182373, "logits/rejected": -2.2969260215759277, "logps/chosen": -175.17642211914062, "logps/rejected": -190.04049682617188, "loss": 0.6502, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.1963536739349365, "rewards/margins": 0.1892521232366562, "rewards/rejected": -1.3856055736541748, "step": 14130 }, { "epoch": 2.436250861474845, "grad_norm": 27.080957412719727, "learning_rate": 1.0377636009551271e-08, "logits/chosen": -2.358844041824341, "logits/rejected": -2.3329920768737793, "logps/chosen": -178.12252807617188, "logps/rejected": -206.2164306640625, "loss": 0.6192, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.2507833242416382, "rewards/margins": 0.30481910705566406, "rewards/rejected": -1.5556023120880127, "step": 14140 }, { "epoch": 2.437973811164714, "grad_norm": 38.455928802490234, "learning_rate": 1.0316573869689605e-08, "logits/chosen": -2.3427817821502686, "logits/rejected": -2.3219618797302246, "logps/chosen": -176.49655151367188, "logps/rejected": -207.5787353515625, "loss": 0.6135, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.2148641347885132, "rewards/margins": 0.3144477903842926, "rewards/rejected": -1.529312014579773, "step": 14150 }, { "epoch": 2.439696760854583, "grad_norm": 38.341854095458984, "learning_rate": 1.025567123353004e-08, "logits/chosen": -2.289370059967041, "logits/rejected": -2.265319347381592, "logps/chosen": -171.89669799804688, "logps/rejected": -205.0989532470703, "loss": 0.6089, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.1749041080474854, "rewards/margins": 0.3178651034832001, "rewards/rejected": -1.4927692413330078, "step": 14160 }, { "epoch": 2.441419710544452, "grad_norm": 36.23020553588867, "learning_rate": 1.0194928345864867e-08, "logits/chosen": -2.2404868602752686, "logits/rejected": -2.211150646209717, "logps/chosen": -172.58767700195312, "logps/rejected": -199.41384887695312, "loss": 0.6132, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.1612130403518677, "rewards/margins": 0.2983928918838501, "rewards/rejected": -1.4596058130264282, "step": 14170 }, { "epoch": 2.4431426602343214, "grad_norm": 44.42694854736328, "learning_rate": 1.0134345450844245e-08, "logits/chosen": -2.301393985748291, "logits/rejected": -2.264596462249756, "logps/chosen": -177.45223999023438, "logps/rejected": -199.0797882080078, "loss": 0.6199, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.1935441493988037, "rewards/margins": 0.28957706689834595, "rewards/rejected": -1.4831211566925049, "step": 14180 }, { "epoch": 2.4448656099241903, "grad_norm": 30.67015838623047, "learning_rate": 1.0073922791975276e-08, "logits/chosen": -2.320544719696045, "logits/rejected": -2.30179500579834, "logps/chosen": -181.50807189941406, "logps/rejected": -204.28504943847656, "loss": 0.6171, "rewards/accuracies": 0.65625, "rewards/chosen": -1.2201424837112427, "rewards/margins": 0.28387171030044556, "rewards/rejected": -1.504014253616333, "step": 14190 }, { "epoch": 2.4465885596140593, "grad_norm": 32.7570686340332, "learning_rate": 1.0013660612121034e-08, "logits/chosen": -2.2290682792663574, "logits/rejected": -2.203991651535034, "logps/chosen": -166.692626953125, "logps/rejected": -206.97122192382812, "loss": 0.5601, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.1180620193481445, "rewards/margins": 0.41545921564102173, "rewards/rejected": -1.533521294593811, "step": 14200 }, { "epoch": 2.4483115093039283, "grad_norm": 33.343238830566406, "learning_rate": 9.953559153499509e-09, "logits/chosen": -2.3083038330078125, "logits/rejected": -2.285892963409424, "logps/chosen": -172.07347106933594, "logps/rejected": -201.3411102294922, "loss": 0.6103, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.1820226907730103, "rewards/margins": 0.31112828850746155, "rewards/rejected": -1.4931509494781494, "step": 14210 }, { "epoch": 2.4500344589937972, "grad_norm": 31.730648040771484, "learning_rate": 9.893618657682712e-09, "logits/chosen": -2.359067440032959, "logits/rejected": -2.3279736042022705, "logps/chosen": -176.6471710205078, "logps/rejected": -201.74581909179688, "loss": 0.6194, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.1735197305679321, "rewards/margins": 0.298747718334198, "rewards/rejected": -1.472267508506775, "step": 14220 }, { "epoch": 2.451757408683666, "grad_norm": 34.798095703125, "learning_rate": 9.833839365595686e-09, "logits/chosen": -2.236711025238037, "logits/rejected": -2.1991546154022217, "logps/chosen": -172.93850708007812, "logps/rejected": -207.1778564453125, "loss": 0.5772, "rewards/accuracies": 0.71875, "rewards/chosen": -1.155918836593628, "rewards/margins": 0.40659481287002563, "rewards/rejected": -1.5625135898590088, "step": 14230 }, { "epoch": 2.4534803583735356, "grad_norm": 35.521636962890625, "learning_rate": 9.774221517515563e-09, "logits/chosen": -2.2360782623291016, "logits/rejected": -2.2201473712921143, "logps/chosen": -174.67828369140625, "logps/rejected": -210.30056762695312, "loss": 0.5886, "rewards/accuracies": 0.71875, "rewards/chosen": -1.1995961666107178, "rewards/margins": 0.3683069348335266, "rewards/rejected": -1.5679031610488892, "step": 14240 }, { "epoch": 2.4552033080634046, "grad_norm": 42.19344711303711, "learning_rate": 9.71476535307047e-09, "logits/chosen": -2.2473440170288086, "logits/rejected": -2.2286272048950195, "logps/chosen": -177.55215454101562, "logps/rejected": -201.4174346923828, "loss": 0.6289, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.2225806713104248, "rewards/margins": 0.2655474543571472, "rewards/rejected": -1.4881280660629272, "step": 14250 }, { "epoch": 2.4569262577532736, "grad_norm": 61.28897476196289, "learning_rate": 9.65547111123875e-09, "logits/chosen": -2.3363399505615234, "logits/rejected": -2.2926130294799805, "logps/chosen": -177.8993682861328, "logps/rejected": -199.60348510742188, "loss": 0.6195, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.2001395225524902, "rewards/margins": 0.3035646080970764, "rewards/rejected": -1.503704309463501, "step": 14260 }, { "epoch": 2.4586492074431425, "grad_norm": 23.33378791809082, "learning_rate": 9.596339030347906e-09, "logits/chosen": -2.302767038345337, "logits/rejected": -2.270862340927124, "logps/chosen": -175.49441528320312, "logps/rejected": -213.15371704101562, "loss": 0.5656, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.200317144393921, "rewards/margins": 0.42555421590805054, "rewards/rejected": -1.6258713006973267, "step": 14270 }, { "epoch": 2.460372157133012, "grad_norm": 31.418357849121094, "learning_rate": 9.537369348073598e-09, "logits/chosen": -2.235307216644287, "logits/rejected": -2.2249467372894287, "logps/chosen": -173.64279174804688, "logps/rejected": -194.50186157226562, "loss": 0.6545, "rewards/accuracies": 0.625, "rewards/chosen": -1.2113587856292725, "rewards/margins": 0.2167220413684845, "rewards/rejected": -1.4280807971954346, "step": 14280 }, { "epoch": 2.462095106822881, "grad_norm": 32.423194885253906, "learning_rate": 9.478562301438809e-09, "logits/chosen": -2.2253098487854004, "logits/rejected": -2.193913459777832, "logps/chosen": -187.88681030273438, "logps/rejected": -204.9330291748047, "loss": 0.6332, "rewards/accuracies": 0.6875, "rewards/chosen": -1.266350507736206, "rewards/margins": 0.26731666922569275, "rewards/rejected": -1.5336672067642212, "step": 14290 }, { "epoch": 2.46381805651275, "grad_norm": 32.8293342590332, "learning_rate": 9.419918126812748e-09, "logits/chosen": -2.347121238708496, "logits/rejected": -2.325183868408203, "logps/chosen": -163.93734741210938, "logps/rejected": -206.36135864257812, "loss": 0.5616, "rewards/accuracies": 0.71875, "rewards/chosen": -1.0888183116912842, "rewards/margins": 0.4189375340938568, "rewards/rejected": -1.507755994796753, "step": 14300 }, { "epoch": 2.465541006202619, "grad_norm": 32.74689865112305, "learning_rate": 9.361437059910055e-09, "logits/chosen": -2.2703864574432373, "logits/rejected": -2.243338108062744, "logps/chosen": -176.22911071777344, "logps/rejected": -205.7763214111328, "loss": 0.6056, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.216325044631958, "rewards/margins": 0.33054304122924805, "rewards/rejected": -1.546868085861206, "step": 14310 }, { "epoch": 2.467263955892488, "grad_norm": 30.113887786865234, "learning_rate": 9.303119335789705e-09, "logits/chosen": -2.255998134613037, "logits/rejected": -2.2358832359313965, "logps/chosen": -167.65725708007812, "logps/rejected": -203.99642944335938, "loss": 0.5832, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.1159486770629883, "rewards/margins": 0.37771984934806824, "rewards/rejected": -1.493668556213379, "step": 14320 }, { "epoch": 2.468986905582357, "grad_norm": 43.85342788696289, "learning_rate": 9.244965188854186e-09, "logits/chosen": -2.3707833290100098, "logits/rejected": -2.330193042755127, "logps/chosen": -178.6895294189453, "logps/rejected": -234.42626953125, "loss": 0.5207, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.2279475927352905, "rewards/margins": 0.5815509557723999, "rewards/rejected": -1.8094985485076904, "step": 14330 }, { "epoch": 2.470709855272226, "grad_norm": 40.238525390625, "learning_rate": 9.186974852848467e-09, "logits/chosen": -2.2677359580993652, "logits/rejected": -2.2470545768737793, "logps/chosen": -178.6530303955078, "logps/rejected": -221.50711059570312, "loss": 0.5663, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.2290410995483398, "rewards/margins": 0.4284849166870117, "rewards/rejected": -1.6575257778167725, "step": 14340 }, { "epoch": 2.472432804962095, "grad_norm": 29.78543472290039, "learning_rate": 9.129148560859102e-09, "logits/chosen": -2.3092570304870605, "logits/rejected": -2.286848306655884, "logps/chosen": -175.47021484375, "logps/rejected": -209.6531524658203, "loss": 0.5908, "rewards/accuracies": 0.65625, "rewards/chosen": -1.2112047672271729, "rewards/margins": 0.35105112195014954, "rewards/rejected": -1.562256097793579, "step": 14350 }, { "epoch": 2.474155754651964, "grad_norm": 46.5627555847168, "learning_rate": 9.0714865453133e-09, "logits/chosen": -2.2718825340270996, "logits/rejected": -2.2470686435699463, "logps/chosen": -187.2011260986328, "logps/rejected": -211.96572875976562, "loss": 0.6279, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.307032823562622, "rewards/margins": 0.2946831285953522, "rewards/rejected": -1.6017158031463623, "step": 14360 }, { "epoch": 2.475878704341833, "grad_norm": 40.91022491455078, "learning_rate": 9.013989037977977e-09, "logits/chosen": -2.29414701461792, "logits/rejected": -2.2601523399353027, "logps/chosen": -181.77938842773438, "logps/rejected": -213.543701171875, "loss": 0.5905, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2551586627960205, "rewards/margins": 0.3801318109035492, "rewards/rejected": -1.635290503501892, "step": 14370 }, { "epoch": 2.4776016540317025, "grad_norm": 32.45248794555664, "learning_rate": 8.956656269958812e-09, "logits/chosen": -2.358607769012451, "logits/rejected": -2.3335189819335938, "logps/chosen": -181.627685546875, "logps/rejected": -211.6080322265625, "loss": 0.5967, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.2830419540405273, "rewards/margins": 0.32768845558166504, "rewards/rejected": -1.610730528831482, "step": 14380 }, { "epoch": 2.4793246037215715, "grad_norm": 39.58088302612305, "learning_rate": 8.899488471699312e-09, "logits/chosen": -2.2676587104797363, "logits/rejected": -2.2427353858947754, "logps/chosen": -183.3169708251953, "logps/rejected": -212.7167205810547, "loss": 0.6191, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.300159215927124, "rewards/margins": 0.3232465386390686, "rewards/rejected": -1.6234058141708374, "step": 14390 }, { "epoch": 2.4810475534114405, "grad_norm": 38.66813659667969, "learning_rate": 8.842485872979944e-09, "logits/chosen": -2.309758186340332, "logits/rejected": -2.2829182147979736, "logps/chosen": -185.3292999267578, "logps/rejected": -210.6401824951172, "loss": 0.6078, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.2728933095932007, "rewards/margins": 0.3427709639072418, "rewards/rejected": -1.6156641244888306, "step": 14400 }, { "epoch": 2.4810475534114405, "eval_logits/chosen": -2.3539810180664062, "eval_logits/rejected": -2.3417115211486816, "eval_logps/chosen": -167.90591430664062, "eval_logps/rejected": -189.16842651367188, "eval_loss": 0.6505683064460754, "eval_rewards/accuracies": 0.6003717184066772, "eval_rewards/chosen": -1.0889043807983398, "eval_rewards/margins": 0.17528373003005981, "eval_rewards/rejected": -1.2641881704330444, "eval_runtime": 384.7222, "eval_samples_per_second": 11.187, "eval_steps_per_second": 1.398, "step": 14400 }, { "epoch": 2.4827705031013094, "grad_norm": 30.45815086364746, "learning_rate": 8.785648702917164e-09, "logits/chosen": -2.274064064025879, "logits/rejected": -2.252723217010498, "logps/chosen": -175.93069458007812, "logps/rejected": -208.1988983154297, "loss": 0.6014, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2411539554595947, "rewards/margins": 0.3212995231151581, "rewards/rejected": -1.5624535083770752, "step": 14410 }, { "epoch": 2.4844934527911784, "grad_norm": 33.634010314941406, "learning_rate": 8.728977189962484e-09, "logits/chosen": -2.2990376949310303, "logits/rejected": -2.279719591140747, "logps/chosen": -187.06346130371094, "logps/rejected": -211.91421508789062, "loss": 0.6467, "rewards/accuracies": 0.65625, "rewards/chosen": -1.3428099155426025, "rewards/margins": 0.26273879408836365, "rewards/rejected": -1.6055485010147095, "step": 14420 }, { "epoch": 2.4862164024810474, "grad_norm": 38.20600891113281, "learning_rate": 8.672471561901563e-09, "logits/chosen": -2.276118278503418, "logits/rejected": -2.249997615814209, "logps/chosen": -179.34445190429688, "logps/rejected": -213.3744659423828, "loss": 0.5951, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2632334232330322, "rewards/margins": 0.37510672211647034, "rewards/rejected": -1.6383403539657593, "step": 14430 }, { "epoch": 2.4879393521709168, "grad_norm": 30.55494499206543, "learning_rate": 8.616132045853341e-09, "logits/chosen": -2.274087429046631, "logits/rejected": -2.2450904846191406, "logps/chosen": -172.67007446289062, "logps/rejected": -214.5222930908203, "loss": 0.5597, "rewards/accuracies": 0.71875, "rewards/chosen": -1.1785537004470825, "rewards/margins": 0.43323737382888794, "rewards/rejected": -1.6117912530899048, "step": 14440 }, { "epoch": 2.4896623018607857, "grad_norm": 44.838504791259766, "learning_rate": 8.559958868269058e-09, "logits/chosen": -2.2660040855407715, "logits/rejected": -2.2459208965301514, "logps/chosen": -186.3092803955078, "logps/rejected": -211.8496856689453, "loss": 0.6032, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.3006166219711304, "rewards/margins": 0.3049980401992798, "rewards/rejected": -1.6056146621704102, "step": 14450 }, { "epoch": 2.4913852515506547, "grad_norm": 27.789527893066406, "learning_rate": 8.50395225493138e-09, "logits/chosen": -2.2741141319274902, "logits/rejected": -2.2550957202911377, "logps/chosen": -183.6599578857422, "logps/rejected": -211.07876586914062, "loss": 0.6189, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.291940689086914, "rewards/margins": 0.29155492782592773, "rewards/rejected": -1.5834957361221313, "step": 14460 }, { "epoch": 2.4931082012405237, "grad_norm": 39.434593200683594, "learning_rate": 8.448112430953502e-09, "logits/chosen": -2.379971981048584, "logits/rejected": -2.3414828777313232, "logps/chosen": -183.31814575195312, "logps/rejected": -212.9709930419922, "loss": 0.5712, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.234494686126709, "rewards/margins": 0.4042988419532776, "rewards/rejected": -1.6387935876846313, "step": 14470 }, { "epoch": 2.4948311509303926, "grad_norm": 33.748207092285156, "learning_rate": 8.392439620778197e-09, "logits/chosen": -2.288809299468994, "logits/rejected": -2.280153274536133, "logps/chosen": -183.03431701660156, "logps/rejected": -219.2383575439453, "loss": 0.5833, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2870408296585083, "rewards/margins": 0.3628956973552704, "rewards/rejected": -1.6499366760253906, "step": 14480 }, { "epoch": 2.496554100620262, "grad_norm": 28.285213470458984, "learning_rate": 8.336934048176935e-09, "logits/chosen": -2.2701704502105713, "logits/rejected": -2.2498607635498047, "logps/chosen": -178.55081176757812, "logps/rejected": -212.88577270507812, "loss": 0.601, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.271669626235962, "rewards/margins": 0.3485538363456726, "rewards/rejected": -1.6202234029769897, "step": 14490 }, { "epoch": 2.498277050310131, "grad_norm": 31.462501525878906, "learning_rate": 8.281595936249031e-09, "logits/chosen": -2.2780776023864746, "logits/rejected": -2.239255905151367, "logps/chosen": -178.77101135253906, "logps/rejected": -212.22274780273438, "loss": 0.5806, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.2120325565338135, "rewards/margins": 0.37745457887649536, "rewards/rejected": -1.5894873142242432, "step": 14500 }, { "epoch": 2.5, "grad_norm": 35.11719512939453, "learning_rate": 8.226425507420687e-09, "logits/chosen": -2.3147332668304443, "logits/rejected": -2.2924513816833496, "logps/chosen": -186.06500244140625, "logps/rejected": -206.14431762695312, "loss": 0.6319, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.296118140220642, "rewards/margins": 0.25911945104599, "rewards/rejected": -1.5552375316619873, "step": 14510 }, { "epoch": 2.501722949689869, "grad_norm": 35.80636978149414, "learning_rate": 8.171422983444116e-09, "logits/chosen": -2.277247190475464, "logits/rejected": -2.2509655952453613, "logps/chosen": -183.95880126953125, "logps/rejected": -209.32876586914062, "loss": 0.6284, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.271933674812317, "rewards/margins": 0.28048646450042725, "rewards/rejected": -1.552419900894165, "step": 14520 }, { "epoch": 2.503445899379738, "grad_norm": 40.243621826171875, "learning_rate": 8.11658858539664e-09, "logits/chosen": -2.3021738529205322, "logits/rejected": -2.2769148349761963, "logps/chosen": -186.34942626953125, "logps/rejected": -218.3411102294922, "loss": 0.5939, "rewards/accuracies": 0.71875, "rewards/chosen": -1.2804752588272095, "rewards/margins": 0.3546562194824219, "rewards/rejected": -1.6351312398910522, "step": 14530 }, { "epoch": 2.505168849069607, "grad_norm": 25.926076889038086, "learning_rate": 8.061922533679838e-09, "logits/chosen": -2.270775556564331, "logits/rejected": -2.243312358856201, "logps/chosen": -180.53915405273438, "logps/rejected": -215.7350311279297, "loss": 0.586, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.251887559890747, "rewards/margins": 0.39002978801727295, "rewards/rejected": -1.6419174671173096, "step": 14540 }, { "epoch": 2.5068917987594763, "grad_norm": 29.957273483276367, "learning_rate": 8.007425048018652e-09, "logits/chosen": -2.3033363819122314, "logits/rejected": -2.265852689743042, "logps/chosen": -180.44415283203125, "logps/rejected": -211.4573211669922, "loss": 0.5961, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.2219346761703491, "rewards/margins": 0.3654627501964569, "rewards/rejected": -1.5873974561691284, "step": 14550 }, { "epoch": 2.5086147484493453, "grad_norm": 29.57919692993164, "learning_rate": 7.953096347460442e-09, "logits/chosen": -2.264911651611328, "logits/rejected": -2.2368927001953125, "logps/chosen": -177.29637145996094, "logps/rejected": -216.89492797851562, "loss": 0.5777, "rewards/accuracies": 0.71875, "rewards/chosen": -1.2392809391021729, "rewards/margins": 0.3909870386123657, "rewards/rejected": -1.630267858505249, "step": 14560 }, { "epoch": 2.5103376981392143, "grad_norm": 40.99264907836914, "learning_rate": 7.898936650374177e-09, "logits/chosen": -2.2209935188293457, "logits/rejected": -2.211142063140869, "logps/chosen": -179.51312255859375, "logps/rejected": -211.30606079101562, "loss": 0.6259, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.2609214782714844, "rewards/margins": 0.3168264925479889, "rewards/rejected": -1.5777479410171509, "step": 14570 }, { "epoch": 2.5120606478290832, "grad_norm": 36.08527374267578, "learning_rate": 7.844946174449552e-09, "logits/chosen": -2.3129754066467285, "logits/rejected": -2.2868075370788574, "logps/chosen": -171.59011840820312, "logps/rejected": -204.12374877929688, "loss": 0.6058, "rewards/accuracies": 0.65625, "rewards/chosen": -1.209629774093628, "rewards/margins": 0.3304891884326935, "rewards/rejected": -1.5401188135147095, "step": 14580 }, { "epoch": 2.5137835975189526, "grad_norm": 44.74150085449219, "learning_rate": 7.791125136696053e-09, "logits/chosen": -2.2585346698760986, "logits/rejected": -2.2415921688079834, "logps/chosen": -172.50912475585938, "logps/rejected": -202.05429077148438, "loss": 0.6111, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.1867698431015015, "rewards/margins": 0.31644508242607117, "rewards/rejected": -1.5032150745391846, "step": 14590 }, { "epoch": 2.5155065472088216, "grad_norm": 32.48900604248047, "learning_rate": 7.737473753442175e-09, "logits/chosen": -2.277561664581299, "logits/rejected": -2.23915433883667, "logps/chosen": -178.53311157226562, "logps/rejected": -214.3824005126953, "loss": 0.5669, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.2278555631637573, "rewards/margins": 0.4098914563655853, "rewards/rejected": -1.6377471685409546, "step": 14600 }, { "epoch": 2.5172294968986906, "grad_norm": 58.75372314453125, "learning_rate": 7.683992240334442e-09, "logits/chosen": -2.257072687149048, "logits/rejected": -2.2232871055603027, "logps/chosen": -183.6359100341797, "logps/rejected": -207.38363647460938, "loss": 0.6014, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2601312398910522, "rewards/margins": 0.3229925036430359, "rewards/rejected": -1.5831239223480225, "step": 14610 }, { "epoch": 2.5189524465885595, "grad_norm": 32.79248809814453, "learning_rate": 7.630680812336666e-09, "logits/chosen": -2.2399673461914062, "logits/rejected": -2.2240517139434814, "logps/chosen": -181.73648071289062, "logps/rejected": -216.5868682861328, "loss": 0.6075, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.2946263551712036, "rewards/margins": 0.34417837858200073, "rewards/rejected": -1.6388046741485596, "step": 14620 }, { "epoch": 2.5206753962784285, "grad_norm": 35.11437225341797, "learning_rate": 7.577539683728963e-09, "logits/chosen": -2.203411817550659, "logits/rejected": -2.1906564235687256, "logps/chosen": -186.56008911132812, "logps/rejected": -211.3187713623047, "loss": 0.6224, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.3181005716323853, "rewards/margins": 0.27355271577835083, "rewards/rejected": -1.5916532278060913, "step": 14630 }, { "epoch": 2.5223983459682975, "grad_norm": 27.418834686279297, "learning_rate": 7.524569068106984e-09, "logits/chosen": -2.2308120727539062, "logits/rejected": -2.213721513748169, "logps/chosen": -178.981201171875, "logps/rejected": -213.73458862304688, "loss": 0.5998, "rewards/accuracies": 0.65625, "rewards/chosen": -1.2649548053741455, "rewards/margins": 0.3553503453731537, "rewards/rejected": -1.620305061340332, "step": 14640 }, { "epoch": 2.524121295658167, "grad_norm": 60.733726501464844, "learning_rate": 7.471769178381032e-09, "logits/chosen": -2.2828164100646973, "logits/rejected": -2.261434316635132, "logps/chosen": -192.31106567382812, "logps/rejected": -207.40194702148438, "loss": 0.6721, "rewards/accuracies": 0.625, "rewards/chosen": -1.3739750385284424, "rewards/margins": 0.20933596789836884, "rewards/rejected": -1.583310842514038, "step": 14650 }, { "epoch": 2.525844245348036, "grad_norm": 32.428741455078125, "learning_rate": 7.419140226775117e-09, "logits/chosen": -2.2503812313079834, "logits/rejected": -2.2110300064086914, "logps/chosen": -178.69284057617188, "logps/rejected": -217.0933380126953, "loss": 0.5539, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.2397966384887695, "rewards/margins": 0.4224214553833008, "rewards/rejected": -1.6622183322906494, "step": 14660 }, { "epoch": 2.527567195037905, "grad_norm": 27.9921932220459, "learning_rate": 7.366682424826259e-09, "logits/chosen": -2.202364683151245, "logits/rejected": -2.177262783050537, "logps/chosen": -171.57933044433594, "logps/rejected": -207.56698608398438, "loss": 0.5828, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.1934975385665894, "rewards/margins": 0.36846160888671875, "rewards/rejected": -1.561959147453308, "step": 14670 }, { "epoch": 2.529290144727774, "grad_norm": 43.02341079711914, "learning_rate": 7.314395983383548e-09, "logits/chosen": -2.262956142425537, "logits/rejected": -2.241204261779785, "logps/chosen": -173.86380004882812, "logps/rejected": -207.066162109375, "loss": 0.6014, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.2458035945892334, "rewards/margins": 0.30617329478263855, "rewards/rejected": -1.5519767999649048, "step": 14680 }, { "epoch": 2.531013094417643, "grad_norm": 40.94485092163086, "learning_rate": 7.262281112607266e-09, "logits/chosen": -2.2700066566467285, "logits/rejected": -2.2460155487060547, "logps/chosen": -189.86940002441406, "logps/rejected": -227.3620147705078, "loss": 0.5994, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3717550039291382, "rewards/margins": 0.37433767318725586, "rewards/rejected": -1.7460925579071045, "step": 14690 }, { "epoch": 2.532736044107512, "grad_norm": 28.846698760986328, "learning_rate": 7.210338021968099e-09, "logits/chosen": -2.333097219467163, "logits/rejected": -2.307025194168091, "logps/chosen": -189.25680541992188, "logps/rejected": -235.3755340576172, "loss": 0.572, "rewards/accuracies": 0.71875, "rewards/chosen": -1.3376073837280273, "rewards/margins": 0.45716962218284607, "rewards/rejected": -1.7947769165039062, "step": 14700 }, { "epoch": 2.534458993797381, "grad_norm": 39.72157287597656, "learning_rate": 7.158566920246306e-09, "logits/chosen": -2.279195547103882, "logits/rejected": -2.2656617164611816, "logps/chosen": -181.99972534179688, "logps/rejected": -213.56253051757812, "loss": 0.6022, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.274493932723999, "rewards/margins": 0.33153611421585083, "rewards/rejected": -1.6060301065444946, "step": 14710 }, { "epoch": 2.53618194348725, "grad_norm": 40.20477294921875, "learning_rate": 7.1069680155308455e-09, "logits/chosen": -2.247303009033203, "logits/rejected": -2.2131989002227783, "logps/chosen": -189.83795166015625, "logps/rejected": -223.70590209960938, "loss": 0.5967, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.3142995834350586, "rewards/margins": 0.3924974799156189, "rewards/rejected": -1.7067972421646118, "step": 14720 }, { "epoch": 2.537904893177119, "grad_norm": 42.69450759887695, "learning_rate": 7.055541515218505e-09, "logits/chosen": -2.315809965133667, "logits/rejected": -2.2835214138031006, "logps/chosen": -178.3886260986328, "logps/rejected": -212.3560333251953, "loss": 0.5927, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.239457368850708, "rewards/margins": 0.38728493452072144, "rewards/rejected": -1.6267423629760742, "step": 14730 }, { "epoch": 2.539627842866988, "grad_norm": 36.893882751464844, "learning_rate": 7.004287626013167e-09, "logits/chosen": -2.2423672676086426, "logits/rejected": -2.233050584793091, "logps/chosen": -194.85977172851562, "logps/rejected": -223.08267211914062, "loss": 0.63, "rewards/accuracies": 0.65625, "rewards/chosen": -1.3785429000854492, "rewards/margins": 0.3103850781917572, "rewards/rejected": -1.6889280080795288, "step": 14740 }, { "epoch": 2.5413507925568575, "grad_norm": 32.512611389160156, "learning_rate": 6.9532065539248785e-09, "logits/chosen": -2.2675812244415283, "logits/rejected": -2.235931873321533, "logps/chosen": -180.07955932617188, "logps/rejected": -211.1636962890625, "loss": 0.5938, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2650864124298096, "rewards/margins": 0.3591112792491913, "rewards/rejected": -1.6241976022720337, "step": 14750 }, { "epoch": 2.5430737422467264, "grad_norm": 42.58386993408203, "learning_rate": 6.902298504269089e-09, "logits/chosen": -2.30161714553833, "logits/rejected": -2.2694239616394043, "logps/chosen": -177.73812866210938, "logps/rejected": -214.989013671875, "loss": 0.5785, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.2390029430389404, "rewards/margins": 0.3862294852733612, "rewards/rejected": -1.625232458114624, "step": 14760 }, { "epoch": 2.5447966919365954, "grad_norm": 40.47549057006836, "learning_rate": 6.851563681665778e-09, "logits/chosen": -2.292722225189209, "logits/rejected": -2.2698347568511963, "logps/chosen": -186.69671630859375, "logps/rejected": -221.0588836669922, "loss": 0.606, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.3003615140914917, "rewards/margins": 0.3530925512313843, "rewards/rejected": -1.6534541845321655, "step": 14770 }, { "epoch": 2.5465196416264644, "grad_norm": 34.78125, "learning_rate": 6.801002290038687e-09, "logits/chosen": -2.260918140411377, "logits/rejected": -2.2494137287139893, "logps/chosen": -177.1928253173828, "logps/rejected": -208.9444122314453, "loss": 0.6072, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.266132116317749, "rewards/margins": 0.3350914418697357, "rewards/rejected": -1.6012235879898071, "step": 14780 }, { "epoch": 2.548242591316334, "grad_norm": 28.724706649780273, "learning_rate": 6.750614532614446e-09, "logits/chosen": -2.3022968769073486, "logits/rejected": -2.272913694381714, "logps/chosen": -194.76040649414062, "logps/rejected": -222.46749877929688, "loss": 0.6229, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.4062250852584839, "rewards/margins": 0.3128969073295593, "rewards/rejected": -1.7191219329833984, "step": 14790 }, { "epoch": 2.5499655410062028, "grad_norm": 46.327484130859375, "learning_rate": 6.7004006119217695e-09, "logits/chosen": -2.2679340839385986, "logits/rejected": -2.2513203620910645, "logps/chosen": -189.68954467773438, "logps/rejected": -225.81494140625, "loss": 0.6112, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3462212085723877, "rewards/margins": 0.3592769503593445, "rewards/rejected": -1.7054980993270874, "step": 14800 }, { "epoch": 2.5499655410062028, "eval_logits/chosen": -2.3514130115509033, "eval_logits/rejected": -2.3389508724212646, "eval_logps/chosen": -169.68975830078125, "eval_logps/rejected": -191.403564453125, "eval_loss": 0.650035560131073, "eval_rewards/accuracies": 0.5971189737319946, "eval_rewards/chosen": -1.1067428588867188, "eval_rewards/margins": 0.17979657649993896, "eval_rewards/rejected": -1.2865396738052368, "eval_runtime": 384.8181, "eval_samples_per_second": 11.185, "eval_steps_per_second": 1.398, "step": 14800 }, { "epoch": 2.5516884906960717, "grad_norm": 36.57168197631836, "learning_rate": 6.650360729790677e-09, "logits/chosen": -2.305518627166748, "logits/rejected": -2.2647337913513184, "logps/chosen": -195.060791015625, "logps/rejected": -216.7763671875, "loss": 0.6095, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.3535444736480713, "rewards/margins": 0.3394257128238678, "rewards/rejected": -1.6929700374603271, "step": 14810 }, { "epoch": 2.5534114403859407, "grad_norm": 35.029850006103516, "learning_rate": 6.600495087351654e-09, "logits/chosen": -2.399819850921631, "logits/rejected": -2.366640567779541, "logps/chosen": -180.87078857421875, "logps/rejected": -218.5558624267578, "loss": 0.565, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2629244327545166, "rewards/margins": 0.41114553809165955, "rewards/rejected": -1.6740700006484985, "step": 14820 }, { "epoch": 2.5551343900758097, "grad_norm": 47.66520309448242, "learning_rate": 6.550803885034833e-09, "logits/chosen": -2.293966770172119, "logits/rejected": -2.2669548988342285, "logps/chosen": -184.09866333007812, "logps/rejected": -209.52108764648438, "loss": 0.6163, "rewards/accuracies": 0.625, "rewards/chosen": -1.2763346433639526, "rewards/margins": 0.3137095868587494, "rewards/rejected": -1.5900442600250244, "step": 14830 }, { "epoch": 2.5568573397656786, "grad_norm": 27.413667678833008, "learning_rate": 6.5012873225691875e-09, "logits/chosen": -2.335608959197998, "logits/rejected": -2.300196886062622, "logps/chosen": -185.65924072265625, "logps/rejected": -224.2730255126953, "loss": 0.5868, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.287444829940796, "rewards/margins": 0.3980006277561188, "rewards/rejected": -1.6854454278945923, "step": 14840 }, { "epoch": 2.558580289455548, "grad_norm": 34.01144027709961, "learning_rate": 6.451945598981784e-09, "logits/chosen": -2.2700634002685547, "logits/rejected": -2.2440428733825684, "logps/chosen": -191.76467895507812, "logps/rejected": -222.75576782226562, "loss": 0.6082, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.36762273311615, "rewards/margins": 0.32878175377845764, "rewards/rejected": -1.6964046955108643, "step": 14850 }, { "epoch": 2.560303239145417, "grad_norm": 36.0723991394043, "learning_rate": 6.4027789125969286e-09, "logits/chosen": -2.251737117767334, "logits/rejected": -2.2386040687561035, "logps/chosen": -178.60855102539062, "logps/rejected": -213.0312042236328, "loss": 0.5966, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.2298336029052734, "rewards/margins": 0.3910522758960724, "rewards/rejected": -1.6208856105804443, "step": 14860 }, { "epoch": 2.562026188835286, "grad_norm": 35.34622573852539, "learning_rate": 6.353787461035354e-09, "logits/chosen": -2.3134000301361084, "logits/rejected": -2.2874956130981445, "logps/chosen": -185.47122192382812, "logps/rejected": -213.2705841064453, "loss": 0.6239, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2700347900390625, "rewards/margins": 0.3386613726615906, "rewards/rejected": -1.6086963415145874, "step": 14870 }, { "epoch": 2.563749138525155, "grad_norm": 30.014429092407227, "learning_rate": 6.304971441213469e-09, "logits/chosen": -2.2496464252471924, "logits/rejected": -2.2386934757232666, "logps/chosen": -175.8731231689453, "logps/rejected": -212.3821563720703, "loss": 0.5841, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.218999981880188, "rewards/margins": 0.3805854916572571, "rewards/rejected": -1.5995855331420898, "step": 14880 }, { "epoch": 2.5654720882150244, "grad_norm": 31.44453239440918, "learning_rate": 6.256331049342572e-09, "logits/chosen": -2.2204737663269043, "logits/rejected": -2.2018988132476807, "logps/chosen": -182.56192016601562, "logps/rejected": -213.97006225585938, "loss": 0.6012, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2417454719543457, "rewards/margins": 0.33237189054489136, "rewards/rejected": -1.5741174221038818, "step": 14890 }, { "epoch": 2.5671950379048933, "grad_norm": 34.05234146118164, "learning_rate": 6.207866480928003e-09, "logits/chosen": -2.202576160430908, "logits/rejected": -2.170523166656494, "logps/chosen": -176.09579467773438, "logps/rejected": -209.90576171875, "loss": 0.595, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2167494297027588, "rewards/margins": 0.3754423260688782, "rewards/rejected": -1.5921916961669922, "step": 14900 }, { "epoch": 2.5689179875947623, "grad_norm": 36.98624801635742, "learning_rate": 6.1595779307684334e-09, "logits/chosen": -2.273719310760498, "logits/rejected": -2.2517528533935547, "logps/chosen": -168.31080627441406, "logps/rejected": -206.63308715820312, "loss": 0.5902, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.1679550409317017, "rewards/margins": 0.3669065833091736, "rewards/rejected": -1.53486168384552, "step": 14910 }, { "epoch": 2.5706409372846313, "grad_norm": 31.453224182128906, "learning_rate": 6.11146559295504e-09, "logits/chosen": -2.259575366973877, "logits/rejected": -2.2475197315216064, "logps/chosen": -177.86105346679688, "logps/rejected": -215.9774169921875, "loss": 0.6009, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.240506887435913, "rewards/margins": 0.37624454498291016, "rewards/rejected": -1.6167514324188232, "step": 14920 }, { "epoch": 2.5723638869745002, "grad_norm": 49.65825271606445, "learning_rate": 6.063529660870709e-09, "logits/chosen": -2.3555989265441895, "logits/rejected": -2.328648328781128, "logps/chosen": -171.65200805664062, "logps/rejected": -212.0304412841797, "loss": 0.5651, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.1696637868881226, "rewards/margins": 0.4292203485965729, "rewards/rejected": -1.5988839864730835, "step": 14930 }, { "epoch": 2.574086836664369, "grad_norm": 67.15663146972656, "learning_rate": 6.015770327189285e-09, "logits/chosen": -2.2821450233459473, "logits/rejected": -2.25254487991333, "logps/chosen": -175.12237548828125, "logps/rejected": -205.60256958007812, "loss": 0.5861, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2120064496994019, "rewards/margins": 0.35235559940338135, "rewards/rejected": -1.5643621683120728, "step": 14940 }, { "epoch": 2.575809786354238, "grad_norm": 57.31182098388672, "learning_rate": 5.968187783874806e-09, "logits/chosen": -2.35475492477417, "logits/rejected": -2.3296236991882324, "logps/chosen": -181.62442016601562, "logps/rejected": -206.0255889892578, "loss": 0.6176, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.2485582828521729, "rewards/margins": 0.3184163570404053, "rewards/rejected": -1.566974401473999, "step": 14950 }, { "epoch": 2.5775327360441076, "grad_norm": 38.26123046875, "learning_rate": 5.920782222180748e-09, "logits/chosen": -2.2558817863464355, "logits/rejected": -2.2312397956848145, "logps/chosen": -184.12100219726562, "logps/rejected": -211.8404083251953, "loss": 0.6204, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.2677650451660156, "rewards/margins": 0.3114776611328125, "rewards/rejected": -1.5792428255081177, "step": 14960 }, { "epoch": 2.5792556857339766, "grad_norm": 48.023014068603516, "learning_rate": 5.873553832649137e-09, "logits/chosen": -2.3023171424865723, "logits/rejected": -2.2697367668151855, "logps/chosen": -179.10989379882812, "logps/rejected": -214.2508087158203, "loss": 0.5974, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2418620586395264, "rewards/margins": 0.3722601532936096, "rewards/rejected": -1.6141221523284912, "step": 14970 }, { "epoch": 2.5809786354238455, "grad_norm": 44.4215202331543, "learning_rate": 5.826502805109956e-09, "logits/chosen": -2.323770046234131, "logits/rejected": -2.2797865867614746, "logps/chosen": -179.7855987548828, "logps/rejected": -224.40115356445312, "loss": 0.5421, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.2483810186386108, "rewards/margins": 0.4901328682899475, "rewards/rejected": -1.7385139465332031, "step": 14980 }, { "epoch": 2.582701585113715, "grad_norm": 30.38766098022461, "learning_rate": 5.779629328680275e-09, "logits/chosen": -2.3169326782226562, "logits/rejected": -2.3045449256896973, "logps/chosen": -172.8456573486328, "logps/rejected": -216.1270294189453, "loss": 0.568, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.205559492111206, "rewards/margins": 0.4353708326816559, "rewards/rejected": -1.640929937362671, "step": 14990 }, { "epoch": 2.584424534803584, "grad_norm": 30.998842239379883, "learning_rate": 5.732933591763495e-09, "logits/chosen": -2.319552183151245, "logits/rejected": -2.304553985595703, "logps/chosen": -180.916259765625, "logps/rejected": -208.8544158935547, "loss": 0.607, "rewards/accuracies": 0.65625, "rewards/chosen": -1.2423069477081299, "rewards/margins": 0.3127003312110901, "rewards/rejected": -1.5550072193145752, "step": 15000 }, { "epoch": 2.586147484493453, "grad_norm": 41.17787551879883, "learning_rate": 5.686415782048643e-09, "logits/chosen": -2.3234753608703613, "logits/rejected": -2.2967770099639893, "logps/chosen": -178.34402465820312, "logps/rejected": -210.7972412109375, "loss": 0.6112, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.225295066833496, "rewards/margins": 0.3530086874961853, "rewards/rejected": -1.5783039331436157, "step": 15010 }, { "epoch": 2.587870434183322, "grad_norm": 52.57218933105469, "learning_rate": 5.640076086509538e-09, "logits/chosen": -2.246558666229248, "logits/rejected": -2.2387757301330566, "logps/chosen": -174.56594848632812, "logps/rejected": -207.66690063476562, "loss": 0.6215, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.2340904474258423, "rewards/margins": 0.32109197974205017, "rewards/rejected": -1.5551823377609253, "step": 15020 }, { "epoch": 2.589593383873191, "grad_norm": 35.95048522949219, "learning_rate": 5.593914691404145e-09, "logits/chosen": -2.261404275894165, "logits/rejected": -2.235604763031006, "logps/chosen": -180.11178588867188, "logps/rejected": -212.8577117919922, "loss": 0.6168, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.2472907304763794, "rewards/margins": 0.3403995633125305, "rewards/rejected": -1.5876904726028442, "step": 15030 }, { "epoch": 2.59131633356306, "grad_norm": 48.30842208862305, "learning_rate": 5.547931782273718e-09, "logits/chosen": -2.2906880378723145, "logits/rejected": -2.2671432495117188, "logps/chosen": -187.49795532226562, "logps/rejected": -211.20700073242188, "loss": 0.6294, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.322435975074768, "rewards/margins": 0.27619048953056335, "rewards/rejected": -1.5986262559890747, "step": 15040 }, { "epoch": 2.5930392832529288, "grad_norm": 47.93348693847656, "learning_rate": 5.5021275439421365e-09, "logits/chosen": -2.3172030448913574, "logits/rejected": -2.278430461883545, "logps/chosen": -175.37673950195312, "logps/rejected": -205.2795867919922, "loss": 0.5773, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.1954820156097412, "rewards/margins": 0.37430500984191895, "rewards/rejected": -1.5697870254516602, "step": 15050 }, { "epoch": 2.594762232942798, "grad_norm": 44.634864807128906, "learning_rate": 5.456502160515097e-09, "logits/chosen": -2.2572150230407715, "logits/rejected": -2.2382962703704834, "logps/chosen": -173.3562774658203, "logps/rejected": -206.46798706054688, "loss": 0.6115, "rewards/accuracies": 0.65625, "rewards/chosen": -1.1877930164337158, "rewards/margins": 0.33759254217147827, "rewards/rejected": -1.5253856182098389, "step": 15060 }, { "epoch": 2.596485182632667, "grad_norm": 33.96257019042969, "learning_rate": 5.411055815379451e-09, "logits/chosen": -2.327104091644287, "logits/rejected": -2.28928804397583, "logps/chosen": -177.70382690429688, "logps/rejected": -201.44186401367188, "loss": 0.6114, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.1931462287902832, "rewards/margins": 0.3145656883716583, "rewards/rejected": -1.5077118873596191, "step": 15070 }, { "epoch": 2.598208132322536, "grad_norm": 40.272865295410156, "learning_rate": 5.365788691202372e-09, "logits/chosen": -2.2955873012542725, "logits/rejected": -2.2718987464904785, "logps/chosen": -174.05657958984375, "logps/rejected": -207.31423950195312, "loss": 0.5995, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.1952232122421265, "rewards/margins": 0.3444897532463074, "rewards/rejected": -1.5397131443023682, "step": 15080 }, { "epoch": 2.599931082012405, "grad_norm": 29.57343101501465, "learning_rate": 5.320700969930708e-09, "logits/chosen": -2.3112263679504395, "logits/rejected": -2.2815232276916504, "logps/chosen": -172.55918884277344, "logps/rejected": -201.70616149902344, "loss": 0.606, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.1735479831695557, "rewards/margins": 0.3192867338657379, "rewards/rejected": -1.4928348064422607, "step": 15090 }, { "epoch": 2.6016540317022745, "grad_norm": 46.08125686645508, "learning_rate": 5.2757928327902324e-09, "logits/chosen": -2.256523609161377, "logits/rejected": -2.2312490940093994, "logps/chosen": -169.47898864746094, "logps/rejected": -201.4627685546875, "loss": 0.5938, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.1771475076675415, "rewards/margins": 0.33779600262641907, "rewards/rejected": -1.5149434804916382, "step": 15100 }, { "epoch": 2.6033769813921435, "grad_norm": 37.76089859008789, "learning_rate": 5.231064460284818e-09, "logits/chosen": -2.286766767501831, "logits/rejected": -2.267291307449341, "logps/chosen": -179.2318572998047, "logps/rejected": -202.9172821044922, "loss": 0.6217, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.2481911182403564, "rewards/margins": 0.2732866406440735, "rewards/rejected": -1.5214776992797852, "step": 15110 }, { "epoch": 2.6050999310820124, "grad_norm": 34.025146484375, "learning_rate": 5.1865160321958646e-09, "logits/chosen": -2.2730836868286133, "logits/rejected": -2.2554898262023926, "logps/chosen": -189.20458984375, "logps/rejected": -215.56680297851562, "loss": 0.623, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3072446584701538, "rewards/margins": 0.2902025282382965, "rewards/rejected": -1.597447395324707, "step": 15120 }, { "epoch": 2.6068228807718814, "grad_norm": 41.7362060546875, "learning_rate": 5.142147727581498e-09, "logits/chosen": -2.257009983062744, "logits/rejected": -2.2283775806427, "logps/chosen": -172.26864624023438, "logps/rejected": -204.14013671875, "loss": 0.5864, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.178093671798706, "rewards/margins": 0.3746481239795685, "rewards/rejected": -1.5527417659759521, "step": 15130 }, { "epoch": 2.6085458304617504, "grad_norm": 36.35040283203125, "learning_rate": 5.097959724775819e-09, "logits/chosen": -2.2880465984344482, "logits/rejected": -2.2623438835144043, "logps/chosen": -175.53457641601562, "logps/rejected": -214.331787109375, "loss": 0.5709, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1967875957489014, "rewards/margins": 0.38674241304397583, "rewards/rejected": -1.5835299491882324, "step": 15140 }, { "epoch": 2.6102687801516193, "grad_norm": 32.46341323852539, "learning_rate": 5.053952201388234e-09, "logits/chosen": -2.3879854679107666, "logits/rejected": -2.354921817779541, "logps/chosen": -173.9727020263672, "logps/rejected": -206.2282257080078, "loss": 0.6065, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.1880525350570679, "rewards/margins": 0.3559701442718506, "rewards/rejected": -1.5440226793289185, "step": 15150 }, { "epoch": 2.6119917298414888, "grad_norm": 43.05264663696289, "learning_rate": 5.010125334302745e-09, "logits/chosen": -2.2540364265441895, "logits/rejected": -2.2353858947753906, "logps/chosen": -171.33876037597656, "logps/rejected": -208.04934692382812, "loss": 0.5753, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.1556618213653564, "rewards/margins": 0.3874899744987488, "rewards/rejected": -1.54315185546875, "step": 15160 }, { "epoch": 2.6137146795313577, "grad_norm": 32.51591110229492, "learning_rate": 4.9664792996772285e-09, "logits/chosen": -2.247185230255127, "logits/rejected": -2.2258715629577637, "logps/chosen": -167.01535034179688, "logps/rejected": -204.393798828125, "loss": 0.5853, "rewards/accuracies": 0.625, "rewards/chosen": -1.1358674764633179, "rewards/margins": 0.37841638922691345, "rewards/rejected": -1.5142838954925537, "step": 15170 }, { "epoch": 2.6154376292212267, "grad_norm": 30.162477493286133, "learning_rate": 4.923014272942688e-09, "logits/chosen": -2.3051016330718994, "logits/rejected": -2.293006420135498, "logps/chosen": -183.97933959960938, "logps/rejected": -219.2945556640625, "loss": 0.6015, "rewards/accuracies": 0.65625, "rewards/chosen": -1.271348237991333, "rewards/margins": 0.37271079421043396, "rewards/rejected": -1.644059181213379, "step": 15180 }, { "epoch": 2.6171605789110957, "grad_norm": 41.51902389526367, "learning_rate": 4.87973042880262e-09, "logits/chosen": -2.25746488571167, "logits/rejected": -2.24119234085083, "logps/chosen": -172.4810333251953, "logps/rejected": -201.8636016845703, "loss": 0.5945, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.187237024307251, "rewards/margins": 0.3163180947303772, "rewards/rejected": -1.5035550594329834, "step": 15190 }, { "epoch": 2.618883528600965, "grad_norm": 26.529691696166992, "learning_rate": 4.836627941232252e-09, "logits/chosen": -2.3084750175476074, "logits/rejected": -2.2735514640808105, "logps/chosen": -176.02841186523438, "logps/rejected": -210.1353759765625, "loss": 0.5773, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1953575611114502, "rewards/margins": 0.3805798590183258, "rewards/rejected": -1.5759375095367432, "step": 15200 }, { "epoch": 2.618883528600965, "eval_logits/chosen": -2.358809471130371, "eval_logits/rejected": -2.346810817718506, "eval_logps/chosen": -163.36053466796875, "eval_logps/rejected": -184.21234130859375, "eval_loss": 0.650810182094574, "eval_rewards/accuracies": 0.6024628281593323, "eval_rewards/chosen": -1.0434505939483643, "eval_rewards/margins": 0.17117682099342346, "eval_rewards/rejected": -1.2146275043487549, "eval_runtime": 384.5077, "eval_samples_per_second": 11.194, "eval_steps_per_second": 1.399, "step": 15200 }, { "epoch": 2.620606478290834, "grad_norm": 39.072540283203125, "learning_rate": 4.793706983477869e-09, "logits/chosen": -2.226433277130127, "logits/rejected": -2.1847689151763916, "logps/chosen": -183.5162353515625, "logps/rejected": -212.4745330810547, "loss": 0.6092, "rewards/accuracies": 0.65625, "rewards/chosen": -1.2674076557159424, "rewards/margins": 0.343503475189209, "rewards/rejected": -1.6109111309051514, "step": 15210 }, { "epoch": 2.622329427980703, "grad_norm": 44.00055694580078, "learning_rate": 4.750967728056127e-09, "logits/chosen": -2.234224557876587, "logits/rejected": -2.1999948024749756, "logps/chosen": -167.09426879882812, "logps/rejected": -203.2758331298828, "loss": 0.5562, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.1392322778701782, "rewards/margins": 0.40779203176498413, "rewards/rejected": -1.5470244884490967, "step": 15220 }, { "epoch": 2.624052377670572, "grad_norm": 47.231117248535156, "learning_rate": 4.7084103467533384e-09, "logits/chosen": -2.258369207382202, "logits/rejected": -2.2343966960906982, "logps/chosen": -180.4640655517578, "logps/rejected": -211.6443634033203, "loss": 0.6035, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.2449473142623901, "rewards/margins": 0.3434852957725525, "rewards/rejected": -1.5884325504302979, "step": 15230 }, { "epoch": 2.625775327360441, "grad_norm": 45.69564437866211, "learning_rate": 4.666035010624797e-09, "logits/chosen": -2.2520008087158203, "logits/rejected": -2.213196277618408, "logps/chosen": -179.78683471679688, "logps/rejected": -211.46395874023438, "loss": 0.5779, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.1973354816436768, "rewards/margins": 0.38800281286239624, "rewards/rejected": -1.5853383541107178, "step": 15240 }, { "epoch": 2.62749827705031, "grad_norm": 43.75592803955078, "learning_rate": 4.623841889994057e-09, "logits/chosen": -2.2950878143310547, "logits/rejected": -2.2744078636169434, "logps/chosen": -170.07020568847656, "logps/rejected": -206.53176879882812, "loss": 0.5915, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.1582139730453491, "rewards/margins": 0.37098950147628784, "rewards/rejected": -1.5292034149169922, "step": 15250 }, { "epoch": 2.6292212267401793, "grad_norm": 33.61025619506836, "learning_rate": 4.581831154452304e-09, "logits/chosen": -2.254704475402832, "logits/rejected": -2.232483386993408, "logps/chosen": -179.52354431152344, "logps/rejected": -203.71212768554688, "loss": 0.6168, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2406651973724365, "rewards/margins": 0.30228352546691895, "rewards/rejected": -1.5429487228393555, "step": 15260 }, { "epoch": 2.6309441764300483, "grad_norm": 39.85773468017578, "learning_rate": 4.540002972857654e-09, "logits/chosen": -2.2991280555725098, "logits/rejected": -2.254185914993286, "logps/chosen": -192.9390411376953, "logps/rejected": -221.02963256835938, "loss": 0.6173, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.371813416481018, "rewards/margins": 0.348710298538208, "rewards/rejected": -1.7205238342285156, "step": 15270 }, { "epoch": 2.6326671261199173, "grad_norm": 41.84758377075195, "learning_rate": 4.498357513334433e-09, "logits/chosen": -2.3626158237457275, "logits/rejected": -2.3408305644989014, "logps/chosen": -176.02212524414062, "logps/rejected": -209.27206420898438, "loss": 0.5924, "rewards/accuracies": 0.65625, "rewards/chosen": -1.2225972414016724, "rewards/margins": 0.33926254510879517, "rewards/rejected": -1.5618598461151123, "step": 15280 }, { "epoch": 2.6343900758097862, "grad_norm": 28.77754020690918, "learning_rate": 4.456894943272532e-09, "logits/chosen": -2.285330295562744, "logits/rejected": -2.2468972206115723, "logps/chosen": -179.88723754882812, "logps/rejected": -220.12503051757812, "loss": 0.5744, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.243018388748169, "rewards/margins": 0.4545852243900299, "rewards/rejected": -1.6976035833358765, "step": 15290 }, { "epoch": 2.6361130254996556, "grad_norm": 44.45974349975586, "learning_rate": 4.415615429326769e-09, "logits/chosen": -2.2029223442077637, "logits/rejected": -2.167741060256958, "logps/chosen": -179.9371337890625, "logps/rejected": -213.7763214111328, "loss": 0.6049, "rewards/accuracies": 0.65625, "rewards/chosen": -1.27082359790802, "rewards/margins": 0.3803272247314453, "rewards/rejected": -1.6511509418487549, "step": 15300 }, { "epoch": 2.6378359751895246, "grad_norm": 31.738445281982422, "learning_rate": 4.374519137416172e-09, "logits/chosen": -2.324070692062378, "logits/rejected": -2.2945971488952637, "logps/chosen": -177.03167724609375, "logps/rejected": -209.97781372070312, "loss": 0.5879, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.21383798122406, "rewards/margins": 0.3603143095970154, "rewards/rejected": -1.5741521120071411, "step": 15310 }, { "epoch": 2.6395589248793936, "grad_norm": 45.1463508605957, "learning_rate": 4.333606232723308e-09, "logits/chosen": -2.2548727989196777, "logits/rejected": -2.2461423873901367, "logps/chosen": -176.7523193359375, "logps/rejected": -208.3860321044922, "loss": 0.6224, "rewards/accuracies": 0.625, "rewards/chosen": -1.2468162775039673, "rewards/margins": 0.31269317865371704, "rewards/rejected": -1.559509515762329, "step": 15320 }, { "epoch": 2.6412818745692626, "grad_norm": 32.08575439453125, "learning_rate": 4.292876879693646e-09, "logits/chosen": -2.2762961387634277, "logits/rejected": -2.247575283050537, "logps/chosen": -179.0817413330078, "logps/rejected": -213.4488067626953, "loss": 0.5843, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.225376844406128, "rewards/margins": 0.37693697214126587, "rewards/rejected": -1.602313756942749, "step": 15330 }, { "epoch": 2.6430048242591315, "grad_norm": 25.77557945251465, "learning_rate": 4.252331242034912e-09, "logits/chosen": -2.2798683643341064, "logits/rejected": -2.2595818042755127, "logps/chosen": -181.40280151367188, "logps/rejected": -215.9775390625, "loss": 0.5956, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.2631257772445679, "rewards/margins": 0.35496434569358826, "rewards/rejected": -1.618090271949768, "step": 15340 }, { "epoch": 2.6447277739490005, "grad_norm": 33.43914031982422, "learning_rate": 4.211969482716354e-09, "logits/chosen": -2.2000720500946045, "logits/rejected": -2.1807875633239746, "logps/chosen": -178.42819213867188, "logps/rejected": -214.453857421875, "loss": 0.5883, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2338414192199707, "rewards/margins": 0.38494712114334106, "rewards/rejected": -1.618788480758667, "step": 15350 }, { "epoch": 2.64645072363887, "grad_norm": 37.96018600463867, "learning_rate": 4.171791763968191e-09, "logits/chosen": -2.2911009788513184, "logits/rejected": -2.2742550373077393, "logps/chosen": -176.56112670898438, "logps/rejected": -210.6365966796875, "loss": 0.6081, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.2332837581634521, "rewards/margins": 0.3278941512107849, "rewards/rejected": -1.5611779689788818, "step": 15360 }, { "epoch": 2.648173673328739, "grad_norm": 31.777172088623047, "learning_rate": 4.131798247280882e-09, "logits/chosen": -2.303417921066284, "logits/rejected": -2.2715156078338623, "logps/chosen": -180.98085021972656, "logps/rejected": -208.2545166015625, "loss": 0.614, "rewards/accuracies": 0.65625, "rewards/chosen": -1.2401424646377563, "rewards/margins": 0.3373335897922516, "rewards/rejected": -1.577476143836975, "step": 15370 }, { "epoch": 2.649896623018608, "grad_norm": 30.086198806762695, "learning_rate": 4.091989093404513e-09, "logits/chosen": -2.2850561141967773, "logits/rejected": -2.261882781982422, "logps/chosen": -177.33334350585938, "logps/rejected": -215.7422637939453, "loss": 0.5643, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.2238959074020386, "rewards/margins": 0.42093032598495483, "rewards/rejected": -1.6448261737823486, "step": 15380 }, { "epoch": 2.651619572708477, "grad_norm": 27.287952423095703, "learning_rate": 4.052364462348118e-09, "logits/chosen": -2.3435025215148926, "logits/rejected": -2.3281779289245605, "logps/chosen": -178.9251708984375, "logps/rejected": -215.21640014648438, "loss": 0.5895, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.2218642234802246, "rewards/margins": 0.38189610838890076, "rewards/rejected": -1.6037603616714478, "step": 15390 }, { "epoch": 2.6533425223983462, "grad_norm": 29.21660804748535, "learning_rate": 4.01292451337909e-09, "logits/chosen": -2.303785562515259, "logits/rejected": -2.2735157012939453, "logps/chosen": -189.47283935546875, "logps/rejected": -205.9953155517578, "loss": 0.6489, "rewards/accuracies": 0.65625, "rewards/chosen": -1.3067653179168701, "rewards/margins": 0.2470887005329132, "rewards/rejected": -1.5538541078567505, "step": 15400 }, { "epoch": 2.655065472088215, "grad_norm": 40.113216400146484, "learning_rate": 3.973669405022518e-09, "logits/chosen": -2.2873952388763428, "logits/rejected": -2.2475345134735107, "logps/chosen": -190.634521484375, "logps/rejected": -208.83047485351562, "loss": 0.6316, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.2999731302261353, "rewards/margins": 0.28097444772720337, "rewards/rejected": -1.5809476375579834, "step": 15410 }, { "epoch": 2.656788421778084, "grad_norm": 32.246517181396484, "learning_rate": 3.934599295060481e-09, "logits/chosen": -2.2759604454040527, "logits/rejected": -2.251339912414551, "logps/chosen": -172.6602783203125, "logps/rejected": -220.6236114501953, "loss": 0.5528, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.1907905340194702, "rewards/margins": 0.49050837755203247, "rewards/rejected": -1.6812989711761475, "step": 15420 }, { "epoch": 2.658511371467953, "grad_norm": 28.570491790771484, "learning_rate": 3.895714340531542e-09, "logits/chosen": -2.351717710494995, "logits/rejected": -2.320129871368408, "logps/chosen": -183.42445373535156, "logps/rejected": -215.3601531982422, "loss": 0.5694, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.2372651100158691, "rewards/margins": 0.38909897208213806, "rewards/rejected": -1.6263641119003296, "step": 15430 }, { "epoch": 2.660234321157822, "grad_norm": 33.97868728637695, "learning_rate": 3.857014697730027e-09, "logits/chosen": -2.372708559036255, "logits/rejected": -2.3452935218811035, "logps/chosen": -174.3719940185547, "logps/rejected": -203.66500854492188, "loss": 0.597, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1882606744766235, "rewards/margins": 0.3174138069152832, "rewards/rejected": -1.5056743621826172, "step": 15440 }, { "epoch": 2.661957270847691, "grad_norm": 33.95857620239258, "learning_rate": 3.818500522205392e-09, "logits/chosen": -2.1778063774108887, "logits/rejected": -2.151001453399658, "logps/chosen": -177.09414672851562, "logps/rejected": -211.31924438476562, "loss": 0.5925, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2400333881378174, "rewards/margins": 0.3476566672325134, "rewards/rejected": -1.5876901149749756, "step": 15450 }, { "epoch": 2.66368022053756, "grad_norm": 37.55846405029297, "learning_rate": 3.7801719687616805e-09, "logits/chosen": -2.3410890102386475, "logits/rejected": -2.323452949523926, "logps/chosen": -179.95013427734375, "logps/rejected": -208.1508331298828, "loss": 0.6136, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.1833778619766235, "rewards/margins": 0.32879582047462463, "rewards/rejected": -1.5121737718582153, "step": 15460 }, { "epoch": 2.6654031702274295, "grad_norm": 47.06980895996094, "learning_rate": 3.742029191456792e-09, "logits/chosen": -2.3284058570861816, "logits/rejected": -2.3038089275360107, "logps/chosen": -194.41659545898438, "logps/rejected": -222.1026611328125, "loss": 0.6121, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.331230878829956, "rewards/margins": 0.34454160928726196, "rewards/rejected": -1.6757726669311523, "step": 15470 }, { "epoch": 2.6671261199172984, "grad_norm": 31.516748428344727, "learning_rate": 3.704072343601955e-09, "logits/chosen": -2.3013763427734375, "logits/rejected": -2.269360303878784, "logps/chosen": -174.3995361328125, "logps/rejected": -203.1074676513672, "loss": 0.6133, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.1772360801696777, "rewards/margins": 0.342081218957901, "rewards/rejected": -1.519317388534546, "step": 15480 }, { "epoch": 2.6688490696071674, "grad_norm": 48.16901397705078, "learning_rate": 3.666301577761033e-09, "logits/chosen": -2.2873353958129883, "logits/rejected": -2.272728681564331, "logps/chosen": -179.4340057373047, "logps/rejected": -204.6569061279297, "loss": 0.6174, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.2381466627120972, "rewards/margins": 0.2938258647918701, "rewards/rejected": -1.5319725275039673, "step": 15490 }, { "epoch": 2.670572019297037, "grad_norm": 37.944313049316406, "learning_rate": 3.628717045750007e-09, "logits/chosen": -2.2588024139404297, "logits/rejected": -2.2421507835388184, "logps/chosen": -191.25660705566406, "logps/rejected": -216.8999786376953, "loss": 0.6351, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.3631250858306885, "rewards/margins": 0.26649603247642517, "rewards/rejected": -1.629621148109436, "step": 15500 }, { "epoch": 2.6722949689869058, "grad_norm": 34.96455001831055, "learning_rate": 3.591318898636253e-09, "logits/chosen": -2.2264561653137207, "logits/rejected": -2.196096897125244, "logps/chosen": -182.52011108398438, "logps/rejected": -215.2834930419922, "loss": 0.5861, "rewards/accuracies": 0.71875, "rewards/chosen": -1.2337663173675537, "rewards/margins": 0.38769131898880005, "rewards/rejected": -1.6214576959609985, "step": 15510 }, { "epoch": 2.6740179186767747, "grad_norm": 35.925941467285156, "learning_rate": 3.5541072867380174e-09, "logits/chosen": -2.216661214828491, "logits/rejected": -2.1927154064178467, "logps/chosen": -180.17608642578125, "logps/rejected": -206.7606201171875, "loss": 0.6073, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.2516624927520752, "rewards/margins": 0.32696646451950073, "rewards/rejected": -1.5786290168762207, "step": 15520 }, { "epoch": 2.6757408683666437, "grad_norm": 30.07876968383789, "learning_rate": 3.5170823596237852e-09, "logits/chosen": -2.228799343109131, "logits/rejected": -2.1991519927978516, "logps/chosen": -167.81399536132812, "logps/rejected": -206.71530151367188, "loss": 0.563, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.1289836168289185, "rewards/margins": 0.41038280725479126, "rewards/rejected": -1.539366364479065, "step": 15530 }, { "epoch": 2.6774638180565127, "grad_norm": 27.47809600830078, "learning_rate": 3.480244266111687e-09, "logits/chosen": -2.2589917182922363, "logits/rejected": -2.2270827293395996, "logps/chosen": -182.40170288085938, "logps/rejected": -214.64804077148438, "loss": 0.614, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.2905007600784302, "rewards/margins": 0.3463669419288635, "rewards/rejected": -1.6368677616119385, "step": 15540 }, { "epoch": 2.6791867677463816, "grad_norm": 30.874013900756836, "learning_rate": 3.4435931542688813e-09, "logits/chosen": -2.342874050140381, "logits/rejected": -2.313809633255005, "logps/chosen": -183.68667602539062, "logps/rejected": -216.19583129882812, "loss": 0.5916, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2788660526275635, "rewards/margins": 0.3673045039176941, "rewards/rejected": -1.6461708545684814, "step": 15550 }, { "epoch": 2.6809097174362506, "grad_norm": 34.93791580200195, "learning_rate": 3.407129171410966e-09, "logits/chosen": -2.280003786087036, "logits/rejected": -2.270197629928589, "logps/chosen": -179.6429443359375, "logps/rejected": -204.54757690429688, "loss": 0.6461, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.2780966758728027, "rewards/margins": 0.24179551005363464, "rewards/rejected": -1.5198920965194702, "step": 15560 }, { "epoch": 2.68263266712612, "grad_norm": 34.73818588256836, "learning_rate": 3.3708524641014034e-09, "logits/chosen": -2.321105718612671, "logits/rejected": -2.294581890106201, "logps/chosen": -188.01690673828125, "logps/rejected": -217.1805877685547, "loss": 0.6057, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.3426463603973389, "rewards/margins": 0.33262020349502563, "rewards/rejected": -1.6752665042877197, "step": 15570 }, { "epoch": 2.684355616815989, "grad_norm": 37.94607162475586, "learning_rate": 3.3347631781509344e-09, "logits/chosen": -2.3370888233184814, "logits/rejected": -2.317061185836792, "logps/chosen": -182.791748046875, "logps/rejected": -211.9905548095703, "loss": 0.6156, "rewards/accuracies": 0.65625, "rewards/chosen": -1.2602826356887817, "rewards/margins": 0.29799336194992065, "rewards/rejected": -1.5582760572433472, "step": 15580 }, { "epoch": 2.686078566505858, "grad_norm": 36.847930908203125, "learning_rate": 3.298861458616947e-09, "logits/chosen": -2.2724945545196533, "logits/rejected": -2.2520852088928223, "logps/chosen": -175.23680114746094, "logps/rejected": -197.07168579101562, "loss": 0.6402, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.1939842700958252, "rewards/margins": 0.24006938934326172, "rewards/rejected": -1.4340537786483765, "step": 15590 }, { "epoch": 2.687801516195727, "grad_norm": 30.203340530395508, "learning_rate": 3.263147449802939e-09, "logits/chosen": -2.285566568374634, "logits/rejected": -2.2599756717681885, "logps/chosen": -181.5286865234375, "logps/rejected": -216.1159210205078, "loss": 0.5983, "rewards/accuracies": 0.65625, "rewards/chosen": -1.2878538370132446, "rewards/margins": 0.3697057366371155, "rewards/rejected": -1.6575596332550049, "step": 15600 }, { "epoch": 2.687801516195727, "eval_logits/chosen": -2.354008913040161, "eval_logits/rejected": -2.3419225215911865, "eval_logps/chosen": -165.61573791503906, "eval_logps/rejected": -186.71853637695312, "eval_loss": 0.6505388021469116, "eval_rewards/accuracies": 0.6017658114433289, "eval_rewards/chosen": -1.0660027265548706, "eval_rewards/margins": 0.17368650436401367, "eval_rewards/rejected": -1.2396892309188843, "eval_runtime": 384.7663, "eval_samples_per_second": 11.186, "eval_steps_per_second": 1.398, "step": 15600 }, { "epoch": 2.6895244658855963, "grad_norm": 39.31675338745117, "learning_rate": 3.227621295257921e-09, "logits/chosen": -2.342576026916504, "logits/rejected": -2.3186194896698, "logps/chosen": -183.08203125, "logps/rejected": -214.7406463623047, "loss": 0.6001, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.2608639001846313, "rewards/margins": 0.34865862131118774, "rewards/rejected": -1.6095225811004639, "step": 15610 }, { "epoch": 2.6912474155754653, "grad_norm": 54.18916702270508, "learning_rate": 3.1922831377758586e-09, "logits/chosen": -2.252718448638916, "logits/rejected": -2.2375519275665283, "logps/chosen": -167.72531127929688, "logps/rejected": -209.5613555908203, "loss": 0.5654, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.1279845237731934, "rewards/margins": 0.41473323106765747, "rewards/rejected": -1.542717695236206, "step": 15620 }, { "epoch": 2.6929703652653343, "grad_norm": 29.02688217163086, "learning_rate": 3.1571331193950444e-09, "logits/chosen": -2.2631161212921143, "logits/rejected": -2.2219128608703613, "logps/chosen": -184.05958557128906, "logps/rejected": -218.73251342773438, "loss": 0.5838, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.2820950746536255, "rewards/margins": 0.4124404788017273, "rewards/rejected": -1.6945356130599976, "step": 15630 }, { "epoch": 2.6946933149552033, "grad_norm": 27.868053436279297, "learning_rate": 3.1221713813976037e-09, "logits/chosen": -2.2706925868988037, "logits/rejected": -2.242724895477295, "logps/chosen": -172.6728057861328, "logps/rejected": -218.58621215820312, "loss": 0.5448, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.1886308193206787, "rewards/margins": 0.4725852608680725, "rewards/rejected": -1.661216139793396, "step": 15640 }, { "epoch": 2.6964162646450722, "grad_norm": 42.40397262573242, "learning_rate": 3.0873980643088603e-09, "logits/chosen": -2.2558560371398926, "logits/rejected": -2.2352757453918457, "logps/chosen": -177.53944396972656, "logps/rejected": -205.47412109375, "loss": 0.6291, "rewards/accuracies": 0.65625, "rewards/chosen": -1.2249729633331299, "rewards/margins": 0.29740267992019653, "rewards/rejected": -1.5223757028579712, "step": 15650 }, { "epoch": 2.698139214334941, "grad_norm": 30.136716842651367, "learning_rate": 3.052813307896801e-09, "logits/chosen": -2.328580617904663, "logits/rejected": -2.3151848316192627, "logps/chosen": -178.67636108398438, "logps/rejected": -210.3419189453125, "loss": 0.5929, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.247610330581665, "rewards/margins": 0.3233638405799866, "rewards/rejected": -1.570974349975586, "step": 15660 }, { "epoch": 2.6998621640248106, "grad_norm": 29.27895164489746, "learning_rate": 3.018417251171529e-09, "logits/chosen": -2.2305965423583984, "logits/rejected": -2.19374942779541, "logps/chosen": -174.97256469726562, "logps/rejected": -208.78018188476562, "loss": 0.5698, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.1976064443588257, "rewards/margins": 0.3816137909889221, "rewards/rejected": -1.5792200565338135, "step": 15670 }, { "epoch": 2.7015851137146796, "grad_norm": 27.068044662475586, "learning_rate": 2.984210032384671e-09, "logits/chosen": -2.246372699737549, "logits/rejected": -2.223546028137207, "logps/chosen": -184.91104125976562, "logps/rejected": -221.4980926513672, "loss": 0.5878, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.3069775104522705, "rewards/margins": 0.3822581470012665, "rewards/rejected": -1.6892354488372803, "step": 15680 }, { "epoch": 2.7033080634045485, "grad_norm": 23.017866134643555, "learning_rate": 2.9501917890288387e-09, "logits/chosen": -2.278564929962158, "logits/rejected": -2.2606887817382812, "logps/chosen": -173.19595336914062, "logps/rejected": -208.59121704101562, "loss": 0.5943, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.1975430250167847, "rewards/margins": 0.3586288392543793, "rewards/rejected": -1.5561718940734863, "step": 15690 }, { "epoch": 2.7050310130944175, "grad_norm": 44.127220153808594, "learning_rate": 2.9163626578370736e-09, "logits/chosen": -2.2899715900421143, "logits/rejected": -2.264925241470337, "logps/chosen": -179.9619140625, "logps/rejected": -217.8975830078125, "loss": 0.5913, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.2661584615707397, "rewards/margins": 0.39361196756362915, "rewards/rejected": -1.6597706079483032, "step": 15700 }, { "epoch": 2.706753962784287, "grad_norm": 28.553180694580078, "learning_rate": 2.882722774782315e-09, "logits/chosen": -2.2921769618988037, "logits/rejected": -2.2634499073028564, "logps/chosen": -185.9178924560547, "logps/rejected": -224.8857421875, "loss": 0.584, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.2706458568572998, "rewards/margins": 0.4241026043891907, "rewards/rejected": -1.6947485208511353, "step": 15710 }, { "epoch": 2.708476912474156, "grad_norm": 47.15506362915039, "learning_rate": 2.8492722750768305e-09, "logits/chosen": -2.2799127101898193, "logits/rejected": -2.272749185562134, "logps/chosen": -183.07467651367188, "logps/rejected": -203.03172302246094, "loss": 0.6529, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.2899291515350342, "rewards/margins": 0.2201959192752838, "rewards/rejected": -1.510124921798706, "step": 15720 }, { "epoch": 2.710199862164025, "grad_norm": 32.763004302978516, "learning_rate": 2.8160112931716663e-09, "logits/chosen": -2.347216844558716, "logits/rejected": -2.3261611461639404, "logps/chosen": -172.67808532714844, "logps/rejected": -205.3860626220703, "loss": 0.595, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1863274574279785, "rewards/margins": 0.34425634145736694, "rewards/rejected": -1.5305836200714111, "step": 15730 }, { "epoch": 2.711922811853894, "grad_norm": 32.519248962402344, "learning_rate": 2.782939962756126e-09, "logits/chosen": -2.2975738048553467, "logits/rejected": -2.2604293823242188, "logps/chosen": -184.21490478515625, "logps/rejected": -211.71826171875, "loss": 0.6261, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.2909154891967773, "rewards/margins": 0.33306872844696045, "rewards/rejected": -1.6239840984344482, "step": 15740 }, { "epoch": 2.713645761543763, "grad_norm": 49.53424072265625, "learning_rate": 2.750058416757245e-09, "logits/chosen": -2.2987732887268066, "logits/rejected": -2.2767536640167236, "logps/chosen": -185.88815307617188, "logps/rejected": -221.35122680664062, "loss": 0.5931, "rewards/accuracies": 0.71875, "rewards/chosen": -1.3290163278579712, "rewards/margins": 0.3406520187854767, "rewards/rejected": -1.669668197631836, "step": 15750 }, { "epoch": 2.7153687112336318, "grad_norm": 36.956336975097656, "learning_rate": 2.717366787339209e-09, "logits/chosen": -2.2139859199523926, "logits/rejected": -2.1922428607940674, "logps/chosen": -177.16494750976562, "logps/rejected": -205.7443389892578, "loss": 0.6176, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.2540347576141357, "rewards/margins": 0.30845338106155396, "rewards/rejected": -1.562488317489624, "step": 15760 }, { "epoch": 2.717091660923501, "grad_norm": 31.984783172607422, "learning_rate": 2.684865205902881e-09, "logits/chosen": -2.272738456726074, "logits/rejected": -2.2462387084960938, "logps/chosen": -166.43064880371094, "logps/rejected": -212.916259765625, "loss": 0.5243, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.1281863451004028, "rewards/margins": 0.47644680738449097, "rewards/rejected": -1.6046329736709595, "step": 15770 }, { "epoch": 2.71881461061337, "grad_norm": 33.240516662597656, "learning_rate": 2.6525538030852223e-09, "logits/chosen": -2.37518048286438, "logits/rejected": -2.356550931930542, "logps/chosen": -184.48910522460938, "logps/rejected": -201.74148559570312, "loss": 0.6542, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2998746633529663, "rewards/margins": 0.1974526196718216, "rewards/rejected": -1.497327208518982, "step": 15780 }, { "epoch": 2.720537560303239, "grad_norm": 50.996089935302734, "learning_rate": 2.620432708758802e-09, "logits/chosen": -2.2115371227264404, "logits/rejected": -2.197854995727539, "logps/chosen": -183.6704864501953, "logps/rejected": -209.2452392578125, "loss": 0.6137, "rewards/accuracies": 0.625, "rewards/chosen": -1.2914540767669678, "rewards/margins": 0.2838708460330963, "rewards/rejected": -1.5753250122070312, "step": 15790 }, { "epoch": 2.722260509993108, "grad_norm": 33.28852081298828, "learning_rate": 2.5885020520312604e-09, "logits/chosen": -2.345170497894287, "logits/rejected": -2.3010005950927734, "logps/chosen": -177.3496856689453, "logps/rejected": -206.03549194335938, "loss": 0.5765, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.17936110496521, "rewards/margins": 0.3781944811344147, "rewards/rejected": -1.5575557947158813, "step": 15800 }, { "epoch": 2.7239834596829775, "grad_norm": 40.61261749267578, "learning_rate": 2.5567619612447854e-09, "logits/chosen": -2.3195712566375732, "logits/rejected": -2.309903621673584, "logps/chosen": -178.634033203125, "logps/rejected": -210.8284912109375, "loss": 0.6173, "rewards/accuracies": 0.65625, "rewards/chosen": -1.29694402217865, "rewards/margins": 0.29339122772216797, "rewards/rejected": -1.5903352499008179, "step": 15810 }, { "epoch": 2.7257064093728465, "grad_norm": 34.352272033691406, "learning_rate": 2.5252125639756207e-09, "logits/chosen": -2.225785255432129, "logits/rejected": -2.202213764190674, "logps/chosen": -179.4593048095703, "logps/rejected": -213.7439422607422, "loss": 0.6017, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.2712770700454712, "rewards/margins": 0.35494905710220337, "rewards/rejected": -1.6262260675430298, "step": 15820 }, { "epoch": 2.7274293590627154, "grad_norm": 45.09383773803711, "learning_rate": 2.493853987033523e-09, "logits/chosen": -2.314626455307007, "logits/rejected": -2.3013596534729004, "logps/chosen": -173.50193786621094, "logps/rejected": -207.23251342773438, "loss": 0.6048, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.2176988124847412, "rewards/margins": 0.3500228226184845, "rewards/rejected": -1.5677217245101929, "step": 15830 }, { "epoch": 2.7291523087525844, "grad_norm": 32.357215881347656, "learning_rate": 2.4626863564612467e-09, "logits/chosen": -2.3258578777313232, "logits/rejected": -2.3102779388427734, "logps/chosen": -193.52291870117188, "logps/rejected": -226.4375457763672, "loss": 0.6193, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.3908226490020752, "rewards/margins": 0.32600319385528564, "rewards/rejected": -1.71682608127594, "step": 15840 }, { "epoch": 2.7308752584424534, "grad_norm": 41.314552307128906, "learning_rate": 2.4317097975340985e-09, "logits/chosen": -2.301025390625, "logits/rejected": -2.2786521911621094, "logps/chosen": -179.1788330078125, "logps/rejected": -206.5927276611328, "loss": 0.6233, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.2470613718032837, "rewards/margins": 0.2841149866580963, "rewards/rejected": -1.5311763286590576, "step": 15850 }, { "epoch": 2.7325982081323223, "grad_norm": 72.44612884521484, "learning_rate": 2.4009244347593604e-09, "logits/chosen": -2.2927424907684326, "logits/rejected": -2.264702320098877, "logps/chosen": -178.3214569091797, "logps/rejected": -201.4319305419922, "loss": 0.6281, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.240483283996582, "rewards/margins": 0.2728969156742096, "rewards/rejected": -1.5133801698684692, "step": 15860 }, { "epoch": 2.7343211578221913, "grad_norm": 33.07346725463867, "learning_rate": 2.370330391875819e-09, "logits/chosen": -2.2673001289367676, "logits/rejected": -2.2333619594573975, "logps/chosen": -184.83200073242188, "logps/rejected": -225.6978759765625, "loss": 0.566, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.290633201599121, "rewards/margins": 0.44913339614868164, "rewards/rejected": -1.7397664785385132, "step": 15870 }, { "epoch": 2.7360441075120607, "grad_norm": 33.767364501953125, "learning_rate": 2.3399277918532854e-09, "logits/chosen": -2.2728636264801025, "logits/rejected": -2.257136821746826, "logps/chosen": -179.16775512695312, "logps/rejected": -213.8397979736328, "loss": 0.5858, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2510485649108887, "rewards/margins": 0.3816913962364197, "rewards/rejected": -1.6327400207519531, "step": 15880 }, { "epoch": 2.7377670572019297, "grad_norm": 47.875667572021484, "learning_rate": 2.309716756892083e-09, "logits/chosen": -2.3037915229797363, "logits/rejected": -2.2675938606262207, "logps/chosen": -176.85299682617188, "logps/rejected": -209.14602661132812, "loss": 0.5899, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.2395904064178467, "rewards/margins": 0.3721577823162079, "rewards/rejected": -1.6117480993270874, "step": 15890 }, { "epoch": 2.7394900068917987, "grad_norm": 49.56647872924805, "learning_rate": 2.2796974084225373e-09, "logits/chosen": -2.299668550491333, "logits/rejected": -2.2523045539855957, "logps/chosen": -190.56446838378906, "logps/rejected": -217.8603973388672, "loss": 0.5895, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.2909893989562988, "rewards/margins": 0.37662652134895325, "rewards/rejected": -1.6676161289215088, "step": 15900 }, { "epoch": 2.741212956581668, "grad_norm": 38.5182991027832, "learning_rate": 2.249869867104537e-09, "logits/chosen": -2.2339723110198975, "logits/rejected": -2.208400249481201, "logps/chosen": -172.8338623046875, "logps/rejected": -198.0580291748047, "loss": 0.6159, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.183179497718811, "rewards/margins": 0.30717626214027405, "rewards/rejected": -1.4903557300567627, "step": 15910 }, { "epoch": 2.742935906271537, "grad_norm": 35.76197814941406, "learning_rate": 2.220234252826991e-09, "logits/chosen": -2.2531018257141113, "logits/rejected": -2.2350258827209473, "logps/chosen": -184.27220153808594, "logps/rejected": -216.9780731201172, "loss": 0.6113, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.3089081048965454, "rewards/margins": 0.32555288076400757, "rewards/rejected": -1.6344608068466187, "step": 15920 }, { "epoch": 2.744658855961406, "grad_norm": 37.21056365966797, "learning_rate": 2.190790684707411e-09, "logits/chosen": -2.211575984954834, "logits/rejected": -2.1774814128875732, "logps/chosen": -173.06137084960938, "logps/rejected": -198.30435180664062, "loss": 0.5968, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.1646642684936523, "rewards/margins": 0.32496175169944763, "rewards/rejected": -1.4896259307861328, "step": 15930 }, { "epoch": 2.746381805651275, "grad_norm": 32.13726806640625, "learning_rate": 2.161539281091351e-09, "logits/chosen": -2.264610528945923, "logits/rejected": -2.2332422733306885, "logps/chosen": -180.72996520996094, "logps/rejected": -229.2228546142578, "loss": 0.5391, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.202547311782837, "rewards/margins": 0.5045925378799438, "rewards/rejected": -1.7071399688720703, "step": 15940 }, { "epoch": 2.748104755341144, "grad_norm": 48.947052001953125, "learning_rate": 2.1324801595520357e-09, "logits/chosen": -2.3333230018615723, "logits/rejected": -2.304076671600342, "logps/chosen": -178.30613708496094, "logps/rejected": -203.87501525878906, "loss": 0.6035, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.189021348953247, "rewards/margins": 0.333077996969223, "rewards/rejected": -1.522099256515503, "step": 15950 }, { "epoch": 2.749827705031013, "grad_norm": 33.582252502441406, "learning_rate": 2.1036134368897785e-09, "logits/chosen": -2.309985876083374, "logits/rejected": -2.283900260925293, "logps/chosen": -183.0913543701172, "logps/rejected": -211.28482055664062, "loss": 0.6135, "rewards/accuracies": 0.65625, "rewards/chosen": -1.2604248523712158, "rewards/margins": 0.33174318075180054, "rewards/rejected": -1.592167854309082, "step": 15960 }, { "epoch": 2.751550654720882, "grad_norm": 33.72100830078125, "learning_rate": 2.0749392291315894e-09, "logits/chosen": -2.291177988052368, "logits/rejected": -2.267993211746216, "logps/chosen": -182.629638671875, "logps/rejected": -216.76400756835938, "loss": 0.5853, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.2580294609069824, "rewards/margins": 0.3601645827293396, "rewards/rejected": -1.6181939840316772, "step": 15970 }, { "epoch": 2.7532736044107513, "grad_norm": 38.948707580566406, "learning_rate": 2.046457651530686e-09, "logits/chosen": -2.2676291465759277, "logits/rejected": -2.246793270111084, "logps/chosen": -181.0098114013672, "logps/rejected": -211.0522003173828, "loss": 0.6109, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.2406721115112305, "rewards/margins": 0.3142922520637512, "rewards/rejected": -1.554964303970337, "step": 15980 }, { "epoch": 2.7549965541006203, "grad_norm": 41.575164794921875, "learning_rate": 2.0181688185660183e-09, "logits/chosen": -2.370026111602783, "logits/rejected": -2.3678243160247803, "logps/chosen": -175.36595153808594, "logps/rejected": -205.42568969726562, "loss": 0.6076, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.1957898139953613, "rewards/margins": 0.3089122772216797, "rewards/rejected": -1.5047019720077515, "step": 15990 }, { "epoch": 2.7567195037904892, "grad_norm": 32.23805236816406, "learning_rate": 1.99007284394182e-09, "logits/chosen": -2.247300624847412, "logits/rejected": -2.2156033515930176, "logps/chosen": -181.68435668945312, "logps/rejected": -212.2035369873047, "loss": 0.5983, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.27165949344635, "rewards/margins": 0.34139880537986755, "rewards/rejected": -1.61305832862854, "step": 16000 }, { "epoch": 2.7567195037904892, "eval_logits/chosen": -2.3530187606811523, "eval_logits/rejected": -2.34079647064209, "eval_logps/chosen": -166.0839080810547, "eval_logps/rejected": -187.39894104003906, "eval_loss": 0.6500682234764099, "eval_rewards/accuracies": 0.6029275059700012, "eval_rewards/chosen": -1.0706843137741089, "eval_rewards/margins": 0.1758090853691101, "eval_rewards/rejected": -1.2464934587478638, "eval_runtime": 384.7363, "eval_samples_per_second": 11.187, "eval_steps_per_second": 1.398, "step": 16000 }, { "epoch": 2.758442453480358, "grad_norm": 39.961814880371094, "learning_rate": 1.9621698405871466e-09, "logits/chosen": -2.3224873542785645, "logits/rejected": -2.309032440185547, "logps/chosen": -184.5162811279297, "logps/rejected": -210.5989990234375, "loss": 0.6341, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.2852531671524048, "rewards/margins": 0.2957290709018707, "rewards/rejected": -1.580981969833374, "step": 16010 }, { "epoch": 2.7601654031702276, "grad_norm": 33.21334457397461, "learning_rate": 1.934459920655429e-09, "logits/chosen": -2.356708288192749, "logits/rejected": -2.330230236053467, "logps/chosen": -181.0506591796875, "logps/rejected": -212.9567413330078, "loss": 0.6103, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.2669428586959839, "rewards/margins": 0.3563677966594696, "rewards/rejected": -1.6233106851577759, "step": 16020 }, { "epoch": 2.7618883528600966, "grad_norm": 38.95966339111328, "learning_rate": 1.90694319552403e-09, "logits/chosen": -2.3382115364074707, "logits/rejected": -2.3172762393951416, "logps/chosen": -179.08792114257812, "logps/rejected": -211.1810302734375, "loss": 0.6094, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.2544938325881958, "rewards/margins": 0.3421923518180847, "rewards/rejected": -1.5966860055923462, "step": 16030 }, { "epoch": 2.7636113025499656, "grad_norm": 47.20829391479492, "learning_rate": 1.879619775793756e-09, "logits/chosen": -2.3078360557556152, "logits/rejected": -2.286541223526001, "logps/chosen": -181.668212890625, "logps/rejected": -212.9115753173828, "loss": 0.6164, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2616897821426392, "rewards/margins": 0.35222771763801575, "rewards/rejected": -1.613917589187622, "step": 16040 }, { "epoch": 2.7653342522398345, "grad_norm": 31.04566764831543, "learning_rate": 1.8524897712884514e-09, "logits/chosen": -2.279730796813965, "logits/rejected": -2.2557711601257324, "logps/chosen": -180.85476684570312, "logps/rejected": -219.46853637695312, "loss": 0.5784, "rewards/accuracies": 0.75, "rewards/chosen": -1.2566825151443481, "rewards/margins": 0.4106810986995697, "rewards/rejected": -1.6673635244369507, "step": 16050 }, { "epoch": 2.7670572019297035, "grad_norm": 34.17963790893555, "learning_rate": 1.8255532910545657e-09, "logits/chosen": -2.295279026031494, "logits/rejected": -2.276061773300171, "logps/chosen": -175.43310546875, "logps/rejected": -205.7451629638672, "loss": 0.5895, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1868820190429688, "rewards/margins": 0.3304034173488617, "rewards/rejected": -1.5172855854034424, "step": 16060 }, { "epoch": 2.7687801516195725, "grad_norm": 38.8708610534668, "learning_rate": 1.798810443360671e-09, "logits/chosen": -2.2456247806549072, "logits/rejected": -2.221320390701294, "logps/chosen": -184.22183227539062, "logps/rejected": -217.4356231689453, "loss": 0.575, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.2606128454208374, "rewards/margins": 0.3818413019180298, "rewards/rejected": -1.6424541473388672, "step": 16070 }, { "epoch": 2.770503101309442, "grad_norm": 32.86567306518555, "learning_rate": 1.7722613356970728e-09, "logits/chosen": -2.2950329780578613, "logits/rejected": -2.2494683265686035, "logps/chosen": -183.48300170898438, "logps/rejected": -220.6748504638672, "loss": 0.5709, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2529817819595337, "rewards/margins": 0.4442780613899231, "rewards/rejected": -1.6972599029541016, "step": 16080 }, { "epoch": 2.772226050999311, "grad_norm": 30.092023849487305, "learning_rate": 1.745906074775344e-09, "logits/chosen": -2.2925052642822266, "logits/rejected": -2.263424873352051, "logps/chosen": -170.57962036132812, "logps/rejected": -206.346435546875, "loss": 0.5744, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.1700583696365356, "rewards/margins": 0.38973119854927063, "rewards/rejected": -1.5597896575927734, "step": 16090 }, { "epoch": 2.77394900068918, "grad_norm": 34.294044494628906, "learning_rate": 1.7197447665279142e-09, "logits/chosen": -2.3059916496276855, "logits/rejected": -2.288022518157959, "logps/chosen": -178.9658203125, "logps/rejected": -226.75601196289062, "loss": 0.5548, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.2523266077041626, "rewards/margins": 0.47826337814331055, "rewards/rejected": -1.7305901050567627, "step": 16100 }, { "epoch": 2.775671950379049, "grad_norm": 46.52742385864258, "learning_rate": 1.6937775161076251e-09, "logits/chosen": -2.220905303955078, "logits/rejected": -2.183434009552002, "logps/chosen": -176.07647705078125, "logps/rejected": -209.58218383789062, "loss": 0.5872, "rewards/accuracies": 0.625, "rewards/chosen": -1.1807199716567993, "rewards/margins": 0.3810442090034485, "rewards/rejected": -1.561764121055603, "step": 16110 }, { "epoch": 2.777394900068918, "grad_norm": 44.02835464477539, "learning_rate": 1.6680044278873428e-09, "logits/chosen": -2.2952685356140137, "logits/rejected": -2.279714345932007, "logps/chosen": -170.16844177246094, "logps/rejected": -202.47323608398438, "loss": 0.6085, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.1918901205062866, "rewards/margins": 0.31227028369903564, "rewards/rejected": -1.5041604042053223, "step": 16120 }, { "epoch": 2.779117849758787, "grad_norm": 48.453269958496094, "learning_rate": 1.6424256054595187e-09, "logits/chosen": -2.2706055641174316, "logits/rejected": -2.2444052696228027, "logps/chosen": -183.84979248046875, "logps/rejected": -212.5630340576172, "loss": 0.6162, "rewards/accuracies": 0.65625, "rewards/chosen": -1.3051106929779053, "rewards/margins": 0.3154757618904114, "rewards/rejected": -1.6205863952636719, "step": 16130 }, { "epoch": 2.780840799448656, "grad_norm": 40.92231750488281, "learning_rate": 1.6170411516357563e-09, "logits/chosen": -2.348477363586426, "logits/rejected": -2.3189752101898193, "logps/chosen": -180.53762817382812, "logps/rejected": -211.91152954101562, "loss": 0.5974, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2678172588348389, "rewards/margins": 0.3369593024253845, "rewards/rejected": -1.604776382446289, "step": 16140 }, { "epoch": 2.782563749138525, "grad_norm": 43.04970932006836, "learning_rate": 1.5918511684464008e-09, "logits/chosen": -2.323899030685425, "logits/rejected": -2.3004534244537354, "logps/chosen": -183.69158935546875, "logps/rejected": -213.60726928710938, "loss": 0.6019, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.2804274559020996, "rewards/margins": 0.34259334206581116, "rewards/rejected": -1.6230207681655884, "step": 16150 }, { "epoch": 2.784286698828394, "grad_norm": 34.49961471557617, "learning_rate": 1.5668557571401786e-09, "logits/chosen": -2.3167967796325684, "logits/rejected": -2.284923791885376, "logps/chosen": -173.17575073242188, "logps/rejected": -217.793701171875, "loss": 0.5502, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.1839545965194702, "rewards/margins": 0.46423736214637756, "rewards/rejected": -1.648192048072815, "step": 16160 }, { "epoch": 2.786009648518263, "grad_norm": 25.138389587402344, "learning_rate": 1.5420550181837245e-09, "logits/chosen": -2.2304892539978027, "logits/rejected": -2.2073214054107666, "logps/chosen": -185.9856414794922, "logps/rejected": -214.74386596679688, "loss": 0.6033, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.314554214477539, "rewards/margins": 0.3014872372150421, "rewards/rejected": -1.6160414218902588, "step": 16170 }, { "epoch": 2.7877325982081325, "grad_norm": 38.96569061279297, "learning_rate": 1.517449051261227e-09, "logits/chosen": -2.2721707820892334, "logits/rejected": -2.2256643772125244, "logps/chosen": -189.69200134277344, "logps/rejected": -223.68936157226562, "loss": 0.5777, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.297239899635315, "rewards/margins": 0.4274531304836273, "rewards/rejected": -1.7246930599212646, "step": 16180 }, { "epoch": 2.7894555478980014, "grad_norm": 50.26418685913086, "learning_rate": 1.4930379552739791e-09, "logits/chosen": -2.2861552238464355, "logits/rejected": -2.2630105018615723, "logps/chosen": -181.56906127929688, "logps/rejected": -213.3578643798828, "loss": 0.6124, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.2681522369384766, "rewards/margins": 0.3266271948814392, "rewards/rejected": -1.594779372215271, "step": 16190 }, { "epoch": 2.7911784975878704, "grad_norm": 39.742286682128906, "learning_rate": 1.4688218283400334e-09, "logits/chosen": -2.2232251167297363, "logits/rejected": -2.1820454597473145, "logps/chosen": -180.40234375, "logps/rejected": -209.013671875, "loss": 0.6131, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.2425382137298584, "rewards/margins": 0.3541840612888336, "rewards/rejected": -1.5967223644256592, "step": 16200 }, { "epoch": 2.7929014472777394, "grad_norm": 33.191959381103516, "learning_rate": 1.4448007677937746e-09, "logits/chosen": -2.1963438987731934, "logits/rejected": -2.1801505088806152, "logps/chosen": -180.97879028320312, "logps/rejected": -206.3859405517578, "loss": 0.6406, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.2748857736587524, "rewards/margins": 0.2578073740005493, "rewards/rejected": -1.5326931476593018, "step": 16210 }, { "epoch": 2.794624396967609, "grad_norm": 40.69626235961914, "learning_rate": 1.420974870185543e-09, "logits/chosen": -2.2687435150146484, "logits/rejected": -2.2401299476623535, "logps/chosen": -174.5592041015625, "logps/rejected": -214.38064575195312, "loss": 0.5715, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.1669092178344727, "rewards/margins": 0.43398723006248474, "rewards/rejected": -1.6008965969085693, "step": 16220 }, { "epoch": 2.7963473466574778, "grad_norm": 32.2374267578125, "learning_rate": 1.3973442312812278e-09, "logits/chosen": -2.289696216583252, "logits/rejected": -2.2658562660217285, "logps/chosen": -178.87796020507812, "logps/rejected": -209.24978637695312, "loss": 0.6037, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.2666771411895752, "rewards/margins": 0.32003122568130493, "rewards/rejected": -1.5867083072662354, "step": 16230 }, { "epoch": 2.7980702963473467, "grad_norm": 37.53076171875, "learning_rate": 1.373908946061908e-09, "logits/chosen": -2.2701830863952637, "logits/rejected": -2.248612880706787, "logps/chosen": -183.1614227294922, "logps/rejected": -212.554443359375, "loss": 0.6157, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.298271656036377, "rewards/margins": 0.3004020154476166, "rewards/rejected": -1.598673701286316, "step": 16240 }, { "epoch": 2.7997932460372157, "grad_norm": 36.98796463012695, "learning_rate": 1.3506691087234457e-09, "logits/chosen": -2.2960731983184814, "logits/rejected": -2.2753283977508545, "logps/chosen": -180.3482208251953, "logps/rejected": -206.80184936523438, "loss": 0.6087, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.2228682041168213, "rewards/margins": 0.305896133184433, "rewards/rejected": -1.528764009475708, "step": 16250 }, { "epoch": 2.8015161957270847, "grad_norm": 47.55896759033203, "learning_rate": 1.3276248126761259e-09, "logits/chosen": -2.297947883605957, "logits/rejected": -2.271911144256592, "logps/chosen": -182.63546752929688, "logps/rejected": -222.87747192382812, "loss": 0.606, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.2915923595428467, "rewards/margins": 0.38956117630004883, "rewards/rejected": -1.6811535358428955, "step": 16260 }, { "epoch": 2.8032391454169536, "grad_norm": 49.676631927490234, "learning_rate": 1.304776150544279e-09, "logits/chosen": -2.3071579933166504, "logits/rejected": -2.2782936096191406, "logps/chosen": -178.2207794189453, "logps/rejected": -212.48013305664062, "loss": 0.5938, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.22140371799469, "rewards/margins": 0.3800586760044098, "rewards/rejected": -1.6014623641967773, "step": 16270 }, { "epoch": 2.804962095106823, "grad_norm": 34.81820297241211, "learning_rate": 1.2821232141658866e-09, "logits/chosen": -2.35308837890625, "logits/rejected": -2.3323071002960205, "logps/chosen": -188.4277801513672, "logps/rejected": -214.10342407226562, "loss": 0.612, "rewards/accuracies": 0.65625, "rewards/chosen": -1.313417673110962, "rewards/margins": 0.30872249603271484, "rewards/rejected": -1.6221401691436768, "step": 16280 }, { "epoch": 2.806685044796692, "grad_norm": 32.014060974121094, "learning_rate": 1.2596660945922433e-09, "logits/chosen": -2.307892084121704, "logits/rejected": -2.2928123474121094, "logps/chosen": -175.9683074951172, "logps/rejected": -210.37765502929688, "loss": 0.6093, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2256677150726318, "rewards/margins": 0.3405986428260803, "rewards/rejected": -1.5662662982940674, "step": 16290 }, { "epoch": 2.808407994486561, "grad_norm": 35.85638427734375, "learning_rate": 1.2374048820875893e-09, "logits/chosen": -2.3248770236968994, "logits/rejected": -2.2926807403564453, "logps/chosen": -183.6764678955078, "logps/rejected": -212.8525390625, "loss": 0.6077, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.287197232246399, "rewards/margins": 0.33820948004722595, "rewards/rejected": -1.6254066228866577, "step": 16300 }, { "epoch": 2.81013094417643, "grad_norm": 28.929685592651367, "learning_rate": 1.2153396661287007e-09, "logits/chosen": -2.282907485961914, "logits/rejected": -2.2625086307525635, "logps/chosen": -181.51144409179688, "logps/rejected": -209.85305786132812, "loss": 0.6353, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.277237892150879, "rewards/margins": 0.27741461992263794, "rewards/rejected": -1.5546525716781616, "step": 16310 }, { "epoch": 2.8118538938662994, "grad_norm": 38.36284255981445, "learning_rate": 1.1934705354045894e-09, "logits/chosen": -2.292874813079834, "logits/rejected": -2.273878574371338, "logps/chosen": -185.80093383789062, "logps/rejected": -218.79623413085938, "loss": 0.6061, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2972584962844849, "rewards/margins": 0.3475348949432373, "rewards/rejected": -1.6447932720184326, "step": 16320 }, { "epoch": 2.8135768435561683, "grad_norm": 28.26596450805664, "learning_rate": 1.1717975778161193e-09, "logits/chosen": -2.3105838298797607, "logits/rejected": -2.2864773273468018, "logps/chosen": -176.97756958007812, "logps/rejected": -203.34059143066406, "loss": 0.6141, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.1774392127990723, "rewards/margins": 0.32404693961143494, "rewards/rejected": -1.5014861822128296, "step": 16330 }, { "epoch": 2.8152997932460373, "grad_norm": 29.667314529418945, "learning_rate": 1.1503208804756526e-09, "logits/chosen": -2.237299919128418, "logits/rejected": -2.2079734802246094, "logps/chosen": -175.31661987304688, "logps/rejected": -215.7568817138672, "loss": 0.5493, "rewards/accuracies": 0.71875, "rewards/chosen": -1.1911835670471191, "rewards/margins": 0.452349990606308, "rewards/rejected": -1.643533706665039, "step": 16340 }, { "epoch": 2.8170227429359063, "grad_norm": 33.90262222290039, "learning_rate": 1.1290405297066984e-09, "logits/chosen": -2.371725559234619, "logits/rejected": -2.335756778717041, "logps/chosen": -177.57327270507812, "logps/rejected": -211.3468017578125, "loss": 0.5726, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.2037913799285889, "rewards/margins": 0.3730456829071045, "rewards/rejected": -1.5768373012542725, "step": 16350 }, { "epoch": 2.8187456926257752, "grad_norm": 41.114654541015625, "learning_rate": 1.1079566110435812e-09, "logits/chosen": -2.2963128089904785, "logits/rejected": -2.2677197456359863, "logps/chosen": -175.20028686523438, "logps/rejected": -209.4230194091797, "loss": 0.5819, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2134548425674438, "rewards/margins": 0.3761741518974304, "rewards/rejected": -1.589629054069519, "step": 16360 }, { "epoch": 2.820468642315644, "grad_norm": 38.21614074707031, "learning_rate": 1.0870692092310674e-09, "logits/chosen": -2.2704720497131348, "logits/rejected": -2.2500154972076416, "logps/chosen": -181.661865234375, "logps/rejected": -203.2222900390625, "loss": 0.6249, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.2675329446792603, "rewards/margins": 0.27036967873573303, "rewards/rejected": -1.537902593612671, "step": 16370 }, { "epoch": 2.822191592005513, "grad_norm": 34.16209411621094, "learning_rate": 1.0663784082240556e-09, "logits/chosen": -2.3108277320861816, "logits/rejected": -2.2971932888031006, "logps/chosen": -180.47698974609375, "logps/rejected": -210.71932983398438, "loss": 0.6062, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2505805492401123, "rewards/margins": 0.31283727288246155, "rewards/rejected": -1.5634177923202515, "step": 16380 }, { "epoch": 2.8239145416953826, "grad_norm": 33.69387435913086, "learning_rate": 1.0458842911872213e-09, "logits/chosen": -2.2244887351989746, "logits/rejected": -2.2039520740509033, "logps/chosen": -172.71177673339844, "logps/rejected": -205.65087890625, "loss": 0.607, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.1868815422058105, "rewards/margins": 0.3532125651836395, "rewards/rejected": -1.5400941371917725, "step": 16390 }, { "epoch": 2.8256374913852516, "grad_norm": 38.37247848510742, "learning_rate": 1.0255869404947049e-09, "logits/chosen": -2.233137845993042, "logits/rejected": -2.213040828704834, "logps/chosen": -178.09384155273438, "logps/rejected": -209.20669555664062, "loss": 0.5956, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.2155065536499023, "rewards/margins": 0.33726614713668823, "rewards/rejected": -1.5527725219726562, "step": 16400 }, { "epoch": 2.8256374913852516, "eval_logits/chosen": -2.3550028800964355, "eval_logits/rejected": -2.34291672706604, "eval_logps/chosen": -164.95201110839844, "eval_logps/rejected": -186.080322265625, "eval_loss": 0.650005042552948, "eval_rewards/accuracies": 0.600836455821991, "eval_rewards/chosen": -1.0593652725219727, "eval_rewards/margins": 0.17394186556339264, "eval_rewards/rejected": -1.2333072423934937, "eval_runtime": 385.0643, "eval_samples_per_second": 11.177, "eval_steps_per_second": 1.397, "step": 16400 }, { "epoch": 2.8273604410751205, "grad_norm": 32.81662368774414, "learning_rate": 1.0054864377297357e-09, "logits/chosen": -2.291510820388794, "logits/rejected": -2.250988245010376, "logps/chosen": -182.9836883544922, "logps/rejected": -213.5102081298828, "loss": 0.5815, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2685143947601318, "rewards/margins": 0.3915003836154938, "rewards/rejected": -1.6600148677825928, "step": 16410 }, { "epoch": 2.82908339076499, "grad_norm": 47.90050506591797, "learning_rate": 9.855828636843422e-10, "logits/chosen": -2.2053303718566895, "logits/rejected": -2.1868436336517334, "logps/chosen": -179.76307678222656, "logps/rejected": -211.9607391357422, "loss": 0.6076, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2450826168060303, "rewards/margins": 0.3121925890445709, "rewards/rejected": -1.5572751760482788, "step": 16420 }, { "epoch": 2.830806340454859, "grad_norm": 41.66067886352539, "learning_rate": 9.65876298359025e-10, "logits/chosen": -2.2113218307495117, "logits/rejected": -2.198329448699951, "logps/chosen": -182.64976501464844, "logps/rejected": -212.186767578125, "loss": 0.6203, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.2813270092010498, "rewards/margins": 0.3096092939376831, "rewards/rejected": -1.5909364223480225, "step": 16430 }, { "epoch": 2.832529290144728, "grad_norm": 33.65230941772461, "learning_rate": 9.463668209624298e-10, "logits/chosen": -2.2552592754364014, "logits/rejected": -2.2291042804718018, "logps/chosen": -178.43710327148438, "logps/rejected": -214.56546020507812, "loss": 0.5943, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.2642335891723633, "rewards/margins": 0.3713500201702118, "rewards/rejected": -1.6355836391448975, "step": 16440 }, { "epoch": 2.834252239834597, "grad_norm": 39.56161880493164, "learning_rate": 9.270545099110072e-10, "logits/chosen": -2.338459014892578, "logits/rejected": -2.306795597076416, "logps/chosen": -178.9355926513672, "logps/rejected": -217.2798614501953, "loss": 0.5548, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.2057013511657715, "rewards/margins": 0.4274584650993347, "rewards/rejected": -1.633159875869751, "step": 16450 }, { "epoch": 2.835975189524466, "grad_norm": 41.27180099487305, "learning_rate": 9.079394428287312e-10, "logits/chosen": -2.1934924125671387, "logits/rejected": -2.1793243885040283, "logps/chosen": -167.72926330566406, "logps/rejected": -211.9979248046875, "loss": 0.568, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1811243295669556, "rewards/margins": 0.4424045979976654, "rewards/rejected": -1.6235288381576538, "step": 16460 }, { "epoch": 2.837698139214335, "grad_norm": 39.97527313232422, "learning_rate": 8.890216965467656e-10, "logits/chosen": -2.308414936065674, "logits/rejected": -2.282804489135742, "logps/chosen": -175.1788330078125, "logps/rejected": -217.9228057861328, "loss": 0.5758, "rewards/accuracies": 0.75, "rewards/chosen": -1.2298270463943481, "rewards/margins": 0.43262115120887756, "rewards/rejected": -1.6624482870101929, "step": 16470 }, { "epoch": 2.8394210889042037, "grad_norm": 47.5313720703125, "learning_rate": 8.70301347103175e-10, "logits/chosen": -2.2051024436950684, "logits/rejected": -2.183053493499756, "logps/chosen": -177.04518127441406, "logps/rejected": -213.59140014648438, "loss": 0.5864, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.235124111175537, "rewards/margins": 0.36766180396080017, "rewards/rejected": -1.6027857065200806, "step": 16480 }, { "epoch": 2.841144038594073, "grad_norm": 47.61405563354492, "learning_rate": 8.517784697425978e-10, "logits/chosen": -2.287238121032715, "logits/rejected": -2.2698919773101807, "logps/chosen": -176.14376831054688, "logps/rejected": -194.83566284179688, "loss": 0.6508, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2334768772125244, "rewards/margins": 0.20505890250205994, "rewards/rejected": -1.4385356903076172, "step": 16490 }, { "epoch": 2.842866988283942, "grad_norm": 25.528099060058594, "learning_rate": 8.334531389159349e-10, "logits/chosen": -2.315481185913086, "logits/rejected": -2.267341136932373, "logps/chosen": -170.80941772460938, "logps/rejected": -191.43600463867188, "loss": 0.6026, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.1156132221221924, "rewards/margins": 0.30887675285339355, "rewards/rejected": -1.4244900941848755, "step": 16500 }, { "epoch": 2.844589937973811, "grad_norm": 52.03614044189453, "learning_rate": 8.153254282801114e-10, "logits/chosen": -2.3164989948272705, "logits/rejected": -2.295607328414917, "logps/chosen": -177.18972778320312, "logps/rejected": -206.91775512695312, "loss": 0.6078, "rewards/accuracies": 0.6875, "rewards/chosen": -1.194132924079895, "rewards/margins": 0.32859188318252563, "rewards/rejected": -1.5227246284484863, "step": 16510 }, { "epoch": 2.84631288766368, "grad_norm": 42.001678466796875, "learning_rate": 7.973954106976876e-10, "logits/chosen": -2.3597311973571777, "logits/rejected": -2.324655771255493, "logps/chosen": -184.51565551757812, "logps/rejected": -224.277587890625, "loss": 0.5683, "rewards/accuracies": 0.71875, "rewards/chosen": -1.2740962505340576, "rewards/margins": 0.4469670355319977, "rewards/rejected": -1.721063256263733, "step": 16520 }, { "epoch": 2.8480358373535495, "grad_norm": 51.226280212402344, "learning_rate": 7.796631582366486e-10, "logits/chosen": -2.2431588172912598, "logits/rejected": -2.224879026412964, "logps/chosen": -178.93814086914062, "logps/rejected": -197.71075439453125, "loss": 0.6314, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.221017599105835, "rewards/margins": 0.23768672347068787, "rewards/rejected": -1.4587042331695557, "step": 16530 }, { "epoch": 2.8497587870434185, "grad_norm": 34.07789993286133, "learning_rate": 7.621287421700762e-10, "logits/chosen": -2.267864227294922, "logits/rejected": -2.2442805767059326, "logps/chosen": -182.6355438232422, "logps/rejected": -209.2755126953125, "loss": 0.6169, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.291688323020935, "rewards/margins": 0.3040887713432312, "rewards/rejected": -1.5957772731781006, "step": 16540 }, { "epoch": 2.8514817367332874, "grad_norm": 27.323087692260742, "learning_rate": 7.447922329758605e-10, "logits/chosen": -2.252119302749634, "logits/rejected": -2.2273201942443848, "logps/chosen": -180.32655334472656, "logps/rejected": -219.2308349609375, "loss": 0.5787, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.2568960189819336, "rewards/margins": 0.40228986740112305, "rewards/rejected": -1.6591860055923462, "step": 16550 }, { "epoch": 2.8532046864231564, "grad_norm": 36.010589599609375, "learning_rate": 7.276537003364225e-10, "logits/chosen": -2.2651255130767822, "logits/rejected": -2.240661144256592, "logps/chosen": -180.301025390625, "logps/rejected": -216.3168182373047, "loss": 0.5982, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.248245358467102, "rewards/margins": 0.3549201488494873, "rewards/rejected": -1.603165626525879, "step": 16560 }, { "epoch": 2.8549276361130254, "grad_norm": 34.78297424316406, "learning_rate": 7.107132131384475e-10, "logits/chosen": -2.330124855041504, "logits/rejected": -2.2960896492004395, "logps/chosen": -176.1905975341797, "logps/rejected": -208.61962890625, "loss": 0.5842, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.2155747413635254, "rewards/margins": 0.363336980342865, "rewards/rejected": -1.5789117813110352, "step": 16570 }, { "epoch": 2.8566505858028943, "grad_norm": 40.64534378051758, "learning_rate": 6.939708394725907e-10, "logits/chosen": -2.247532367706299, "logits/rejected": -2.2236592769622803, "logps/chosen": -176.4193572998047, "logps/rejected": -212.16561889648438, "loss": 0.5932, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2451902627944946, "rewards/margins": 0.35713082551956177, "rewards/rejected": -1.6023210287094116, "step": 16580 }, { "epoch": 2.8583735354927637, "grad_norm": 40.58009338378906, "learning_rate": 6.774266466331946e-10, "logits/chosen": -2.2344393730163574, "logits/rejected": -2.205369710922241, "logps/chosen": -189.36399841308594, "logps/rejected": -212.946044921875, "loss": 0.6217, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.3462426662445068, "rewards/margins": 0.2884506285190582, "rewards/rejected": -1.6346931457519531, "step": 16590 }, { "epoch": 2.8600964851826327, "grad_norm": 35.32307815551758, "learning_rate": 6.610807011180552e-10, "logits/chosen": -2.2636892795562744, "logits/rejected": -2.2521121501922607, "logps/chosen": -187.82833862304688, "logps/rejected": -211.86575317382812, "loss": 0.6346, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.3014774322509766, "rewards/margins": 0.27267351746559143, "rewards/rejected": -1.5741510391235352, "step": 16600 }, { "epoch": 2.8618194348725017, "grad_norm": 30.843059539794922, "learning_rate": 6.449330686281285e-10, "logits/chosen": -2.257110118865967, "logits/rejected": -2.216294050216675, "logps/chosen": -186.468505859375, "logps/rejected": -221.2821807861328, "loss": 0.5793, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.3059688806533813, "rewards/margins": 0.38301438093185425, "rewards/rejected": -1.6889832019805908, "step": 16610 }, { "epoch": 2.8635423845623706, "grad_norm": 41.82108688354492, "learning_rate": 6.289838140672521e-10, "logits/chosen": -2.2378902435302734, "logits/rejected": -2.2138800621032715, "logps/chosen": -174.65951538085938, "logps/rejected": -203.35629272460938, "loss": 0.6086, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.171541452407837, "rewards/margins": 0.3206041753292084, "rewards/rejected": -1.4921454191207886, "step": 16620 }, { "epoch": 2.86526533425224, "grad_norm": 46.764320373535156, "learning_rate": 6.132330015419296e-10, "logits/chosen": -2.243197202682495, "logits/rejected": -2.206315755844116, "logps/chosen": -186.30581665039062, "logps/rejected": -213.79360961914062, "loss": 0.5886, "rewards/accuracies": 0.71875, "rewards/chosen": -1.2337594032287598, "rewards/margins": 0.3845231831073761, "rewards/rejected": -1.618282675743103, "step": 16630 }, { "epoch": 2.866988283942109, "grad_norm": 50.13645553588867, "learning_rate": 5.97680694361019e-10, "logits/chosen": -2.2617459297180176, "logits/rejected": -2.227799654006958, "logps/chosen": -190.19491577148438, "logps/rejected": -218.0207061767578, "loss": 0.6145, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.3346335887908936, "rewards/margins": 0.33890143036842346, "rewards/rejected": -1.6735349893569946, "step": 16640 }, { "epoch": 2.868711233631978, "grad_norm": 35.483299255371094, "learning_rate": 5.823269550355281e-10, "logits/chosen": -2.2349085807800293, "logits/rejected": -2.213834047317505, "logps/chosen": -171.34573364257812, "logps/rejected": -202.91136169433594, "loss": 0.5896, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.156395673751831, "rewards/margins": 0.36401891708374023, "rewards/rejected": -1.5204145908355713, "step": 16650 }, { "epoch": 2.870434183321847, "grad_norm": 41.430599212646484, "learning_rate": 5.671718452783247e-10, "logits/chosen": -2.2703566551208496, "logits/rejected": -2.253518581390381, "logps/chosen": -179.66993713378906, "logps/rejected": -211.7154541015625, "loss": 0.6102, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.2494404315948486, "rewards/margins": 0.33247634768486023, "rewards/rejected": -1.5819166898727417, "step": 16660 }, { "epoch": 2.872157133011716, "grad_norm": 31.6019229888916, "learning_rate": 5.522154260039158e-10, "logits/chosen": -2.2803568840026855, "logits/rejected": -2.261735439300537, "logps/chosen": -169.9952392578125, "logps/rejected": -211.58175659179688, "loss": 0.5666, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.168507695198059, "rewards/margins": 0.4273078441619873, "rewards/rejected": -1.5958155393600464, "step": 16670 }, { "epoch": 2.873880082701585, "grad_norm": 47.5838623046875, "learning_rate": 5.374577573281746e-10, "logits/chosen": -2.2111976146698, "logits/rejected": -2.194802761077881, "logps/chosen": -176.59658813476562, "logps/rejected": -202.6920166015625, "loss": 0.6284, "rewards/accuracies": 0.625, "rewards/chosen": -1.2688227891921997, "rewards/margins": 0.27188339829444885, "rewards/rejected": -1.5407060384750366, "step": 16680 }, { "epoch": 2.8756030323914543, "grad_norm": 35.962890625, "learning_rate": 5.228988985681416e-10, "logits/chosen": -2.2994630336761475, "logits/rejected": -2.2786083221435547, "logps/chosen": -176.1400146484375, "logps/rejected": -202.96847534179688, "loss": 0.6151, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.2068394422531128, "rewards/margins": 0.2947150766849518, "rewards/rejected": -1.5015544891357422, "step": 16690 }, { "epoch": 2.8773259820813233, "grad_norm": 43.47890090942383, "learning_rate": 5.085389082417291e-10, "logits/chosen": -2.362004280090332, "logits/rejected": -2.339451551437378, "logps/chosen": -183.06504821777344, "logps/rejected": -207.77261352539062, "loss": 0.6376, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2732914686203003, "rewards/margins": 0.27610525488853455, "rewards/rejected": -1.5493966341018677, "step": 16700 }, { "epoch": 2.8790489317711923, "grad_norm": 29.09917640686035, "learning_rate": 4.943778440675451e-10, "logits/chosen": -2.289375066757202, "logits/rejected": -2.261080026626587, "logps/chosen": -176.30088806152344, "logps/rejected": -209.833984375, "loss": 0.5974, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.241347312927246, "rewards/margins": 0.3593154549598694, "rewards/rejected": -1.6006628274917603, "step": 16710 }, { "epoch": 2.8807718814610612, "grad_norm": 48.67392349243164, "learning_rate": 4.804157629646144e-10, "logits/chosen": -2.251337766647339, "logits/rejected": -2.2181668281555176, "logps/chosen": -179.15492248535156, "logps/rejected": -207.46963500976562, "loss": 0.5897, "rewards/accuracies": 0.71875, "rewards/chosen": -1.2366948127746582, "rewards/margins": 0.3579455316066742, "rewards/rejected": -1.5946404933929443, "step": 16720 }, { "epoch": 2.8824948311509306, "grad_norm": 33.78643035888672, "learning_rate": 4.666527210521742e-10, "logits/chosen": -2.2321724891662598, "logits/rejected": -2.2046022415161133, "logps/chosen": -173.66099548339844, "logps/rejected": -217.14547729492188, "loss": 0.5569, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.211617112159729, "rewards/margins": 0.45391637086868286, "rewards/rejected": -1.665533423423767, "step": 16730 }, { "epoch": 2.8842177808407996, "grad_norm": 44.23442459106445, "learning_rate": 4.53088773649446e-10, "logits/chosen": -2.2257206439971924, "logits/rejected": -2.199591636657715, "logps/chosen": -181.7847900390625, "logps/rejected": -212.50894165039062, "loss": 0.6114, "rewards/accuracies": 0.65625, "rewards/chosen": -1.2515366077423096, "rewards/margins": 0.3494786322116852, "rewards/rejected": -1.601015329360962, "step": 16740 }, { "epoch": 2.8859407305306686, "grad_norm": 38.216522216796875, "learning_rate": 4.397239752754134e-10, "logits/chosen": -2.3091447353363037, "logits/rejected": -2.28430438041687, "logps/chosen": -178.2207489013672, "logps/rejected": -219.61129760742188, "loss": 0.571, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.232283353805542, "rewards/margins": 0.3908485472202301, "rewards/rejected": -1.6231319904327393, "step": 16750 }, { "epoch": 2.8876636802205375, "grad_norm": 42.792030334472656, "learning_rate": 4.265583796485783e-10, "logits/chosen": -2.2111003398895264, "logits/rejected": -2.182976245880127, "logps/chosen": -188.66864013671875, "logps/rejected": -222.423583984375, "loss": 0.5928, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.3284666538238525, "rewards/margins": 0.3791476786136627, "rewards/rejected": -1.707614541053772, "step": 16760 }, { "epoch": 2.8893866299104065, "grad_norm": 57.47098159790039, "learning_rate": 4.135920396867942e-10, "logits/chosen": -2.3359036445617676, "logits/rejected": -2.297128200531006, "logps/chosen": -181.53477478027344, "logps/rejected": -212.4428253173828, "loss": 0.5775, "rewards/accuracies": 0.71875, "rewards/chosen": -1.2404382228851318, "rewards/margins": 0.3955693244934082, "rewards/rejected": -1.6360076665878296, "step": 16770 }, { "epoch": 2.8911095796002755, "grad_norm": 30.25149154663086, "learning_rate": 4.0082500750701076e-10, "logits/chosen": -2.2654218673706055, "logits/rejected": -2.248629331588745, "logps/chosen": -175.7464141845703, "logps/rejected": -214.15731811523438, "loss": 0.5822, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2026407718658447, "rewards/margins": 0.36839795112609863, "rewards/rejected": -1.571038842201233, "step": 16780 }, { "epoch": 2.892832529290145, "grad_norm": 27.726421356201172, "learning_rate": 3.8825733442507947e-10, "logits/chosen": -2.284965753555298, "logits/rejected": -2.2358572483062744, "logps/chosen": -179.79421997070312, "logps/rejected": -203.9884033203125, "loss": 0.6165, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.2187730073928833, "rewards/margins": 0.32310670614242554, "rewards/rejected": -1.5418797731399536, "step": 16790 }, { "epoch": 2.894555478980014, "grad_norm": 70.15962219238281, "learning_rate": 3.75889070955554e-10, "logits/chosen": -2.3128035068511963, "logits/rejected": -2.2996985912323, "logps/chosen": -179.56320190429688, "logps/rejected": -210.6813201904297, "loss": 0.6221, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.2907289266586304, "rewards/margins": 0.3105553686618805, "rewards/rejected": -1.601284384727478, "step": 16800 }, { "epoch": 2.894555478980014, "eval_logits/chosen": -2.355128288269043, "eval_logits/rejected": -2.343019962310791, "eval_logps/chosen": -164.93360900878906, "eval_logps/rejected": -186.08456420898438, "eval_loss": 0.6498690247535706, "eval_rewards/accuracies": 0.6040892004966736, "eval_rewards/chosen": -1.0591812133789062, "eval_rewards/margins": 0.17416824400424957, "eval_rewards/rejected": -1.2333494424819946, "eval_runtime": 384.6915, "eval_samples_per_second": 11.188, "eval_steps_per_second": 1.399, "step": 16800 }, { "epoch": 2.896278428669883, "grad_norm": 34.69468688964844, "learning_rate": 3.6372026681146806e-10, "logits/chosen": -2.260162830352783, "logits/rejected": -2.2307093143463135, "logps/chosen": -168.69715881347656, "logps/rejected": -201.69468688964844, "loss": 0.5917, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.14708411693573, "rewards/margins": 0.3847166895866394, "rewards/rejected": -1.531800627708435, "step": 16810 }, { "epoch": 2.898001378359752, "grad_norm": 34.2429084777832, "learning_rate": 3.517509709041688e-10, "logits/chosen": -2.2920870780944824, "logits/rejected": -2.263136625289917, "logps/chosen": -183.0124969482422, "logps/rejected": -202.57705688476562, "loss": 0.6253, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.2682347297668457, "rewards/margins": 0.2431812733411789, "rewards/rejected": -1.5114161968231201, "step": 16820 }, { "epoch": 2.899724328049621, "grad_norm": 46.141563415527344, "learning_rate": 3.399812313430728e-10, "logits/chosen": -2.3478941917419434, "logits/rejected": -2.3191308975219727, "logps/chosen": -192.22817993164062, "logps/rejected": -230.08999633789062, "loss": 0.5932, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.3660786151885986, "rewards/margins": 0.3936581015586853, "rewards/rejected": -1.7597367763519287, "step": 16830 }, { "epoch": 2.90144727773949, "grad_norm": 37.20547866821289, "learning_rate": 3.284110954355157e-10, "logits/chosen": -2.2482810020446777, "logits/rejected": -2.215100049972534, "logps/chosen": -176.05003356933594, "logps/rejected": -210.0874786376953, "loss": 0.5841, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2200827598571777, "rewards/margins": 0.3753504753112793, "rewards/rejected": -1.5954333543777466, "step": 16840 }, { "epoch": 2.903170227429359, "grad_norm": 29.959806442260742, "learning_rate": 3.1704060968654746e-10, "logits/chosen": -2.29280161857605, "logits/rejected": -2.2742457389831543, "logps/chosen": -191.05917358398438, "logps/rejected": -208.31741333007812, "loss": 0.6377, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.345529317855835, "rewards/margins": 0.24640360474586487, "rewards/rejected": -1.5919328927993774, "step": 16850 }, { "epoch": 2.904893177119228, "grad_norm": 40.781044006347656, "learning_rate": 3.0586981979873747e-10, "logits/chosen": -2.2989661693573, "logits/rejected": -2.260462522506714, "logps/chosen": -183.75123596191406, "logps/rejected": -206.5366668701172, "loss": 0.627, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.2734284400939941, "rewards/margins": 0.291414350271225, "rewards/rejected": -1.564842939376831, "step": 16860 }, { "epoch": 2.906616126809097, "grad_norm": 44.73786544799805, "learning_rate": 2.9489877067199185e-10, "logits/chosen": -2.3094325065612793, "logits/rejected": -2.2826006412506104, "logps/chosen": -179.9336395263672, "logps/rejected": -205.8349609375, "loss": 0.6229, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.2519553899765015, "rewards/margins": 0.3051519989967346, "rewards/rejected": -1.5571073293685913, "step": 16870 }, { "epoch": 2.908339076498966, "grad_norm": 49.504329681396484, "learning_rate": 2.8412750640338654e-10, "logits/chosen": -2.326951265335083, "logits/rejected": -2.300234079360962, "logps/chosen": -173.57034301757812, "logps/rejected": -217.76400756835938, "loss": 0.5652, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.231156587600708, "rewards/margins": 0.4336206316947937, "rewards/rejected": -1.664777159690857, "step": 16880 }, { "epoch": 2.910062026188835, "grad_norm": 44.85762023925781, "learning_rate": 2.7355607028698437e-10, "logits/chosen": -2.258765935897827, "logits/rejected": -2.242250919342041, "logps/chosen": -179.78518676757812, "logps/rejected": -208.6576385498047, "loss": 0.6117, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2435871362686157, "rewards/margins": 0.31127530336380005, "rewards/rejected": -1.5548624992370605, "step": 16890 }, { "epoch": 2.9117849758787044, "grad_norm": 39.28889083862305, "learning_rate": 2.6318450481365164e-10, "logits/chosen": -2.3123373985290527, "logits/rejected": -2.2920501232147217, "logps/chosen": -174.97561645507812, "logps/rejected": -210.5087890625, "loss": 0.5735, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2110536098480225, "rewards/margins": 0.40395593643188477, "rewards/rejected": -1.6150096654891968, "step": 16900 }, { "epoch": 2.9135079255685734, "grad_norm": 30.90997314453125, "learning_rate": 2.5301285167088624e-10, "logits/chosen": -2.3353562355041504, "logits/rejected": -2.313014507293701, "logps/chosen": -180.96817016601562, "logps/rejected": -211.5290985107422, "loss": 0.6066, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2850770950317383, "rewards/margins": 0.2966782748699188, "rewards/rejected": -1.5817553997039795, "step": 16910 }, { "epoch": 2.9152308752584424, "grad_norm": 32.51064682006836, "learning_rate": 2.430411517426734e-10, "logits/chosen": -2.281481981277466, "logits/rejected": -2.2618112564086914, "logps/chosen": -176.6731719970703, "logps/rejected": -202.71807861328125, "loss": 0.6197, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.2086846828460693, "rewards/margins": 0.2883586585521698, "rewards/rejected": -1.497043490409851, "step": 16920 }, { "epoch": 2.9169538249483113, "grad_norm": 32.26247787475586, "learning_rate": 2.332694451092965e-10, "logits/chosen": -2.324796676635742, "logits/rejected": -2.3038277626037598, "logps/chosen": -171.7364959716797, "logps/rejected": -195.21363830566406, "loss": 0.6096, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.1428351402282715, "rewards/margins": 0.30366069078445435, "rewards/rejected": -1.446495771408081, "step": 16930 }, { "epoch": 2.9186767746381808, "grad_norm": 40.67167282104492, "learning_rate": 2.2369777104718768e-10, "logits/chosen": -2.2941434383392334, "logits/rejected": -2.272467613220215, "logps/chosen": -178.9009246826172, "logps/rejected": -210.223876953125, "loss": 0.5812, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1943846940994263, "rewards/margins": 0.3680204451084137, "rewards/rejected": -1.5624051094055176, "step": 16940 }, { "epoch": 2.9203997243280497, "grad_norm": 37.32505416870117, "learning_rate": 2.143261680287667e-10, "logits/chosen": -2.2652194499969482, "logits/rejected": -2.2491161823272705, "logps/chosen": -168.2947540283203, "logps/rejected": -213.27236938476562, "loss": 0.5776, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.1722462177276611, "rewards/margins": 0.417537122964859, "rewards/rejected": -1.5897831916809082, "step": 16950 }, { "epoch": 2.9221226740179187, "grad_norm": 36.005714416503906, "learning_rate": 2.051546737222909e-10, "logits/chosen": -2.263373851776123, "logits/rejected": -2.240389347076416, "logps/chosen": -183.56320190429688, "logps/rejected": -221.6046600341797, "loss": 0.5857, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.275015950202942, "rewards/margins": 0.39710181951522827, "rewards/rejected": -1.6721179485321045, "step": 16960 }, { "epoch": 2.9238456237077877, "grad_norm": 35.368194580078125, "learning_rate": 1.9618332499169442e-10, "logits/chosen": -2.3448400497436523, "logits/rejected": -2.311893939971924, "logps/chosen": -179.88156127929688, "logps/rejected": -215.29086303710938, "loss": 0.5784, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2340666055679321, "rewards/margins": 0.39124569296836853, "rewards/rejected": -1.6253122091293335, "step": 16970 }, { "epoch": 2.9255685733976566, "grad_norm": 29.743484497070312, "learning_rate": 1.8741215789644936e-10, "logits/chosen": -2.3190419673919678, "logits/rejected": -2.297215700149536, "logps/chosen": -174.7429656982422, "logps/rejected": -202.0531463623047, "loss": 0.6004, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.1987437009811401, "rewards/margins": 0.3031441569328308, "rewards/rejected": -1.5018876791000366, "step": 16980 }, { "epoch": 2.9272915230875256, "grad_norm": 31.945310592651367, "learning_rate": 1.7884120769141032e-10, "logits/chosen": -2.3586273193359375, "logits/rejected": -2.3274989128112793, "logps/chosen": -165.72108459472656, "logps/rejected": -206.23239135742188, "loss": 0.5635, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.1223981380462646, "rewards/margins": 0.4226682782173157, "rewards/rejected": -1.545066237449646, "step": 16990 }, { "epoch": 2.929014472777395, "grad_norm": 40.08998489379883, "learning_rate": 1.7047050882669223e-10, "logits/chosen": -2.355996608734131, "logits/rejected": -2.332170009613037, "logps/chosen": -175.21693420410156, "logps/rejected": -202.94973754882812, "loss": 0.6114, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.1910465955734253, "rewards/margins": 0.30955350399017334, "rewards/rejected": -1.5006000995635986, "step": 17000 }, { "epoch": 2.930737422467264, "grad_norm": 28.414857864379883, "learning_rate": 1.623000949475095e-10, "logits/chosen": -2.266655445098877, "logits/rejected": -2.245577096939087, "logps/chosen": -188.32498168945312, "logps/rejected": -222.77828979492188, "loss": 0.6018, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.345150351524353, "rewards/margins": 0.33940568566322327, "rewards/rejected": -1.684556245803833, "step": 17010 }, { "epoch": 2.932460372157133, "grad_norm": 46.6025276184082, "learning_rate": 1.5432999889404274e-10, "logits/chosen": -2.264037847518921, "logits/rejected": -2.262972593307495, "logps/chosen": -181.83615112304688, "logps/rejected": -202.73126220703125, "loss": 0.6667, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2782509326934814, "rewards/margins": 0.22515583038330078, "rewards/rejected": -1.5034067630767822, "step": 17020 }, { "epoch": 2.934183321847002, "grad_norm": 42.2696647644043, "learning_rate": 1.4656025270133876e-10, "logits/chosen": -2.3105950355529785, "logits/rejected": -2.2809667587280273, "logps/chosen": -182.8857421875, "logps/rejected": -206.82546997070312, "loss": 0.6195, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.2540470361709595, "rewards/margins": 0.2889506220817566, "rewards/rejected": -1.5429977178573608, "step": 17030 }, { "epoch": 2.9359062715368713, "grad_norm": 35.74762725830078, "learning_rate": 1.3899088759913302e-10, "logits/chosen": -2.341332197189331, "logits/rejected": -2.3059799671173096, "logps/chosen": -180.80311584472656, "logps/rejected": -213.46328735351562, "loss": 0.5792, "rewards/accuracies": 0.71875, "rewards/chosen": -1.2436562776565552, "rewards/margins": 0.38733428716659546, "rewards/rejected": -1.6309906244277954, "step": 17040 }, { "epoch": 2.9376292212267403, "grad_norm": 40.4786262512207, "learning_rate": 1.316219340117608e-10, "logits/chosen": -2.2711122035980225, "logits/rejected": -2.2497386932373047, "logps/chosen": -182.53744506835938, "logps/rejected": -209.8964385986328, "loss": 0.6232, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.2701737880706787, "rewards/margins": 0.2986606955528259, "rewards/rejected": -1.5688344240188599, "step": 17050 }, { "epoch": 2.9393521709166093, "grad_norm": 33.46916961669922, "learning_rate": 1.2445342155801842e-10, "logits/chosen": -2.325920581817627, "logits/rejected": -2.294126272201538, "logps/chosen": -183.05918884277344, "logps/rejected": -211.62661743164062, "loss": 0.629, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.2804826498031616, "rewards/margins": 0.29142576456069946, "rewards/rejected": -1.5719083547592163, "step": 17060 }, { "epoch": 2.9410751206064782, "grad_norm": 50.899085998535156, "learning_rate": 1.1748537905105217e-10, "logits/chosen": -2.289062976837158, "logits/rejected": -2.2559592723846436, "logps/chosen": -185.1095428466797, "logps/rejected": -210.09213256835938, "loss": 0.6258, "rewards/accuracies": 0.6875, "rewards/chosen": -1.290991187095642, "rewards/margins": 0.2987528145313263, "rewards/rejected": -1.5897438526153564, "step": 17070 }, { "epoch": 2.942798070296347, "grad_norm": 33.59029769897461, "learning_rate": 1.1071783449823624e-10, "logits/chosen": -2.2389519214630127, "logits/rejected": -2.20768666267395, "logps/chosen": -176.70346069335938, "logps/rejected": -223.4357147216797, "loss": 0.5514, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.2281625270843506, "rewards/margins": 0.4805348515510559, "rewards/rejected": -1.7086973190307617, "step": 17080 }, { "epoch": 2.944521019986216, "grad_norm": 31.26056671142578, "learning_rate": 1.0415081510106172e-10, "logits/chosen": -2.288973331451416, "logits/rejected": -2.2687900066375732, "logps/chosen": -177.4471893310547, "logps/rejected": -215.8143310546875, "loss": 0.5844, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.2379207611083984, "rewards/margins": 0.3961821496486664, "rewards/rejected": -1.6341030597686768, "step": 17090 }, { "epoch": 2.9462439696760856, "grad_norm": 34.02079772949219, "learning_rate": 9.778434725503105e-11, "logits/chosen": -2.2463839054107666, "logits/rejected": -2.2128617763519287, "logps/chosen": -183.8589630126953, "logps/rejected": -215.9231719970703, "loss": 0.5856, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2714749574661255, "rewards/margins": 0.3676893413066864, "rewards/rejected": -1.6391643285751343, "step": 17100 }, { "epoch": 2.9479669193659546, "grad_norm": 32.88734817504883, "learning_rate": 9.161845654954703e-11, "logits/chosen": -2.2507004737854004, "logits/rejected": -2.238379716873169, "logps/chosen": -195.37942504882812, "logps/rejected": -225.2381591796875, "loss": 0.6197, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.3950939178466797, "rewards/margins": 0.32841694355010986, "rewards/rejected": -1.723510980606079, "step": 17110 }, { "epoch": 2.9496898690558235, "grad_norm": 37.77426528930664, "learning_rate": 8.565316776780739e-11, "logits/chosen": -2.3413896560668945, "logits/rejected": -2.30137300491333, "logps/chosen": -176.1731414794922, "logps/rejected": -216.9041748046875, "loss": 0.5323, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.2267433404922485, "rewards/margins": 0.46331986784935, "rewards/rejected": -1.690063238143921, "step": 17120 }, { "epoch": 2.9514128187456925, "grad_norm": 40.9581413269043, "learning_rate": 7.988850488672705e-11, "logits/chosen": -2.2292721271514893, "logits/rejected": -2.1956279277801514, "logps/chosen": -183.31048583984375, "logps/rejected": -217.1750030517578, "loss": 0.5832, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.2756314277648926, "rewards/margins": 0.36559030413627625, "rewards/rejected": -1.6412217617034912, "step": 17130 }, { "epoch": 2.953135768435562, "grad_norm": 32.19672775268555, "learning_rate": 7.432449107679928e-11, "logits/chosen": -2.234701633453369, "logits/rejected": -2.2178635597229004, "logps/chosen": -174.7184295654297, "logps/rejected": -208.18563842773438, "loss": 0.5853, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.1894357204437256, "rewards/margins": 0.3720587193965912, "rewards/rejected": -1.5614944696426392, "step": 17140 }, { "epoch": 2.954858718125431, "grad_norm": 42.60933303833008, "learning_rate": 6.896114870204583e-11, "logits/chosen": -2.339540958404541, "logits/rejected": -2.3109090328216553, "logps/chosen": -184.4068603515625, "logps/rejected": -211.8623504638672, "loss": 0.6216, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2877216339111328, "rewards/margins": 0.31411486864089966, "rewards/rejected": -1.6018365621566772, "step": 17150 }, { "epoch": 2.9565816678153, "grad_norm": 29.023967742919922, "learning_rate": 6.379849931990034e-11, "logits/chosen": -2.338467597961426, "logits/rejected": -2.3185648918151855, "logps/chosen": -172.90652465820312, "logps/rejected": -225.8275604248047, "loss": 0.5377, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.2059366703033447, "rewards/margins": 0.539142370223999, "rewards/rejected": -1.7450790405273438, "step": 17160 }, { "epoch": 2.958304617505169, "grad_norm": 29.838197708129883, "learning_rate": 5.883656368114164e-11, "logits/chosen": -2.3640499114990234, "logits/rejected": -2.339353561401367, "logps/chosen": -194.65211486816406, "logps/rejected": -218.7022705078125, "loss": 0.6556, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.3461074829101562, "rewards/margins": 0.27067288756370544, "rewards/rejected": -1.6167805194854736, "step": 17170 }, { "epoch": 2.960027567195038, "grad_norm": 39.5936279296875, "learning_rate": 5.407536172978844e-11, "logits/chosen": -2.311352491378784, "logits/rejected": -2.289877414703369, "logps/chosen": -179.19358825683594, "logps/rejected": -213.298583984375, "loss": 0.605, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.2765612602233887, "rewards/margins": 0.34954506158828735, "rewards/rejected": -1.6261062622070312, "step": 17180 }, { "epoch": 2.9617505168849068, "grad_norm": 37.97072219848633, "learning_rate": 4.951491260302698e-11, "logits/chosen": -2.2549374103546143, "logits/rejected": -2.238830327987671, "logps/chosen": -174.09535217285156, "logps/rejected": -204.37635803222656, "loss": 0.606, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.2118583917617798, "rewards/margins": 0.31206992268562317, "rewards/rejected": -1.523928165435791, "step": 17190 }, { "epoch": 2.963473466574776, "grad_norm": 59.260013580322266, "learning_rate": 4.515523463115012e-11, "logits/chosen": -2.312384605407715, "logits/rejected": -2.298529624938965, "logps/chosen": -172.6897430419922, "logps/rejected": -202.76080322265625, "loss": 0.6096, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1744720935821533, "rewards/margins": 0.3171740174293518, "rewards/rejected": -1.4916460514068604, "step": 17200 }, { "epoch": 2.963473466574776, "eval_logits/chosen": -2.3549132347106934, "eval_logits/rejected": -2.342874765396118, "eval_logps/chosen": -164.96139526367188, "eval_logps/rejected": -186.09048461914062, "eval_loss": 0.650018572807312, "eval_rewards/accuracies": 0.6045538783073425, "eval_rewards/chosen": -1.0594593286514282, "eval_rewards/margins": 0.1739494800567627, "eval_rewards/rejected": -1.2334089279174805, "eval_runtime": 384.5491, "eval_samples_per_second": 11.192, "eval_steps_per_second": 1.399, "step": 17200 }, { "epoch": 2.965196416264645, "grad_norm": 41.53389358520508, "learning_rate": 4.099634533745733e-11, "logits/chosen": -2.2500195503234863, "logits/rejected": -2.231739044189453, "logps/chosen": -182.5985107421875, "logps/rejected": -217.8794708251953, "loss": 0.6055, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.2984097003936768, "rewards/margins": 0.3526950180530548, "rewards/rejected": -1.6511045694351196, "step": 17210 }, { "epoch": 2.966919365954514, "grad_norm": 27.601787567138672, "learning_rate": 3.7038261438204765e-11, "logits/chosen": -2.289471387863159, "logits/rejected": -2.269460678100586, "logps/chosen": -182.10348510742188, "logps/rejected": -233.2564239501953, "loss": 0.5604, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2569857835769653, "rewards/margins": 0.51573246717453, "rewards/rejected": -1.7727181911468506, "step": 17220 }, { "epoch": 2.968642315644383, "grad_norm": 33.00059127807617, "learning_rate": 3.3280998842527554e-11, "logits/chosen": -2.2976396083831787, "logits/rejected": -2.2868614196777344, "logps/chosen": -178.9959259033203, "logps/rejected": -215.16806030273438, "loss": 0.6023, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2521908283233643, "rewards/margins": 0.3561028242111206, "rewards/rejected": -1.6082935333251953, "step": 17230 }, { "epoch": 2.9703652653342525, "grad_norm": 34.8737907409668, "learning_rate": 2.972457265237871e-11, "logits/chosen": -2.2938859462738037, "logits/rejected": -2.2738187313079834, "logps/chosen": -169.763427734375, "logps/rejected": -201.87765502929688, "loss": 0.5938, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.1499814987182617, "rewards/margins": 0.326345831155777, "rewards/rejected": -1.4763273000717163, "step": 17240 }, { "epoch": 2.9720882150241215, "grad_norm": 35.13932800292969, "learning_rate": 2.6368997162479202e-11, "logits/chosen": -2.272411823272705, "logits/rejected": -2.2502591609954834, "logps/chosen": -176.018310546875, "logps/rejected": -217.7426300048828, "loss": 0.5582, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.2261368036270142, "rewards/margins": 0.4333557188510895, "rewards/rejected": -1.6594922542572021, "step": 17250 }, { "epoch": 2.9738111647139904, "grad_norm": 42.27689743041992, "learning_rate": 2.321428586022911e-11, "logits/chosen": -2.2786245346069336, "logits/rejected": -2.2477970123291016, "logps/chosen": -174.90060424804688, "logps/rejected": -213.00875854492188, "loss": 0.567, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.1894375085830688, "rewards/margins": 0.4287734031677246, "rewards/rejected": -1.618210792541504, "step": 17260 }, { "epoch": 2.9755341144038594, "grad_norm": 40.46282958984375, "learning_rate": 2.0260451425690994e-11, "logits/chosen": -2.2892489433288574, "logits/rejected": -2.260471820831299, "logps/chosen": -185.9062042236328, "logps/rejected": -212.2075653076172, "loss": 0.6475, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.2983613014221191, "rewards/margins": 0.28522801399230957, "rewards/rejected": -1.5835893154144287, "step": 17270 }, { "epoch": 2.9772570640937284, "grad_norm": 32.98335647583008, "learning_rate": 1.7507505731523266e-11, "logits/chosen": -2.2438535690307617, "logits/rejected": -2.2073891162872314, "logps/chosen": -176.59791564941406, "logps/rejected": -214.287109375, "loss": 0.5699, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.2137374877929688, "rewards/margins": 0.4162743091583252, "rewards/rejected": -1.6300119161605835, "step": 17280 }, { "epoch": 2.9789800137835973, "grad_norm": 29.91464614868164, "learning_rate": 1.4955459842913576e-11, "logits/chosen": -2.3043110370635986, "logits/rejected": -2.2792234420776367, "logps/chosen": -183.82244873046875, "logps/rejected": -214.74813842773438, "loss": 0.5871, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.2343107461929321, "rewards/margins": 0.3532152771949768, "rewards/rejected": -1.5875260829925537, "step": 17290 }, { "epoch": 2.9807029634734663, "grad_norm": 56.498451232910156, "learning_rate": 1.2604324017573276e-11, "logits/chosen": -2.3131518363952637, "logits/rejected": -2.297375440597534, "logps/chosen": -183.20748901367188, "logps/rejected": -206.3766632080078, "loss": 0.6271, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.265207052230835, "rewards/margins": 0.28281646966934204, "rewards/rejected": -1.5480234622955322, "step": 17300 }, { "epoch": 2.9824259131633357, "grad_norm": 45.164241790771484, "learning_rate": 1.0454107705665238e-11, "logits/chosen": -2.3076233863830566, "logits/rejected": -2.2815582752227783, "logps/chosen": -180.4513397216797, "logps/rejected": -215.8129119873047, "loss": 0.582, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.2481348514556885, "rewards/margins": 0.3904893100261688, "rewards/rejected": -1.6386241912841797, "step": 17310 }, { "epoch": 2.9841488628532047, "grad_norm": 39.80954360961914, "learning_rate": 8.504819549770559e-12, "logits/chosen": -2.3062615394592285, "logits/rejected": -2.284247398376465, "logps/chosen": -181.01290893554688, "logps/rejected": -204.4337921142578, "loss": 0.6206, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.2549407482147217, "rewards/margins": 0.27550196647644043, "rewards/rejected": -1.5304428339004517, "step": 17320 }, { "epoch": 2.9858718125430737, "grad_norm": 45.717803955078125, "learning_rate": 6.7564673848719e-12, "logits/chosen": -2.255821943283081, "logits/rejected": -2.2316794395446777, "logps/chosen": -180.52059936523438, "logps/rejected": -203.00363159179688, "loss": 0.6294, "rewards/accuracies": 0.625, "rewards/chosen": -1.237790822982788, "rewards/margins": 0.28286388516426086, "rewards/rejected": -1.5206546783447266, "step": 17330 }, { "epoch": 2.987594762232943, "grad_norm": 26.13838768005371, "learning_rate": 5.2090582382924295e-12, "logits/chosen": -2.31354022026062, "logits/rejected": -2.2810709476470947, "logps/chosen": -174.29434204101562, "logps/rejected": -211.77145385742188, "loss": 0.5811, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.2048596143722534, "rewards/margins": 0.3996252119541168, "rewards/rejected": -1.6044849157333374, "step": 17340 }, { "epoch": 2.989317711922812, "grad_norm": 34.028167724609375, "learning_rate": 3.8625983297069234e-12, "logits/chosen": -2.320079803466797, "logits/rejected": -2.2873146533966064, "logps/chosen": -171.4598846435547, "logps/rejected": -200.25634765625, "loss": 0.597, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.1712242364883423, "rewards/margins": 0.32024580240249634, "rewards/rejected": -1.4914699792861938, "step": 17350 }, { "epoch": 2.991040661612681, "grad_norm": 43.01903533935547, "learning_rate": 2.7170930710695983e-12, "logits/chosen": -2.3500120639801025, "logits/rejected": -2.3285441398620605, "logps/chosen": -177.98741149902344, "logps/rejected": -217.0200653076172, "loss": 0.589, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.227912187576294, "rewards/margins": 0.400569349527359, "rewards/rejected": -1.6284816265106201, "step": 17360 }, { "epoch": 2.99276361130255, "grad_norm": 38.99733352661133, "learning_rate": 1.7725470666363208e-12, "logits/chosen": -2.285942792892456, "logits/rejected": -2.2686188220977783, "logps/chosen": -180.58023071289062, "logps/rejected": -207.6968231201172, "loss": 0.6286, "rewards/accuracies": 0.65625, "rewards/chosen": -1.2742855548858643, "rewards/margins": 0.26797252893447876, "rewards/rejected": -1.5422580242156982, "step": 17370 }, { "epoch": 2.994486560992419, "grad_norm": 54.05454635620117, "learning_rate": 1.0289641129146431e-12, "logits/chosen": -2.268444776535034, "logits/rejected": -2.254495620727539, "logps/chosen": -182.00112915039062, "logps/rejected": -209.5324249267578, "loss": 0.6173, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.284103512763977, "rewards/margins": 0.27302271127700806, "rewards/rejected": -1.5571261644363403, "step": 17380 }, { "epoch": 2.996209510682288, "grad_norm": 41.577232360839844, "learning_rate": 4.863471986693568e-13, "logits/chosen": -2.2996535301208496, "logits/rejected": -2.279010057449341, "logps/chosen": -180.03443908691406, "logps/rejected": -210.28857421875, "loss": 0.6006, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.2460159063339233, "rewards/margins": 0.3277485966682434, "rewards/rejected": -1.5737645626068115, "step": 17390 }, { "epoch": 2.997932460372157, "grad_norm": 30.558481216430664, "learning_rate": 1.4469850488918467e-13, "logits/chosen": -2.2963578701019287, "logits/rejected": -2.2678630352020264, "logps/chosen": -181.9180450439453, "logps/rejected": -216.5481719970703, "loss": 0.585, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.2563611268997192, "rewards/margins": 0.4204491078853607, "rewards/rejected": -1.6768100261688232, "step": 17400 }, { "epoch": 2.9996554100620263, "grad_norm": 52.21319580078125, "learning_rate": 4.019404797883652e-15, "logits/chosen": -2.2540431022644043, "logits/rejected": -2.228407621383667, "logps/chosen": -172.69688415527344, "logps/rejected": -216.9202880859375, "loss": 0.5487, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.1783651113510132, "rewards/margins": 0.4475262761116028, "rewards/rejected": -1.6258913278579712, "step": 17410 }, { "epoch": 3.0, "step": 17412, "total_flos": 0.0, "train_loss": 0.6300815686244623, "train_runtime": 86182.1351, "train_samples_per_second": 3.232, "train_steps_per_second": 0.202 } ], "logging_steps": 10, "max_steps": 17412, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 400, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }