diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,7474 @@ +{ + "best_metric": 0.573898196220398, + "best_model_checkpoint": "data/tinyllama_moe_dpo_ultrafeedback_v2_epochs5/checkpoint-3300", + "epoch": 4.998953427524856, + "eval_steps": 100, + "global_step": 4775, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 5.208333333333333e-09, + "logits/chosen": -2.7229816913604736, + "logits/rejected": -2.704376220703125, + "logps/chosen": -295.48358154296875, + "logps/rejected": -277.29522705078125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.01, + "learning_rate": 5.208333333333333e-08, + "logits/chosen": -2.7768375873565674, + "logits/rejected": -2.6537435054779053, + "logps/chosen": -356.50335693359375, + "logps/rejected": -288.44366455078125, + "loss": 0.6934, + "rewards/accuracies": 0.4236111044883728, + "rewards/chosen": -0.0006966523360460997, + "rewards/margins": -0.0007656050729565322, + "rewards/rejected": 6.895273691043258e-05, + "step": 10 + }, + { + "epoch": 0.02, + "learning_rate": 1.0416666666666667e-07, + "logits/chosen": -2.7214996814727783, + "logits/rejected": -2.6908183097839355, + "logps/chosen": -313.5826721191406, + "logps/rejected": -281.9164733886719, + "loss": 0.6931, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": 0.00022428599186241627, + "rewards/margins": 0.000738097180146724, + "rewards/rejected": -0.0005138111882843077, + "step": 20 + }, + { + "epoch": 0.03, + "learning_rate": 1.5624999999999999e-07, + "logits/chosen": -2.7791266441345215, + "logits/rejected": -2.7023978233337402, + "logps/chosen": -346.8282165527344, + "logps/rejected": -305.5320739746094, + "loss": 0.693, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.00040574674494564533, + "rewards/margins": -0.00035077956272289157, + "rewards/rejected": 0.0007565263076685369, + "step": 30 + }, + { + "epoch": 0.04, + "learning_rate": 2.0833333333333333e-07, + "logits/chosen": -2.7579116821289062, + "logits/rejected": -2.6938705444335938, + "logps/chosen": -336.7049865722656, + "logps/rejected": -282.226806640625, + "loss": 0.6935, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -4.12855988543015e-05, + "rewards/margins": 0.0006455664406530559, + "rewards/rejected": -0.0006868520868010819, + "step": 40 + }, + { + "epoch": 0.05, + "learning_rate": 2.604166666666667e-07, + "logits/chosen": -2.773176670074463, + "logits/rejected": -2.7138824462890625, + "logps/chosen": -352.06036376953125, + "logps/rejected": -314.73699951171875, + "loss": 0.6927, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": 0.001099282642826438, + "rewards/margins": 0.0008529500337317586, + "rewards/rejected": 0.0002463326381985098, + "step": 50 + }, + { + "epoch": 0.06, + "learning_rate": 3.1249999999999997e-07, + "logits/chosen": -2.752551555633545, + "logits/rejected": -2.667330265045166, + "logps/chosen": -353.0582275390625, + "logps/rejected": -323.7419738769531, + "loss": 0.6928, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": 0.0003327417653053999, + "rewards/margins": 5.731172677769791e-06, + "rewards/rejected": 0.000327010580804199, + "step": 60 + }, + { + "epoch": 0.07, + "learning_rate": 3.645833333333333e-07, + "logits/chosen": -2.8295810222625732, + "logits/rejected": -2.751282215118408, + "logps/chosen": -387.351318359375, + "logps/rejected": -340.2878112792969, + "loss": 0.6928, + "rewards/accuracies": 0.46875, + "rewards/chosen": 0.0006954811397008598, + "rewards/margins": -0.0006631066789850593, + "rewards/rejected": 0.00135858787689358, + "step": 70 + }, + { + "epoch": 0.08, + "learning_rate": 4.1666666666666667e-07, + "logits/chosen": -2.6580421924591064, + "logits/rejected": -2.5816047191619873, + "logps/chosen": -359.86114501953125, + "logps/rejected": -300.00640869140625, + "loss": 0.6926, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.0022001974284648895, + "rewards/margins": 0.0019074224401265383, + "rewards/rejected": 0.00029277493013069034, + "step": 80 + }, + { + "epoch": 0.09, + "learning_rate": 4.6874999999999996e-07, + "logits/chosen": -2.7795090675354004, + "logits/rejected": -2.7111852169036865, + "logps/chosen": -353.28106689453125, + "logps/rejected": -316.5885314941406, + "loss": 0.6918, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.003978191874921322, + "rewards/margins": 0.004179838579148054, + "rewards/rejected": -0.00020164628222119063, + "step": 90 + }, + { + "epoch": 0.1, + "learning_rate": 4.999990983803055e-07, + "logits/chosen": -2.7545604705810547, + "logits/rejected": -2.7175841331481934, + "logps/chosen": -371.634765625, + "logps/rejected": -347.4078674316406, + "loss": 0.6913, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.0036698828916996717, + "rewards/margins": 0.0029316016007214785, + "rewards/rejected": 0.0007382815820164979, + "step": 100 + }, + { + "epoch": 0.1, + "eval_logits/chosen": -2.7888541221618652, + "eval_logits/rejected": -2.717860698699951, + "eval_logps/chosen": -348.8463134765625, + "eval_logps/rejected": -307.7887268066406, + "eval_loss": 0.6915069818496704, + "eval_rewards/accuracies": 0.601190447807312, + "eval_rewards/chosen": 0.005134147591888905, + "eval_rewards/margins": 0.004059688653796911, + "eval_rewards/rejected": 0.0010744588216766715, + "eval_runtime": 351.1264, + "eval_samples_per_second": 5.696, + "eval_steps_per_second": 0.179, + "step": 100 + }, + { + "epoch": 0.12, + "learning_rate": 4.999889552334295e-07, + "logits/chosen": -2.7624781131744385, + "logits/rejected": -2.6426429748535156, + "logps/chosen": -319.4280700683594, + "logps/rejected": -255.9808807373047, + "loss": 0.691, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.003979907371103764, + "rewards/margins": 0.0030905543826520443, + "rewards/rejected": 0.0008893535705283284, + "step": 110 + }, + { + "epoch": 0.13, + "learning_rate": 4.999675423738452e-07, + "logits/chosen": -2.739222764968872, + "logits/rejected": -2.634364128112793, + "logps/chosen": -365.7749938964844, + "logps/rejected": -293.59381103515625, + "loss": 0.6903, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.008735042065382004, + "rewards/margins": 0.008576452732086182, + "rewards/rejected": 0.00015858971164561808, + "step": 120 + }, + { + "epoch": 0.14, + "learning_rate": 4.999348607668623e-07, + "logits/chosen": -2.7971653938293457, + "logits/rejected": -2.7021219730377197, + "logps/chosen": -385.40155029296875, + "logps/rejected": -314.69622802734375, + "loss": 0.6895, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.009917149320244789, + "rewards/margins": 0.00689274538308382, + "rewards/rejected": 0.0030244034714996815, + "step": 130 + }, + { + "epoch": 0.15, + "learning_rate": 4.998909118857952e-07, + "logits/chosen": -2.7118449211120605, + "logits/rejected": -2.6747097969055176, + "logps/chosen": -291.15789794921875, + "logps/rejected": -265.5232849121094, + "loss": 0.6892, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.010748682543635368, + "rewards/margins": 0.0077992090955376625, + "rewards/rejected": 0.002949473215267062, + "step": 140 + }, + { + "epoch": 0.16, + "learning_rate": 4.998356977118967e-07, + "logits/chosen": -2.7854163646698, + "logits/rejected": -2.7411043643951416, + "logps/chosen": -341.4150695800781, + "logps/rejected": -339.93988037109375, + "loss": 0.6895, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.008866357617080212, + "rewards/margins": 0.002817091066390276, + "rewards/rejected": 0.006049267947673798, + "step": 150 + }, + { + "epoch": 0.17, + "learning_rate": 4.99769220734268e-07, + "logits/chosen": -2.793144464492798, + "logits/rejected": -2.7030389308929443, + "logps/chosen": -357.8925476074219, + "logps/rejected": -336.72650146484375, + "loss": 0.6872, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": 0.014735780656337738, + "rewards/margins": 0.01035328023135662, + "rewards/rejected": 0.004382501356303692, + "step": 160 + }, + { + "epoch": 0.18, + "learning_rate": 4.996914839497473e-07, + "logits/chosen": -2.7929883003234863, + "logits/rejected": -2.7215566635131836, + "logps/chosen": -330.4803161621094, + "logps/rejected": -284.5147705078125, + "loss": 0.6865, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": 0.0183271374553442, + "rewards/margins": 0.013379251584410667, + "rewards/rejected": 0.004947885405272245, + "step": 170 + }, + { + "epoch": 0.19, + "learning_rate": 4.996024908627745e-07, + "logits/chosen": -2.7179646492004395, + "logits/rejected": -2.629631280899048, + "logps/chosen": -302.9635009765625, + "logps/rejected": -271.3373107910156, + "loss": 0.6859, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": 0.01472156960517168, + "rewards/margins": 0.016796987503767014, + "rewards/rejected": -0.0020754183642566204, + "step": 180 + }, + { + "epoch": 0.2, + "learning_rate": 4.995022454852327e-07, + "logits/chosen": -2.793166160583496, + "logits/rejected": -2.6981942653656006, + "logps/chosen": -346.3775939941406, + "logps/rejected": -303.22491455078125, + "loss": 0.6852, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0220264233648777, + "rewards/margins": 0.01690804772078991, + "rewards/rejected": 0.005118372850120068, + "step": 190 + }, + { + "epoch": 0.21, + "learning_rate": 4.993907523362682e-07, + "logits/chosen": -2.7156126499176025, + "logits/rejected": -2.6670401096343994, + "logps/chosen": -347.8214416503906, + "logps/rejected": -319.7621154785156, + "loss": 0.6848, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.02535415254533291, + "rewards/margins": 0.01954091526567936, + "rewards/rejected": 0.005813241004943848, + "step": 200 + }, + { + "epoch": 0.21, + "eval_logits/chosen": -2.778569221496582, + "eval_logits/rejected": -2.706406593322754, + "eval_logps/chosen": -347.1147766113281, + "eval_logps/rejected": -307.7813720703125, + "eval_loss": 0.6843611001968384, + "eval_rewards/accuracies": 0.6547619104385376, + "eval_rewards/chosen": 0.022449664771556854, + "eval_rewards/margins": 0.02130187302827835, + "eval_rewards/rejected": 0.0011477925581857562, + "eval_runtime": 354.9846, + "eval_samples_per_second": 5.634, + "eval_steps_per_second": 0.177, + "step": 200 + }, + { + "epoch": 0.22, + "learning_rate": 4.992680164420859e-07, + "logits/chosen": -2.7765281200408936, + "logits/rejected": -2.670767068862915, + "logps/chosen": -370.70208740234375, + "logps/rejected": -300.8094177246094, + "loss": 0.6842, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": 0.022380836308002472, + "rewards/margins": 0.020519474521279335, + "rewards/rejected": 0.001861358410678804, + "step": 210 + }, + { + "epoch": 0.23, + "learning_rate": 4.991340433357235e-07, + "logits/chosen": -2.776369571685791, + "logits/rejected": -2.6916940212249756, + "logps/chosen": -353.1146545410156, + "logps/rejected": -323.1269226074219, + "loss": 0.6831, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.027633443474769592, + "rewards/margins": 0.02584686316549778, + "rewards/rejected": 0.0017865825211629272, + "step": 220 + }, + { + "epoch": 0.24, + "learning_rate": 4.989888390568014e-07, + "logits/chosen": -2.739046812057495, + "logits/rejected": -2.6546576023101807, + "logps/chosen": -353.8674621582031, + "logps/rejected": -290.97503662109375, + "loss": 0.6788, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": 0.023385953158140182, + "rewards/margins": 0.026143008843064308, + "rewards/rejected": -0.002757056849077344, + "step": 230 + }, + { + "epoch": 0.25, + "learning_rate": 4.988324101512507e-07, + "logits/chosen": -2.730693817138672, + "logits/rejected": -2.6349058151245117, + "logps/chosen": -338.3262634277344, + "logps/rejected": -270.018798828125, + "loss": 0.6805, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": 0.027155738323926926, + "rewards/margins": 0.025723371654748917, + "rewards/rejected": 0.0014323694631457329, + "step": 240 + }, + { + "epoch": 0.26, + "learning_rate": 4.986647636710183e-07, + "logits/chosen": -2.7343668937683105, + "logits/rejected": -2.7023162841796875, + "logps/chosen": -322.39031982421875, + "logps/rejected": -322.66693115234375, + "loss": 0.6811, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.02547827921807766, + "rewards/margins": 0.026273246854543686, + "rewards/rejected": -0.0007949693244881928, + "step": 250 + }, + { + "epoch": 0.27, + "learning_rate": 4.984859071737489e-07, + "logits/chosen": -2.7031362056732178, + "logits/rejected": -2.6224429607391357, + "logps/chosen": -345.26470947265625, + "logps/rejected": -309.10003662109375, + "loss": 0.679, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.025715002790093422, + "rewards/margins": 0.036292947828769684, + "rewards/rejected": -0.010577939450740814, + "step": 260 + }, + { + "epoch": 0.28, + "learning_rate": 4.982958487224441e-07, + "logits/chosen": -2.809894323348999, + "logits/rejected": -2.717299699783325, + "logps/chosen": -356.2657165527344, + "logps/rejected": -297.43341064453125, + "loss": 0.6773, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.03299617022275925, + "rewards/margins": 0.04462386667728424, + "rewards/rejected": -0.011627699248492718, + "step": 270 + }, + { + "epoch": 0.29, + "learning_rate": 4.980945968850989e-07, + "logits/chosen": -2.7708637714385986, + "logits/rejected": -2.7318742275238037, + "logps/chosen": -355.09332275390625, + "logps/rejected": -334.1681213378906, + "loss": 0.6789, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.022588271647691727, + "rewards/margins": 0.02767338789999485, + "rewards/rejected": -0.0050851134583354, + "step": 280 + }, + { + "epoch": 0.3, + "learning_rate": 4.978821607343156e-07, + "logits/chosen": -2.7207131385803223, + "logits/rejected": -2.6686415672302246, + "logps/chosen": -339.83685302734375, + "logps/rejected": -300.10791015625, + "loss": 0.6748, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.024562764912843704, + "rewards/margins": 0.03406853228807449, + "rewards/rejected": -0.009505772963166237, + "step": 290 + }, + { + "epoch": 0.31, + "learning_rate": 4.976585498468949e-07, + "logits/chosen": -2.750870704650879, + "logits/rejected": -2.6027140617370605, + "logps/chosen": -343.34881591796875, + "logps/rejected": -281.9125671386719, + "loss": 0.6719, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.02178148925304413, + "rewards/margins": 0.04298964887857437, + "rewards/rejected": -0.021208161488175392, + "step": 300 + }, + { + "epoch": 0.31, + "eval_logits/chosen": -2.756394147872925, + "eval_logits/rejected": -2.682809829711914, + "eval_logps/chosen": -347.1925964355469, + "eval_logps/rejected": -310.327392578125, + "eval_loss": 0.6745370030403137, + "eval_rewards/accuracies": 0.6567460298538208, + "eval_rewards/chosen": 0.021671386435627937, + "eval_rewards/margins": 0.04598393663764, + "eval_rewards/rejected": -0.024312546476721764, + "eval_runtime": 370.5051, + "eval_samples_per_second": 5.398, + "eval_steps_per_second": 0.17, + "step": 300 + }, + { + "epoch": 0.32, + "learning_rate": 4.974237743034041e-07, + "logits/chosen": -2.6656975746154785, + "logits/rejected": -2.628554582595825, + "logps/chosen": -343.7931213378906, + "logps/rejected": -320.90826416015625, + "loss": 0.6684, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": 0.0219754446297884, + "rewards/margins": 0.06229216977953911, + "rewards/rejected": -0.04031673073768616, + "step": 310 + }, + { + "epoch": 0.33, + "learning_rate": 4.971778446877224e-07, + "logits/chosen": -2.688197374343872, + "logits/rejected": -2.6364893913269043, + "logps/chosen": -334.7703857421875, + "logps/rejected": -318.79986572265625, + "loss": 0.6706, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.02248135581612587, + "rewards/margins": 0.0489434115588665, + "rewards/rejected": -0.02646205946803093, + "step": 320 + }, + { + "epoch": 0.35, + "learning_rate": 4.96920772086564e-07, + "logits/chosen": -2.6598384380340576, + "logits/rejected": -2.589719533920288, + "logps/chosen": -335.1375427246094, + "logps/rejected": -277.85833740234375, + "loss": 0.6743, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.025626670569181442, + "rewards/margins": 0.04879312217235565, + "rewards/rejected": -0.023166455328464508, + "step": 330 + }, + { + "epoch": 0.36, + "learning_rate": 4.966525680889785e-07, + "logits/chosen": -2.6839308738708496, + "logits/rejected": -2.6098990440368652, + "logps/chosen": -296.1307067871094, + "logps/rejected": -268.22125244140625, + "loss": 0.674, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.0010839557508006692, + "rewards/margins": 0.031559232622385025, + "rewards/rejected": -0.032643191516399384, + "step": 340 + }, + { + "epoch": 0.37, + "learning_rate": 4.963732447858279e-07, + "logits/chosen": -2.65653133392334, + "logits/rejected": -2.650408983230591, + "logps/chosen": -334.28076171875, + "logps/rejected": -332.80877685546875, + "loss": 0.6724, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0002481082337908447, + "rewards/margins": 0.046056605875492096, + "rewards/rejected": -0.046304717659950256, + "step": 350 + }, + { + "epoch": 0.38, + "learning_rate": 4.960828147692421e-07, + "logits/chosen": -2.7294137477874756, + "logits/rejected": -2.6597273349761963, + "logps/chosen": -334.6825256347656, + "logps/rejected": -288.56488037109375, + "loss": 0.6662, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.012845727615058422, + "rewards/margins": 0.04838230460882187, + "rewards/rejected": -0.06122802942991257, + "step": 360 + }, + { + "epoch": 0.39, + "learning_rate": 4.957812911320509e-07, + "logits/chosen": -2.6296286582946777, + "logits/rejected": -2.6109249591827393, + "logps/chosen": -287.88055419921875, + "logps/rejected": -300.3601989746094, + "loss": 0.6645, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.01973501779139042, + "rewards/margins": 0.057510875165462494, + "rewards/rejected": -0.07724590599536896, + "step": 370 + }, + { + "epoch": 0.4, + "learning_rate": 4.95468687467194e-07, + "logits/chosen": -2.7505180835723877, + "logits/rejected": -2.6816720962524414, + "logps/chosen": -361.86358642578125, + "logps/rejected": -319.72576904296875, + "loss": 0.6666, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.012716621160507202, + "rewards/margins": 0.06772245466709137, + "rewards/rejected": -0.08043907582759857, + "step": 380 + }, + { + "epoch": 0.41, + "learning_rate": 4.951450178671078e-07, + "logits/chosen": -2.6552157402038574, + "logits/rejected": -2.5710456371307373, + "logps/chosen": -332.8970031738281, + "logps/rejected": -284.53131103515625, + "loss": 0.6676, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.0116237448528409, + "rewards/margins": 0.06472723931074142, + "rewards/rejected": -0.07635099440813065, + "step": 390 + }, + { + "epoch": 0.42, + "learning_rate": 4.948102969230907e-07, + "logits/chosen": -2.7454886436462402, + "logits/rejected": -2.6737310886383057, + "logps/chosen": -372.53106689453125, + "logps/rejected": -322.9483337402344, + "loss": 0.6593, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.01839020662009716, + "rewards/margins": 0.07976034283638, + "rewards/rejected": -0.09815056622028351, + "step": 400 + }, + { + "epoch": 0.42, + "eval_logits/chosen": -2.7167913913726807, + "eval_logits/rejected": -2.641690731048584, + "eval_logps/chosen": -351.2079162597656, + "eval_logps/rejected": -317.75079345703125, + "eval_loss": 0.662617564201355, + "eval_rewards/accuracies": 0.6626983880996704, + "eval_rewards/chosen": -0.01848192885518074, + "eval_rewards/margins": 0.08006466180086136, + "eval_rewards/rejected": -0.0985465869307518, + "eval_runtime": 329.4056, + "eval_samples_per_second": 6.072, + "eval_steps_per_second": 0.191, + "step": 400 + }, + { + "epoch": 0.43, + "learning_rate": 4.944645397246446e-07, + "logits/chosen": -2.7801225185394287, + "logits/rejected": -2.722992420196533, + "logps/chosen": -375.0428771972656, + "logps/rejected": -349.60369873046875, + "loss": 0.6599, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.00478482898324728, + "rewards/margins": 0.08646519482135773, + "rewards/rejected": -0.09125002473592758, + "step": 410 + }, + { + "epoch": 0.44, + "learning_rate": 4.941077618587955e-07, + "logits/chosen": -2.634456157684326, + "logits/rejected": -2.5576937198638916, + "logps/chosen": -313.3975524902344, + "logps/rejected": -267.78460693359375, + "loss": 0.6589, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.017164412885904312, + "rewards/margins": 0.07831953465938568, + "rewards/rejected": -0.0954839438199997, + "step": 420 + }, + { + "epoch": 0.45, + "learning_rate": 4.937399794093903e-07, + "logits/chosen": -2.6605842113494873, + "logits/rejected": -2.618790864944458, + "logps/chosen": -318.903076171875, + "logps/rejected": -288.6401062011719, + "loss": 0.6616, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.028123896569013596, + "rewards/margins": 0.06676146388053894, + "rewards/rejected": -0.09488535672426224, + "step": 430 + }, + { + "epoch": 0.46, + "learning_rate": 4.933612089563714e-07, + "logits/chosen": -2.6676137447357178, + "logits/rejected": -2.6490044593811035, + "logps/chosen": -319.513671875, + "logps/rejected": -300.4092712402344, + "loss": 0.6587, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.04882526397705078, + "rewards/margins": 0.0546044185757637, + "rewards/rejected": -0.10342969000339508, + "step": 440 + }, + { + "epoch": 0.47, + "learning_rate": 4.929714675750299e-07, + "logits/chosen": -2.5612893104553223, + "logits/rejected": -2.5102691650390625, + "logps/chosen": -322.2162170410156, + "logps/rejected": -295.4486389160156, + "loss": 0.6549, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.023067612200975418, + "rewards/margins": 0.1121089830994606, + "rewards/rejected": -0.13517656922340393, + "step": 450 + }, + { + "epoch": 0.48, + "learning_rate": 4.925707728352358e-07, + "logits/chosen": -2.659719467163086, + "logits/rejected": -2.5714974403381348, + "logps/chosen": -328.37091064453125, + "logps/rejected": -311.16217041015625, + "loss": 0.6502, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.04032892733812332, + "rewards/margins": 0.08182945102453232, + "rewards/rejected": -0.12215838581323624, + "step": 460 + }, + { + "epoch": 0.49, + "learning_rate": 4.921591428006456e-07, + "logits/chosen": -2.680175304412842, + "logits/rejected": -2.578962564468384, + "logps/chosen": -373.35870361328125, + "logps/rejected": -318.6756896972656, + "loss": 0.6436, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.03967723622918129, + "rewards/margins": 0.15787221491336823, + "rewards/rejected": -0.19754944741725922, + "step": 470 + }, + { + "epoch": 0.5, + "learning_rate": 4.917365960278877e-07, + "logits/chosen": -2.5912580490112305, + "logits/rejected": -2.5477182865142822, + "logps/chosen": -288.45233154296875, + "logps/rejected": -286.4162292480469, + "loss": 0.6651, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.09010852873325348, + "rewards/margins": 0.052035313099622726, + "rewards/rejected": -0.14214381575584412, + "step": 480 + }, + { + "epoch": 0.51, + "learning_rate": 4.913031515657269e-07, + "logits/chosen": -2.668935775756836, + "logits/rejected": -2.566549777984619, + "logps/chosen": -343.8060607910156, + "logps/rejected": -311.8091735839844, + "loss": 0.6491, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.08070842921733856, + "rewards/margins": 0.10430131107568741, + "rewards/rejected": -0.18500974774360657, + "step": 490 + }, + { + "epoch": 0.52, + "learning_rate": 4.908588289542046e-07, + "logits/chosen": -2.633600950241089, + "logits/rejected": -2.5761375427246094, + "logps/chosen": -332.14471435546875, + "logps/rejected": -312.8584289550781, + "loss": 0.6489, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.08002828061580658, + "rewards/margins": 0.12527289986610413, + "rewards/rejected": -0.2053011953830719, + "step": 500 + }, + { + "epoch": 0.52, + "eval_logits/chosen": -2.676621913909912, + "eval_logits/rejected": -2.599640130996704, + "eval_logps/chosen": -359.7169494628906, + "eval_logps/rejected": -330.5643615722656, + "eval_loss": 0.6502917408943176, + "eval_rewards/accuracies": 0.6666666865348816, + "eval_rewards/chosen": -0.10357183963060379, + "eval_rewards/margins": 0.12311027199029922, + "eval_rewards/rejected": -0.22668209671974182, + "eval_runtime": 372.9183, + "eval_samples_per_second": 5.363, + "eval_steps_per_second": 0.169, + "step": 500 + }, + { + "epoch": 0.53, + "learning_rate": 4.904036482237585e-07, + "logits/chosen": -2.6458828449249268, + "logits/rejected": -2.524355411529541, + "logps/chosen": -375.0708923339844, + "logps/rejected": -317.0387878417969, + "loss": 0.6448, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.09278295934200287, + "rewards/margins": 0.160991370677948, + "rewards/rejected": -0.2537743151187897, + "step": 510 + }, + { + "epoch": 0.54, + "learning_rate": 4.899376298943193e-07, + "logits/chosen": -2.5954272747039795, + "logits/rejected": -2.545722484588623, + "logps/chosen": -318.3174133300781, + "logps/rejected": -312.9372863769531, + "loss": 0.6473, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.09085050970315933, + "rewards/margins": 0.15199792385101318, + "rewards/rejected": -0.2428484410047531, + "step": 520 + }, + { + "epoch": 0.55, + "learning_rate": 4.894607949743861e-07, + "logits/chosen": -2.581209182739258, + "logits/rejected": -2.5345587730407715, + "logps/chosen": -355.0510559082031, + "logps/rejected": -327.82257080078125, + "loss": 0.6446, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.11026673018932343, + "rewards/margins": 0.12424556910991669, + "rewards/rejected": -0.2345122992992401, + "step": 530 + }, + { + "epoch": 0.57, + "learning_rate": 4.889731649600786e-07, + "logits/chosen": -2.6255240440368652, + "logits/rejected": -2.5667052268981934, + "logps/chosen": -375.1460876464844, + "logps/rejected": -374.27880859375, + "loss": 0.6423, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.12368792295455933, + "rewards/margins": 0.1251518726348877, + "rewards/rejected": -0.24883978068828583, + "step": 540 + }, + { + "epoch": 0.58, + "learning_rate": 4.884747618341686e-07, + "logits/chosen": -2.600996971130371, + "logits/rejected": -2.514336585998535, + "logps/chosen": -343.29559326171875, + "logps/rejected": -321.33917236328125, + "loss": 0.651, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.1610172837972641, + "rewards/margins": 0.13775303959846497, + "rewards/rejected": -0.29877036809921265, + "step": 550 + }, + { + "epoch": 0.59, + "learning_rate": 4.879656080650891e-07, + "logits/chosen": -2.6180787086486816, + "logits/rejected": -2.528000831604004, + "logps/chosen": -340.5784606933594, + "logps/rejected": -307.54510498046875, + "loss": 0.6388, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.15956440567970276, + "rewards/margins": 0.16902579367160797, + "rewards/rejected": -0.3285902142524719, + "step": 560 + }, + { + "epoch": 0.6, + "learning_rate": 4.874457266059209e-07, + "logits/chosen": -2.612618923187256, + "logits/rejected": -2.5180106163024902, + "logps/chosen": -358.2880859375, + "logps/rejected": -335.1981506347656, + "loss": 0.6475, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.17318308353424072, + "rewards/margins": 0.12875264883041382, + "rewards/rejected": -0.30193573236465454, + "step": 570 + }, + { + "epoch": 0.61, + "learning_rate": 4.869151408933583e-07, + "logits/chosen": -2.545635223388672, + "logits/rejected": -2.465250253677368, + "logps/chosen": -351.6809997558594, + "logps/rejected": -309.2189636230469, + "loss": 0.642, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2154681235551834, + "rewards/margins": 0.11899904906749725, + "rewards/rejected": -0.3344671428203583, + "step": 580 + }, + { + "epoch": 0.62, + "learning_rate": 4.863738748466519e-07, + "logits/chosen": -2.6205108165740967, + "logits/rejected": -2.5699057579040527, + "logps/chosen": -340.952392578125, + "logps/rejected": -325.1321105957031, + "loss": 0.6416, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.12603029608726501, + "rewards/margins": 0.10710600763559341, + "rewards/rejected": -0.23313629627227783, + "step": 590 + }, + { + "epoch": 0.63, + "learning_rate": 4.858219528665313e-07, + "logits/chosen": -2.610783815383911, + "logits/rejected": -2.5357155799865723, + "logps/chosen": -409.34844970703125, + "logps/rejected": -393.9020080566406, + "loss": 0.6442, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.11347125470638275, + "rewards/margins": 0.1751668006181717, + "rewards/rejected": -0.28863808512687683, + "step": 600 + }, + { + "epoch": 0.63, + "eval_logits/chosen": -2.6208555698394775, + "eval_logits/rejected": -2.541459321975708, + "eval_logps/chosen": -364.4345397949219, + "eval_logps/rejected": -339.30987548828125, + "eval_loss": 0.6407224535942078, + "eval_rewards/accuracies": 0.6805555820465088, + "eval_rewards/chosen": -0.15074825286865234, + "eval_rewards/margins": 0.16338865458965302, + "eval_rewards/rejected": -0.31413692235946655, + "eval_runtime": 378.0153, + "eval_samples_per_second": 5.291, + "eval_steps_per_second": 0.167, + "step": 600 + }, + { + "epoch": 0.64, + "learning_rate": 4.852593998341043e-07, + "logits/chosen": -2.625915288925171, + "logits/rejected": -2.523160457611084, + "logps/chosen": -351.0770568847656, + "logps/rejected": -295.78399658203125, + "loss": 0.6338, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.14120660722255707, + "rewards/margins": 0.1647595465183258, + "rewards/rejected": -0.30596619844436646, + "step": 610 + }, + { + "epoch": 0.65, + "learning_rate": 4.846862411097354e-07, + "logits/chosen": -2.6131348609924316, + "logits/rejected": -2.516840696334839, + "logps/chosen": -360.5911865234375, + "logps/rejected": -314.7618103027344, + "loss": 0.6325, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.21129730343818665, + "rewards/margins": 0.1473945826292038, + "rewards/rejected": -0.35869190096855164, + "step": 620 + }, + { + "epoch": 0.66, + "learning_rate": 4.841025025319029e-07, + "logits/chosen": -2.4459609985351562, + "logits/rejected": -2.3932125568389893, + "logps/chosen": -338.36541748046875, + "logps/rejected": -334.18218994140625, + "loss": 0.6301, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.1767289787530899, + "rewards/margins": 0.17090250551700592, + "rewards/rejected": -0.3476315140724182, + "step": 630 + }, + { + "epoch": 0.67, + "learning_rate": 4.835082104160337e-07, + "logits/chosen": -2.5294649600982666, + "logits/rejected": -2.4497077465057373, + "logps/chosen": -345.24273681640625, + "logps/rejected": -330.613037109375, + "loss": 0.6319, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.17142672836780548, + "rewards/margins": 0.1845071017742157, + "rewards/rejected": -0.3559338450431824, + "step": 640 + }, + { + "epoch": 0.68, + "learning_rate": 4.829033915533171e-07, + "logits/chosen": -2.647000312805176, + "logits/rejected": -2.5052175521850586, + "logps/chosen": -399.4244689941406, + "logps/rejected": -361.20965576171875, + "loss": 0.622, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.17484009265899658, + "rewards/margins": 0.23640482127666473, + "rewards/rejected": -0.4112449288368225, + "step": 650 + }, + { + "epoch": 0.69, + "learning_rate": 4.822880732094967e-07, + "logits/chosen": -2.6102538108825684, + "logits/rejected": -2.5597729682922363, + "logps/chosen": -377.75408935546875, + "logps/rejected": -357.68804931640625, + "loss": 0.6272, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.183266744017601, + "rewards/margins": 0.19729962944984436, + "rewards/rejected": -0.38056638836860657, + "step": 660 + }, + { + "epoch": 0.7, + "learning_rate": 4.81662283123642e-07, + "logits/chosen": -2.5665385723114014, + "logits/rejected": -2.529106855392456, + "logps/chosen": -362.9713134765625, + "logps/rejected": -356.2767333984375, + "loss": 0.6291, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.19505253434181213, + "rewards/margins": 0.19748732447624207, + "rewards/rejected": -0.3925398290157318, + "step": 670 + }, + { + "epoch": 0.71, + "learning_rate": 4.810260495068973e-07, + "logits/chosen": -2.485835313796997, + "logits/rejected": -2.4558098316192627, + "logps/chosen": -342.9903564453125, + "logps/rejected": -319.09075927734375, + "loss": 0.6333, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.22894680500030518, + "rewards/margins": 0.08809840679168701, + "rewards/rejected": -0.3170451819896698, + "step": 680 + }, + { + "epoch": 0.72, + "learning_rate": 4.8037940104121e-07, + "logits/chosen": -2.5049188137054443, + "logits/rejected": -2.4172816276550293, + "logps/chosen": -346.4105529785156, + "logps/rejected": -328.77423095703125, + "loss": 0.6336, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2281329184770584, + "rewards/margins": 0.15840637683868408, + "rewards/rejected": -0.3865392804145813, + "step": 690 + }, + { + "epoch": 0.73, + "learning_rate": 4.797223668780377e-07, + "logits/chosen": -2.5285067558288574, + "logits/rejected": -2.4108242988586426, + "logps/chosen": -332.65069580078125, + "logps/rejected": -327.6502990722656, + "loss": 0.6271, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.2695836126804352, + "rewards/margins": 0.19101601839065552, + "rewards/rejected": -0.4605995714664459, + "step": 700 + }, + { + "epoch": 0.73, + "eval_logits/chosen": -2.5658488273620605, + "eval_logits/rejected": -2.4835686683654785, + "eval_logps/chosen": -373.3324279785156, + "eval_logps/rejected": -352.50689697265625, + "eval_loss": 0.6320837140083313, + "eval_rewards/accuracies": 0.6765872836112976, + "eval_rewards/chosen": -0.23972678184509277, + "eval_rewards/margins": 0.20638057589530945, + "eval_rewards/rejected": -0.4461073875427246, + "eval_runtime": 360.7065, + "eval_samples_per_second": 5.545, + "eval_steps_per_second": 0.175, + "step": 700 + }, + { + "epoch": 0.74, + "learning_rate": 4.79054976637034e-07, + "logits/chosen": -2.5603199005126953, + "logits/rejected": -2.4289393424987793, + "logps/chosen": -392.89959716796875, + "logps/rejected": -323.50262451171875, + "loss": 0.6176, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.23820409178733826, + "rewards/margins": 0.20737656950950623, + "rewards/rejected": -0.4455806612968445, + "step": 710 + }, + { + "epoch": 0.75, + "learning_rate": 4.783772604047133e-07, + "logits/chosen": -2.5404629707336426, + "logits/rejected": -2.4736697673797607, + "logps/chosen": -374.0919189453125, + "logps/rejected": -350.94293212890625, + "loss": 0.6356, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.23239894211292267, + "rewards/margins": 0.18589885532855988, + "rewards/rejected": -0.41829776763916016, + "step": 720 + }, + { + "epoch": 0.76, + "learning_rate": 4.776892487330943e-07, + "logits/chosen": -2.53133225440979, + "logits/rejected": -2.422051191329956, + "logps/chosen": -380.31622314453125, + "logps/rejected": -339.6204528808594, + "loss": 0.6308, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.18756382167339325, + "rewards/margins": 0.2003902941942215, + "rewards/rejected": -0.38795414566993713, + "step": 730 + }, + { + "epoch": 0.77, + "learning_rate": 4.769909726383226e-07, + "logits/chosen": -2.5187153816223145, + "logits/rejected": -2.3927228450775146, + "logps/chosen": -406.51263427734375, + "logps/rejected": -342.09661865234375, + "loss": 0.6223, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.15098270773887634, + "rewards/margins": 0.2193053960800171, + "rewards/rejected": -0.37028807401657104, + "step": 740 + }, + { + "epoch": 0.78, + "learning_rate": 4.762824635992729e-07, + "logits/chosen": -2.530505657196045, + "logits/rejected": -2.4965600967407227, + "logps/chosen": -370.498779296875, + "logps/rejected": -370.7012634277344, + "loss": 0.6209, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.261788547039032, + "rewards/margins": 0.18216492235660553, + "rewards/rejected": -0.4439534544944763, + "step": 750 + }, + { + "epoch": 0.8, + "learning_rate": 4.755637535561297e-07, + "logits/chosen": -2.459725856781006, + "logits/rejected": -2.4072091579437256, + "logps/chosen": -365.5503845214844, + "logps/rejected": -354.66497802734375, + "loss": 0.6126, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.37326163053512573, + "rewards/margins": 0.21484375, + "rewards/rejected": -0.5881053805351257, + "step": 760 + }, + { + "epoch": 0.81, + "learning_rate": 4.7483487490894716e-07, + "logits/chosen": -2.4971468448638916, + "logits/rejected": -2.4607887268066406, + "logps/chosen": -397.69354248046875, + "logps/rejected": -411.0802307128906, + "loss": 0.6239, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.37995341420173645, + "rewards/margins": 0.20637984573841095, + "rewards/rejected": -0.5863332748413086, + "step": 770 + }, + { + "epoch": 0.82, + "learning_rate": 4.7409586051618866e-07, + "logits/chosen": -2.418168306350708, + "logits/rejected": -2.357445478439331, + "logps/chosen": -344.15997314453125, + "logps/rejected": -328.99871826171875, + "loss": 0.6249, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2923930883407593, + "rewards/margins": 0.1856629103422165, + "rewards/rejected": -0.47805601358413696, + "step": 780 + }, + { + "epoch": 0.83, + "learning_rate": 4.733467436932458e-07, + "logits/chosen": -2.507992744445801, + "logits/rejected": -2.4629783630371094, + "logps/chosen": -393.92144775390625, + "logps/rejected": -387.3020935058594, + "loss": 0.6342, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.2580786347389221, + "rewards/margins": 0.2315601110458374, + "rewards/rejected": -0.4896388053894043, + "step": 790 + }, + { + "epoch": 0.84, + "learning_rate": 4.7258755821093583e-07, + "logits/chosen": -2.453043222427368, + "logits/rejected": -2.361077070236206, + "logps/chosen": -436.2608947753906, + "logps/rejected": -373.888427734375, + "loss": 0.607, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.27933627367019653, + "rewards/margins": 0.22823591530323029, + "rewards/rejected": -0.5075721740722656, + "step": 800 + }, + { + "epoch": 0.84, + "eval_logits/chosen": -2.505126953125, + "eval_logits/rejected": -2.4199066162109375, + "eval_logps/chosen": -379.1497497558594, + "eval_logps/rejected": -361.6934509277344, + "eval_loss": 0.6261005401611328, + "eval_rewards/accuracies": 0.6845238208770752, + "eval_rewards/chosen": -0.29789987206459045, + "eval_rewards/margins": 0.24007315933704376, + "eval_rewards/rejected": -0.5379729866981506, + "eval_runtime": 373.2963, + "eval_samples_per_second": 5.358, + "eval_steps_per_second": 0.169, + "step": 800 + }, + { + "epoch": 0.85, + "learning_rate": 4.7181833829398005e-07, + "logits/chosen": -2.4596476554870605, + "logits/rejected": -2.324451446533203, + "logps/chosen": -374.5833435058594, + "logps/rejected": -325.2947692871094, + "loss": 0.6294, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.30218952894210815, + "rewards/margins": 0.1286691129207611, + "rewards/rejected": -0.4308586120605469, + "step": 810 + }, + { + "epoch": 0.86, + "learning_rate": 4.7103911861946033e-07, + "logits/chosen": -2.3883352279663086, + "logits/rejected": -2.3170790672302246, + "logps/chosen": -323.11480712890625, + "logps/rejected": -317.70635986328125, + "loss": 0.6392, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.2386181354522705, + "rewards/margins": 0.19333642721176147, + "rewards/rejected": -0.431954562664032, + "step": 820 + }, + { + "epoch": 0.87, + "learning_rate": 4.70249934315256e-07, + "logits/chosen": -2.4071390628814697, + "logits/rejected": -2.3966832160949707, + "logps/chosen": -331.5174865722656, + "logps/rejected": -332.76898193359375, + "loss": 0.6164, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.27694451808929443, + "rewards/margins": 0.23727154731750488, + "rewards/rejected": -0.5142160654067993, + "step": 830 + }, + { + "epoch": 0.88, + "learning_rate": 4.6945082095846047e-07, + "logits/chosen": -2.4078102111816406, + "logits/rejected": -2.356518030166626, + "logps/chosen": -403.46954345703125, + "logps/rejected": -375.2245178222656, + "loss": 0.6229, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.3784494996070862, + "rewards/margins": 0.12926678359508514, + "rewards/rejected": -0.5077162981033325, + "step": 840 + }, + { + "epoch": 0.89, + "learning_rate": 4.6864181457377695e-07, + "logits/chosen": -2.4845261573791504, + "logits/rejected": -2.4015376567840576, + "logps/chosen": -403.55596923828125, + "logps/rejected": -351.72515869140625, + "loss": 0.622, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3145061135292053, + "rewards/margins": 0.266304612159729, + "rewards/rejected": -0.5808106660842896, + "step": 850 + }, + { + "epoch": 0.9, + "learning_rate": 4.678229516318948e-07, + "logits/chosen": -2.483037233352661, + "logits/rejected": -2.425265073776245, + "logps/chosen": -379.2450866699219, + "logps/rejected": -362.83404541015625, + "loss": 0.6281, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.36860939860343933, + "rewards/margins": 0.25606662034988403, + "rewards/rejected": -0.6246760487556458, + "step": 860 + }, + { + "epoch": 0.91, + "learning_rate": 4.6699426904784545e-07, + "logits/chosen": -2.4010143280029297, + "logits/rejected": -2.367020845413208, + "logps/chosen": -338.23992919921875, + "logps/rejected": -354.3396301269531, + "loss": 0.6117, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.3722101151943207, + "rewards/margins": 0.2783013582229614, + "rewards/rejected": -0.6505114436149597, + "step": 870 + }, + { + "epoch": 0.92, + "learning_rate": 4.6615580417933785e-07, + "logits/chosen": -2.414269208908081, + "logits/rejected": -2.3180108070373535, + "logps/chosen": -386.2663269042969, + "logps/rejected": -361.62493896484375, + "loss": 0.6215, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.43918028473854065, + "rewards/margins": 0.2183331698179245, + "rewards/rejected": -0.6575134992599487, + "step": 880 + }, + { + "epoch": 0.93, + "learning_rate": 4.6530759482507466e-07, + "logits/chosen": -2.4086251258850098, + "logits/rejected": -2.359178066253662, + "logps/chosen": -376.55157470703125, + "logps/rejected": -377.1549377441406, + "loss": 0.6339, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.44110578298568726, + "rewards/margins": 0.16434124112129211, + "rewards/rejected": -0.605446994304657, + "step": 890 + }, + { + "epoch": 0.94, + "learning_rate": 4.6444967922304813e-07, + "logits/chosen": -2.3653807640075684, + "logits/rejected": -2.2835304737091064, + "logps/chosen": -407.1882019042969, + "logps/rejected": -394.25579833984375, + "loss": 0.6322, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.44054508209228516, + "rewards/margins": 0.16737648844718933, + "rewards/rejected": -0.6079215407371521, + "step": 900 + }, + { + "epoch": 0.94, + "eval_logits/chosen": -2.4507651329040527, + "eval_logits/rejected": -2.364361047744751, + "eval_logps/chosen": -397.4640808105469, + "eval_logps/rejected": -382.21417236328125, + "eval_loss": 0.6199224591255188, + "eval_rewards/accuracies": 0.6904761791229248, + "eval_rewards/chosen": -0.48104292154312134, + "eval_rewards/margins": 0.26213717460632324, + "eval_rewards/rejected": -0.7431801557540894, + "eval_runtime": 387.4727, + "eval_samples_per_second": 5.162, + "eval_steps_per_second": 0.163, + "step": 900 + }, + { + "epoch": 0.95, + "learning_rate": 4.6358209604881637e-07, + "logits/chosen": -2.3927271366119385, + "logits/rejected": -2.3169281482696533, + "logps/chosen": -358.5960388183594, + "logps/rejected": -359.0953674316406, + "loss": 0.6075, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5474358797073364, + "rewards/margins": 0.20302316546440125, + "rewards/rejected": -0.7504590153694153, + "step": 910 + }, + { + "epoch": 0.96, + "learning_rate": 4.627048844137598e-07, + "logits/chosen": -2.4270455837249756, + "logits/rejected": -2.3073556423187256, + "logps/chosen": -401.132080078125, + "logps/rejected": -383.88067626953125, + "loss": 0.6136, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.5267983675003052, + "rewards/margins": 0.2972319722175598, + "rewards/rejected": -0.8240302801132202, + "step": 920 + }, + { + "epoch": 0.97, + "learning_rate": 4.6181808386331787e-07, + "logits/chosen": -2.4496045112609863, + "logits/rejected": -2.3281662464141846, + "logps/chosen": -384.0579833984375, + "logps/rejected": -371.2091369628906, + "loss": 0.5891, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.47724461555480957, + "rewards/margins": 0.333379864692688, + "rewards/rejected": -0.8106244802474976, + "step": 930 + }, + { + "epoch": 0.98, + "learning_rate": 4.6092173437520666e-07, + "logits/chosen": -2.423539638519287, + "logits/rejected": -2.3008933067321777, + "logps/chosen": -444.1749572753906, + "logps/rejected": -432.6753845214844, + "loss": 0.6111, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.5619600415229797, + "rewards/margins": 0.33264535665512085, + "rewards/rejected": -0.8946054577827454, + "step": 940 + }, + { + "epoch": 0.99, + "learning_rate": 4.600158763576161e-07, + "logits/chosen": -2.438096523284912, + "logits/rejected": -2.3391449451446533, + "logps/chosen": -401.29083251953125, + "logps/rejected": -386.20361328125, + "loss": 0.6197, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.5298935174942017, + "rewards/margins": 0.28705304861068726, + "rewards/rejected": -0.8169466257095337, + "step": 950 + }, + { + "epoch": 1.0, + "learning_rate": 4.591005506473887e-07, + "logits/chosen": -2.3625149726867676, + "logits/rejected": -2.2783892154693604, + "logps/chosen": -371.94158935546875, + "logps/rejected": -384.6352233886719, + "loss": 0.6026, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.42412814497947693, + "rewards/margins": 0.35006412863731384, + "rewards/rejected": -0.774192214012146, + "step": 960 + }, + { + "epoch": 1.02, + "learning_rate": 4.5817579850817884e-07, + "logits/chosen": -2.3949708938598633, + "logits/rejected": -2.3096823692321777, + "logps/chosen": -418.3802795410156, + "logps/rejected": -408.92279052734375, + "loss": 0.5971, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.4029483199119568, + "rewards/margins": 0.32788950204849243, + "rewards/rejected": -0.730837881565094, + "step": 970 + }, + { + "epoch": 1.03, + "learning_rate": 4.572416616285918e-07, + "logits/chosen": -2.2977919578552246, + "logits/rejected": -2.2567481994628906, + "logps/chosen": -355.2760314941406, + "logps/rejected": -390.1808776855469, + "loss": 0.5833, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.42498597502708435, + "rewards/margins": 0.4129951596260071, + "rewards/rejected": -0.837981104850769, + "step": 980 + }, + { + "epoch": 1.04, + "learning_rate": 4.5629818212030525e-07, + "logits/chosen": -2.3631339073181152, + "logits/rejected": -2.265576124191284, + "logps/chosen": -423.474365234375, + "logps/rejected": -398.92816162109375, + "loss": 0.604, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.4605909287929535, + "rewards/margins": 0.31168457865715027, + "rewards/rejected": -0.7722755670547485, + "step": 990 + }, + { + "epoch": 1.05, + "learning_rate": 4.5534540251617013e-07, + "logits/chosen": -2.3864855766296387, + "logits/rejected": -2.369788408279419, + "logps/chosen": -378.5528564453125, + "logps/rejected": -388.29364013671875, + "loss": 0.605, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.5324582457542419, + "rewards/margins": 0.23893216252326965, + "rewards/rejected": -0.7713904976844788, + "step": 1000 + }, + { + "epoch": 1.05, + "eval_logits/chosen": -2.3963613510131836, + "eval_logits/rejected": -2.3067517280578613, + "eval_logps/chosen": -404.5889892578125, + "eval_logps/rejected": -394.02880859375, + "eval_loss": 0.6115422248840332, + "eval_rewards/accuracies": 0.6884920597076416, + "eval_rewards/chosen": -0.5522919297218323, + "eval_rewards/margins": 0.30903440713882446, + "eval_rewards/rejected": -0.8613263368606567, + "eval_runtime": 345.5833, + "eval_samples_per_second": 5.787, + "eval_steps_per_second": 0.182, + "step": 1000 + }, + { + "epoch": 1.06, + "learning_rate": 4.5438336576829377e-07, + "logits/chosen": -2.3662519454956055, + "logits/rejected": -2.2876665592193604, + "logps/chosen": -418.6935119628906, + "logps/rejected": -390.25067138671875, + "loss": 0.596, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.5613728761672974, + "rewards/margins": 0.25794973969459534, + "rewards/rejected": -0.8193224668502808, + "step": 1010 + }, + { + "epoch": 1.07, + "learning_rate": 4.5341211524610323e-07, + "logits/chosen": -2.353506565093994, + "logits/rejected": -2.3161935806274414, + "logps/chosen": -407.1302795410156, + "logps/rejected": -410.1849670410156, + "loss": 0.5985, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.5053799748420715, + "rewards/margins": 0.3071553409099579, + "rewards/rejected": -0.8125354051589966, + "step": 1020 + }, + { + "epoch": 1.08, + "learning_rate": 4.5243169473439026e-07, + "logits/chosen": -2.2898788452148438, + "logits/rejected": -2.24770188331604, + "logps/chosen": -371.4761962890625, + "logps/rejected": -374.82989501953125, + "loss": 0.5841, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.4240152835845947, + "rewards/margins": 0.37721922993659973, + "rewards/rejected": -0.8012345433235168, + "step": 1030 + }, + { + "epoch": 1.09, + "learning_rate": 4.5144214843133753e-07, + "logits/chosen": -2.280208110809326, + "logits/rejected": -2.2782938480377197, + "logps/chosen": -369.32598876953125, + "logps/rejected": -416.7386779785156, + "loss": 0.6018, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.4348181188106537, + "rewards/margins": 0.3252604603767395, + "rewards/rejected": -0.7600786089897156, + "step": 1040 + }, + { + "epoch": 1.1, + "learning_rate": 4.5044352094652603e-07, + "logits/chosen": -2.3721535205841064, + "logits/rejected": -2.2657477855682373, + "logps/chosen": -398.3066101074219, + "logps/rejected": -372.4281921386719, + "loss": 0.5902, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.4137346148490906, + "rewards/margins": 0.3253127634525299, + "rewards/rejected": -0.7390474081039429, + "step": 1050 + }, + { + "epoch": 1.11, + "learning_rate": 4.494358572989241e-07, + "logits/chosen": -2.3646328449249268, + "logits/rejected": -2.1730899810791016, + "logps/chosen": -439.25579833984375, + "logps/rejected": -406.27655029296875, + "loss": 0.5674, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.4350808262825012, + "rewards/margins": 0.45484787225723267, + "rewards/rejected": -0.8899286389350891, + "step": 1060 + }, + { + "epoch": 1.12, + "learning_rate": 4.484192029148578e-07, + "logits/chosen": -2.313396692276001, + "logits/rejected": -2.204408645629883, + "logps/chosen": -376.7505798339844, + "logps/rejected": -346.17791748046875, + "loss": 0.5977, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.5117573738098145, + "rewards/margins": 0.32838425040245056, + "rewards/rejected": -0.8401415944099426, + "step": 1070 + }, + { + "epoch": 1.13, + "learning_rate": 4.4739360362596336e-07, + "logits/chosen": -2.273745059967041, + "logits/rejected": -2.2262158393859863, + "logps/chosen": -369.76641845703125, + "logps/rejected": -395.5487976074219, + "loss": 0.5952, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.546228289604187, + "rewards/margins": 0.31487131118774414, + "rewards/rejected": -0.8610996007919312, + "step": 1080 + }, + { + "epoch": 1.14, + "learning_rate": 4.4635910566712073e-07, + "logits/chosen": -2.3198351860046387, + "logits/rejected": -2.2201361656188965, + "logps/chosen": -424.00286865234375, + "logps/rejected": -421.1241760253906, + "loss": 0.574, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.595988929271698, + "rewards/margins": 0.38041016459465027, + "rewards/rejected": -0.9763991236686707, + "step": 1090 + }, + { + "epoch": 1.15, + "learning_rate": 4.4531575567436933e-07, + "logits/chosen": -2.3476712703704834, + "logits/rejected": -2.268463134765625, + "logps/chosen": -410.62371826171875, + "logps/rejected": -415.69256591796875, + "loss": 0.601, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.7680894136428833, + "rewards/margins": 0.27382129430770874, + "rewards/rejected": -1.0419107675552368, + "step": 1100 + }, + { + "epoch": 1.15, + "eval_logits/chosen": -2.3601648807525635, + "eval_logits/rejected": -2.2683041095733643, + "eval_logps/chosen": -418.7676696777344, + "eval_logps/rejected": -411.0064697265625, + "eval_loss": 0.6067742705345154, + "eval_rewards/accuracies": 0.6964285969734192, + "eval_rewards/chosen": -0.6940793991088867, + "eval_rewards/margins": 0.33702388405799866, + "eval_rewards/rejected": -1.031103253364563, + "eval_runtime": 356.6096, + "eval_samples_per_second": 5.608, + "eval_steps_per_second": 0.177, + "step": 1100 + }, + { + "epoch": 1.16, + "learning_rate": 4.44263600682806e-07, + "logits/chosen": -2.357461452484131, + "logits/rejected": -2.2750308513641357, + "logps/chosen": -418.9007263183594, + "logps/rejected": -407.72772216796875, + "loss": 0.5931, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.6749259233474731, + "rewards/margins": 0.2714986801147461, + "rewards/rejected": -0.9464246034622192, + "step": 1110 + }, + { + "epoch": 1.17, + "learning_rate": 4.4320268812446404e-07, + "logits/chosen": -2.371415615081787, + "logits/rejected": -2.2759017944335938, + "logps/chosen": -417.7850036621094, + "logps/rejected": -398.28692626953125, + "loss": 0.5898, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.5170518159866333, + "rewards/margins": 0.35224297642707825, + "rewards/rejected": -0.8692947626113892, + "step": 1120 + }, + { + "epoch": 1.18, + "learning_rate": 4.421330658261754e-07, + "logits/chosen": -2.32688570022583, + "logits/rejected": -2.2558743953704834, + "logps/chosen": -387.0340270996094, + "logps/rejected": -385.77984619140625, + "loss": 0.5755, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.3961396813392639, + "rewards/margins": 0.3396463990211487, + "rewards/rejected": -0.7357860803604126, + "step": 1130 + }, + { + "epoch": 1.19, + "learning_rate": 4.410547820074143e-07, + "logits/chosen": -2.3766913414001465, + "logits/rejected": -2.2579758167266846, + "logps/chosen": -411.9817810058594, + "logps/rejected": -376.52716064453125, + "loss": 0.5798, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.43211793899536133, + "rewards/margins": 0.41633152961730957, + "rewards/rejected": -0.8484494090080261, + "step": 1140 + }, + { + "epoch": 1.2, + "learning_rate": 4.399678852781238e-07, + "logits/chosen": -2.342559337615967, + "logits/rejected": -2.266874074935913, + "logps/chosen": -410.984619140625, + "logps/rejected": -401.4251403808594, + "loss": 0.5879, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.5716805458068848, + "rewards/margins": 0.32295480370521545, + "rewards/rejected": -0.8946353197097778, + "step": 1150 + }, + { + "epoch": 1.21, + "learning_rate": 4.3887242463652415e-07, + "logits/chosen": -2.3485589027404785, + "logits/rejected": -2.269087791442871, + "logps/chosen": -400.2742004394531, + "logps/rejected": -413.7886657714844, + "loss": 0.5823, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5351490378379822, + "rewards/margins": 0.39718011021614075, + "rewards/rejected": -0.9323290586471558, + "step": 1160 + }, + { + "epoch": 1.22, + "learning_rate": 4.3776844946690385e-07, + "logits/chosen": -2.3736624717712402, + "logits/rejected": -2.2624993324279785, + "logps/chosen": -424.0856018066406, + "logps/rejected": -380.76812744140625, + "loss": 0.5792, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.48149457573890686, + "rewards/margins": 0.3061677813529968, + "rewards/rejected": -0.7876623868942261, + "step": 1170 + }, + { + "epoch": 1.23, + "learning_rate": 4.3665600953739367e-07, + "logits/chosen": -2.313255548477173, + "logits/rejected": -2.192188024520874, + "logps/chosen": -404.3397216796875, + "logps/rejected": -371.1601257324219, + "loss": 0.5742, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.491854190826416, + "rewards/margins": 0.3715011477470398, + "rewards/rejected": -0.8633554577827454, + "step": 1180 + }, + { + "epoch": 1.25, + "learning_rate": 4.3553515499772285e-07, + "logits/chosen": -2.393124580383301, + "logits/rejected": -2.2997257709503174, + "logps/chosen": -403.5997619628906, + "logps/rejected": -397.8185729980469, + "loss": 0.5659, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5112482309341431, + "rewards/margins": 0.4146638512611389, + "rewards/rejected": -0.925912082195282, + "step": 1190 + }, + { + "epoch": 1.26, + "learning_rate": 4.344059363769583e-07, + "logits/chosen": -2.329709529876709, + "logits/rejected": -2.240239381790161, + "logps/chosen": -423.2294006347656, + "logps/rejected": -421.49468994140625, + "loss": 0.5676, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.5605155229568481, + "rewards/margins": 0.39043912291526794, + "rewards/rejected": -0.9509545564651489, + "step": 1200 + }, + { + "epoch": 1.26, + "eval_logits/chosen": -2.3216235637664795, + "eval_logits/rejected": -2.2290165424346924, + "eval_logps/chosen": -417.0859375, + "eval_logps/rejected": -411.97637939453125, + "eval_loss": 0.6020426154136658, + "eval_rewards/accuracies": 0.7123016119003296, + "eval_rewards/chosen": -0.677262008190155, + "eval_rewards/margins": 0.36354002356529236, + "eval_rewards/rejected": -1.040802001953125, + "eval_runtime": 368.9191, + "eval_samples_per_second": 5.421, + "eval_steps_per_second": 0.171, + "step": 1200 + }, + { + "epoch": 1.27, + "learning_rate": 4.332684045812268e-07, + "logits/chosen": -2.3038039207458496, + "logits/rejected": -2.197749614715576, + "logps/chosen": -371.4241943359375, + "logps/rejected": -387.53070068359375, + "loss": 0.5788, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5937258005142212, + "rewards/margins": 0.3397255539894104, + "rewards/rejected": -0.9334513545036316, + "step": 1210 + }, + { + "epoch": 1.28, + "learning_rate": 4.3212261089142e-07, + "logits/chosen": -2.328768253326416, + "logits/rejected": -2.1700007915496826, + "logps/chosen": -417.7594299316406, + "logps/rejected": -393.429443359375, + "loss": 0.59, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.4774077832698822, + "rewards/margins": 0.405425488948822, + "rewards/rejected": -0.8828333020210266, + "step": 1220 + }, + { + "epoch": 1.29, + "learning_rate": 4.3096860696088267e-07, + "logits/chosen": -2.322392463684082, + "logits/rejected": -2.1980550289154053, + "logps/chosen": -430.95068359375, + "logps/rejected": -409.2894592285156, + "loss": 0.5845, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.521595299243927, + "rewards/margins": 0.3559170365333557, + "rewards/rejected": -0.8775123357772827, + "step": 1230 + }, + { + "epoch": 1.3, + "learning_rate": 4.2980644481308426e-07, + "logits/chosen": -2.23865008354187, + "logits/rejected": -2.2324957847595215, + "logps/chosen": -385.8111877441406, + "logps/rejected": -396.346435546875, + "loss": 0.6033, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.6240901947021484, + "rewards/margins": 0.3070584237575531, + "rewards/rejected": -0.9311486482620239, + "step": 1240 + }, + { + "epoch": 1.31, + "learning_rate": 4.286361768392734e-07, + "logits/chosen": -2.2613332271575928, + "logits/rejected": -2.18135404586792, + "logps/chosen": -415.98651123046875, + "logps/rejected": -406.7262268066406, + "loss": 0.5709, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.6372745633125305, + "rewards/margins": 0.37148743867874146, + "rewards/rejected": -1.008762001991272, + "step": 1250 + }, + { + "epoch": 1.32, + "learning_rate": 4.2745785579611636e-07, + "logits/chosen": -2.216391086578369, + "logits/rejected": -2.1812686920166016, + "logps/chosen": -363.19464111328125, + "logps/rejected": -382.83489990234375, + "loss": 0.5881, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.6171839237213135, + "rewards/margins": 0.3068179488182068, + "rewards/rejected": -0.9240018725395203, + "step": 1260 + }, + { + "epoch": 1.33, + "learning_rate": 4.262715348033184e-07, + "logits/chosen": -2.2606654167175293, + "logits/rejected": -2.183107852935791, + "logps/chosen": -382.5652770996094, + "logps/rejected": -387.09539794921875, + "loss": 0.5615, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.4672677516937256, + "rewards/margins": 0.40395697951316833, + "rewards/rejected": -0.871224582195282, + "step": 1270 + }, + { + "epoch": 1.34, + "learning_rate": 4.2507726734122927e-07, + "logits/chosen": -2.3232672214508057, + "logits/rejected": -2.2005207538604736, + "logps/chosen": -399.9627685546875, + "logps/rejected": -384.1025390625, + "loss": 0.5709, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.4604893624782562, + "rewards/margins": 0.41125577688217163, + "rewards/rejected": -0.871745228767395, + "step": 1280 + }, + { + "epoch": 1.35, + "learning_rate": 4.2387510724843243e-07, + "logits/chosen": -2.278716564178467, + "logits/rejected": -2.1945688724517822, + "logps/chosen": -405.977783203125, + "logps/rejected": -400.54888916015625, + "loss": 0.5861, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.5321739912033081, + "rewards/margins": 0.38776397705078125, + "rewards/rejected": -0.9199379682540894, + "step": 1290 + }, + { + "epoch": 1.36, + "learning_rate": 4.226651087193175e-07, + "logits/chosen": -2.2307355403900146, + "logits/rejected": -2.2070822715759277, + "logps/chosen": -383.90289306640625, + "logps/rejected": -393.69342041015625, + "loss": 0.5909, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6051900386810303, + "rewards/margins": 0.3618764281272888, + "rewards/rejected": -0.9670664668083191, + "step": 1300 + }, + { + "epoch": 1.36, + "eval_logits/chosen": -2.29123854637146, + "eval_logits/rejected": -2.1982269287109375, + "eval_logps/chosen": -412.9469909667969, + "eval_logps/rejected": -408.3128356933594, + "eval_loss": 0.5999146699905396, + "eval_rewards/accuracies": 0.7123016119003296, + "eval_rewards/chosen": -0.6358725428581238, + "eval_rewards/margins": 0.368294358253479, + "eval_rewards/rejected": -1.0041669607162476, + "eval_runtime": 359.7432, + "eval_samples_per_second": 5.56, + "eval_steps_per_second": 0.175, + "step": 1300 + }, + { + "epoch": 1.37, + "learning_rate": 4.214473263016376e-07, + "logits/chosen": -2.2382800579071045, + "logits/rejected": -2.144857883453369, + "logps/chosen": -382.93292236328125, + "logps/rejected": -396.829345703125, + "loss": 0.5854, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.5666841268539429, + "rewards/margins": 0.36565086245536804, + "rewards/rejected": -0.9323350191116333, + "step": 1310 + }, + { + "epoch": 1.38, + "learning_rate": 4.2022181489405005e-07, + "logits/chosen": -2.2324352264404297, + "logits/rejected": -2.1366093158721924, + "logps/chosen": -384.2945251464844, + "logps/rejected": -413.99041748046875, + "loss": 0.5728, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.6076509952545166, + "rewards/margins": 0.41816553473472595, + "rewards/rejected": -1.025816559791565, + "step": 1320 + }, + { + "epoch": 1.39, + "learning_rate": 4.189886297436416e-07, + "logits/chosen": -2.208909511566162, + "logits/rejected": -2.137064218521118, + "logps/chosen": -418.9502868652344, + "logps/rejected": -436.11090087890625, + "loss": 0.5882, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.6793769001960754, + "rewards/margins": 0.40103235840797424, + "rewards/rejected": -1.080409288406372, + "step": 1330 + }, + { + "epoch": 1.4, + "learning_rate": 4.177478264434375e-07, + "logits/chosen": -2.2093963623046875, + "logits/rejected": -2.14264178276062, + "logps/chosen": -392.21478271484375, + "logps/rejected": -403.21063232421875, + "loss": 0.6091, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.6416140794754028, + "rewards/margins": 0.3692830204963684, + "rewards/rejected": -1.0108970403671265, + "step": 1340 + }, + { + "epoch": 1.41, + "learning_rate": 4.164994609298962e-07, + "logits/chosen": -2.1711971759796143, + "logits/rejected": -2.1330151557922363, + "logps/chosen": -351.8862609863281, + "logps/rejected": -370.3506774902344, + "loss": 0.5698, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.5294076204299927, + "rewards/margins": 0.3314458429813385, + "rewards/rejected": -0.860853374004364, + "step": 1350 + }, + { + "epoch": 1.42, + "learning_rate": 4.1524358948038664e-07, + "logits/chosen": -2.250774383544922, + "logits/rejected": -2.1088974475860596, + "logps/chosen": -424.53668212890625, + "logps/rejected": -390.82122802734375, + "loss": 0.5885, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.5655269026756287, + "rewards/margins": 0.29255813360214233, + "rewards/rejected": -0.858085036277771, + "step": 1360 + }, + { + "epoch": 1.43, + "learning_rate": 4.139802687106516e-07, + "logits/chosen": -2.3377394676208496, + "logits/rejected": -2.1961159706115723, + "logps/chosen": -416.22869873046875, + "logps/rejected": -389.2613220214844, + "loss": 0.5645, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.5650314092636108, + "rewards/margins": 0.42392808198928833, + "rewards/rejected": -0.9889594912528992, + "step": 1370 + }, + { + "epoch": 1.44, + "learning_rate": 4.1270955557225596e-07, + "logits/chosen": -2.244158983230591, + "logits/rejected": -2.1470203399658203, + "logps/chosen": -397.0245056152344, + "logps/rejected": -449.65704345703125, + "loss": 0.5523, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.5677224397659302, + "rewards/margins": 0.4837714731693268, + "rewards/rejected": -1.0514938831329346, + "step": 1380 + }, + { + "epoch": 1.45, + "learning_rate": 4.1143150735001835e-07, + "logits/chosen": -2.212290048599243, + "logits/rejected": -2.181854009628296, + "logps/chosen": -401.3909606933594, + "logps/rejected": -396.84222412109375, + "loss": 0.579, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5991008877754211, + "rewards/margins": 0.39938944578170776, + "rewards/rejected": -0.9984903335571289, + "step": 1390 + }, + { + "epoch": 1.47, + "learning_rate": 4.1014618165942936e-07, + "logits/chosen": -2.2670254707336426, + "logits/rejected": -2.1317477226257324, + "logps/chosen": -434.9769592285156, + "logps/rejected": -407.5884704589844, + "loss": 0.5711, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.6577389240264893, + "rewards/margins": 0.49383634328842163, + "rewards/rejected": -1.1515752077102661, + "step": 1400 + }, + { + "epoch": 1.47, + "eval_logits/chosen": -2.2460079193115234, + "eval_logits/rejected": -2.1507139205932617, + "eval_logps/chosen": -420.5697326660156, + "eval_logps/rejected": -419.07220458984375, + "eval_loss": 0.5966773629188538, + "eval_rewards/accuracies": 0.7182539701461792, + "eval_rewards/chosen": -0.7120997905731201, + "eval_rewards/margins": 0.39966049790382385, + "eval_rewards/rejected": -1.1117603778839111, + "eval_runtime": 352.9373, + "eval_samples_per_second": 5.667, + "eval_steps_per_second": 0.179, + "step": 1400 + }, + { + "epoch": 1.48, + "learning_rate": 4.088536364440541e-07, + "logits/chosen": -2.219907283782959, + "logits/rejected": -2.084876537322998, + "logps/chosen": -438.87921142578125, + "logps/rejected": -417.52349853515625, + "loss": 0.5658, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.705518364906311, + "rewards/margins": 0.49317169189453125, + "rewards/rejected": -1.1986901760101318, + "step": 1410 + }, + { + "epoch": 1.49, + "learning_rate": 4.075539299729196e-07, + "logits/chosen": -2.19868803024292, + "logits/rejected": -2.1398653984069824, + "logps/chosen": -422.9442443847656, + "logps/rejected": -429.850830078125, + "loss": 0.5771, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.6790863275527954, + "rewards/margins": 0.3902955949306488, + "rewards/rejected": -1.0693819522857666, + "step": 1420 + }, + { + "epoch": 1.5, + "learning_rate": 4.062471208378886e-07, + "logits/chosen": -2.1475436687469482, + "logits/rejected": -2.0641520023345947, + "logps/chosen": -410.59124755859375, + "logps/rejected": -400.1944885253906, + "loss": 0.5804, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.6431189775466919, + "rewards/margins": 0.3516607880592346, + "rewards/rejected": -0.9947795867919922, + "step": 1430 + }, + { + "epoch": 1.51, + "learning_rate": 4.049332679510178e-07, + "logits/chosen": -2.243961811065674, + "logits/rejected": -2.1046929359436035, + "logps/chosen": -425.0006408691406, + "logps/rejected": -418.7171936035156, + "loss": 0.569, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.53559809923172, + "rewards/margins": 0.44754093885421753, + "rewards/rejected": -0.983138918876648, + "step": 1440 + }, + { + "epoch": 1.52, + "learning_rate": 4.036124305419024e-07, + "logits/chosen": -2.165278911590576, + "logits/rejected": -2.0803096294403076, + "logps/chosen": -406.1650695800781, + "logps/rejected": -402.72027587890625, + "loss": 0.5734, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.6142688989639282, + "rewards/margins": 0.37497222423553467, + "rewards/rejected": -0.9892411231994629, + "step": 1450 + }, + { + "epoch": 1.53, + "learning_rate": 4.0228466815500535e-07, + "logits/chosen": -2.2216262817382812, + "logits/rejected": -2.0967283248901367, + "logps/chosen": -418.2197265625, + "logps/rejected": -391.4878234863281, + "loss": 0.5574, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.5702084302902222, + "rewards/margins": 0.42537397146224976, + "rewards/rejected": -0.9955822825431824, + "step": 1460 + }, + { + "epoch": 1.54, + "learning_rate": 4.009500406469737e-07, + "logits/chosen": -2.242321729660034, + "logits/rejected": -2.18538761138916, + "logps/chosen": -413.0029296875, + "logps/rejected": -424.19427490234375, + "loss": 0.5851, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.619260847568512, + "rewards/margins": 0.3288739323616028, + "rewards/rejected": -0.9481347799301147, + "step": 1470 + }, + { + "epoch": 1.55, + "learning_rate": 3.996086081839399e-07, + "logits/chosen": -2.2441189289093018, + "logits/rejected": -2.1407032012939453, + "logps/chosen": -440.66827392578125, + "logps/rejected": -419.3600158691406, + "loss": 0.5616, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.5705949664115906, + "rewards/margins": 0.4493609368801117, + "rewards/rejected": -1.0199559926986694, + "step": 1480 + }, + { + "epoch": 1.56, + "learning_rate": 3.982604312388096e-07, + "logits/chosen": -2.1618101596832275, + "logits/rejected": -2.077331066131592, + "logps/chosen": -406.7264099121094, + "logps/rejected": -422.53631591796875, + "loss": 0.5731, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.5052849054336548, + "rewards/margins": 0.5251529216766357, + "rewards/rejected": -1.030437707901001, + "step": 1490 + }, + { + "epoch": 1.57, + "learning_rate": 3.969055705885351e-07, + "logits/chosen": -2.152574300765991, + "logits/rejected": -2.0879902839660645, + "logps/chosen": -394.4706726074219, + "logps/rejected": -436.9656677246094, + "loss": 0.5655, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.6157764792442322, + "rewards/margins": 0.4567118287086487, + "rewards/rejected": -1.0724884271621704, + "step": 1500 + }, + { + "epoch": 1.57, + "eval_logits/chosen": -2.2211546897888184, + "eval_logits/rejected": -2.1252570152282715, + "eval_logps/chosen": -412.4960632324219, + "eval_logps/rejected": -410.0142517089844, + "eval_loss": 0.5956543684005737, + "eval_rewards/accuracies": 0.7222222089767456, + "eval_rewards/chosen": -0.6313630938529968, + "eval_rewards/margins": 0.3898184597492218, + "eval_rewards/rejected": -1.0211814641952515, + "eval_runtime": 365.7512, + "eval_samples_per_second": 5.468, + "eval_steps_per_second": 0.172, + "step": 1500 + }, + { + "epoch": 1.58, + "learning_rate": 3.9554408731137604e-07, + "logits/chosen": -2.1627113819122314, + "logits/rejected": -2.1003527641296387, + "logps/chosen": -392.1766662597656, + "logps/rejected": -400.1446228027344, + "loss": 0.5686, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.6373321413993835, + "rewards/margins": 0.4628276824951172, + "rewards/rejected": -1.100159764289856, + "step": 1510 + }, + { + "epoch": 1.59, + "learning_rate": 3.9417604278414556e-07, + "logits/chosen": -2.209413766860962, + "logits/rejected": -2.105988025665283, + "logps/chosen": -438.8935546875, + "logps/rejected": -427.12225341796875, + "loss": 0.5667, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.7425927519798279, + "rewards/margins": 0.4536716043949127, + "rewards/rejected": -1.1962645053863525, + "step": 1520 + }, + { + "epoch": 1.6, + "learning_rate": 3.9280149867944335e-07, + "logits/chosen": -2.132628917694092, + "logits/rejected": -2.042515754699707, + "logps/chosen": -395.2568664550781, + "logps/rejected": -395.38031005859375, + "loss": 0.5581, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.6433058977127075, + "rewards/margins": 0.44076013565063477, + "rewards/rejected": -1.0840660333633423, + "step": 1530 + }, + { + "epoch": 1.61, + "learning_rate": 3.9142051696287583e-07, + "logits/chosen": -2.2434608936309814, + "logits/rejected": -2.1431119441986084, + "logps/chosen": -451.15728759765625, + "logps/rejected": -441.5335388183594, + "loss": 0.5773, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6491286158561707, + "rewards/margins": 0.4809587001800537, + "rewards/rejected": -1.1300873756408691, + "step": 1540 + }, + { + "epoch": 1.62, + "learning_rate": 3.900331598902621e-07, + "logits/chosen": -2.1666946411132812, + "logits/rejected": -2.090304374694824, + "logps/chosen": -425.1583557128906, + "logps/rejected": -407.2892761230469, + "loss": 0.5567, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.641929566860199, + "rewards/margins": 0.35586634278297424, + "rewards/rejected": -0.9977958798408508, + "step": 1550 + }, + { + "epoch": 1.63, + "learning_rate": 3.8863949000482774e-07, + "logits/chosen": -2.1718239784240723, + "logits/rejected": -2.112691879272461, + "logps/chosen": -367.64971923828125, + "logps/rejected": -395.51373291015625, + "loss": 0.5745, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.5719778537750244, + "rewards/margins": 0.37684187293052673, + "rewards/rejected": -0.9488197565078735, + "step": 1560 + }, + { + "epoch": 1.64, + "learning_rate": 3.872395701343854e-07, + "logits/chosen": -2.142659902572632, + "logits/rejected": -2.0468955039978027, + "logps/chosen": -432.7139587402344, + "logps/rejected": -418.50079345703125, + "loss": 0.5754, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.6141091585159302, + "rewards/margins": 0.381761372089386, + "rewards/rejected": -0.9958705902099609, + "step": 1570 + }, + { + "epoch": 1.65, + "learning_rate": 3.8583346338850217e-07, + "logits/chosen": -2.1769824028015137, + "logits/rejected": -2.167893886566162, + "logps/chosen": -383.44659423828125, + "logps/rejected": -440.98748779296875, + "loss": 0.5678, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.5885382890701294, + "rewards/margins": 0.5099713802337646, + "rewards/rejected": -1.098509669303894, + "step": 1580 + }, + { + "epoch": 1.66, + "learning_rate": 3.8442123315565477e-07, + "logits/chosen": -2.0826640129089355, + "logits/rejected": -2.0363707542419434, + "logps/chosen": -391.0645751953125, + "logps/rejected": -404.0660095214844, + "loss": 0.5686, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.6791858673095703, + "rewards/margins": 0.4309251308441162, + "rewards/rejected": -1.1101109981536865, + "step": 1590 + }, + { + "epoch": 1.67, + "learning_rate": 3.830029431003718e-07, + "logits/chosen": -2.1413321495056152, + "logits/rejected": -2.0600619316101074, + "logps/chosen": -392.31353759765625, + "logps/rejected": -388.4493103027344, + "loss": 0.5655, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.61674964427948, + "rewards/margins": 0.5000275373458862, + "rewards/rejected": -1.1167770624160767, + "step": 1600 + }, + { + "epoch": 1.67, + "eval_logits/chosen": -2.1858322620391846, + "eval_logits/rejected": -2.0877087116241455, + "eval_logps/chosen": -414.4089660644531, + "eval_logps/rejected": -414.78515625, + "eval_loss": 0.5924570560455322, + "eval_rewards/accuracies": 0.7242063283920288, + "eval_rewards/chosen": -0.6504923701286316, + "eval_rewards/margins": 0.41839832067489624, + "eval_rewards/rejected": -1.0688906908035278, + "eval_runtime": 373.9388, + "eval_samples_per_second": 5.348, + "eval_steps_per_second": 0.168, + "step": 1600 + }, + { + "epoch": 1.68, + "learning_rate": 3.81578657160364e-07, + "logits/chosen": -2.0504841804504395, + "logits/rejected": -2.0568032264709473, + "logps/chosen": -396.2165222167969, + "logps/rejected": -428.9229431152344, + "loss": 0.5529, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.6419362425804138, + "rewards/margins": 0.47833889722824097, + "rewards/rejected": -1.1202751398086548, + "step": 1610 + }, + { + "epoch": 1.7, + "learning_rate": 3.801484395436412e-07, + "logits/chosen": -2.210151195526123, + "logits/rejected": -2.111720561981201, + "logps/chosen": -431.15216064453125, + "logps/rejected": -404.6205139160156, + "loss": 0.5567, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.7074218392372131, + "rewards/margins": 0.4450675845146179, + "rewards/rejected": -1.152489423751831, + "step": 1620 + }, + { + "epoch": 1.71, + "learning_rate": 3.787123547256185e-07, + "logits/chosen": -2.1102566719055176, + "logits/rejected": -2.0364174842834473, + "logps/chosen": -426.64361572265625, + "logps/rejected": -438.0272521972656, + "loss": 0.547, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.7283273339271545, + "rewards/margins": 0.42402735352516174, + "rewards/rejected": -1.1523545980453491, + "step": 1630 + }, + { + "epoch": 1.72, + "learning_rate": 3.7727046744620953e-07, + "logits/chosen": -2.1615149974823, + "logits/rejected": -2.052468776702881, + "logps/chosen": -399.452880859375, + "logps/rejected": -403.0694580078125, + "loss": 0.5675, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.5816777944564819, + "rewards/margins": 0.4937531352043152, + "rewards/rejected": -1.075430989265442, + "step": 1640 + }, + { + "epoch": 1.73, + "learning_rate": 3.7582284270690747e-07, + "logits/chosen": -2.19050931930542, + "logits/rejected": -2.093792200088501, + "logps/chosen": -443.578857421875, + "logps/rejected": -413.1700134277344, + "loss": 0.5751, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.643337607383728, + "rewards/margins": 0.384897381067276, + "rewards/rejected": -1.0282350778579712, + "step": 1650 + }, + { + "epoch": 1.74, + "learning_rate": 3.7436954576785503e-07, + "logits/chosen": -2.136133909225464, + "logits/rejected": -2.069423198699951, + "logps/chosen": -379.0152587890625, + "logps/rejected": -400.8043518066406, + "loss": 0.5684, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.6269843578338623, + "rewards/margins": 0.5023621916770935, + "rewards/rejected": -1.1293466091156006, + "step": 1660 + }, + { + "epoch": 1.75, + "learning_rate": 3.7291064214490274e-07, + "logits/chosen": -2.18449068069458, + "logits/rejected": -2.0997793674468994, + "logps/chosen": -401.9382019042969, + "logps/rejected": -396.7825012207031, + "loss": 0.5718, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.5713291764259338, + "rewards/margins": 0.4387635290622711, + "rewards/rejected": -1.0100927352905273, + "step": 1670 + }, + { + "epoch": 1.76, + "learning_rate": 3.714461976066549e-07, + "logits/chosen": -2.199491500854492, + "logits/rejected": -2.050266742706299, + "logps/chosen": -434.72210693359375, + "logps/rejected": -408.1536560058594, + "loss": 0.5647, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.5964989066123962, + "rewards/margins": 0.4263014793395996, + "rewards/rejected": -1.0228004455566406, + "step": 1680 + }, + { + "epoch": 1.77, + "learning_rate": 3.699762781715051e-07, + "logits/chosen": -2.078326463699341, + "logits/rejected": -2.058079957962036, + "logps/chosen": -373.3815002441406, + "logps/rejected": -401.06793212890625, + "loss": 0.5689, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5760546922683716, + "rewards/margins": 0.4842708110809326, + "rewards/rejected": -1.0603255033493042, + "step": 1690 + }, + { + "epoch": 1.78, + "learning_rate": 3.6850095010465976e-07, + "logits/chosen": -2.146766185760498, + "logits/rejected": -2.088407516479492, + "logps/chosen": -408.16387939453125, + "logps/rejected": -418.07513427734375, + "loss": 0.5364, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.5772097706794739, + "rewards/margins": 0.573888897895813, + "rewards/rejected": -1.1510984897613525, + "step": 1700 + }, + { + "epoch": 1.78, + "eval_logits/chosen": -2.149921417236328, + "eval_logits/rejected": -2.050013780593872, + "eval_logps/chosen": -425.4825134277344, + "eval_logps/rejected": -428.4342041015625, + "eval_loss": 0.5872865915298462, + "eval_rewards/accuracies": 0.726190447807312, + "eval_rewards/chosen": -0.7612276077270508, + "eval_rewards/margins": 0.44415298104286194, + "eval_rewards/rejected": -1.2053806781768799, + "eval_runtime": 358.9295, + "eval_samples_per_second": 5.572, + "eval_steps_per_second": 0.176, + "step": 1700 + }, + { + "epoch": 1.79, + "learning_rate": 3.670202799151511e-07, + "logits/chosen": -2.130225658416748, + "logits/rejected": -2.0438361167907715, + "logps/chosen": -441.6221618652344, + "logps/rejected": -462.5069274902344, + "loss": 0.5669, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.7283957004547119, + "rewards/margins": 0.4606415331363678, + "rewards/rejected": -1.1890372037887573, + "step": 1710 + }, + { + "epoch": 1.8, + "learning_rate": 3.6553433435283863e-07, + "logits/chosen": -2.1264588832855225, + "logits/rejected": -2.0698792934417725, + "logps/chosen": -391.79986572265625, + "logps/rejected": -420.40899658203125, + "loss": 0.5828, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.7596856355667114, + "rewards/margins": 0.4628881812095642, + "rewards/rejected": -1.22257399559021, + "step": 1720 + }, + { + "epoch": 1.81, + "learning_rate": 3.640431804054002e-07, + "logits/chosen": -2.1733834743499756, + "logits/rejected": -2.100900411605835, + "logps/chosen": -413.070068359375, + "logps/rejected": -454.635009765625, + "loss": 0.5688, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.6341695189476013, + "rewards/margins": 0.45797285437583923, + "rewards/rejected": -1.0921423435211182, + "step": 1730 + }, + { + "epoch": 1.82, + "learning_rate": 3.6254688529531195e-07, + "logits/chosen": -2.187265396118164, + "logits/rejected": -2.078583240509033, + "logps/chosen": -394.0973815917969, + "logps/rejected": -415.5884704589844, + "loss": 0.5537, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.6361076831817627, + "rewards/margins": 0.5312715768814087, + "rewards/rejected": -1.1673791408538818, + "step": 1740 + }, + { + "epoch": 1.83, + "learning_rate": 3.610455164768181e-07, + "logits/chosen": -2.1528546810150146, + "logits/rejected": -2.012460947036743, + "logps/chosen": -445.7840270996094, + "logps/rejected": -415.9200744628906, + "loss": 0.5585, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7289555668830872, + "rewards/margins": 0.4902923107147217, + "rewards/rejected": -1.219247817993164, + "step": 1750 + }, + { + "epoch": 1.84, + "learning_rate": 3.595391416328897e-07, + "logits/chosen": -2.0355443954467773, + "logits/rejected": -1.9784704446792603, + "logps/chosen": -363.8067626953125, + "logps/rejected": -387.1481628417969, + "loss": 0.5571, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.7023047208786011, + "rewards/margins": 0.4179447293281555, + "rewards/rejected": -1.1202495098114014, + "step": 1760 + }, + { + "epoch": 1.85, + "learning_rate": 3.580278286721738e-07, + "logits/chosen": -2.1093432903289795, + "logits/rejected": -2.0327229499816895, + "logps/chosen": -422.77447509765625, + "logps/rejected": -428.30035400390625, + "loss": 0.5745, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.6180187463760376, + "rewards/margins": 0.4408366084098816, + "rewards/rejected": -1.0588552951812744, + "step": 1770 + }, + { + "epoch": 1.86, + "learning_rate": 3.56511645725932e-07, + "logits/chosen": -2.1519556045532227, + "logits/rejected": -2.0771679878234863, + "logps/chosen": -400.14923095703125, + "logps/rejected": -423.17919921875, + "loss": 0.5574, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.5555980205535889, + "rewards/margins": 0.5850681066513062, + "rewards/rejected": -1.1406662464141846, + "step": 1780 + }, + { + "epoch": 1.87, + "learning_rate": 3.549906611449688e-07, + "logits/chosen": -2.179636001586914, + "logits/rejected": -2.0839486122131348, + "logps/chosen": -403.0487365722656, + "logps/rejected": -393.1607971191406, + "loss": 0.5519, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.5947157144546509, + "rewards/margins": 0.4571925699710846, + "rewards/rejected": -1.051908254623413, + "step": 1790 + }, + { + "epoch": 1.88, + "learning_rate": 3.534649434965505e-07, + "logits/chosen": -2.122799873352051, + "logits/rejected": -2.0291850566864014, + "logps/chosen": -423.5863342285156, + "logps/rejected": -406.68280029296875, + "loss": 0.5702, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.7036025524139404, + "rewards/margins": 0.47179120779037476, + "rewards/rejected": -1.1753937005996704, + "step": 1800 + }, + { + "epoch": 1.88, + "eval_logits/chosen": -2.1546154022216797, + "eval_logits/rejected": -2.053884267807007, + "eval_logps/chosen": -424.3879089355469, + "eval_logps/rejected": -429.0813903808594, + "eval_loss": 0.5842701196670532, + "eval_rewards/accuracies": 0.7361111044883728, + "eval_rewards/chosen": -0.7502815127372742, + "eval_rewards/margins": 0.4615708589553833, + "eval_rewards/rejected": -1.2118524312973022, + "eval_runtime": 384.6527, + "eval_samples_per_second": 5.199, + "eval_steps_per_second": 0.164, + "step": 1800 + }, + { + "epoch": 1.89, + "learning_rate": 3.5193456156131394e-07, + "logits/chosen": -2.099229097366333, + "logits/rejected": -2.0521140098571777, + "logps/chosen": -401.4447021484375, + "logps/rejected": -410.57550048828125, + "loss": 0.5632, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.7107268571853638, + "rewards/margins": 0.4665645658969879, + "rewards/rejected": -1.1772915124893188, + "step": 1810 + }, + { + "epoch": 1.9, + "learning_rate": 3.503995843301662e-07, + "logits/chosen": -2.123899459838867, + "logits/rejected": -1.977447509765625, + "logps/chosen": -439.928466796875, + "logps/rejected": -431.59295654296875, + "loss": 0.5532, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.775931715965271, + "rewards/margins": 0.5444117188453674, + "rewards/rejected": -1.3203436136245728, + "step": 1820 + }, + { + "epoch": 1.92, + "learning_rate": 3.488600810011739e-07, + "logits/chosen": -2.1486592292785645, + "logits/rejected": -2.0278477668762207, + "logps/chosen": -425.73101806640625, + "logps/rejected": -444.21044921875, + "loss": 0.563, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.6286954879760742, + "rewards/margins": 0.5148676633834839, + "rewards/rejected": -1.143563151359558, + "step": 1830 + }, + { + "epoch": 1.93, + "learning_rate": 3.4731612097644425e-07, + "logits/chosen": -2.12416410446167, + "logits/rejected": -1.9885361194610596, + "logps/chosen": -420.1853942871094, + "logps/rejected": -403.20587158203125, + "loss": 0.5727, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.6813632845878601, + "rewards/margins": 0.4498722553253174, + "rewards/rejected": -1.1312355995178223, + "step": 1840 + }, + { + "epoch": 1.94, + "learning_rate": 3.4576777385899567e-07, + "logits/chosen": -2.059755802154541, + "logits/rejected": -2.0208868980407715, + "logps/chosen": -406.7179260253906, + "logps/rejected": -423.005126953125, + "loss": 0.548, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.5665189623832703, + "rewards/margins": 0.44578060507774353, + "rewards/rejected": -1.0122995376586914, + "step": 1850 + }, + { + "epoch": 1.95, + "learning_rate": 3.4421510944962075e-07, + "logits/chosen": -2.1009135246276855, + "logits/rejected": -2.0589375495910645, + "logps/chosen": -414.166015625, + "logps/rejected": -463.72100830078125, + "loss": 0.5715, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.7198529243469238, + "rewards/margins": 0.3939817547798157, + "rewards/rejected": -1.1138347387313843, + "step": 1860 + }, + { + "epoch": 1.96, + "learning_rate": 3.4265819774373923e-07, + "logits/chosen": -2.1253786087036133, + "logits/rejected": -2.0618138313293457, + "logps/chosen": -407.43878173828125, + "logps/rejected": -423.29974365234375, + "loss": 0.5431, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.6462190747261047, + "rewards/margins": 0.46022695302963257, + "rewards/rejected": -1.1064460277557373, + "step": 1870 + }, + { + "epoch": 1.97, + "learning_rate": 3.410971089282423e-07, + "logits/chosen": -2.095548391342163, + "logits/rejected": -2.008098602294922, + "logps/chosen": -416.75262451171875, + "logps/rejected": -424.6083984375, + "loss": 0.5702, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.7109544277191162, + "rewards/margins": 0.5041101574897766, + "rewards/rejected": -1.215064287185669, + "step": 1880 + }, + { + "epoch": 1.98, + "learning_rate": 3.395319133783289e-07, + "logits/chosen": -2.039357900619507, + "logits/rejected": -1.9043476581573486, + "logps/chosen": -384.81793212890625, + "logps/rejected": -385.3327331542969, + "loss": 0.5724, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.5902493000030518, + "rewards/margins": 0.43905600905418396, + "rewards/rejected": -1.0293052196502686, + "step": 1890 + }, + { + "epoch": 1.99, + "learning_rate": 3.3796268165433314e-07, + "logits/chosen": -2.0562853813171387, + "logits/rejected": -2.0060746669769287, + "logps/chosen": -386.97503662109375, + "logps/rejected": -424.0105895996094, + "loss": 0.5505, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.6826264262199402, + "rewards/margins": 0.4815604090690613, + "rewards/rejected": -1.1641868352890015, + "step": 1900 + }, + { + "epoch": 1.99, + "eval_logits/chosen": -2.133981227874756, + "eval_logits/rejected": -2.032824993133545, + "eval_logps/chosen": -413.9261169433594, + "eval_logps/rejected": -417.8120422363281, + "eval_loss": 0.5851796865463257, + "eval_rewards/accuracies": 0.7321428656578064, + "eval_rewards/chosen": -0.6456640958786011, + "eval_rewards/margins": 0.4534952640533447, + "eval_rewards/rejected": -1.0991593599319458, + "eval_runtime": 341.8323, + "eval_samples_per_second": 5.851, + "eval_steps_per_second": 0.184, + "step": 1900 + }, + { + "epoch": 2.0, + "learning_rate": 3.363894844985432e-07, + "logits/chosen": -2.1606099605560303, + "logits/rejected": -2.1270124912261963, + "logps/chosen": -408.40240478515625, + "logps/rejected": -433.38653564453125, + "loss": 0.5841, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.6379219889640808, + "rewards/margins": 0.37952426075935364, + "rewards/rejected": -1.0174461603164673, + "step": 1910 + }, + { + "epoch": 2.01, + "learning_rate": 3.3481239283201205e-07, + "logits/chosen": -2.0863089561462402, + "logits/rejected": -1.9511423110961914, + "logps/chosen": -434.5306091308594, + "logps/rejected": -442.3555603027344, + "loss": 0.5229, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7100010514259338, + "rewards/margins": 0.5080522298812866, + "rewards/rejected": -1.2180532217025757, + "step": 1920 + }, + { + "epoch": 2.02, + "learning_rate": 3.332314777513608e-07, + "logits/chosen": -2.0927722454071045, + "logits/rejected": -1.9584945440292358, + "logps/chosen": -425.11480712890625, + "logps/rejected": -425.4661560058594, + "loss": 0.5512, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.7203429937362671, + "rewards/margins": 0.5184942483901978, + "rewards/rejected": -1.2388372421264648, + "step": 1930 + }, + { + "epoch": 2.03, + "learning_rate": 3.3164681052557315e-07, + "logits/chosen": -2.045835494995117, + "logits/rejected": -1.8995403051376343, + "logps/chosen": -425.0332946777344, + "logps/rejected": -431.9063415527344, + "loss": 0.5476, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.7351408004760742, + "rewards/margins": 0.5671769976615906, + "rewards/rejected": -1.30231773853302, + "step": 1940 + }, + { + "epoch": 2.04, + "learning_rate": 3.3005846259278257e-07, + "logits/chosen": -1.9578487873077393, + "logits/rejected": -1.9154150485992432, + "logps/chosen": -363.75, + "logps/rejected": -405.8369445800781, + "loss": 0.5311, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.7119430899620056, + "rewards/margins": 0.5486541986465454, + "rewards/rejected": -1.2605974674224854, + "step": 1950 + }, + { + "epoch": 2.05, + "learning_rate": 3.2846650555705207e-07, + "logits/chosen": -2.09869384765625, + "logits/rejected": -2.004453420639038, + "logps/chosen": -420.51593017578125, + "logps/rejected": -439.56475830078125, + "loss": 0.5468, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.770484447479248, + "rewards/margins": 0.5066950917243958, + "rewards/rejected": -1.2771797180175781, + "step": 1960 + }, + { + "epoch": 2.06, + "learning_rate": 3.268710111851459e-07, + "logits/chosen": -2.1401185989379883, + "logits/rejected": -2.0326156616210938, + "logps/chosen": -423.18609619140625, + "logps/rejected": -450.2783203125, + "loss": 0.5479, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.8261698484420776, + "rewards/margins": 0.5264754295349121, + "rewards/rejected": -1.3526453971862793, + "step": 1970 + }, + { + "epoch": 2.07, + "learning_rate": 3.252720514032946e-07, + "logits/chosen": -2.0674030780792236, + "logits/rejected": -1.956199049949646, + "logps/chosen": -419.334716796875, + "logps/rejected": -439.58056640625, + "loss": 0.5457, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.7481150031089783, + "rewards/margins": 0.5188611745834351, + "rewards/rejected": -1.2669761180877686, + "step": 1980 + }, + { + "epoch": 2.08, + "learning_rate": 3.236696982939521e-07, + "logits/chosen": -2.081023693084717, + "logits/rejected": -2.012528419494629, + "logps/chosen": -398.89764404296875, + "logps/rejected": -420.46185302734375, + "loss": 0.5421, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.7415474057197571, + "rewards/margins": 0.4471089243888855, + "rewards/rejected": -1.188656210899353, + "step": 1990 + }, + { + "epoch": 2.09, + "learning_rate": 3.2206402409254655e-07, + "logits/chosen": -1.9845138788223267, + "logits/rejected": -1.8962827920913696, + "logps/chosen": -381.1123352050781, + "logps/rejected": -405.03948974609375, + "loss": 0.5389, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.640026330947876, + "rewards/margins": 0.5347259044647217, + "rewards/rejected": -1.1747523546218872, + "step": 2000 + }, + { + "epoch": 2.09, + "eval_logits/chosen": -2.080613374710083, + "eval_logits/rejected": -1.976927399635315, + "eval_logps/chosen": -422.3401794433594, + "eval_logps/rejected": -427.39385986328125, + "eval_loss": 0.5828012228012085, + "eval_rewards/accuracies": 0.726190447807312, + "eval_rewards/chosen": -0.7298040390014648, + "eval_rewards/margins": 0.4651729166507721, + "eval_rewards/rejected": -1.1949769258499146, + "eval_runtime": 350.2329, + "eval_samples_per_second": 5.71, + "eval_steps_per_second": 0.18, + "step": 2000 + }, + { + "epoch": 2.1, + "learning_rate": 3.204551011842237e-07, + "logits/chosen": -2.084751605987549, + "logits/rejected": -1.9851831197738647, + "logps/chosen": -423.9139099121094, + "logps/rejected": -449.2900390625, + "loss": 0.5353, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.6902648210525513, + "rewards/margins": 0.5586960911750793, + "rewards/rejected": -1.2489607334136963, + "step": 2010 + }, + { + "epoch": 2.11, + "learning_rate": 3.188430021005837e-07, + "logits/chosen": -2.000121831893921, + "logits/rejected": -1.9606053829193115, + "logps/chosen": -390.7865905761719, + "logps/rejected": -434.36004638671875, + "loss": 0.5281, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.7369731068611145, + "rewards/margins": 0.5088338255882263, + "rewards/rejected": -1.2458069324493408, + "step": 2020 + }, + { + "epoch": 2.12, + "learning_rate": 3.172277995164112e-07, + "logits/chosen": -2.0466647148132324, + "logits/rejected": -1.9290826320648193, + "logps/chosen": -436.10076904296875, + "logps/rejected": -415.53369140625, + "loss": 0.5291, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.711915135383606, + "rewards/margins": 0.4383808970451355, + "rewards/rejected": -1.1502960920333862, + "step": 2030 + }, + { + "epoch": 2.14, + "learning_rate": 3.156095662463998e-07, + "logits/chosen": -1.9822591543197632, + "logits/rejected": -1.9326177835464478, + "logps/chosen": -386.89080810546875, + "logps/rejected": -449.0328674316406, + "loss": 0.5138, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.8381987810134888, + "rewards/margins": 0.5326210260391235, + "rewards/rejected": -1.3708198070526123, + "step": 2040 + }, + { + "epoch": 2.15, + "learning_rate": 3.139883752418682e-07, + "logits/chosen": -1.9811346530914307, + "logits/rejected": -1.902254343032837, + "logps/chosen": -441.2205505371094, + "logps/rejected": -464.0350036621094, + "loss": 0.5244, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.8369643092155457, + "rewards/margins": 0.5579534769058228, + "rewards/rejected": -1.3949177265167236, + "step": 2050 + }, + { + "epoch": 2.16, + "learning_rate": 3.1236429958747294e-07, + "logits/chosen": -1.9652820825576782, + "logits/rejected": -1.8512372970581055, + "logps/chosen": -416.72607421875, + "logps/rejected": -415.53912353515625, + "loss": 0.5416, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.9153169393539429, + "rewards/margins": 0.4332866668701172, + "rewards/rejected": -1.34860360622406, + "step": 2060 + }, + { + "epoch": 2.17, + "learning_rate": 3.107374124979127e-07, + "logits/chosen": -2.0142264366149902, + "logits/rejected": -1.8807601928710938, + "logps/chosen": -388.02777099609375, + "logps/rejected": -400.16973876953125, + "loss": 0.5307, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.7066773176193237, + "rewards/margins": 0.537675678730011, + "rewards/rejected": -1.2443530559539795, + "step": 2070 + }, + { + "epoch": 2.18, + "learning_rate": 3.0910778731462807e-07, + "logits/chosen": -2.0407018661499023, + "logits/rejected": -1.960519552230835, + "logps/chosen": -405.73577880859375, + "logps/rejected": -430.8636169433594, + "loss": 0.5371, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.6238777041435242, + "rewards/margins": 0.5529268980026245, + "rewards/rejected": -1.176804542541504, + "step": 2080 + }, + { + "epoch": 2.19, + "learning_rate": 3.0747549750249517e-07, + "logits/chosen": -2.1523895263671875, + "logits/rejected": -1.9908783435821533, + "logps/chosen": -470.22454833984375, + "logps/rejected": -458.03350830078125, + "loss": 0.5252, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.7559888958930969, + "rewards/margins": 0.6371704339981079, + "rewards/rejected": -1.3931593894958496, + "step": 2090 + }, + { + "epoch": 2.2, + "learning_rate": 3.058406166465139e-07, + "logits/chosen": -2.0282669067382812, + "logits/rejected": -1.9857925176620483, + "logps/chosen": -439.1322326660156, + "logps/rejected": -473.20806884765625, + "loss": 0.531, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.8369047045707703, + "rewards/margins": 0.5290185213088989, + "rewards/rejected": -1.3659231662750244, + "step": 2100 + }, + { + "epoch": 2.2, + "eval_logits/chosen": -2.056457281112671, + "eval_logits/rejected": -1.9511338472366333, + "eval_logps/chosen": -437.7683410644531, + "eval_logps/rejected": -446.1321716308594, + "eval_loss": 0.5804704427719116, + "eval_rewards/accuracies": 0.7341269850730896, + "eval_rewards/chosen": -0.8840858936309814, + "eval_rewards/margins": 0.4982740879058838, + "eval_rewards/rejected": -1.3823601007461548, + "eval_runtime": 346.9914, + "eval_samples_per_second": 5.764, + "eval_steps_per_second": 0.182, + "step": 2100 + }, + { + "epoch": 2.21, + "learning_rate": 3.0420321844849056e-07, + "logits/chosen": -2.0606753826141357, + "logits/rejected": -1.9562078714370728, + "logps/chosen": -453.8863830566406, + "logps/rejected": -467.1893615722656, + "loss": 0.5282, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.8377038836479187, + "rewards/margins": 0.6110423803329468, + "rewards/rejected": -1.4487463235855103, + "step": 2110 + }, + { + "epoch": 2.22, + "learning_rate": 3.0256337672371543e-07, + "logits/chosen": -2.0410220623016357, + "logits/rejected": -2.0064282417297363, + "logps/chosen": -400.1402282714844, + "logps/rejected": -408.17462158203125, + "loss": 0.5285, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.7289966344833374, + "rewards/margins": 0.5061737298965454, + "rewards/rejected": -1.2351701259613037, + "step": 2120 + }, + { + "epoch": 2.23, + "learning_rate": 3.0092116539763487e-07, + "logits/chosen": -2.0146710872650146, + "logits/rejected": -1.920397162437439, + "logps/chosen": -437.10235595703125, + "logps/rejected": -466.9913635253906, + "loss": 0.5066, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.8071281313896179, + "rewards/margins": 0.6220065355300903, + "rewards/rejected": -1.429134726524353, + "step": 2130 + }, + { + "epoch": 2.24, + "learning_rate": 2.99276658502519e-07, + "logits/chosen": -1.964906096458435, + "logits/rejected": -1.9002695083618164, + "logps/chosen": -375.3905944824219, + "logps/rejected": -396.67144775390625, + "loss": 0.5269, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -0.7183672189712524, + "rewards/margins": 0.5746656656265259, + "rewards/rejected": -1.2930328845977783, + "step": 2140 + }, + { + "epoch": 2.25, + "learning_rate": 2.9762993017412404e-07, + "logits/chosen": -2.0477283000946045, + "logits/rejected": -1.9289798736572266, + "logps/chosen": -438.35833740234375, + "logps/rejected": -434.1504821777344, + "loss": 0.5129, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.7393466234207153, + "rewards/margins": 0.5844893455505371, + "rewards/rejected": -1.323835849761963, + "step": 2150 + }, + { + "epoch": 2.26, + "learning_rate": 2.959810546483505e-07, + "logits/chosen": -2.0242910385131836, + "logits/rejected": -1.91313898563385, + "logps/chosen": -394.24859619140625, + "logps/rejected": -425.3580017089844, + "loss": 0.5063, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -0.703346312046051, + "rewards/margins": 0.6648355722427368, + "rewards/rejected": -1.3681819438934326, + "step": 2160 + }, + { + "epoch": 2.27, + "learning_rate": 2.94330106257896e-07, + "logits/chosen": -2.034824848175049, + "logits/rejected": -1.9532169103622437, + "logps/chosen": -418.4927673339844, + "logps/rejected": -450.23297119140625, + "loss": 0.4943, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.9016658663749695, + "rewards/margins": 0.5852879285812378, + "rewards/rejected": -1.486953854560852, + "step": 2170 + }, + { + "epoch": 2.28, + "learning_rate": 2.92677159428905e-07, + "logits/chosen": -2.0668792724609375, + "logits/rejected": -1.9423834085464478, + "logps/chosen": -431.8003845214844, + "logps/rejected": -459.1951599121094, + "loss": 0.5502, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.950252890586853, + "rewards/margins": 0.5695884227752686, + "rewards/rejected": -1.5198414325714111, + "step": 2180 + }, + { + "epoch": 2.29, + "learning_rate": 2.9102228867761297e-07, + "logits/chosen": -1.9795408248901367, + "logits/rejected": -1.8501724004745483, + "logps/chosen": -449.26910400390625, + "logps/rejected": -450.2264099121094, + "loss": 0.5187, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.9034382700920105, + "rewards/margins": 0.6167550086975098, + "rewards/rejected": -1.5201932191848755, + "step": 2190 + }, + { + "epoch": 2.3, + "learning_rate": 2.8936556860698764e-07, + "logits/chosen": -1.9710218906402588, + "logits/rejected": -1.8519203662872314, + "logps/chosen": -418.4292907714844, + "logps/rejected": -471.595703125, + "loss": 0.5162, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -0.7069419622421265, + "rewards/margins": 0.6886085867881775, + "rewards/rejected": -1.3955506086349487, + "step": 2200 + }, + { + "epoch": 2.3, + "eval_logits/chosen": -2.018048048019409, + "eval_logits/rejected": -1.9111573696136475, + "eval_logps/chosen": -435.0021667480469, + "eval_logps/rejected": -443.46435546875, + "eval_loss": 0.5830379724502563, + "eval_rewards/accuracies": 0.7341269850730896, + "eval_rewards/chosen": -0.8564241528511047, + "eval_rewards/margins": 0.4992583692073822, + "eval_rewards/rejected": -1.355682611465454, + "eval_runtime": 366.1679, + "eval_samples_per_second": 5.462, + "eval_steps_per_second": 0.172, + "step": 2200 + }, + { + "epoch": 2.31, + "learning_rate": 2.8770707390336545e-07, + "logits/chosen": -2.042503833770752, + "logits/rejected": -1.9252078533172607, + "logps/chosen": -412.6761169433594, + "logps/rejected": -434.27337646484375, + "loss": 0.5181, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.7401344180107117, + "rewards/margins": 0.6476941108703613, + "rewards/rejected": -1.3878285884857178, + "step": 2210 + }, + { + "epoch": 2.32, + "learning_rate": 2.860468793330849e-07, + "logits/chosen": -1.983633041381836, + "logits/rejected": -1.8197529315948486, + "logps/chosen": -450.23626708984375, + "logps/rejected": -447.0399475097656, + "loss": 0.4956, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.8933134078979492, + "rewards/margins": 0.6681145429611206, + "rewards/rejected": -1.5614279508590698, + "step": 2220 + }, + { + "epoch": 2.33, + "learning_rate": 2.843850597391159e-07, + "logits/chosen": -2.0030102729797363, + "logits/rejected": -1.901752233505249, + "logps/chosen": -439.53912353515625, + "logps/rejected": -469.72259521484375, + "loss": 0.5302, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -0.8122542500495911, + "rewards/margins": 0.6632959842681885, + "rewards/rejected": -1.4755501747131348, + "step": 2230 + }, + { + "epoch": 2.34, + "learning_rate": 2.827216900376857e-07, + "logits/chosen": -1.844321846961975, + "logits/rejected": -1.7138290405273438, + "logps/chosen": -439.25732421875, + "logps/rejected": -470.17889404296875, + "loss": 0.4927, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -0.8151344060897827, + "rewards/margins": 0.7723340392112732, + "rewards/rejected": -1.5874683856964111, + "step": 2240 + }, + { + "epoch": 2.35, + "learning_rate": 2.810568452149019e-07, + "logits/chosen": -2.096543312072754, + "logits/rejected": -1.9603042602539062, + "logps/chosen": -471.48907470703125, + "logps/rejected": -477.1119689941406, + "loss": 0.5464, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.8581037521362305, + "rewards/margins": 0.599617600440979, + "rewards/rejected": -1.457721471786499, + "step": 2250 + }, + { + "epoch": 2.37, + "learning_rate": 2.793906003233714e-07, + "logits/chosen": -2.0717244148254395, + "logits/rejected": -1.9762405157089233, + "logps/chosen": -421.10687255859375, + "logps/rejected": -447.38873291015625, + "loss": 0.5298, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.7920399904251099, + "rewards/margins": 0.5180048942565918, + "rewards/rejected": -1.3100448846817017, + "step": 2260 + }, + { + "epoch": 2.38, + "learning_rate": 2.77723030478818e-07, + "logits/chosen": -1.9469770193099976, + "logits/rejected": -1.9061082601547241, + "logps/chosen": -380.3778076171875, + "logps/rejected": -470.05694580078125, + "loss": 0.5139, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.7958939075469971, + "rewards/margins": 0.6314458250999451, + "rewards/rejected": -1.4273395538330078, + "step": 2270 + }, + { + "epoch": 2.39, + "learning_rate": 2.760542108566949e-07, + "logits/chosen": -2.0161285400390625, + "logits/rejected": -1.881219506263733, + "logps/chosen": -457.6134338378906, + "logps/rejected": -443.34747314453125, + "loss": 0.54, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.7452512979507446, + "rewards/margins": 0.5063202977180481, + "rewards/rejected": -1.2515714168548584, + "step": 2280 + }, + { + "epoch": 2.4, + "learning_rate": 2.7438421668879676e-07, + "logits/chosen": -1.9628798961639404, + "logits/rejected": -1.935486078262329, + "logps/chosen": -380.9690246582031, + "logps/rejected": -417.7799377441406, + "loss": 0.5236, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.7742933630943298, + "rewards/margins": 0.5208727121353149, + "rewards/rejected": -1.295166254043579, + "step": 2290 + }, + { + "epoch": 2.41, + "learning_rate": 2.7271312325986734e-07, + "logits/chosen": -1.9569809436798096, + "logits/rejected": -1.8591148853302002, + "logps/chosen": -414.0211486816406, + "logps/rejected": -455.06341552734375, + "loss": 0.5297, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.8452693223953247, + "rewards/margins": 0.6661224961280823, + "rewards/rejected": -1.5113918781280518, + "step": 2300 + }, + { + "epoch": 2.41, + "eval_logits/chosen": -1.9911383390426636, + "eval_logits/rejected": -1.8837895393371582, + "eval_logps/chosen": -448.7518615722656, + "eval_logps/rejected": -459.412353515625, + "eval_loss": 0.5794528722763062, + "eval_rewards/accuracies": 0.7182539701461792, + "eval_rewards/chosen": -0.9939210414886475, + "eval_rewards/margins": 0.521240770816803, + "eval_rewards/rejected": -1.5151617527008057, + "eval_runtime": 387.0195, + "eval_samples_per_second": 5.168, + "eval_steps_per_second": 0.163, + "step": 2300 + }, + { + "epoch": 2.42, + "learning_rate": 2.710410059042066e-07, + "logits/chosen": -1.9845149517059326, + "logits/rejected": -1.8956083059310913, + "logps/chosen": -429.16522216796875, + "logps/rejected": -471.6270446777344, + "loss": 0.5208, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.981905460357666, + "rewards/margins": 0.7246658802032471, + "rewards/rejected": -1.706571340560913, + "step": 2310 + }, + { + "epoch": 2.43, + "learning_rate": 2.693679400022733e-07, + "logits/chosen": -1.9099270105361938, + "logits/rejected": -1.8261696100234985, + "logps/chosen": -418.8258361816406, + "logps/rejected": -440.328369140625, + "loss": 0.5342, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.9699192047119141, + "rewards/margins": 0.4898689389228821, + "rewards/rejected": -1.4597880840301514, + "step": 2320 + }, + { + "epoch": 2.44, + "learning_rate": 2.6769400097728797e-07, + "logits/chosen": -1.98947274684906, + "logits/rejected": -1.855577826499939, + "logps/chosen": -429.56695556640625, + "logps/rejected": -422.18768310546875, + "loss": 0.5434, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.7829546332359314, + "rewards/margins": 0.5939928293228149, + "rewards/rejected": -1.3769476413726807, + "step": 2330 + }, + { + "epoch": 2.45, + "learning_rate": 2.660192642918321e-07, + "logits/chosen": -1.9994666576385498, + "logits/rejected": -1.9309895038604736, + "logps/chosen": -437.77703857421875, + "logps/rejected": -442.0591735839844, + "loss": 0.5328, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.7046215534210205, + "rewards/margins": 0.5308600068092346, + "rewards/rejected": -1.2354816198349, + "step": 2340 + }, + { + "epoch": 2.46, + "learning_rate": 2.643438054444462e-07, + "logits/chosen": -1.9171969890594482, + "logits/rejected": -1.8357493877410889, + "logps/chosen": -449.2801818847656, + "logps/rejected": -445.749755859375, + "loss": 0.5248, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.7818909883499146, + "rewards/margins": 0.6039397716522217, + "rewards/rejected": -1.3858308792114258, + "step": 2350 + }, + { + "epoch": 2.47, + "learning_rate": 2.626676999662269e-07, + "logits/chosen": -1.9482982158660889, + "logits/rejected": -1.8158115148544312, + "logps/chosen": -417.89886474609375, + "logps/rejected": -441.44964599609375, + "loss": 0.545, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.8549971580505371, + "rewards/margins": 0.5220503807067871, + "rewards/rejected": -1.3770474195480347, + "step": 2360 + }, + { + "epoch": 2.48, + "learning_rate": 2.60991023417421e-07, + "logits/chosen": -1.9489740133285522, + "logits/rejected": -1.7895710468292236, + "logps/chosen": -433.61700439453125, + "logps/rejected": -446.4412536621094, + "loss": 0.5197, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.7963303923606873, + "rewards/margins": 0.702279806137085, + "rewards/rejected": -1.498610258102417, + "step": 2370 + }, + { + "epoch": 2.49, + "learning_rate": 2.593138513840199e-07, + "logits/chosen": -1.8876575231552124, + "logits/rejected": -1.8479945659637451, + "logps/chosen": -415.013916015625, + "logps/rejected": -434.8460998535156, + "loss": 0.532, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.9357647895812988, + "rewards/margins": 0.4299185872077942, + "rewards/rejected": -1.3656833171844482, + "step": 2380 + }, + { + "epoch": 2.5, + "learning_rate": 2.576362594743518e-07, + "logits/chosen": -1.9661098718643188, + "logits/rejected": -1.857072114944458, + "logps/chosen": -408.663818359375, + "logps/rejected": -415.71331787109375, + "loss": 0.5464, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.7987567186355591, + "rewards/margins": 0.6057206392288208, + "rewards/rejected": -1.4044773578643799, + "step": 2390 + }, + { + "epoch": 2.51, + "learning_rate": 2.559583233156734e-07, + "logits/chosen": -1.9324119091033936, + "logits/rejected": -1.7781422138214111, + "logps/chosen": -424.632080078125, + "logps/rejected": -436.2330017089844, + "loss": 0.5143, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8693172335624695, + "rewards/margins": 0.5977479815483093, + "rewards/rejected": -1.4670653343200684, + "step": 2400 + }, + { + "epoch": 2.51, + "eval_logits/chosen": -1.9853116273880005, + "eval_logits/rejected": -1.8784489631652832, + "eval_logps/chosen": -436.2056884765625, + "eval_logps/rejected": -445.7617492675781, + "eval_loss": 0.5805792212486267, + "eval_rewards/accuracies": 0.7321428656578064, + "eval_rewards/chosen": -0.8684592843055725, + "eval_rewards/margins": 0.5101962685585022, + "eval_rewards/rejected": -1.3786555528640747, + "eval_runtime": 338.8092, + "eval_samples_per_second": 5.903, + "eval_steps_per_second": 0.186, + "step": 2400 + }, + { + "epoch": 2.52, + "learning_rate": 2.5428011855076023e-07, + "logits/chosen": -1.9499645233154297, + "logits/rejected": -1.8366508483886719, + "logps/chosen": -436.2471618652344, + "logps/rejected": -451.01507568359375, + "loss": 0.5401, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.8622430562973022, + "rewards/margins": 0.6568028926849365, + "rewards/rejected": -1.5190460681915283, + "step": 2410 + }, + { + "epoch": 2.53, + "learning_rate": 2.5260172083449693e-07, + "logits/chosen": -2.0690159797668457, + "logits/rejected": -1.9587256908416748, + "logps/chosen": -443.0089416503906, + "logps/rejected": -469.9134826660156, + "loss": 0.4956, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.8210717439651489, + "rewards/margins": 0.6067072153091431, + "rewards/rejected": -1.427778959274292, + "step": 2420 + }, + { + "epoch": 2.54, + "learning_rate": 2.509232058304666e-07, + "logits/chosen": -1.95901358127594, + "logits/rejected": -1.9063999652862549, + "logps/chosen": -447.19708251953125, + "logps/rejected": -482.42864990234375, + "loss": 0.5353, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.8421271443367004, + "rewards/margins": 0.6823471784591675, + "rewards/rejected": -1.5244743824005127, + "step": 2430 + }, + { + "epoch": 2.55, + "learning_rate": 2.492446492075396e-07, + "logits/chosen": -1.949507474899292, + "logits/rejected": -1.9200721979141235, + "logps/chosen": -383.2177429199219, + "logps/rejected": -432.87353515625, + "loss": 0.5123, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.887121319770813, + "rewards/margins": 0.6234883069992065, + "rewards/rejected": -1.5106096267700195, + "step": 2440 + }, + { + "epoch": 2.56, + "learning_rate": 2.475661266364628e-07, + "logits/chosen": -1.8691284656524658, + "logits/rejected": -1.7979360818862915, + "logps/chosen": -446.46044921875, + "logps/rejected": -468.6094665527344, + "loss": 0.4934, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.869927704334259, + "rewards/margins": 0.6707456111907959, + "rewards/rejected": -1.5406733751296997, + "step": 2450 + }, + { + "epoch": 2.57, + "learning_rate": 2.4588771378644754e-07, + "logits/chosen": -2.0588347911834717, + "logits/rejected": -1.9376767873764038, + "logps/chosen": -473.5218811035156, + "logps/rejected": -489.5943298339844, + "loss": 0.496, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.9387062788009644, + "rewards/margins": 0.6001947522163391, + "rewards/rejected": -1.5389010906219482, + "step": 2460 + }, + { + "epoch": 2.59, + "learning_rate": 2.4420948632175926e-07, + "logits/chosen": -1.9621975421905518, + "logits/rejected": -1.8935844898223877, + "logps/chosen": -433.59033203125, + "logps/rejected": -479.13861083984375, + "loss": 0.5293, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.7825122475624084, + "rewards/margins": 0.6647100448608398, + "rewards/rejected": -1.447222113609314, + "step": 2470 + }, + { + "epoch": 2.6, + "learning_rate": 2.4253151989830596e-07, + "logits/chosen": -1.8841426372528076, + "logits/rejected": -1.8874428272247314, + "logps/chosen": -425.667724609375, + "logps/rejected": -442.14306640625, + "loss": 0.523, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.8772993087768555, + "rewards/margins": 0.4364490509033203, + "rewards/rejected": -1.3137483596801758, + "step": 2480 + }, + { + "epoch": 2.61, + "learning_rate": 2.408538901602275e-07, + "logits/chosen": -1.9295036792755127, + "logits/rejected": -1.81784987449646, + "logps/chosen": -408.718505859375, + "logps/rejected": -428.29803466796875, + "loss": 0.5225, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.8873177766799927, + "rewards/margins": 0.5828737020492554, + "rewards/rejected": -1.470191240310669, + "step": 2490 + }, + { + "epoch": 2.62, + "learning_rate": 2.3917667273648594e-07, + "logits/chosen": -1.9346414804458618, + "logits/rejected": -1.8728351593017578, + "logps/chosen": -439.614501953125, + "logps/rejected": -444.1954650878906, + "loss": 0.5377, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.9809148907661438, + "rewards/margins": 0.43168848752975464, + "rewards/rejected": -1.4126031398773193, + "step": 2500 + }, + { + "epoch": 2.62, + "eval_logits/chosen": -1.9647775888442993, + "eval_logits/rejected": -1.8571594953536987, + "eval_logps/chosen": -443.1573791503906, + "eval_logps/rejected": -454.7680358886719, + "eval_loss": 0.5785647034645081, + "eval_rewards/accuracies": 0.72817462682724, + "eval_rewards/chosen": -0.9379764199256897, + "eval_rewards/margins": 0.5307427644729614, + "eval_rewards/rejected": -1.468719244003296, + "eval_runtime": 338.5492, + "eval_samples_per_second": 5.908, + "eval_steps_per_second": 0.186, + "step": 2500 + }, + { + "epoch": 2.63, + "learning_rate": 2.374999432374556e-07, + "logits/chosen": -1.8711265325546265, + "logits/rejected": -1.8591110706329346, + "logps/chosen": -402.60113525390625, + "logps/rejected": -465.11822509765625, + "loss": 0.5378, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.9721585512161255, + "rewards/margins": 0.5550946593284607, + "rewards/rejected": -1.5272531509399414, + "step": 2510 + }, + { + "epoch": 2.64, + "learning_rate": 2.3582377725151504e-07, + "logits/chosen": -1.9033355712890625, + "logits/rejected": -1.7703205347061157, + "logps/chosen": -436.557861328125, + "logps/rejected": -434.04351806640625, + "loss": 0.5082, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.9090617299079895, + "rewards/margins": 0.6059231758117676, + "rewards/rejected": -1.5149848461151123, + "step": 2520 + }, + { + "epoch": 2.65, + "learning_rate": 2.3414825034163877e-07, + "logits/chosen": -1.9475148916244507, + "logits/rejected": -1.9001047611236572, + "logps/chosen": -474.4970703125, + "logps/rejected": -480.343505859375, + "loss": 0.5253, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.8905152082443237, + "rewards/margins": 0.5469620227813721, + "rewards/rejected": -1.4374772310256958, + "step": 2530 + }, + { + "epoch": 2.66, + "learning_rate": 2.3247343804199176e-07, + "logits/chosen": -1.8764568567276, + "logits/rejected": -1.7902402877807617, + "logps/chosen": -422.77581787109375, + "logps/rejected": -479.0648498535156, + "loss": 0.496, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.8904309272766113, + "rewards/margins": 0.7856873273849487, + "rewards/rejected": -1.67611825466156, + "step": 2540 + }, + { + "epoch": 2.67, + "learning_rate": 2.3079941585452318e-07, + "logits/chosen": -1.9895591735839844, + "logits/rejected": -1.8538814783096313, + "logps/chosen": -476.71563720703125, + "logps/rejected": -475.23406982421875, + "loss": 0.5143, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.8902314901351929, + "rewards/margins": 0.617530107498169, + "rewards/rejected": -1.5077615976333618, + "step": 2550 + }, + { + "epoch": 2.68, + "learning_rate": 2.2912625924556366e-07, + "logits/chosen": -1.8772594928741455, + "logits/rejected": -1.8478418588638306, + "logps/chosen": -429.35595703125, + "logps/rejected": -496.8270568847656, + "loss": 0.5236, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.8693763613700867, + "rewards/margins": 0.6027558445930481, + "rewards/rejected": -1.4721323251724243, + "step": 2560 + }, + { + "epoch": 2.69, + "learning_rate": 2.2745404364242276e-07, + "logits/chosen": -1.9632251262664795, + "logits/rejected": -1.812048316001892, + "logps/chosen": -458.9662170410156, + "logps/rejected": -465.8284606933594, + "loss": 0.5293, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.9237990379333496, + "rewards/margins": 0.5360890626907349, + "rewards/rejected": -1.459887981414795, + "step": 2570 + }, + { + "epoch": 2.7, + "learning_rate": 2.2578284442998854e-07, + "logits/chosen": -1.8958606719970703, + "logits/rejected": -1.7544406652450562, + "logps/chosen": -471.21124267578125, + "logps/rejected": -447.39581298828125, + "loss": 0.5315, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9969693422317505, + "rewards/margins": 0.6323047876358032, + "rewards/rejected": -1.6292740106582642, + "step": 2580 + }, + { + "epoch": 2.71, + "learning_rate": 2.2411273694732952e-07, + "logits/chosen": -1.8865602016448975, + "logits/rejected": -1.7838201522827148, + "logps/chosen": -442.34429931640625, + "logps/rejected": -468.42486572265625, + "loss": 0.5237, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -0.9607971906661987, + "rewards/margins": 0.6720181703567505, + "rewards/rejected": -1.6328153610229492, + "step": 2590 + }, + { + "epoch": 2.72, + "learning_rate": 2.224437964842979e-07, + "logits/chosen": -1.8734734058380127, + "logits/rejected": -1.7853962182998657, + "logps/chosen": -408.92877197265625, + "logps/rejected": -463.09503173828125, + "loss": 0.4868, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.7472456097602844, + "rewards/margins": 0.808722972869873, + "rewards/rejected": -1.5559687614440918, + "step": 2600 + }, + { + "epoch": 2.72, + "eval_logits/chosen": -1.9504142999649048, + "eval_logits/rejected": -1.8415662050247192, + "eval_logps/chosen": -439.4378662109375, + "eval_logps/rejected": -450.51556396484375, + "eval_loss": 0.579669177532196, + "eval_rewards/accuracies": 0.7301587462425232, + "eval_rewards/chosen": -0.9007813930511475, + "eval_rewards/margins": 0.5254126787185669, + "eval_rewards/rejected": -1.4261939525604248, + "eval_runtime": 397.2665, + "eval_samples_per_second": 5.034, + "eval_steps_per_second": 0.159, + "step": 2600 + }, + { + "epoch": 2.73, + "learning_rate": 2.2077609827813592e-07, + "logits/chosen": -1.8535270690917969, + "logits/rejected": -1.7869393825531006, + "logps/chosen": -416.6768493652344, + "logps/rejected": -461.3543395996094, + "loss": 0.5043, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.9135526418685913, + "rewards/margins": 0.6141065359115601, + "rewards/rejected": -1.5276591777801514, + "step": 2610 + }, + { + "epoch": 2.74, + "learning_rate": 2.1910971751008347e-07, + "logits/chosen": -1.897220253944397, + "logits/rejected": -1.807562232017517, + "logps/chosen": -446.760498046875, + "logps/rejected": -471.2540588378906, + "loss": 0.5076, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.9037901759147644, + "rewards/margins": 0.6555687785148621, + "rewards/rejected": -1.559358835220337, + "step": 2620 + }, + { + "epoch": 2.75, + "learning_rate": 2.1744472930198977e-07, + "logits/chosen": -1.9448814392089844, + "logits/rejected": -1.871311902999878, + "logps/chosen": -445.2456970214844, + "logps/rejected": -482.9363708496094, + "loss": 0.4983, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.9247520565986633, + "rewards/margins": 0.561426043510437, + "rewards/rejected": -1.4861780405044556, + "step": 2630 + }, + { + "epoch": 2.76, + "learning_rate": 2.1578120871292553e-07, + "logits/chosen": -1.9801807403564453, + "logits/rejected": -1.870661973953247, + "logps/chosen": -468.67926025390625, + "logps/rejected": -501.02392578125, + "loss": 0.5288, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9676277041435242, + "rewards/margins": 0.5955823659896851, + "rewards/rejected": -1.5632102489471436, + "step": 2640 + }, + { + "epoch": 2.77, + "learning_rate": 2.141192307358008e-07, + "logits/chosen": -1.8639633655548096, + "logits/rejected": -1.810063123703003, + "logps/chosen": -423.62939453125, + "logps/rejected": -425.0008239746094, + "loss": 0.5144, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.8432968258857727, + "rewards/margins": 0.6521421074867249, + "rewards/rejected": -1.495439052581787, + "step": 2650 + }, + { + "epoch": 2.78, + "learning_rate": 2.1245887029398247e-07, + "logits/chosen": -1.9204838275909424, + "logits/rejected": -1.8166393041610718, + "logps/chosen": -424.845458984375, + "logps/rejected": -459.1109313964844, + "loss": 0.5224, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -0.9046141505241394, + "rewards/margins": 0.6672372817993164, + "rewards/rejected": -1.5718514919281006, + "step": 2660 + }, + { + "epoch": 2.79, + "learning_rate": 2.108002022379184e-07, + "logits/chosen": -1.9082868099212646, + "logits/rejected": -1.8343610763549805, + "logps/chosen": -463.3162536621094, + "logps/rejected": -486.88018798828125, + "loss": 0.5358, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.066706895828247, + "rewards/margins": 0.5957939624786377, + "rewards/rejected": -1.6625009775161743, + "step": 2670 + }, + { + "epoch": 2.8, + "learning_rate": 2.0914330134176185e-07, + "logits/chosen": -1.9134643077850342, + "logits/rejected": -1.8917551040649414, + "logps/chosen": -440.2626953125, + "logps/rejected": -504.54150390625, + "loss": 0.5004, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.9903669357299805, + "rewards/margins": 0.610992968082428, + "rewards/rejected": -1.6013599634170532, + "step": 2680 + }, + { + "epoch": 2.82, + "learning_rate": 2.0748824230000098e-07, + "logits/chosen": -1.8200502395629883, + "logits/rejected": -1.6779934167861938, + "logps/chosen": -431.42169189453125, + "logps/rejected": -430.67974853515625, + "loss": 0.5036, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.9627591967582703, + "rewards/margins": 0.6116820573806763, + "rewards/rejected": -1.5744411945343018, + "step": 2690 + }, + { + "epoch": 2.83, + "learning_rate": 2.0583509972409186e-07, + "logits/chosen": -1.8566009998321533, + "logits/rejected": -1.7274389266967773, + "logps/chosen": -421.042724609375, + "logps/rejected": -423.007080078125, + "loss": 0.5275, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.8959705233573914, + "rewards/margins": 0.5554197430610657, + "rewards/rejected": -1.4513903856277466, + "step": 2700 + }, + { + "epoch": 2.83, + "eval_logits/chosen": -1.921860694885254, + "eval_logits/rejected": -1.8117154836654663, + "eval_logps/chosen": -447.6714172363281, + "eval_logps/rejected": -460.6926574707031, + "eval_loss": 0.575380265712738, + "eval_rewards/accuracies": 0.72817462682724, + "eval_rewards/chosen": -0.983116626739502, + "eval_rewards/margins": 0.5448485016822815, + "eval_rewards/rejected": -1.5279650688171387, + "eval_runtime": 389.6361, + "eval_samples_per_second": 5.133, + "eval_steps_per_second": 0.162, + "step": 2700 + }, + { + "epoch": 2.84, + "learning_rate": 2.0418394813909434e-07, + "logits/chosen": -1.9249767065048218, + "logits/rejected": -1.7549772262573242, + "logps/chosen": -426.38470458984375, + "logps/rejected": -445.249755859375, + "loss": 0.5327, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.9240328669548035, + "rewards/margins": 0.6637715101242065, + "rewards/rejected": -1.5878043174743652, + "step": 2710 + }, + { + "epoch": 2.85, + "learning_rate": 2.025348619803132e-07, + "logits/chosen": -1.861976981163025, + "logits/rejected": -1.747554063796997, + "logps/chosen": -411.2970275878906, + "logps/rejected": -425.08441162109375, + "loss": 0.548, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.8863030672073364, + "rewards/margins": 0.5835781693458557, + "rewards/rejected": -1.4698810577392578, + "step": 2720 + }, + { + "epoch": 2.86, + "learning_rate": 2.0088791558994143e-07, + "logits/chosen": -1.8526477813720703, + "logits/rejected": -1.8287807703018188, + "logps/chosen": -447.93243408203125, + "logps/rejected": -471.99273681640625, + "loss": 0.5235, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.8392646908760071, + "rewards/margins": 0.6029472351074219, + "rewards/rejected": -1.4422122240066528, + "step": 2730 + }, + { + "epoch": 2.87, + "learning_rate": 1.9924318321371013e-07, + "logits/chosen": -1.9511082172393799, + "logits/rejected": -1.810118317604065, + "logps/chosen": -435.09991455078125, + "logps/rejected": -457.4765625, + "loss": 0.4957, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.8674267530441284, + "rewards/margins": 0.7397493124008179, + "rewards/rejected": -1.6071761846542358, + "step": 2740 + }, + { + "epoch": 2.88, + "learning_rate": 1.976007389975401e-07, + "logits/chosen": -1.909641981124878, + "logits/rejected": -1.8077083826065063, + "logps/chosen": -428.82171630859375, + "logps/rejected": -457.6050720214844, + "loss": 0.5122, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.8866938352584839, + "rewards/margins": 0.6875293850898743, + "rewards/rejected": -1.5742231607437134, + "step": 2750 + }, + { + "epoch": 2.89, + "learning_rate": 1.959606569842006e-07, + "logits/chosen": -1.8715641498565674, + "logits/rejected": -1.7681375741958618, + "logps/chosen": -402.3782958984375, + "logps/rejected": -419.69970703125, + "loss": 0.5167, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.9384256601333618, + "rewards/margins": 0.5532727241516113, + "rewards/rejected": -1.4916983842849731, + "step": 2760 + }, + { + "epoch": 2.9, + "learning_rate": 1.9432301110997034e-07, + "logits/chosen": -1.85273015499115, + "logits/rejected": -1.7387834787368774, + "logps/chosen": -444.2647399902344, + "logps/rejected": -481.83685302734375, + "loss": 0.4981, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.9170455932617188, + "rewards/margins": 0.6824191212654114, + "rewards/rejected": -1.5994646549224854, + "step": 2770 + }, + { + "epoch": 2.91, + "learning_rate": 1.9268787520130504e-07, + "logits/chosen": -1.7926514148712158, + "logits/rejected": -1.67780339717865, + "logps/chosen": -389.5690002441406, + "logps/rejected": -420.969970703125, + "loss": 0.5282, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.9584437608718872, + "rewards/margins": 0.5365868806838989, + "rewards/rejected": -1.4950306415557861, + "step": 2780 + }, + { + "epoch": 2.92, + "learning_rate": 1.91055322971509e-07, + "logits/chosen": -1.8190996646881104, + "logits/rejected": -1.7043695449829102, + "logps/chosen": -409.02398681640625, + "logps/rejected": -439.34368896484375, + "loss": 0.5061, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.8551441431045532, + "rewards/margins": 0.780997633934021, + "rewards/rejected": -1.6361417770385742, + "step": 2790 + }, + { + "epoch": 2.93, + "learning_rate": 1.8942542801741207e-07, + "logits/chosen": -1.9003918170928955, + "logits/rejected": -1.779552698135376, + "logps/chosen": -433.330810546875, + "logps/rejected": -451.95916748046875, + "loss": 0.5042, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.9382666349411011, + "rewards/margins": 0.6287131309509277, + "rewards/rejected": -1.5669798851013184, + "step": 2800 + }, + { + "epoch": 2.93, + "eval_logits/chosen": -1.9484288692474365, + "eval_logits/rejected": -1.8400510549545288, + "eval_logps/chosen": -447.79278564453125, + "eval_logps/rejected": -460.85772705078125, + "eval_loss": 0.574294924736023, + "eval_rewards/accuracies": 0.7321428656578064, + "eval_rewards/chosen": -0.984330415725708, + "eval_rewards/margins": 0.5452856421470642, + "eval_rewards/rejected": -1.5296159982681274, + "eval_runtime": 367.54, + "eval_samples_per_second": 5.442, + "eval_steps_per_second": 0.171, + "step": 2800 + }, + { + "epoch": 2.94, + "learning_rate": 1.8779826381605198e-07, + "logits/chosen": -1.8743737936019897, + "logits/rejected": -1.812796950340271, + "logps/chosen": -486.4170837402344, + "logps/rejected": -492.00811767578125, + "loss": 0.5333, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.0714589357376099, + "rewards/margins": 0.49343618750572205, + "rewards/rejected": -1.5648950338363647, + "step": 2810 + }, + { + "epoch": 2.95, + "learning_rate": 1.861739037213616e-07, + "logits/chosen": -1.9613018035888672, + "logits/rejected": -1.8247106075286865, + "logps/chosen": -471.99530029296875, + "logps/rejected": -517.2361450195312, + "loss": 0.5048, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.8858569264411926, + "rewards/margins": 0.7351481318473816, + "rewards/rejected": -1.6210052967071533, + "step": 2820 + }, + { + "epoch": 2.96, + "learning_rate": 1.845524209608627e-07, + "logits/chosen": -1.8594707250595093, + "logits/rejected": -1.7978969812393188, + "logps/chosen": -451.421875, + "logps/rejected": -477.1441955566406, + "loss": 0.516, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.957187831401825, + "rewards/margins": 0.6043495535850525, + "rewards/rejected": -1.5615373849868774, + "step": 2830 + }, + { + "epoch": 2.97, + "learning_rate": 1.8293388863236391e-07, + "logits/chosen": -1.8253387212753296, + "logits/rejected": -1.7428086996078491, + "logps/chosen": -437.1249084472656, + "logps/rejected": -436.38995361328125, + "loss": 0.4964, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8720731735229492, + "rewards/margins": 0.5980420708656311, + "rewards/rejected": -1.4701151847839355, + "step": 2840 + }, + { + "epoch": 2.98, + "learning_rate": 1.8131837970066635e-07, + "logits/chosen": -1.9482589960098267, + "logits/rejected": -1.8544782400131226, + "logps/chosen": -473.46160888671875, + "logps/rejected": -489.701904296875, + "loss": 0.5122, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.9760125875473022, + "rewards/margins": 0.6014086008071899, + "rewards/rejected": -1.5774211883544922, + "step": 2850 + }, + { + "epoch": 2.99, + "learning_rate": 1.7970596699427355e-07, + "logits/chosen": -1.96894109249115, + "logits/rejected": -1.8086011409759521, + "logps/chosen": -452.5389099121094, + "logps/rejected": -470.1752014160156, + "loss": 0.4997, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.9787393808364868, + "rewards/margins": 0.6899208426475525, + "rewards/rejected": -1.6686604022979736, + "step": 2860 + }, + { + "epoch": 3.0, + "learning_rate": 1.7809672320210872e-07, + "logits/chosen": -1.898374319076538, + "logits/rejected": -1.8311573266983032, + "logps/chosen": -472.875244140625, + "logps/rejected": -486.66265869140625, + "loss": 0.5019, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.9755460619926453, + "rewards/margins": 0.6229265928268433, + "rewards/rejected": -1.5984727144241333, + "step": 2870 + }, + { + "epoch": 3.01, + "learning_rate": 1.7649072087023784e-07, + "logits/chosen": -1.9573551416397095, + "logits/rejected": -1.8683230876922607, + "logps/chosen": -464.22247314453125, + "logps/rejected": -508.75341796875, + "loss": 0.4887, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.9004373550415039, + "rewards/margins": 0.733317494392395, + "rewards/rejected": -1.6337547302246094, + "step": 2880 + }, + { + "epoch": 3.02, + "learning_rate": 1.748880323985989e-07, + "logits/chosen": -1.904697060585022, + "logits/rejected": -1.7625919580459595, + "logps/chosen": -442.80084228515625, + "logps/rejected": -461.4578552246094, + "loss": 0.4727, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -0.9410102963447571, + "rewards/margins": 0.7015891671180725, + "rewards/rejected": -1.6425994634628296, + "step": 2890 + }, + { + "epoch": 3.04, + "learning_rate": 1.7328873003773848e-07, + "logits/chosen": -1.9040206670761108, + "logits/rejected": -1.8098185062408447, + "logps/chosen": -448.4427795410156, + "logps/rejected": -453.7347717285156, + "loss": 0.4862, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.0325968265533447, + "rewards/margins": 0.6374907493591309, + "rewards/rejected": -1.6700875759124756, + "step": 2900 + }, + { + "epoch": 3.04, + "eval_logits/chosen": -1.9315091371536255, + "eval_logits/rejected": -1.821599006652832, + "eval_logps/chosen": -452.8863220214844, + "eval_logps/rejected": -467.03509521484375, + "eval_loss": 0.575552225112915, + "eval_rewards/accuracies": 0.7202380895614624, + "eval_rewards/chosen": -1.0352660417556763, + "eval_rewards/margins": 0.5561232566833496, + "eval_rewards/rejected": -1.5913892984390259, + "eval_runtime": 368.824, + "eval_samples_per_second": 5.423, + "eval_steps_per_second": 0.171, + "step": 2900 + }, + { + "epoch": 3.05, + "learning_rate": 1.7169288588555424e-07, + "logits/chosen": -1.881466269493103, + "logits/rejected": -1.747097373008728, + "logps/chosen": -449.9195251464844, + "logps/rejected": -475.69586181640625, + "loss": 0.4603, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -0.9050649404525757, + "rewards/margins": 0.9311367869377136, + "rewards/rejected": -1.8362019062042236, + "step": 2910 + }, + { + "epoch": 3.06, + "learning_rate": 1.701005718840453e-07, + "logits/chosen": -1.882340431213379, + "logits/rejected": -1.7553138732910156, + "logps/chosen": -452.6888732910156, + "logps/rejected": -472.3089904785156, + "loss": 0.4984, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.981615424156189, + "rewards/margins": 0.6442556977272034, + "rewards/rejected": -1.6258710622787476, + "step": 2920 + }, + { + "epoch": 3.07, + "learning_rate": 1.6851185981606795e-07, + "logits/chosen": -1.8996105194091797, + "logits/rejected": -1.7734209299087524, + "logps/chosen": -448.3755798339844, + "logps/rejected": -456.1847229003906, + "loss": 0.4861, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.964205265045166, + "rewards/margins": 0.7373130917549133, + "rewards/rejected": -1.7015184164047241, + "step": 2930 + }, + { + "epoch": 3.08, + "learning_rate": 1.669268213021009e-07, + "logits/chosen": -1.9143121242523193, + "logits/rejected": -1.822167158126831, + "logps/chosen": -435.2249450683594, + "logps/rejected": -487.91827392578125, + "loss": 0.4969, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.9106824994087219, + "rewards/margins": 0.8411850929260254, + "rewards/rejected": -1.751867651939392, + "step": 2940 + }, + { + "epoch": 3.09, + "learning_rate": 1.6534552779701555e-07, + "logits/chosen": -1.7521066665649414, + "logits/rejected": -1.6937494277954102, + "logps/chosen": -430.65081787109375, + "logps/rejected": -494.86737060546875, + "loss": 0.4794, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -0.9239088892936707, + "rewards/margins": 0.80866938829422, + "rewards/rejected": -1.7325782775878906, + "step": 2950 + }, + { + "epoch": 3.1, + "learning_rate": 1.6376805058685538e-07, + "logits/chosen": -1.8456952571868896, + "logits/rejected": -1.7061046361923218, + "logps/chosen": -420.896728515625, + "logps/rejected": -445.06024169921875, + "loss": 0.5017, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.9549886584281921, + "rewards/margins": 0.6535457372665405, + "rewards/rejected": -1.6085344552993774, + "step": 2960 + }, + { + "epoch": 3.11, + "learning_rate": 1.6219446078562192e-07, + "logits/chosen": -1.847887635231018, + "logits/rejected": -1.7504163980484009, + "logps/chosen": -455.67535400390625, + "logps/rejected": -512.421142578125, + "loss": 0.4828, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -0.9610303640365601, + "rewards/margins": 0.8271835446357727, + "rewards/rejected": -1.7882139682769775, + "step": 2970 + }, + { + "epoch": 3.12, + "learning_rate": 1.6062482933206911e-07, + "logits/chosen": -1.778282880783081, + "logits/rejected": -1.736619234085083, + "logps/chosen": -446.7540588378906, + "logps/rejected": -529.2030029296875, + "loss": 0.4945, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.0167171955108643, + "rewards/margins": 0.7579668164253235, + "rewards/rejected": -1.774683952331543, + "step": 2980 + }, + { + "epoch": 3.13, + "learning_rate": 1.5905922698650536e-07, + "logits/chosen": -1.877215027809143, + "logits/rejected": -1.774298906326294, + "logps/chosen": -445.44500732421875, + "logps/rejected": -485.49560546875, + "loss": 0.4743, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.9984437227249146, + "rewards/margins": 0.8055132627487183, + "rewards/rejected": -1.803957223892212, + "step": 2990 + }, + { + "epoch": 3.14, + "learning_rate": 1.574977243276031e-07, + "logits/chosen": -1.893512487411499, + "logits/rejected": -1.7661199569702148, + "logps/chosen": -487.65594482421875, + "logps/rejected": -501.9541931152344, + "loss": 0.4817, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -0.9194480180740356, + "rewards/margins": 0.7937260866165161, + "rewards/rejected": -1.7131742238998413, + "step": 3000 + }, + { + "epoch": 3.14, + "eval_logits/chosen": -1.883595585823059, + "eval_logits/rejected": -1.7716362476348877, + "eval_logps/chosen": -453.8664245605469, + "eval_logps/rejected": -469.6033630371094, + "eval_loss": 0.5785899758338928, + "eval_rewards/accuracies": 0.72817462682724, + "eval_rewards/chosen": -1.0450665950775146, + "eval_rewards/margins": 0.5720054507255554, + "eval_rewards/rejected": -1.6170721054077148, + "eval_runtime": 357.8971, + "eval_samples_per_second": 5.588, + "eval_steps_per_second": 0.176, + "step": 3000 + }, + { + "epoch": 3.15, + "learning_rate": 1.5594039174921808e-07, + "logits/chosen": -1.8807668685913086, + "logits/rejected": -1.770019769668579, + "logps/chosen": -436.2010192871094, + "logps/rejected": -452.5880432128906, + "loss": 0.5051, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.9551477432250977, + "rewards/margins": 0.6700800061225891, + "rewards/rejected": -1.6252275705337524, + "step": 3010 + }, + { + "epoch": 3.16, + "learning_rate": 1.543872994572145e-07, + "logits/chosen": -1.7073822021484375, + "logits/rejected": -1.5819844007492065, + "logps/chosen": -417.85791015625, + "logps/rejected": -461.3323669433594, + "loss": 0.4775, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -0.9456771612167358, + "rewards/margins": 0.8424018621444702, + "rewards/rejected": -1.7880792617797852, + "step": 3020 + }, + { + "epoch": 3.17, + "learning_rate": 1.5283851746630173e-07, + "logits/chosen": -1.8902000188827515, + "logits/rejected": -1.8180053234100342, + "logps/chosen": -436.1978454589844, + "logps/rejected": -472.62860107421875, + "loss": 0.5024, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.9585925936698914, + "rewards/margins": 0.6844288110733032, + "rewards/rejected": -1.6430212259292603, + "step": 3030 + }, + { + "epoch": 3.18, + "learning_rate": 1.5129411559687632e-07, + "logits/chosen": -1.8170684576034546, + "logits/rejected": -1.6763471364974976, + "logps/chosen": -446.4208068847656, + "logps/rejected": -443.6482849121094, + "loss": 0.5056, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0374691486358643, + "rewards/margins": 0.6019953489303589, + "rewards/rejected": -1.6394646167755127, + "step": 3040 + }, + { + "epoch": 3.19, + "learning_rate": 1.4975416347187593e-07, + "logits/chosen": -1.917802095413208, + "logits/rejected": -1.7453248500823975, + "logps/chosen": -473.4043884277344, + "logps/rejected": -455.8473205566406, + "loss": 0.4936, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.9156352281570435, + "rewards/margins": 0.7537345886230469, + "rewards/rejected": -1.6693699359893799, + "step": 3050 + }, + { + "epoch": 3.2, + "learning_rate": 1.4821873051363955e-07, + "logits/chosen": -1.8621914386749268, + "logits/rejected": -1.749542474746704, + "logps/chosen": -451.08282470703125, + "logps/rejected": -497.8018493652344, + "loss": 0.4788, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.9146499633789062, + "rewards/margins": 0.7977242469787598, + "rewards/rejected": -1.7123743295669556, + "step": 3060 + }, + { + "epoch": 3.21, + "learning_rate": 1.4668788594077859e-07, + "logits/chosen": -1.7952085733413696, + "logits/rejected": -1.6200335025787354, + "logps/chosen": -431.6758728027344, + "logps/rejected": -463.505126953125, + "loss": 0.4606, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.8738845586776733, + "rewards/margins": 0.7860761880874634, + "rewards/rejected": -1.6599609851837158, + "step": 3070 + }, + { + "epoch": 3.22, + "learning_rate": 1.4516169876505596e-07, + "logits/chosen": -1.8675405979156494, + "logits/rejected": -1.6885595321655273, + "logps/chosen": -459.55908203125, + "logps/rejected": -451.66259765625, + "loss": 0.5049, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.923184871673584, + "rewards/margins": 0.6750983595848083, + "rewards/rejected": -1.598283290863037, + "step": 3080 + }, + { + "epoch": 3.23, + "learning_rate": 1.4364023778827538e-07, + "logits/chosen": -1.8272289037704468, + "logits/rejected": -1.7138440608978271, + "logps/chosen": -444.3658752441406, + "logps/rejected": -482.9459533691406, + "loss": 0.4838, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.017207384109497, + "rewards/margins": 0.6880172491073608, + "rewards/rejected": -1.705224633216858, + "step": 3090 + }, + { + "epoch": 3.24, + "learning_rate": 1.4212357159917942e-07, + "logits/chosen": -1.8443920612335205, + "logits/rejected": -1.7631704807281494, + "logps/chosen": -409.96490478515625, + "logps/rejected": -471.09027099609375, + "loss": 0.4767, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.9304403066635132, + "rewards/margins": 0.6487723588943481, + "rewards/rejected": -1.5792125463485718, + "step": 3100 + }, + { + "epoch": 3.24, + "eval_logits/chosen": -1.866295576095581, + "eval_logits/rejected": -1.753827452659607, + "eval_logps/chosen": -457.42584228515625, + "eval_logps/rejected": -472.9984436035156, + "eval_loss": 0.5770441293716431, + "eval_rewards/accuracies": 0.726190447807312, + "eval_rewards/chosen": -1.0806607007980347, + "eval_rewards/margins": 0.5703620314598083, + "eval_rewards/rejected": -1.6510227918624878, + "eval_runtime": 378.5753, + "eval_samples_per_second": 5.283, + "eval_steps_per_second": 0.166, + "step": 3100 + }, + { + "epoch": 3.25, + "learning_rate": 1.4061176857035765e-07, + "logits/chosen": -1.8807693719863892, + "logits/rejected": -1.8139241933822632, + "logps/chosen": -469.0538635253906, + "logps/rejected": -511.9051208496094, + "loss": 0.4898, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.9563783407211304, + "rewards/margins": 0.724995493888855, + "rewards/rejected": -1.6813738346099854, + "step": 3110 + }, + { + "epoch": 3.27, + "learning_rate": 1.391048968551643e-07, + "logits/chosen": -1.739332914352417, + "logits/rejected": -1.6512079238891602, + "logps/chosen": -407.3808288574219, + "logps/rejected": -492.01348876953125, + "loss": 0.4484, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.9379755258560181, + "rewards/margins": 0.8809243440628052, + "rewards/rejected": -1.8188997507095337, + "step": 3120 + }, + { + "epoch": 3.28, + "learning_rate": 1.376030243846456e-07, + "logits/chosen": -1.8204562664031982, + "logits/rejected": -1.735701322555542, + "logps/chosen": -419.04693603515625, + "logps/rejected": -456.9537048339844, + "loss": 0.499, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -0.9971426725387573, + "rewards/margins": 0.8115785717964172, + "rewards/rejected": -1.8087211847305298, + "step": 3130 + }, + { + "epoch": 3.29, + "learning_rate": 1.3610621886447792e-07, + "logits/chosen": -1.8077905178070068, + "logits/rejected": -1.7940162420272827, + "logps/chosen": -393.58673095703125, + "logps/rejected": -454.60418701171875, + "loss": 0.4829, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.9757000207901001, + "rewards/margins": 0.7894998788833618, + "rewards/rejected": -1.765199899673462, + "step": 3140 + }, + { + "epoch": 3.3, + "learning_rate": 1.3461454777191512e-07, + "logits/chosen": -1.8403291702270508, + "logits/rejected": -1.7041610479354858, + "logps/chosen": -441.6806640625, + "logps/rejected": -433.76751708984375, + "loss": 0.4941, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.9537010192871094, + "rewards/margins": 0.6518079042434692, + "rewards/rejected": -1.6055090427398682, + "step": 3150 + }, + { + "epoch": 3.31, + "learning_rate": 1.3312807835274676e-07, + "logits/chosen": -1.8076324462890625, + "logits/rejected": -1.7209094762802124, + "logps/chosen": -429.984375, + "logps/rejected": -467.8804626464844, + "loss": 0.4794, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.0047121047973633, + "rewards/margins": 0.705715000629425, + "rewards/rejected": -1.710426926612854, + "step": 3160 + }, + { + "epoch": 3.32, + "learning_rate": 1.3164687761826628e-07, + "logits/chosen": -1.801439642906189, + "logits/rejected": -1.6736797094345093, + "logps/chosen": -430.87982177734375, + "logps/rejected": -493.06671142578125, + "loss": 0.47, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.0352692604064941, + "rewards/margins": 0.8472986221313477, + "rewards/rejected": -1.8825680017471313, + "step": 3170 + }, + { + "epoch": 3.33, + "learning_rate": 1.3017101234225097e-07, + "logits/chosen": -1.8457056283950806, + "logits/rejected": -1.7224591970443726, + "logps/chosen": -449.48101806640625, + "logps/rejected": -476.2332458496094, + "loss": 0.4734, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.9599472284317017, + "rewards/margins": 0.7956485748291016, + "rewards/rejected": -1.7555955648422241, + "step": 3180 + }, + { + "epoch": 3.34, + "learning_rate": 1.2870054905795083e-07, + "logits/chosen": -1.8150501251220703, + "logits/rejected": -1.7427335977554321, + "logps/chosen": -452.417724609375, + "logps/rejected": -494.1238708496094, + "loss": 0.4863, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.044499158859253, + "rewards/margins": 0.6634548306465149, + "rewards/rejected": -1.7079538106918335, + "step": 3190 + }, + { + "epoch": 3.35, + "learning_rate": 1.272355540550893e-07, + "logits/chosen": -1.6971858739852905, + "logits/rejected": -1.6030826568603516, + "logps/chosen": -432.5633850097656, + "logps/rejected": -473.17950439453125, + "loss": 0.4794, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.0610132217407227, + "rewards/margins": 0.7416720390319824, + "rewards/rejected": -1.8026853799819946, + "step": 3200 + }, + { + "epoch": 3.35, + "eval_logits/chosen": -1.8515363931655884, + "eval_logits/rejected": -1.738411784172058, + "eval_logps/chosen": -460.2549743652344, + "eval_logps/rejected": -476.874267578125, + "eval_loss": 0.5789009928703308, + "eval_rewards/accuracies": 0.726190447807312, + "eval_rewards/chosen": -1.108952283859253, + "eval_rewards/margins": 0.5808290839195251, + "eval_rewards/rejected": -1.6897813081741333, + "eval_runtime": 388.0721, + "eval_samples_per_second": 5.154, + "eval_steps_per_second": 0.162, + "step": 3200 + }, + { + "epoch": 3.36, + "learning_rate": 1.2577609337687545e-07, + "logits/chosen": -1.7929632663726807, + "logits/rejected": -1.6986091136932373, + "logps/chosen": -415.9329528808594, + "logps/rejected": -493.5342712402344, + "loss": 0.4843, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.0511469841003418, + "rewards/margins": 0.8548563718795776, + "rewards/rejected": -1.9060032367706299, + "step": 3210 + }, + { + "epoch": 3.37, + "learning_rate": 1.2432223281702616e-07, + "logits/chosen": -1.8026511669158936, + "logits/rejected": -1.7878223657608032, + "logps/chosen": -420.894287109375, + "logps/rejected": -473.5709533691406, + "loss": 0.4781, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.0878639221191406, + "rewards/margins": 0.5462750196456909, + "rewards/rejected": -1.634138822555542, + "step": 3220 + }, + { + "epoch": 3.38, + "learning_rate": 1.228740379168004e-07, + "logits/chosen": -1.7239850759506226, + "logits/rejected": -1.6399204730987549, + "logps/chosen": -473.14208984375, + "logps/rejected": -485.84442138671875, + "loss": 0.4803, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.0646392107009888, + "rewards/margins": 0.8183633685112, + "rewards/rejected": -1.8830026388168335, + "step": 3230 + }, + { + "epoch": 3.39, + "learning_rate": 1.2143157396204415e-07, + "logits/chosen": -1.7619224786758423, + "logits/rejected": -1.7232242822647095, + "logps/chosen": -421.8963928222656, + "logps/rejected": -484.1393127441406, + "loss": 0.5063, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0958998203277588, + "rewards/margins": 0.6340087652206421, + "rewards/rejected": -1.7299085855484009, + "step": 3240 + }, + { + "epoch": 3.4, + "learning_rate": 1.199949059802478e-07, + "logits/chosen": -1.8530910015106201, + "logits/rejected": -1.7129993438720703, + "logps/chosen": -473.8306579589844, + "logps/rejected": -485.2450256347656, + "loss": 0.4622, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.9937135577201843, + "rewards/margins": 0.8251352310180664, + "rewards/rejected": -1.8188488483428955, + "step": 3250 + }, + { + "epoch": 3.41, + "learning_rate": 1.1856409873761428e-07, + "logits/chosen": -1.750261902809143, + "logits/rejected": -1.646456003189087, + "logps/chosen": -430.5122985839844, + "logps/rejected": -435.7897033691406, + "loss": 0.4918, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.9967744946479797, + "rewards/margins": 0.6008384227752686, + "rewards/rejected": -1.5976128578186035, + "step": 3260 + }, + { + "epoch": 3.42, + "learning_rate": 1.1713921673613961e-07, + "logits/chosen": -1.855929970741272, + "logits/rejected": -1.7337143421173096, + "logps/chosen": -441.72998046875, + "logps/rejected": -476.8902893066406, + "loss": 0.4857, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.0006901025772095, + "rewards/margins": 0.8095808029174805, + "rewards/rejected": -1.81027090549469, + "step": 3270 + }, + { + "epoch": 3.43, + "learning_rate": 1.1572032421070452e-07, + "logits/chosen": -1.7859611511230469, + "logits/rejected": -1.5934228897094727, + "logps/chosen": -472.6265563964844, + "logps/rejected": -509.2264709472656, + "loss": 0.4612, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.0804340839385986, + "rewards/margins": 0.8621044158935547, + "rewards/rejected": -1.9425384998321533, + "step": 3280 + }, + { + "epoch": 3.44, + "learning_rate": 1.1430748512617974e-07, + "logits/chosen": -1.9406483173370361, + "logits/rejected": -1.8423467874526978, + "logps/chosen": -465.8262634277344, + "logps/rejected": -487.7626037597656, + "loss": 0.4756, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.089054822921753, + "rewards/margins": 0.6790697574615479, + "rewards/rejected": -1.7681243419647217, + "step": 3290 + }, + { + "epoch": 3.46, + "learning_rate": 1.1290076317454142e-07, + "logits/chosen": -1.7945177555084229, + "logits/rejected": -1.6354175806045532, + "logps/chosen": -451.65625, + "logps/rejected": -489.162841796875, + "loss": 0.4784, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.0753741264343262, + "rewards/margins": 0.7434049844741821, + "rewards/rejected": -1.8187793493270874, + "step": 3300 + }, + { + "epoch": 3.46, + "eval_logits/chosen": -1.844208002090454, + "eval_logits/rejected": -1.731345772743225, + "eval_logps/chosen": -468.6473388671875, + "eval_logps/rejected": -486.3179626464844, + "eval_loss": 0.573898196220398, + "eval_rewards/accuracies": 0.716269850730896, + "eval_rewards/chosen": -1.1928762197494507, + "eval_rewards/margins": 0.5913423895835876, + "eval_rewards/rejected": -1.7842185497283936, + "eval_runtime": 214.6932, + "eval_samples_per_second": 9.316, + "eval_steps_per_second": 0.293, + "step": 3300 + }, + { + "epoch": 3.47, + "learning_rate": 1.115002217720001e-07, + "logits/chosen": -1.7635328769683838, + "logits/rejected": -1.6847097873687744, + "logps/chosen": -431.44573974609375, + "logps/rejected": -467.83251953125, + "loss": 0.4942, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.0202503204345703, + "rewards/margins": 0.7337538003921509, + "rewards/rejected": -1.7540042400360107, + "step": 3310 + }, + { + "epoch": 3.48, + "learning_rate": 1.1010592405614221e-07, + "logits/chosen": -1.7923495769500732, + "logits/rejected": -1.7173646688461304, + "logps/chosen": -435.59906005859375, + "logps/rejected": -487.1529235839844, + "loss": 0.4797, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.1358600854873657, + "rewards/margins": 0.7092723846435547, + "rewards/rejected": -1.8451322317123413, + "step": 3320 + }, + { + "epoch": 3.49, + "learning_rate": 1.087179328830834e-07, + "logits/chosen": -1.7255041599273682, + "logits/rejected": -1.6920020580291748, + "logps/chosen": -379.7822265625, + "logps/rejected": -449.1914978027344, + "loss": 0.4954, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.0361435413360596, + "rewards/margins": 0.6081300973892212, + "rewards/rejected": -1.6442735195159912, + "step": 3330 + }, + { + "epoch": 3.5, + "learning_rate": 1.0733631082463517e-07, + "logits/chosen": -1.8408622741699219, + "logits/rejected": -1.711627721786499, + "logps/chosen": -470.7630310058594, + "logps/rejected": -467.80767822265625, + "loss": 0.4856, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.1355063915252686, + "rewards/margins": 0.6574573516845703, + "rewards/rejected": -1.7929637432098389, + "step": 3340 + }, + { + "epoch": 3.51, + "learning_rate": 1.0596112016548372e-07, + "logits/chosen": -1.770371675491333, + "logits/rejected": -1.734692931175232, + "logps/chosen": -418.6500549316406, + "logps/rejected": -466.70330810546875, + "loss": 0.4953, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.0982646942138672, + "rewards/margins": 0.5657511949539185, + "rewards/rejected": -1.664015769958496, + "step": 3350 + }, + { + "epoch": 3.52, + "learning_rate": 1.0459242290038259e-07, + "logits/chosen": -1.820656180381775, + "logits/rejected": -1.7680647373199463, + "logps/chosen": -428.38922119140625, + "logps/rejected": -440.9400939941406, + "loss": 0.5059, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.0621531009674072, + "rewards/margins": 0.5993833541870117, + "rewards/rejected": -1.661536455154419, + "step": 3360 + }, + { + "epoch": 3.53, + "learning_rate": 1.0323028073135756e-07, + "logits/chosen": -1.7957019805908203, + "logits/rejected": -1.682960867881775, + "logps/chosen": -457.93780517578125, + "logps/rejected": -469.3650817871094, + "loss": 0.4787, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -0.8964568972587585, + "rewards/margins": 0.7799767255783081, + "rewards/rejected": -1.676433801651001, + "step": 3370 + }, + { + "epoch": 3.54, + "learning_rate": 1.0187475506492526e-07, + "logits/chosen": -1.8339675664901733, + "logits/rejected": -1.712244987487793, + "logps/chosen": -443.6895446777344, + "logps/rejected": -467.30810546875, + "loss": 0.487, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.044893503189087, + "rewards/margins": 0.7343738079071045, + "rewards/rejected": -1.7792673110961914, + "step": 3380 + }, + { + "epoch": 3.55, + "learning_rate": 1.0052590700932445e-07, + "logits/chosen": -1.8773367404937744, + "logits/rejected": -1.78768789768219, + "logps/chosen": -436.32806396484375, + "logps/rejected": -466.9984436035156, + "loss": 0.4934, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.0362013578414917, + "rewards/margins": 0.7617989182472229, + "rewards/rejected": -1.7980003356933594, + "step": 3390 + }, + { + "epoch": 3.56, + "learning_rate": 9.918379737176207e-08, + "logits/chosen": -1.7708876132965088, + "logits/rejected": -1.7042875289916992, + "logps/chosen": -430.687255859375, + "logps/rejected": -476.30035400390625, + "loss": 0.4797, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0874220132827759, + "rewards/margins": 0.6192290782928467, + "rewards/rejected": -1.7066510915756226, + "step": 3400 + }, + { + "epoch": 3.56, + "eval_logits/chosen": -1.846415400505066, + "eval_logits/rejected": -1.7339593172073364, + "eval_logps/chosen": -464.2335510253906, + "eval_logps/rejected": -480.9566345214844, + "eval_loss": 0.5754001140594482, + "eval_rewards/accuracies": 0.7202380895614624, + "eval_rewards/chosen": -1.1487380266189575, + "eval_rewards/margins": 0.5818668603897095, + "eval_rewards/rejected": -1.730604887008667, + "eval_runtime": 230.6525, + "eval_samples_per_second": 8.671, + "eval_steps_per_second": 0.273, + "step": 3400 + }, + { + "epoch": 3.57, + "learning_rate": 9.78484866556713e-08, + "logits/chosen": -1.7351133823394775, + "logits/rejected": -1.6162784099578857, + "logps/chosen": -431.660400390625, + "logps/rejected": -473.376220703125, + "loss": 0.4657, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -0.9811315536499023, + "rewards/margins": 0.7743980288505554, + "rewards/rejected": -1.755529761314392, + "step": 3410 + }, + { + "epoch": 3.58, + "learning_rate": 9.652003505798397e-08, + "logits/chosen": -1.7996597290039062, + "logits/rejected": -1.6331745386123657, + "logps/chosen": -447.5265197753906, + "logps/rejected": -471.54364013671875, + "loss": 0.4628, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -0.9811578989028931, + "rewards/margins": 0.927898108959198, + "rewards/rejected": -1.9090559482574463, + "step": 3420 + }, + { + "epoch": 3.59, + "learning_rate": 9.519850246641739e-08, + "logits/chosen": -1.8197190761566162, + "logits/rejected": -1.692530632019043, + "logps/chosen": -466.42608642578125, + "logps/rejected": -493.58819580078125, + "loss": 0.5109, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.1230065822601318, + "rewards/margins": 0.6617223024368286, + "rewards/rejected": -1.784728765487671, + "step": 3430 + }, + { + "epoch": 3.6, + "learning_rate": 9.38839484567741e-08, + "logits/chosen": -1.8353790044784546, + "logits/rejected": -1.7911113500595093, + "logps/chosen": -457.2601013183594, + "logps/rejected": -514.238037109375, + "loss": 0.4777, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0778439044952393, + "rewards/margins": 0.7159544825553894, + "rewards/rejected": -1.7937984466552734, + "step": 3440 + }, + { + "epoch": 3.61, + "learning_rate": 9.25764322902564e-08, + "logits/chosen": -1.8523584604263306, + "logits/rejected": -1.746606469154358, + "logps/chosen": -434.7613830566406, + "logps/rejected": -508.73553466796875, + "loss": 0.4831, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -0.9743539094924927, + "rewards/margins": 0.7968435883522034, + "rewards/rejected": -1.7711975574493408, + "step": 3450 + }, + { + "epoch": 3.62, + "learning_rate": 9.127601291079436e-08, + "logits/chosen": -1.73825204372406, + "logits/rejected": -1.714342713356018, + "logps/chosen": -428.89892578125, + "logps/rejected": -511.9532775878906, + "loss": 0.4928, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.1295487880706787, + "rewards/margins": 0.6823315024375916, + "rewards/rejected": -1.811880350112915, + "step": 3460 + }, + { + "epoch": 3.63, + "learning_rate": 8.998274894238953e-08, + "logits/chosen": -1.8188636302947998, + "logits/rejected": -1.7010266780853271, + "logps/chosen": -440.44732666015625, + "logps/rejected": -499.5472106933594, + "loss": 0.4817, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.0014610290527344, + "rewards/margins": 0.7511878609657288, + "rewards/rejected": -1.7526487112045288, + "step": 3470 + }, + { + "epoch": 3.64, + "learning_rate": 8.869669868647084e-08, + "logits/chosen": -1.884778380393982, + "logits/rejected": -1.7300710678100586, + "logps/chosen": -481.62237548828125, + "logps/rejected": -493.79840087890625, + "loss": 0.4825, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.0563971996307373, + "rewards/margins": 0.7872760891914368, + "rewards/rejected": -1.8436731100082397, + "step": 3480 + }, + { + "epoch": 3.65, + "learning_rate": 8.741792011926736e-08, + "logits/chosen": -1.8345582485198975, + "logits/rejected": -1.776908278465271, + "logps/chosen": -457.0000915527344, + "logps/rejected": -501.9378967285156, + "loss": 0.5381, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.134314775466919, + "rewards/margins": 0.6081364750862122, + "rewards/rejected": -1.7424513101577759, + "step": 3490 + }, + { + "epoch": 3.66, + "learning_rate": 8.614647088919424e-08, + "logits/chosen": -1.7891185283660889, + "logits/rejected": -1.7254148721694946, + "logps/chosen": -481.73486328125, + "logps/rejected": -502.29901123046875, + "loss": 0.4967, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.0791096687316895, + "rewards/margins": 0.6187223792076111, + "rewards/rejected": -1.6978321075439453, + "step": 3500 + }, + { + "epoch": 3.66, + "eval_logits/chosen": -1.845801830291748, + "eval_logits/rejected": -1.7331349849700928, + "eval_logps/chosen": -462.4029846191406, + "eval_logps/rejected": -478.66900634765625, + "eval_loss": 0.576257586479187, + "eval_rewards/accuracies": 0.72817462682724, + "eval_rewards/chosen": -1.1304326057434082, + "eval_rewards/margins": 0.5772957801818848, + "eval_rewards/rejected": -1.7077282667160034, + "eval_runtime": 232.7718, + "eval_samples_per_second": 8.592, + "eval_steps_per_second": 0.271, + "step": 3500 + }, + { + "epoch": 3.68, + "learning_rate": 8.488240831425395e-08, + "logits/chosen": -1.6944081783294678, + "logits/rejected": -1.602821946144104, + "logps/chosen": -444.11724853515625, + "logps/rejected": -483.67230224609375, + "loss": 0.4754, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.1205617189407349, + "rewards/margins": 0.7284508943557739, + "rewards/rejected": -1.8490123748779297, + "step": 3510 + }, + { + "epoch": 3.69, + "learning_rate": 8.362578937945231e-08, + "logits/chosen": -1.7747758626937866, + "logits/rejected": -1.6951490640640259, + "logps/chosen": -469.2557678222656, + "logps/rejected": -511.41839599609375, + "loss": 0.4801, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.9889723658561707, + "rewards/margins": 0.733228325843811, + "rewards/rejected": -1.7222007513046265, + "step": 3520 + }, + { + "epoch": 3.7, + "learning_rate": 8.237667073422943e-08, + "logits/chosen": -1.801891565322876, + "logits/rejected": -1.6538407802581787, + "logps/chosen": -435.76861572265625, + "logps/rejected": -445.46368408203125, + "loss": 0.4818, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.061366319656372, + "rewards/margins": 0.6765199899673462, + "rewards/rejected": -1.7378864288330078, + "step": 3530 + }, + { + "epoch": 3.71, + "learning_rate": 8.113510868990626e-08, + "logits/chosen": -1.8102309703826904, + "logits/rejected": -1.761523962020874, + "logps/chosen": -455.29010009765625, + "logps/rejected": -522.002197265625, + "loss": 0.4912, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.0575816631317139, + "rewards/margins": 0.6482858061790466, + "rewards/rejected": -1.7058674097061157, + "step": 3540 + }, + { + "epoch": 3.72, + "learning_rate": 7.990115921714571e-08, + "logits/chosen": -1.7839081287384033, + "logits/rejected": -1.6682395935058594, + "logps/chosen": -429.5260314941406, + "logps/rejected": -455.45111083984375, + "loss": 0.5005, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.0066113471984863, + "rewards/margins": 0.7589043378829956, + "rewards/rejected": -1.765515685081482, + "step": 3550 + }, + { + "epoch": 3.73, + "learning_rate": 7.867487794342966e-08, + "logits/chosen": -1.7547132968902588, + "logits/rejected": -1.6351432800292969, + "logps/chosen": -486.7994079589844, + "logps/rejected": -483.152099609375, + "loss": 0.4809, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.0975134372711182, + "rewards/margins": 0.6140449643135071, + "rewards/rejected": -1.7115581035614014, + "step": 3560 + }, + { + "epoch": 3.74, + "learning_rate": 7.745632015055079e-08, + "logits/chosen": -1.7974326610565186, + "logits/rejected": -1.7494417428970337, + "logps/chosen": -408.68145751953125, + "logps/rejected": -484.828125, + "loss": 0.4701, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -0.9123633503913879, + "rewards/margins": 0.703347384929657, + "rewards/rejected": -1.6157108545303345, + "step": 3570 + }, + { + "epoch": 3.75, + "learning_rate": 7.624554077212128e-08, + "logits/chosen": -1.7972793579101562, + "logits/rejected": -1.7015396356582642, + "logps/chosen": -466.3648376464844, + "logps/rejected": -488.1949157714844, + "loss": 0.4937, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9682089686393738, + "rewards/margins": 0.7773339152336121, + "rewards/rejected": -1.7455428838729858, + "step": 3580 + }, + { + "epoch": 3.76, + "learning_rate": 7.504259439109534e-08, + "logits/chosen": -1.6706613302230835, + "logits/rejected": -1.5966769456863403, + "logps/chosen": -427.3377990722656, + "logps/rejected": -471.311279296875, + "loss": 0.5085, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.0817948579788208, + "rewards/margins": 0.7141741514205933, + "rewards/rejected": -1.795969009399414, + "step": 3590 + }, + { + "epoch": 3.77, + "learning_rate": 7.384753523730935e-08, + "logits/chosen": -1.8219425678253174, + "logits/rejected": -1.7524003982543945, + "logps/chosen": -445.67999267578125, + "logps/rejected": -502.49163818359375, + "loss": 0.4747, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9601839184761047, + "rewards/margins": 0.7520685195922852, + "rewards/rejected": -1.7122526168823242, + "step": 3600 + }, + { + "epoch": 3.77, + "eval_logits/chosen": -1.8401782512664795, + "eval_logits/rejected": -1.7268399000167847, + "eval_logps/chosen": -462.3710021972656, + "eval_logps/rejected": -479.5740661621094, + "eval_loss": 0.5767081379890442, + "eval_rewards/accuracies": 0.726190447807312, + "eval_rewards/chosen": -1.1301121711730957, + "eval_rewards/margins": 0.5866668820381165, + "eval_rewards/rejected": -1.7167788743972778, + "eval_runtime": 227.946, + "eval_samples_per_second": 8.774, + "eval_steps_per_second": 0.276, + "step": 3600 + }, + { + "epoch": 3.78, + "learning_rate": 7.266041718503671e-08, + "logits/chosen": -1.7200260162353516, + "logits/rejected": -1.6315845251083374, + "logps/chosen": -432.0589294433594, + "logps/rejected": -461.25079345703125, + "loss": 0.5043, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.9935817718505859, + "rewards/margins": 0.6516721844673157, + "rewards/rejected": -1.6452537775039673, + "step": 3610 + }, + { + "epoch": 3.79, + "learning_rate": 7.148129375055936e-08, + "logits/chosen": -1.8993425369262695, + "logits/rejected": -1.69882071018219, + "logps/chosen": -478.901123046875, + "logps/rejected": -489.272216796875, + "loss": 0.4855, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.928783118724823, + "rewards/margins": 0.854119598865509, + "rewards/rejected": -1.782902479171753, + "step": 3620 + }, + { + "epoch": 3.8, + "learning_rate": 7.031021808975518e-08, + "logits/chosen": -1.9520289897918701, + "logits/rejected": -1.785881757736206, + "logps/chosen": -485.9696350097656, + "logps/rejected": -474.34893798828125, + "loss": 0.4884, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.9730976819992065, + "rewards/margins": 0.8256509900093079, + "rewards/rejected": -1.7987486124038696, + "step": 3630 + }, + { + "epoch": 3.81, + "learning_rate": 6.914724299570127e-08, + "logits/chosen": -1.9349133968353271, + "logits/rejected": -1.8336073160171509, + "logps/chosen": -470.2247619628906, + "logps/rejected": -479.83624267578125, + "loss": 0.5086, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.0859735012054443, + "rewards/margins": 0.6873432397842407, + "rewards/rejected": -1.7733169794082642, + "step": 3640 + }, + { + "epoch": 3.82, + "learning_rate": 6.799242089629497e-08, + "logits/chosen": -1.687898874282837, + "logits/rejected": -1.6447770595550537, + "logps/chosen": -406.9367370605469, + "logps/rejected": -469.34552001953125, + "loss": 0.4775, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.0913324356079102, + "rewards/margins": 0.66700279712677, + "rewards/rejected": -1.7583353519439697, + "step": 3650 + }, + { + "epoch": 3.83, + "learning_rate": 6.684580385188917e-08, + "logits/chosen": -1.778376817703247, + "logits/rejected": -1.7570167779922485, + "logps/chosen": -451.86871337890625, + "logps/rejected": -502.09478759765625, + "loss": 0.4809, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.0968141555786133, + "rewards/margins": 0.6900449991226196, + "rewards/rejected": -1.786859154701233, + "step": 3660 + }, + { + "epoch": 3.84, + "learning_rate": 6.570744355294642e-08, + "logits/chosen": -1.856090784072876, + "logits/rejected": -1.7611362934112549, + "logps/chosen": -461.3770446777344, + "logps/rejected": -473.8399963378906, + "loss": 0.5269, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.1256920099258423, + "rewards/margins": 0.5923901796340942, + "rewards/rejected": -1.7180821895599365, + "step": 3670 + }, + { + "epoch": 3.85, + "learning_rate": 6.45773913177077e-08, + "logits/chosen": -1.7539339065551758, + "logits/rejected": -1.698042869567871, + "logps/chosen": -442.38128662109375, + "logps/rejected": -486.7842712402344, + "loss": 0.468, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.0839567184448242, + "rewards/margins": 0.7219823002815247, + "rewards/rejected": -1.805938959121704, + "step": 3680 + }, + { + "epoch": 3.86, + "learning_rate": 6.345569808988019e-08, + "logits/chosen": -1.800353765487671, + "logits/rejected": -1.6878328323364258, + "logps/chosen": -438.8720703125, + "logps/rejected": -452.9892578125, + "loss": 0.5116, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0886685848236084, + "rewards/margins": 0.643202006816864, + "rewards/rejected": -1.7318706512451172, + "step": 3690 + }, + { + "epoch": 3.87, + "learning_rate": 6.23424144363393e-08, + "logits/chosen": -1.8801469802856445, + "logits/rejected": -1.7438074350357056, + "logps/chosen": -430.7373962402344, + "logps/rejected": -452.6865234375, + "loss": 0.4895, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.0514805316925049, + "rewards/margins": 0.6827796697616577, + "rewards/rejected": -1.7342602014541626, + "step": 3700 + }, + { + "epoch": 3.87, + "eval_logits/chosen": -1.84304678440094, + "eval_logits/rejected": -1.7301536798477173, + "eval_logps/chosen": -463.2915344238281, + "eval_logps/rejected": -479.66912841796875, + "eval_loss": 0.5747166872024536, + "eval_rewards/accuracies": 0.7202380895614624, + "eval_rewards/chosen": -1.1393183469772339, + "eval_rewards/margins": 0.5784114003181458, + "eval_rewards/rejected": -1.7177296876907349, + "eval_runtime": 226.3769, + "eval_samples_per_second": 8.835, + "eval_steps_per_second": 0.278, + "step": 3700 + }, + { + "epoch": 3.88, + "learning_rate": 6.123759054485015e-08, + "logits/chosen": -1.8235986232757568, + "logits/rejected": -1.6753406524658203, + "logps/chosen": -486.02740478515625, + "logps/rejected": -501.69476318359375, + "loss": 0.4715, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.9594398736953735, + "rewards/margins": 0.8951163291931152, + "rewards/rejected": -1.8545563220977783, + "step": 3710 + }, + { + "epoch": 3.89, + "learning_rate": 6.014127622180452e-08, + "logits/chosen": -1.8062944412231445, + "logits/rejected": -1.7111324071884155, + "logps/chosen": -451.428466796875, + "logps/rejected": -489.58074951171875, + "loss": 0.4911, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.9969078302383423, + "rewards/margins": 0.6812020540237427, + "rewards/rejected": -1.6781097650527954, + "step": 3720 + }, + { + "epoch": 3.91, + "learning_rate": 5.90535208899757e-08, + "logits/chosen": -1.789720892906189, + "logits/rejected": -1.6432859897613525, + "logps/chosen": -454.414794921875, + "logps/rejected": -467.2537536621094, + "loss": 0.4889, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.1260120868682861, + "rewards/margins": 0.6561237573623657, + "rewards/rejected": -1.7821362018585205, + "step": 3730 + }, + { + "epoch": 3.92, + "learning_rate": 5.797437358629051e-08, + "logits/chosen": -1.8378006219863892, + "logits/rejected": -1.74234139919281, + "logps/chosen": -448.30377197265625, + "logps/rejected": -492.735595703125, + "loss": 0.482, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -0.9833332896232605, + "rewards/margins": 0.8310686349868774, + "rewards/rejected": -1.814401626586914, + "step": 3740 + }, + { + "epoch": 3.93, + "learning_rate": 5.6903882959618317e-08, + "logits/chosen": -1.8451528549194336, + "logits/rejected": -1.6656252145767212, + "logps/chosen": -457.91973876953125, + "logps/rejected": -448.98974609375, + "loss": 0.4921, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.9729013442993164, + "rewards/margins": 0.7378336191177368, + "rewards/rejected": -1.7107349634170532, + "step": 3750 + }, + { + "epoch": 3.94, + "learning_rate": 5.584209726857872e-08, + "logits/chosen": -1.8105888366699219, + "logits/rejected": -1.7128746509552002, + "logps/chosen": -460.19171142578125, + "logps/rejected": -504.76812744140625, + "loss": 0.4847, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.0890675783157349, + "rewards/margins": 0.790399432182312, + "rewards/rejected": -1.8794670104980469, + "step": 3760 + }, + { + "epoch": 3.95, + "learning_rate": 5.478906437936501e-08, + "logits/chosen": -1.7644094228744507, + "logits/rejected": -1.700378179550171, + "logps/chosen": -456.9208984375, + "logps/rejected": -476.3636169433594, + "loss": 0.4955, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.0362350940704346, + "rewards/margins": 0.5969945192337036, + "rewards/rejected": -1.6332294940948486, + "step": 3770 + }, + { + "epoch": 3.96, + "learning_rate": 5.374483176358696e-08, + "logits/chosen": -1.7678935527801514, + "logits/rejected": -1.7221410274505615, + "logps/chosen": -440.03759765625, + "logps/rejected": -522.8914184570312, + "loss": 0.4722, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.046316385269165, + "rewards/margins": 0.8649559020996094, + "rewards/rejected": -1.9112722873687744, + "step": 3780 + }, + { + "epoch": 3.97, + "learning_rate": 5.2709446496130685e-08, + "logits/chosen": -1.7751652002334595, + "logits/rejected": -1.7962526082992554, + "logps/chosen": -424.12384033203125, + "logps/rejected": -523.4373779296875, + "loss": 0.4674, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.9684499502182007, + "rewards/margins": 0.8860515356063843, + "rewards/rejected": -1.854501485824585, + "step": 3790 + }, + { + "epoch": 3.98, + "learning_rate": 5.1682955253036286e-08, + "logits/chosen": -1.750946283340454, + "logits/rejected": -1.570204734802246, + "logps/chosen": -469.3467712402344, + "logps/rejected": -441.6266174316406, + "loss": 0.5118, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.075725793838501, + "rewards/margins": 0.7432368397712708, + "rewards/rejected": -1.8189626932144165, + "step": 3800 + }, + { + "epoch": 3.98, + "eval_logits/chosen": -1.841734766960144, + "eval_logits/rejected": -1.7281790971755981, + "eval_logps/chosen": -464.1390075683594, + "eval_logps/rejected": -481.311767578125, + "eval_loss": 0.5743067860603333, + "eval_rewards/accuracies": 0.726190447807312, + "eval_rewards/chosen": -1.1477924585342407, + "eval_rewards/margins": 0.5863636136054993, + "eval_rewards/rejected": -1.7341560125350952, + "eval_runtime": 249.2328, + "eval_samples_per_second": 8.025, + "eval_steps_per_second": 0.253, + "step": 3800 + }, + { + "epoch": 3.99, + "learning_rate": 5.066540430939384e-08, + "logits/chosen": -1.8474823236465454, + "logits/rejected": -1.6917006969451904, + "logps/chosen": -473.05572509765625, + "logps/rejected": -487.7589416503906, + "loss": 0.4862, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.088275671005249, + "rewards/margins": 0.6751676201820374, + "rewards/rejected": -1.7634432315826416, + "step": 3810 + }, + { + "epoch": 4.0, + "learning_rate": 4.965683953725705e-08, + "logits/chosen": -1.8910309076309204, + "logits/rejected": -1.8070173263549805, + "logps/chosen": -462.40985107421875, + "logps/rejected": -494.81427001953125, + "loss": 0.5017, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.032575011253357, + "rewards/margins": 0.6556354761123657, + "rewards/rejected": -1.6882108449935913, + "step": 3820 + }, + { + "epoch": 4.01, + "learning_rate": 4.8657306403575546e-08, + "logits/chosen": -1.9338630437850952, + "logits/rejected": -1.8257001638412476, + "logps/chosen": -471.525390625, + "logps/rejected": -491.8013610839844, + "loss": 0.4745, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.9843519330024719, + "rewards/margins": 0.7315724492073059, + "rewards/rejected": -1.7159245014190674, + "step": 3830 + }, + { + "epoch": 4.02, + "learning_rate": 4.766684996814505e-08, + "logits/chosen": -1.7188827991485596, + "logits/rejected": -1.670910120010376, + "logps/chosen": -475.8091735839844, + "logps/rejected": -488.88433837890625, + "loss": 0.4848, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.1858031749725342, + "rewards/margins": 0.6162821054458618, + "rewards/rejected": -1.802085280418396, + "step": 3840 + }, + { + "epoch": 4.03, + "learning_rate": 4.6685514881576184e-08, + "logits/chosen": -1.8002866506576538, + "logits/rejected": -1.6799087524414062, + "logps/chosen": -458.14410400390625, + "logps/rejected": -469.3955078125, + "loss": 0.4868, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.0560219287872314, + "rewards/margins": 0.7603785395622253, + "rewards/rejected": -1.8164005279541016, + "step": 3850 + }, + { + "epoch": 4.04, + "learning_rate": 4.5713345383281225e-08, + "logits/chosen": -1.824496865272522, + "logits/rejected": -1.7280305624008179, + "logps/chosen": -446.33746337890625, + "logps/rejected": -480.5821838378906, + "loss": 0.4587, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.101201057434082, + "rewards/margins": 0.7129907011985779, + "rewards/rejected": -1.8141918182373047, + "step": 3860 + }, + { + "epoch": 4.05, + "learning_rate": 4.475038529948036e-08, + "logits/chosen": -1.7647641897201538, + "logits/rejected": -1.7041082382202148, + "logps/chosen": -423.53021240234375, + "logps/rejected": -502.34637451171875, + "loss": 0.4692, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.9148675799369812, + "rewards/margins": 0.9240180253982544, + "rewards/rejected": -1.8388856649398804, + "step": 3870 + }, + { + "epoch": 4.06, + "learning_rate": 4.379667804122531e-08, + "logits/chosen": -1.758404016494751, + "logits/rejected": -1.624509572982788, + "logps/chosen": -421.4674377441406, + "logps/rejected": -452.26361083984375, + "loss": 0.4571, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.049045443534851, + "rewards/margins": 0.7535545825958252, + "rewards/rejected": -1.8025999069213867, + "step": 3880 + }, + { + "epoch": 4.07, + "learning_rate": 4.285226660244273e-08, + "logits/chosen": -1.747127890586853, + "logits/rejected": -1.6422996520996094, + "logps/chosen": -437.3968200683594, + "logps/rejected": -486.3650817871094, + "loss": 0.491, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.0798274278640747, + "rewards/margins": 0.7409617900848389, + "rewards/rejected": -1.820789098739624, + "step": 3890 + }, + { + "epoch": 4.08, + "learning_rate": 4.191719355799595e-08, + "logits/chosen": -1.7357877492904663, + "logits/rejected": -1.6818689107894897, + "logps/chosen": -445.1678161621094, + "logps/rejected": -490.9187927246094, + "loss": 0.5007, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.124782919883728, + "rewards/margins": 0.6258308291435242, + "rewards/rejected": -1.750613808631897, + "step": 3900 + }, + { + "epoch": 4.08, + "eval_logits/chosen": -1.8403288125991821, + "eval_logits/rejected": -1.7268848419189453, + "eval_logps/chosen": -462.8506774902344, + "eval_logps/rejected": -480.0435791015625, + "eval_loss": 0.5753123760223389, + "eval_rewards/accuracies": 0.72817462682724, + "eval_rewards/chosen": -1.1349091529846191, + "eval_rewards/margins": 0.5865655541419983, + "eval_rewards/rejected": -1.7214747667312622, + "eval_runtime": 224.0163, + "eval_samples_per_second": 8.928, + "eval_steps_per_second": 0.281, + "step": 3900 + }, + { + "epoch": 4.09, + "learning_rate": 4.0991501061765574e-08, + "logits/chosen": -1.8390766382217407, + "logits/rejected": -1.7180551290512085, + "logps/chosen": -449.53582763671875, + "logps/rejected": -492.67864990234375, + "loss": 0.4856, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.0572012662887573, + "rewards/margins": 0.7417745590209961, + "rewards/rejected": -1.798975944519043, + "step": 3910 + }, + { + "epoch": 4.1, + "learning_rate": 4.007523084474929e-08, + "logits/chosen": -1.796332597732544, + "logits/rejected": -1.685544729232788, + "logps/chosen": -465.2630920410156, + "logps/rejected": -485.39776611328125, + "loss": 0.4619, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.0321013927459717, + "rewards/margins": 0.8061541318893433, + "rewards/rejected": -1.8382556438446045, + "step": 3920 + }, + { + "epoch": 4.11, + "learning_rate": 3.916842421318015e-08, + "logits/chosen": -1.7621917724609375, + "logits/rejected": -1.6985727548599243, + "logps/chosen": -416.25469970703125, + "logps/rejected": -488.31103515625, + "loss": 0.4732, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.0770134925842285, + "rewards/margins": 0.7757894396781921, + "rewards/rejected": -1.8528029918670654, + "step": 3930 + }, + { + "epoch": 4.13, + "learning_rate": 3.8271122046665326e-08, + "logits/chosen": -1.7745786905288696, + "logits/rejected": -1.7049392461776733, + "logps/chosen": -420.1312561035156, + "logps/rejected": -456.5462951660156, + "loss": 0.4799, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.0381975173950195, + "rewards/margins": 0.7507133483886719, + "rewards/rejected": -1.7889106273651123, + "step": 3940 + }, + { + "epoch": 4.14, + "learning_rate": 3.738336479634227e-08, + "logits/chosen": -1.744370460510254, + "logits/rejected": -1.7423713207244873, + "logps/chosen": -401.23883056640625, + "logps/rejected": -456.70623779296875, + "loss": 0.4917, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.081215262413025, + "rewards/margins": 0.6640897989273071, + "rewards/rejected": -1.745304822921753, + "step": 3950 + }, + { + "epoch": 4.15, + "learning_rate": 3.650519248305583e-08, + "logits/chosen": -1.8554394245147705, + "logits/rejected": -1.7356094121932983, + "logps/chosen": -440.2884216308594, + "logps/rejected": -496.92877197265625, + "loss": 0.4588, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.0013624429702759, + "rewards/margins": 0.8840063810348511, + "rewards/rejected": -1.8853687047958374, + "step": 3960 + }, + { + "epoch": 4.16, + "learning_rate": 3.5636644695553554e-08, + "logits/chosen": -1.8372949361801147, + "logits/rejected": -1.7880789041519165, + "logps/chosen": -425.9288635253906, + "logps/rejected": -475.31256103515625, + "loss": 0.4733, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.003899335861206, + "rewards/margins": 0.763668417930603, + "rewards/rejected": -1.7675678730010986, + "step": 3970 + }, + { + "epoch": 4.17, + "learning_rate": 3.477776058870166e-08, + "logits/chosen": -1.7965351343154907, + "logits/rejected": -1.646406888961792, + "logps/chosen": -440.19598388671875, + "logps/rejected": -478.03033447265625, + "loss": 0.4435, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.0087398290634155, + "rewards/margins": 0.8503010869026184, + "rewards/rejected": -1.8590409755706787, + "step": 3980 + }, + { + "epoch": 4.18, + "learning_rate": 3.392857888171904e-08, + "logits/chosen": -1.8338630199432373, + "logits/rejected": -1.6785008907318115, + "logps/chosen": -429.9710998535156, + "logps/rejected": -448.28094482421875, + "loss": 0.4643, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.9645228385925293, + "rewards/margins": 0.7628434300422668, + "rewards/rejected": -1.7273662090301514, + "step": 3990 + }, + { + "epoch": 4.19, + "learning_rate": 3.308913785643255e-08, + "logits/chosen": -1.7557369470596313, + "logits/rejected": -1.6469757556915283, + "logps/chosen": -422.50653076171875, + "logps/rejected": -435.71063232421875, + "loss": 0.461, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9858303070068359, + "rewards/margins": 0.7525902986526489, + "rewards/rejected": -1.7384207248687744, + "step": 4000 + }, + { + "epoch": 4.19, + "eval_logits/chosen": -1.832720398902893, + "eval_logits/rejected": -1.7189408540725708, + "eval_logps/chosen": -466.1141662597656, + "eval_logps/rejected": -483.5272521972656, + "eval_loss": 0.5745397210121155, + "eval_rewards/accuracies": 0.7222222089767456, + "eval_rewards/chosen": -1.1675440073013306, + "eval_rewards/margins": 0.5887669324874878, + "eval_rewards/rejected": -1.7563108205795288, + "eval_runtime": 225.2947, + "eval_samples_per_second": 8.877, + "eval_steps_per_second": 0.28, + "step": 4000 + }, + { + "epoch": 4.2, + "learning_rate": 3.225947535555079e-08, + "logits/chosen": -1.7829539775848389, + "logits/rejected": -1.7191102504730225, + "logps/chosen": -451.741943359375, + "logps/rejected": -505.08514404296875, + "loss": 0.4744, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.09004545211792, + "rewards/margins": 0.8111955523490906, + "rewards/rejected": -1.9012410640716553, + "step": 4010 + }, + { + "epoch": 4.21, + "learning_rate": 3.143962878095829e-08, + "logits/chosen": -1.7982536554336548, + "logits/rejected": -1.722922921180725, + "logps/chosen": -444.19189453125, + "logps/rejected": -515.3712158203125, + "loss": 0.4698, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.0294020175933838, + "rewards/margins": 0.8970456123352051, + "rewards/rejected": -1.9264476299285889, + "step": 4020 + }, + { + "epoch": 4.22, + "learning_rate": 3.0629635092029345e-08, + "logits/chosen": -1.7503137588500977, + "logits/rejected": -1.6368910074234009, + "logps/chosen": -433.9314880371094, + "logps/rejected": -452.56524658203125, + "loss": 0.4584, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -1.0288403034210205, + "rewards/margins": 0.811961829662323, + "rewards/rejected": -1.8408019542694092, + "step": 4030 + }, + { + "epoch": 4.23, + "learning_rate": 2.9829530803961665e-08, + "logits/chosen": -1.7854505777359009, + "logits/rejected": -1.679652452468872, + "logps/chosen": -434.6220703125, + "logps/rejected": -481.2295837402344, + "loss": 0.4636, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.0290977954864502, + "rewards/margins": 0.8236631155014038, + "rewards/rejected": -1.852760910987854, + "step": 4040 + }, + { + "epoch": 4.24, + "learning_rate": 2.903935198613089e-08, + "logits/chosen": -1.8103902339935303, + "logits/rejected": -1.780846357345581, + "logps/chosen": -447.97113037109375, + "logps/rejected": -498.028076171875, + "loss": 0.4595, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.0087683200836182, + "rewards/margins": 0.7507632374763489, + "rewards/rejected": -1.7595316171646118, + "step": 4050 + }, + { + "epoch": 4.25, + "learning_rate": 2.8259134260463586e-08, + "logits/chosen": -1.8016932010650635, + "logits/rejected": -1.702823281288147, + "logps/chosen": -474.5953674316406, + "logps/rejected": -477.8955993652344, + "loss": 0.4824, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.0956062078475952, + "rewards/margins": 0.7087380886077881, + "rewards/rejected": -1.8043444156646729, + "step": 4060 + }, + { + "epoch": 4.26, + "learning_rate": 2.748891279983226e-08, + "logits/chosen": -1.7880712747573853, + "logits/rejected": -1.675244927406311, + "logps/chosen": -443.755859375, + "logps/rejected": -508.98553466796875, + "loss": 0.4635, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.1239733695983887, + "rewards/margins": 0.8481414914131165, + "rewards/rejected": -1.97211492061615, + "step": 4070 + }, + { + "epoch": 4.27, + "learning_rate": 2.6728722326469167e-08, + "logits/chosen": -1.7646725177764893, + "logits/rejected": -1.669007658958435, + "logps/chosen": -416.451171875, + "logps/rejected": -482.55859375, + "loss": 0.4678, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.0863951444625854, + "rewards/margins": 0.7753079533576965, + "rewards/rejected": -1.8617031574249268, + "step": 4080 + }, + { + "epoch": 4.28, + "learning_rate": 2.5978597110401402e-08, + "logits/chosen": -1.8389514684677124, + "logits/rejected": -1.725760817527771, + "logps/chosen": -470.4855041503906, + "logps/rejected": -503.29248046875, + "loss": 0.4757, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.050333023071289, + "rewards/margins": 0.7826107740402222, + "rewards/rejected": -1.8329439163208008, + "step": 4090 + }, + { + "epoch": 4.29, + "learning_rate": 2.5238570967905492e-08, + "logits/chosen": -1.816535234451294, + "logits/rejected": -1.7674373388290405, + "logps/chosen": -427.8245544433594, + "logps/rejected": -469.5841369628906, + "loss": 0.4881, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.9843165278434753, + "rewards/margins": 0.717639148235321, + "rewards/rejected": -1.701956033706665, + "step": 4100 + }, + { + "epoch": 4.29, + "eval_logits/chosen": -1.8260232210159302, + "eval_logits/rejected": -1.712431788444519, + "eval_logps/chosen": -464.182861328125, + "eval_logps/rejected": -481.8480529785156, + "eval_loss": 0.5762295126914978, + "eval_rewards/accuracies": 0.72817462682724, + "eval_rewards/chosen": -1.1482311487197876, + "eval_rewards/margins": 0.5912875533103943, + "eval_rewards/rejected": -1.7395187616348267, + "eval_runtime": 229.9132, + "eval_samples_per_second": 8.699, + "eval_steps_per_second": 0.274, + "step": 4100 + }, + { + "epoch": 4.3, + "learning_rate": 2.4508677259983486e-08, + "logits/chosen": -1.7945934534072876, + "logits/rejected": -1.721040964126587, + "logps/chosen": -452.2828063964844, + "logps/rejected": -499.61480712890625, + "loss": 0.4607, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.0209802389144897, + "rewards/margins": 0.7915663123130798, + "rewards/rejected": -1.8125463724136353, + "step": 4110 + }, + { + "epoch": 4.31, + "learning_rate": 2.3788948890858613e-08, + "logits/chosen": -1.8254003524780273, + "logits/rejected": -1.7241672277450562, + "logps/chosen": -441.425048828125, + "logps/rejected": -490.83245849609375, + "loss": 0.4782, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.1029802560806274, + "rewards/margins": 0.6961835622787476, + "rewards/rejected": -1.799163579940796, + "step": 4120 + }, + { + "epoch": 4.32, + "learning_rate": 2.3079418306492098e-08, + "logits/chosen": -1.8279308080673218, + "logits/rejected": -1.726564645767212, + "logps/chosen": -449.76678466796875, + "logps/rejected": -478.5965881347656, + "loss": 0.4654, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.0333783626556396, + "rewards/margins": 0.800345778465271, + "rewards/rejected": -1.8337242603302002, + "step": 4130 + }, + { + "epoch": 4.33, + "learning_rate": 2.2380117493120493e-08, + "logits/chosen": -1.7413800954818726, + "logits/rejected": -1.6594340801239014, + "logps/chosen": -419.1304626464844, + "logps/rejected": -470.057373046875, + "loss": 0.4836, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -0.9636515378952026, + "rewards/margins": 0.8068215250968933, + "rewards/rejected": -1.7704731225967407, + "step": 4140 + }, + { + "epoch": 4.34, + "learning_rate": 2.1691077975813488e-08, + "logits/chosen": -1.8358606100082397, + "logits/rejected": -1.718096137046814, + "logps/chosen": -452.9070739746094, + "logps/rejected": -510.75439453125, + "loss": 0.4629, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.0342588424682617, + "rewards/margins": 0.8376301527023315, + "rewards/rejected": -1.871889352798462, + "step": 4150 + }, + { + "epoch": 4.36, + "learning_rate": 2.1012330817053142e-08, + "logits/chosen": -1.675252914428711, + "logits/rejected": -1.6835241317749023, + "logps/chosen": -434.5631408691406, + "logps/rejected": -507.29278564453125, + "loss": 0.4657, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.1027967929840088, + "rewards/margins": 0.7846490740776062, + "rewards/rejected": -1.8874458074569702, + "step": 4160 + }, + { + "epoch": 4.37, + "learning_rate": 2.0343906615333113e-08, + "logits/chosen": -1.7944949865341187, + "logits/rejected": -1.6714982986450195, + "logps/chosen": -463.69842529296875, + "logps/rejected": -492.56488037109375, + "loss": 0.4669, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.0258897542953491, + "rewards/margins": 0.7885478734970093, + "rewards/rejected": -1.8144375085830688, + "step": 4170 + }, + { + "epoch": 4.38, + "learning_rate": 1.968583550377953e-08, + "logits/chosen": -1.7210184335708618, + "logits/rejected": -1.5984843969345093, + "logps/chosen": -449.9814453125, + "logps/rejected": -471.9698181152344, + "loss": 0.4691, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -0.9973635673522949, + "rewards/margins": 0.8492447733879089, + "rewards/rejected": -1.8466084003448486, + "step": 4180 + }, + { + "epoch": 4.39, + "learning_rate": 1.903814714879251e-08, + "logits/chosen": -1.8879683017730713, + "logits/rejected": -1.7043514251708984, + "logps/chosen": -494.8345642089844, + "logps/rejected": -487.78680419921875, + "loss": 0.466, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.0199146270751953, + "rewards/margins": 0.7881089448928833, + "rewards/rejected": -1.808023452758789, + "step": 4190 + }, + { + "epoch": 4.4, + "learning_rate": 1.840087074870883e-08, + "logits/chosen": -1.8216197490692139, + "logits/rejected": -1.724962592124939, + "logps/chosen": -480.49090576171875, + "logps/rejected": -513.2200927734375, + "loss": 0.4449, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.0223886966705322, + "rewards/margins": 0.8934313654899597, + "rewards/rejected": -1.9158203601837158, + "step": 4200 + }, + { + "epoch": 4.4, + "eval_logits/chosen": -1.8251301050186157, + "eval_logits/rejected": -1.711572289466858, + "eval_logps/chosen": -466.1421203613281, + "eval_logps/rejected": -484.05059814453125, + "eval_loss": 0.5764839053153992, + "eval_rewards/accuracies": 0.7202380895614624, + "eval_rewards/chosen": -1.1678234338760376, + "eval_rewards/margins": 0.5937210321426392, + "eval_rewards/rejected": -1.7615445852279663, + "eval_runtime": 219.2125, + "eval_samples_per_second": 9.124, + "eval_steps_per_second": 0.287, + "step": 4200 + }, + { + "epoch": 4.41, + "learning_rate": 1.7774035032485367e-08, + "logits/chosen": -1.7790334224700928, + "logits/rejected": -1.6728601455688477, + "logps/chosen": -456.02197265625, + "logps/rejected": -504.24505615234375, + "loss": 0.4849, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.129804015159607, + "rewards/margins": 0.7883261442184448, + "rewards/rejected": -1.9181305170059204, + "step": 4210 + }, + { + "epoch": 4.42, + "learning_rate": 1.7157668258404312e-08, + "logits/chosen": -1.6565701961517334, + "logits/rejected": -1.6299617290496826, + "logps/chosen": -389.723876953125, + "logps/rejected": -458.76214599609375, + "loss": 0.4669, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.0441362857818604, + "rewards/margins": 0.6970081329345703, + "rewards/rejected": -1.7411444187164307, + "step": 4220 + }, + { + "epoch": 4.43, + "learning_rate": 1.6551798212799227e-08, + "logits/chosen": -1.7372820377349854, + "logits/rejected": -1.674203634262085, + "logps/chosen": -443.078125, + "logps/rejected": -481.3499450683594, + "loss": 0.4812, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.0964055061340332, + "rewards/margins": 0.6954679489135742, + "rewards/rejected": -1.791873574256897, + "step": 4230 + }, + { + "epoch": 4.44, + "learning_rate": 1.595645220880204e-08, + "logits/chosen": -1.7401357889175415, + "logits/rejected": -1.607173204421997, + "logps/chosen": -464.32763671875, + "logps/rejected": -509.621337890625, + "loss": 0.483, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.0939157009124756, + "rewards/margins": 0.7595055103302002, + "rewards/rejected": -1.8534212112426758, + "step": 4240 + }, + { + "epoch": 4.45, + "learning_rate": 1.537165708511226e-08, + "logits/chosen": -1.8759247064590454, + "logits/rejected": -1.725313425064087, + "logps/chosen": -477.0743713378906, + "logps/rejected": -488.4725646972656, + "loss": 0.4588, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.1122633218765259, + "rewards/margins": 0.7690132856369019, + "rewards/rejected": -1.8812764883041382, + "step": 4250 + }, + { + "epoch": 4.46, + "learning_rate": 1.479743920478671e-08, + "logits/chosen": -1.8185522556304932, + "logits/rejected": -1.745117425918579, + "logps/chosen": -472.5321350097656, + "logps/rejected": -490.56317138671875, + "loss": 0.4814, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.1119530200958252, + "rewards/margins": 0.6407719254493713, + "rewards/rejected": -1.7527250051498413, + "step": 4260 + }, + { + "epoch": 4.47, + "learning_rate": 1.4233824454051191e-08, + "logits/chosen": -1.7532942295074463, + "logits/rejected": -1.644690752029419, + "logps/chosen": -444.12176513671875, + "logps/rejected": -484.6136779785156, + "loss": 0.469, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0666099786758423, + "rewards/margins": 0.803938090801239, + "rewards/rejected": -1.8705480098724365, + "step": 4270 + }, + { + "epoch": 4.48, + "learning_rate": 1.3680838241133475e-08, + "logits/chosen": -1.8085733652114868, + "logits/rejected": -1.7114070653915405, + "logps/chosen": -454.65179443359375, + "logps/rejected": -483.69598388671875, + "loss": 0.463, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.9596077799797058, + "rewards/margins": 0.8614352941513062, + "rewards/rejected": -1.8210432529449463, + "step": 4280 + }, + { + "epoch": 4.49, + "learning_rate": 1.3138505495117913e-08, + "logits/chosen": -1.7711594104766846, + "logits/rejected": -1.7229337692260742, + "logps/chosen": -445.0728454589844, + "logps/rejected": -505.59564208984375, + "loss": 0.4603, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.026932954788208, + "rewards/margins": 0.8151108026504517, + "rewards/rejected": -1.8420432806015015, + "step": 4290 + }, + { + "epoch": 4.5, + "learning_rate": 1.2606850664821617e-08, + "logits/chosen": -1.8515970706939697, + "logits/rejected": -1.6959202289581299, + "logps/chosen": -457.86737060546875, + "logps/rejected": -480.14093017578125, + "loss": 0.4692, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.0556819438934326, + "rewards/margins": 0.7872077226638794, + "rewards/rejected": -1.8428895473480225, + "step": 4300 + }, + { + "epoch": 4.5, + "eval_logits/chosen": -1.8279350996017456, + "eval_logits/rejected": -1.7143094539642334, + "eval_logps/chosen": -466.46240234375, + "eval_logps/rejected": -484.0967712402344, + "eval_loss": 0.5759356021881104, + "eval_rewards/accuracies": 0.7242063283920288, + "eval_rewards/chosen": -1.171026349067688, + "eval_rewards/margins": 0.5909795761108398, + "eval_rewards/rejected": -1.7620059251785278, + "eval_runtime": 226.1955, + "eval_samples_per_second": 8.842, + "eval_steps_per_second": 0.279, + "step": 4300 + }, + { + "epoch": 4.51, + "learning_rate": 1.208589771769225e-08, + "logits/chosen": -1.7081100940704346, + "logits/rejected": -1.6469475030899048, + "logps/chosen": -421.621337890625, + "logps/rejected": -453.98956298828125, + "loss": 0.4712, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.0625663995742798, + "rewards/margins": 0.669289767742157, + "rewards/rejected": -1.731856107711792, + "step": 4310 + }, + { + "epoch": 4.52, + "learning_rate": 1.1575670138727456e-08, + "logits/chosen": -1.8913682699203491, + "logits/rejected": -1.6768405437469482, + "logps/chosen": -473.85992431640625, + "logps/rejected": -507.3983459472656, + "loss": 0.4676, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.0402823686599731, + "rewards/margins": 0.8671220541000366, + "rewards/rejected": -1.9074045419692993, + "step": 4320 + }, + { + "epoch": 4.53, + "learning_rate": 1.1076190929416418e-08, + "logits/chosen": -1.871779441833496, + "logits/rejected": -1.73202383518219, + "logps/chosen": -485.59912109375, + "logps/rejected": -490.35723876953125, + "loss": 0.4702, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.149161458015442, + "rewards/margins": 0.6439096927642822, + "rewards/rejected": -1.7930711507797241, + "step": 4330 + }, + { + "epoch": 4.54, + "learning_rate": 1.0587482606702697e-08, + "logits/chosen": -1.8452228307724, + "logits/rejected": -1.6967185735702515, + "logps/chosen": -459.0291442871094, + "logps/rejected": -490.520263671875, + "loss": 0.4512, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.0778141021728516, + "rewards/margins": 0.7190856337547302, + "rewards/rejected": -1.7969001531600952, + "step": 4340 + }, + { + "epoch": 4.55, + "learning_rate": 1.0109567201969176e-08, + "logits/chosen": -1.823883056640625, + "logits/rejected": -1.7028872966766357, + "logps/chosen": -429.84124755859375, + "logps/rejected": -477.64215087890625, + "loss": 0.4699, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.9281118512153625, + "rewards/margins": 0.8856562376022339, + "rewards/rejected": -1.8137681484222412, + "step": 4350 + }, + { + "epoch": 4.56, + "learning_rate": 9.642466260044918e-09, + "logits/chosen": -1.762036919593811, + "logits/rejected": -1.6680705547332764, + "logps/chosen": -427.251220703125, + "logps/rejected": -469.9976501464844, + "loss": 0.479, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.0374650955200195, + "rewards/margins": 0.7968862056732178, + "rewards/rejected": -1.8343513011932373, + "step": 4360 + }, + { + "epoch": 4.58, + "learning_rate": 9.186200838233904e-09, + "logits/chosen": -1.7466312646865845, + "logits/rejected": -1.6498746871948242, + "logps/chosen": -464.2078552246094, + "logps/rejected": -464.82037353515625, + "loss": 0.4989, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.1440064907073975, + "rewards/margins": 0.6234654784202576, + "rewards/rejected": -1.7674716711044312, + "step": 4370 + }, + { + "epoch": 4.59, + "learning_rate": 8.740791505365747e-09, + "logits/chosen": -1.8438808917999268, + "logits/rejected": -1.7786586284637451, + "logps/chosen": -413.7540588378906, + "logps/rejected": -479.59954833984375, + "loss": 0.4642, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -0.982395350933075, + "rewards/margins": 0.7656720876693726, + "rewards/rejected": -1.7480674982070923, + "step": 4380 + }, + { + "epoch": 4.6, + "learning_rate": 8.3062583408684e-09, + "logits/chosen": -1.7615505456924438, + "logits/rejected": -1.6486164331436157, + "logps/chosen": -465.7588806152344, + "logps/rejected": -513.684814453125, + "loss": 0.4674, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0538051128387451, + "rewards/margins": 0.9259963035583496, + "rewards/rejected": -1.9798015356063843, + "step": 4390 + }, + { + "epoch": 4.61, + "learning_rate": 7.88262093386302e-09, + "logits/chosen": -1.7515672445297241, + "logits/rejected": -1.6775462627410889, + "logps/chosen": -452.8060607910156, + "logps/rejected": -492.5433044433594, + "loss": 0.4654, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.0378320217132568, + "rewards/margins": 0.7652202844619751, + "rewards/rejected": -1.803052306175232, + "step": 4400 + }, + { + "epoch": 4.61, + "eval_logits/chosen": -1.8290122747421265, + "eval_logits/rejected": -1.7153981924057007, + "eval_logps/chosen": -466.3009338378906, + "eval_logps/rejected": -484.222412109375, + "eval_loss": 0.5759946703910828, + "eval_rewards/accuracies": 0.726190447807312, + "eval_rewards/chosen": -1.1694118976593018, + "eval_rewards/margins": 0.5938506722450256, + "eval_rewards/rejected": -1.763262391090393, + "eval_runtime": 229.4785, + "eval_samples_per_second": 8.715, + "eval_steps_per_second": 0.275, + "step": 4400 + }, + { + "epoch": 4.62, + "learning_rate": 7.469898382280765e-09, + "logits/chosen": -1.8184077739715576, + "logits/rejected": -1.722876787185669, + "logps/chosen": -464.96612548828125, + "logps/rejected": -512.9050903320312, + "loss": 0.5075, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.1251068115234375, + "rewards/margins": 0.7477315068244934, + "rewards/rejected": -1.872838020324707, + "step": 4410 + }, + { + "epoch": 4.63, + "learning_rate": 7.068109292002022e-09, + "logits/chosen": -1.8087494373321533, + "logits/rejected": -1.651476502418518, + "logps/chosen": -461.4347229003906, + "logps/rejected": -481.1415100097656, + "loss": 0.4707, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9792950749397278, + "rewards/margins": 0.8313227891921997, + "rewards/rejected": -1.8106176853179932, + "step": 4420 + }, + { + "epoch": 4.64, + "learning_rate": 6.677271776017457e-09, + "logits/chosen": -1.7190685272216797, + "logits/rejected": -1.5987236499786377, + "logps/chosen": -453.14617919921875, + "logps/rejected": -511.3948669433594, + "loss": 0.4754, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.0528547763824463, + "rewards/margins": 0.8089377284049988, + "rewards/rejected": -1.8617923259735107, + "step": 4430 + }, + { + "epoch": 4.65, + "learning_rate": 6.297403453611488e-09, + "logits/chosen": -1.753126859664917, + "logits/rejected": -1.6576951742172241, + "logps/chosen": -443.3055725097656, + "logps/rejected": -492.17633056640625, + "loss": 0.4545, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.0997533798217773, + "rewards/margins": 0.753233790397644, + "rewards/rejected": -1.852987289428711, + "step": 4440 + }, + { + "epoch": 4.66, + "learning_rate": 5.928521449568236e-09, + "logits/chosen": -1.8097671270370483, + "logits/rejected": -1.6349399089813232, + "logps/chosen": -489.1026916503906, + "logps/rejected": -497.45684814453125, + "loss": 0.4709, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.0746281147003174, + "rewards/margins": 0.84410160779953, + "rewards/rejected": -1.9187300205230713, + "step": 4450 + }, + { + "epoch": 4.67, + "learning_rate": 5.570642393399105e-09, + "logits/chosen": -1.8092399835586548, + "logits/rejected": -1.6813846826553345, + "logps/chosen": -449.77532958984375, + "logps/rejected": -478.45013427734375, + "loss": 0.4655, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.064793586730957, + "rewards/margins": 0.7874218225479126, + "rewards/rejected": -1.8522160053253174, + "step": 4460 + }, + { + "epoch": 4.68, + "learning_rate": 5.223782418593503e-09, + "logits/chosen": -1.7820489406585693, + "logits/rejected": -1.7109342813491821, + "logps/chosen": -465.9383850097656, + "logps/rejected": -522.559814453125, + "loss": 0.4838, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.1747386455535889, + "rewards/margins": 0.7057716250419617, + "rewards/rejected": -1.8805103302001953, + "step": 4470 + }, + { + "epoch": 4.69, + "learning_rate": 4.887957161891304e-09, + "logits/chosen": -1.7804561853408813, + "logits/rejected": -1.6633250713348389, + "logps/chosen": -428.78375244140625, + "logps/rejected": -464.534423828125, + "loss": 0.4671, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.0096886157989502, + "rewards/margins": 0.8078888654708862, + "rewards/rejected": -1.817577600479126, + "step": 4480 + }, + { + "epoch": 4.7, + "learning_rate": 4.5631817625780274e-09, + "logits/chosen": -1.8089252710342407, + "logits/rejected": -1.712436318397522, + "logps/chosen": -463.1101989746094, + "logps/rejected": -498.7040100097656, + "loss": 0.4795, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.0910892486572266, + "rewards/margins": 0.6699890494346619, + "rewards/rejected": -1.7610784769058228, + "step": 4490 + }, + { + "epoch": 4.71, + "learning_rate": 4.249470861802218e-09, + "logits/chosen": -1.8122116327285767, + "logits/rejected": -1.7331546545028687, + "logps/chosen": -415.1197814941406, + "logps/rejected": -455.2989196777344, + "loss": 0.4608, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.057392954826355, + "rewards/margins": 0.7255457639694214, + "rewards/rejected": -1.7829385995864868, + "step": 4500 + }, + { + "epoch": 4.71, + "eval_logits/chosen": -1.8304409980773926, + "eval_logits/rejected": -1.7171387672424316, + "eval_logps/chosen": -467.0130920410156, + "eval_logps/rejected": -484.81231689453125, + "eval_loss": 0.5753689408302307, + "eval_rewards/accuracies": 0.7202380895614624, + "eval_rewards/chosen": -1.176533579826355, + "eval_rewards/margins": 0.592628002166748, + "eval_rewards/rejected": -1.769161581993103, + "eval_runtime": 230.1385, + "eval_samples_per_second": 8.69, + "eval_steps_per_second": 0.274, + "step": 4500 + }, + { + "epoch": 4.72, + "learning_rate": 3.946838601915581e-09, + "logits/chosen": -1.7701961994171143, + "logits/rejected": -1.678413987159729, + "logps/chosen": -433.9873962402344, + "logps/rejected": -465.7084045410156, + "loss": 0.4713, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.0330111980438232, + "rewards/margins": 0.6660115718841553, + "rewards/rejected": -1.699022889137268, + "step": 4510 + }, + { + "epoch": 4.73, + "learning_rate": 3.6552986258354123e-09, + "logits/chosen": -1.7901086807250977, + "logits/rejected": -1.6710205078125, + "logps/chosen": -508.4490661621094, + "logps/rejected": -499.4988708496094, + "loss": 0.4517, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.1582683324813843, + "rewards/margins": 0.6341474652290344, + "rewards/rejected": -1.792415976524353, + "step": 4520 + }, + { + "epoch": 4.74, + "learning_rate": 3.3748640764293955e-09, + "logits/chosen": -1.7829793691635132, + "logits/rejected": -1.653700590133667, + "logps/chosen": -477.4127502441406, + "logps/rejected": -496.84991455078125, + "loss": 0.4831, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.1034595966339111, + "rewards/margins": 0.7908321619033813, + "rewards/rejected": -1.894291639328003, + "step": 4530 + }, + { + "epoch": 4.75, + "learning_rate": 3.1055475959232693e-09, + "logits/chosen": -1.761479139328003, + "logits/rejected": -1.678342580795288, + "logps/chosen": -474.9033203125, + "logps/rejected": -467.4913635253906, + "loss": 0.4562, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.0001628398895264, + "rewards/margins": 0.7919613718986511, + "rewards/rejected": -1.7921243906021118, + "step": 4540 + }, + { + "epoch": 4.76, + "learning_rate": 2.8473613253308937e-09, + "logits/chosen": -1.7882721424102783, + "logits/rejected": -1.7147448062896729, + "logps/chosen": -466.8858947753906, + "logps/rejected": -501.8744201660156, + "loss": 0.4796, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.9468268156051636, + "rewards/margins": 0.6786229014396667, + "rewards/rejected": -1.625449776649475, + "step": 4550 + }, + { + "epoch": 4.77, + "learning_rate": 2.6003169039068574e-09, + "logits/chosen": -1.8359006643295288, + "logits/rejected": -1.7158222198486328, + "logps/chosen": -440.7718200683594, + "logps/rejected": -496.41583251953125, + "loss": 0.4748, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.087088942527771, + "rewards/margins": 0.7704466581344604, + "rewards/rejected": -1.8575356006622314, + "step": 4560 + }, + { + "epoch": 4.78, + "learning_rate": 2.3644254686217837e-09, + "logits/chosen": -1.7801955938339233, + "logits/rejected": -1.6484178304672241, + "logps/chosen": -475.614013671875, + "logps/rejected": -476.01654052734375, + "loss": 0.4831, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.1018412113189697, + "rewards/margins": 0.7570894360542297, + "rewards/rejected": -1.8589305877685547, + "step": 4570 + }, + { + "epoch": 4.79, + "learning_rate": 2.139697653660316e-09, + "logits/chosen": -1.8015964031219482, + "logits/rejected": -1.6739526987075806, + "logps/chosen": -466.79052734375, + "logps/rejected": -482.6788024902344, + "loss": 0.4746, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.1629666090011597, + "rewards/margins": 0.607349157333374, + "rewards/rejected": -1.7703158855438232, + "step": 4580 + }, + { + "epoch": 4.81, + "learning_rate": 1.92614358994167e-09, + "logits/chosen": -1.8202970027923584, + "logits/rejected": -1.7315584421157837, + "logps/chosen": -479.6277770996094, + "logps/rejected": -499.9185485839844, + "loss": 0.4615, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.1074717044830322, + "rewards/margins": 0.7402085065841675, + "rewards/rejected": -1.8476800918579102, + "step": 4590 + }, + { + "epoch": 4.82, + "learning_rate": 1.7237729046629679e-09, + "logits/chosen": -1.728281021118164, + "logits/rejected": -1.6742477416992188, + "logps/chosen": -438.21844482421875, + "logps/rejected": -507.11712646484375, + "loss": 0.4661, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.1053134202957153, + "rewards/margins": 0.7916030883789062, + "rewards/rejected": -1.8969166278839111, + "step": 4600 + }, + { + "epoch": 4.82, + "eval_logits/chosen": -1.825499176979065, + "eval_logits/rejected": -1.711985468864441, + "eval_logps/chosen": -467.5480651855469, + "eval_logps/rejected": -485.3936767578125, + "eval_loss": 0.5754343271255493, + "eval_rewards/accuracies": 0.72817462682724, + "eval_rewards/chosen": -1.181883454322815, + "eval_rewards/margins": 0.5930914282798767, + "eval_rewards/rejected": -1.7749747037887573, + "eval_runtime": 232.3791, + "eval_samples_per_second": 8.607, + "eval_steps_per_second": 0.271, + "step": 4600 + }, + { + "epoch": 4.83, + "learning_rate": 1.5325947208651713e-09, + "logits/chosen": -1.7464689016342163, + "logits/rejected": -1.6714897155761719, + "logps/chosen": -427.6334533691406, + "logps/rejected": -462.86590576171875, + "loss": 0.445, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.0283820629119873, + "rewards/margins": 0.809057891368866, + "rewards/rejected": -1.8374401330947876, + "step": 4610 + }, + { + "epoch": 4.84, + "learning_rate": 1.352617657021854e-09, + "logits/chosen": -1.7710460424423218, + "logits/rejected": -1.703453779220581, + "logps/chosen": -393.85345458984375, + "logps/rejected": -438.5416564941406, + "loss": 0.4722, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0014649629592896, + "rewards/margins": 0.7734811902046204, + "rewards/rejected": -1.7749459743499756, + "step": 4620 + }, + { + "epoch": 4.85, + "learning_rate": 1.1838498266507069e-09, + "logits/chosen": -1.8539412021636963, + "logits/rejected": -1.7409346103668213, + "logps/chosen": -438.21563720703125, + "logps/rejected": -478.8936462402344, + "loss": 0.4596, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.0243479013442993, + "rewards/margins": 0.816145122051239, + "rewards/rejected": -1.8404929637908936, + "step": 4630 + }, + { + "epoch": 4.86, + "learning_rate": 1.0262988379476922e-09, + "logits/chosen": -1.7646992206573486, + "logits/rejected": -1.6506078243255615, + "logps/chosen": -478.5440979003906, + "logps/rejected": -482.2001953125, + "loss": 0.4836, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.151123285293579, + "rewards/margins": 0.6975986957550049, + "rewards/rejected": -1.8487218618392944, + "step": 4640 + }, + { + "epoch": 4.87, + "learning_rate": 8.79971793444123e-10, + "logits/chosen": -1.7563340663909912, + "logits/rejected": -1.6405454874038696, + "logps/chosen": -439.6841735839844, + "logps/rejected": -495.43829345703125, + "loss": 0.4669, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.0974491834640503, + "rewards/margins": 0.8005310893058777, + "rewards/rejected": -1.8979803323745728, + "step": 4650 + }, + { + "epoch": 4.88, + "learning_rate": 7.448752896864197e-10, + "logits/chosen": -1.8486369848251343, + "logits/rejected": -1.6208820343017578, + "logps/chosen": -469.00390625, + "logps/rejected": -463.940673828125, + "loss": 0.502, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.085375189781189, + "rewards/margins": 0.7727323770523071, + "rewards/rejected": -1.858107328414917, + "step": 4660 + }, + { + "epoch": 4.89, + "learning_rate": 6.210154169388193e-10, + "logits/chosen": -1.7519057989120483, + "logits/rejected": -1.6661628484725952, + "logps/chosen": -452.79205322265625, + "logps/rejected": -476.6766662597656, + "loss": 0.4611, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.1015011072158813, + "rewards/margins": 0.7438825964927673, + "rewards/rejected": -1.845383644104004, + "step": 4670 + }, + { + "epoch": 4.9, + "learning_rate": 5.083977589086796e-10, + "logits/chosen": -1.882615327835083, + "logits/rejected": -1.7311588525772095, + "logps/chosen": -480.04827880859375, + "logps/rejected": -506.5596618652344, + "loss": 0.4507, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -0.9937393069267273, + "rewards/margins": 0.8901308178901672, + "rewards/rejected": -1.8838701248168945, + "step": 4680 + }, + { + "epoch": 4.91, + "learning_rate": 4.070273924949574e-10, + "logits/chosen": -1.789244294166565, + "logits/rejected": -1.7519454956054688, + "logps/chosen": -469.8148498535156, + "logps/rejected": -508.5157165527344, + "loss": 0.4728, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -1.0590931177139282, + "rewards/margins": 0.7643530964851379, + "rewards/rejected": -1.823446273803711, + "step": 4690 + }, + { + "epoch": 4.92, + "learning_rate": 3.169088875591419e-10, + "logits/chosen": -1.7920825481414795, + "logits/rejected": -1.705712080001831, + "logps/chosen": -448.9642639160156, + "logps/rejected": -477.53350830078125, + "loss": 0.4859, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.0943710803985596, + "rewards/margins": 0.7004526257514954, + "rewards/rejected": -1.7948236465454102, + "step": 4700 + }, + { + "epoch": 4.92, + "eval_logits/chosen": -1.8237359523773193, + "eval_logits/rejected": -1.710079550743103, + "eval_logps/chosen": -467.6951904296875, + "eval_logps/rejected": -485.5031433105469, + "eval_loss": 0.5756003856658936, + "eval_rewards/accuracies": 0.7202380895614624, + "eval_rewards/chosen": -1.1833546161651611, + "eval_rewards/margins": 0.5927155017852783, + "eval_rewards/rejected": -1.7760698795318604, + "eval_runtime": 225.1994, + "eval_samples_per_second": 8.881, + "eval_steps_per_second": 0.28, + "step": 4700 + }, + { + "epoch": 4.93, + "learning_rate": 2.380463067193361e-10, + "logits/chosen": -1.744341492652893, + "logits/rejected": -1.6670739650726318, + "logps/chosen": -422.7705993652344, + "logps/rejected": -458.7456970214844, + "loss": 0.4661, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -0.988696277141571, + "rewards/margins": 0.7636533975601196, + "rewards/rejected": -1.7523494958877563, + "step": 4710 + }, + { + "epoch": 4.94, + "learning_rate": 1.7044320516718113e-10, + "logits/chosen": -1.7861382961273193, + "logits/rejected": -1.728690505027771, + "logps/chosen": -434.71026611328125, + "logps/rejected": -507.8138732910156, + "loss": 0.4709, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.0964405536651611, + "rewards/margins": 0.8338233232498169, + "rewards/rejected": -1.930263876914978, + "step": 4720 + }, + { + "epoch": 4.95, + "learning_rate": 1.1410263050737335e-10, + "logits/chosen": -1.765300989151001, + "logits/rejected": -1.6384683847427368, + "logps/chosen": -451.39990234375, + "logps/rejected": -479.0953674316406, + "loss": 0.4637, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.0694044828414917, + "rewards/margins": 0.893588662147522, + "rewards/rejected": -1.9629930257797241, + "step": 4730 + }, + { + "epoch": 4.96, + "learning_rate": 6.902712262055188e-11, + "logits/chosen": -1.751755714416504, + "logits/rejected": -1.6271288394927979, + "logps/chosen": -445.4202575683594, + "logps/rejected": -468.89678955078125, + "loss": 0.4951, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.0948576927185059, + "rewards/margins": 0.7223270535469055, + "rewards/rejected": -1.8171848058700562, + "step": 4740 + }, + { + "epoch": 4.97, + "learning_rate": 3.52187135485571e-11, + "logits/chosen": -1.7895715236663818, + "logits/rejected": -1.596680998802185, + "logps/chosen": -461.5955505371094, + "logps/rejected": -480.7381896972656, + "loss": 0.4858, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0684163570404053, + "rewards/margins": 0.6338636875152588, + "rewards/rejected": -1.702280044555664, + "step": 4750 + }, + { + "epoch": 4.98, + "learning_rate": 1.2678927402948181e-11, + "logits/chosen": -1.756028175354004, + "logits/rejected": -1.7099990844726562, + "logps/chosen": -447.0310974121094, + "logps/rejected": -500.3916931152344, + "loss": 0.4766, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.0929630994796753, + "rewards/margins": 0.802161693572998, + "rewards/rejected": -1.8951247930526733, + "step": 4760 + }, + { + "epoch": 4.99, + "learning_rate": 1.408780296280332e-12, + "logits/chosen": -1.8381448984146118, + "logits/rejected": -1.7302604913711548, + "logps/chosen": -483.72064208984375, + "logps/rejected": -538.6234130859375, + "loss": 0.4449, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.0196287631988525, + "rewards/margins": 0.8810880780220032, + "rewards/rejected": -1.9007165431976318, + "step": 4770 + }, + { + "epoch": 5.0, + "step": 4775, + "total_flos": 0.0, + "train_loss": 0.15748632995245967, + "train_runtime": 23969.9997, + "train_samples_per_second": 12.752, + "train_steps_per_second": 0.199 + } + ], + "logging_steps": 10, + "max_steps": 4775, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 100, + "total_flos": 0.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}