{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9998707732069783, "eval_steps": 100, "global_step": 5803, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 0.2412109375, "learning_rate": 8.605851979345955e-09, "logits/chosen": -3.5356550216674805, "logits/rejected": -3.5272138118743896, "logps/chosen": -54.58121871948242, "logps/rejected": -48.71324920654297, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0, "grad_norm": 0.271484375, "learning_rate": 8.605851979345955e-08, "logits/chosen": -3.501009464263916, "logits/rejected": -3.495290756225586, "logps/chosen": -61.66798400878906, "logps/rejected": -57.93266296386719, "loss": 0.693, "rewards/accuracies": 0.4861111044883728, "rewards/chosen": 0.00027778727235272527, "rewards/margins": 0.0003447102790232748, "rewards/rejected": -6.692303577437997e-05, "step": 10 }, { "epoch": 0.0, "grad_norm": 0.2490234375, "learning_rate": 1.721170395869191e-07, "logits/chosen": -3.5296554565429688, "logits/rejected": -3.5278468132019043, "logps/chosen": -63.88775634765625, "logps/rejected": -61.84159469604492, "loss": 0.693, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.00010944288078462705, "rewards/margins": 0.00027477304683998227, "rewards/rejected": -0.0001653301587793976, "step": 20 }, { "epoch": 0.01, "grad_norm": 0.302734375, "learning_rate": 2.5817555938037866e-07, "logits/chosen": -3.5192017555236816, "logits/rejected": -3.5138943195343018, "logps/chosen": -67.57090759277344, "logps/rejected": -66.92351531982422, "loss": 0.6932, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 4.100705336895771e-05, "rewards/margins": -0.0001098484281101264, "rewards/rejected": 0.00015085548511706293, "step": 30 }, { "epoch": 0.01, "grad_norm": 0.2392578125, "learning_rate": 3.442340791738382e-07, "logits/chosen": -3.5046753883361816, "logits/rejected": -3.498481273651123, "logps/chosen": -66.16209411621094, "logps/rejected": -58.65666580200195, "loss": 0.6932, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.00033376910141669214, "rewards/margins": -0.00016887595120351762, "rewards/rejected": 0.0005026450380682945, "step": 40 }, { "epoch": 0.01, "grad_norm": 0.25390625, "learning_rate": 4.302925989672978e-07, "logits/chosen": -3.509906053543091, "logits/rejected": -3.5057880878448486, "logps/chosen": -62.557472229003906, "logps/rejected": -58.73368453979492, "loss": 0.6932, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": 0.0005178075516596437, "rewards/margins": -5.5213680752785876e-05, "rewards/rejected": 0.000573021243326366, "step": 50 }, { "epoch": 0.01, "grad_norm": 0.302734375, "learning_rate": 5.163511187607573e-07, "logits/chosen": -3.5339508056640625, "logits/rejected": -3.5271873474121094, "logps/chosen": -67.87001037597656, "logps/rejected": -62.6706428527832, "loss": 0.693, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.0009180423803627491, "rewards/margins": 0.0002732599969021976, "rewards/rejected": 0.0006447824416682124, "step": 60 }, { "epoch": 0.01, "grad_norm": 0.2578125, "learning_rate": 6.024096385542169e-07, "logits/chosen": -3.5073513984680176, "logits/rejected": -3.502321243286133, "logps/chosen": -64.41812133789062, "logps/rejected": -61.06683349609375, "loss": 0.693, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.0016494952142238617, "rewards/margins": 0.0002577772247605026, "rewards/rejected": 0.0013917179312556982, "step": 70 }, { "epoch": 0.01, "grad_norm": 0.2734375, "learning_rate": 6.884681583476764e-07, "logits/chosen": -3.543259859085083, "logits/rejected": -3.535987138748169, "logps/chosen": -69.9263687133789, "logps/rejected": -62.202117919921875, "loss": 0.6928, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.002187983598560095, "rewards/margins": 0.0006142753991298378, "rewards/rejected": 0.001573708257637918, "step": 80 }, { "epoch": 0.02, "grad_norm": 0.26171875, "learning_rate": 7.745266781411361e-07, "logits/chosen": -3.5097403526306152, "logits/rejected": -3.505981922149658, "logps/chosen": -64.8249282836914, "logps/rejected": -60.57047653198242, "loss": 0.693, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.002674184273928404, "rewards/margins": 0.00034979888005182147, "rewards/rejected": 0.002324385568499565, "step": 90 }, { "epoch": 0.02, "grad_norm": 0.263671875, "learning_rate": 8.605851979345956e-07, "logits/chosen": -3.5349838733673096, "logits/rejected": -3.5218091011047363, "logps/chosen": -66.39881134033203, "logps/rejected": -57.2912712097168, "loss": 0.6927, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.0036652986891567707, "rewards/margins": 0.0009702268871478736, "rewards/rejected": 0.0026950715109705925, "step": 100 }, { "epoch": 0.02, "eval_logits/chosen": -3.4983110427856445, "eval_logits/rejected": -3.496581554412842, "eval_logps/chosen": -70.91754913330078, "eval_logps/rejected": -74.6415786743164, "eval_loss": 0.6930133700370789, "eval_rewards/accuracies": 0.5204461216926575, "eval_rewards/chosen": 0.004924696870148182, "eval_rewards/margins": 0.0002700270852074027, "eval_rewards/rejected": 0.004654669668525457, "eval_runtime": 486.5221, "eval_samples_per_second": 8.846, "eval_steps_per_second": 1.106, "step": 100 }, { "epoch": 0.02, "grad_norm": 0.291015625, "learning_rate": 9.466437177280551e-07, "logits/chosen": -3.5417017936706543, "logits/rejected": -3.537182331085205, "logps/chosen": -66.14574432373047, "logps/rejected": -60.83588790893555, "loss": 0.693, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.004560687579214573, "rewards/margins": 0.0003698725195135921, "rewards/rejected": 0.004190815147012472, "step": 110 }, { "epoch": 0.02, "grad_norm": 0.287109375, "learning_rate": 1.0327022375215146e-06, "logits/chosen": -3.5192337036132812, "logits/rejected": -3.5158679485321045, "logps/chosen": -62.41202926635742, "logps/rejected": -63.18855667114258, "loss": 0.6929, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.004526413977146149, "rewards/margins": 0.0005844075931236148, "rewards/rejected": 0.003942006267607212, "step": 120 }, { "epoch": 0.02, "grad_norm": 0.251953125, "learning_rate": 1.1187607573149743e-06, "logits/chosen": -3.538072109222412, "logits/rejected": -3.535295009613037, "logps/chosen": -66.57141876220703, "logps/rejected": -61.85688018798828, "loss": 0.6929, "rewards/accuracies": 0.53125, "rewards/chosen": 0.0056819794699549675, "rewards/margins": 0.0005479130195453763, "rewards/rejected": 0.005134066101163626, "step": 130 }, { "epoch": 0.02, "grad_norm": 0.27734375, "learning_rate": 1.2048192771084338e-06, "logits/chosen": -3.5338687896728516, "logits/rejected": -3.5286478996276855, "logps/chosen": -63.46132278442383, "logps/rejected": -60.513450622558594, "loss": 0.6927, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.006024450063705444, "rewards/margins": 0.0008360937936231494, "rewards/rejected": 0.005188356153666973, "step": 140 }, { "epoch": 0.03, "grad_norm": 0.271484375, "learning_rate": 1.2908777969018935e-06, "logits/chosen": -3.507450819015503, "logits/rejected": -3.5040555000305176, "logps/chosen": -62.18046951293945, "logps/rejected": -59.2562141418457, "loss": 0.6928, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.006412480026483536, "rewards/margins": 0.0007751357043161988, "rewards/rejected": 0.005637344904243946, "step": 150 }, { "epoch": 0.03, "grad_norm": 0.2431640625, "learning_rate": 1.3769363166953528e-06, "logits/chosen": -3.505376100540161, "logits/rejected": -3.5017974376678467, "logps/chosen": -63.6754150390625, "logps/rejected": -62.3272590637207, "loss": 0.6927, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.007500818930566311, "rewards/margins": 0.000878830614965409, "rewards/rejected": 0.006621988955885172, "step": 160 }, { "epoch": 0.03, "grad_norm": 0.2734375, "learning_rate": 1.4629948364888125e-06, "logits/chosen": -3.510266065597534, "logits/rejected": -3.506471633911133, "logps/chosen": -64.6450424194336, "logps/rejected": -58.39984130859375, "loss": 0.6926, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.008529067039489746, "rewards/margins": 0.0011345654493197799, "rewards/rejected": 0.007394501473754644, "step": 170 }, { "epoch": 0.03, "grad_norm": 0.29296875, "learning_rate": 1.5490533562822722e-06, "logits/chosen": -3.538785219192505, "logits/rejected": -3.533876895904541, "logps/chosen": -64.92842864990234, "logps/rejected": -59.96929168701172, "loss": 0.6927, "rewards/accuracies": 0.5625, "rewards/chosen": 0.00772461574524641, "rewards/margins": 0.0008454096387140453, "rewards/rejected": 0.0068792058154940605, "step": 180 }, { "epoch": 0.03, "grad_norm": 0.306640625, "learning_rate": 1.6351118760757316e-06, "logits/chosen": -3.518183946609497, "logits/rejected": -3.5080676078796387, "logps/chosen": -68.33369445800781, "logps/rejected": -59.007720947265625, "loss": 0.6917, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.010243936441838741, "rewards/margins": 0.0029686293564736843, "rewards/rejected": 0.007275307085365057, "step": 190 }, { "epoch": 0.03, "grad_norm": 0.28125, "learning_rate": 1.7211703958691911e-06, "logits/chosen": -3.5061545372009277, "logits/rejected": -3.5032687187194824, "logps/chosen": -63.0229606628418, "logps/rejected": -61.38782501220703, "loss": 0.692, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.01057803351432085, "rewards/margins": 0.00223658699542284, "rewards/rejected": 0.008341444656252861, "step": 200 }, { "epoch": 0.03, "eval_logits/chosen": -3.494154214859009, "eval_logits/rejected": -3.492429733276367, "eval_logps/chosen": -69.94579315185547, "eval_logps/rejected": -73.75847625732422, "eval_loss": 0.6925765872001648, "eval_rewards/accuracies": 0.5615706443786621, "eval_rewards/chosen": 0.014642315916717052, "eval_rewards/margins": 0.001156591926701367, "eval_rewards/rejected": 0.013485724106431007, "eval_runtime": 484.1845, "eval_samples_per_second": 8.889, "eval_steps_per_second": 1.111, "step": 200 }, { "epoch": 0.04, "grad_norm": 0.259765625, "learning_rate": 1.8072289156626508e-06, "logits/chosen": -3.5086097717285156, "logits/rejected": -3.5070700645446777, "logps/chosen": -62.868133544921875, "logps/rejected": -63.98527908325195, "loss": 0.6919, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.011296862736344337, "rewards/margins": 0.0025715038646012545, "rewards/rejected": 0.008725358173251152, "step": 210 }, { "epoch": 0.04, "grad_norm": 0.2578125, "learning_rate": 1.8932874354561103e-06, "logits/chosen": -3.5019657611846924, "logits/rejected": -3.4947357177734375, "logps/chosen": -62.300025939941406, "logps/rejected": -58.14155960083008, "loss": 0.6911, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.013878649100661278, "rewards/margins": 0.0040546623058617115, "rewards/rejected": 0.009823987260460854, "step": 220 }, { "epoch": 0.04, "grad_norm": 0.26953125, "learning_rate": 1.9793459552495696e-06, "logits/chosen": -3.5150229930877686, "logits/rejected": -3.511476993560791, "logps/chosen": -57.71666717529297, "logps/rejected": -56.743553161621094, "loss": 0.6918, "rewards/accuracies": 0.625, "rewards/chosen": 0.014196398667991161, "rewards/margins": 0.002722408389672637, "rewards/rejected": 0.01147399004548788, "step": 230 }, { "epoch": 0.04, "grad_norm": 0.255859375, "learning_rate": 2.0654044750430293e-06, "logits/chosen": -3.5202598571777344, "logits/rejected": -3.5103707313537598, "logps/chosen": -64.93174743652344, "logps/rejected": -59.1458854675293, "loss": 0.6908, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.015858512371778488, "rewards/margins": 0.004820539150387049, "rewards/rejected": 0.011037970893085003, "step": 240 }, { "epoch": 0.04, "grad_norm": 0.26953125, "learning_rate": 2.151462994836489e-06, "logits/chosen": -3.5057454109191895, "logits/rejected": -3.502401828765869, "logps/chosen": -61.890663146972656, "logps/rejected": -57.9714469909668, "loss": 0.6914, "rewards/accuracies": 0.65625, "rewards/chosen": 0.01803828775882721, "rewards/margins": 0.003597863484174013, "rewards/rejected": 0.014440424740314484, "step": 250 }, { "epoch": 0.04, "grad_norm": 0.271484375, "learning_rate": 2.2375215146299486e-06, "logits/chosen": -3.509375810623169, "logits/rejected": -3.5082786083221436, "logps/chosen": -63.0054931640625, "logps/rejected": -62.8714485168457, "loss": 0.6892, "rewards/accuracies": 0.6875, "rewards/chosen": 0.019104022532701492, "rewards/margins": 0.007894990965723991, "rewards/rejected": 0.011209032498300076, "step": 260 }, { "epoch": 0.05, "grad_norm": 0.267578125, "learning_rate": 2.323580034423408e-06, "logits/chosen": -3.5024642944335938, "logits/rejected": -3.49824595451355, "logps/chosen": -60.08552169799805, "logps/rejected": -59.7121467590332, "loss": 0.6901, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.020494792610406876, "rewards/margins": 0.00626087561249733, "rewards/rejected": 0.014233916997909546, "step": 270 }, { "epoch": 0.05, "grad_norm": 0.283203125, "learning_rate": 2.4096385542168676e-06, "logits/chosen": -3.5296006202697754, "logits/rejected": -3.522291660308838, "logps/chosen": -66.11088562011719, "logps/rejected": -59.79486846923828, "loss": 0.6893, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.022822776809334755, "rewards/margins": 0.007838909514248371, "rewards/rejected": 0.014983865432441235, "step": 280 }, { "epoch": 0.05, "grad_norm": 0.287109375, "learning_rate": 2.4956970740103273e-06, "logits/chosen": -3.5092430114746094, "logits/rejected": -3.5068252086639404, "logps/chosen": -64.64472198486328, "logps/rejected": -60.8430290222168, "loss": 0.6903, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.021010177209973335, "rewards/margins": 0.005890936590731144, "rewards/rejected": 0.015119239687919617, "step": 290 }, { "epoch": 0.05, "grad_norm": 0.283203125, "learning_rate": 2.581755593803787e-06, "logits/chosen": -3.508896589279175, "logits/rejected": -3.5066845417022705, "logps/chosen": -62.75114059448242, "logps/rejected": -60.66050338745117, "loss": 0.6887, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.02356800064444542, "rewards/margins": 0.009082594886422157, "rewards/rejected": 0.014485405758023262, "step": 300 }, { "epoch": 0.05, "eval_logits/chosen": -3.4875731468200684, "eval_logits/rejected": -3.485839605331421, "eval_logps/chosen": -67.90235900878906, "eval_logps/rejected": -72.03018951416016, "eval_loss": 0.6910621523857117, "eval_rewards/accuracies": 0.5731877088546753, "eval_rewards/chosen": 0.03507662191987038, "eval_rewards/margins": 0.004308138974010944, "eval_rewards/rejected": 0.030768483877182007, "eval_runtime": 484.3093, "eval_samples_per_second": 8.887, "eval_steps_per_second": 1.111, "step": 300 }, { "epoch": 0.05, "grad_norm": 0.29296875, "learning_rate": 2.6678141135972463e-06, "logits/chosen": -3.5070648193359375, "logits/rejected": -3.506229877471924, "logps/chosen": -62.83290481567383, "logps/rejected": -59.43952178955078, "loss": 0.69, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.024709565564990044, "rewards/margins": 0.006442278623580933, "rewards/rejected": 0.01826728694140911, "step": 310 }, { "epoch": 0.06, "grad_norm": 0.279296875, "learning_rate": 2.7538726333907055e-06, "logits/chosen": -3.497121810913086, "logits/rejected": -3.4916274547576904, "logps/chosen": -64.09577178955078, "logps/rejected": -56.424560546875, "loss": 0.6883, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.02018563821911812, "rewards/margins": 0.009932487271726131, "rewards/rejected": 0.010253149084746838, "step": 320 }, { "epoch": 0.06, "grad_norm": 0.279296875, "learning_rate": 2.8399311531841657e-06, "logits/chosen": -3.5062835216522217, "logits/rejected": -3.502513885498047, "logps/chosen": -63.776573181152344, "logps/rejected": -58.834251403808594, "loss": 0.6875, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.01910565234720707, "rewards/margins": 0.011445741169154644, "rewards/rejected": 0.007659909315407276, "step": 330 }, { "epoch": 0.06, "grad_norm": 0.30859375, "learning_rate": 2.925989672977625e-06, "logits/chosen": -3.513662815093994, "logits/rejected": -3.508847713470459, "logps/chosen": -60.776634216308594, "logps/rejected": -60.560829162597656, "loss": 0.6876, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.018180642277002335, "rewards/margins": 0.011402291245758533, "rewards/rejected": 0.006778349168598652, "step": 340 }, { "epoch": 0.06, "grad_norm": 0.310546875, "learning_rate": 3.012048192771085e-06, "logits/chosen": -3.485520124435425, "logits/rejected": -3.477074384689331, "logps/chosen": -65.42162322998047, "logps/rejected": -62.7513427734375, "loss": 0.6885, "rewards/accuracies": 0.625, "rewards/chosen": 0.014861424453556538, "rewards/margins": 0.009721105918288231, "rewards/rejected": 0.005140319466590881, "step": 350 }, { "epoch": 0.06, "grad_norm": 0.310546875, "learning_rate": 3.0981067125645443e-06, "logits/chosen": -3.527310609817505, "logits/rejected": -3.523397445678711, "logps/chosen": -64.94847106933594, "logps/rejected": -58.3104133605957, "loss": 0.6882, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.006313213612884283, "rewards/margins": 0.010341762565076351, "rewards/rejected": -0.004028548486530781, "step": 360 }, { "epoch": 0.06, "grad_norm": 0.314453125, "learning_rate": 3.1841652323580036e-06, "logits/chosen": -3.5157413482666016, "logits/rejected": -3.510713577270508, "logps/chosen": -64.20075988769531, "logps/rejected": -61.081268310546875, "loss": 0.6867, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.0031775743700563908, "rewards/margins": 0.013444487936794758, "rewards/rejected": -0.010266912169754505, "step": 370 }, { "epoch": 0.07, "grad_norm": 0.298828125, "learning_rate": 3.2702237521514633e-06, "logits/chosen": -3.523954391479492, "logits/rejected": -3.519592761993408, "logps/chosen": -64.20637512207031, "logps/rejected": -60.44745635986328, "loss": 0.6863, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.004102800972759724, "rewards/margins": 0.014114337041974068, "rewards/rejected": -0.010011536069214344, "step": 380 }, { "epoch": 0.07, "grad_norm": 0.3359375, "learning_rate": 3.356282271944923e-06, "logits/chosen": -3.509253978729248, "logits/rejected": -3.5043952465057373, "logps/chosen": -66.97331237792969, "logps/rejected": -63.58121871948242, "loss": 0.6855, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.0037429791409522295, "rewards/margins": 0.015767943114042282, "rewards/rejected": -0.012024962343275547, "step": 390 }, { "epoch": 0.07, "grad_norm": 0.296875, "learning_rate": 3.4423407917383822e-06, "logits/chosen": -3.508552074432373, "logits/rejected": -3.5039405822753906, "logps/chosen": -64.53360748291016, "logps/rejected": -61.95587921142578, "loss": 0.6865, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.0035368031822144985, "rewards/margins": 0.013852817006409168, "rewards/rejected": -0.01738962158560753, "step": 400 }, { "epoch": 0.07, "eval_logits/chosen": -3.4804558753967285, "eval_logits/rejected": -3.4786477088928223, "eval_logps/chosen": -69.7676773071289, "eval_logps/rejected": -74.33695220947266, "eval_loss": 0.6890220046043396, "eval_rewards/accuracies": 0.5608736276626587, "eval_rewards/chosen": 0.016423281282186508, "eval_rewards/margins": 0.008722393773496151, "eval_rewards/rejected": 0.0077008879743516445, "eval_runtime": 483.8566, "eval_samples_per_second": 8.895, "eval_steps_per_second": 1.112, "step": 400 }, { "epoch": 0.07, "grad_norm": 0.306640625, "learning_rate": 3.528399311531842e-06, "logits/chosen": -3.5053210258483887, "logits/rejected": -3.5049126148223877, "logps/chosen": -61.0108528137207, "logps/rejected": -65.10823059082031, "loss": 0.6871, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.007881749421358109, "rewards/margins": 0.012780706398189068, "rewards/rejected": -0.0206624586135149, "step": 410 }, { "epoch": 0.07, "grad_norm": 0.369140625, "learning_rate": 3.6144578313253016e-06, "logits/chosen": -3.4826858043670654, "logits/rejected": -3.4816393852233887, "logps/chosen": -66.45621490478516, "logps/rejected": -64.80440521240234, "loss": 0.6903, "rewards/accuracies": 0.5625, "rewards/chosen": -0.015979185700416565, "rewards/margins": 0.006417684257030487, "rewards/rejected": -0.022396868094801903, "step": 420 }, { "epoch": 0.07, "grad_norm": 0.3359375, "learning_rate": 3.700516351118761e-06, "logits/chosen": -3.514251232147217, "logits/rejected": -3.5107944011688232, "logps/chosen": -65.9752197265625, "logps/rejected": -64.03770446777344, "loss": 0.6865, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.006704515311866999, "rewards/margins": 0.013841964304447174, "rewards/rejected": -0.020546479150652885, "step": 430 }, { "epoch": 0.08, "grad_norm": 0.37109375, "learning_rate": 3.7865748709122206e-06, "logits/chosen": -3.5058677196502686, "logits/rejected": -3.4982612133026123, "logps/chosen": -65.30903625488281, "logps/rejected": -63.014625549316406, "loss": 0.6843, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.007113118655979633, "rewards/margins": 0.01851661317050457, "rewards/rejected": -0.025629732757806778, "step": 440 }, { "epoch": 0.08, "grad_norm": 0.32421875, "learning_rate": 3.87263339070568e-06, "logits/chosen": -3.4888668060302734, "logits/rejected": -3.481332302093506, "logps/chosen": -66.73682403564453, "logps/rejected": -66.31524658203125, "loss": 0.6814, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 1.1589378118515015e-05, "rewards/margins": 0.02444417215883732, "rewards/rejected": -0.024432582780718803, "step": 450 }, { "epoch": 0.08, "grad_norm": 0.34375, "learning_rate": 3.958691910499139e-06, "logits/chosen": -3.480597734451294, "logits/rejected": -3.476097822189331, "logps/chosen": -68.15984344482422, "logps/rejected": -63.6702995300293, "loss": 0.6876, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.017680566757917404, "rewards/margins": 0.012018715031445026, "rewards/rejected": -0.029699280858039856, "step": 460 }, { "epoch": 0.08, "grad_norm": 0.34765625, "learning_rate": 4.0447504302926e-06, "logits/chosen": -3.4964325428009033, "logits/rejected": -3.489607334136963, "logps/chosen": -65.46156311035156, "logps/rejected": -62.06793212890625, "loss": 0.6826, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.01024774368852377, "rewards/margins": 0.021988026797771454, "rewards/rejected": -0.0322357676923275, "step": 470 }, { "epoch": 0.08, "grad_norm": 0.384765625, "learning_rate": 4.1308089500860585e-06, "logits/chosen": -3.4784443378448486, "logits/rejected": -3.478658676147461, "logps/chosen": -65.70050048828125, "logps/rejected": -70.04431915283203, "loss": 0.6837, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.015509704127907753, "rewards/margins": 0.02020612731575966, "rewards/rejected": -0.035715825855731964, "step": 480 }, { "epoch": 0.08, "grad_norm": 0.392578125, "learning_rate": 4.216867469879519e-06, "logits/chosen": -3.472611665725708, "logits/rejected": -3.4615890979766846, "logps/chosen": -71.42842864990234, "logps/rejected": -62.09379959106445, "loss": 0.6781, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.0040070959366858006, "rewards/margins": 0.03170691058039665, "rewards/rejected": -0.035714007914066315, "step": 490 }, { "epoch": 0.09, "grad_norm": 0.392578125, "learning_rate": 4.302925989672978e-06, "logits/chosen": -3.4761765003204346, "logits/rejected": -3.4696431159973145, "logps/chosen": -67.80470275878906, "logps/rejected": -63.649932861328125, "loss": 0.6864, "rewards/accuracies": 0.625, "rewards/chosen": -0.01582060381770134, "rewards/margins": 0.014767659828066826, "rewards/rejected": -0.030588263645768166, "step": 500 }, { "epoch": 0.09, "eval_logits/chosen": -3.4679765701293945, "eval_logits/rejected": -3.4661855697631836, "eval_logps/chosen": -69.05375671386719, "eval_logps/rejected": -74.21285247802734, "eval_loss": 0.6863834261894226, "eval_rewards/accuracies": 0.5755111575126648, "eval_rewards/chosen": 0.023562604561448097, "eval_rewards/margins": 0.014620588161051273, "eval_rewards/rejected": 0.00894201546907425, "eval_runtime": 483.8442, "eval_samples_per_second": 8.895, "eval_steps_per_second": 1.112, "step": 500 }, { "epoch": 0.09, "grad_norm": 0.42578125, "learning_rate": 4.388984509466438e-06, "logits/chosen": -3.4684576988220215, "logits/rejected": -3.462451219558716, "logps/chosen": -71.22663879394531, "logps/rejected": -65.5745849609375, "loss": 0.6856, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.02633030153810978, "rewards/margins": 0.01643994078040123, "rewards/rejected": -0.04277024418115616, "step": 510 }, { "epoch": 0.09, "grad_norm": 0.59375, "learning_rate": 4.475043029259897e-06, "logits/chosen": -3.477713108062744, "logits/rejected": -3.4715778827667236, "logps/chosen": -72.2740707397461, "logps/rejected": -67.93409729003906, "loss": 0.683, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.05916525050997734, "rewards/margins": 0.022036103531718254, "rewards/rejected": -0.08120135962963104, "step": 520 }, { "epoch": 0.09, "grad_norm": 0.41015625, "learning_rate": 4.561101549053357e-06, "logits/chosen": -3.4684062004089355, "logits/rejected": -3.4601237773895264, "logps/chosen": -71.05815124511719, "logps/rejected": -66.8134765625, "loss": 0.6744, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.04559968411922455, "rewards/margins": 0.03998229280114174, "rewards/rejected": -0.08558198064565659, "step": 530 }, { "epoch": 0.09, "grad_norm": 0.435546875, "learning_rate": 4.647160068846816e-06, "logits/chosen": -3.4680933952331543, "logits/rejected": -3.4654452800750732, "logps/chosen": -70.15602111816406, "logps/rejected": -68.97514343261719, "loss": 0.6822, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.059289127588272095, "rewards/margins": 0.024244192987680435, "rewards/rejected": -0.08353332430124283, "step": 540 }, { "epoch": 0.09, "grad_norm": 0.46875, "learning_rate": 4.7332185886402755e-06, "logits/chosen": -3.4790825843811035, "logits/rejected": -3.47590708732605, "logps/chosen": -68.46205139160156, "logps/rejected": -67.3900146484375, "loss": 0.6761, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.0445096381008625, "rewards/margins": 0.037113942205905914, "rewards/rejected": -0.08162357658147812, "step": 550 }, { "epoch": 0.1, "grad_norm": 0.55859375, "learning_rate": 4.819277108433735e-06, "logits/chosen": -3.48732328414917, "logits/rejected": -3.486783504486084, "logps/chosen": -67.97535705566406, "logps/rejected": -71.35958099365234, "loss": 0.675, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.045551758259534836, "rewards/margins": 0.03953787311911583, "rewards/rejected": -0.08508963137865067, "step": 560 }, { "epoch": 0.1, "grad_norm": 0.462890625, "learning_rate": 4.905335628227195e-06, "logits/chosen": -3.473559617996216, "logits/rejected": -3.471503734588623, "logps/chosen": -67.1822738647461, "logps/rejected": -70.67951965332031, "loss": 0.6794, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.052295297384262085, "rewards/margins": 0.030304264277219772, "rewards/rejected": -0.08259955793619156, "step": 570 }, { "epoch": 0.1, "grad_norm": 0.5078125, "learning_rate": 4.991394148020655e-06, "logits/chosen": -3.4739394187927246, "logits/rejected": -3.4727649688720703, "logps/chosen": -65.24308776855469, "logps/rejected": -66.8325424194336, "loss": 0.6808, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.04294530302286148, "rewards/margins": 0.027125859633088112, "rewards/rejected": -0.07007116079330444, "step": 580 }, { "epoch": 0.1, "grad_norm": 0.62890625, "learning_rate": 4.999963354556567e-06, "logits/chosen": -3.4689114093780518, "logits/rejected": -3.463846206665039, "logps/chosen": -65.95047760009766, "logps/rejected": -68.07025146484375, "loss": 0.6766, "rewards/accuracies": 0.65625, "rewards/chosen": -0.011568538844585419, "rewards/margins": 0.03579581156373024, "rewards/rejected": -0.04736434668302536, "step": 590 }, { "epoch": 0.1, "grad_norm": 0.54296875, "learning_rate": 4.9998366803288885e-06, "logits/chosen": -3.459731340408325, "logits/rejected": -3.4565346240997314, "logps/chosen": -67.85731506347656, "logps/rejected": -70.11259460449219, "loss": 0.6731, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.019781148061156273, "rewards/margins": 0.04344576969742775, "rewards/rejected": -0.06322692334651947, "step": 600 }, { "epoch": 0.1, "eval_logits/chosen": -3.451451063156128, "eval_logits/rejected": -3.44972562789917, "eval_logps/chosen": -71.21887969970703, "eval_logps/rejected": -77.00116729736328, "eval_loss": 0.6837956309318542, "eval_rewards/accuracies": 0.5871282815933228, "eval_rewards/chosen": 0.001911371131427586, "eval_rewards/margins": 0.020852578803896904, "eval_rewards/rejected": -0.018941204994916916, "eval_runtime": 483.7869, "eval_samples_per_second": 8.896, "eval_steps_per_second": 1.112, "step": 600 }, { "epoch": 0.11, "grad_norm": 0.5859375, "learning_rate": 4.9996195294877135e-06, "logits/chosen": -3.468444347381592, "logits/rejected": -3.4676971435546875, "logps/chosen": -72.33124542236328, "logps/rejected": -73.11021423339844, "loss": 0.677, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.07435192912817001, "rewards/margins": 0.03560345247387886, "rewards/rejected": -0.10995538532733917, "step": 610 }, { "epoch": 0.11, "grad_norm": 0.68359375, "learning_rate": 4.999311909892384e-06, "logits/chosen": -3.4697864055633545, "logits/rejected": -3.4647693634033203, "logps/chosen": -77.31246948242188, "logps/rejected": -79.62358856201172, "loss": 0.6599, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.11068302392959595, "rewards/margins": 0.07288263738155365, "rewards/rejected": -0.1835656464099884, "step": 620 }, { "epoch": 0.11, "grad_norm": 0.6171875, "learning_rate": 4.998913832676579e-06, "logits/chosen": -3.458186626434326, "logits/rejected": -3.454568862915039, "logps/chosen": -78.59623718261719, "logps/rejected": -76.2980728149414, "loss": 0.6742, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.13098138570785522, "rewards/margins": 0.04336429387331009, "rewards/rejected": -0.1743456870317459, "step": 630 }, { "epoch": 0.11, "grad_norm": 1.0390625, "learning_rate": 4.998425312247913e-06, "logits/chosen": -3.4712257385253906, "logits/rejected": -3.468402862548828, "logps/chosen": -77.81741333007812, "logps/rejected": -81.34028625488281, "loss": 0.6765, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.14999021589756012, "rewards/margins": 0.04017460346221924, "rewards/rejected": -0.19016483426094055, "step": 640 }, { "epoch": 0.11, "grad_norm": 0.58203125, "learning_rate": 4.997846366287408e-06, "logits/chosen": -3.491013288497925, "logits/rejected": -3.4919509887695312, "logps/chosen": -79.38850402832031, "logps/rejected": -79.69107818603516, "loss": 0.6961, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.16921451687812805, "rewards/margins": -0.0013396050781011581, "rewards/rejected": -0.16787490248680115, "step": 650 }, { "epoch": 0.11, "grad_norm": 0.58203125, "learning_rate": 4.997177015748862e-06, "logits/chosen": -3.45072603225708, "logits/rejected": -3.449314594268799, "logps/chosen": -75.71781158447266, "logps/rejected": -75.43241119384766, "loss": 0.6732, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.0993271917104721, "rewards/margins": 0.04374626651406288, "rewards/rejected": -0.14307346940040588, "step": 660 }, { "epoch": 0.12, "grad_norm": 0.51953125, "learning_rate": 4.996417284858085e-06, "logits/chosen": -3.4625446796417236, "logits/rejected": -3.4615135192871094, "logps/chosen": -71.90211486816406, "logps/rejected": -77.84752655029297, "loss": 0.6755, "rewards/accuracies": 0.625, "rewards/chosen": -0.0817331001162529, "rewards/margins": 0.04009716957807541, "rewards/rejected": -0.12183026969432831, "step": 670 }, { "epoch": 0.12, "grad_norm": 0.53125, "learning_rate": 4.995567201112025e-06, "logits/chosen": -3.4504151344299316, "logits/rejected": -3.446338653564453, "logps/chosen": -73.89292907714844, "logps/rejected": -71.89225769042969, "loss": 0.6741, "rewards/accuracies": 0.6875, "rewards/chosen": -0.09309293329715729, "rewards/margins": 0.041833892464637756, "rewards/rejected": -0.13492682576179504, "step": 680 }, { "epoch": 0.12, "grad_norm": 0.671875, "learning_rate": 4.994626795277772e-06, "logits/chosen": -3.4752120971679688, "logits/rejected": -3.4689478874206543, "logps/chosen": -81.83485412597656, "logps/rejected": -76.503662109375, "loss": 0.6672, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.11247005313634872, "rewards/margins": 0.057780712842941284, "rewards/rejected": -0.1702507734298706, "step": 690 }, { "epoch": 0.12, "grad_norm": 0.61328125, "learning_rate": 4.993596101391443e-06, "logits/chosen": -3.471196413040161, "logits/rejected": -3.4650673866271973, "logps/chosen": -82.47694396972656, "logps/rejected": -81.34616088867188, "loss": 0.6749, "rewards/accuracies": 0.59375, "rewards/chosen": -0.16253015398979187, "rewards/margins": 0.04237184301018715, "rewards/rejected": -0.2049020230770111, "step": 700 }, { "epoch": 0.12, "eval_logits/chosen": -3.448903799057007, "eval_logits/rejected": -3.44696044921875, "eval_logps/chosen": -78.99446868896484, "eval_logps/rejected": -86.01776885986328, "eval_loss": 0.6787543296813965, "eval_rewards/accuracies": 0.5980483293533325, "eval_rewards/chosen": -0.07584448158740997, "eval_rewards/margins": 0.03326273709535599, "eval_rewards/rejected": -0.10910722613334656, "eval_runtime": 484.0821, "eval_samples_per_second": 8.891, "eval_steps_per_second": 1.111, "step": 700 }, { "epoch": 0.12, "grad_norm": 0.73828125, "learning_rate": 4.992475156756952e-06, "logits/chosen": -3.4566562175750732, "logits/rejected": -3.45149302482605, "logps/chosen": -78.71736907958984, "logps/rejected": -82.33731842041016, "loss": 0.6672, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.1411767303943634, "rewards/margins": 0.05822443962097168, "rewards/rejected": -0.19940117001533508, "step": 710 }, { "epoch": 0.12, "grad_norm": 0.6953125, "learning_rate": 4.991264001944659e-06, "logits/chosen": -3.448073148727417, "logits/rejected": -3.4477546215057373, "logps/chosen": -79.56318664550781, "logps/rejected": -84.22515869140625, "loss": 0.6664, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.15367797017097473, "rewards/margins": 0.06053520366549492, "rewards/rejected": -0.21421320736408234, "step": 720 }, { "epoch": 0.13, "grad_norm": 0.9921875, "learning_rate": 4.989962680789901e-06, "logits/chosen": -3.4756839275360107, "logits/rejected": -3.471583843231201, "logps/chosen": -90.78223419189453, "logps/rejected": -92.80326843261719, "loss": 0.6587, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.23998799920082092, "rewards/margins": 0.0788407102227211, "rewards/rejected": -0.3188287019729614, "step": 730 }, { "epoch": 0.13, "grad_norm": 0.90234375, "learning_rate": 4.9885712403914095e-06, "logits/chosen": -3.446706771850586, "logits/rejected": -3.4412574768066406, "logps/chosen": -96.09211730957031, "logps/rejected": -100.0416030883789, "loss": 0.6612, "rewards/accuracies": 0.65625, "rewards/chosen": -0.30381378531455994, "rewards/margins": 0.073874332010746, "rewards/rejected": -0.3776881694793701, "step": 740 }, { "epoch": 0.13, "grad_norm": 0.796875, "learning_rate": 4.9870897311096e-06, "logits/chosen": -3.4710693359375, "logits/rejected": -3.4651427268981934, "logps/chosen": -96.75785827636719, "logps/rejected": -99.54371643066406, "loss": 0.6585, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.29698044061660767, "rewards/margins": 0.08013220131397247, "rewards/rejected": -0.37711262702941895, "step": 750 }, { "epoch": 0.13, "grad_norm": 0.74609375, "learning_rate": 4.985518206564751e-06, "logits/chosen": -3.434743881225586, "logits/rejected": -3.4291248321533203, "logps/chosen": -82.78807067871094, "logps/rejected": -78.91937255859375, "loss": 0.6731, "rewards/accuracies": 0.5625, "rewards/chosen": -0.17310987412929535, "rewards/margins": 0.04944610223174095, "rewards/rejected": -0.2225559651851654, "step": 760 }, { "epoch": 0.13, "grad_norm": 0.69140625, "learning_rate": 4.983856723635067e-06, "logits/chosen": -3.4525153636932373, "logits/rejected": -3.448737621307373, "logps/chosen": -74.29286193847656, "logps/rejected": -76.18905639648438, "loss": 0.6674, "rewards/accuracies": 0.625, "rewards/chosen": -0.10165517032146454, "rewards/margins": 0.05906829237937927, "rewards/rejected": -0.1607234627008438, "step": 770 }, { "epoch": 0.13, "grad_norm": 0.7109375, "learning_rate": 4.982105342454616e-06, "logits/chosen": -3.4456591606140137, "logits/rejected": -3.439436674118042, "logps/chosen": -78.19596862792969, "logps/rejected": -80.53340148925781, "loss": 0.6611, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.13748806715011597, "rewards/margins": 0.07130144536495209, "rewards/rejected": -0.20878951251506805, "step": 780 }, { "epoch": 0.14, "grad_norm": 0.74609375, "learning_rate": 4.980264126411153e-06, "logits/chosen": -3.4281296730041504, "logits/rejected": -3.4237301349639893, "logps/chosen": -79.74320983886719, "logps/rejected": -83.33930969238281, "loss": 0.6664, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.149915412068367, "rewards/margins": 0.06214505434036255, "rewards/rejected": -0.21206045150756836, "step": 790 }, { "epoch": 0.14, "grad_norm": 0.92578125, "learning_rate": 4.97833314214383e-06, "logits/chosen": -3.4373092651367188, "logits/rejected": -3.433082103729248, "logps/chosen": -87.97721862792969, "logps/rejected": -93.013916015625, "loss": 0.6678, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.2326813042163849, "rewards/margins": 0.06341059505939484, "rewards/rejected": -0.29609188437461853, "step": 800 }, { "epoch": 0.14, "eval_logits/chosen": -3.4187793731689453, "eval_logits/rejected": -3.4168734550476074, "eval_logps/chosen": -89.99909210205078, "eval_logps/rejected": -98.40325927734375, "eval_loss": 0.674051821231842, "eval_rewards/accuracies": 0.5906133651733398, "eval_rewards/chosen": -0.18589067459106445, "eval_rewards/margins": 0.047071486711502075, "eval_rewards/rejected": -0.23296219110488892, "eval_runtime": 483.9258, "eval_samples_per_second": 8.894, "eval_steps_per_second": 1.112, "step": 800 }, { "epoch": 0.14, "grad_norm": 0.90625, "learning_rate": 4.9763124595407785e-06, "logits/chosen": -3.43872332572937, "logits/rejected": -3.4330341815948486, "logps/chosen": -96.375, "logps/rejected": -98.60665130615234, "loss": 0.669, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.29701000452041626, "rewards/margins": 0.06087196618318558, "rewards/rejected": -0.35788196325302124, "step": 810 }, { "epoch": 0.14, "grad_norm": 0.7109375, "learning_rate": 4.974202151736584e-06, "logits/chosen": -3.4395699501037598, "logits/rejected": -3.4343771934509277, "logps/chosen": -87.37809753417969, "logps/rejected": -90.93356323242188, "loss": 0.6619, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.24337145686149597, "rewards/margins": 0.07460545003414154, "rewards/rejected": -0.3179769217967987, "step": 820 }, { "epoch": 0.14, "grad_norm": 0.91796875, "learning_rate": 4.972002295109638e-06, "logits/chosen": -3.4160866737365723, "logits/rejected": -3.4143104553222656, "logps/chosen": -84.53849029541016, "logps/rejected": -88.76664733886719, "loss": 0.6578, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.19176757335662842, "rewards/margins": 0.08151111006736755, "rewards/rejected": -0.2732786536216736, "step": 830 }, { "epoch": 0.14, "grad_norm": 0.91015625, "learning_rate": 4.969712969279372e-06, "logits/chosen": -3.4232819080352783, "logits/rejected": -3.4152915477752686, "logps/chosen": -84.25233459472656, "logps/rejected": -86.18933868408203, "loss": 0.654, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.18940319120883942, "rewards/margins": 0.08970735222101212, "rewards/rejected": -0.27911052107810974, "step": 840 }, { "epoch": 0.15, "grad_norm": 0.85546875, "learning_rate": 4.967334257103379e-06, "logits/chosen": -3.4104301929473877, "logits/rejected": -3.410254716873169, "logps/chosen": -87.65679168701172, "logps/rejected": -96.04679107666016, "loss": 0.668, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.2629455029964447, "rewards/margins": 0.06459694355726242, "rewards/rejected": -0.32754242420196533, "step": 850 }, { "epoch": 0.15, "grad_norm": 1.515625, "learning_rate": 4.9648662446744115e-06, "logits/chosen": -3.4208884239196777, "logits/rejected": -3.414494276046753, "logps/chosen": -86.74958801269531, "logps/rejected": -92.23248291015625, "loss": 0.6539, "rewards/accuracies": 0.65625, "rewards/chosen": -0.22999759018421173, "rewards/margins": 0.0975608378648758, "rewards/rejected": -0.3275584280490875, "step": 860 }, { "epoch": 0.15, "grad_norm": 1.0078125, "learning_rate": 4.962309021317268e-06, "logits/chosen": -3.4057350158691406, "logits/rejected": -3.4048256874084473, "logps/chosen": -82.92234802246094, "logps/rejected": -92.20657348632812, "loss": 0.6534, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.21385452151298523, "rewards/margins": 0.09490419924259186, "rewards/rejected": -0.3087587058544159, "step": 870 }, { "epoch": 0.15, "grad_norm": 1.0078125, "learning_rate": 4.959662679585559e-06, "logits/chosen": -3.40657114982605, "logits/rejected": -3.402423143386841, "logps/chosen": -90.1636734008789, "logps/rejected": -95.72185516357422, "loss": 0.6505, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.27124643325805664, "rewards/margins": 0.10422557592391968, "rewards/rejected": -0.3754720091819763, "step": 880 }, { "epoch": 0.15, "grad_norm": 0.87890625, "learning_rate": 4.956927315258356e-06, "logits/chosen": -3.389841079711914, "logits/rejected": -3.3837730884552, "logps/chosen": -96.669189453125, "logps/rejected": -94.84233093261719, "loss": 0.6747, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.2692159414291382, "rewards/margins": 0.05370805412530899, "rewards/rejected": -0.3229239881038666, "step": 890 }, { "epoch": 0.16, "grad_norm": 1.0703125, "learning_rate": 4.9541030273367276e-06, "logits/chosen": -3.388092041015625, "logits/rejected": -3.3876006603240967, "logps/chosen": -92.34650421142578, "logps/rejected": -94.53489685058594, "loss": 0.6655, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.2790505290031433, "rewards/margins": 0.069684699177742, "rewards/rejected": -0.3487352430820465, "step": 900 }, { "epoch": 0.16, "eval_logits/chosen": -3.3825888633728027, "eval_logits/rejected": -3.381103038787842, "eval_logps/chosen": -89.96687316894531, "eval_logps/rejected": -99.21875762939453, "eval_loss": 0.6709262728691101, "eval_rewards/accuracies": 0.5927044749259949, "eval_rewards/chosen": -0.18556852638721466, "eval_rewards/margins": 0.05554860830307007, "eval_rewards/rejected": -0.24111711978912354, "eval_runtime": 483.9133, "eval_samples_per_second": 8.894, "eval_steps_per_second": 1.112, "step": 900 }, { "epoch": 0.16, "grad_norm": 0.921875, "learning_rate": 4.951189918040154e-06, "logits/chosen": -3.394538164138794, "logits/rejected": -3.390890598297119, "logps/chosen": -91.48938751220703, "logps/rejected": -99.90650939941406, "loss": 0.6586, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.29850026965141296, "rewards/margins": 0.09216472506523132, "rewards/rejected": -0.3906649947166443, "step": 910 }, { "epoch": 0.16, "grad_norm": 0.9765625, "learning_rate": 4.948188092802828e-06, "logits/chosen": -3.384577512741089, "logits/rejected": -3.377805709838867, "logps/chosen": -93.5625991821289, "logps/rejected": -91.18093872070312, "loss": 0.6696, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.2843739688396454, "rewards/margins": 0.06233012676239014, "rewards/rejected": -0.3467040956020355, "step": 920 }, { "epoch": 0.16, "grad_norm": 1.0234375, "learning_rate": 4.94509766026984e-06, "logits/chosen": -3.388658046722412, "logits/rejected": -3.384310245513916, "logps/chosen": -85.4650650024414, "logps/rejected": -93.18614196777344, "loss": 0.6555, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.22382517158985138, "rewards/margins": 0.09300287067890167, "rewards/rejected": -0.31682807207107544, "step": 930 }, { "epoch": 0.16, "grad_norm": 0.96875, "learning_rate": 4.941918732293246e-06, "logits/chosen": -3.3858253955841064, "logits/rejected": -3.3792338371276855, "logps/chosen": -99.0291519165039, "logps/rejected": -99.58036804199219, "loss": 0.6596, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.27680402994155884, "rewards/margins": 0.08166074752807617, "rewards/rejected": -0.358464777469635, "step": 940 }, { "epoch": 0.16, "grad_norm": 0.95703125, "learning_rate": 4.9386514239280156e-06, "logits/chosen": -3.3473052978515625, "logits/rejected": -3.345482587814331, "logps/chosen": -103.10880279541016, "logps/rejected": -108.80924224853516, "loss": 0.6652, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.3806834816932678, "rewards/margins": 0.08083571493625641, "rewards/rejected": -0.46151915192604065, "step": 950 }, { "epoch": 0.17, "grad_norm": 0.87890625, "learning_rate": 4.935295853427875e-06, "logits/chosen": -3.3498833179473877, "logits/rejected": -3.351750612258911, "logps/chosen": -92.35511016845703, "logps/rejected": -101.44236755371094, "loss": 0.6671, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.3139442503452301, "rewards/margins": 0.06891517341136932, "rewards/rejected": -0.3828594386577606, "step": 960 }, { "epoch": 0.17, "grad_norm": 0.95703125, "learning_rate": 4.9318521422410186e-06, "logits/chosen": -3.361420154571533, "logits/rejected": -3.3568592071533203, "logps/chosen": -99.36900329589844, "logps/rejected": -93.79570770263672, "loss": 0.674, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.2903686761856079, "rewards/margins": 0.054736148566007614, "rewards/rejected": -0.345104843378067, "step": 970 }, { "epoch": 0.17, "grad_norm": 0.89453125, "learning_rate": 4.928320415005718e-06, "logits/chosen": -3.3936476707458496, "logits/rejected": -3.3898472785949707, "logps/chosen": -89.65267944335938, "logps/rejected": -94.29356384277344, "loss": 0.6536, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.22109732031822205, "rewards/margins": 0.09921900928020477, "rewards/rejected": -0.320316344499588, "step": 980 }, { "epoch": 0.17, "grad_norm": 1.015625, "learning_rate": 4.924700799545815e-06, "logits/chosen": -3.3750221729278564, "logits/rejected": -3.3710360527038574, "logps/chosen": -92.3640365600586, "logps/rejected": -98.47993469238281, "loss": 0.6459, "rewards/accuracies": 0.6875, "rewards/chosen": -0.280429869890213, "rewards/margins": 0.11348159611225128, "rewards/rejected": -0.3939114511013031, "step": 990 }, { "epoch": 0.17, "grad_norm": 1.21875, "learning_rate": 4.920993426866085e-06, "logits/chosen": -3.356358766555786, "logits/rejected": -3.352238893508911, "logps/chosen": -115.8489990234375, "logps/rejected": -114.68751525878906, "loss": 0.6695, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.4800940155982971, "rewards/margins": 0.0706138014793396, "rewards/rejected": -0.5507077574729919, "step": 1000 }, { "epoch": 0.17, "eval_logits/chosen": -3.3611390590667725, "eval_logits/rejected": -3.3595175743103027, "eval_logps/chosen": -110.34318542480469, "eval_logps/rejected": -120.9452896118164, "eval_loss": 0.6685853600502014, "eval_rewards/accuracies": 0.5945631861686707, "eval_rewards/chosen": -0.389331579208374, "eval_rewards/margins": 0.06905096769332886, "eval_rewards/rejected": -0.4583825469017029, "eval_runtime": 484.0568, "eval_samples_per_second": 8.892, "eval_steps_per_second": 1.111, "step": 1000 }, { "epoch": 0.17, "grad_norm": 1.46875, "learning_rate": 4.917198431147504e-06, "logits/chosen": -3.3470757007598877, "logits/rejected": -3.346139907836914, "logps/chosen": -114.94053649902344, "logps/rejected": -134.77088928222656, "loss": 0.6232, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.515256404876709, "rewards/margins": 0.18923744559288025, "rewards/rejected": -0.7044938206672668, "step": 1010 }, { "epoch": 0.18, "grad_norm": 1.265625, "learning_rate": 4.91331594974239e-06, "logits/chosen": -3.3773505687713623, "logits/rejected": -3.371786594390869, "logps/chosen": -123.88951110839844, "logps/rejected": -130.7519989013672, "loss": 0.6408, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.558532178401947, "rewards/margins": 0.13218751549720764, "rewards/rejected": -0.6907196640968323, "step": 1020 }, { "epoch": 0.18, "grad_norm": 1.2109375, "learning_rate": 4.90934612316943e-06, "logits/chosen": -3.36560320854187, "logits/rejected": -3.3610668182373047, "logps/chosen": -111.2602310180664, "logps/rejected": -115.28700256347656, "loss": 0.66, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.4598492980003357, "rewards/margins": 0.09703020006418228, "rewards/rejected": -0.5568795204162598, "step": 1030 }, { "epoch": 0.18, "grad_norm": 1.3359375, "learning_rate": 4.905289095108597e-06, "logits/chosen": -3.3711113929748535, "logits/rejected": -3.368939161300659, "logps/chosen": -109.08182525634766, "logps/rejected": -115.71439361572266, "loss": 0.6811, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.4638773500919342, "rewards/margins": 0.054792650043964386, "rewards/rejected": -0.5186699628829956, "step": 1040 }, { "epoch": 0.18, "grad_norm": 1.3515625, "learning_rate": 4.901145012395945e-06, "logits/chosen": -3.3484110832214355, "logits/rejected": -3.3432037830352783, "logps/chosen": -109.2693862915039, "logps/rejected": -112.49226379394531, "loss": 0.6478, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.4130178987979889, "rewards/margins": 0.12246304750442505, "rewards/rejected": -0.5354809165000916, "step": 1050 }, { "epoch": 0.18, "grad_norm": 1.4765625, "learning_rate": 4.8969140250183036e-06, "logits/chosen": -3.355018138885498, "logits/rejected": -3.3515992164611816, "logps/chosen": -109.40594482421875, "logps/rejected": -111.96232604980469, "loss": 0.6666, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.41715121269226074, "rewards/margins": 0.07401702553033829, "rewards/rejected": -0.49116820096969604, "step": 1060 }, { "epoch": 0.18, "grad_norm": 1.125, "learning_rate": 4.892596286107838e-06, "logits/chosen": -3.389537811279297, "logits/rejected": -3.3849761486053467, "logps/chosen": -107.3383560180664, "logps/rejected": -107.23735046386719, "loss": 0.6731, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.3981268107891083, "rewards/margins": 0.07017555832862854, "rewards/rejected": -0.4683023989200592, "step": 1070 }, { "epoch": 0.19, "grad_norm": 0.8203125, "learning_rate": 4.888191951936516e-06, "logits/chosen": -3.3715012073516846, "logits/rejected": -3.3688080310821533, "logps/chosen": -103.20147705078125, "logps/rejected": -103.82188415527344, "loss": 0.6623, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.36823979020118713, "rewards/margins": 0.08313159644603729, "rewards/rejected": -0.45137137174606323, "step": 1080 }, { "epoch": 0.19, "grad_norm": 1.296875, "learning_rate": 4.883701181910447e-06, "logits/chosen": -3.3489112854003906, "logits/rejected": -3.3472843170166016, "logps/chosen": -99.51116943359375, "logps/rejected": -107.47447204589844, "loss": 0.6554, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.3695264458656311, "rewards/margins": 0.09836713969707489, "rewards/rejected": -0.4678936004638672, "step": 1090 }, { "epoch": 0.19, "grad_norm": 1.078125, "learning_rate": 4.879124138564116e-06, "logits/chosen": -3.355736494064331, "logits/rejected": -3.3550803661346436, "logps/chosen": -97.91027069091797, "logps/rejected": -106.47267150878906, "loss": 0.6648, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.35570019483566284, "rewards/margins": 0.08377765119075775, "rewards/rejected": -0.4394778311252594, "step": 1100 }, { "epoch": 0.19, "eval_logits/chosen": -3.3453195095062256, "eval_logits/rejected": -3.343949556350708, "eval_logps/chosen": -92.19027709960938, "eval_logps/rejected": -101.81735229492188, "eval_loss": 0.670196533203125, "eval_rewards/accuracies": 0.5975836515426636, "eval_rewards/chosen": -0.2078026980161667, "eval_rewards/margins": 0.05930037051439285, "eval_rewards/rejected": -0.26710304617881775, "eval_runtime": 483.818, "eval_samples_per_second": 8.896, "eval_steps_per_second": 1.112, "step": 1100 }, { "epoch": 0.19, "grad_norm": 1.234375, "learning_rate": 4.874460987554495e-06, "logits/chosen": -3.35750150680542, "logits/rejected": -3.3596253395080566, "logps/chosen": -93.9940185546875, "logps/rejected": -104.469482421875, "loss": 0.6495, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.30589941143989563, "rewards/margins": 0.10967297852039337, "rewards/rejected": -0.4155723452568054, "step": 1110 }, { "epoch": 0.19, "grad_norm": 1.0703125, "learning_rate": 4.869711897655058e-06, "logits/chosen": -3.3663763999938965, "logits/rejected": -3.3627076148986816, "logps/chosen": -95.74103546142578, "logps/rejected": -99.71363830566406, "loss": 0.6558, "rewards/accuracies": 0.65625, "rewards/chosen": -0.2901385426521301, "rewards/margins": 0.09493489563465118, "rewards/rejected": -0.3850734233856201, "step": 1120 }, { "epoch": 0.19, "grad_norm": 1.078125, "learning_rate": 4.864877040749659e-06, "logits/chosen": -3.3478622436523438, "logits/rejected": -3.3438973426818848, "logps/chosen": -94.52452087402344, "logps/rejected": -106.47977447509766, "loss": 0.6393, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.3232131004333496, "rewards/margins": 0.13161292672157288, "rewards/rejected": -0.4548260271549225, "step": 1130 }, { "epoch": 0.2, "grad_norm": 1.21875, "learning_rate": 4.859956591826323e-06, "logits/chosen": -3.353027820587158, "logits/rejected": -3.3485264778137207, "logps/chosen": -110.5881118774414, "logps/rejected": -114.19659423828125, "loss": 0.6553, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.43052440881729126, "rewards/margins": 0.10749445110559464, "rewards/rejected": -0.5380188226699829, "step": 1140 }, { "epoch": 0.2, "grad_norm": 1.3125, "learning_rate": 4.854950728970905e-06, "logits/chosen": -3.328634738922119, "logits/rejected": -3.324906826019287, "logps/chosen": -109.4224853515625, "logps/rejected": -121.63139343261719, "loss": 0.6376, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.4534403681755066, "rewards/margins": 0.15592564642429352, "rewards/rejected": -0.6093659400939941, "step": 1150 }, { "epoch": 0.2, "grad_norm": 1.46875, "learning_rate": 4.849859633360649e-06, "logits/chosen": -3.3405518531799316, "logits/rejected": -3.3392815589904785, "logps/chosen": -109.72029876708984, "logps/rejected": -123.24666595458984, "loss": 0.626, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.4358472228050232, "rewards/margins": 0.17100933194160461, "rewards/rejected": -0.6068565249443054, "step": 1160 }, { "epoch": 0.2, "grad_norm": 1.484375, "learning_rate": 4.84468348925763e-06, "logits/chosen": -3.3110427856445312, "logits/rejected": -3.307129383087158, "logps/chosen": -120.5337905883789, "logps/rejected": -133.09884643554688, "loss": 0.6506, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.5807204246520996, "rewards/margins": 0.13058218359947205, "rewards/rejected": -0.7113025188446045, "step": 1170 }, { "epoch": 0.2, "grad_norm": 1.5625, "learning_rate": 4.83942248400208e-06, "logits/chosen": -3.299647808074951, "logits/rejected": -3.294851303100586, "logps/chosen": -131.7820281982422, "logps/rejected": -137.20223999023438, "loss": 0.6587, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.6844396591186523, "rewards/margins": 0.10787700116634369, "rewards/rejected": -0.792316734790802, "step": 1180 }, { "epoch": 0.21, "grad_norm": 1.4765625, "learning_rate": 4.834076808005615e-06, "logits/chosen": -3.3311972618103027, "logits/rejected": -3.3262181282043457, "logps/chosen": -132.84390258789062, "logps/rejected": -136.3151092529297, "loss": 0.6451, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.6461424231529236, "rewards/margins": 0.1411634385585785, "rewards/rejected": -0.7873059511184692, "step": 1190 }, { "epoch": 0.21, "grad_norm": 1.2421875, "learning_rate": 4.828646654744338e-06, "logits/chosen": -3.315372943878174, "logits/rejected": -3.3137035369873047, "logps/chosen": -116.4529037475586, "logps/rejected": -123.21891784667969, "loss": 0.6543, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.5260134935379028, "rewards/margins": 0.11653532832860947, "rewards/rejected": -0.6425488591194153, "step": 1200 }, { "epoch": 0.21, "eval_logits/chosen": -3.310985565185547, "eval_logits/rejected": -3.309568405151367, "eval_logps/chosen": -106.52164459228516, "eval_logps/rejected": -118.23544311523438, "eval_loss": 0.6642152070999146, "eval_rewards/accuracies": 0.6010687947273254, "eval_rewards/chosen": -0.35111624002456665, "eval_rewards/margins": 0.08016779273748398, "eval_rewards/rejected": -0.43128401041030884, "eval_runtime": 484.1657, "eval_samples_per_second": 8.89, "eval_steps_per_second": 1.111, "step": 1200 }, { "epoch": 0.21, "grad_norm": 1.4609375, "learning_rate": 4.82313222075184e-06, "logits/chosen": -3.3166213035583496, "logits/rejected": -3.312199354171753, "logps/chosen": -117.9781265258789, "logps/rejected": -128.859130859375, "loss": 0.6414, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.5318335890769958, "rewards/margins": 0.14254365861415863, "rewards/rejected": -0.6743772625923157, "step": 1210 }, { "epoch": 0.21, "grad_norm": 1.328125, "learning_rate": 4.8175337056120844e-06, "logits/chosen": -3.3199057579040527, "logits/rejected": -3.314791202545166, "logps/chosen": -114.121826171875, "logps/rejected": -130.0734100341797, "loss": 0.6249, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.5108245015144348, "rewards/margins": 0.184920996427536, "rewards/rejected": -0.6957454681396484, "step": 1220 }, { "epoch": 0.21, "grad_norm": 1.40625, "learning_rate": 4.811851311952185e-06, "logits/chosen": -3.3396987915039062, "logits/rejected": -3.3339123725891113, "logps/chosen": -117.48822021484375, "logps/rejected": -122.26155853271484, "loss": 0.6497, "rewards/accuracies": 0.625, "rewards/chosen": -0.5022979974746704, "rewards/margins": 0.12123336642980576, "rewards/rejected": -0.6235313415527344, "step": 1230 }, { "epoch": 0.21, "grad_norm": 1.5234375, "learning_rate": 4.80608524543507e-06, "logits/chosen": -3.308558940887451, "logits/rejected": -3.3058955669403076, "logps/chosen": -113.2501449584961, "logps/rejected": -122.56099700927734, "loss": 0.662, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.5077820420265198, "rewards/margins": 0.10850410163402557, "rewards/rejected": -0.6162861585617065, "step": 1240 }, { "epoch": 0.22, "grad_norm": 1.625, "learning_rate": 4.800235714752042e-06, "logits/chosen": -3.3179473876953125, "logits/rejected": -3.312861680984497, "logps/chosen": -107.0726547241211, "logps/rejected": -111.9657211303711, "loss": 0.6564, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.4158380925655365, "rewards/margins": 0.10389236360788345, "rewards/rejected": -0.5197304487228394, "step": 1250 }, { "epoch": 0.22, "grad_norm": 1.4140625, "learning_rate": 4.7943029316152235e-06, "logits/chosen": -3.2991790771484375, "logits/rejected": -3.293081283569336, "logps/chosen": -108.0817642211914, "logps/rejected": -114.9096450805664, "loss": 0.6473, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.4354972839355469, "rewards/margins": 0.13289375603199005, "rewards/rejected": -0.5683910250663757, "step": 1260 }, { "epoch": 0.22, "grad_norm": 1.6484375, "learning_rate": 4.788287110749892e-06, "logits/chosen": -3.3053946495056152, "logits/rejected": -3.3048996925354004, "logps/chosen": -114.3599624633789, "logps/rejected": -125.3141860961914, "loss": 0.6533, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.5140891671180725, "rewards/margins": 0.1140485554933548, "rewards/rejected": -0.6281377077102661, "step": 1270 }, { "epoch": 0.22, "grad_norm": 1.4609375, "learning_rate": 4.782188469886711e-06, "logits/chosen": -3.335669994354248, "logits/rejected": -3.3347504138946533, "logps/chosen": -113.2931900024414, "logps/rejected": -135.6788787841797, "loss": 0.629, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.5072969794273376, "rewards/margins": 0.16740207374095917, "rewards/rejected": -0.6746990084648132, "step": 1280 }, { "epoch": 0.22, "grad_norm": 1.5625, "learning_rate": 4.776007229753847e-06, "logits/chosen": -3.308954954147339, "logits/rejected": -3.304429292678833, "logps/chosen": -129.07186889648438, "logps/rejected": -136.5091094970703, "loss": 0.651, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.6353288888931274, "rewards/margins": 0.13825224339962006, "rewards/rejected": -0.7735811471939087, "step": 1290 }, { "epoch": 0.22, "grad_norm": 1.421875, "learning_rate": 4.7697436140689894e-06, "logits/chosen": -3.296621799468994, "logits/rejected": -3.2937755584716797, "logps/chosen": -126.10877990722656, "logps/rejected": -141.7520294189453, "loss": 0.6535, "rewards/accuracies": 0.59375, "rewards/chosen": -0.6717920303344727, "rewards/margins": 0.13014563918113708, "rewards/rejected": -0.8019376993179321, "step": 1300 }, { "epoch": 0.22, "eval_logits/chosen": -3.283175468444824, "eval_logits/rejected": -3.2817232608795166, "eval_logps/chosen": -117.92476654052734, "eval_logps/rejected": -131.19667053222656, "eval_loss": 0.6605014801025391, "eval_rewards/accuracies": 0.5989776849746704, "eval_rewards/chosen": -0.4651474356651306, "eval_rewards/margins": 0.09574878215789795, "eval_rewards/rejected": -0.5608961582183838, "eval_runtime": 484.3981, "eval_samples_per_second": 8.885, "eval_steps_per_second": 1.111, "step": 1300 }, { "epoch": 0.23, "grad_norm": 1.78125, "learning_rate": 4.763397849531239e-06, "logits/chosen": -3.285306215286255, "logits/rejected": -3.279937744140625, "logps/chosen": -125.95235443115234, "logps/rejected": -137.43260192871094, "loss": 0.6306, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.6224833726882935, "rewards/margins": 0.17584998905658722, "rewards/rejected": -0.7983332872390747, "step": 1310 }, { "epoch": 0.23, "grad_norm": 1.765625, "learning_rate": 4.756970165812914e-06, "logits/chosen": -3.304464817047119, "logits/rejected": -3.301752805709839, "logps/chosen": -129.58468627929688, "logps/rejected": -134.88809204101562, "loss": 0.6512, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.6340187788009644, "rewards/margins": 0.12720128893852234, "rewards/rejected": -0.7612199783325195, "step": 1320 }, { "epoch": 0.23, "grad_norm": 1.265625, "learning_rate": 4.750460795551235e-06, "logits/chosen": -3.3049120903015137, "logits/rejected": -3.3017563819885254, "logps/chosen": -123.66972351074219, "logps/rejected": -132.18521118164062, "loss": 0.6332, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5718771815299988, "rewards/margins": 0.16105546057224274, "rewards/rejected": -0.7329326868057251, "step": 1330 }, { "epoch": 0.23, "grad_norm": 1.3046875, "learning_rate": 4.743869974339904e-06, "logits/chosen": -3.297811985015869, "logits/rejected": -3.2948131561279297, "logps/chosen": -117.7374496459961, "logps/rejected": -127.28782653808594, "loss": 0.6355, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.5143827199935913, "rewards/margins": 0.14824241399765015, "rewards/rejected": -0.6626251935958862, "step": 1340 }, { "epoch": 0.23, "grad_norm": 1.9375, "learning_rate": 4.737197940720577e-06, "logits/chosen": -3.3007519245147705, "logits/rejected": -3.2976622581481934, "logps/chosen": -135.472900390625, "logps/rejected": -139.4297332763672, "loss": 0.6914, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.7164632081985474, "rewards/margins": 0.059707384556531906, "rewards/rejected": -0.776170551776886, "step": 1350 }, { "epoch": 0.23, "grad_norm": 1.40625, "learning_rate": 4.730444936174233e-06, "logits/chosen": -3.282017469406128, "logits/rejected": -3.280801296234131, "logps/chosen": -125.0278549194336, "logps/rejected": -134.0998992919922, "loss": 0.6585, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.5864216089248657, "rewards/margins": 0.11430720239877701, "rewards/rejected": -0.7007287740707397, "step": 1360 }, { "epoch": 0.24, "grad_norm": 1.6484375, "learning_rate": 4.723611205112431e-06, "logits/chosen": -3.2988338470458984, "logits/rejected": -3.2955524921417236, "logps/chosen": -120.72515869140625, "logps/rejected": -134.67813110351562, "loss": 0.6359, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.5717614889144897, "rewards/margins": 0.15903595089912415, "rewards/rejected": -0.7307974696159363, "step": 1370 }, { "epoch": 0.24, "grad_norm": 1.46875, "learning_rate": 4.716696994868467e-06, "logits/chosen": -3.2845749855041504, "logits/rejected": -3.280909776687622, "logps/chosen": -125.68714904785156, "logps/rejected": -133.53616333007812, "loss": 0.6502, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.5933858156204224, "rewards/margins": 0.1356620490550995, "rewards/rejected": -0.7290478944778442, "step": 1380 }, { "epoch": 0.24, "grad_norm": 1.3671875, "learning_rate": 4.70970255568842e-06, "logits/chosen": -3.304932117462158, "logits/rejected": -3.300816059112549, "logps/chosen": -130.515869140625, "logps/rejected": -136.6916046142578, "loss": 0.6547, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.6272884607315063, "rewards/margins": 0.13132533431053162, "rewards/rejected": -0.7586137652397156, "step": 1390 }, { "epoch": 0.24, "grad_norm": 1.25, "learning_rate": 4.702628140722096e-06, "logits/chosen": -3.2679882049560547, "logits/rejected": -3.2638022899627686, "logps/chosen": -120.34420013427734, "logps/rejected": -130.57388305664062, "loss": 0.6315, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.538371205329895, "rewards/margins": 0.16449260711669922, "rewards/rejected": -0.702863872051239, "step": 1400 }, { "epoch": 0.24, "eval_logits/chosen": -3.2787537574768066, "eval_logits/rejected": -3.2772786617279053, "eval_logps/chosen": -109.42464447021484, "eval_logps/rejected": -122.14970397949219, "eval_loss": 0.6605932116508484, "eval_rewards/accuracies": 0.6138476133346558, "eval_rewards/chosen": -0.38014617562294006, "eval_rewards/margins": 0.0902804583311081, "eval_rewards/rejected": -0.47042664885520935, "eval_runtime": 484.6215, "eval_samples_per_second": 8.881, "eval_steps_per_second": 1.11, "step": 1400 }, { "epoch": 0.24, "grad_norm": 1.625, "learning_rate": 4.695474006013865e-06, "logits/chosen": -3.282273769378662, "logits/rejected": -3.276782989501953, "logps/chosen": -116.5535888671875, "logps/rejected": -130.62254333496094, "loss": 0.6384, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.5313894748687744, "rewards/margins": 0.16250842809677124, "rewards/rejected": -0.6938979029655457, "step": 1410 }, { "epoch": 0.24, "grad_norm": 1.4140625, "learning_rate": 4.688240410493394e-06, "logits/chosen": -3.258557081222534, "logits/rejected": -3.256169080734253, "logps/chosen": -118.12491607666016, "logps/rejected": -140.1725311279297, "loss": 0.6306, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.576335608959198, "rewards/margins": 0.18987338244915009, "rewards/rejected": -0.7662090063095093, "step": 1420 }, { "epoch": 0.25, "grad_norm": 2.3125, "learning_rate": 4.6809276159662785e-06, "logits/chosen": -3.264193296432495, "logits/rejected": -3.263188123703003, "logps/chosen": -136.53036499023438, "logps/rejected": -155.8864288330078, "loss": 0.6172, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.7203831076622009, "rewards/margins": 0.2064674347639084, "rewards/rejected": -0.9268506169319153, "step": 1430 }, { "epoch": 0.25, "grad_norm": 2.15625, "learning_rate": 4.673535887104561e-06, "logits/chosen": -3.230701446533203, "logits/rejected": -3.2279746532440186, "logps/chosen": -140.97439575195312, "logps/rejected": -147.69012451171875, "loss": 0.662, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.8013652563095093, "rewards/margins": 0.11966520547866821, "rewards/rejected": -0.9210304021835327, "step": 1440 }, { "epoch": 0.25, "grad_norm": 1.9921875, "learning_rate": 4.6660654914371575e-06, "logits/chosen": -3.2500369548797607, "logits/rejected": -3.2470641136169434, "logps/chosen": -144.15005493164062, "logps/rejected": -159.1547088623047, "loss": 0.6271, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.7871159911155701, "rewards/margins": 0.19642707705497742, "rewards/rejected": -0.9835430383682251, "step": 1450 }, { "epoch": 0.25, "grad_norm": 1.9921875, "learning_rate": 4.658516699340171e-06, "logits/chosen": -3.2364182472229004, "logits/rejected": -3.2350940704345703, "logps/chosen": -142.92050170898438, "logps/rejected": -155.60977172851562, "loss": 0.644, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.7996357679367065, "rewards/margins": 0.1715250462293625, "rewards/rejected": -0.9711607098579407, "step": 1460 }, { "epoch": 0.25, "grad_norm": 1.46875, "learning_rate": 4.650889784027109e-06, "logits/chosen": -3.2780442237854004, "logits/rejected": -3.273808240890503, "logps/chosen": -133.93801879882812, "logps/rejected": -143.4498748779297, "loss": 0.6389, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.656316876411438, "rewards/margins": 0.15848079323768616, "rewards/rejected": -0.814797580242157, "step": 1470 }, { "epoch": 0.26, "grad_norm": 1.828125, "learning_rate": 4.64318502153899e-06, "logits/chosen": -3.274941921234131, "logits/rejected": -3.2701942920684814, "logps/chosen": -127.15447998046875, "logps/rejected": -139.1614227294922, "loss": 0.6487, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.6501842141151428, "rewards/margins": 0.13524849712848663, "rewards/rejected": -0.7854325771331787, "step": 1480 }, { "epoch": 0.26, "grad_norm": 1.8671875, "learning_rate": 4.635402690734362e-06, "logits/chosen": -3.253192186355591, "logits/rejected": -3.2492566108703613, "logps/chosen": -138.96067810058594, "logps/rejected": -145.4950714111328, "loss": 0.653, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.710328221321106, "rewards/margins": 0.13628648221492767, "rewards/rejected": -0.84661465883255, "step": 1490 }, { "epoch": 0.26, "grad_norm": 1.8203125, "learning_rate": 4.627543073279197e-06, "logits/chosen": -3.2492332458496094, "logits/rejected": -3.2501883506774902, "logps/chosen": -138.17279052734375, "logps/rejected": -151.1929931640625, "loss": 0.6595, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.7718222141265869, "rewards/margins": 0.13232490420341492, "rewards/rejected": -0.9041470289230347, "step": 1500 }, { "epoch": 0.26, "eval_logits/chosen": -3.2445855140686035, "eval_logits/rejected": -3.2428994178771973, "eval_logps/chosen": -127.01957702636719, "eval_logps/rejected": -142.2230987548828, "eval_loss": 0.6543667912483215, "eval_rewards/accuracies": 0.6196561455726624, "eval_rewards/chosen": -0.556095540523529, "eval_rewards/margins": 0.11506481468677521, "eval_rewards/rejected": -0.6711603999137878, "eval_runtime": 484.307, "eval_samples_per_second": 8.887, "eval_steps_per_second": 1.111, "step": 1500 }, { "epoch": 0.26, "grad_norm": 1.3359375, "learning_rate": 4.619606453636708e-06, "logits/chosen": -3.2467637062072754, "logits/rejected": -3.2440028190612793, "logps/chosen": -141.313232421875, "logps/rejected": -154.34750366210938, "loss": 0.6182, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7261611819267273, "rewards/margins": 0.2013995349407196, "rewards/rejected": -0.9275606274604797, "step": 1510 }, { "epoch": 0.26, "grad_norm": 1.6953125, "learning_rate": 4.611593119057047e-06, "logits/chosen": -3.2586662769317627, "logits/rejected": -3.258805751800537, "logps/chosen": -140.69796752929688, "logps/rejected": -148.06845092773438, "loss": 0.667, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.7792658805847168, "rewards/margins": 0.10725333541631699, "rewards/rejected": -0.8865191340446472, "step": 1520 }, { "epoch": 0.26, "grad_norm": 2.171875, "learning_rate": 4.603503359566912e-06, "logits/chosen": -3.236619472503662, "logits/rejected": -3.23425030708313, "logps/chosen": -139.57363891601562, "logps/rejected": -148.51255798339844, "loss": 0.6584, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.7724454998970032, "rewards/margins": 0.13632960617542267, "rewards/rejected": -0.9087749719619751, "step": 1530 }, { "epoch": 0.27, "grad_norm": 2.65625, "learning_rate": 4.595337467959046e-06, "logits/chosen": -3.266460418701172, "logits/rejected": -3.261094570159912, "logps/chosen": -130.5712890625, "logps/rejected": -142.80465698242188, "loss": 0.6258, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.6470538377761841, "rewards/margins": 0.1965980976819992, "rewards/rejected": -0.8436519503593445, "step": 1540 }, { "epoch": 0.27, "grad_norm": 2.125, "learning_rate": 4.587095739781645e-06, "logits/chosen": -3.246718645095825, "logits/rejected": -3.242940902709961, "logps/chosen": -126.24415588378906, "logps/rejected": -134.73782348632812, "loss": 0.6393, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.6170962452888489, "rewards/margins": 0.1534012109041214, "rewards/rejected": -0.7704974412918091, "step": 1550 }, { "epoch": 0.27, "grad_norm": 2.046875, "learning_rate": 4.578778473327659e-06, "logits/chosen": -3.226879835128784, "logits/rejected": -3.223694324493408, "logps/chosen": -128.14688110351562, "logps/rejected": -140.37246704101562, "loss": 0.643, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.6729716658592224, "rewards/margins": 0.14354786276817322, "rewards/rejected": -0.816519558429718, "step": 1560 }, { "epoch": 0.27, "grad_norm": 2.171875, "learning_rate": 4.570385969623993e-06, "logits/chosen": -3.250641345977783, "logits/rejected": -3.2509448528289795, "logps/chosen": -133.70693969726562, "logps/rejected": -152.85145568847656, "loss": 0.6358, "rewards/accuracies": 0.65625, "rewards/chosen": -0.7064641118049622, "rewards/margins": 0.17193913459777832, "rewards/rejected": -0.8784033060073853, "step": 1570 }, { "epoch": 0.27, "grad_norm": 1.625, "learning_rate": 4.561918532420615e-06, "logits/chosen": -3.24027681350708, "logits/rejected": -3.238123655319214, "logps/chosen": -140.45701599121094, "logps/rejected": -155.67108154296875, "loss": 0.6473, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.7832036018371582, "rewards/margins": 0.15913492441177368, "rewards/rejected": -0.9423385858535767, "step": 1580 }, { "epoch": 0.27, "grad_norm": 1.96875, "learning_rate": 4.553376468179564e-06, "logits/chosen": -3.2495689392089844, "logits/rejected": -3.2477195262908936, "logps/chosen": -136.52931213378906, "logps/rejected": -150.0518035888672, "loss": 0.6475, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.7195461392402649, "rewards/margins": 0.14303259551525116, "rewards/rejected": -0.8625787496566772, "step": 1590 }, { "epoch": 0.28, "grad_norm": 1.7890625, "learning_rate": 4.544760086063856e-06, "logits/chosen": -3.2430386543273926, "logits/rejected": -3.2349789142608643, "logps/chosen": -135.8370361328125, "logps/rejected": -147.910400390625, "loss": 0.6383, "rewards/accuracies": 0.65625, "rewards/chosen": -0.7234088182449341, "rewards/margins": 0.15631389617919922, "rewards/rejected": -0.8797225952148438, "step": 1600 }, { "epoch": 0.28, "eval_logits/chosen": -3.233834743499756, "eval_logits/rejected": -3.231811046600342, "eval_logps/chosen": -130.09262084960938, "eval_logps/rejected": -145.6309356689453, "eval_loss": 0.6538301706314087, "eval_rewards/accuracies": 0.6177973747253418, "eval_rewards/chosen": -0.5868260264396667, "eval_rewards/margins": 0.11841286718845367, "eval_rewards/rejected": -0.7052388787269592, "eval_runtime": 484.6899, "eval_samples_per_second": 8.88, "eval_steps_per_second": 1.11, "step": 1600 }, { "epoch": 0.28, "grad_norm": 1.578125, "learning_rate": 4.536069697926291e-06, "logits/chosen": -3.2569375038146973, "logits/rejected": -3.254868984222412, "logps/chosen": -145.26028442382812, "logps/rejected": -162.26365661621094, "loss": 0.638, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.7968076467514038, "rewards/margins": 0.1780860722064972, "rewards/rejected": -0.9748937487602234, "step": 1610 }, { "epoch": 0.28, "grad_norm": 2.578125, "learning_rate": 4.527305618298173e-06, "logits/chosen": -3.2288143634796143, "logits/rejected": -3.2248358726501465, "logps/chosen": -153.36053466796875, "logps/rejected": -174.30075073242188, "loss": 0.6367, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.9123834371566772, "rewards/margins": 0.2117292433977127, "rewards/rejected": -1.1241127252578735, "step": 1620 }, { "epoch": 0.28, "grad_norm": 1.765625, "learning_rate": 4.518468164377923e-06, "logits/chosen": -3.23083758354187, "logits/rejected": -3.224977493286133, "logps/chosen": -150.68533325195312, "logps/rejected": -162.896728515625, "loss": 0.6251, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.8347437977790833, "rewards/margins": 0.20534896850585938, "rewards/rejected": -1.0400927066802979, "step": 1630 }, { "epoch": 0.28, "grad_norm": 1.75, "learning_rate": 4.5095576560195975e-06, "logits/chosen": -3.257983446121216, "logits/rejected": -3.2549941539764404, "logps/chosen": -143.98660278320312, "logps/rejected": -147.51876831054688, "loss": 0.6591, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.7621957063674927, "rewards/margins": 0.1267503798007965, "rewards/rejected": -0.8889460563659668, "step": 1640 }, { "epoch": 0.28, "grad_norm": 1.828125, "learning_rate": 4.500574415721311e-06, "logits/chosen": -3.2671570777893066, "logits/rejected": -3.2629342079162598, "logps/chosen": -130.56808471679688, "logps/rejected": -147.02005004882812, "loss": 0.6212, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.6243900656700134, "rewards/margins": 0.20408205687999725, "rewards/rejected": -0.8284721374511719, "step": 1650 }, { "epoch": 0.29, "grad_norm": 1.453125, "learning_rate": 4.491518768613569e-06, "logits/chosen": -3.263577699661255, "logits/rejected": -3.2591965198516846, "logps/chosen": -132.37673950195312, "logps/rejected": -136.60537719726562, "loss": 0.6528, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.6591209769248962, "rewards/margins": 0.1342247724533081, "rewards/rejected": -0.7933458089828491, "step": 1660 }, { "epoch": 0.29, "grad_norm": 1.3359375, "learning_rate": 4.482391042447497e-06, "logits/chosen": -3.268989086151123, "logits/rejected": -3.265378475189209, "logps/chosen": -124.76434326171875, "logps/rejected": -138.3180389404297, "loss": 0.6385, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.5851017236709595, "rewards/margins": 0.16150644421577454, "rewards/rejected": -0.7466081380844116, "step": 1670 }, { "epoch": 0.29, "grad_norm": 1.609375, "learning_rate": 4.473191567582975e-06, "logits/chosen": -3.249393939971924, "logits/rejected": -3.246347427368164, "logps/chosen": -127.35566711425781, "logps/rejected": -143.05728149414062, "loss": 0.6265, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.6125344634056091, "rewards/margins": 0.19859835505485535, "rewards/rejected": -0.8111329078674316, "step": 1680 }, { "epoch": 0.29, "grad_norm": 1.921875, "learning_rate": 4.46392067697669e-06, "logits/chosen": -3.245945692062378, "logits/rejected": -3.244025468826294, "logps/chosen": -123.40696716308594, "logps/rejected": -132.4766082763672, "loss": 0.6511, "rewards/accuracies": 0.625, "rewards/chosen": -0.5801206827163696, "rewards/margins": 0.13897430896759033, "rewards/rejected": -0.7190949320793152, "step": 1690 }, { "epoch": 0.29, "grad_norm": 1.78125, "learning_rate": 4.454578706170075e-06, "logits/chosen": -3.2143101692199707, "logits/rejected": -3.2159550189971924, "logps/chosen": -124.13203430175781, "logps/rejected": -136.1236114501953, "loss": 0.6775, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.6394143104553223, "rewards/margins": 0.08252833038568497, "rewards/rejected": -0.7219426035881042, "step": 1700 }, { "epoch": 0.29, "eval_logits/chosen": -3.2211709022521973, "eval_logits/rejected": -3.219395637512207, "eval_logps/chosen": -118.28197479248047, "eval_logps/rejected": -132.27479553222656, "eval_loss": 0.6568416357040405, "eval_rewards/accuracies": 0.6173326969146729, "eval_rewards/chosen": -0.46871957182884216, "eval_rewards/margins": 0.10295785218477249, "eval_rewards/rejected": -0.5716773867607117, "eval_runtime": 484.3291, "eval_samples_per_second": 8.887, "eval_steps_per_second": 1.111, "step": 1700 }, { "epoch": 0.29, "grad_norm": 2.140625, "learning_rate": 4.445165993277171e-06, "logits/chosen": -3.222552537918091, "logits/rejected": -3.2166829109191895, "logps/chosen": -132.51321411132812, "logps/rejected": -142.26742553710938, "loss": 0.6422, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.6295849084854126, "rewards/margins": 0.16701695322990417, "rewards/rejected": -0.7966018915176392, "step": 1710 }, { "epoch": 0.3, "grad_norm": 2.078125, "learning_rate": 4.435682878972389e-06, "logits/chosen": -3.2195911407470703, "logits/rejected": -3.214392900466919, "logps/chosen": -137.64071655273438, "logps/rejected": -141.55099487304688, "loss": 0.6611, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.7094148397445679, "rewards/margins": 0.11735300719738007, "rewards/rejected": -0.8267678022384644, "step": 1720 }, { "epoch": 0.3, "grad_norm": 2.578125, "learning_rate": 4.426129706478178e-06, "logits/chosen": -3.2044854164123535, "logits/rejected": -3.202407121658325, "logps/chosen": -136.70947265625, "logps/rejected": -143.6942901611328, "loss": 0.6724, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.7217217683792114, "rewards/margins": 0.10501249134540558, "rewards/rejected": -0.8267343640327454, "step": 1730 }, { "epoch": 0.3, "grad_norm": 1.546875, "learning_rate": 4.416506821552603e-06, "logits/chosen": -3.2000412940979004, "logits/rejected": -3.197852849960327, "logps/chosen": -129.83609008789062, "logps/rejected": -149.46237182617188, "loss": 0.636, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.6765210032463074, "rewards/margins": 0.1981184333562851, "rewards/rejected": -0.8746395111083984, "step": 1740 }, { "epoch": 0.3, "grad_norm": 1.890625, "learning_rate": 4.406814572476833e-06, "logits/chosen": -3.2070159912109375, "logits/rejected": -3.2036139965057373, "logps/chosen": -127.25260925292969, "logps/rejected": -143.64280700683594, "loss": 0.6352, "rewards/accuracies": 0.625, "rewards/chosen": -0.610071063041687, "rewards/margins": 0.17209911346435547, "rewards/rejected": -0.7821701765060425, "step": 1750 }, { "epoch": 0.3, "grad_norm": 2.171875, "learning_rate": 4.397053310042533e-06, "logits/chosen": -3.204622268676758, "logits/rejected": -3.200345993041992, "logps/chosen": -132.13955688476562, "logps/rejected": -146.16519165039062, "loss": 0.6286, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.6560156345367432, "rewards/margins": 0.1965368688106537, "rewards/rejected": -0.852552592754364, "step": 1760 }, { "epoch": 0.3, "grad_norm": 1.7421875, "learning_rate": 4.3872233875391715e-06, "logits/chosen": -3.1852593421936035, "logits/rejected": -3.180729627609253, "logps/chosen": -139.60780334472656, "logps/rejected": -150.09140014648438, "loss": 0.6271, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7347689867019653, "rewards/margins": 0.2010943442583084, "rewards/rejected": -0.9358633160591125, "step": 1770 }, { "epoch": 0.31, "grad_norm": 2.15625, "learning_rate": 4.3773251607412294e-06, "logits/chosen": -3.1705119609832764, "logits/rejected": -3.169437885284424, "logps/chosen": -136.10638427734375, "logps/rejected": -164.87498474121094, "loss": 0.6009, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.7181843519210815, "rewards/margins": 0.2686876654624939, "rewards/rejected": -0.9868720769882202, "step": 1780 }, { "epoch": 0.31, "grad_norm": 1.8828125, "learning_rate": 4.367358987895327e-06, "logits/chosen": -3.1425235271453857, "logits/rejected": -3.1398634910583496, "logps/chosen": -144.38807678222656, "logps/rejected": -163.69662475585938, "loss": 0.6184, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.8109146952629089, "rewards/margins": 0.21914473176002502, "rewards/rejected": -1.0300594568252563, "step": 1790 }, { "epoch": 0.31, "grad_norm": 2.375, "learning_rate": 4.3573252297072544e-06, "logits/chosen": -3.1346559524536133, "logits/rejected": -3.1308445930480957, "logps/chosen": -151.2903594970703, "logps/rejected": -169.28575134277344, "loss": 0.6312, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.8916425704956055, "rewards/margins": 0.19912366569042206, "rewards/rejected": -1.090766191482544, "step": 1800 }, { "epoch": 0.31, "eval_logits/chosen": -3.1236960887908936, "eval_logits/rejected": -3.1213295459747314, "eval_logps/chosen": -143.4406280517578, "eval_logps/rejected": -161.27671813964844, "eval_loss": 0.6496570706367493, "eval_rewards/accuracies": 0.6110594868659973, "eval_rewards/chosen": -0.7203060388565063, "eval_rewards/margins": 0.14139072597026825, "eval_rewards/rejected": -0.861696720123291, "eval_runtime": 483.6469, "eval_samples_per_second": 8.899, "eval_steps_per_second": 1.112, "step": 1800 }, { "epoch": 0.31, "grad_norm": 2.40625, "learning_rate": 4.347224249328922e-06, "logits/chosen": -3.1321170330047607, "logits/rejected": -3.1298587322235107, "logps/chosen": -162.84423828125, "logps/rejected": -170.39773559570312, "loss": 0.6818, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.9545540809631348, "rewards/margins": 0.1280214488506317, "rewards/rejected": -1.0825755596160889, "step": 1810 }, { "epoch": 0.31, "grad_norm": 2.6875, "learning_rate": 4.337056412345209e-06, "logits/chosen": -3.134734630584717, "logits/rejected": -3.130065679550171, "logps/chosen": -152.81954956054688, "logps/rejected": -158.31002807617188, "loss": 0.6566, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.8701522946357727, "rewards/margins": 0.1269942671060562, "rewards/rejected": -0.997146487236023, "step": 1820 }, { "epoch": 0.32, "grad_norm": 1.921875, "learning_rate": 4.326822086760743e-06, "logits/chosen": -3.147284746170044, "logits/rejected": -3.1407833099365234, "logps/chosen": -138.86541748046875, "logps/rejected": -155.19522094726562, "loss": 0.6355, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.7297259569168091, "rewards/margins": 0.18895025551319122, "rewards/rejected": -0.9186761975288391, "step": 1830 }, { "epoch": 0.32, "grad_norm": 2.046875, "learning_rate": 4.316521642986566e-06, "logits/chosen": -3.1870529651641846, "logits/rejected": -3.185727596282959, "logps/chosen": -143.051513671875, "logps/rejected": -155.83590698242188, "loss": 0.6529, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.7773202657699585, "rewards/margins": 0.15543068945407867, "rewards/rejected": -0.9327509999275208, "step": 1840 }, { "epoch": 0.32, "grad_norm": 2.0625, "learning_rate": 4.3061554538267444e-06, "logits/chosen": -3.1828761100769043, "logits/rejected": -3.182145595550537, "logps/chosen": -132.54562377929688, "logps/rejected": -142.9228057861328, "loss": 0.6637, "rewards/accuracies": 0.59375, "rewards/chosen": -0.6825622916221619, "rewards/margins": 0.12806358933448792, "rewards/rejected": -0.8106260299682617, "step": 1850 }, { "epoch": 0.32, "grad_norm": 1.796875, "learning_rate": 4.295723894464862e-06, "logits/chosen": -3.1849961280822754, "logits/rejected": -3.1808555126190186, "logps/chosen": -128.9047393798828, "logps/rejected": -136.94223022460938, "loss": 0.6357, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.5910203456878662, "rewards/margins": 0.16387054324150085, "rewards/rejected": -0.7548909187316895, "step": 1860 }, { "epoch": 0.32, "grad_norm": 1.7109375, "learning_rate": 4.285227342450449e-06, "logits/chosen": -3.1886954307556152, "logits/rejected": -3.187377691268921, "logps/chosen": -124.11856842041016, "logps/rejected": -138.12887573242188, "loss": 0.6282, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.57961505651474, "rewards/margins": 0.19387942552566528, "rewards/rejected": -0.7734946012496948, "step": 1870 }, { "epoch": 0.32, "grad_norm": 1.78125, "learning_rate": 4.274666177685317e-06, "logits/chosen": -3.187439203262329, "logits/rejected": -3.1822094917297363, "logps/chosen": -127.79396057128906, "logps/rejected": -141.5155029296875, "loss": 0.6277, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.6348319053649902, "rewards/margins": 0.1894364058971405, "rewards/rejected": -0.8242682218551636, "step": 1880 }, { "epoch": 0.33, "grad_norm": 2.0, "learning_rate": 4.264040782409804e-06, "logits/chosen": -3.1903483867645264, "logits/rejected": -3.1858248710632324, "logps/chosen": -125.4483413696289, "logps/rejected": -145.291748046875, "loss": 0.6229, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6227745413780212, "rewards/margins": 0.20939584076404572, "rewards/rejected": -0.8321703672409058, "step": 1890 }, { "epoch": 0.33, "grad_norm": 1.796875, "learning_rate": 4.253351541188947e-06, "logits/chosen": -3.1822569370269775, "logits/rejected": -3.180145740509033, "logps/chosen": -137.6725616455078, "logps/rejected": -140.46498107910156, "loss": 0.665, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.6952374577522278, "rewards/margins": 0.11282126605510712, "rewards/rejected": -0.8080587387084961, "step": 1900 }, { "epoch": 0.33, "eval_logits/chosen": -3.1679985523223877, "eval_logits/rejected": -3.165999412536621, "eval_logps/chosen": -123.16142272949219, "eval_logps/rejected": -137.88668823242188, "eval_loss": 0.6550623178482056, "eval_rewards/accuracies": 0.613382875919342, "eval_rewards/chosen": -0.517514169216156, "eval_rewards/margins": 0.1102820485830307, "eval_rewards/rejected": -0.6277962923049927, "eval_runtime": 483.7577, "eval_samples_per_second": 8.897, "eval_steps_per_second": 1.112, "step": 1900 }, { "epoch": 0.33, "grad_norm": 1.96875, "learning_rate": 4.242598840898558e-06, "logits/chosen": -3.1519510746002197, "logits/rejected": -3.148864507675171, "logps/chosen": -137.3428192138672, "logps/rejected": -153.29315185546875, "loss": 0.6314, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.7314358949661255, "rewards/margins": 0.18779778480529785, "rewards/rejected": -0.9192337989807129, "step": 1910 }, { "epoch": 0.33, "grad_norm": 2.109375, "learning_rate": 4.231783070711223e-06, "logits/chosen": -3.1809704303741455, "logits/rejected": -3.1791326999664307, "logps/chosen": -143.34425354003906, "logps/rejected": -155.4877471923828, "loss": 0.6395, "rewards/accuracies": 0.625, "rewards/chosen": -0.7753351330757141, "rewards/margins": 0.16499650478363037, "rewards/rejected": -0.9403316378593445, "step": 1920 }, { "epoch": 0.33, "grad_norm": 2.234375, "learning_rate": 4.22090462208222e-06, "logits/chosen": -3.1596155166625977, "logits/rejected": -3.1550707817077637, "logps/chosen": -147.46380615234375, "logps/rejected": -163.8795623779297, "loss": 0.6219, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8165766596794128, "rewards/margins": 0.21217963099479675, "rewards/rejected": -1.0287562608718872, "step": 1930 }, { "epoch": 0.33, "grad_norm": 2.109375, "learning_rate": 4.209963888735346e-06, "logits/chosen": -3.178684949874878, "logits/rejected": -3.1801915168762207, "logps/chosen": -141.09107971191406, "logps/rejected": -158.66387939453125, "loss": 0.628, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.8012874722480774, "rewards/margins": 0.20206686854362488, "rewards/rejected": -1.0033543109893799, "step": 1940 }, { "epoch": 0.34, "grad_norm": 2.34375, "learning_rate": 4.198961266648671e-06, "logits/chosen": -3.1610703468322754, "logits/rejected": -3.157761335372925, "logps/chosen": -152.76663208007812, "logps/rejected": -173.17831420898438, "loss": 0.6416, "rewards/accuracies": 0.625, "rewards/chosen": -0.9055836796760559, "rewards/margins": 0.19092567265033722, "rewards/rejected": -1.096509337425232, "step": 1950 }, { "epoch": 0.34, "grad_norm": 2.578125, "learning_rate": 4.187897154040205e-06, "logits/chosen": -3.15240740776062, "logits/rejected": -3.1524808406829834, "logps/chosen": -156.12435913085938, "logps/rejected": -169.71414184570312, "loss": 0.6442, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.9187299013137817, "rewards/margins": 0.15986597537994385, "rewards/rejected": -1.0785958766937256, "step": 1960 }, { "epoch": 0.34, "grad_norm": 1.890625, "learning_rate": 4.176771951353481e-06, "logits/chosen": -3.1625571250915527, "logits/rejected": -3.158508777618408, "logps/chosen": -147.187744140625, "logps/rejected": -153.29031372070312, "loss": 0.6603, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.7978456616401672, "rewards/margins": 0.12643900513648987, "rewards/rejected": -0.9242845773696899, "step": 1970 }, { "epoch": 0.34, "grad_norm": 1.9609375, "learning_rate": 4.165586061243074e-06, "logits/chosen": -3.160752058029175, "logits/rejected": -3.156141757965088, "logps/chosen": -148.16342163085938, "logps/rejected": -160.54156494140625, "loss": 0.6501, "rewards/accuracies": 0.65625, "rewards/chosen": -0.8244088888168335, "rewards/margins": 0.15916991233825684, "rewards/rejected": -0.9835788607597351, "step": 1980 }, { "epoch": 0.34, "grad_norm": 1.6328125, "learning_rate": 4.154339888560008e-06, "logits/chosen": -3.1824774742126465, "logits/rejected": -3.1802866458892822, "logps/chosen": -145.84805297851562, "logps/rejected": -158.97315979003906, "loss": 0.6279, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.8446575999259949, "rewards/margins": 0.1899007260799408, "rewards/rejected": -1.0345582962036133, "step": 1990 }, { "epoch": 0.34, "grad_norm": 1.9921875, "learning_rate": 4.1430338403371275e-06, "logits/chosen": -3.177934408187866, "logits/rejected": -3.179072856903076, "logps/chosen": -146.23016357421875, "logps/rejected": -168.99826049804688, "loss": 0.6385, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.8656326532363892, "rewards/margins": 0.18379953503608704, "rewards/rejected": -1.0494321584701538, "step": 2000 }, { "epoch": 0.34, "eval_logits/chosen": -3.1845004558563232, "eval_logits/rejected": -3.1822633743286133, "eval_logps/chosen": -133.07003784179688, "eval_logps/rejected": -148.8959197998047, "eval_loss": 0.6522409319877625, "eval_rewards/accuracies": 0.6161710023880005, "eval_rewards/chosen": -0.6166000962257385, "eval_rewards/margins": 0.12128852307796478, "eval_rewards/rejected": -0.7378886938095093, "eval_runtime": 483.8898, "eval_samples_per_second": 8.895, "eval_steps_per_second": 1.112, "step": 2000 }, { "epoch": 0.35, "grad_norm": 1.8828125, "learning_rate": 4.131668325774343e-06, "logits/chosen": -3.1839537620544434, "logits/rejected": -3.1793434619903564, "logps/chosen": -146.1009979248047, "logps/rejected": -157.49703979492188, "loss": 0.6393, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.7935379147529602, "rewards/margins": 0.1877121478319168, "rewards/rejected": -0.981249988079071, "step": 2010 }, { "epoch": 0.35, "grad_norm": 2.65625, "learning_rate": 4.120243756223835e-06, "logits/chosen": -3.1638169288635254, "logits/rejected": -3.1577069759368896, "logps/chosen": -148.914794921875, "logps/rejected": -175.17535400390625, "loss": 0.6138, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.8834971189498901, "rewards/margins": 0.24149295687675476, "rewards/rejected": -1.1249901056289673, "step": 2020 }, { "epoch": 0.35, "grad_norm": 2.34375, "learning_rate": 4.108760545175163e-06, "logits/chosen": -3.173308849334717, "logits/rejected": -3.169128894805908, "logps/chosen": -156.82955932617188, "logps/rejected": -173.0848846435547, "loss": 0.6359, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.8950290679931641, "rewards/margins": 0.19319191575050354, "rewards/rejected": -1.0882209539413452, "step": 2030 }, { "epoch": 0.35, "grad_norm": 2.03125, "learning_rate": 4.097219108240295e-06, "logits/chosen": -3.149092197418213, "logits/rejected": -3.1467485427856445, "logps/chosen": -149.60000610351562, "logps/rejected": -168.51541137695312, "loss": 0.6299, "rewards/accuracies": 0.65625, "rewards/chosen": -0.8556310534477234, "rewards/margins": 0.21351167559623718, "rewards/rejected": -1.0691426992416382, "step": 2040 }, { "epoch": 0.35, "grad_norm": 2.65625, "learning_rate": 4.085619863138574e-06, "logits/chosen": -3.1532864570617676, "logits/rejected": -3.1526474952697754, "logps/chosen": -140.12538146972656, "logps/rejected": -162.52774047851562, "loss": 0.6204, "rewards/accuracies": 0.625, "rewards/chosen": -0.7849727272987366, "rewards/margins": 0.21698176860809326, "rewards/rejected": -1.0019545555114746, "step": 2050 }, { "epoch": 0.35, "grad_norm": 1.765625, "learning_rate": 4.0739632296815886e-06, "logits/chosen": -3.1688714027404785, "logits/rejected": -3.1625566482543945, "logps/chosen": -142.4561767578125, "logps/rejected": -155.34713745117188, "loss": 0.6376, "rewards/accuracies": 0.59375, "rewards/chosen": -0.7721291780471802, "rewards/margins": 0.18038949370384216, "rewards/rejected": -0.9525187611579895, "step": 2060 }, { "epoch": 0.36, "grad_norm": 1.875, "learning_rate": 4.0622496297579905e-06, "logits/chosen": -3.1799235343933105, "logits/rejected": -3.1771349906921387, "logps/chosen": -139.90472412109375, "logps/rejected": -156.67532348632812, "loss": 0.6266, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.7351183295249939, "rewards/margins": 0.19804798066616058, "rewards/rejected": -0.9331663250923157, "step": 2070 }, { "epoch": 0.36, "grad_norm": 2.15625, "learning_rate": 4.0504794873182144e-06, "logits/chosen": -3.162339687347412, "logits/rejected": -3.157538890838623, "logps/chosen": -140.0246124267578, "logps/rejected": -158.8914337158203, "loss": 0.613, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.778779149055481, "rewards/margins": 0.23042099177837372, "rewards/rejected": -1.0092002153396606, "step": 2080 }, { "epoch": 0.36, "grad_norm": 2.671875, "learning_rate": 4.038653228359143e-06, "logits/chosen": -3.157104015350342, "logits/rejected": -3.156982421875, "logps/chosen": -151.57472229003906, "logps/rejected": -167.0546875, "loss": 0.6456, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.8603813052177429, "rewards/margins": 0.18172840774059296, "rewards/rejected": -1.042109727859497, "step": 2090 }, { "epoch": 0.36, "grad_norm": 2.0625, "learning_rate": 4.026771280908682e-06, "logits/chosen": -3.1234302520751953, "logits/rejected": -3.1203956604003906, "logps/chosen": -157.09375, "logps/rejected": -168.24038696289062, "loss": 0.6452, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.9151522517204285, "rewards/margins": 0.17340800166130066, "rewards/rejected": -1.0885603427886963, "step": 2100 }, { "epoch": 0.36, "eval_logits/chosen": -3.136946201324463, "eval_logits/rejected": -3.1343982219696045, "eval_logps/chosen": -142.29122924804688, "eval_logps/rejected": -158.35354614257812, "eval_loss": 0.6538400053977966, "eval_rewards/accuracies": 0.604786217212677, "eval_rewards/chosen": -0.7088120579719543, "eval_rewards/margins": 0.12365300208330154, "eval_rewards/rejected": -0.8324649930000305, "eval_runtime": 483.8762, "eval_samples_per_second": 8.895, "eval_steps_per_second": 1.112, "step": 2100 }, { "epoch": 0.36, "grad_norm": 2.375, "learning_rate": 4.014834075010271e-06, "logits/chosen": -3.1391446590423584, "logits/rejected": -3.136540412902832, "logps/chosen": -156.41262817382812, "logps/rejected": -182.945068359375, "loss": 0.62, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.9588032960891724, "rewards/margins": 0.23949292302131653, "rewards/rejected": -1.198296070098877, "step": 2110 }, { "epoch": 0.37, "grad_norm": 2.25, "learning_rate": 4.002842042707323e-06, "logits/chosen": -3.089831829071045, "logits/rejected": -3.083256959915161, "logps/chosen": -157.48348999023438, "logps/rejected": -182.3844451904297, "loss": 0.5942, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9324124455451965, "rewards/margins": 0.2963864505290985, "rewards/rejected": -1.2287989854812622, "step": 2120 }, { "epoch": 0.37, "grad_norm": 2.359375, "learning_rate": 3.9907956180275785e-06, "logits/chosen": -3.0537140369415283, "logits/rejected": -3.0525598526000977, "logps/chosen": -178.11253356933594, "logps/rejected": -203.71084594726562, "loss": 0.5934, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.1379125118255615, "rewards/margins": 0.29864054918289185, "rewards/rejected": -1.4365530014038086, "step": 2130 }, { "epoch": 0.37, "grad_norm": 2.546875, "learning_rate": 3.978695236967405e-06, "logits/chosen": -3.079768419265747, "logits/rejected": -3.0789151191711426, "logps/chosen": -174.4245147705078, "logps/rejected": -195.63836669921875, "loss": 0.6362, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.1493077278137207, "rewards/margins": 0.19047953188419342, "rewards/rejected": -1.339787244796753, "step": 2140 }, { "epoch": 0.37, "grad_norm": 2.21875, "learning_rate": 3.966541337476012e-06, "logits/chosen": -3.0785582065582275, "logits/rejected": -3.0743796825408936, "logps/chosen": -169.74510192871094, "logps/rejected": -184.39230346679688, "loss": 0.612, "rewards/accuracies": 0.65625, "rewards/chosen": -1.0371092557907104, "rewards/margins": 0.23942425847053528, "rewards/rejected": -1.2765334844589233, "step": 2150 }, { "epoch": 0.37, "grad_norm": 2.78125, "learning_rate": 3.9543343594396035e-06, "logits/chosen": -3.107583999633789, "logits/rejected": -3.1022396087646484, "logps/chosen": -174.77137756347656, "logps/rejected": -188.1559295654297, "loss": 0.6271, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.080005407333374, "rewards/margins": 0.2243720293045044, "rewards/rejected": -1.304377555847168, "step": 2160 }, { "epoch": 0.37, "grad_norm": 2.96875, "learning_rate": 3.942074744665456e-06, "logits/chosen": -3.1085472106933594, "logits/rejected": -3.0991461277008057, "logps/chosen": -178.31637573242188, "logps/rejected": -202.80215454101562, "loss": 0.5988, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1354711055755615, "rewards/margins": 0.29800236225128174, "rewards/rejected": -1.4334733486175537, "step": 2170 }, { "epoch": 0.38, "grad_norm": 3.3125, "learning_rate": 3.929762936865926e-06, "logits/chosen": -3.1241629123687744, "logits/rejected": -3.1208481788635254, "logps/chosen": -197.42234802246094, "logps/rejected": -211.826416015625, "loss": 0.6394, "rewards/accuracies": 0.625, "rewards/chosen": -1.2813293933868408, "rewards/margins": 0.23204727470874786, "rewards/rejected": -1.5133765935897827, "step": 2180 }, { "epoch": 0.38, "grad_norm": 1.96875, "learning_rate": 3.917399381642395e-06, "logits/chosen": -3.1608245372772217, "logits/rejected": -3.1563944816589355, "logps/chosen": -174.8191375732422, "logps/rejected": -191.9217987060547, "loss": 0.6506, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.1194748878479004, "rewards/margins": 0.18612749874591827, "rewards/rejected": -1.3056023120880127, "step": 2190 }, { "epoch": 0.38, "grad_norm": 2.34375, "learning_rate": 3.904984526469139e-06, "logits/chosen": -3.1563363075256348, "logits/rejected": -3.1518566608428955, "logps/chosen": -145.4347686767578, "logps/rejected": -171.84201049804688, "loss": 0.6024, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.8299029469490051, "rewards/margins": 0.26778867840766907, "rewards/rejected": -1.0976916551589966, "step": 2200 }, { "epoch": 0.38, "eval_logits/chosen": -3.159609317779541, "eval_logits/rejected": -3.156726598739624, "eval_logps/chosen": -135.18582153320312, "eval_logps/rejected": -151.50193786621094, "eval_loss": 0.6526528000831604, "eval_rewards/accuracies": 0.6119888424873352, "eval_rewards/chosen": -0.6377579569816589, "eval_rewards/margins": 0.1261908859014511, "eval_rewards/rejected": -0.7639487981796265, "eval_runtime": 483.9194, "eval_samples_per_second": 8.894, "eval_steps_per_second": 1.112, "step": 2200 }, { "epoch": 0.38, "grad_norm": 2.53125, "learning_rate": 3.892518820677131e-06, "logits/chosen": -3.1522021293640137, "logits/rejected": -3.148188591003418, "logps/chosen": -148.74594116210938, "logps/rejected": -164.57009887695312, "loss": 0.6301, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.8294495344161987, "rewards/margins": 0.2009027898311615, "rewards/rejected": -1.0303523540496826, "step": 2210 }, { "epoch": 0.38, "grad_norm": 1.90625, "learning_rate": 3.880002715437786e-06, "logits/chosen": -3.1346442699432373, "logits/rejected": -3.1319546699523926, "logps/chosen": -146.25363159179688, "logps/rejected": -164.86203002929688, "loss": 0.6307, "rewards/accuracies": 0.625, "rewards/chosen": -0.8132961392402649, "rewards/margins": 0.2138318568468094, "rewards/rejected": -1.027127981185913, "step": 2220 }, { "epoch": 0.38, "grad_norm": 2.515625, "learning_rate": 3.867436663746622e-06, "logits/chosen": -3.1629092693328857, "logits/rejected": -3.1602606773376465, "logps/chosen": -153.08694458007812, "logps/rejected": -161.30661010742188, "loss": 0.6595, "rewards/accuracies": 0.59375, "rewards/chosen": -0.8856202960014343, "rewards/margins": 0.13737449049949646, "rewards/rejected": -1.0229947566986084, "step": 2230 }, { "epoch": 0.39, "grad_norm": 1.5859375, "learning_rate": 3.854821120406871e-06, "logits/chosen": -3.1770501136779785, "logits/rejected": -3.1756601333618164, "logps/chosen": -144.17184448242188, "logps/rejected": -151.9558868408203, "loss": 0.6589, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.7732978463172913, "rewards/margins": 0.12476921081542969, "rewards/rejected": -0.8980669975280762, "step": 2240 }, { "epoch": 0.39, "grad_norm": 2.828125, "learning_rate": 3.842156542013017e-06, "logits/chosen": -3.1689164638519287, "logits/rejected": -3.1652140617370605, "logps/chosen": -147.66421508789062, "logps/rejected": -162.92446899414062, "loss": 0.6223, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.8171650767326355, "rewards/margins": 0.20014426112174988, "rewards/rejected": -1.017309308052063, "step": 2250 }, { "epoch": 0.39, "grad_norm": 2.296875, "learning_rate": 3.8294433869342695e-06, "logits/chosen": -3.1818199157714844, "logits/rejected": -3.180236339569092, "logps/chosen": -157.19143676757812, "logps/rejected": -165.59632873535156, "loss": 0.663, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.8665980100631714, "rewards/margins": 0.13862961530685425, "rewards/rejected": -1.0052276849746704, "step": 2260 }, { "epoch": 0.39, "grad_norm": 1.9453125, "learning_rate": 3.816682115297976e-06, "logits/chosen": -3.1630165576934814, "logits/rejected": -3.15757417678833, "logps/chosen": -154.84857177734375, "logps/rejected": -168.8209686279297, "loss": 0.6382, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.9062039256095886, "rewards/margins": 0.19149689376354218, "rewards/rejected": -1.097700834274292, "step": 2270 }, { "epoch": 0.39, "grad_norm": 1.8515625, "learning_rate": 3.803873188972966e-06, "logits/chosen": -3.147902011871338, "logits/rejected": -3.142792224884033, "logps/chosen": -151.74461364746094, "logps/rejected": -173.80873107910156, "loss": 0.6186, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.8731371164321899, "rewards/margins": 0.21964311599731445, "rewards/rejected": -1.0927802324295044, "step": 2280 }, { "epoch": 0.39, "grad_norm": 2.46875, "learning_rate": 3.791017071552835e-06, "logits/chosen": -3.0958430767059326, "logits/rejected": -3.090329170227051, "logps/chosen": -159.39187622070312, "logps/rejected": -184.73562622070312, "loss": 0.5988, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.9616400003433228, "rewards/margins": 0.2803899347782135, "rewards/rejected": -1.2420299053192139, "step": 2290 }, { "epoch": 0.4, "grad_norm": 3.0, "learning_rate": 3.778114228339168e-06, "logits/chosen": -3.109088182449341, "logits/rejected": -3.103534698486328, "logps/chosen": -168.2469940185547, "logps/rejected": -192.3583984375, "loss": 0.5912, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.9937500953674316, "rewards/margins": 0.30891233682632446, "rewards/rejected": -1.3026624917984009, "step": 2300 }, { "epoch": 0.4, "eval_logits/chosen": -3.08532452583313, "eval_logits/rejected": -3.081225872039795, "eval_logps/chosen": -161.3302459716797, "eval_logps/rejected": -180.7163543701172, "eval_loss": 0.6485186219215393, "eval_rewards/accuracies": 0.6105948090553284, "eval_rewards/chosen": -0.8992023468017578, "eval_rewards/margins": 0.15689080953598022, "eval_rewards/rejected": -1.0560930967330933, "eval_runtime": 484.0391, "eval_samples_per_second": 8.892, "eval_steps_per_second": 1.111, "step": 2300 }, { "epoch": 0.4, "grad_norm": 3.3125, "learning_rate": 3.7651651263246947e-06, "logits/chosen": -3.0716493129730225, "logits/rejected": -3.066314935684204, "logps/chosen": -170.21990966796875, "logps/rejected": -194.90985107421875, "loss": 0.5975, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.082719326019287, "rewards/margins": 0.280337393283844, "rewards/rejected": -1.3630567789077759, "step": 2310 }, { "epoch": 0.4, "grad_norm": 3.109375, "learning_rate": 3.752170234176392e-06, "logits/chosen": -3.0347900390625, "logits/rejected": -3.029919385910034, "logps/chosen": -185.3545684814453, "logps/rejected": -206.3814239501953, "loss": 0.598, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.171419620513916, "rewards/margins": 0.30066943168640137, "rewards/rejected": -1.4720890522003174, "step": 2320 }, { "epoch": 0.4, "grad_norm": 2.734375, "learning_rate": 3.739130022218519e-06, "logits/chosen": -3.039088487625122, "logits/rejected": -3.0342190265655518, "logps/chosen": -182.79171752929688, "logps/rejected": -212.6605987548828, "loss": 0.5765, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.1558698415756226, "rewards/margins": 0.3727215528488159, "rewards/rejected": -1.5285913944244385, "step": 2330 }, { "epoch": 0.4, "grad_norm": 2.6875, "learning_rate": 3.726044962415595e-06, "logits/chosen": -3.02746844291687, "logits/rejected": -3.0260281562805176, "logps/chosen": -189.47915649414062, "logps/rejected": -212.01712036132812, "loss": 0.6343, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.2687938213348389, "rewards/margins": 0.2157808244228363, "rewards/rejected": -1.484574794769287, "step": 2340 }, { "epoch": 0.4, "grad_norm": 2.625, "learning_rate": 3.712915528355317e-06, "logits/chosen": -3.0346224308013916, "logits/rejected": -3.0251426696777344, "logps/chosen": -183.71942138671875, "logps/rejected": -206.18161010742188, "loss": 0.6177, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.1512013673782349, "rewards/margins": 0.2862662374973297, "rewards/rejected": -1.4374675750732422, "step": 2350 }, { "epoch": 0.41, "grad_norm": 3.21875, "learning_rate": 3.6997421952314223e-06, "logits/chosen": -3.037364959716797, "logits/rejected": -3.032649517059326, "logps/chosen": -178.09678649902344, "logps/rejected": -203.7887420654297, "loss": 0.6128, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.166067361831665, "rewards/margins": 0.2873513996601105, "rewards/rejected": -1.4534189701080322, "step": 2360 }, { "epoch": 0.41, "grad_norm": 2.671875, "learning_rate": 3.686525439826484e-06, "logits/chosen": -3.043933391571045, "logits/rejected": -3.0355417728424072, "logps/chosen": -180.43582153320312, "logps/rejected": -206.72732543945312, "loss": 0.6098, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.1631687879562378, "rewards/margins": 0.28793150186538696, "rewards/rejected": -1.451100468635559, "step": 2370 }, { "epoch": 0.41, "grad_norm": 3.5625, "learning_rate": 3.6732657404946624e-06, "logits/chosen": -3.028273582458496, "logits/rejected": -3.0188450813293457, "logps/chosen": -171.57801818847656, "logps/rejected": -193.34634399414062, "loss": 0.6294, "rewards/accuracies": 0.65625, "rewards/chosen": -1.068125605583191, "rewards/margins": 0.25308340787887573, "rewards/rejected": -1.3212089538574219, "step": 2380 }, { "epoch": 0.41, "grad_norm": 2.453125, "learning_rate": 3.6599635771443844e-06, "logits/chosen": -2.99981427192688, "logits/rejected": -2.993446111679077, "logps/chosen": -193.1945343017578, "logps/rejected": -221.90512084960938, "loss": 0.5843, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.2635782957077026, "rewards/margins": 0.3521956205368042, "rewards/rejected": -1.615774154663086, "step": 2390 }, { "epoch": 0.41, "grad_norm": 2.75, "learning_rate": 3.646619431220978e-06, "logits/chosen": -3.006399631500244, "logits/rejected": -3.0024609565734863, "logps/chosen": -183.29205322265625, "logps/rejected": -209.1800994873047, "loss": 0.6188, "rewards/accuracies": 0.6875, "rewards/chosen": -1.206902265548706, "rewards/margins": 0.27105480432510376, "rewards/rejected": -1.4779571294784546, "step": 2400 }, { "epoch": 0.41, "eval_logits/chosen": -3.0275797843933105, "eval_logits/rejected": -3.021939277648926, "eval_logps/chosen": -171.0099639892578, "eval_logps/rejected": -191.72683715820312, "eval_loss": 0.6487921476364136, "eval_rewards/accuracies": 0.6203531622886658, "eval_rewards/chosen": -0.995999276638031, "eval_rewards/margins": 0.17019876837730408, "eval_rewards/rejected": -1.1661980152130127, "eval_runtime": 483.8998, "eval_samples_per_second": 8.894, "eval_steps_per_second": 1.112, "step": 2400 }, { "epoch": 0.42, "grad_norm": 3.5, "learning_rate": 3.6332337856892475e-06, "logits/chosen": -3.0032505989074707, "logits/rejected": -2.999051809310913, "logps/chosen": -185.5613555908203, "logps/rejected": -204.23507690429688, "loss": 0.6102, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.1981723308563232, "rewards/margins": 0.2738698422908783, "rewards/rejected": -1.4720420837402344, "step": 2410 }, { "epoch": 0.42, "grad_norm": 2.828125, "learning_rate": 3.6198071250159945e-06, "logits/chosen": -3.0181069374084473, "logits/rejected": -3.0125536918640137, "logps/chosen": -188.25917053222656, "logps/rejected": -215.7293701171875, "loss": 0.5964, "rewards/accuracies": 0.71875, "rewards/chosen": -1.2226827144622803, "rewards/margins": 0.310552179813385, "rewards/rejected": -1.53323495388031, "step": 2420 }, { "epoch": 0.42, "grad_norm": 2.5, "learning_rate": 3.6063399351524793e-06, "logits/chosen": -3.036452531814575, "logits/rejected": -3.0326380729675293, "logps/chosen": -180.85311889648438, "logps/rejected": -194.0204315185547, "loss": 0.6539, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.1466847658157349, "rewards/margins": 0.19593380391597748, "rewards/rejected": -1.3426185846328735, "step": 2430 }, { "epoch": 0.42, "grad_norm": 2.234375, "learning_rate": 3.592832703516836e-06, "logits/chosen": -3.077695846557617, "logits/rejected": -3.0692319869995117, "logps/chosen": -168.24078369140625, "logps/rejected": -192.09884643554688, "loss": 0.6049, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.0139938592910767, "rewards/margins": 0.28206855058670044, "rewards/rejected": -1.2960623502731323, "step": 2440 }, { "epoch": 0.42, "grad_norm": 2.40625, "learning_rate": 3.5792859189764335e-06, "logits/chosen": -3.078490972518921, "logits/rejected": -3.0740816593170166, "logps/chosen": -166.83689880371094, "logps/rejected": -182.9640655517578, "loss": 0.6344, "rewards/accuracies": 0.59375, "rewards/chosen": -1.0288734436035156, "rewards/margins": 0.1992349624633789, "rewards/rejected": -1.2281081676483154, "step": 2450 }, { "epoch": 0.42, "grad_norm": 1.984375, "learning_rate": 3.5657000718301765e-06, "logits/chosen": -3.0803191661834717, "logits/rejected": -3.0721633434295654, "logps/chosen": -160.91036987304688, "logps/rejected": -174.44577026367188, "loss": 0.6483, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.9320403337478638, "rewards/margins": 0.20414026081562042, "rewards/rejected": -1.1361806392669678, "step": 2460 }, { "epoch": 0.43, "grad_norm": 2.5, "learning_rate": 3.5520756537907645e-06, "logits/chosen": -3.0954861640930176, "logits/rejected": -3.087409019470215, "logps/chosen": -156.04385375976562, "logps/rejected": -174.39959716796875, "loss": 0.624, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.9000299572944641, "rewards/margins": 0.23049604892730713, "rewards/rejected": -1.1305259466171265, "step": 2470 }, { "epoch": 0.43, "grad_norm": 2.203125, "learning_rate": 3.538413157966893e-06, "logits/chosen": -3.06768536567688, "logits/rejected": -3.0605628490448, "logps/chosen": -163.9343719482422, "logps/rejected": -179.1577911376953, "loss": 0.6316, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9970556497573853, "rewards/margins": 0.21489039063453674, "rewards/rejected": -1.2119461297988892, "step": 2480 }, { "epoch": 0.43, "grad_norm": 2.359375, "learning_rate": 3.5247130788454076e-06, "logits/chosen": -3.056560516357422, "logits/rejected": -3.052340030670166, "logps/chosen": -154.87538146972656, "logps/rejected": -178.25985717773438, "loss": 0.6238, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.9618793725967407, "rewards/margins": 0.2296515256166458, "rewards/rejected": -1.1915308237075806, "step": 2490 }, { "epoch": 0.43, "grad_norm": 2.390625, "learning_rate": 3.510975912273406e-06, "logits/chosen": -3.0538134574890137, "logits/rejected": -3.044461250305176, "logps/chosen": -176.61367797851562, "logps/rejected": -198.43502807617188, "loss": 0.6286, "rewards/accuracies": 0.625, "rewards/chosen": -1.1062344312667847, "rewards/margins": 0.2353288233280182, "rewards/rejected": -1.3415632247924805, "step": 2500 }, { "epoch": 0.43, "eval_logits/chosen": -3.0475451946258545, "eval_logits/rejected": -3.0428457260131836, "eval_logps/chosen": -159.05422973632812, "eval_logps/rejected": -178.43539428710938, "eval_loss": 0.6483173966407776, "eval_rewards/accuracies": 0.6075743436813354, "eval_rewards/chosen": -0.876442015171051, "eval_rewards/margins": 0.15684130787849426, "eval_rewards/rejected": -1.0332833528518677, "eval_runtime": 483.7484, "eval_samples_per_second": 8.897, "eval_steps_per_second": 1.112, "step": 2500 }, { "epoch": 0.43, "grad_norm": 2.140625, "learning_rate": 3.4972021554402924e-06, "logits/chosen": -3.0428929328918457, "logits/rejected": -3.035857677459717, "logps/chosen": -177.7244873046875, "logps/rejected": -201.36776733398438, "loss": 0.6123, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.1452029943466187, "rewards/margins": 0.272835910320282, "rewards/rejected": -1.4180389642715454, "step": 2510 }, { "epoch": 0.43, "grad_norm": 2.515625, "learning_rate": 3.483392306859784e-06, "logits/chosen": -3.020996570587158, "logits/rejected": -3.017580509185791, "logps/chosen": -179.0875244140625, "logps/rejected": -199.23764038085938, "loss": 0.6248, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.1607515811920166, "rewards/margins": 0.25996798276901245, "rewards/rejected": -1.4207196235656738, "step": 2520 }, { "epoch": 0.44, "grad_norm": 2.859375, "learning_rate": 3.469546866351866e-06, "logits/chosen": -3.047424793243408, "logits/rejected": -3.042585611343384, "logps/chosen": -172.9007110595703, "logps/rejected": -190.90298461914062, "loss": 0.6494, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.1134084463119507, "rewards/margins": 0.17701123654842377, "rewards/rejected": -1.2904198169708252, "step": 2530 }, { "epoch": 0.44, "grad_norm": 2.984375, "learning_rate": 3.455666335024701e-06, "logits/chosen": -3.025081157684326, "logits/rejected": -3.0198981761932373, "logps/chosen": -184.22813415527344, "logps/rejected": -208.3061981201172, "loss": 0.6328, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.207715630531311, "rewards/margins": 0.25757837295532227, "rewards/rejected": -1.4652938842773438, "step": 2540 }, { "epoch": 0.44, "grad_norm": 2.765625, "learning_rate": 3.4417512152564976e-06, "logits/chosen": -3.060671806335449, "logits/rejected": -3.0524652004241943, "logps/chosen": -173.49700927734375, "logps/rejected": -191.56179809570312, "loss": 0.6276, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.0308191776275635, "rewards/margins": 0.2632397413253784, "rewards/rejected": -1.2940590381622314, "step": 2550 }, { "epoch": 0.44, "grad_norm": 2.078125, "learning_rate": 3.42780201067732e-06, "logits/chosen": -3.0848729610443115, "logits/rejected": -3.082109212875366, "logps/chosen": -158.57064819335938, "logps/rejected": -175.95895385742188, "loss": 0.6376, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.9289971590042114, "rewards/margins": 0.20508842170238495, "rewards/rejected": -1.1340854167938232, "step": 2560 }, { "epoch": 0.44, "grad_norm": 1.9609375, "learning_rate": 3.413819226150868e-06, "logits/chosen": -3.0963122844696045, "logits/rejected": -3.0910143852233887, "logps/chosen": -162.51937866210938, "logps/rejected": -180.8915252685547, "loss": 0.6246, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.9311281442642212, "rewards/margins": 0.22899818420410156, "rewards/rejected": -1.1601263284683228, "step": 2570 }, { "epoch": 0.44, "grad_norm": 2.734375, "learning_rate": 3.399803367756198e-06, "logits/chosen": -3.0756285190582275, "logits/rejected": -3.0704190731048584, "logps/chosen": -166.87071228027344, "logps/rejected": -179.3004150390625, "loss": 0.6574, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.0303601026535034, "rewards/margins": 0.15104451775550842, "rewards/rejected": -1.1814045906066895, "step": 2580 }, { "epoch": 0.45, "grad_norm": 2.546875, "learning_rate": 3.3857549427694114e-06, "logits/chosen": -3.0996594429016113, "logits/rejected": -3.092259168624878, "logps/chosen": -151.70034790039062, "logps/rejected": -163.020263671875, "loss": 0.626, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8546916842460632, "rewards/margins": 0.19577807188034058, "rewards/rejected": -1.0504697561264038, "step": 2590 }, { "epoch": 0.45, "grad_norm": 2.984375, "learning_rate": 3.3716744596452918e-06, "logits/chosen": -3.0825467109680176, "logits/rejected": -3.0753118991851807, "logps/chosen": -159.81790161132812, "logps/rejected": -173.5633087158203, "loss": 0.61, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.9154074788093567, "rewards/margins": 0.25420355796813965, "rewards/rejected": -1.1696109771728516, "step": 2600 }, { "epoch": 0.45, "eval_logits/chosen": -3.080397605895996, "eval_logits/rejected": -3.076663017272949, "eval_logps/chosen": -145.68942260742188, "eval_logps/rejected": -162.40744018554688, "eval_loss": 0.6531721949577332, "eval_rewards/accuracies": 0.6017658114433289, "eval_rewards/chosen": -0.7427940368652344, "eval_rewards/margins": 0.13020974397659302, "eval_rewards/rejected": -0.8730038404464722, "eval_runtime": 483.7174, "eval_samples_per_second": 8.898, "eval_steps_per_second": 1.112, "step": 2600 }, { "epoch": 0.45, "grad_norm": 2.28125, "learning_rate": 3.3575624279989017e-06, "logits/chosen": -3.0729708671569824, "logits/rejected": -3.0670769214630127, "logps/chosen": -161.18968200683594, "logps/rejected": -174.8655548095703, "loss": 0.6313, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.9309514760971069, "rewards/margins": 0.21913857758045197, "rewards/rejected": -1.1500900983810425, "step": 2610 }, { "epoch": 0.45, "grad_norm": 2.359375, "learning_rate": 3.3434193585871405e-06, "logits/chosen": -3.0856680870056152, "logits/rejected": -3.0778071880340576, "logps/chosen": -156.5973663330078, "logps/rejected": -180.6645965576172, "loss": 0.6021, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.949898898601532, "rewards/margins": 0.2775116562843323, "rewards/rejected": -1.2274105548858643, "step": 2620 }, { "epoch": 0.45, "grad_norm": 2.25, "learning_rate": 3.3292457632902603e-06, "logits/chosen": -3.070791721343994, "logits/rejected": -3.0645086765289307, "logps/chosen": -163.17092895507812, "logps/rejected": -185.933837890625, "loss": 0.6115, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9745320081710815, "rewards/margins": 0.27206122875213623, "rewards/rejected": -1.2465932369232178, "step": 2630 }, { "epoch": 0.45, "grad_norm": 2.890625, "learning_rate": 3.315042155093334e-06, "logits/chosen": -3.0559096336364746, "logits/rejected": -3.048644781112671, "logps/chosen": -165.0961151123047, "logps/rejected": -187.40628051757812, "loss": 0.6092, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.0381942987442017, "rewards/margins": 0.2644815742969513, "rewards/rejected": -1.3026758432388306, "step": 2640 }, { "epoch": 0.46, "grad_norm": 2.40625, "learning_rate": 3.300809048067692e-06, "logits/chosen": -3.0413713455200195, "logits/rejected": -3.0334982872009277, "logps/chosen": -174.01637268066406, "logps/rejected": -196.48086547851562, "loss": 0.6335, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.0985610485076904, "rewards/margins": 0.24876561760902405, "rewards/rejected": -1.3473265171051025, "step": 2650 }, { "epoch": 0.46, "grad_norm": 3.1875, "learning_rate": 3.2865469573523163e-06, "logits/chosen": -3.069272756576538, "logits/rejected": -3.0634963512420654, "logps/chosen": -175.96026611328125, "logps/rejected": -190.06216430664062, "loss": 0.6375, "rewards/accuracies": 0.625, "rewards/chosen": -1.0801150798797607, "rewards/margins": 0.22055652737617493, "rewards/rejected": -1.3006716966629028, "step": 2660 }, { "epoch": 0.46, "grad_norm": 2.15625, "learning_rate": 3.2722563991351965e-06, "logits/chosen": -3.0801496505737305, "logits/rejected": -3.0725297927856445, "logps/chosen": -162.62802124023438, "logps/rejected": -181.03094482421875, "loss": 0.6374, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.9848538637161255, "rewards/margins": 0.22435028851032257, "rewards/rejected": -1.209204077720642, "step": 2670 }, { "epoch": 0.46, "grad_norm": 1.984375, "learning_rate": 3.2579378906346464e-06, "logits/chosen": -3.118258237838745, "logits/rejected": -3.1177947521209717, "logps/chosen": -155.3919219970703, "logps/rejected": -165.44732666015625, "loss": 0.6409, "rewards/accuracies": 0.625, "rewards/chosen": -0.8578490018844604, "rewards/margins": 0.18157216906547546, "rewards/rejected": -1.0394213199615479, "step": 2680 }, { "epoch": 0.46, "grad_norm": 1.859375, "learning_rate": 3.243591950080584e-06, "logits/chosen": -3.1438679695129395, "logits/rejected": -3.1390693187713623, "logps/chosen": -140.6311798095703, "logps/rejected": -160.849853515625, "loss": 0.607, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.8011075854301453, "rewards/margins": 0.2374686896800995, "rewards/rejected": -1.038576364517212, "step": 2690 }, { "epoch": 0.47, "grad_norm": 2.3125, "learning_rate": 3.2292190966957776e-06, "logits/chosen": -3.122014045715332, "logits/rejected": -3.1181957721710205, "logps/chosen": -149.11021423339844, "logps/rejected": -168.99923706054688, "loss": 0.6295, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.8584330677986145, "rewards/margins": 0.22035415470600128, "rewards/rejected": -1.0787872076034546, "step": 2700 }, { "epoch": 0.47, "eval_logits/chosen": -3.1114442348480225, "eval_logits/rejected": -3.107997179031372, "eval_logps/chosen": -139.27479553222656, "eval_logps/rejected": -155.93223571777344, "eval_loss": 0.6526122689247131, "eval_rewards/accuracies": 0.6138476133346558, "eval_rewards/chosen": -0.6786475777626038, "eval_rewards/margins": 0.1296042650938034, "eval_rewards/rejected": -0.8082518577575684, "eval_runtime": 483.8293, "eval_samples_per_second": 8.896, "eval_steps_per_second": 1.112, "step": 2700 }, { "epoch": 0.47, "grad_norm": 2.46875, "learning_rate": 3.21481985067705e-06, "logits/chosen": -3.1047282218933105, "logits/rejected": -3.1033012866973877, "logps/chosen": -157.62673950195312, "logps/rejected": -169.35943603515625, "loss": 0.6389, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.8627079129219055, "rewards/margins": 0.18570269644260406, "rewards/rejected": -1.0484106540679932, "step": 2710 }, { "epoch": 0.47, "grad_norm": 2.8125, "learning_rate": 3.200394733176454e-06, "logits/chosen": -3.1050503253936768, "logits/rejected": -3.1030993461608887, "logps/chosen": -149.4019775390625, "logps/rejected": -173.2121124267578, "loss": 0.6179, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.8642409443855286, "rewards/margins": 0.24404600262641907, "rewards/rejected": -1.1082870960235596, "step": 2720 }, { "epoch": 0.47, "grad_norm": 3.171875, "learning_rate": 3.1859442662824085e-06, "logits/chosen": -3.099372386932373, "logits/rejected": -3.094510555267334, "logps/chosen": -160.30758666992188, "logps/rejected": -175.3297576904297, "loss": 0.6395, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.9080232381820679, "rewards/margins": 0.2210191935300827, "rewards/rejected": -1.1290425062179565, "step": 2730 }, { "epoch": 0.47, "grad_norm": 2.328125, "learning_rate": 3.1714689730008043e-06, "logits/chosen": -3.11287522315979, "logits/rejected": -3.109023332595825, "logps/chosen": -150.24667358398438, "logps/rejected": -162.3335723876953, "loss": 0.6471, "rewards/accuracies": 0.59375, "rewards/chosen": -0.8636587262153625, "rewards/margins": 0.18024428188800812, "rewards/rejected": -1.043903112411499, "step": 2740 }, { "epoch": 0.47, "grad_norm": 2.03125, "learning_rate": 3.156969377236072e-06, "logits/chosen": -3.104238748550415, "logits/rejected": -3.0955731868743896, "logps/chosen": -141.8778839111328, "logps/rejected": -168.66220092773438, "loss": 0.6094, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.8233487010002136, "rewards/margins": 0.2567684054374695, "rewards/rejected": -1.080117106437683, "step": 2750 }, { "epoch": 0.48, "grad_norm": 3.125, "learning_rate": 3.1424460037722237e-06, "logits/chosen": -3.0943779945373535, "logits/rejected": -3.089038133621216, "logps/chosen": -144.514404296875, "logps/rejected": -164.00999450683594, "loss": 0.6297, "rewards/accuracies": 0.625, "rewards/chosen": -0.7929133176803589, "rewards/margins": 0.21680936217308044, "rewards/rejected": -1.0097228288650513, "step": 2760 }, { "epoch": 0.48, "grad_norm": 2.5, "learning_rate": 3.127899378253858e-06, "logits/chosen": -3.110429525375366, "logits/rejected": -3.1051766872406006, "logps/chosen": -156.11631774902344, "logps/rejected": -171.1414031982422, "loss": 0.6267, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.8985657691955566, "rewards/margins": 0.20263083279132843, "rewards/rejected": -1.1011966466903687, "step": 2770 }, { "epoch": 0.48, "grad_norm": 2.234375, "learning_rate": 3.1133300271671354e-06, "logits/chosen": -3.076634168624878, "logits/rejected": -3.069157600402832, "logps/chosen": -164.27456665039062, "logps/rejected": -182.2312469482422, "loss": 0.6262, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.0159214735031128, "rewards/margins": 0.22290892899036407, "rewards/rejected": -1.238830327987671, "step": 2780 }, { "epoch": 0.48, "grad_norm": 2.328125, "learning_rate": 3.0987384778207218e-06, "logits/chosen": -3.0719902515411377, "logits/rejected": -3.0675671100616455, "logps/chosen": -160.24789428710938, "logps/rejected": -180.9109649658203, "loss": 0.6101, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.9702569246292114, "rewards/margins": 0.2556365132331848, "rewards/rejected": -1.225893259048462, "step": 2790 }, { "epoch": 0.48, "grad_norm": 2.40625, "learning_rate": 3.0841252583267067e-06, "logits/chosen": -3.0815820693969727, "logits/rejected": -3.0771849155426025, "logps/chosen": -179.7408447265625, "logps/rejected": -193.85435485839844, "loss": 0.6504, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.1009565591812134, "rewards/margins": 0.19542057812213898, "rewards/rejected": -1.2963770627975464, "step": 2800 }, { "epoch": 0.48, "eval_logits/chosen": -3.0914981365203857, "eval_logits/rejected": -3.087722063064575, "eval_logps/chosen": -149.51148986816406, "eval_logps/rejected": -167.5323486328125, "eval_loss": 0.651041567325592, "eval_rewards/accuracies": 0.6105948090553284, "eval_rewards/chosen": -0.7810146808624268, "eval_rewards/margins": 0.14323832094669342, "eval_rewards/rejected": -0.924252986907959, "eval_runtime": 483.9389, "eval_samples_per_second": 8.894, "eval_steps_per_second": 1.112, "step": 2800 }, { "epoch": 0.48, "grad_norm": 2.46875, "learning_rate": 3.069490897581486e-06, "logits/chosen": -3.0918047428131104, "logits/rejected": -3.0865800380706787, "logps/chosen": -160.14015197753906, "logps/rejected": -187.4204864501953, "loss": 0.5971, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.933466911315918, "rewards/margins": 0.29498928785324097, "rewards/rejected": -1.2284562587738037, "step": 2810 }, { "epoch": 0.49, "grad_norm": 2.671875, "learning_rate": 3.054835925246622e-06, "logits/chosen": -3.0843358039855957, "logits/rejected": -3.080287218093872, "logps/chosen": -170.4581756591797, "logps/rejected": -182.52093505859375, "loss": 0.655, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.0522292852401733, "rewards/margins": 0.16928645968437195, "rewards/rejected": -1.2215157747268677, "step": 2820 }, { "epoch": 0.49, "grad_norm": 2.28125, "learning_rate": 3.040160871729672e-06, "logits/chosen": -3.0598201751708984, "logits/rejected": -3.0533511638641357, "logps/chosen": -169.25344848632812, "logps/rejected": -193.43211364746094, "loss": 0.6093, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.0220553874969482, "rewards/margins": 0.2788383364677429, "rewards/rejected": -1.3008936643600464, "step": 2830 }, { "epoch": 0.49, "grad_norm": 2.4375, "learning_rate": 3.025466268164992e-06, "logits/chosen": -3.083550453186035, "logits/rejected": -3.0787971019744873, "logps/chosen": -168.7285614013672, "logps/rejected": -182.9897918701172, "loss": 0.6431, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.045924186706543, "rewards/margins": 0.1945144236087799, "rewards/rejected": -1.24043869972229, "step": 2840 }, { "epoch": 0.49, "grad_norm": 3.671875, "learning_rate": 3.0107526463945124e-06, "logits/chosen": -3.0826773643493652, "logits/rejected": -3.077467679977417, "logps/chosen": -163.4428253173828, "logps/rejected": -191.730712890625, "loss": 0.6031, "rewards/accuracies": 0.65625, "rewards/chosen": -1.003996729850769, "rewards/margins": 0.2894473969936371, "rewards/rejected": -1.293444275856018, "step": 2850 }, { "epoch": 0.49, "grad_norm": 3.21875, "learning_rate": 2.9960205389484918e-06, "logits/chosen": -3.085514545440674, "logits/rejected": -3.0808959007263184, "logps/chosen": -165.62918090820312, "logps/rejected": -186.27366638183594, "loss": 0.6111, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0085035562515259, "rewards/margins": 0.25401344895362854, "rewards/rejected": -1.262516975402832, "step": 2860 }, { "epoch": 0.49, "grad_norm": 2.78125, "learning_rate": 2.981270479026239e-06, "logits/chosen": -3.0913071632385254, "logits/rejected": -3.0883071422576904, "logps/chosen": -173.51954650878906, "logps/rejected": -189.17471313476562, "loss": 0.6285, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.0634653568267822, "rewards/margins": 0.22544582188129425, "rewards/rejected": -1.28891122341156, "step": 2870 }, { "epoch": 0.5, "grad_norm": 2.6875, "learning_rate": 2.9665030004768158e-06, "logits/chosen": -3.0986485481262207, "logits/rejected": -3.089444637298584, "logps/chosen": -169.74688720703125, "logps/rejected": -188.04348754882812, "loss": 0.6236, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.0375983715057373, "rewards/margins": 0.25352948904037476, "rewards/rejected": -1.2911279201507568, "step": 2880 }, { "epoch": 0.5, "grad_norm": 3.09375, "learning_rate": 2.9517186377797203e-06, "logits/chosen": -3.0926268100738525, "logits/rejected": -3.0881187915802, "logps/chosen": -161.2325439453125, "logps/rejected": -184.8660125732422, "loss": 0.6159, "rewards/accuracies": 0.65625, "rewards/chosen": -0.9785183072090149, "rewards/margins": 0.25898051261901855, "rewards/rejected": -1.2374987602233887, "step": 2890 }, { "epoch": 0.5, "grad_norm": 2.609375, "learning_rate": 2.936917926025536e-06, "logits/chosen": -3.0904502868652344, "logits/rejected": -3.0861709117889404, "logps/chosen": -155.18263244628906, "logps/rejected": -175.84353637695312, "loss": 0.6226, "rewards/accuracies": 0.65625, "rewards/chosen": -0.919967532157898, "rewards/margins": 0.2332448959350586, "rewards/rejected": -1.153212547302246, "step": 2900 }, { "epoch": 0.5, "eval_logits/chosen": -3.086979389190674, "eval_logits/rejected": -3.0831146240234375, "eval_logps/chosen": -147.7837371826172, "eval_logps/rejected": -165.611572265625, "eval_loss": 0.6512519121170044, "eval_rewards/accuracies": 0.6126858592033386, "eval_rewards/chosen": -0.763737142086029, "eval_rewards/margins": 0.14130805432796478, "eval_rewards/rejected": -0.905045211315155, "eval_runtime": 483.9421, "eval_samples_per_second": 8.894, "eval_steps_per_second": 1.112, "step": 2900 }, { "epoch": 0.5, "grad_norm": 2.484375, "learning_rate": 2.9221014008965686e-06, "logits/chosen": -3.097363233566284, "logits/rejected": -3.0895471572875977, "logps/chosen": -167.21290588378906, "logps/rejected": -197.3682861328125, "loss": 0.5858, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.0012085437774658, "rewards/margins": 0.3309888243675232, "rewards/rejected": -1.3321974277496338, "step": 2910 }, { "epoch": 0.5, "grad_norm": 3.015625, "learning_rate": 2.907269598647457e-06, "logits/chosen": -3.0325138568878174, "logits/rejected": -3.0270192623138428, "logps/chosen": -183.08514404296875, "logps/rejected": -214.2899627685547, "loss": 0.5986, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.1878725290298462, "rewards/margins": 0.3359866142272949, "rewards/rejected": -1.5238590240478516, "step": 2920 }, { "epoch": 0.5, "grad_norm": 2.78125, "learning_rate": 2.8924230560857657e-06, "logits/chosen": -3.0343759059906006, "logits/rejected": -3.0279102325439453, "logps/chosen": -178.95266723632812, "logps/rejected": -201.8841094970703, "loss": 0.6083, "rewards/accuracies": 0.65625, "rewards/chosen": -1.150267243385315, "rewards/margins": 0.2827618718147278, "rewards/rejected": -1.4330291748046875, "step": 2930 }, { "epoch": 0.51, "grad_norm": 2.5625, "learning_rate": 2.8775623105525557e-06, "logits/chosen": -3.0594565868377686, "logits/rejected": -3.0583560466766357, "logps/chosen": -169.06393432617188, "logps/rejected": -190.04153442382812, "loss": 0.6225, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.0741300582885742, "rewards/margins": 0.2394881695508957, "rewards/rejected": -1.3136181831359863, "step": 2940 }, { "epoch": 0.51, "grad_norm": 3.15625, "learning_rate": 2.8626878999029354e-06, "logits/chosen": -3.0534815788269043, "logits/rejected": -3.048189878463745, "logps/chosen": -179.89453125, "logps/rejected": -201.58236694335938, "loss": 0.6313, "rewards/accuracies": 0.6875, "rewards/chosen": -1.145120620727539, "rewards/margins": 0.24250420928001404, "rewards/rejected": -1.387624979019165, "step": 2950 }, { "epoch": 0.51, "grad_norm": 2.453125, "learning_rate": 2.847800362486596e-06, "logits/chosen": -3.055997848510742, "logits/rejected": -3.0456926822662354, "logps/chosen": -172.91397094726562, "logps/rejected": -202.24942016601562, "loss": 0.5928, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.0813416242599487, "rewards/margins": 0.3385675549507141, "rewards/rejected": -1.4199092388153076, "step": 2960 }, { "epoch": 0.51, "grad_norm": 3.125, "learning_rate": 2.832900237128325e-06, "logits/chosen": -3.0329508781433105, "logits/rejected": -3.029247760772705, "logps/chosen": -182.95108032226562, "logps/rejected": -201.630126953125, "loss": 0.6314, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.1779873371124268, "rewards/margins": 0.2388262301683426, "rewards/rejected": -1.416813611984253, "step": 2970 }, { "epoch": 0.51, "grad_norm": 3.640625, "learning_rate": 2.8179880631085053e-06, "logits/chosen": -3.0308852195739746, "logits/rejected": -3.022143840789795, "logps/chosen": -176.36842346191406, "logps/rejected": -202.55227661132812, "loss": 0.603, "rewards/accuracies": 0.625, "rewards/chosen": -1.1120197772979736, "rewards/margins": 0.31395381689071655, "rewards/rejected": -1.4259734153747559, "step": 2980 }, { "epoch": 0.52, "grad_norm": 3.015625, "learning_rate": 2.803064380143598e-06, "logits/chosen": -3.0388948917388916, "logits/rejected": -3.037651538848877, "logps/chosen": -189.60171508789062, "logps/rejected": -203.8285675048828, "loss": 0.6495, "rewards/accuracies": 0.625, "rewards/chosen": -1.2227863073349, "rewards/margins": 0.19194839894771576, "rewards/rejected": -1.4147346019744873, "step": 2990 }, { "epoch": 0.52, "grad_norm": 2.5625, "learning_rate": 2.7881297283666063e-06, "logits/chosen": -3.08270001411438, "logits/rejected": -3.074023723602295, "logps/chosen": -163.1550750732422, "logps/rejected": -188.59869384765625, "loss": 0.6226, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.0088860988616943, "rewards/margins": 0.24955037236213684, "rewards/rejected": -1.2584365606307983, "step": 3000 }, { "epoch": 0.52, "eval_logits/chosen": -3.09553599357605, "eval_logits/rejected": -3.0915791988372803, "eval_logps/chosen": -145.16188049316406, "eval_logps/rejected": -163.44444274902344, "eval_loss": 0.6493727564811707, "eval_rewards/accuracies": 0.6078066825866699, "eval_rewards/chosen": -0.7375187873840332, "eval_rewards/margins": 0.14585508406162262, "eval_rewards/rejected": -0.8833737969398499, "eval_runtime": 483.7912, "eval_samples_per_second": 8.896, "eval_steps_per_second": 1.112, "step": 3000 }, { "epoch": 0.52, "grad_norm": 2.359375, "learning_rate": 2.77318464830753e-06, "logits/chosen": -3.101335287094116, "logits/rejected": -3.0958926677703857, "logps/chosen": -160.12254333496094, "logps/rejected": -175.15676879882812, "loss": 0.6403, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.946562647819519, "rewards/margins": 0.2003302127122879, "rewards/rejected": -1.146892786026001, "step": 3010 }, { "epoch": 0.52, "grad_norm": 2.09375, "learning_rate": 2.7582296808737964e-06, "logits/chosen": -3.123887300491333, "logits/rejected": -3.1185977458953857, "logps/chosen": -159.97726440429688, "logps/rejected": -175.16766357421875, "loss": 0.6336, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.9519734382629395, "rewards/margins": 0.23250770568847656, "rewards/rejected": -1.184481143951416, "step": 3020 }, { "epoch": 0.52, "grad_norm": 2.25, "learning_rate": 2.7432653673306896e-06, "logits/chosen": -3.1237471103668213, "logits/rejected": -3.119204521179199, "logps/chosen": -151.10260009765625, "logps/rejected": -166.8441162109375, "loss": 0.6358, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.8556004762649536, "rewards/margins": 0.18172724545001984, "rewards/rejected": -1.0373276472091675, "step": 3030 }, { "epoch": 0.52, "grad_norm": 1.859375, "learning_rate": 2.7282922492817565e-06, "logits/chosen": -3.132171154022217, "logits/rejected": -3.1270885467529297, "logps/chosen": -147.31736755371094, "logps/rejected": -174.3421173095703, "loss": 0.6077, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8648845553398132, "rewards/margins": 0.26659253239631653, "rewards/rejected": -1.1314771175384521, "step": 3040 }, { "epoch": 0.53, "grad_norm": 2.734375, "learning_rate": 2.7133108686492054e-06, "logits/chosen": -3.099966049194336, "logits/rejected": -3.1003365516662598, "logps/chosen": -150.30726623535156, "logps/rejected": -169.76614379882812, "loss": 0.6307, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8589240908622742, "rewards/margins": 0.1988839954137802, "rewards/rejected": -1.0578080415725708, "step": 3050 }, { "epoch": 0.53, "grad_norm": 2.421875, "learning_rate": 2.6983217676542927e-06, "logits/chosen": -3.0891611576080322, "logits/rejected": -3.0840840339660645, "logps/chosen": -158.2369384765625, "logps/rejected": -178.24795532226562, "loss": 0.6191, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.918899655342102, "rewards/margins": 0.23263654112815857, "rewards/rejected": -1.151536226272583, "step": 3060 }, { "epoch": 0.53, "grad_norm": 2.609375, "learning_rate": 2.6833254887976974e-06, "logits/chosen": -3.0840859413146973, "logits/rejected": -3.076413631439209, "logps/chosen": -166.41143798828125, "logps/rejected": -182.2546844482422, "loss": 0.6218, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9795713424682617, "rewards/margins": 0.2302292138338089, "rewards/rejected": -1.2098004817962646, "step": 3070 }, { "epoch": 0.53, "grad_norm": 2.421875, "learning_rate": 2.6683225748398877e-06, "logits/chosen": -3.07651424407959, "logits/rejected": -3.067636013031006, "logps/chosen": -164.54656982421875, "logps/rejected": -189.8894805908203, "loss": 0.6297, "rewards/accuracies": 0.625, "rewards/chosen": -1.047585129737854, "rewards/margins": 0.2406575232744217, "rewards/rejected": -1.2882425785064697, "step": 3080 }, { "epoch": 0.53, "grad_norm": 2.796875, "learning_rate": 2.6533135687814753e-06, "logits/chosen": -3.0946550369262695, "logits/rejected": -3.089721202850342, "logps/chosen": -162.77565002441406, "logps/rejected": -187.4752960205078, "loss": 0.5985, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.9805480241775513, "rewards/margins": 0.2887071967124939, "rewards/rejected": -1.26925528049469, "step": 3090 }, { "epoch": 0.53, "grad_norm": 3.3125, "learning_rate": 2.638299013843564e-06, "logits/chosen": -3.072727918624878, "logits/rejected": -3.063985824584961, "logps/chosen": -169.7478790283203, "logps/rejected": -192.89866638183594, "loss": 0.6062, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.0324430465698242, "rewards/margins": 0.2917777895927429, "rewards/rejected": -1.324220895767212, "step": 3100 }, { "epoch": 0.53, "eval_logits/chosen": -3.094945192337036, "eval_logits/rejected": -3.090569257736206, "eval_logps/chosen": -149.33984375, "eval_logps/rejected": -168.2215118408203, "eval_loss": 0.6485457420349121, "eval_rewards/accuracies": 0.6129181981086731, "eval_rewards/chosen": -0.7792982459068298, "eval_rewards/margins": 0.15184608101844788, "eval_rewards/rejected": -0.9311443567276001, "eval_runtime": 483.8725, "eval_samples_per_second": 8.895, "eval_steps_per_second": 1.112, "step": 3100 }, { "epoch": 0.54, "grad_norm": 2.703125, "learning_rate": 2.6232794534480866e-06, "logits/chosen": -3.0856406688690186, "logits/rejected": -3.0835537910461426, "logps/chosen": -166.58815002441406, "logps/rejected": -190.39205932617188, "loss": 0.6329, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.0585448741912842, "rewards/margins": 0.20964722335338593, "rewards/rejected": -1.268192172050476, "step": 3110 }, { "epoch": 0.54, "grad_norm": 2.234375, "learning_rate": 2.6082554311981425e-06, "logits/chosen": -3.0849928855895996, "logits/rejected": -3.0772414207458496, "logps/chosen": -164.77818298339844, "logps/rejected": -188.58157348632812, "loss": 0.6049, "rewards/accuracies": 0.65625, "rewards/chosen": -0.9779126048088074, "rewards/margins": 0.27373841404914856, "rewards/rejected": -1.2516510486602783, "step": 3120 }, { "epoch": 0.54, "grad_norm": 3.34375, "learning_rate": 2.5932274908583146e-06, "logits/chosen": -3.06742787361145, "logits/rejected": -3.0598721504211426, "logps/chosen": -164.9550018310547, "logps/rejected": -193.75918579101562, "loss": 0.6183, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.0295908451080322, "rewards/margins": 0.2868326008319855, "rewards/rejected": -1.3164234161376953, "step": 3130 }, { "epoch": 0.54, "grad_norm": 2.703125, "learning_rate": 2.578196176334995e-06, "logits/chosen": -3.0695323944091797, "logits/rejected": -3.0637080669403076, "logps/chosen": -174.0780029296875, "logps/rejected": -204.63009643554688, "loss": 0.6004, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.1191692352294922, "rewards/margins": 0.316087931394577, "rewards/rejected": -1.4352571964263916, "step": 3140 }, { "epoch": 0.54, "grad_norm": 2.78125, "learning_rate": 2.5631620316566986e-06, "logits/chosen": -3.055833578109741, "logits/rejected": -3.0524821281433105, "logps/chosen": -175.0342559814453, "logps/rejected": -193.7977752685547, "loss": 0.6291, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.0929839611053467, "rewards/margins": 0.2319747656583786, "rewards/rejected": -1.3249588012695312, "step": 3150 }, { "epoch": 0.54, "grad_norm": 2.125, "learning_rate": 2.548125600954371e-06, "logits/chosen": -3.03719162940979, "logits/rejected": -3.03302264213562, "logps/chosen": -174.88192749023438, "logps/rejected": -193.9909210205078, "loss": 0.6338, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.0910463333129883, "rewards/margins": 0.24616487324237823, "rewards/rejected": -1.3372113704681396, "step": 3160 }, { "epoch": 0.55, "grad_norm": 2.265625, "learning_rate": 2.5330874284416956e-06, "logits/chosen": -3.0866336822509766, "logits/rejected": -3.0766632556915283, "logps/chosen": -174.22366333007812, "logps/rejected": -191.46463012695312, "loss": 0.6076, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.0721338987350464, "rewards/margins": 0.2831415832042694, "rewards/rejected": -1.3552755117416382, "step": 3170 }, { "epoch": 0.55, "grad_norm": 2.609375, "learning_rate": 2.5180480583953974e-06, "logits/chosen": -3.0933799743652344, "logits/rejected": -3.0862841606140137, "logps/chosen": -169.85255432128906, "logps/rejected": -192.44534301757812, "loss": 0.6166, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.0625982284545898, "rewards/margins": 0.27071231603622437, "rewards/rejected": -1.333310604095459, "step": 3180 }, { "epoch": 0.55, "grad_norm": 2.34375, "learning_rate": 2.5030080351355452e-06, "logits/chosen": -3.0938119888305664, "logits/rejected": -3.0877671241760254, "logps/chosen": -157.56088256835938, "logps/rejected": -191.95864868164062, "loss": 0.5895, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.9385620951652527, "rewards/margins": 0.3306645452976227, "rewards/rejected": -1.2692269086837769, "step": 3190 }, { "epoch": 0.55, "grad_norm": 2.109375, "learning_rate": 2.4879679030058478e-06, "logits/chosen": -3.085869789123535, "logits/rejected": -3.0796637535095215, "logps/chosen": -163.99746704101562, "logps/rejected": -186.37850952148438, "loss": 0.6071, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.9612120389938354, "rewards/margins": 0.29425540566444397, "rewards/rejected": -1.2554675340652466, "step": 3200 }, { "epoch": 0.55, "eval_logits/chosen": -3.095604658126831, "eval_logits/rejected": -3.0910513401031494, "eval_logps/chosen": -151.8241729736328, "eval_logps/rejected": -170.87753295898438, "eval_loss": 0.6476815342903137, "eval_rewards/accuracies": 0.6117565035820007, "eval_rewards/chosen": -0.8041415214538574, "eval_rewards/margins": 0.1535632312297821, "eval_rewards/rejected": -0.9577047824859619, "eval_runtime": 483.7242, "eval_samples_per_second": 8.898, "eval_steps_per_second": 1.112, "step": 3200 }, { "epoch": 0.55, "grad_norm": 2.65625, "learning_rate": 2.472928206353955e-06, "logits/chosen": -3.0591721534729004, "logits/rejected": -3.050657033920288, "logps/chosen": -166.25405883789062, "logps/rejected": -189.50100708007812, "loss": 0.6049, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0175381898880005, "rewards/margins": 0.27071613073349, "rewards/rejected": -1.2882544994354248, "step": 3210 }, { "epoch": 0.55, "grad_norm": 2.8125, "learning_rate": 2.4578894895117554e-06, "logits/chosen": -3.061354637145996, "logits/rejected": -3.0563549995422363, "logps/chosen": -160.94094848632812, "logps/rejected": -190.11013793945312, "loss": 0.6035, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.00341796875, "rewards/margins": 0.2906709313392639, "rewards/rejected": -1.2940889596939087, "step": 3220 }, { "epoch": 0.56, "grad_norm": 3.515625, "learning_rate": 2.442852296775674e-06, "logits/chosen": -3.0528101921081543, "logits/rejected": -3.048557758331299, "logps/chosen": -169.01211547851562, "logps/rejected": -195.78076171875, "loss": 0.6241, "rewards/accuracies": 0.65625, "rewards/chosen": -1.051938533782959, "rewards/margins": 0.26093941926956177, "rewards/rejected": -1.3128780126571655, "step": 3230 }, { "epoch": 0.56, "grad_norm": 3.09375, "learning_rate": 2.427817172386977e-06, "logits/chosen": -3.0747134685516357, "logits/rejected": -3.0686464309692383, "logps/chosen": -178.54055786132812, "logps/rejected": -193.69073486328125, "loss": 0.6366, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.1196863651275635, "rewards/margins": 0.2184637039899826, "rewards/rejected": -1.338149905204773, "step": 3240 }, { "epoch": 0.56, "grad_norm": 2.78125, "learning_rate": 2.412784660512068e-06, "logits/chosen": -3.0726211071014404, "logits/rejected": -3.0671470165252686, "logps/chosen": -165.47073364257812, "logps/rejected": -186.94058227539062, "loss": 0.6318, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.0234571695327759, "rewards/margins": 0.23053304851055145, "rewards/rejected": -1.2539902925491333, "step": 3250 }, { "epoch": 0.56, "grad_norm": 2.390625, "learning_rate": 2.397755305222797e-06, "logits/chosen": -3.0731208324432373, "logits/rejected": -3.0661895275115967, "logps/chosen": -158.9102325439453, "logps/rejected": -184.57321166992188, "loss": 0.6192, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.9673464894294739, "rewards/margins": 0.28494155406951904, "rewards/rejected": -1.2522878646850586, "step": 3260 }, { "epoch": 0.56, "grad_norm": 1.9765625, "learning_rate": 2.3827296504767667e-06, "logits/chosen": -3.0933609008789062, "logits/rejected": -3.083542823791504, "logps/chosen": -169.42149353027344, "logps/rejected": -194.79824829101562, "loss": 0.6218, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.043731927871704, "rewards/margins": 0.2579565644264221, "rewards/rejected": -1.3016884326934814, "step": 3270 }, { "epoch": 0.57, "grad_norm": 2.921875, "learning_rate": 2.3677082400976473e-06, "logits/chosen": -3.07395601272583, "logits/rejected": -3.0692670345306396, "logps/chosen": -164.40283203125, "logps/rejected": -188.9150848388672, "loss": 0.6246, "rewards/accuracies": 0.6875, "rewards/chosen": -1.044610619544983, "rewards/margins": 0.2366691380739212, "rewards/rejected": -1.2812796831130981, "step": 3280 }, { "epoch": 0.57, "grad_norm": 2.9375, "learning_rate": 2.352691617755492e-06, "logits/chosen": -3.0645198822021484, "logits/rejected": -3.052039384841919, "logps/chosen": -170.14920043945312, "logps/rejected": -200.637451171875, "loss": 0.5968, "rewards/accuracies": 0.71875, "rewards/chosen": -1.0634241104125977, "rewards/margins": 0.31513866782188416, "rewards/rejected": -1.3785628080368042, "step": 3290 }, { "epoch": 0.57, "grad_norm": 3.671875, "learning_rate": 2.3376803269470604e-06, "logits/chosen": -3.016010284423828, "logits/rejected": -3.0037078857421875, "logps/chosen": -195.9353485107422, "logps/rejected": -224.1945343017578, "loss": 0.608, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.307019591331482, "rewards/margins": 0.3239001929759979, "rewards/rejected": -1.6309198141098022, "step": 3300 }, { "epoch": 0.57, "eval_logits/chosen": -3.006425142288208, "eval_logits/rejected": -3.0002307891845703, "eval_logps/chosen": -182.5597381591797, "eval_logps/rejected": -204.84671020507812, "eval_loss": 0.6460632085800171, "eval_rewards/accuracies": 0.6150093078613281, "eval_rewards/chosen": -1.111497402191162, "eval_rewards/margins": 0.18589934706687927, "eval_rewards/rejected": -1.2973966598510742, "eval_runtime": 483.9633, "eval_samples_per_second": 8.893, "eval_steps_per_second": 1.112, "step": 3300 }, { "epoch": 0.57, "grad_norm": 2.65625, "learning_rate": 2.3226749109761475e-06, "logits/chosen": -2.974262237548828, "logits/rejected": -2.9650983810424805, "logps/chosen": -196.76895141601562, "logps/rejected": -226.045654296875, "loss": 0.5954, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.3223161697387695, "rewards/margins": 0.3491113781929016, "rewards/rejected": -1.6714274883270264, "step": 3310 }, { "epoch": 0.57, "grad_norm": 2.3125, "learning_rate": 2.3076759129339222e-06, "logits/chosen": -3.0030081272125244, "logits/rejected": -2.991539478302002, "logps/chosen": -195.56874084472656, "logps/rejected": -222.0810089111328, "loss": 0.5985, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.3113313913345337, "rewards/margins": 0.3409323990345001, "rewards/rejected": -1.6522636413574219, "step": 3320 }, { "epoch": 0.57, "grad_norm": 2.8125, "learning_rate": 2.2926838756792668e-06, "logits/chosen": -3.024853229522705, "logits/rejected": -3.0167689323425293, "logps/chosen": -182.8240966796875, "logps/rejected": -218.2233428955078, "loss": 0.577, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.1806464195251465, "rewards/margins": 0.3838498890399933, "rewards/rejected": -1.5644962787628174, "step": 3330 }, { "epoch": 0.58, "grad_norm": 4.71875, "learning_rate": 2.2776993418191332e-06, "logits/chosen": -3.0294406414031982, "logits/rejected": -3.019160509109497, "logps/chosen": -192.11871337890625, "logps/rejected": -219.72607421875, "loss": 0.6074, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2465808391571045, "rewards/margins": 0.33514589071273804, "rewards/rejected": -1.5817267894744873, "step": 3340 }, { "epoch": 0.58, "grad_norm": 2.265625, "learning_rate": 2.262722853688902e-06, "logits/chosen": -3.0237231254577637, "logits/rejected": -3.012474298477173, "logps/chosen": -185.7919158935547, "logps/rejected": -216.2095184326172, "loss": 0.6007, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.1903321743011475, "rewards/margins": 0.3349839448928833, "rewards/rejected": -1.5253162384033203, "step": 3350 }, { "epoch": 0.58, "grad_norm": 3.375, "learning_rate": 2.247754953332754e-06, "logits/chosen": -3.022948741912842, "logits/rejected": -3.0198607444763184, "logps/chosen": -184.82015991210938, "logps/rejected": -203.89492797851562, "loss": 0.6322, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.195936918258667, "rewards/margins": 0.24964404106140137, "rewards/rejected": -1.4455807209014893, "step": 3360 }, { "epoch": 0.58, "grad_norm": 2.859375, "learning_rate": 2.2327961824840564e-06, "logits/chosen": -3.028592586517334, "logits/rejected": -3.0219027996063232, "logps/chosen": -175.58358764648438, "logps/rejected": -209.2071075439453, "loss": 0.5844, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.1270334720611572, "rewards/margins": 0.3475898206233978, "rewards/rejected": -1.4746233224868774, "step": 3370 }, { "epoch": 0.58, "grad_norm": 3.578125, "learning_rate": 2.2178470825457464e-06, "logits/chosen": -3.043968677520752, "logits/rejected": -3.039705753326416, "logps/chosen": -179.0226593017578, "logps/rejected": -198.73789978027344, "loss": 0.6178, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.1249364614486694, "rewards/margins": 0.27562254667282104, "rewards/rejected": -1.4005590677261353, "step": 3380 }, { "epoch": 0.58, "grad_norm": 2.75, "learning_rate": 2.2029081945707473e-06, "logits/chosen": -3.054203748703003, "logits/rejected": -3.0460638999938965, "logps/chosen": -171.74972534179688, "logps/rejected": -194.30918884277344, "loss": 0.6249, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.0857192277908325, "rewards/margins": 0.2693277895450592, "rewards/rejected": -1.3550468683242798, "step": 3390 }, { "epoch": 0.59, "grad_norm": 2.484375, "learning_rate": 2.1879800592423758e-06, "logits/chosen": -3.06596040725708, "logits/rejected": -3.0564138889312744, "logps/chosen": -168.08819580078125, "logps/rejected": -197.4599609375, "loss": 0.5996, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.0484591722488403, "rewards/margins": 0.33644840121269226, "rewards/rejected": -1.3849074840545654, "step": 3400 }, { "epoch": 0.59, "eval_logits/chosen": -3.0741565227508545, "eval_logits/rejected": -3.0691306591033936, "eval_logps/chosen": -151.0113067626953, "eval_logps/rejected": -169.9129180908203, "eval_loss": 0.6485846638679504, "eval_rewards/accuracies": 0.609897792339325, "eval_rewards/chosen": -0.7960128784179688, "eval_rewards/margins": 0.15204598009586334, "eval_rewards/rejected": -0.9480588436126709, "eval_runtime": 483.9218, "eval_samples_per_second": 8.894, "eval_steps_per_second": 1.112, "step": 3400 }, { "epoch": 0.59, "grad_norm": 2.640625, "learning_rate": 2.1730632168547807e-06, "logits/chosen": -3.0579450130462646, "logits/rejected": -3.050424098968506, "logps/chosen": -154.18807983398438, "logps/rejected": -171.04515075683594, "loss": 0.6244, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.9415484666824341, "rewards/margins": 0.21807841956615448, "rewards/rejected": -1.1596269607543945, "step": 3410 }, { "epoch": 0.59, "grad_norm": 4.21875, "learning_rate": 2.1581582072933873e-06, "logits/chosen": -3.071281909942627, "logits/rejected": -3.0659172534942627, "logps/chosen": -158.43789672851562, "logps/rejected": -185.93618774414062, "loss": 0.594, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9461215138435364, "rewards/margins": 0.29764896631240845, "rewards/rejected": -1.2437703609466553, "step": 3420 }, { "epoch": 0.59, "grad_norm": 2.71875, "learning_rate": 2.1432655700153496e-06, "logits/chosen": -3.0646018981933594, "logits/rejected": -3.0569794178009033, "logps/chosen": -168.60687255859375, "logps/rejected": -194.6256103515625, "loss": 0.6238, "rewards/accuracies": 0.65625, "rewards/chosen": -1.0673654079437256, "rewards/margins": 0.2535645067691803, "rewards/rejected": -1.320929765701294, "step": 3430 }, { "epoch": 0.59, "grad_norm": 2.953125, "learning_rate": 2.1283858440300376e-06, "logits/chosen": -3.046886920928955, "logits/rejected": -3.034882068634033, "logps/chosen": -177.31863403320312, "logps/rejected": -208.75552368164062, "loss": 0.5924, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.1520001888275146, "rewards/margins": 0.33728262782096863, "rewards/rejected": -1.4892828464508057, "step": 3440 }, { "epoch": 0.59, "grad_norm": 2.625, "learning_rate": 2.113519567879517e-06, "logits/chosen": -3.0611815452575684, "logits/rejected": -3.0584826469421387, "logps/chosen": -188.48440551757812, "logps/rejected": -204.76821899414062, "loss": 0.6298, "rewards/accuracies": 0.6875, "rewards/chosen": -1.179123878479004, "rewards/margins": 0.24214068055152893, "rewards/rejected": -1.4212645292282104, "step": 3450 }, { "epoch": 0.6, "grad_norm": 2.4375, "learning_rate": 2.098667279619069e-06, "logits/chosen": -3.0351967811584473, "logits/rejected": -3.0250942707061768, "logps/chosen": -169.76300048828125, "logps/rejected": -197.97808837890625, "loss": 0.6074, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.0658444166183472, "rewards/margins": 0.2957460880279541, "rewards/rejected": -1.3615906238555908, "step": 3460 }, { "epoch": 0.6, "grad_norm": 2.421875, "learning_rate": 2.0838295167977066e-06, "logits/chosen": -3.0645134449005127, "logits/rejected": -3.057884454727173, "logps/chosen": -178.63186645507812, "logps/rejected": -204.76443481445312, "loss": 0.6068, "rewards/accuracies": 0.65625, "rewards/chosen": -1.1155550479888916, "rewards/margins": 0.3140595555305481, "rewards/rejected": -1.4296146631240845, "step": 3470 }, { "epoch": 0.6, "grad_norm": 2.859375, "learning_rate": 2.069006816438725e-06, "logits/chosen": -3.0422568321228027, "logits/rejected": -3.0331058502197266, "logps/chosen": -178.1131134033203, "logps/rejected": -202.82235717773438, "loss": 0.6116, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.122577428817749, "rewards/margins": 0.30298852920532227, "rewards/rejected": -1.4255659580230713, "step": 3480 }, { "epoch": 0.6, "grad_norm": 3.34375, "learning_rate": 2.054199715020266e-06, "logits/chosen": -3.0578956604003906, "logits/rejected": -3.0546748638153076, "logps/chosen": -181.18801879882812, "logps/rejected": -203.0323944091797, "loss": 0.6258, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1670176982879639, "rewards/margins": 0.23733997344970703, "rewards/rejected": -1.404357671737671, "step": 3490 }, { "epoch": 0.6, "grad_norm": 2.6875, "learning_rate": 2.039408748455894e-06, "logits/chosen": -3.053438186645508, "logits/rejected": -3.047560453414917, "logps/chosen": -170.10067749023438, "logps/rejected": -194.56643676757812, "loss": 0.6081, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.0664782524108887, "rewards/margins": 0.2558566927909851, "rewards/rejected": -1.3223350048065186, "step": 3500 }, { "epoch": 0.6, "eval_logits/chosen": -3.068061590194702, "eval_logits/rejected": -3.06298828125, "eval_logps/chosen": -154.95416259765625, "eval_logps/rejected": -174.41162109375, "eval_loss": 0.6478354334831238, "eval_rewards/accuracies": 0.6157063245773315, "eval_rewards/chosen": -0.8354412913322449, "eval_rewards/margins": 0.15760457515716553, "eval_rewards/rejected": -0.9930458664894104, "eval_runtime": 483.9379, "eval_samples_per_second": 8.894, "eval_steps_per_second": 1.112, "step": 3500 }, { "epoch": 0.6, "grad_norm": 2.28125, "learning_rate": 2.024634452075209e-06, "logits/chosen": -3.0592658519744873, "logits/rejected": -3.0531296730041504, "logps/chosen": -170.28494262695312, "logps/rejected": -189.07864379882812, "loss": 0.627, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.068049430847168, "rewards/margins": 0.23227830231189728, "rewards/rejected": -1.3003276586532593, "step": 3510 }, { "epoch": 0.61, "grad_norm": 2.625, "learning_rate": 2.0098773606044627e-06, "logits/chosen": -3.0587456226348877, "logits/rejected": -3.0494563579559326, "logps/chosen": -165.07681274414062, "logps/rejected": -185.6932373046875, "loss": 0.6146, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.9973398447036743, "rewards/margins": 0.2538382112979889, "rewards/rejected": -1.2511780261993408, "step": 3520 }, { "epoch": 0.61, "grad_norm": 4.3125, "learning_rate": 1.9951380081472135e-06, "logits/chosen": -3.0682685375213623, "logits/rejected": -3.059941530227661, "logps/chosen": -169.71578979492188, "logps/rejected": -192.5686492919922, "loss": 0.6077, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.0332850217819214, "rewards/margins": 0.29921552538871765, "rewards/rejected": -1.332500696182251, "step": 3530 }, { "epoch": 0.61, "grad_norm": 2.84375, "learning_rate": 1.9804169281649873e-06, "logits/chosen": -3.0553269386291504, "logits/rejected": -3.0493812561035156, "logps/chosen": -169.69444274902344, "logps/rejected": -186.82571411132812, "loss": 0.6423, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.0428342819213867, "rewards/margins": 0.22077639400959015, "rewards/rejected": -1.2636107206344604, "step": 3540 }, { "epoch": 0.61, "grad_norm": 2.640625, "learning_rate": 1.965714653457979e-06, "logits/chosen": -3.0672309398651123, "logits/rejected": -3.0627758502960205, "logps/chosen": -171.85525512695312, "logps/rejected": -186.51995849609375, "loss": 0.6456, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.0719311237335205, "rewards/margins": 0.1888027787208557, "rewards/rejected": -1.260733962059021, "step": 3550 }, { "epoch": 0.61, "grad_norm": 3.21875, "learning_rate": 1.9510317161457586e-06, "logits/chosen": -3.0688438415527344, "logits/rejected": -3.061432361602783, "logps/chosen": -163.94273376464844, "logps/rejected": -184.4805908203125, "loss": 0.6189, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.9859815835952759, "rewards/margins": 0.2546016573905945, "rewards/rejected": -1.2405831813812256, "step": 3560 }, { "epoch": 0.62, "grad_norm": 2.890625, "learning_rate": 1.936368647648022e-06, "logits/chosen": -3.0663957595825195, "logits/rejected": -3.058706760406494, "logps/chosen": -178.1262969970703, "logps/rejected": -191.65078735351562, "loss": 0.6647, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.108483910560608, "rewards/margins": 0.18502870202064514, "rewards/rejected": -1.2935125827789307, "step": 3570 }, { "epoch": 0.62, "grad_norm": 2.0625, "learning_rate": 1.9217259786653513e-06, "logits/chosen": -3.0707554817199707, "logits/rejected": -3.0660297870635986, "logps/chosen": -173.64324951171875, "logps/rejected": -193.48947143554688, "loss": 0.6291, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.0519256591796875, "rewards/margins": 0.2389855682849884, "rewards/rejected": -1.290911316871643, "step": 3580 }, { "epoch": 0.62, "grad_norm": 2.671875, "learning_rate": 1.9071042391600074e-06, "logits/chosen": -3.084831714630127, "logits/rejected": -3.0786590576171875, "logps/chosen": -172.4977569580078, "logps/rejected": -193.0021514892578, "loss": 0.6374, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.0711432695388794, "rewards/margins": 0.24499531090259552, "rewards/rejected": -1.316138505935669, "step": 3590 }, { "epoch": 0.62, "grad_norm": 3.25, "learning_rate": 1.8925039583367535e-06, "logits/chosen": -3.0663247108459473, "logits/rejected": -3.060382127761841, "logps/chosen": -166.39263916015625, "logps/rejected": -185.80763244628906, "loss": 0.6256, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.007748007774353, "rewards/margins": 0.24702855944633484, "rewards/rejected": -1.2547765970230103, "step": 3600 }, { "epoch": 0.62, "eval_logits/chosen": -3.076892375946045, "eval_logits/rejected": -3.072154998779297, "eval_logps/chosen": -148.85464477539062, "eval_logps/rejected": -167.44215393066406, "eval_loss": 0.6490924954414368, "eval_rewards/accuracies": 0.6145446300506592, "eval_rewards/chosen": -0.7744462490081787, "eval_rewards/margins": 0.14890483021736145, "eval_rewards/rejected": -0.923350989818573, "eval_runtime": 483.8552, "eval_samples_per_second": 8.895, "eval_steps_per_second": 1.112, "step": 3600 }, { "epoch": 0.62, "grad_norm": 2.6875, "learning_rate": 1.8779256646236945e-06, "logits/chosen": -3.0635766983032227, "logits/rejected": -3.0530872344970703, "logps/chosen": -175.24679565429688, "logps/rejected": -192.53961181640625, "loss": 0.6352, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.1354683637619019, "rewards/margins": 0.21656735241413116, "rewards/rejected": -1.3520357608795166, "step": 3610 }, { "epoch": 0.62, "grad_norm": 2.484375, "learning_rate": 1.8633698856531602e-06, "logits/chosen": -3.0502521991729736, "logits/rejected": -3.0385677814483643, "logps/chosen": -164.17025756835938, "logps/rejected": -196.63186645507812, "loss": 0.5629, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.9688783884048462, "rewards/margins": 0.37294721603393555, "rewards/rejected": -1.3418257236480713, "step": 3620 }, { "epoch": 0.63, "grad_norm": 2.984375, "learning_rate": 1.8488371482425988e-06, "logits/chosen": -3.033907413482666, "logits/rejected": -3.0198476314544678, "logps/chosen": -180.27218627929688, "logps/rejected": -220.55386352539062, "loss": 0.5883, "rewards/accuracies": 0.6875, "rewards/chosen": -1.187538981437683, "rewards/margins": 0.38339418172836304, "rewards/rejected": -1.5709333419799805, "step": 3630 }, { "epoch": 0.63, "grad_norm": 2.796875, "learning_rate": 1.8343279783755208e-06, "logits/chosen": -3.0072665214538574, "logits/rejected": -3.0015065670013428, "logps/chosen": -185.86297607421875, "logps/rejected": -213.43020629882812, "loss": 0.6018, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.1913856267929077, "rewards/margins": 0.3051080107688904, "rewards/rejected": -1.4964938163757324, "step": 3640 }, { "epoch": 0.63, "grad_norm": 3.28125, "learning_rate": 1.8198429011824515e-06, "logits/chosen": -3.001743793487549, "logits/rejected": -2.9939351081848145, "logps/chosen": -183.04818725585938, "logps/rejected": -214.5342254638672, "loss": 0.6168, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2120859622955322, "rewards/margins": 0.2977331876754761, "rewards/rejected": -1.5098191499710083, "step": 3650 }, { "epoch": 0.63, "grad_norm": 2.78125, "learning_rate": 1.8053824409219322e-06, "logits/chosen": -2.991367816925049, "logits/rejected": -2.9764809608459473, "logps/chosen": -198.4747772216797, "logps/rejected": -234.4219970703125, "loss": 0.5701, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.3279832601547241, "rewards/margins": 0.4112142026424408, "rewards/rejected": -1.7391973733901978, "step": 3660 }, { "epoch": 0.63, "grad_norm": 2.59375, "learning_rate": 1.7909471209615447e-06, "logits/chosen": -2.9868359565734863, "logits/rejected": -2.980062484741211, "logps/chosen": -199.0198516845703, "logps/rejected": -218.91775512695312, "loss": 0.6493, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.3474757671356201, "rewards/margins": 0.23639878630638123, "rewards/rejected": -1.5838743448257446, "step": 3670 }, { "epoch": 0.63, "grad_norm": 2.953125, "learning_rate": 1.7765374637589632e-06, "logits/chosen": -3.0147016048431396, "logits/rejected": -3.0103507041931152, "logps/chosen": -197.39666748046875, "logps/rejected": -217.50534057617188, "loss": 0.621, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.312948226928711, "rewards/margins": 0.28840309381484985, "rewards/rejected": -1.6013513803482056, "step": 3680 }, { "epoch": 0.64, "grad_norm": 2.921875, "learning_rate": 1.7621539908430555e-06, "logits/chosen": -3.0218026638031006, "logits/rejected": -3.0105462074279785, "logps/chosen": -180.84689331054688, "logps/rejected": -219.6415557861328, "loss": 0.5988, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.1829901933670044, "rewards/margins": 0.34295639395713806, "rewards/rejected": -1.5259464979171753, "step": 3690 }, { "epoch": 0.64, "grad_norm": 2.578125, "learning_rate": 1.7477972227949947e-06, "logits/chosen": -2.998730182647705, "logits/rejected": -2.9870681762695312, "logps/chosen": -187.25515747070312, "logps/rejected": -220.0942840576172, "loss": 0.5969, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1871895790100098, "rewards/margins": 0.35144585371017456, "rewards/rejected": -1.538635492324829, "step": 3700 }, { "epoch": 0.64, "eval_logits/chosen": -3.023085594177246, "eval_logits/rejected": -3.0171217918395996, "eval_logps/chosen": -168.72821044921875, "eval_logps/rejected": -189.29779052734375, "eval_loss": 0.6469103693962097, "eval_rewards/accuracies": 0.6150093078613281, "eval_rewards/chosen": -0.9731818437576294, "eval_rewards/margins": 0.16872557997703552, "eval_rewards/rejected": -1.1419075727462769, "eval_runtime": 483.9353, "eval_samples_per_second": 8.894, "eval_steps_per_second": 1.112, "step": 3700 }, { "epoch": 0.64, "grad_norm": 2.78125, "learning_rate": 1.7334676792294303e-06, "logits/chosen": -3.0105113983154297, "logits/rejected": -3.0030391216278076, "logps/chosen": -183.92034912109375, "logps/rejected": -206.87088012695312, "loss": 0.6257, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.1964876651763916, "rewards/margins": 0.28170451521873474, "rewards/rejected": -1.4781922101974487, "step": 3710 }, { "epoch": 0.64, "grad_norm": 2.890625, "learning_rate": 1.7191658787756705e-06, "logits/chosen": -3.0147390365600586, "logits/rejected": -3.0061700344085693, "logps/chosen": -177.37066650390625, "logps/rejected": -215.53512573242188, "loss": 0.5694, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.1422507762908936, "rewards/margins": 0.3923090100288391, "rewards/rejected": -1.5345598459243774, "step": 3720 }, { "epoch": 0.64, "grad_norm": 2.71875, "learning_rate": 1.7048923390589211e-06, "logits/chosen": -3.007803440093994, "logits/rejected": -2.991123676300049, "logps/chosen": -190.38050842285156, "logps/rejected": -221.82601928710938, "loss": 0.5958, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.2477678060531616, "rewards/margins": 0.3796270489692688, "rewards/rejected": -1.6273949146270752, "step": 3730 }, { "epoch": 0.64, "grad_norm": 2.8125, "learning_rate": 1.6906475766815455e-06, "logits/chosen": -3.019498348236084, "logits/rejected": -3.0102386474609375, "logps/chosen": -177.45591735839844, "logps/rejected": -209.06979370117188, "loss": 0.6059, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.1353048086166382, "rewards/margins": 0.306613028049469, "rewards/rejected": -1.4419176578521729, "step": 3740 }, { "epoch": 0.65, "grad_norm": 2.359375, "learning_rate": 1.676432107204367e-06, "logits/chosen": -3.0263257026672363, "logits/rejected": -3.022390604019165, "logps/chosen": -183.7809295654297, "logps/rejected": -203.02212524414062, "loss": 0.6448, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1913459300994873, "rewards/margins": 0.21810908615589142, "rewards/rejected": -1.4094550609588623, "step": 3750 }, { "epoch": 0.65, "grad_norm": 2.84375, "learning_rate": 1.6622464451280131e-06, "logits/chosen": -3.0362114906311035, "logits/rejected": -3.029592514038086, "logps/chosen": -190.61134338378906, "logps/rejected": -205.67208862304688, "loss": 0.6525, "rewards/accuracies": 0.625, "rewards/chosen": -1.2369264364242554, "rewards/margins": 0.17871399223804474, "rewards/rejected": -1.4156402349472046, "step": 3760 }, { "epoch": 0.65, "grad_norm": 2.59375, "learning_rate": 1.6480911038742892e-06, "logits/chosen": -3.0372822284698486, "logits/rejected": -3.0271191596984863, "logps/chosen": -177.426513671875, "logps/rejected": -200.05203247070312, "loss": 0.6261, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.1285603046417236, "rewards/margins": 0.24674637615680695, "rewards/rejected": -1.375306487083435, "step": 3770 }, { "epoch": 0.65, "grad_norm": 4.1875, "learning_rate": 1.6339665957676012e-06, "logits/chosen": -3.011491537094116, "logits/rejected": -3.0045924186706543, "logps/chosen": -183.3855743408203, "logps/rejected": -204.1756591796875, "loss": 0.6169, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.1904141902923584, "rewards/margins": 0.2792222797870636, "rewards/rejected": -1.4696365594863892, "step": 3780 }, { "epoch": 0.65, "grad_norm": 3.125, "learning_rate": 1.6198734320164084e-06, "logits/chosen": -3.002366542816162, "logits/rejected": -2.9970383644104004, "logps/chosen": -184.1189422607422, "logps/rejected": -204.14517211914062, "loss": 0.6207, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.200168490409851, "rewards/margins": 0.26282474398612976, "rewards/rejected": -1.4629931449890137, "step": 3790 }, { "epoch": 0.65, "grad_norm": 4.78125, "learning_rate": 1.6058121226947265e-06, "logits/chosen": -2.999645948410034, "logits/rejected": -2.988257646560669, "logps/chosen": -186.4853057861328, "logps/rejected": -205.41824340820312, "loss": 0.6272, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.187149167060852, "rewards/margins": 0.25471192598342896, "rewards/rejected": -1.4418610334396362, "step": 3800 }, { "epoch": 0.65, "eval_logits/chosen": -3.014477014541626, "eval_logits/rejected": -3.008650779724121, "eval_logps/chosen": -166.1768035888672, "eval_logps/rejected": -186.34889221191406, "eval_loss": 0.6471571326255798, "eval_rewards/accuracies": 0.6175650358200073, "eval_rewards/chosen": -0.9476678967475891, "eval_rewards/margins": 0.16475053131580353, "eval_rewards/rejected": -1.1124184131622314, "eval_runtime": 483.7099, "eval_samples_per_second": 8.898, "eval_steps_per_second": 1.112, "step": 3800 }, { "epoch": 0.66, "grad_norm": 3.234375, "learning_rate": 1.5917831767236597e-06, "logits/chosen": -3.016317844390869, "logits/rejected": -3.0069072246551514, "logps/chosen": -192.1353759765625, "logps/rejected": -211.64712524414062, "loss": 0.6218, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.2300946712493896, "rewards/margins": 0.28360113501548767, "rewards/rejected": -1.5136957168579102, "step": 3810 }, { "epoch": 0.66, "grad_norm": 2.671875, "learning_rate": 1.577787101852988e-06, "logits/chosen": -3.009542942047119, "logits/rejected": -3.003291606903076, "logps/chosen": -180.47171020507812, "logps/rejected": -205.54873657226562, "loss": 0.6031, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.1500122547149658, "rewards/margins": 0.2948438823223114, "rewards/rejected": -1.4448561668395996, "step": 3820 }, { "epoch": 0.66, "grad_norm": 4.21875, "learning_rate": 1.5638244046427879e-06, "logits/chosen": -3.0196218490600586, "logits/rejected": -3.0076098442077637, "logps/chosen": -185.17117309570312, "logps/rejected": -199.9987335205078, "loss": 0.626, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.1642451286315918, "rewards/margins": 0.24217692017555237, "rewards/rejected": -1.4064220190048218, "step": 3830 }, { "epoch": 0.66, "grad_norm": 1.9921875, "learning_rate": 1.549895590445094e-06, "logits/chosen": -3.017007827758789, "logits/rejected": -3.0075902938842773, "logps/chosen": -180.5908966064453, "logps/rejected": -221.23373413085938, "loss": 0.5758, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1535264253616333, "rewards/margins": 0.3890773355960846, "rewards/rejected": -1.5426037311553955, "step": 3840 }, { "epoch": 0.66, "grad_norm": 3.375, "learning_rate": 1.5360011633856175e-06, "logits/chosen": -3.0341978073120117, "logits/rejected": -3.026261329650879, "logps/chosen": -181.00332641601562, "logps/rejected": -202.96026611328125, "loss": 0.61, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.1234402656555176, "rewards/margins": 0.28083527088165283, "rewards/rejected": -1.4042755365371704, "step": 3850 }, { "epoch": 0.67, "grad_norm": 2.546875, "learning_rate": 1.5221416263454914e-06, "logits/chosen": -3.0151686668395996, "logits/rejected": -3.0070672035217285, "logps/chosen": -185.99903869628906, "logps/rejected": -212.03543090820312, "loss": 0.6109, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.205248475074768, "rewards/margins": 0.29824763536453247, "rewards/rejected": -1.5034960508346558, "step": 3860 }, { "epoch": 0.67, "grad_norm": 3.015625, "learning_rate": 1.5083174809430773e-06, "logits/chosen": -3.012986660003662, "logits/rejected": -3.0011372566223145, "logps/chosen": -189.08023071289062, "logps/rejected": -219.11672973632812, "loss": 0.5882, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.1898341178894043, "rewards/margins": 0.36755114793777466, "rewards/rejected": -1.5573852062225342, "step": 3870 }, { "epoch": 0.67, "grad_norm": 3.390625, "learning_rate": 1.4945292275158044e-06, "logits/chosen": -2.9859509468078613, "logits/rejected": -2.9861233234405518, "logps/chosen": -192.88870239257812, "logps/rejected": -211.9856414794922, "loss": 0.6548, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.3117234706878662, "rewards/margins": 0.18962886929512024, "rewards/rejected": -1.5013524293899536, "step": 3880 }, { "epoch": 0.67, "grad_norm": 2.859375, "learning_rate": 1.4807773651020645e-06, "logits/chosen": -3.0090255737304688, "logits/rejected": -3.002739191055298, "logps/chosen": -185.8389892578125, "logps/rejected": -210.50222778320312, "loss": 0.6162, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.2158050537109375, "rewards/margins": 0.289234459400177, "rewards/rejected": -1.5050393342971802, "step": 3890 }, { "epoch": 0.67, "grad_norm": 2.703125, "learning_rate": 1.467062391423149e-06, "logits/chosen": -3.013881206512451, "logits/rejected": -3.011373281478882, "logps/chosen": -191.22622680664062, "logps/rejected": -209.2890167236328, "loss": 0.6222, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.2305704355239868, "rewards/margins": 0.27982720732688904, "rewards/rejected": -1.5103976726531982, "step": 3900 }, { "epoch": 0.67, "eval_logits/chosen": -3.0100162029266357, "eval_logits/rejected": -3.004049301147461, "eval_logps/chosen": -168.60433959960938, "eval_logps/rejected": -189.1106719970703, "eval_loss": 0.6467403173446655, "eval_rewards/accuracies": 0.6166356801986694, "eval_rewards/chosen": -0.9719431400299072, "eval_rewards/margins": 0.16809284687042236, "eval_rewards/rejected": -1.1400359869003296, "eval_runtime": 483.8368, "eval_samples_per_second": 8.896, "eval_steps_per_second": 1.112, "step": 3900 }, { "epoch": 0.67, "grad_norm": 2.5625, "learning_rate": 1.4533848028652347e-06, "logits/chosen": -3.0135159492492676, "logits/rejected": -3.005495071411133, "logps/chosen": -189.1254425048828, "logps/rejected": -219.94100952148438, "loss": 0.5947, "rewards/accuracies": 0.6875, "rewards/chosen": -1.228262186050415, "rewards/margins": 0.3604966104030609, "rewards/rejected": -1.5887585878372192, "step": 3910 }, { "epoch": 0.68, "grad_norm": 2.578125, "learning_rate": 1.4397450944614185e-06, "logits/chosen": -3.022209882736206, "logits/rejected": -3.014324188232422, "logps/chosen": -186.09042358398438, "logps/rejected": -205.3507843017578, "loss": 0.6055, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.1575043201446533, "rewards/margins": 0.30854544043540955, "rewards/rejected": -1.4660497903823853, "step": 3920 }, { "epoch": 0.68, "grad_norm": 2.84375, "learning_rate": 1.426143759873801e-06, "logits/chosen": -3.0014710426330566, "logits/rejected": -2.9942893981933594, "logps/chosen": -182.577880859375, "logps/rejected": -212.81295776367188, "loss": 0.5951, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1955492496490479, "rewards/margins": 0.32639041543006897, "rewards/rejected": -1.5219395160675049, "step": 3930 }, { "epoch": 0.68, "grad_norm": 3.125, "learning_rate": 1.4125812913756174e-06, "logits/chosen": -2.984102725982666, "logits/rejected": -2.9806723594665527, "logps/chosen": -185.0265350341797, "logps/rejected": -214.3390350341797, "loss": 0.5953, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2077550888061523, "rewards/margins": 0.32172971963882446, "rewards/rejected": -1.5294848680496216, "step": 3940 }, { "epoch": 0.68, "grad_norm": 4.0, "learning_rate": 1.3990581798334236e-06, "logits/chosen": -2.9794344902038574, "logits/rejected": -2.9656434059143066, "logps/chosen": -190.2133331298828, "logps/rejected": -221.944580078125, "loss": 0.5668, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.2438141107559204, "rewards/margins": 0.3942989706993103, "rewards/rejected": -1.638113260269165, "step": 3950 }, { "epoch": 0.68, "grad_norm": 3.625, "learning_rate": 1.3855749146893285e-06, "logits/chosen": -2.9964189529418945, "logits/rejected": -2.9914658069610596, "logps/chosen": -194.50753784179688, "logps/rejected": -232.042724609375, "loss": 0.6, "rewards/accuracies": 0.65625, "rewards/chosen": -1.3071004152297974, "rewards/margins": 0.3428882956504822, "rewards/rejected": -1.6499887704849243, "step": 3960 }, { "epoch": 0.68, "grad_norm": 2.84375, "learning_rate": 1.3721319839432794e-06, "logits/chosen": -2.97955584526062, "logits/rejected": -2.970545530319214, "logps/chosen": -201.5922088623047, "logps/rejected": -231.54232788085938, "loss": 0.5959, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.3578835725784302, "rewards/margins": 0.33301469683647156, "rewards/rejected": -1.6908981800079346, "step": 3970 }, { "epoch": 0.69, "grad_norm": 2.796875, "learning_rate": 1.3587298741353999e-06, "logits/chosen": -2.9589290618896484, "logits/rejected": -2.944995403289795, "logps/chosen": -195.64895629882812, "logps/rejected": -232.37344360351562, "loss": 0.5752, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.3240019083023071, "rewards/margins": 0.38941264152526855, "rewards/rejected": -1.7134145498275757, "step": 3980 }, { "epoch": 0.69, "grad_norm": 2.921875, "learning_rate": 1.3453690703283848e-06, "logits/chosen": -2.9528799057006836, "logits/rejected": -2.9546871185302734, "logps/chosen": -201.53150939941406, "logps/rejected": -223.7783660888672, "loss": 0.6462, "rewards/accuracies": 0.625, "rewards/chosen": -1.3705735206604004, "rewards/margins": 0.23949268460273743, "rewards/rejected": -1.6100661754608154, "step": 3990 }, { "epoch": 0.69, "grad_norm": 4.375, "learning_rate": 1.3320500560899329e-06, "logits/chosen": -2.9793593883514404, "logits/rejected": -2.9736618995666504, "logps/chosen": -204.895751953125, "logps/rejected": -229.46194458007812, "loss": 0.605, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.3752055168151855, "rewards/margins": 0.3069971203804016, "rewards/rejected": -1.6822025775909424, "step": 4000 }, { "epoch": 0.69, "eval_logits/chosen": -2.9848952293395996, "eval_logits/rejected": -2.978325366973877, "eval_logps/chosen": -179.137939453125, "eval_logps/rejected": -200.68565368652344, "eval_loss": 0.6460844874382019, "eval_rewards/accuracies": 0.6203531622886658, "eval_rewards/chosen": -1.0772794485092163, "eval_rewards/margins": 0.178506538271904, "eval_rewards/rejected": -1.2557858228683472, "eval_runtime": 483.7273, "eval_samples_per_second": 8.898, "eval_steps_per_second": 1.112, "step": 4000 }, { "epoch": 0.69, "grad_norm": 3.359375, "learning_rate": 1.3187733134752622e-06, "logits/chosen": -2.9539637565612793, "logits/rejected": -2.9446120262145996, "logps/chosen": -191.04747009277344, "logps/rejected": -230.6028289794922, "loss": 0.58, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.3172180652618408, "rewards/margins": 0.3778838515281677, "rewards/rejected": -1.6951020956039429, "step": 4010 }, { "epoch": 0.69, "grad_norm": 2.984375, "learning_rate": 1.3055393230096433e-06, "logits/chosen": -2.9659435749053955, "logits/rejected": -2.9620676040649414, "logps/chosen": -199.49159240722656, "logps/rejected": -227.734130859375, "loss": 0.6173, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.3726211786270142, "rewards/margins": 0.29269200563430786, "rewards/rejected": -1.6653131246566772, "step": 4020 }, { "epoch": 0.69, "grad_norm": 2.453125, "learning_rate": 1.2923485636710275e-06, "logits/chosen": -2.9760024547576904, "logits/rejected": -2.9680752754211426, "logps/chosen": -193.3185272216797, "logps/rejected": -217.10757446289062, "loss": 0.6181, "rewards/accuracies": 0.65625, "rewards/chosen": -1.2622469663619995, "rewards/margins": 0.26707887649536133, "rewards/rejected": -1.52932608127594, "step": 4030 }, { "epoch": 0.7, "grad_norm": 2.953125, "learning_rate": 1.279201512872693e-06, "logits/chosen": -2.995952844619751, "logits/rejected": -2.982109308242798, "logps/chosen": -197.069580078125, "logps/rejected": -229.3331298828125, "loss": 0.5877, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.3006064891815186, "rewards/margins": 0.36042696237564087, "rewards/rejected": -1.6610333919525146, "step": 4040 }, { "epoch": 0.7, "grad_norm": 2.765625, "learning_rate": 1.2660986464459817e-06, "logits/chosen": -2.974198579788208, "logits/rejected": -2.967679500579834, "logps/chosen": -188.9154052734375, "logps/rejected": -215.5602569580078, "loss": 0.625, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.2681456804275513, "rewards/margins": 0.2880643904209137, "rewards/rejected": -1.556209921836853, "step": 4050 }, { "epoch": 0.7, "grad_norm": 2.578125, "learning_rate": 1.2530404386230637e-06, "logits/chosen": -2.9840919971466064, "logits/rejected": -2.9815354347229004, "logps/chosen": -205.19137573242188, "logps/rejected": -222.2770538330078, "loss": 0.645, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.3826020956039429, "rewards/margins": 0.24417224526405334, "rewards/rejected": -1.6267744302749634, "step": 4060 }, { "epoch": 0.7, "grad_norm": 2.703125, "learning_rate": 1.2400273620197856e-06, "logits/chosen": -2.982635498046875, "logits/rejected": -2.973203182220459, "logps/chosen": -194.95956420898438, "logps/rejected": -227.84765625, "loss": 0.5765, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.310858964920044, "rewards/margins": 0.36947301030158997, "rewards/rejected": -1.680331826210022, "step": 4070 }, { "epoch": 0.7, "grad_norm": 2.5625, "learning_rate": 1.2270598876185553e-06, "logits/chosen": -2.9874231815338135, "logits/rejected": -2.979541778564453, "logps/chosen": -189.1808319091797, "logps/rejected": -219.6368408203125, "loss": 0.6067, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.2556878328323364, "rewards/margins": 0.31067654490470886, "rewards/rejected": -1.5663644075393677, "step": 4080 }, { "epoch": 0.7, "grad_norm": 2.78125, "learning_rate": 1.2141384847513006e-06, "logits/chosen": -3.0166218280792236, "logits/rejected": -3.002739429473877, "logps/chosen": -180.31333923339844, "logps/rejected": -209.5823211669922, "loss": 0.5903, "rewards/accuracies": 0.65625, "rewards/chosen": -1.1415802240371704, "rewards/margins": 0.3395538628101349, "rewards/rejected": -1.4811339378356934, "step": 4090 }, { "epoch": 0.71, "grad_norm": 4.09375, "learning_rate": 1.2012636210824833e-06, "logits/chosen": -2.993722915649414, "logits/rejected": -2.988193988800049, "logps/chosen": -179.03883361816406, "logps/rejected": -206.8237762451172, "loss": 0.585, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.1502227783203125, "rewards/margins": 0.347284734249115, "rewards/rejected": -1.4975074529647827, "step": 4100 }, { "epoch": 0.71, "eval_logits/chosen": -3.008620500564575, "eval_logits/rejected": -3.0023536682128906, "eval_logps/chosen": -169.76588439941406, "eval_logps/rejected": -190.66702270507812, "eval_loss": 0.6464406251907349, "eval_rewards/accuracies": 0.616403341293335, "eval_rewards/chosen": -0.9835586547851562, "eval_rewards/margins": 0.17204123735427856, "eval_rewards/rejected": -1.1556000709533691, "eval_runtime": 483.817, "eval_samples_per_second": 8.896, "eval_steps_per_second": 1.112, "step": 4100 }, { "epoch": 0.71, "grad_norm": 3.453125, "learning_rate": 1.1884357625921695e-06, "logits/chosen": -2.9908840656280518, "logits/rejected": -2.983269214630127, "logps/chosen": -194.69482421875, "logps/rejected": -209.41250610351562, "loss": 0.6519, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.2834727764129639, "rewards/margins": 0.21269741654396057, "rewards/rejected": -1.4961702823638916, "step": 4110 }, { "epoch": 0.71, "grad_norm": 3.046875, "learning_rate": 1.175655373559168e-06, "logits/chosen": -2.999894618988037, "logits/rejected": -2.9891955852508545, "logps/chosen": -186.88348388671875, "logps/rejected": -212.41897583007812, "loss": 0.6279, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.257952094078064, "rewards/margins": 0.23879018425941467, "rewards/rejected": -1.4967423677444458, "step": 4120 }, { "epoch": 0.71, "grad_norm": 3.09375, "learning_rate": 1.162922916544224e-06, "logits/chosen": -2.9989144802093506, "logits/rejected": -2.9878134727478027, "logps/chosen": -184.68089294433594, "logps/rejected": -212.1684112548828, "loss": 0.5881, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.1903077363967896, "rewards/margins": 0.3276127576828003, "rewards/rejected": -1.5179203748703003, "step": 4130 }, { "epoch": 0.71, "grad_norm": 3.703125, "learning_rate": 1.15023885237328e-06, "logits/chosen": -2.988089084625244, "logits/rejected": -2.980468273162842, "logps/chosen": -198.49673461914062, "logps/rejected": -212.5007781982422, "loss": 0.6467, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.315866231918335, "rewards/margins": 0.2183130979537964, "rewards/rejected": -1.534179449081421, "step": 4140 }, { "epoch": 0.72, "grad_norm": 2.734375, "learning_rate": 1.1376036401207939e-06, "logits/chosen": -3.0006988048553467, "logits/rejected": -2.99579119682312, "logps/chosen": -192.1051788330078, "logps/rejected": -208.1251983642578, "loss": 0.632, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.23936927318573, "rewards/margins": 0.2513357996940613, "rewards/rejected": -1.4907052516937256, "step": 4150 }, { "epoch": 0.72, "grad_norm": 2.375, "learning_rate": 1.1250177370931265e-06, "logits/chosen": -2.993255376815796, "logits/rejected": -2.9826114177703857, "logps/chosen": -182.51060485839844, "logps/rejected": -212.94644165039062, "loss": 0.5801, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.1925780773162842, "rewards/margins": 0.37092381715774536, "rewards/rejected": -1.5635017156600952, "step": 4160 }, { "epoch": 0.72, "grad_norm": 3.53125, "learning_rate": 1.112481598811992e-06, "logits/chosen": -3.0047919750213623, "logits/rejected": -2.9989700317382812, "logps/chosen": -179.00527954101562, "logps/rejected": -205.2192840576172, "loss": 0.6232, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.1701711416244507, "rewards/margins": 0.25962555408477783, "rewards/rejected": -1.4297969341278076, "step": 4170 }, { "epoch": 0.72, "grad_norm": 2.890625, "learning_rate": 1.0999956789979626e-06, "logits/chosen": -3.00277042388916, "logits/rejected": -2.9926650524139404, "logps/chosen": -184.0215301513672, "logps/rejected": -211.02987670898438, "loss": 0.6108, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.1701806783676147, "rewards/margins": 0.2845175862312317, "rewards/rejected": -1.4546983242034912, "step": 4180 }, { "epoch": 0.72, "grad_norm": 3.296875, "learning_rate": 1.0875604295540607e-06, "logits/chosen": -2.9975829124450684, "logits/rejected": -2.9934194087982178, "logps/chosen": -185.83523559570312, "logps/rejected": -216.00106811523438, "loss": 0.6053, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2398172616958618, "rewards/margins": 0.314037024974823, "rewards/rejected": -1.55385422706604, "step": 4190 }, { "epoch": 0.72, "grad_norm": 2.8125, "learning_rate": 1.075176300549387e-06, "logits/chosen": -3.012768030166626, "logits/rejected": -3.0091800689697266, "logps/chosen": -185.5307159423828, "logps/rejected": -196.47280883789062, "loss": 0.6602, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1849339008331299, "rewards/margins": 0.17340119183063507, "rewards/rejected": -1.3583351373672485, "step": 4200 }, { "epoch": 0.72, "eval_logits/chosen": -3.015038013458252, "eval_logits/rejected": -3.0089128017425537, "eval_logps/chosen": -166.36691284179688, "eval_logps/rejected": -186.9268341064453, "eval_loss": 0.6464580297470093, "eval_rewards/accuracies": 0.6177973747253418, "eval_rewards/chosen": -0.9495689272880554, "eval_rewards/margins": 0.16862896084785461, "eval_rewards/rejected": -1.1181979179382324, "eval_runtime": 483.9621, "eval_samples_per_second": 8.893, "eval_steps_per_second": 1.112, "step": 4200 }, { "epoch": 0.73, "grad_norm": 3.28125, "learning_rate": 1.0628437402028475e-06, "logits/chosen": -3.0106093883514404, "logits/rejected": -3.0004124641418457, "logps/chosen": -186.6129913330078, "logps/rejected": -202.49771118164062, "loss": 0.6427, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.2092821598052979, "rewards/margins": 0.21569499373435974, "rewards/rejected": -1.4249770641326904, "step": 4210 }, { "epoch": 0.73, "grad_norm": 2.34375, "learning_rate": 1.0505631948669184e-06, "logits/chosen": -2.998325824737549, "logits/rejected": -2.994420289993286, "logps/chosen": -180.66578674316406, "logps/rejected": -202.93563842773438, "loss": 0.6281, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.1785303354263306, "rewards/margins": 0.24288196861743927, "rewards/rejected": -1.4214122295379639, "step": 4220 }, { "epoch": 0.73, "grad_norm": 3.296875, "learning_rate": 1.038335109011498e-06, "logits/chosen": -3.022573947906494, "logits/rejected": -3.018914222717285, "logps/chosen": -184.13241577148438, "logps/rejected": -202.57627868652344, "loss": 0.6306, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.1667819023132324, "rewards/margins": 0.24214200675487518, "rewards/rejected": -1.408923864364624, "step": 4230 }, { "epoch": 0.73, "grad_norm": 2.296875, "learning_rate": 1.026159925207817e-06, "logits/chosen": -3.017408609390259, "logits/rejected": -3.0109002590179443, "logps/chosen": -178.22518920898438, "logps/rejected": -208.6528778076172, "loss": 0.5954, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.1271382570266724, "rewards/margins": 0.3287648558616638, "rewards/rejected": -1.4559029340744019, "step": 4240 }, { "epoch": 0.73, "grad_norm": 2.96875, "learning_rate": 1.014038084112423e-06, "logits/chosen": -3.0114645957946777, "logits/rejected": -3.0079257488250732, "logps/chosen": -174.28128051757812, "logps/rejected": -194.43869018554688, "loss": 0.6103, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.0754897594451904, "rewards/margins": 0.2575770914554596, "rewards/rejected": -1.3330668210983276, "step": 4250 }, { "epoch": 0.73, "grad_norm": 3.015625, "learning_rate": 1.001970024451229e-06, "logits/chosen": -3.0140953063964844, "logits/rejected": -3.005549907684326, "logps/chosen": -174.39950561523438, "logps/rejected": -202.94818115234375, "loss": 0.6097, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0846554040908813, "rewards/margins": 0.31711500883102417, "rewards/rejected": -1.4017703533172607, "step": 4260 }, { "epoch": 0.74, "grad_norm": 3.125, "learning_rate": 9.899561830036372e-07, "logits/chosen": -3.0136845111846924, "logits/rejected": -3.0040946006774902, "logps/chosen": -169.96408081054688, "logps/rejected": -195.17189025878906, "loss": 0.6073, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.071539044380188, "rewards/margins": 0.28892484307289124, "rewards/rejected": -1.3604638576507568, "step": 4270 }, { "epoch": 0.74, "grad_norm": 2.5625, "learning_rate": 9.779969945867288e-07, "logits/chosen": -3.003387928009033, "logits/rejected": -2.9958813190460205, "logps/chosen": -173.567626953125, "logps/rejected": -199.57489013671875, "loss": 0.6171, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.1117585897445679, "rewards/margins": 0.297131210565567, "rewards/rejected": -1.4088897705078125, "step": 4280 }, { "epoch": 0.74, "grad_norm": 3.453125, "learning_rate": 9.660928920395274e-07, "logits/chosen": -2.9939751625061035, "logits/rejected": -2.9857096672058105, "logps/chosen": -186.2943115234375, "logps/rejected": -207.37026977539062, "loss": 0.6242, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.2075446844100952, "rewards/margins": 0.27775177359580994, "rewards/rejected": -1.4852964878082275, "step": 4290 }, { "epoch": 0.74, "grad_norm": 2.40625, "learning_rate": 9.542443062073337e-07, "logits/chosen": -3.03216552734375, "logits/rejected": -3.0238797664642334, "logps/chosen": -174.03836059570312, "logps/rejected": -200.1016387939453, "loss": 0.6074, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1056114435195923, "rewards/margins": 0.29145902395248413, "rewards/rejected": -1.3970704078674316, "step": 4300 }, { "epoch": 0.74, "eval_logits/chosen": -3.030604362487793, "eval_logits/rejected": -3.0247747898101807, "eval_logps/chosen": -160.95040893554688, "eval_logps/rejected": -181.0815887451172, "eval_loss": 0.6467998027801514, "eval_rewards/accuracies": 0.6182620525360107, "eval_rewards/chosen": -0.8954039216041565, "eval_rewards/margins": 0.16434147953987122, "eval_rewards/rejected": -1.0597453117370605, "eval_runtime": 483.7233, "eval_samples_per_second": 8.898, "eval_steps_per_second": 1.112, "step": 4300 }, { "epoch": 0.74, "grad_norm": 2.390625, "learning_rate": 9.424516659261304e-07, "logits/chosen": -3.006664276123047, "logits/rejected": -2.9954018592834473, "logps/chosen": -180.44845581054688, "logps/rejected": -201.24253845214844, "loss": 0.6192, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.152725338935852, "rewards/margins": 0.2573332190513611, "rewards/rejected": -1.4100584983825684, "step": 4310 }, { "epoch": 0.74, "grad_norm": 2.8125, "learning_rate": 9.307153980070624e-07, "logits/chosen": -3.0227952003479004, "logits/rejected": -3.010831832885742, "logps/chosen": -182.61170959472656, "logps/rejected": -214.8519744873047, "loss": 0.5658, "rewards/accuracies": 0.71875, "rewards/chosen": -1.1262962818145752, "rewards/margins": 0.39121127128601074, "rewards/rejected": -1.517507553100586, "step": 4320 }, { "epoch": 0.75, "grad_norm": 2.765625, "learning_rate": 9.190359272209912e-07, "logits/chosen": -3.0103302001953125, "logits/rejected": -3.002786159515381, "logps/chosen": -179.65603637695312, "logps/rejected": -197.30091857910156, "loss": 0.6279, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.1451447010040283, "rewards/margins": 0.2510768473148346, "rewards/rejected": -1.3962215185165405, "step": 4330 }, { "epoch": 0.75, "grad_norm": 3.140625, "learning_rate": 9.074136762831168e-07, "logits/chosen": -2.9997153282165527, "logits/rejected": -2.996535062789917, "logps/chosen": -174.4058380126953, "logps/rejected": -205.3519287109375, "loss": 0.5987, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1344711780548096, "rewards/margins": 0.33256837725639343, "rewards/rejected": -1.4670393466949463, "step": 4340 }, { "epoch": 0.75, "grad_norm": 3.109375, "learning_rate": 8.958490658376815e-07, "logits/chosen": -3.005241632461548, "logits/rejected": -2.999251365661621, "logps/chosen": -170.48207092285156, "logps/rejected": -198.5691375732422, "loss": 0.6109, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.0801702737808228, "rewards/margins": 0.2803425192832947, "rewards/rejected": -1.3605127334594727, "step": 4350 }, { "epoch": 0.75, "grad_norm": 2.828125, "learning_rate": 8.843425144427442e-07, "logits/chosen": -3.0085055828094482, "logits/rejected": -2.9987130165100098, "logps/chosen": -188.10092163085938, "logps/rejected": -205.174560546875, "loss": 0.6524, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.2355743646621704, "rewards/margins": 0.21877996623516083, "rewards/rejected": -1.454354166984558, "step": 4360 }, { "epoch": 0.75, "grad_norm": 3.6875, "learning_rate": 8.728944385550328e-07, "logits/chosen": -3.0124564170837402, "logits/rejected": -3.0019426345825195, "logps/chosen": -180.61471557617188, "logps/rejected": -204.51443481445312, "loss": 0.6199, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.1496288776397705, "rewards/margins": 0.26334765553474426, "rewards/rejected": -1.4129765033721924, "step": 4370 }, { "epoch": 0.75, "grad_norm": 2.671875, "learning_rate": 8.615052525148701e-07, "logits/chosen": -3.0287554264068604, "logits/rejected": -3.024423837661743, "logps/chosen": -179.65756225585938, "logps/rejected": -197.85726928710938, "loss": 0.6391, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1403712034225464, "rewards/margins": 0.2314852774143219, "rewards/rejected": -1.371856451034546, "step": 4380 }, { "epoch": 0.76, "grad_norm": 3.40625, "learning_rate": 8.501753685311784e-07, "logits/chosen": -3.0272891521453857, "logits/rejected": -3.0223135948181152, "logps/chosen": -176.3140411376953, "logps/rejected": -203.53335571289062, "loss": 0.6115, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.1263439655303955, "rewards/margins": 0.28816819190979004, "rewards/rejected": -1.4145123958587646, "step": 4390 }, { "epoch": 0.76, "grad_norm": 3.3125, "learning_rate": 8.389051966665596e-07, "logits/chosen": -3.024461269378662, "logits/rejected": -3.0175955295562744, "logps/chosen": -181.77696228027344, "logps/rejected": -203.70669555664062, "loss": 0.6105, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.1303656101226807, "rewards/margins": 0.2698122560977936, "rewards/rejected": -1.4001778364181519, "step": 4400 }, { "epoch": 0.76, "eval_logits/chosen": -3.0364904403686523, "eval_logits/rejected": -3.0306472778320312, "eval_logps/chosen": -160.46258544921875, "eval_logps/rejected": -180.574462890625, "eval_loss": 0.6469578146934509, "eval_rewards/accuracies": 0.6150093078613281, "eval_rewards/chosen": -0.8905255794525146, "eval_rewards/margins": 0.1641487181186676, "eval_rewards/rejected": -1.0546742677688599, "eval_runtime": 483.9156, "eval_samples_per_second": 8.894, "eval_steps_per_second": 1.112, "step": 4400 }, { "epoch": 0.76, "grad_norm": 2.71875, "learning_rate": 8.276951448224546e-07, "logits/chosen": -3.004551410675049, "logits/rejected": -2.995788097381592, "logps/chosen": -186.7025604248047, "logps/rejected": -206.0946502685547, "loss": 0.6432, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.20718514919281, "rewards/margins": 0.22127576172351837, "rewards/rejected": -1.4284608364105225, "step": 4410 }, { "epoch": 0.76, "grad_norm": 3.59375, "learning_rate": 8.165456187243797e-07, "logits/chosen": -3.0231869220733643, "logits/rejected": -3.0181660652160645, "logps/chosen": -182.51235961914062, "logps/rejected": -201.51275634765625, "loss": 0.6154, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.16275954246521, "rewards/margins": 0.2760746479034424, "rewards/rejected": -1.4388344287872314, "step": 4420 }, { "epoch": 0.76, "grad_norm": 3.984375, "learning_rate": 8.054570219072419e-07, "logits/chosen": -3.004575729370117, "logits/rejected": -2.997720241546631, "logps/chosen": -176.9400634765625, "logps/rejected": -197.44618225097656, "loss": 0.6364, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.131399393081665, "rewards/margins": 0.2509526312351227, "rewards/rejected": -1.3823518753051758, "step": 4430 }, { "epoch": 0.77, "grad_norm": 2.71875, "learning_rate": 7.944297557007366e-07, "logits/chosen": -3.0259780883789062, "logits/rejected": -3.019028663635254, "logps/chosen": -189.46664428710938, "logps/rejected": -213.1741485595703, "loss": 0.6029, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.1963773965835571, "rewards/margins": 0.309071809053421, "rewards/rejected": -1.5054491758346558, "step": 4440 }, { "epoch": 0.77, "grad_norm": 2.234375, "learning_rate": 7.834642192148151e-07, "logits/chosen": -3.0177788734436035, "logits/rejected": -3.010500431060791, "logps/chosen": -172.21006774902344, "logps/rejected": -196.91000366210938, "loss": 0.6027, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.0538257360458374, "rewards/margins": 0.2949651777744293, "rewards/rejected": -1.3487910032272339, "step": 4450 }, { "epoch": 0.77, "grad_norm": 2.640625, "learning_rate": 7.725608093252496e-07, "logits/chosen": -3.030818223953247, "logits/rejected": -3.021237850189209, "logps/chosen": -167.2049102783203, "logps/rejected": -199.4949188232422, "loss": 0.5868, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0465247631072998, "rewards/margins": 0.3491355776786804, "rewards/rejected": -1.395660161972046, "step": 4460 }, { "epoch": 0.77, "grad_norm": 2.71875, "learning_rate": 7.617199206592584e-07, "logits/chosen": -3.0402872562408447, "logits/rejected": -3.032195568084717, "logps/chosen": -176.73138427734375, "logps/rejected": -195.28189086914062, "loss": 0.6216, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1010245084762573, "rewards/margins": 0.2692503035068512, "rewards/rejected": -1.3702747821807861, "step": 4470 }, { "epoch": 0.77, "grad_norm": 4.0625, "learning_rate": 7.509419455812336e-07, "logits/chosen": -3.045377254486084, "logits/rejected": -3.0374984741210938, "logps/chosen": -174.28468322753906, "logps/rejected": -200.76377868652344, "loss": 0.6152, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.1278858184814453, "rewards/margins": 0.27830731868743896, "rewards/rejected": -1.4061931371688843, "step": 4480 }, { "epoch": 0.77, "grad_norm": 2.921875, "learning_rate": 7.402272741785322e-07, "logits/chosen": -3.0193753242492676, "logits/rejected": -3.0095601081848145, "logps/chosen": -171.40904235839844, "logps/rejected": -198.89688110351562, "loss": 0.591, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.0938918590545654, "rewards/margins": 0.3160261809825897, "rewards/rejected": -1.4099183082580566, "step": 4490 }, { "epoch": 0.78, "grad_norm": 2.546875, "learning_rate": 7.295762942473614e-07, "logits/chosen": -3.0159997940063477, "logits/rejected": -3.003957748413086, "logps/chosen": -178.99685668945312, "logps/rejected": -203.6125030517578, "loss": 0.6127, "rewards/accuracies": 0.65625, "rewards/chosen": -1.1141316890716553, "rewards/margins": 0.31443601846694946, "rewards/rejected": -1.42856764793396, "step": 4500 }, { "epoch": 0.78, "eval_logits/chosen": -3.0338096618652344, "eval_logits/rejected": -3.027985095977783, "eval_logps/chosen": -160.40371704101562, "eval_logps/rejected": -180.48422241210938, "eval_loss": 0.6470324993133545, "eval_rewards/accuracies": 0.6182620525360107, "eval_rewards/chosen": -0.8899369239807129, "eval_rewards/margins": 0.16383494436740875, "eval_rewards/rejected": -1.0537718534469604, "eval_runtime": 483.655, "eval_samples_per_second": 8.899, "eval_steps_per_second": 1.112, "step": 4500 }, { "epoch": 0.78, "grad_norm": 2.625, "learning_rate": 7.189893912787424e-07, "logits/chosen": -3.0327839851379395, "logits/rejected": -3.0189476013183594, "logps/chosen": -175.37973022460938, "logps/rejected": -210.47427368164062, "loss": 0.5837, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0987615585327148, "rewards/margins": 0.3621058762073517, "rewards/rejected": -1.4608676433563232, "step": 4510 }, { "epoch": 0.78, "grad_norm": 2.84375, "learning_rate": 7.084669484445581e-07, "logits/chosen": -3.0290794372558594, "logits/rejected": -3.021406412124634, "logps/chosen": -179.26211547851562, "logps/rejected": -196.85629272460938, "loss": 0.6376, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.1606110334396362, "rewards/margins": 0.22728554904460907, "rewards/rejected": -1.3878967761993408, "step": 4520 }, { "epoch": 0.78, "grad_norm": 5.65625, "learning_rate": 6.980093465836852e-07, "logits/chosen": -3.0249483585357666, "logits/rejected": -3.0177531242370605, "logps/chosen": -177.0812530517578, "logps/rejected": -194.39430236816406, "loss": 0.6518, "rewards/accuracies": 0.59375, "rewards/chosen": -1.128304123878479, "rewards/margins": 0.20625057816505432, "rewards/rejected": -1.3345547914505005, "step": 4530 }, { "epoch": 0.78, "grad_norm": 4.09375, "learning_rate": 6.876169641882105e-07, "logits/chosen": -3.0121078491210938, "logits/rejected": -3.000190019607544, "logps/chosen": -173.17886352539062, "logps/rejected": -194.7333526611328, "loss": 0.6287, "rewards/accuracies": 0.65625, "rewards/chosen": -1.0911271572113037, "rewards/margins": 0.26882293820381165, "rewards/rejected": -1.3599501848220825, "step": 4540 }, { "epoch": 0.78, "grad_norm": 2.734375, "learning_rate": 6.772901773897319e-07, "logits/chosen": -3.034879207611084, "logits/rejected": -3.024160385131836, "logps/chosen": -177.19334411621094, "logps/rejected": -202.35165405273438, "loss": 0.5996, "rewards/accuracies": 0.71875, "rewards/chosen": -1.0899009704589844, "rewards/margins": 0.31121626496315, "rewards/rejected": -1.4011173248291016, "step": 4550 }, { "epoch": 0.79, "grad_norm": 2.484375, "learning_rate": 6.670293599457459e-07, "logits/chosen": -3.0138959884643555, "logits/rejected": -3.0035240650177, "logps/chosen": -173.7094268798828, "logps/rejected": -199.3247833251953, "loss": 0.5977, "rewards/accuracies": 0.71875, "rewards/chosen": -1.0705434083938599, "rewards/margins": 0.31208616495132446, "rewards/rejected": -1.382629632949829, "step": 4560 }, { "epoch": 0.79, "grad_norm": 3.71875, "learning_rate": 6.568348832261174e-07, "logits/chosen": -3.0246381759643555, "logits/rejected": -3.0175058841705322, "logps/chosen": -179.7490997314453, "logps/rejected": -212.00320434570312, "loss": 0.6024, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.1810725927352905, "rewards/margins": 0.3400949537754059, "rewards/rejected": -1.521167516708374, "step": 4570 }, { "epoch": 0.79, "grad_norm": 2.4375, "learning_rate": 6.467071161996447e-07, "logits/chosen": -3.007577657699585, "logits/rejected": -2.9982829093933105, "logps/chosen": -169.00662231445312, "logps/rejected": -191.3984832763672, "loss": 0.6086, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0316174030303955, "rewards/margins": 0.272244393825531, "rewards/rejected": -1.3038618564605713, "step": 4580 }, { "epoch": 0.79, "grad_norm": 3.53125, "learning_rate": 6.366464254206966e-07, "logits/chosen": -3.0268683433532715, "logits/rejected": -3.019697904586792, "logps/chosen": -181.07965087890625, "logps/rejected": -199.53695678710938, "loss": 0.6443, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.1539264917373657, "rewards/margins": 0.2402711808681488, "rewards/rejected": -1.394197702407837, "step": 4590 }, { "epoch": 0.79, "grad_norm": 3.0, "learning_rate": 6.266531750159557e-07, "logits/chosen": -3.022550106048584, "logits/rejected": -3.0063700675964355, "logps/chosen": -175.89822387695312, "logps/rejected": -211.6148223876953, "loss": 0.5798, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1196292638778687, "rewards/margins": 0.37024620175361633, "rewards/rejected": -1.4898754358291626, "step": 4600 }, { "epoch": 0.79, "eval_logits/chosen": -3.0254907608032227, "eval_logits/rejected": -3.019495964050293, "eval_logps/chosen": -162.6863555908203, "eval_logps/rejected": -183.03439331054688, "eval_loss": 0.6467684507369995, "eval_rewards/accuracies": 0.6208178400993347, "eval_rewards/chosen": -0.9127631783485413, "eval_rewards/margins": 0.16651012003421783, "eval_rewards/rejected": -1.0792733430862427, "eval_runtime": 483.7395, "eval_samples_per_second": 8.897, "eval_steps_per_second": 1.112, "step": 4600 }, { "epoch": 0.79, "grad_norm": 3.296875, "learning_rate": 6.167277266712293e-07, "logits/chosen": -3.0018093585968018, "logits/rejected": -2.995985269546509, "logps/chosen": -184.42770385742188, "logps/rejected": -202.32168579101562, "loss": 0.6484, "rewards/accuracies": 0.625, "rewards/chosen": -1.227630615234375, "rewards/margins": 0.2064712941646576, "rewards/rejected": -1.4341020584106445, "step": 4610 }, { "epoch": 0.8, "grad_norm": 3.109375, "learning_rate": 6.068704396183694e-07, "logits/chosen": -3.023332118988037, "logits/rejected": -3.015925168991089, "logps/chosen": -174.80226135253906, "logps/rejected": -200.43743896484375, "loss": 0.5993, "rewards/accuracies": 0.6875, "rewards/chosen": -1.082917332649231, "rewards/margins": 0.28169089555740356, "rewards/rejected": -1.3646082878112793, "step": 4620 }, { "epoch": 0.8, "grad_norm": 2.84375, "learning_rate": 5.970816706222604e-07, "logits/chosen": -3.019421100616455, "logits/rejected": -3.0130579471588135, "logps/chosen": -186.56312561035156, "logps/rejected": -214.2733917236328, "loss": 0.5965, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.187298059463501, "rewards/margins": 0.31686994433403015, "rewards/rejected": -1.504167914390564, "step": 4630 }, { "epoch": 0.8, "grad_norm": 2.640625, "learning_rate": 5.873617739679172e-07, "logits/chosen": -3.006319522857666, "logits/rejected": -3.0017735958099365, "logps/chosen": -191.32012939453125, "logps/rejected": -214.06747436523438, "loss": 0.6296, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.2619186639785767, "rewards/margins": 0.24370841681957245, "rewards/rejected": -1.5056270360946655, "step": 4640 }, { "epoch": 0.8, "grad_norm": 2.734375, "learning_rate": 5.77711101447652e-07, "logits/chosen": -3.0137367248535156, "logits/rejected": -3.00858998298645, "logps/chosen": -183.54359436035156, "logps/rejected": -204.957275390625, "loss": 0.6188, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.1888493299484253, "rewards/margins": 0.2691509425640106, "rewards/rejected": -1.4580004215240479, "step": 4650 }, { "epoch": 0.8, "grad_norm": 2.6875, "learning_rate": 5.681300023483521e-07, "logits/chosen": -3.017210006713867, "logits/rejected": -3.0096325874328613, "logps/chosen": -180.97731018066406, "logps/rejected": -202.33935546875, "loss": 0.613, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.1559107303619385, "rewards/margins": 0.2476036101579666, "rewards/rejected": -1.4035141468048096, "step": 4660 }, { "epoch": 0.8, "grad_norm": 2.90625, "learning_rate": 5.586188234388306e-07, "logits/chosen": -3.0169410705566406, "logits/rejected": -3.0076098442077637, "logps/chosen": -173.56065368652344, "logps/rejected": -200.5047607421875, "loss": 0.5863, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1080338954925537, "rewards/margins": 0.3275201916694641, "rewards/rejected": -1.4355541467666626, "step": 4670 }, { "epoch": 0.81, "grad_norm": 2.765625, "learning_rate": 5.491779089572793e-07, "logits/chosen": -3.0221850872039795, "logits/rejected": -3.019211769104004, "logps/chosen": -181.35293579101562, "logps/rejected": -202.7132568359375, "loss": 0.6375, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.1753777265548706, "rewards/margins": 0.2501986026763916, "rewards/rejected": -1.4255764484405518, "step": 4680 }, { "epoch": 0.81, "grad_norm": 2.359375, "learning_rate": 5.398076005988082e-07, "logits/chosen": -3.0308825969696045, "logits/rejected": -3.0206403732299805, "logps/chosen": -186.6603240966797, "logps/rejected": -218.62936401367188, "loss": 0.5978, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.204353928565979, "rewards/margins": 0.3522704541683197, "rewards/rejected": -1.556624412536621, "step": 4690 }, { "epoch": 0.81, "grad_norm": 3.71875, "learning_rate": 5.305082375030798e-07, "logits/chosen": -3.0150041580200195, "logits/rejected": -3.006856679916382, "logps/chosen": -181.75979614257812, "logps/rejected": -209.6063232421875, "loss": 0.6228, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1912561655044556, "rewards/margins": 0.2754915654659271, "rewards/rejected": -1.466747760772705, "step": 4700 }, { "epoch": 0.81, "eval_logits/chosen": -3.0290753841400146, "eval_logits/rejected": -3.0230636596679688, "eval_logps/chosen": -163.55618286132812, "eval_logps/rejected": -184.06402587890625, "eval_loss": 0.6466771960258484, "eval_rewards/accuracies": 0.6191914677619934, "eval_rewards/chosen": -0.9214615821838379, "eval_rewards/margins": 0.1681082397699356, "eval_rewards/rejected": -1.0895699262619019, "eval_runtime": 483.8875, "eval_samples_per_second": 8.895, "eval_steps_per_second": 1.112, "step": 4700 }, { "epoch": 0.81, "grad_norm": 3.203125, "learning_rate": 5.212801562420342e-07, "logits/chosen": -3.027268409729004, "logits/rejected": -3.017737627029419, "logps/chosen": -182.80491638183594, "logps/rejected": -208.4292449951172, "loss": 0.5857, "rewards/accuracies": 0.71875, "rewards/chosen": -1.1425063610076904, "rewards/margins": 0.3425898849964142, "rewards/rejected": -1.4850962162017822, "step": 4710 }, { "epoch": 0.81, "grad_norm": 3.15625, "learning_rate": 5.121236908077063e-07, "logits/chosen": -3.0060229301452637, "logits/rejected": -2.9983930587768555, "logps/chosen": -183.74058532714844, "logps/rejected": -213.54690551757812, "loss": 0.5862, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1665380001068115, "rewards/margins": 0.361175000667572, "rewards/rejected": -1.5277130603790283, "step": 4720 }, { "epoch": 0.81, "grad_norm": 3.328125, "learning_rate": 5.030391726001394e-07, "logits/chosen": -3.003763198852539, "logits/rejected": -2.9963369369506836, "logps/chosen": -181.78604125976562, "logps/rejected": -205.4948272705078, "loss": 0.6263, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.171957015991211, "rewards/margins": 0.2908283770084381, "rewards/rejected": -1.462785243988037, "step": 4730 }, { "epoch": 0.82, "grad_norm": 3.21875, "learning_rate": 4.940269304153919e-07, "logits/chosen": -2.9984312057495117, "logits/rejected": -2.9876062870025635, "logps/chosen": -174.63938903808594, "logps/rejected": -210.2099609375, "loss": 0.5725, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.0791208744049072, "rewards/margins": 0.3925134241580963, "rewards/rejected": -1.4716343879699707, "step": 4740 }, { "epoch": 0.82, "grad_norm": 3.078125, "learning_rate": 4.850872904336307e-07, "logits/chosen": -3.005293607711792, "logits/rejected": -3.0054850578308105, "logps/chosen": -181.31361389160156, "logps/rejected": -202.7025604248047, "loss": 0.6171, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.1545989513397217, "rewards/margins": 0.23978309333324432, "rewards/rejected": -1.394382119178772, "step": 4750 }, { "epoch": 0.82, "grad_norm": 2.84375, "learning_rate": 4.762205762073363e-07, "logits/chosen": -3.003577947616577, "logits/rejected": -2.994811534881592, "logps/chosen": -182.7504425048828, "logps/rejected": -210.9694366455078, "loss": 0.6039, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.204798698425293, "rewards/margins": 0.3121243119239807, "rewards/rejected": -1.516923189163208, "step": 4760 }, { "epoch": 0.82, "grad_norm": 2.640625, "learning_rate": 4.6742710864958103e-07, "logits/chosen": -3.023016929626465, "logits/rejected": -3.009099006652832, "logps/chosen": -189.5552215576172, "logps/rejected": -213.70492553710938, "loss": 0.6092, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.1962745189666748, "rewards/margins": 0.3164735436439514, "rewards/rejected": -1.5127480030059814, "step": 4770 }, { "epoch": 0.82, "grad_norm": 3.25, "learning_rate": 4.5870720602242513e-07, "logits/chosen": -3.0070056915283203, "logits/rejected": -2.9952831268310547, "logps/chosen": -178.74525451660156, "logps/rejected": -210.75643920898438, "loss": 0.5898, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1661994457244873, "rewards/margins": 0.3252885341644287, "rewards/rejected": -1.4914880990982056, "step": 4780 }, { "epoch": 0.83, "grad_norm": 2.75, "learning_rate": 4.500611839253871e-07, "logits/chosen": -3.0066330432891846, "logits/rejected": -2.999293327331543, "logps/chosen": -189.28778076171875, "logps/rejected": -203.16702270507812, "loss": 0.6432, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.214068055152893, "rewards/margins": 0.21259085834026337, "rewards/rejected": -1.4266588687896729, "step": 4790 }, { "epoch": 0.83, "grad_norm": 2.546875, "learning_rate": 4.4148935528403244e-07, "logits/chosen": -2.9953420162200928, "logits/rejected": -2.9865942001342773, "logps/chosen": -182.12466430664062, "logps/rejected": -209.14828491210938, "loss": 0.6131, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.1962429285049438, "rewards/margins": 0.30054768919944763, "rewards/rejected": -1.4967906475067139, "step": 4800 }, { "epoch": 0.83, "eval_logits/chosen": -3.020195960998535, "eval_logits/rejected": -3.014082670211792, "eval_logps/chosen": -165.31649780273438, "eval_logps/rejected": -186.017578125, "eval_loss": 0.6466081738471985, "eval_rewards/accuracies": 0.6198884844779968, "eval_rewards/chosen": -0.9390648603439331, "eval_rewards/margins": 0.1700403094291687, "eval_rewards/rejected": -1.1091052293777466, "eval_runtime": 483.8577, "eval_samples_per_second": 8.895, "eval_steps_per_second": 1.112, "step": 4800 }, { "epoch": 0.83, "grad_norm": 2.765625, "learning_rate": 4.3299203033863643e-07, "logits/chosen": -3.0014617443084717, "logits/rejected": -2.9942636489868164, "logps/chosen": -184.14466857910156, "logps/rejected": -207.19735717773438, "loss": 0.601, "rewards/accuracies": 0.65625, "rewards/chosen": -1.1630815267562866, "rewards/margins": 0.31077930331230164, "rewards/rejected": -1.4738609790802002, "step": 4810 }, { "epoch": 0.83, "grad_norm": 3.234375, "learning_rate": 4.245695166329661e-07, "logits/chosen": -3.0155398845672607, "logits/rejected": -3.00993013381958, "logps/chosen": -176.3240966796875, "logps/rejected": -202.77798461914062, "loss": 0.6134, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.1332825422286987, "rewards/margins": 0.29605549573898315, "rewards/rejected": -1.429337978363037, "step": 4820 }, { "epoch": 0.83, "grad_norm": 3.015625, "learning_rate": 4.1622211900314235e-07, "logits/chosen": -3.0176029205322266, "logits/rejected": -3.0059218406677246, "logps/chosen": -178.7200927734375, "logps/rejected": -202.0586395263672, "loss": 0.6233, "rewards/accuracies": 0.65625, "rewards/chosen": -1.1384029388427734, "rewards/margins": 0.27069368958473206, "rewards/rejected": -1.4090964794158936, "step": 4830 }, { "epoch": 0.83, "grad_norm": 2.328125, "learning_rate": 4.0795013956660884e-07, "logits/chosen": -3.0080854892730713, "logits/rejected": -2.9921693801879883, "logps/chosen": -189.64425659179688, "logps/rejected": -220.44540405273438, "loss": 0.5898, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.226061463356018, "rewards/margins": 0.37052440643310547, "rewards/rejected": -1.5965858697891235, "step": 4840 }, { "epoch": 0.84, "grad_norm": 2.546875, "learning_rate": 3.9975387771119925e-07, "logits/chosen": -3.0115370750427246, "logits/rejected": -3.0042665004730225, "logps/chosen": -176.58824157714844, "logps/rejected": -204.17892456054688, "loss": 0.6, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.1130057573318481, "rewards/margins": 0.30895477533340454, "rewards/rejected": -1.421960473060608, "step": 4850 }, { "epoch": 0.84, "grad_norm": 3.640625, "learning_rate": 3.916336300842988e-07, "logits/chosen": -3.016387701034546, "logits/rejected": -3.008925199508667, "logps/chosen": -188.9263916015625, "logps/rejected": -202.9337158203125, "loss": 0.6344, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.1967926025390625, "rewards/margins": 0.24283523857593536, "rewards/rejected": -1.4396278858184814, "step": 4860 }, { "epoch": 0.84, "grad_norm": 3.140625, "learning_rate": 3.8358969058210957e-07, "logits/chosen": -3.0090579986572266, "logits/rejected": -3.0010299682617188, "logps/chosen": -190.43182373046875, "logps/rejected": -213.4534912109375, "loss": 0.6068, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2139747142791748, "rewards/margins": 0.32153528928756714, "rewards/rejected": -1.5355100631713867, "step": 4870 }, { "epoch": 0.84, "grad_norm": 2.875, "learning_rate": 3.7562235033901273e-07, "logits/chosen": -3.0088894367218018, "logits/rejected": -3.0016369819641113, "logps/chosen": -179.21157836914062, "logps/rejected": -201.3288116455078, "loss": 0.6141, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.1554762125015259, "rewards/margins": 0.2841919958591461, "rewards/rejected": -1.4396681785583496, "step": 4880 }, { "epoch": 0.84, "grad_norm": 3.328125, "learning_rate": 3.677318977170324e-07, "logits/chosen": -3.0278687477111816, "logits/rejected": -3.020611047744751, "logps/chosen": -182.8571319580078, "logps/rejected": -213.39736938476562, "loss": 0.5899, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.1728826761245728, "rewards/margins": 0.3418412506580353, "rewards/rejected": -1.514723777770996, "step": 4890 }, { "epoch": 0.84, "grad_norm": 2.765625, "learning_rate": 3.599186182953973e-07, "logits/chosen": -3.0165491104125977, "logits/rejected": -3.0070672035217285, "logps/chosen": -181.3474578857422, "logps/rejected": -204.9975128173828, "loss": 0.6215, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.1810262203216553, "rewards/margins": 0.29226669669151306, "rewards/rejected": -1.4732929468154907, "step": 4900 }, { "epoch": 0.84, "eval_logits/chosen": -3.024120807647705, "eval_logits/rejected": -3.0179731845855713, "eval_logps/chosen": -166.19190979003906, "eval_logps/rejected": -186.99465942382812, "eval_loss": 0.6464829444885254, "eval_rewards/accuracies": 0.6196561455726624, "eval_rewards/chosen": -0.94781893491745, "eval_rewards/margins": 0.1710570603609085, "eval_rewards/rejected": -1.1188760995864868, "eval_runtime": 483.8313, "eval_samples_per_second": 8.896, "eval_steps_per_second": 1.112, "step": 4900 }, { "epoch": 0.85, "grad_norm": 3.40625, "learning_rate": 3.5218279486020605e-07, "logits/chosen": -3.0371718406677246, "logits/rejected": -3.0303807258605957, "logps/chosen": -181.00103759765625, "logps/rejected": -205.81491088867188, "loss": 0.6047, "rewards/accuracies": 0.71875, "rewards/chosen": -1.141796350479126, "rewards/margins": 0.31500619649887085, "rewards/rejected": -1.4568026065826416, "step": 4910 }, { "epoch": 0.85, "grad_norm": 2.546875, "learning_rate": 3.445247073941932e-07, "logits/chosen": -2.993680477142334, "logits/rejected": -2.9794511795043945, "logps/chosen": -185.45579528808594, "logps/rejected": -223.18002319335938, "loss": 0.5763, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.2101439237594604, "rewards/margins": 0.40528878569602966, "rewards/rejected": -1.6154325008392334, "step": 4920 }, { "epoch": 0.85, "grad_norm": 2.859375, "learning_rate": 3.369446330665918e-07, "logits/chosen": -3.025158405303955, "logits/rejected": -3.0162906646728516, "logps/chosen": -186.8469696044922, "logps/rejected": -216.64041137695312, "loss": 0.6315, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2324554920196533, "rewards/margins": 0.2733253836631775, "rewards/rejected": -1.505780816078186, "step": 4930 }, { "epoch": 0.85, "grad_norm": 3.578125, "learning_rate": 3.2944284622310834e-07, "logits/chosen": -3.0310635566711426, "logits/rejected": -3.019331216812134, "logps/chosen": -186.7605438232422, "logps/rejected": -215.85299682617188, "loss": 0.5915, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2067301273345947, "rewards/margins": 0.35939091444015503, "rewards/rejected": -1.5661208629608154, "step": 4940 }, { "epoch": 0.85, "grad_norm": 3.03125, "learning_rate": 3.220196183759855e-07, "logits/chosen": -3.004424810409546, "logits/rejected": -2.995706558227539, "logps/chosen": -180.77694702148438, "logps/rejected": -209.75, "loss": 0.6066, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.145007848739624, "rewards/margins": 0.31725460290908813, "rewards/rejected": -1.4622623920440674, "step": 4950 }, { "epoch": 0.85, "grad_norm": 3.21875, "learning_rate": 3.146752181941834e-07, "logits/chosen": -3.0184290409088135, "logits/rejected": -3.003715991973877, "logps/chosen": -179.36331176757812, "logps/rejected": -217.3169403076172, "loss": 0.5828, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.1723054647445679, "rewards/margins": 0.3610745966434479, "rewards/rejected": -1.5333800315856934, "step": 4960 }, { "epoch": 0.86, "grad_norm": 3.3125, "learning_rate": 3.074099114936491e-07, "logits/chosen": -3.006310224533081, "logits/rejected": -2.994600534439087, "logps/chosen": -179.55746459960938, "logps/rejected": -214.8990478515625, "loss": 0.5762, "rewards/accuracies": 0.6875, "rewards/chosen": -1.173987865447998, "rewards/margins": 0.38004904985427856, "rewards/rejected": -1.5540368556976318, "step": 4970 }, { "epoch": 0.86, "grad_norm": 2.53125, "learning_rate": 3.002239612276991e-07, "logits/chosen": -3.0120511054992676, "logits/rejected": -2.998878002166748, "logps/chosen": -181.58895874023438, "logps/rejected": -212.9563446044922, "loss": 0.594, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.1734919548034668, "rewards/margins": 0.32563871145248413, "rewards/rejected": -1.4991306066513062, "step": 4980 }, { "epoch": 0.86, "grad_norm": 2.90625, "learning_rate": 2.931176274775024e-07, "logits/chosen": -3.0127527713775635, "logits/rejected": -3.002488374710083, "logps/chosen": -182.4782257080078, "logps/rejected": -219.88577270507812, "loss": 0.5831, "rewards/accuracies": 0.71875, "rewards/chosen": -1.2037298679351807, "rewards/margins": 0.362371027469635, "rewards/rejected": -1.5661009550094604, "step": 4990 }, { "epoch": 0.86, "grad_norm": 3.03125, "learning_rate": 2.8609116744266586e-07, "logits/chosen": -3.022179126739502, "logits/rejected": -3.0086216926574707, "logps/chosen": -181.14414978027344, "logps/rejected": -212.1620635986328, "loss": 0.585, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.1410472393035889, "rewards/margins": 0.36742258071899414, "rewards/rejected": -1.5084699392318726, "step": 5000 }, { "epoch": 0.86, "eval_logits/chosen": -3.0225670337677, "eval_logits/rejected": -3.01635479927063, "eval_logps/chosen": -167.32522583007812, "eval_logps/rejected": -188.31539916992188, "eval_loss": 0.6460159420967102, "eval_rewards/accuracies": 0.6201208233833313, "eval_rewards/chosen": -0.9591519236564636, "eval_rewards/margins": 0.17293164134025574, "eval_rewards/rejected": -1.132083535194397, "eval_runtime": 484.0695, "eval_samples_per_second": 8.891, "eval_steps_per_second": 1.111, "step": 5000 }, { "epoch": 0.86, "grad_norm": 2.8125, "learning_rate": 2.791448354319265e-07, "logits/chosen": -3.002469301223755, "logits/rejected": -2.992302417755127, "logps/chosen": -188.56805419921875, "logps/rejected": -220.1179656982422, "loss": 0.5861, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.252831220626831, "rewards/margins": 0.3709490895271301, "rewards/rejected": -1.6237804889678955, "step": 5010 }, { "epoch": 0.86, "grad_norm": 2.84375, "learning_rate": 2.722788828539469e-07, "logits/chosen": -2.9895589351654053, "logits/rejected": -2.978393077850342, "logps/chosen": -181.16273498535156, "logps/rejected": -212.7776336669922, "loss": 0.5968, "rewards/accuracies": 0.65625, "rewards/chosen": -1.1708439588546753, "rewards/margins": 0.3541907072067261, "rewards/rejected": -1.5250345468521118, "step": 5020 }, { "epoch": 0.87, "grad_norm": 4.21875, "learning_rate": 2.65493558208216e-07, "logits/chosen": -3.0141069889068604, "logits/rejected": -3.003230333328247, "logps/chosen": -187.78970336914062, "logps/rejected": -213.2385711669922, "loss": 0.6226, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.2488605976104736, "rewards/margins": 0.2681311070919037, "rewards/rejected": -1.5169916152954102, "step": 5030 }, { "epoch": 0.87, "grad_norm": 3.0, "learning_rate": 2.5878910707605535e-07, "logits/chosen": -3.023101568222046, "logits/rejected": -3.015597105026245, "logps/chosen": -193.99081420898438, "logps/rejected": -211.8629608154297, "loss": 0.6181, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.257229208946228, "rewards/margins": 0.2696048617362976, "rewards/rejected": -1.52683424949646, "step": 5040 }, { "epoch": 0.87, "grad_norm": 2.359375, "learning_rate": 2.5216577211173045e-07, "logits/chosen": -3.0130486488342285, "logits/rejected": -3.0070443153381348, "logps/chosen": -187.1128692626953, "logps/rejected": -210.8717041015625, "loss": 0.6204, "rewards/accuracies": 0.65625, "rewards/chosen": -1.1955726146697998, "rewards/margins": 0.27631038427352905, "rewards/rejected": -1.4718830585479736, "step": 5050 }, { "epoch": 0.87, "grad_norm": 3.140625, "learning_rate": 2.4562379303366855e-07, "logits/chosen": -2.9913368225097656, "logits/rejected": -2.9847044944763184, "logps/chosen": -184.39361572265625, "logps/rejected": -209.1636962890625, "loss": 0.6384, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2389198541641235, "rewards/margins": 0.24873094260692596, "rewards/rejected": -1.4876508712768555, "step": 5060 }, { "epoch": 0.87, "grad_norm": 3.28125, "learning_rate": 2.39163406615783e-07, "logits/chosen": -2.9937796592712402, "logits/rejected": -2.9822850227355957, "logps/chosen": -190.34483337402344, "logps/rejected": -209.4861602783203, "loss": 0.6359, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.2612758874893188, "rewards/margins": 0.2758980095386505, "rewards/rejected": -1.5371739864349365, "step": 5070 }, { "epoch": 0.88, "grad_norm": 2.671875, "learning_rate": 2.327848466789029e-07, "logits/chosen": -3.0329487323760986, "logits/rejected": -3.0216927528381348, "logps/chosen": -183.50765991210938, "logps/rejected": -208.6122283935547, "loss": 0.5977, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1554895639419556, "rewards/margins": 0.3325926661491394, "rewards/rejected": -1.4880822896957397, "step": 5080 }, { "epoch": 0.88, "grad_norm": 3.140625, "learning_rate": 2.2648834408231012e-07, "logits/chosen": -3.027855157852173, "logits/rejected": -3.01641845703125, "logps/chosen": -180.16256713867188, "logps/rejected": -208.2643585205078, "loss": 0.5918, "rewards/accuracies": 0.71875, "rewards/chosen": -1.156362771987915, "rewards/margins": 0.3350638747215271, "rewards/rejected": -1.491426706314087, "step": 5090 }, { "epoch": 0.88, "grad_norm": 4.1875, "learning_rate": 2.2027412671538517e-07, "logits/chosen": -3.003014326095581, "logits/rejected": -2.9966347217559814, "logps/chosen": -187.82479858398438, "logps/rejected": -203.16989135742188, "loss": 0.6478, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2197223901748657, "rewards/margins": 0.20297710597515106, "rewards/rejected": -1.4226996898651123, "step": 5100 }, { "epoch": 0.88, "eval_logits/chosen": -3.0212771892547607, "eval_logits/rejected": -3.015050172805786, "eval_logps/chosen": -167.4737091064453, "eval_logps/rejected": -188.46954345703125, "eval_loss": 0.6460275650024414, "eval_rewards/accuracies": 0.6194238066673279, "eval_rewards/chosen": -0.9606368541717529, "eval_rewards/margins": 0.172988161444664, "eval_rewards/rejected": -1.1336250305175781, "eval_runtime": 483.893, "eval_samples_per_second": 8.895, "eval_steps_per_second": 1.112, "step": 5100 }, { "epoch": 0.88, "grad_norm": 3.0, "learning_rate": 2.1414241948935822e-07, "logits/chosen": -3.0174098014831543, "logits/rejected": -3.010067939758301, "logps/chosen": -200.5458526611328, "logps/rejected": -214.78872680664062, "loss": 0.6678, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.3194735050201416, "rewards/margins": 0.20971199870109558, "rewards/rejected": -1.5291855335235596, "step": 5110 }, { "epoch": 0.88, "grad_norm": 3.390625, "learning_rate": 2.0809344432916905e-07, "logits/chosen": -3.0166664123535156, "logits/rejected": -3.0095877647399902, "logps/chosen": -186.00674438476562, "logps/rejected": -206.6129150390625, "loss": 0.622, "rewards/accuracies": 0.625, "rewards/chosen": -1.2112102508544922, "rewards/margins": 0.26906818151474, "rewards/rejected": -1.480278491973877, "step": 5120 }, { "epoch": 0.88, "grad_norm": 2.484375, "learning_rate": 2.0212742016543468e-07, "logits/chosen": -3.002864360809326, "logits/rejected": -2.9982025623321533, "logps/chosen": -187.5155487060547, "logps/rejected": -212.5284423828125, "loss": 0.5957, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.233302354812622, "rewards/margins": 0.3133721947669983, "rewards/rejected": -1.5466746091842651, "step": 5130 }, { "epoch": 0.89, "grad_norm": 2.90625, "learning_rate": 1.9624456292652667e-07, "logits/chosen": -3.001178026199341, "logits/rejected": -2.999664545059204, "logps/chosen": -194.84710693359375, "logps/rejected": -202.6844482421875, "loss": 0.6722, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2551358938217163, "rewards/margins": 0.16700053215026855, "rewards/rejected": -1.4221365451812744, "step": 5140 }, { "epoch": 0.89, "grad_norm": 3.359375, "learning_rate": 1.9044508553075436e-07, "logits/chosen": -3.0168654918670654, "logits/rejected": -3.0097479820251465, "logps/chosen": -186.14413452148438, "logps/rejected": -213.52523803710938, "loss": 0.6104, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.2238632440567017, "rewards/margins": 0.2837482988834381, "rewards/rejected": -1.5076117515563965, "step": 5150 }, { "epoch": 0.89, "grad_norm": 3.65625, "learning_rate": 1.8472919787865971e-07, "logits/chosen": -3.012164354324341, "logits/rejected": -3.0114898681640625, "logps/chosen": -184.64852905273438, "logps/rejected": -200.6281280517578, "loss": 0.654, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.2310230731964111, "rewards/margins": 0.20353946089744568, "rewards/rejected": -1.4345625638961792, "step": 5160 }, { "epoch": 0.89, "grad_norm": 2.46875, "learning_rate": 1.7909710684542225e-07, "logits/chosen": -3.0000133514404297, "logits/rejected": -2.9884800910949707, "logps/chosen": -187.14547729492188, "logps/rejected": -217.1771697998047, "loss": 0.5846, "rewards/accuracies": 0.75, "rewards/chosen": -1.1928120851516724, "rewards/margins": 0.3771010637283325, "rewards/rejected": -1.5699129104614258, "step": 5170 }, { "epoch": 0.89, "grad_norm": 3.390625, "learning_rate": 1.735490162733658e-07, "logits/chosen": -3.0229477882385254, "logits/rejected": -3.0149283409118652, "logps/chosen": -190.2616729736328, "logps/rejected": -212.183349609375, "loss": 0.631, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.243887186050415, "rewards/margins": 0.2436877191066742, "rewards/rejected": -1.4875750541687012, "step": 5180 }, { "epoch": 0.89, "grad_norm": 3.015625, "learning_rate": 1.6808512696458862e-07, "logits/chosen": -3.0097222328186035, "logits/rejected": -3.0068881511688232, "logps/chosen": -187.0318145751953, "logps/rejected": -207.45431518554688, "loss": 0.6469, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.2396968603134155, "rewards/margins": 0.21176902949810028, "rewards/rejected": -1.4514659643173218, "step": 5190 }, { "epoch": 0.9, "grad_norm": 3.234375, "learning_rate": 1.6270563667368872e-07, "logits/chosen": -3.0239500999450684, "logits/rejected": -3.0173819065093994, "logps/chosen": -183.49264526367188, "logps/rejected": -212.5428924560547, "loss": 0.6018, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.213315725326538, "rewards/margins": 0.3017173409461975, "rewards/rejected": -1.5150331258773804, "step": 5200 }, { "epoch": 0.9, "eval_logits/chosen": -3.016746997833252, "eval_logits/rejected": -3.0105228424072266, "eval_logps/chosen": -167.12588500976562, "eval_logps/rejected": -188.0692138671875, "eval_loss": 0.6461666226387024, "eval_rewards/accuracies": 0.6205855011940002, "eval_rewards/chosen": -0.9571587443351746, "eval_rewards/margins": 0.17246277630329132, "eval_rewards/rejected": -1.1296215057373047, "eval_runtime": 483.6852, "eval_samples_per_second": 8.898, "eval_steps_per_second": 1.112, "step": 5200 }, { "epoch": 0.9, "grad_norm": 2.421875, "learning_rate": 1.5741074010061252e-07, "logits/chosen": -3.0030903816223145, "logits/rejected": -2.9979660511016846, "logps/chosen": -184.4320831298828, "logps/rejected": -202.42068481445312, "loss": 0.6448, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.223050594329834, "rewards/margins": 0.21549400687217712, "rewards/rejected": -1.438544750213623, "step": 5210 }, { "epoch": 0.9, "grad_norm": 2.578125, "learning_rate": 1.5220062888360172e-07, "logits/chosen": -3.02173113822937, "logits/rejected": -3.0115773677825928, "logps/chosen": -178.15750122070312, "logps/rejected": -200.08419799804688, "loss": 0.6451, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.159388780593872, "rewards/margins": 0.20702338218688965, "rewards/rejected": -1.3664120435714722, "step": 5220 }, { "epoch": 0.9, "grad_norm": 2.890625, "learning_rate": 1.4707549159226425e-07, "logits/chosen": -3.0073437690734863, "logits/rejected": -3.002361297607422, "logps/chosen": -185.7303466796875, "logps/rejected": -219.1875762939453, "loss": 0.5854, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1822620630264282, "rewards/margins": 0.3672925531864166, "rewards/rejected": -1.549554467201233, "step": 5230 }, { "epoch": 0.9, "grad_norm": 2.9375, "learning_rate": 1.4203551372074382e-07, "logits/chosen": -2.993562698364258, "logits/rejected": -2.979778289794922, "logps/chosen": -186.5539093017578, "logps/rejected": -229.2770233154297, "loss": 0.5627, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.231220006942749, "rewards/margins": 0.4505365788936615, "rewards/rejected": -1.681756615638733, "step": 5240 }, { "epoch": 0.9, "grad_norm": 3.5, "learning_rate": 1.3708087768100897e-07, "logits/chosen": -2.997523546218872, "logits/rejected": -2.9901375770568848, "logps/chosen": -180.35211181640625, "logps/rejected": -211.8171844482422, "loss": 0.5884, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.1825653314590454, "rewards/margins": 0.3402937054634094, "rewards/rejected": -1.52285897731781, "step": 5250 }, { "epoch": 0.91, "grad_norm": 3.3125, "learning_rate": 1.3221176279625047e-07, "logits/chosen": -3.0125765800476074, "logits/rejected": -3.005173444747925, "logps/chosen": -180.85592651367188, "logps/rejected": -203.88833618164062, "loss": 0.6035, "rewards/accuracies": 0.65625, "rewards/chosen": -1.139540195465088, "rewards/margins": 0.3113035261631012, "rewards/rejected": -1.4508435726165771, "step": 5260 }, { "epoch": 0.91, "grad_norm": 2.671875, "learning_rate": 1.2742834529439112e-07, "logits/chosen": -3.024702548980713, "logits/rejected": -3.0192930698394775, "logps/chosen": -182.72805786132812, "logps/rejected": -210.27511596679688, "loss": 0.6119, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.197000503540039, "rewards/margins": 0.30947452783584595, "rewards/rejected": -1.5064748525619507, "step": 5270 }, { "epoch": 0.91, "grad_norm": 3.078125, "learning_rate": 1.2273079830170787e-07, "logits/chosen": -3.0090789794921875, "logits/rejected": -2.996187925338745, "logps/chosen": -193.37173461914062, "logps/rejected": -212.6551055908203, "loss": 0.6347, "rewards/accuracies": 0.625, "rewards/chosen": -1.2886271476745605, "rewards/margins": 0.25257769227027893, "rewards/rejected": -1.5412046909332275, "step": 5280 }, { "epoch": 0.91, "grad_norm": 3.109375, "learning_rate": 1.181192918365645e-07, "logits/chosen": -2.9993698596954346, "logits/rejected": -2.9934823513031006, "logps/chosen": -187.09957885742188, "logps/rejected": -205.87130737304688, "loss": 0.6187, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.1988855600357056, "rewards/margins": 0.26499563455581665, "rewards/rejected": -1.463881254196167, "step": 5290 }, { "epoch": 0.91, "grad_norm": 3.34375, "learning_rate": 1.1359399280326034e-07, "logits/chosen": -3.008162021636963, "logits/rejected": -2.9974722862243652, "logps/chosen": -188.11373901367188, "logps/rejected": -216.1291046142578, "loss": 0.5963, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.2095937728881836, "rewards/margins": 0.32107558846473694, "rewards/rejected": -1.5306695699691772, "step": 5300 }, { "epoch": 0.91, "eval_logits/chosen": -3.022860288619995, "eval_logits/rejected": -3.0166664123535156, "eval_logps/chosen": -167.0541229248047, "eval_logps/rejected": -187.92852783203125, "eval_loss": 0.6464575529098511, "eval_rewards/accuracies": 0.6198884844779968, "eval_rewards/chosen": -0.9564412236213684, "eval_rewards/margins": 0.17177370190620422, "eval_rewards/rejected": -1.128214955329895, "eval_runtime": 483.9372, "eval_samples_per_second": 8.894, "eval_steps_per_second": 1.112, "step": 5300 }, { "epoch": 0.91, "grad_norm": 2.59375, "learning_rate": 1.0915506498598711e-07, "logits/chosen": -3.0100769996643066, "logits/rejected": -3.0087945461273193, "logps/chosen": -195.86776733398438, "logps/rejected": -212.3857421875, "loss": 0.6352, "rewards/accuracies": 0.625, "rewards/chosen": -1.274717092514038, "rewards/margins": 0.24112406373023987, "rewards/rejected": -1.5158412456512451, "step": 5310 }, { "epoch": 0.92, "grad_norm": 3.03125, "learning_rate": 1.0480266904290298e-07, "logits/chosen": -3.0121946334838867, "logits/rejected": -3.0049402713775635, "logps/chosen": -190.0812530517578, "logps/rejected": -210.83505249023438, "loss": 0.6051, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.2137463092803955, "rewards/margins": 0.3173975646495819, "rewards/rejected": -1.5311439037322998, "step": 5320 }, { "epoch": 0.92, "grad_norm": 2.96875, "learning_rate": 1.0053696250031803e-07, "logits/chosen": -3.0021157264709473, "logits/rejected": -2.9936251640319824, "logps/chosen": -179.1097869873047, "logps/rejected": -220.5721435546875, "loss": 0.5729, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1843111515045166, "rewards/margins": 0.4195634722709656, "rewards/rejected": -1.6038745641708374, "step": 5330 }, { "epoch": 0.92, "grad_norm": 3.109375, "learning_rate": 9.635809974698929e-08, "logits/chosen": -3.019029140472412, "logits/rejected": -3.0142836570739746, "logps/chosen": -183.40927124023438, "logps/rejected": -204.5322723388672, "loss": 0.5946, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.1520172357559204, "rewards/margins": 0.29836225509643555, "rewards/rejected": -1.4503793716430664, "step": 5340 }, { "epoch": 0.92, "grad_norm": 2.46875, "learning_rate": 9.22662320285389e-08, "logits/chosen": -3.005446672439575, "logits/rejected": -3.004295825958252, "logps/chosen": -184.45816040039062, "logps/rejected": -205.4375762939453, "loss": 0.6337, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.210837483406067, "rewards/margins": 0.25519412755966187, "rewards/rejected": -1.4660316705703735, "step": 5350 }, { "epoch": 0.92, "grad_norm": 2.84375, "learning_rate": 8.826150744197403e-08, "logits/chosen": -3.0155885219573975, "logits/rejected": -3.0053577423095703, "logps/chosen": -187.87252807617188, "logps/rejected": -221.6996307373047, "loss": 0.6039, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.2338807582855225, "rewards/margins": 0.3368605971336365, "rewards/rejected": -1.5707414150238037, "step": 5360 }, { "epoch": 0.93, "grad_norm": 3.171875, "learning_rate": 8.434407093033225e-08, "logits/chosen": -3.018979549407959, "logits/rejected": -3.0176680088043213, "logps/chosen": -180.98257446289062, "logps/rejected": -201.25579833984375, "loss": 0.6407, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.176179051399231, "rewards/margins": 0.22094354033470154, "rewards/rejected": -1.3971226215362549, "step": 5370 }, { "epoch": 0.93, "grad_norm": 3.03125, "learning_rate": 8.051406427743047e-08, "logits/chosen": -3.0299999713897705, "logits/rejected": -3.0229012966156006, "logps/chosen": -188.4458770751953, "logps/rejected": -206.90756225585938, "loss": 0.6086, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.190413475036621, "rewards/margins": 0.28840866684913635, "rewards/rejected": -1.4788219928741455, "step": 5380 }, { "epoch": 0.93, "grad_norm": 2.453125, "learning_rate": 7.677162610273819e-08, "logits/chosen": -2.9983856678009033, "logits/rejected": -2.9884772300720215, "logps/chosen": -191.33926391601562, "logps/rejected": -213.33740234375, "loss": 0.6093, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2295373678207397, "rewards/margins": 0.3071189522743225, "rewards/rejected": -1.536656379699707, "step": 5390 }, { "epoch": 0.93, "grad_norm": 4.375, "learning_rate": 7.311689185635573e-08, "logits/chosen": -3.0020601749420166, "logits/rejected": -2.992455005645752, "logps/chosen": -179.33995056152344, "logps/rejected": -216.6887664794922, "loss": 0.5921, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.176435112953186, "rewards/margins": 0.3754914402961731, "rewards/rejected": -1.5519264936447144, "step": 5400 }, { "epoch": 0.93, "eval_logits/chosen": -3.0195529460906982, "eval_logits/rejected": -3.0133368968963623, "eval_logps/chosen": -167.09963989257812, "eval_logps/rejected": -188.02737426757812, "eval_loss": 0.6461929678916931, "eval_rewards/accuracies": 0.6198884844779968, "eval_rewards/chosen": -0.9568961262702942, "eval_rewards/margins": 0.17230701446533203, "eval_rewards/rejected": -1.129203200340271, "eval_runtime": 483.9337, "eval_samples_per_second": 8.894, "eval_steps_per_second": 1.112, "step": 5400 }, { "epoch": 0.93, "grad_norm": 4.25, "learning_rate": 6.954999381411642e-08, "logits/chosen": -3.018542766571045, "logits/rejected": -3.0133025646209717, "logps/chosen": -197.008056640625, "logps/rejected": -207.00283813476562, "loss": 0.6865, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2879469394683838, "rewards/margins": 0.1416289359331131, "rewards/rejected": -1.429575800895691, "step": 5410 }, { "epoch": 0.93, "grad_norm": 3.3125, "learning_rate": 6.607106107279604e-08, "logits/chosen": -3.0142135620117188, "logits/rejected": -3.010924816131592, "logps/chosen": -190.42416381835938, "logps/rejected": -209.90029907226562, "loss": 0.6485, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.2501780986785889, "rewards/margins": 0.2251962423324585, "rewards/rejected": -1.4753742218017578, "step": 5420 }, { "epoch": 0.94, "grad_norm": 3.203125, "learning_rate": 6.268021954544095e-08, "logits/chosen": -3.016510486602783, "logits/rejected": -3.0054919719696045, "logps/chosen": -187.24249267578125, "logps/rejected": -215.45034790039062, "loss": 0.6014, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.2415316104888916, "rewards/margins": 0.3258061408996582, "rewards/rejected": -1.5673377513885498, "step": 5430 }, { "epoch": 0.94, "grad_norm": 2.890625, "learning_rate": 5.9377591956812364e-08, "logits/chosen": -3.003411054611206, "logits/rejected": -2.9964089393615723, "logps/chosen": -188.05709838867188, "logps/rejected": -215.2018585205078, "loss": 0.6064, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.21588134765625, "rewards/margins": 0.3087596893310547, "rewards/rejected": -1.5246409177780151, "step": 5440 }, { "epoch": 0.94, "grad_norm": 5.125, "learning_rate": 5.6163297838942866e-08, "logits/chosen": -3.0072951316833496, "logits/rejected": -3.0005078315734863, "logps/chosen": -187.33633422851562, "logps/rejected": -208.70126342773438, "loss": 0.6291, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.2057554721832275, "rewards/margins": 0.26242905855178833, "rewards/rejected": -1.4681843519210815, "step": 5450 }, { "epoch": 0.94, "grad_norm": 2.859375, "learning_rate": 5.30374535268105e-08, "logits/chosen": -3.0079214572906494, "logits/rejected": -2.997767448425293, "logps/chosen": -185.90853881835938, "logps/rejected": -203.79953002929688, "loss": 0.6331, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.2154641151428223, "rewards/margins": 0.21819797158241272, "rewards/rejected": -1.4336621761322021, "step": 5460 }, { "epoch": 0.94, "grad_norm": 3.03125, "learning_rate": 5.0000172154129887e-08, "logits/chosen": -3.0055441856384277, "logits/rejected": -3.005470037460327, "logps/chosen": -186.6962432861328, "logps/rejected": -203.15626525878906, "loss": 0.6712, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2509243488311768, "rewards/margins": 0.1913340985774994, "rewards/rejected": -1.4422584772109985, "step": 5470 }, { "epoch": 0.94, "grad_norm": 2.671875, "learning_rate": 4.705156364925467e-08, "logits/chosen": -2.9974427223205566, "logits/rejected": -2.98425555229187, "logps/chosen": -182.87705993652344, "logps/rejected": -215.0094757080078, "loss": 0.5791, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.190344214439392, "rewards/margins": 0.36809319257736206, "rewards/rejected": -1.5584375858306885, "step": 5480 }, { "epoch": 0.95, "grad_norm": 2.78125, "learning_rate": 4.419173473120236e-08, "logits/chosen": -2.9959757328033447, "logits/rejected": -2.9869399070739746, "logps/chosen": -183.10800170898438, "logps/rejected": -200.63238525390625, "loss": 0.6302, "rewards/accuracies": 0.65625, "rewards/chosen": -1.1613762378692627, "rewards/margins": 0.24263951182365417, "rewards/rejected": -1.4040155410766602, "step": 5490 }, { "epoch": 0.95, "grad_norm": 3.109375, "learning_rate": 4.142078890578827e-08, "logits/chosen": -3.0294241905212402, "logits/rejected": -3.022934913635254, "logps/chosen": -181.2027587890625, "logps/rejected": -211.42294311523438, "loss": 0.6015, "rewards/accuracies": 0.65625, "rewards/chosen": -1.169240117073059, "rewards/margins": 0.318584144115448, "rewards/rejected": -1.4878242015838623, "step": 5500 }, { "epoch": 0.95, "eval_logits/chosen": -3.0226480960845947, "eval_logits/rejected": -3.016446352005005, "eval_logps/chosen": -167.10562133789062, "eval_logps/rejected": -188.0281982421875, "eval_loss": 0.6462621092796326, "eval_rewards/accuracies": 0.6191914677619934, "eval_rewards/chosen": -0.9569559097290039, "eval_rewards/margins": 0.17225554585456848, "eval_rewards/rejected": -1.1292115449905396, "eval_runtime": 484.0094, "eval_samples_per_second": 8.892, "eval_steps_per_second": 1.112, "step": 5500 }, { "epoch": 0.95, "grad_norm": 2.78125, "learning_rate": 3.873882646188265e-08, "logits/chosen": -3.007052183151245, "logits/rejected": -3.0012123584747314, "logps/chosen": -197.01986694335938, "logps/rejected": -216.42385864257812, "loss": 0.6467, "rewards/accuracies": 0.65625, "rewards/chosen": -1.3407747745513916, "rewards/margins": 0.21489660441875458, "rewards/rejected": -1.5556714534759521, "step": 5510 }, { "epoch": 0.95, "grad_norm": 2.546875, "learning_rate": 3.6145944467777525e-08, "logits/chosen": -3.0063059329986572, "logits/rejected": -2.997032642364502, "logps/chosen": -184.0959014892578, "logps/rejected": -211.39956665039062, "loss": 0.5849, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.157877802848816, "rewards/margins": 0.3614843785762787, "rewards/rejected": -1.519362211227417, "step": 5520 }, { "epoch": 0.95, "grad_norm": 3.5625, "learning_rate": 3.364223676767725e-08, "logits/chosen": -3.0024030208587646, "logits/rejected": -2.994372606277466, "logps/chosen": -192.8942108154297, "logps/rejected": -207.86978149414062, "loss": 0.633, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.220339298248291, "rewards/margins": 0.2580941915512085, "rewards/rejected": -1.4784334897994995, "step": 5530 }, { "epoch": 0.95, "grad_norm": 3.078125, "learning_rate": 3.122779397829845e-08, "logits/chosen": -3.0149142742156982, "logits/rejected": -3.007732391357422, "logps/chosen": -181.58314514160156, "logps/rejected": -215.8975830078125, "loss": 0.5995, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.1833419799804688, "rewards/margins": 0.33634015917778015, "rewards/rejected": -1.5196820497512817, "step": 5540 }, { "epoch": 0.96, "grad_norm": 3.21875, "learning_rate": 2.8902703485593208e-08, "logits/chosen": -2.98718523979187, "logits/rejected": -2.982654094696045, "logps/chosen": -184.0675811767578, "logps/rejected": -203.8521270751953, "loss": 0.6588, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2190674543380737, "rewards/margins": 0.1979176253080368, "rewards/rejected": -1.4169851541519165, "step": 5550 }, { "epoch": 0.96, "grad_norm": 3.578125, "learning_rate": 2.666704944158438e-08, "logits/chosen": -3.010056734085083, "logits/rejected": -3.002088785171509, "logps/chosen": -179.8453369140625, "logps/rejected": -197.3083038330078, "loss": 0.634, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.1680803298950195, "rewards/margins": 0.23168618977069855, "rewards/rejected": -1.399766445159912, "step": 5560 }, { "epoch": 0.96, "grad_norm": 2.9375, "learning_rate": 2.4520912761320515e-08, "logits/chosen": -3.004617214202881, "logits/rejected": -3.0041353702545166, "logps/chosen": -184.72076416015625, "logps/rejected": -206.00390625, "loss": 0.6545, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.2311315536499023, "rewards/margins": 0.1838848888874054, "rewards/rejected": -1.4150164127349854, "step": 5570 }, { "epoch": 0.96, "grad_norm": 2.578125, "learning_rate": 2.2464371119947926e-08, "logits/chosen": -3.0113699436187744, "logits/rejected": -3.001690626144409, "logps/chosen": -186.34310913085938, "logps/rejected": -216.634765625, "loss": 0.6002, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.2307337522506714, "rewards/margins": 0.3142867386341095, "rewards/rejected": -1.545020580291748, "step": 5580 }, { "epoch": 0.96, "grad_norm": 3.515625, "learning_rate": 2.049749894989822e-08, "logits/chosen": -3.020048141479492, "logits/rejected": -3.014925479888916, "logps/chosen": -191.5043487548828, "logps/rejected": -217.0632781982422, "loss": 0.6179, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.2349369525909424, "rewards/margins": 0.29942673444747925, "rewards/rejected": -1.534363865852356, "step": 5590 }, { "epoch": 0.96, "grad_norm": 3.3125, "learning_rate": 1.8620367438194898e-08, "logits/chosen": -3.0188074111938477, "logits/rejected": -3.011625051498413, "logps/chosen": -184.4583282470703, "logps/rejected": -216.49002075195312, "loss": 0.6148, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.2120471000671387, "rewards/margins": 0.2991730570793152, "rewards/rejected": -1.511220097541809, "step": 5600 }, { "epoch": 0.96, "eval_logits/chosen": -3.0204532146453857, "eval_logits/rejected": -3.0142104625701904, "eval_logps/chosen": -166.8396453857422, "eval_logps/rejected": -187.79342651367188, "eval_loss": 0.6461296081542969, "eval_rewards/accuracies": 0.6194238066673279, "eval_rewards/chosen": -0.9542962312698364, "eval_rewards/margins": 0.17256729304790497, "eval_rewards/rejected": -1.1268635988235474, "eval_runtime": 483.9014, "eval_samples_per_second": 8.894, "eval_steps_per_second": 1.112, "step": 5600 }, { "epoch": 0.97, "grad_norm": 2.9375, "learning_rate": 1.683304452387763e-08, "logits/chosen": -3.016793727874756, "logits/rejected": -3.012073040008545, "logps/chosen": -180.9460906982422, "logps/rejected": -219.7074432373047, "loss": 0.5845, "rewards/accuracies": 0.71875, "rewards/chosen": -1.1847233772277832, "rewards/margins": 0.37846142053604126, "rewards/rejected": -1.5631848573684692, "step": 5610 }, { "epoch": 0.97, "grad_norm": 3.859375, "learning_rate": 1.5135594895542005e-08, "logits/chosen": -2.995067834854126, "logits/rejected": -2.9892044067382812, "logps/chosen": -192.774169921875, "logps/rejected": -210.0087890625, "loss": 0.6356, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.2736603021621704, "rewards/margins": 0.2315937578678131, "rewards/rejected": -1.5052540302276611, "step": 5620 }, { "epoch": 0.97, "grad_norm": 2.671875, "learning_rate": 1.352807998899891e-08, "logits/chosen": -3.013383388519287, "logits/rejected": -3.0054855346679688, "logps/chosen": -187.69821166992188, "logps/rejected": -210.5667266845703, "loss": 0.6201, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.2161115407943726, "rewards/margins": 0.2969212830066681, "rewards/rejected": -1.5130326747894287, "step": 5630 }, { "epoch": 0.97, "grad_norm": 2.84375, "learning_rate": 1.2010557985051297e-08, "logits/chosen": -3.013478994369507, "logits/rejected": -3.0061988830566406, "logps/chosen": -179.22596740722656, "logps/rejected": -209.5537872314453, "loss": 0.6138, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.1337904930114746, "rewards/margins": 0.31256765127182007, "rewards/rejected": -1.4463579654693604, "step": 5640 }, { "epoch": 0.97, "grad_norm": 2.4375, "learning_rate": 1.0583083807387818e-08, "logits/chosen": -3.0160446166992188, "logits/rejected": -3.0032248497009277, "logps/chosen": -178.92898559570312, "logps/rejected": -210.04541015625, "loss": 0.6143, "rewards/accuracies": 0.65625, "rewards/chosen": -1.179724097251892, "rewards/margins": 0.3233483135700226, "rewards/rejected": -1.5030725002288818, "step": 5650 }, { "epoch": 0.98, "grad_norm": 3.3125, "learning_rate": 9.245709120595526e-09, "logits/chosen": -3.0110814571380615, "logits/rejected": -2.997427225112915, "logps/chosen": -183.3815155029297, "logps/rejected": -215.2753143310547, "loss": 0.6007, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.2159018516540527, "rewards/margins": 0.34620875120162964, "rewards/rejected": -1.5621105432510376, "step": 5660 }, { "epoch": 0.98, "grad_norm": 2.359375, "learning_rate": 7.998482328289702e-09, "logits/chosen": -3.0085549354553223, "logits/rejected": -2.997692823410034, "logps/chosen": -177.83279418945312, "logps/rejected": -199.79359436035156, "loss": 0.6171, "rewards/accuracies": 0.65625, "rewards/chosen": -1.1252058744430542, "rewards/margins": 0.2588183283805847, "rewards/rejected": -1.3840242624282837, "step": 5670 }, { "epoch": 0.98, "grad_norm": 4.65625, "learning_rate": 6.841448571361376e-09, "logits/chosen": -3.002760648727417, "logits/rejected": -3.0002365112304688, "logps/chosen": -186.73818969726562, "logps/rejected": -208.52145385742188, "loss": 0.6087, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.2175891399383545, "rewards/margins": 0.27791762351989746, "rewards/rejected": -1.495506763458252, "step": 5680 }, { "epoch": 0.98, "grad_norm": 2.890625, "learning_rate": 5.774649726345283e-09, "logits/chosen": -3.0148816108703613, "logits/rejected": -3.000577449798584, "logps/chosen": -188.50425720214844, "logps/rejected": -214.07492065429688, "loss": 0.5782, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1863806247711182, "rewards/margins": 0.35773932933807373, "rewards/rejected": -1.5441200733184814, "step": 5690 }, { "epoch": 0.98, "grad_norm": 5.34375, "learning_rate": 4.798124403902205e-09, "logits/chosen": -3.0013067722320557, "logits/rejected": -2.9927356243133545, "logps/chosen": -184.54940795898438, "logps/rejected": -203.5726776123047, "loss": 0.6299, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.1662585735321045, "rewards/margins": 0.25036853551864624, "rewards/rejected": -1.4166268110275269, "step": 5700 }, { "epoch": 0.98, "eval_logits/chosen": -3.022794485092163, "eval_logits/rejected": -3.0166068077087402, "eval_logps/chosen": -166.8362579345703, "eval_logps/rejected": -187.7362518310547, "eval_loss": 0.6462457776069641, "eval_rewards/accuracies": 0.6194238066673279, "eval_rewards/chosen": -0.9542624354362488, "eval_rewards/margins": 0.1720295399427414, "eval_rewards/rejected": -1.1262919902801514, "eval_runtime": 483.9007, "eval_samples_per_second": 8.894, "eval_steps_per_second": 1.112, "step": 5700 }, { "epoch": 0.98, "grad_norm": 3.0, "learning_rate": 3.911907947422577e-09, "logits/chosen": -3.0133697986602783, "logits/rejected": -3.0069236755371094, "logps/chosen": -186.69503784179688, "logps/rejected": -214.79434204101562, "loss": 0.5947, "rewards/accuracies": 0.65625, "rewards/chosen": -1.2144420146942139, "rewards/margins": 0.33198100328445435, "rewards/rejected": -1.5464229583740234, "step": 5710 }, { "epoch": 0.99, "grad_norm": 3.25, "learning_rate": 3.116032431747518e-09, "logits/chosen": -3.001695156097412, "logits/rejected": -2.993189811706543, "logps/chosen": -187.8595428466797, "logps/rejected": -216.593994140625, "loss": 0.601, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2452127933502197, "rewards/margins": 0.34862878918647766, "rewards/rejected": -1.5938416719436646, "step": 5720 }, { "epoch": 0.99, "grad_norm": 3.078125, "learning_rate": 2.410526662007251e-09, "logits/chosen": -3.010476589202881, "logits/rejected": -3.004617691040039, "logps/chosen": -182.94322204589844, "logps/rejected": -204.91983032226562, "loss": 0.6326, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.2004114389419556, "rewards/margins": 0.2473103255033493, "rewards/rejected": -1.4477218389511108, "step": 5730 }, { "epoch": 0.99, "grad_norm": 2.890625, "learning_rate": 1.7954161725791674e-09, "logits/chosen": -3.0005202293395996, "logits/rejected": -2.9857637882232666, "logps/chosen": -198.15444946289062, "logps/rejected": -229.3535614013672, "loss": 0.5853, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.2947299480438232, "rewards/margins": 0.38488340377807617, "rewards/rejected": -1.679613471031189, "step": 5740 }, { "epoch": 0.99, "grad_norm": 4.03125, "learning_rate": 1.270723226163284e-09, "logits/chosen": -3.02968168258667, "logits/rejected": -3.025573253631592, "logps/chosen": -192.98440551757812, "logps/rejected": -201.9522705078125, "loss": 0.6502, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.2475230693817139, "rewards/margins": 0.1881568729877472, "rewards/rejected": -1.4356797933578491, "step": 5750 }, { "epoch": 0.99, "grad_norm": 2.71875, "learning_rate": 8.364668129762221e-10, "logits/chosen": -3.0103919506073, "logits/rejected": -3.0001320838928223, "logps/chosen": -188.3455352783203, "logps/rejected": -213.83786010742188, "loss": 0.6157, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.2158849239349365, "rewards/margins": 0.2838238775730133, "rewards/rejected": -1.499708890914917, "step": 5760 }, { "epoch": 0.99, "grad_norm": 3.109375, "learning_rate": 4.926626500648124e-10, "logits/chosen": -2.9988603591918945, "logits/rejected": -2.9865429401397705, "logps/chosen": -179.5717010498047, "logps/rejected": -208.49853515625, "loss": 0.6028, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1670845746994019, "rewards/margins": 0.2915195822715759, "rewards/rejected": -1.4586042165756226, "step": 5770 }, { "epoch": 1.0, "grad_norm": 3.453125, "learning_rate": 2.393231807362728e-10, "logits/chosen": -3.005258798599243, "logits/rejected": -2.994863986968994, "logps/chosen": -185.7277374267578, "logps/rejected": -214.2324981689453, "loss": 0.6172, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.2268074750900269, "rewards/margins": 0.2944895625114441, "rewards/rejected": -1.5212970972061157, "step": 5780 }, { "epoch": 1.0, "grad_norm": 2.640625, "learning_rate": 7.645757410912336e-11, "logits/chosen": -2.9988582134246826, "logits/rejected": -2.9891772270202637, "logps/chosen": -166.65049743652344, "logps/rejected": -200.60191345214844, "loss": 0.5918, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.0588618516921997, "rewards/margins": 0.32463932037353516, "rewards/rejected": -1.3835010528564453, "step": 5790 }, { "epoch": 1.0, "grad_norm": 2.78125, "learning_rate": 4.071724779286523e-12, "logits/chosen": -3.0157129764556885, "logits/rejected": -3.0100326538085938, "logps/chosen": -174.407958984375, "logps/rejected": -209.0814208984375, "loss": 0.5854, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.1314672231674194, "rewards/margins": 0.36212995648384094, "rewards/rejected": -1.4935972690582275, "step": 5800 }, { "epoch": 1.0, "eval_logits/chosen": -3.0223519802093506, "eval_logits/rejected": -3.0161619186401367, "eval_logps/chosen": -167.01016235351562, "eval_logps/rejected": -187.9011993408203, "eval_loss": 0.6463221311569214, "eval_rewards/accuracies": 0.6203531622886658, "eval_rewards/chosen": -0.9560015797615051, "eval_rewards/margins": 0.17193979024887085, "eval_rewards/rejected": -1.127941370010376, "eval_runtime": 483.8696, "eval_samples_per_second": 8.895, "eval_steps_per_second": 1.112, "step": 5800 }, { "epoch": 1.0, "step": 5803, "total_flos": 0.0, "train_loss": 0.6317814285541924, "train_runtime": 53813.0859, "train_samples_per_second": 1.726, "train_steps_per_second": 0.108 } ], "logging_steps": 10, "max_steps": 5803, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }