{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 100, "global_step": 1250, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 4e-08, "logits/chosen": -1.8503975868225098, "logits/rejected": -1.8503975868225098, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.4075, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.04, "learning_rate": 4.0000000000000003e-07, "logits/chosen": -1.8588156700134277, "logits/rejected": -1.8588156700134277, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3636, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 10 }, { "epoch": 0.08, "learning_rate": 8.000000000000001e-07, "logits/chosen": -1.970517873764038, "logits/rejected": -1.970517873764038, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3902, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 20 }, { "epoch": 0.12, "learning_rate": 1.2000000000000002e-06, "logits/chosen": -1.9209930896759033, "logits/rejected": -1.9209930896759033, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3482, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 30 }, { "epoch": 0.16, "learning_rate": 1.6000000000000001e-06, "logits/chosen": -1.883547067642212, "logits/rejected": -1.883547067642212, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3507, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 40 }, { "epoch": 0.2, "learning_rate": 2.0000000000000003e-06, "logits/chosen": -1.9128715991973877, "logits/rejected": -1.9128715991973877, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3359, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 50 }, { "epoch": 0.24, "learning_rate": 2.4000000000000003e-06, "logits/chosen": -2.0107295513153076, "logits/rejected": -2.0107295513153076, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3828, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 60 }, { "epoch": 0.28, "learning_rate": 2.8000000000000003e-06, "logits/chosen": -1.9920228719711304, "logits/rejected": -1.9920228719711304, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3112, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 70 }, { "epoch": 0.32, "learning_rate": 3.2000000000000003e-06, "logits/chosen": -1.8801155090332031, "logits/rejected": -1.8801155090332031, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3778, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 80 }, { "epoch": 0.36, "learning_rate": 3.6000000000000003e-06, "logits/chosen": -2.050198793411255, "logits/rejected": -2.050198793411255, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3655, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 90 }, { "epoch": 0.4, "learning_rate": 4.000000000000001e-06, "logits/chosen": -1.8852717876434326, "logits/rejected": -1.8852717876434326, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3803, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 100 }, { "epoch": 0.4, "eval_logits/chosen": -1.9617642164230347, "eval_logits/rejected": -1.8066532611846924, "eval_logps/chosen": -266.6976013183594, "eval_logps/rejected": -254.9398193359375, "eval_loss": 0.053734518587589264, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": 0.0, "eval_rewards/margins": 0.0, "eval_rewards/rejected": 0.0, "eval_runtime": 700.7393, "eval_samples_per_second": 2.854, "eval_steps_per_second": 1.427, "step": 100 }, { "epoch": 0.44, "learning_rate": 4.4e-06, "logits/chosen": -1.731688141822815, "logits/rejected": -1.731688141822815, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.2717, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 110 }, { "epoch": 0.48, "learning_rate": 4.800000000000001e-06, "logits/chosen": -1.8530235290527344, "logits/rejected": -1.8530235290527344, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3482, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 120 }, { "epoch": 0.52, "learning_rate": 4.999756310023261e-06, "logits/chosen": -2.0225424766540527, "logits/rejected": -2.0225424766540527, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3507, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 130 }, { "epoch": 0.56, "learning_rate": 4.997807075247147e-06, "logits/chosen": -1.8995482921600342, "logits/rejected": -1.8995482921600342, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3186, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 140 }, { "epoch": 0.6, "learning_rate": 4.993910125649561e-06, "logits/chosen": -1.8702564239501953, "logits/rejected": -1.8493874073028564, "logps/chosen": -4.896004676818848, "logps/rejected": -1.6084611415863037, "loss": 0.3112, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 150 }, { "epoch": 0.64, "learning_rate": 4.988068499954578e-06, "logits/chosen": -2.04287052154541, "logits/rejected": -2.04287052154541, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3383, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 160 }, { "epoch": 0.68, "learning_rate": 4.980286753286196e-06, "logits/chosen": -1.8564621210098267, "logits/rejected": -1.8564621210098267, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3531, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 170 }, { "epoch": 0.72, "learning_rate": 4.970570953616383e-06, "logits/chosen": -1.9510726928710938, "logits/rejected": -1.9173896312713623, "logps/chosen": -12.76134967803955, "logps/rejected": -5.861204624176025, "loss": 0.3393, "rewards/accuracies": 0.02500000037252903, "rewards/chosen": -0.029967620968818665, "rewards/margins": 0.002692684531211853, "rewards/rejected": -0.03266030550003052, "step": 180 }, { "epoch": 0.76, "learning_rate": 4.958928677033465e-06, "logits/chosen": -1.8616416454315186, "logits/rejected": -1.8616416454315186, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3383, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 190 }, { "epoch": 0.8, "learning_rate": 4.9453690018345144e-06, "logits/chosen": -1.911077857017517, "logits/rejected": -1.9127223491668701, "logps/chosen": -5.607743740081787, "logps/rejected": -6.2597527503967285, "loss": 0.2732, "rewards/accuracies": 0.0, "rewards/chosen": -9.019851859193295e-05, "rewards/margins": -0.0019743461161851883, "rewards/rejected": 0.0018841475248336792, "step": 200 }, { "epoch": 0.8, "eval_logits/chosen": -1.9952213764190674, "eval_logits/rejected": -1.8367009162902832, "eval_logps/chosen": -270.7552795410156, "eval_logps/rejected": -259.2743835449219, "eval_loss": 0.05847138166427612, "eval_rewards/accuracies": 0.4404999911785126, "eval_rewards/chosen": -0.040576834231615067, "eval_rewards/margins": 0.0027689056005328894, "eval_rewards/rejected": -0.04334573447704315, "eval_runtime": 702.202, "eval_samples_per_second": 2.848, "eval_steps_per_second": 1.424, "step": 200 }, { "epoch": 0.84, "learning_rate": 4.9299025014463665e-06, "logits/chosen": -1.871522307395935, "logits/rejected": -1.871522307395935, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3062, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 210 }, { "epoch": 0.88, "learning_rate": 4.912541236180779e-06, "logits/chosen": -2.014587640762329, "logits/rejected": -2.014587640762329, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3408, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 220 }, { "epoch": 0.92, "learning_rate": 4.893298743830168e-06, "logits/chosen": -1.9391746520996094, "logits/rejected": -1.9385459423065186, "logps/chosen": -2.180995464324951, "logps/rejected": -2.3552231788635254, "loss": 0.3505, "rewards/accuracies": 0.02500000037252903, "rewards/chosen": -0.007617546711117029, "rewards/margins": 0.00016982034139800817, "rewards/rejected": -0.007787366863340139, "step": 230 }, { "epoch": 0.96, "learning_rate": 4.8721900291112415e-06, "logits/chosen": -2.1026082038879395, "logits/rejected": -2.1026082038879395, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3211, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 240 }, { "epoch": 1.0, "learning_rate": 4.849231551964771e-06, "logits/chosen": -1.923709511756897, "logits/rejected": -1.923709511756897, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3581, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 250 }, { "epoch": 1.04, "learning_rate": 4.824441214720629e-06, "logits/chosen": -1.7751576900482178, "logits/rejected": -1.7751576900482178, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3778, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 260 }, { "epoch": 1.08, "learning_rate": 4.7978383481380865e-06, "logits/chosen": -1.8949896097183228, "logits/rejected": -1.8624740839004517, "logps/chosen": -13.423696517944336, "logps/rejected": -20.979846954345703, "loss": 1.1297, "rewards/accuracies": 0.02500000037252903, "rewards/chosen": -0.036591093987226486, "rewards/margins": 0.14725562930107117, "rewards/rejected": -0.18384674191474915, "step": 270 }, { "epoch": 1.12, "learning_rate": 4.769443696332272e-06, "logits/chosen": -1.9459644556045532, "logits/rejected": -1.9459644556045532, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3754, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 280 }, { "epoch": 1.16, "learning_rate": 4.7392794005985324e-06, "logits/chosen": -1.8699764013290405, "logits/rejected": -1.8699764013290405, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3087, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 290 }, { "epoch": 1.2, "learning_rate": 4.707368982147318e-06, "logits/chosen": -1.9606857299804688, "logits/rejected": -1.9606857299804688, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3013, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 300 }, { "epoch": 1.2, "eval_logits/chosen": -1.975152850151062, "eval_logits/rejected": -1.813112497329712, "eval_logps/chosen": -299.82257080078125, "eval_logps/rejected": -291.25750732421875, "eval_loss": 0.08001529425382614, "eval_rewards/accuracies": 0.4645000100135803, "eval_rewards/chosen": -0.3312495946884155, "eval_rewards/margins": 0.031927283853292465, "eval_rewards/rejected": -0.3631769120693207, "eval_runtime": 703.8743, "eval_samples_per_second": 2.841, "eval_steps_per_second": 1.421, "step": 300 }, { "epoch": 1.24, "learning_rate": 4.673737323763048e-06, "logits/chosen": -1.8876497745513916, "logits/rejected": -1.8876497745513916, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3383, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 310 }, { "epoch": 1.28, "learning_rate": 4.638410650401267e-06, "logits/chosen": -1.9741500616073608, "logits/rejected": -1.9741500616073608, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3951, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 320 }, { "epoch": 1.32, "learning_rate": 4.601416508739211e-06, "logits/chosen": -1.9750694036483765, "logits/rejected": -1.974765419960022, "logps/chosen": -2.6847405433654785, "logps/rejected": -4.218203067779541, "loss": 0.3257, "rewards/accuracies": 0.02500000037252903, "rewards/chosen": -0.012654995545744896, "rewards/margins": 0.01376216672360897, "rewards/rejected": -0.026417162269353867, "step": 330 }, { "epoch": 1.36, "learning_rate": 4.562783745695738e-06, "logits/chosen": -1.7031259536743164, "logits/rejected": -1.7031259536743164, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3408, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 340 }, { "epoch": 1.4, "learning_rate": 4.522542485937369e-06, "logits/chosen": -1.9031295776367188, "logits/rejected": -1.9031295776367188, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3754, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 350 }, { "epoch": 1.44, "learning_rate": 4.4807241083879774e-06, "logits/chosen": -1.9317785501480103, "logits/rejected": -1.9317785501480103, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3778, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 360 }, { "epoch": 1.48, "learning_rate": 4.437361221760449e-06, "logits/chosen": -1.8960540294647217, "logits/rejected": -1.8960540294647217, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3852, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 370 }, { "epoch": 1.52, "learning_rate": 4.3924876391293915e-06, "logits/chosen": -1.8491512537002563, "logits/rejected": -1.8491512537002563, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3852, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 380 }, { "epoch": 1.56, "learning_rate": 4.346138351564711e-06, "logits/chosen": -1.8366947174072266, "logits/rejected": -1.8366947174072266, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3408, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 390 }, { "epoch": 1.6, "learning_rate": 4.2983495008466285e-06, "logits/chosen": -1.9089797735214233, "logits/rejected": -1.9089797735214233, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3433, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 400 }, { "epoch": 1.6, "eval_logits/chosen": -1.9721235036849976, "eval_logits/rejected": -1.8102209568023682, "eval_logps/chosen": -300.3360595703125, "eval_logps/rejected": -291.88916015625, "eval_loss": 0.08119545131921768, "eval_rewards/accuracies": 0.4675000011920929, "eval_rewards/chosen": -0.33638474345207214, "eval_rewards/margins": 0.03310885280370712, "eval_rewards/rejected": -0.3694935739040375, "eval_runtime": 703.1815, "eval_samples_per_second": 2.844, "eval_steps_per_second": 1.422, "step": 400 }, { "epoch": 1.64, "learning_rate": 4.249158351283414e-06, "logits/chosen": -1.8887426853179932, "logits/rejected": -1.8887426853179932, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3186, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 410 }, { "epoch": 1.68, "learning_rate": 4.198603260653792e-06, "logits/chosen": -1.6066404581069946, "logits/rejected": -1.6066404581069946, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.284, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 420 }, { "epoch": 1.72, "learning_rate": 4.146723650296701e-06, "logits/chosen": -1.7861597537994385, "logits/rejected": -1.764651894569397, "logps/chosen": -6.981114864349365, "logps/rejected": -4.9876909255981445, "loss": 0.2566, "rewards/accuracies": 0.02500000037252903, "rewards/chosen": -0.020851103588938713, "rewards/margins": 0.012941191904246807, "rewards/rejected": -0.033792294561862946, "step": 430 }, { "epoch": 1.76, "learning_rate": 4.093559974371725e-06, "logits/chosen": -1.9894205331802368, "logits/rejected": -1.9900938272476196, "logps/chosen": -4.070672988891602, "logps/rejected": -14.303924560546875, "loss": 0.6156, "rewards/accuracies": 0.02500000037252903, "rewards/chosen": 0.015280509367585182, "rewards/margins": 0.09383808076381683, "rewards/rejected": -0.0785575658082962, "step": 440 }, { "epoch": 1.8, "learning_rate": 4.039153688314146e-06, "logits/chosen": -1.858513593673706, "logits/rejected": -1.858513593673706, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3408, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 450 }, { "epoch": 1.84, "learning_rate": 3.983547216509254e-06, "logits/chosen": -2.0184249877929688, "logits/rejected": -2.0184249877929688, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3211, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 460 }, { "epoch": 1.88, "learning_rate": 3.92678391921108e-06, "logits/chosen": -1.7378623485565186, "logits/rejected": -1.7378623485565186, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3087, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 470 }, { "epoch": 1.92, "learning_rate": 3.868908058731376e-06, "logits/chosen": -1.7385492324829102, "logits/rejected": -1.7385492324829102, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3606, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 480 }, { "epoch": 1.96, "learning_rate": 3.8099647649251984e-06, "logits/chosen": -1.9141228199005127, "logits/rejected": -1.9141228199005127, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3062, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 490 }, { "epoch": 2.0, "learning_rate": 3.7500000000000005e-06, "logits/chosen": -1.7758957147598267, "logits/rejected": -1.7758957147598267, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3606, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 500 }, { "epoch": 2.0, "eval_logits/chosen": -1.9969795942306519, "eval_logits/rejected": -1.8348422050476074, "eval_logps/chosen": -298.5122985839844, "eval_logps/rejected": -284.1371154785156, "eval_loss": 0.10996392369270325, "eval_rewards/accuracies": 0.3734999895095825, "eval_rewards/chosen": -0.3181473910808563, "eval_rewards/margins": -0.026174278929829597, "eval_rewards/rejected": -0.2919731140136719, "eval_runtime": 702.666, "eval_samples_per_second": 2.846, "eval_steps_per_second": 1.423, "step": 500 }, { "epoch": 2.04, "learning_rate": 3.689060522675689e-06, "logits/chosen": -1.7798951864242554, "logits/rejected": -1.7795917987823486, "logps/chosen": -2.3812079429626465, "logps/rejected": -3.1179988384246826, "loss": 0.2962, "rewards/accuracies": 0.02500000037252903, "rewards/chosen": -0.009619669988751411, "rewards/margins": 0.005795452743768692, "rewards/rejected": -0.015415122732520103, "step": 510 }, { "epoch": 2.08, "learning_rate": 3.627193851723577e-06, "logits/chosen": -1.9688339233398438, "logits/rejected": -1.9688339233398438, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3383, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 520 }, { "epoch": 2.12, "learning_rate": 3.564448228912682e-06, "logits/chosen": -2.014033794403076, "logits/rejected": -2.014033794403076, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3408, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 530 }, { "epoch": 2.16, "learning_rate": 3.5008725813922383e-06, "logits/chosen": -1.9099280834197998, "logits/rejected": -1.9099280834197998, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3926, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 540 }, { "epoch": 2.2, "learning_rate": 3.436516483539781e-06, "logits/chosen": -1.9664745330810547, "logits/rejected": -1.9664745330810547, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3235, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 550 }, { "epoch": 2.24, "learning_rate": 3.3714301183045382e-06, "logits/chosen": -1.767690658569336, "logits/rejected": -1.767690658569336, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3334, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 560 }, { "epoch": 2.28, "learning_rate": 3.3056642380762783e-06, "logits/chosen": -1.8395519256591797, "logits/rejected": -1.8395519256591797, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3581, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 570 }, { "epoch": 2.32, "learning_rate": 3.2392701251101172e-06, "logits/chosen": -1.9773021936416626, "logits/rejected": -1.9773021936416626, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3334, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 580 }, { "epoch": 2.36, "learning_rate": 3.1722995515381644e-06, "logits/chosen": -2.025939464569092, "logits/rejected": -2.025939464569092, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3754, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 590 }, { "epoch": 2.4, "learning_rate": 3.1048047389991693e-06, "logits/chosen": -2.046207904815674, "logits/rejected": -2.046207904815674, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3038, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 600 }, { "epoch": 2.4, "eval_logits/chosen": -1.9935928583145142, "eval_logits/rejected": -1.831691026687622, "eval_logps/chosen": -299.02557373046875, "eval_logps/rejected": -284.72607421875, "eval_loss": 0.10917193442583084, "eval_rewards/accuracies": 0.37700000405311584, "eval_rewards/chosen": -0.3232795298099518, "eval_rewards/margins": -0.025417106226086617, "eval_rewards/rejected": -0.2978624105453491, "eval_runtime": 703.1518, "eval_samples_per_second": 2.844, "eval_steps_per_second": 1.422, "step": 600 }, { "epoch": 2.44, "learning_rate": 3.0368383179176584e-06, "logits/chosen": -2.0336036682128906, "logits/rejected": -2.0336036682128906, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3211, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 610 }, { "epoch": 2.48, "learning_rate": 2.9684532864643123e-06, "logits/chosen": -1.7936891317367554, "logits/rejected": -1.7936891317367554, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3062, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 620 }, { "epoch": 2.52, "learning_rate": 2.8997029692295875e-06, "logits/chosen": -1.8989810943603516, "logits/rejected": -1.8989810943603516, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3457, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 630 }, { "epoch": 2.56, "learning_rate": 2.8306409756428067e-06, "logits/chosen": -1.7690341472625732, "logits/rejected": -1.7690341472625732, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3359, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 640 }, { "epoch": 2.6, "learning_rate": 2.761321158169134e-06, "logits/chosen": -1.6524708271026611, "logits/rejected": -1.6524708271026611, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.4099, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 650 }, { "epoch": 2.64, "learning_rate": 2.6917975703170466e-06, "logits/chosen": -1.835680365562439, "logits/rejected": -1.835680365562439, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3902, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 660 }, { "epoch": 2.68, "learning_rate": 2.6221244244890336e-06, "logits/chosen": -1.7938703298568726, "logits/rejected": -1.7727596759796143, "logps/chosen": -5.777490615844727, "logps/rejected": -3.1091294288635254, "loss": 0.296, "rewards/accuracies": 0.02500000037252903, "rewards/chosen": -0.008814861066639423, "rewards/margins": 0.006191821303218603, "rewards/rejected": -0.01500668190419674, "step": 670 }, { "epoch": 2.72, "learning_rate": 2.5523560497083927e-06, "logits/chosen": -1.8663837909698486, "logits/rejected": -1.8663837909698486, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3112, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 680 }, { "epoch": 2.76, "learning_rate": 2.482546849255096e-06, "logits/chosen": -1.8712513446807861, "logits/rejected": -1.8712513446807861, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3852, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 690 }, { "epoch": 2.8, "learning_rate": 2.4127512582437486e-06, "logits/chosen": -1.9196503162384033, "logits/rejected": -1.9196503162384033, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3161, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 700 }, { "epoch": 2.8, "eval_logits/chosen": -1.9965624809265137, "eval_logits/rejected": -1.8345471620559692, "eval_logps/chosen": -298.41583251953125, "eval_logps/rejected": -284.232177734375, "eval_loss": 0.10689055174589157, "eval_rewards/accuracies": 0.3799999952316284, "eval_rewards/chosen": -0.317182332277298, "eval_rewards/margins": -0.024258404970169067, "eval_rewards/rejected": -0.2929239571094513, "eval_runtime": 703.3493, "eval_samples_per_second": 2.844, "eval_steps_per_second": 1.422, "step": 700 }, { "epoch": 2.84, "learning_rate": 2.3430237011767166e-06, "logits/chosen": -1.9961488246917725, "logits/rejected": -1.9961488246917725, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3704, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 710 }, { "epoch": 2.88, "learning_rate": 2.2734185495055503e-06, "logits/chosen": -1.7007853984832764, "logits/rejected": -1.668602705001831, "logps/chosen": -22.714946746826172, "logps/rejected": -10.431631088256836, "loss": 0.5289, "rewards/accuracies": 0.0, "rewards/chosen": -0.12950357794761658, "rewards/margins": -0.05113900825381279, "rewards/rejected": -0.07836457341909409, "step": 720 }, { "epoch": 2.92, "learning_rate": 2.2039900792337477e-06, "logits/chosen": -1.8030157089233398, "logits/rejected": -1.8030157089233398, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3013, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 730 }, { "epoch": 2.96, "learning_rate": 2.134792428593971e-06, "logits/chosen": -1.9432337284088135, "logits/rejected": -1.9432337284088135, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3531, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 740 }, { "epoch": 3.0, "learning_rate": 2.0658795558326745e-06, "logits/chosen": -1.9992624521255493, "logits/rejected": -2.000507116317749, "logps/chosen": -6.309609889984131, "logps/rejected": -9.059834480285645, "loss": 0.2932, "rewards/accuracies": 0.02500000037252903, "rewards/chosen": -0.007108859717845917, "rewards/margins": 0.019007809460163116, "rewards/rejected": -0.026116669178009033, "step": 750 }, { "epoch": 3.04, "learning_rate": 1.997305197135089e-06, "logits/chosen": -1.9350688457489014, "logits/rejected": -1.9350688457489014, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3087, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 760 }, { "epoch": 3.08, "learning_rate": 1.9291228247233607e-06, "logits/chosen": -1.943377137184143, "logits/rejected": -1.943377137184143, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.2939, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 770 }, { "epoch": 3.12, "learning_rate": 1.8613856051605242e-06, "logits/chosen": -1.9116928577423096, "logits/rejected": -1.9116928577423096, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.4025, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 780 }, { "epoch": 3.16, "learning_rate": 1.7941463578928088e-06, "logits/chosen": -1.9572219848632812, "logits/rejected": -1.9572219848632812, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3038, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 790 }, { "epoch": 3.2, "learning_rate": 1.7274575140626318e-06, "logits/chosen": -2.023911952972412, "logits/rejected": -2.023911952972412, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3852, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 800 }, { "epoch": 3.2, "eval_logits/chosen": -2.001863479614258, "eval_logits/rejected": -1.8408547639846802, "eval_logps/chosen": -289.7388000488281, "eval_logps/rejected": -275.51025390625, "eval_loss": 0.09182017296552658, "eval_rewards/accuracies": 0.3684999942779541, "eval_rewards/chosen": -0.2304122895002365, "eval_rewards/margins": -0.02470785565674305, "eval_rewards/rejected": -0.2057044357061386, "eval_runtime": 704.5723, "eval_samples_per_second": 2.839, "eval_steps_per_second": 1.419, "step": 800 }, { "epoch": 3.24, "learning_rate": 1.661371075624363e-06, "logits/chosen": -1.9029079675674438, "logits/rejected": -1.9029079675674438, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.2964, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 810 }, { "epoch": 3.28, "learning_rate": 1.5959385747947697e-06, "logits/chosen": -1.9884204864501953, "logits/rejected": -1.9884204864501953, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3433, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 820 }, { "epoch": 3.32, "learning_rate": 1.5312110338697427e-06, "logits/chosen": -1.9079113006591797, "logits/rejected": -1.9079113006591797, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3136, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 830 }, { "epoch": 3.36, "learning_rate": 1.467238925438646e-06, "logits/chosen": -1.6772514581680298, "logits/rejected": -1.6457149982452393, "logps/chosen": -14.745180130004883, "logps/rejected": -14.53711223602295, "loss": 0.4833, "rewards/accuracies": 0.02500000037252903, "rewards/chosen": -0.04980592057108879, "rewards/margins": 0.06961346417665482, "rewards/rejected": -0.11941938102245331, "step": 840 }, { "epoch": 3.4, "learning_rate": 1.4040721330273063e-06, "logits/chosen": -1.7772667407989502, "logits/rejected": -1.7781718969345093, "logps/chosen": -8.622848510742188, "logps/rejected": -12.08240032196045, "loss": 0.3293, "rewards/accuracies": 0.05000000074505806, "rewards/chosen": -0.01604883186519146, "rewards/margins": 0.024528637528419495, "rewards/rejected": -0.040577471256256104, "step": 850 }, { "epoch": 3.44, "learning_rate": 1.3417599122003464e-06, "logits/chosen": -1.9655358791351318, "logits/rejected": -1.9655358791351318, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3828, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 860 }, { "epoch": 3.48, "learning_rate": 1.280350852153168e-06, "logits/chosen": -2.0000901222229004, "logits/rejected": -2.0000901222229004, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3211, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 870 }, { "epoch": 3.52, "learning_rate": 1.2198928378235717e-06, "logits/chosen": -1.8563473224639893, "logits/rejected": -1.8563473224639893, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3408, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 880 }, { "epoch": 3.56, "learning_rate": 1.160433012552508e-06, "logits/chosen": -1.8875033855438232, "logits/rejected": -1.8875033855438232, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.2791, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 890 }, { "epoch": 3.6, "learning_rate": 1.1020177413231334e-06, "logits/chosen": -1.8840911388397217, "logits/rejected": -1.8840911388397217, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3359, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 900 }, { "epoch": 3.6, "eval_logits/chosen": -1.983768343925476, "eval_logits/rejected": -1.823969841003418, "eval_logps/chosen": -287.3323059082031, "eval_logps/rejected": -271.89581298828125, "eval_loss": 0.09828919917345047, "eval_rewards/accuracies": 0.34299999475479126, "eval_rewards/chosen": -0.20634719729423523, "eval_rewards/margins": -0.036787137389183044, "eval_rewards/rejected": -0.16956007480621338, "eval_runtime": 702.0734, "eval_samples_per_second": 2.849, "eval_steps_per_second": 1.424, "step": 900 }, { "epoch": 3.64, "learning_rate": 1.0446925746067768e-06, "logits/chosen": -1.8285129070281982, "logits/rejected": -1.8285129070281982, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3383, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 910 }, { "epoch": 3.68, "learning_rate": 9.88502212844063e-07, "logits/chosen": -1.877623200416565, "logits/rejected": -1.877623200416565, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3433, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 920 }, { "epoch": 3.72, "learning_rate": 9.334904715888496e-07, "logits/chosen": -1.7445094585418701, "logits/rejected": -1.7445094585418701, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3606, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 930 }, { "epoch": 3.76, "learning_rate": 8.797002473421729e-07, "logits/chosen": -1.8405430316925049, "logits/rejected": -1.8405430316925049, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.2939, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 940 }, { "epoch": 3.8, "learning_rate": 8.271734841028553e-07, "logits/chosen": -1.934522271156311, "logits/rejected": -1.934522271156311, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3334, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 950 }, { "epoch": 3.84, "learning_rate": 7.759511406608255e-07, "logits/chosen": -2.002326011657715, "logits/rejected": -2.002326011657715, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3556, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 960 }, { "epoch": 3.88, "learning_rate": 7.260731586586983e-07, "logits/chosen": -1.997032880783081, "logits/rejected": -1.997032880783081, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3408, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 970 }, { "epoch": 3.92, "learning_rate": 6.775784314464717e-07, "logits/chosen": -1.9426374435424805, "logits/rejected": -1.9426374435424805, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.4149, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 980 }, { "epoch": 3.96, "learning_rate": 6.305047737536707e-07, "logits/chosen": -1.7942962646484375, "logits/rejected": -1.7942962646484375, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3927, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 990 }, { "epoch": 4.0, "learning_rate": 5.848888922025553e-07, "logits/chosen": -1.9319026470184326, "logits/rejected": -1.9107824563980103, "logps/chosen": -4.969229698181152, "logps/rejected": -3.057560682296753, "loss": 0.3701, "rewards/accuracies": 0.02500000037252903, "rewards/chosen": -0.0007322501624003053, "rewards/margins": 0.013758744113147259, "rewards/rejected": -0.01449099462479353, "step": 1000 }, { "epoch": 4.0, "eval_logits/chosen": -1.9837957620620728, "eval_logits/rejected": -1.8240842819213867, "eval_logps/chosen": -287.31591796875, "eval_logps/rejected": -271.8734130859375, "eval_loss": 0.09820234775543213, "eval_rewards/accuracies": 0.34549999237060547, "eval_rewards/chosen": -0.20618313550949097, "eval_rewards/margins": -0.03684700280427933, "eval_rewards/rejected": -0.16933614015579224, "eval_runtime": 705.2974, "eval_samples_per_second": 2.836, "eval_steps_per_second": 1.418, "step": 1000 }, { "epoch": 4.04, "learning_rate": 5.407663566854008e-07, "logits/chosen": -1.8698651790618896, "logits/rejected": -1.8484447002410889, "logps/chosen": -16.939109802246094, "logps/rejected": -9.7978515625, "loss": 0.311, "rewards/accuracies": 0.02500000037252903, "rewards/chosen": -0.07174522429704666, "rewards/margins": 0.0002815544721670449, "rewards/rejected": -0.07202677428722382, "step": 1010 }, { "epoch": 4.08, "learning_rate": 4.981715726281666e-07, "logits/chosen": -1.9195115566253662, "logits/rejected": -1.9195115566253662, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.2618, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1020 }, { "epoch": 4.12, "learning_rate": 4.5713775416217884e-07, "logits/chosen": -1.9020694494247437, "logits/rejected": -1.9020694494247437, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3655, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1030 }, { "epoch": 4.16, "learning_rate": 4.1769689822475147e-07, "logits/chosen": -2.1050262451171875, "logits/rejected": -2.1050262451171875, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3359, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1040 }, { "epoch": 4.2, "learning_rate": 3.798797596089351e-07, "logits/chosen": -2.110747814178467, "logits/rejected": -2.110747814178467, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3507, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1050 }, { "epoch": 4.24, "learning_rate": 3.4371582698185636e-07, "logits/chosen": -1.8137989044189453, "logits/rejected": -1.8137989044189453, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3235, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1060 }, { "epoch": 4.28, "learning_rate": 3.092332998903416e-07, "logits/chosen": -2.048144817352295, "logits/rejected": -2.048144817352295, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3606, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1070 }, { "epoch": 4.32, "learning_rate": 2.764590667717562e-07, "logits/chosen": -2.115201473236084, "logits/rejected": -2.115201473236084, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3902, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1080 }, { "epoch": 4.36, "learning_rate": 2.454186839872158e-07, "logits/chosen": -1.7584993839263916, "logits/rejected": -1.7584993839263916, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3803, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1090 }, { "epoch": 4.4, "learning_rate": 2.1613635589349756e-07, "logits/chosen": -2.012417793273926, "logits/rejected": -2.012417793273926, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.4025, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1100 }, { "epoch": 4.4, "eval_logits/chosen": -1.9857844114303589, "eval_logits/rejected": -1.8259761333465576, "eval_logps/chosen": -287.1649475097656, "eval_logps/rejected": -271.81268310546875, "eval_loss": 0.09747015684843063, "eval_rewards/accuracies": 0.34549999237060547, "eval_rewards/chosen": -0.20467324554920197, "eval_rewards/margins": -0.03594454750418663, "eval_rewards/rejected": -0.16872867941856384, "eval_runtime": 705.0834, "eval_samples_per_second": 2.837, "eval_steps_per_second": 1.418, "step": 1100 }, { "epoch": 4.44, "learning_rate": 1.8863491596921745e-07, "logits/chosen": -1.8358827829360962, "logits/rejected": -1.8358827829360962, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3408, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1110 }, { "epoch": 4.48, "learning_rate": 1.629358090099639e-07, "logits/chosen": -1.8439744710922241, "logits/rejected": -1.8439744710922241, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.2692, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1120 }, { "epoch": 4.52, "learning_rate": 1.3905907440629752e-07, "logits/chosen": -1.9981298446655273, "logits/rejected": -1.9981298446655273, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3112, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1130 }, { "epoch": 4.56, "learning_rate": 1.1702333051763271e-07, "logits/chosen": -2.0399842262268066, "logits/rejected": -2.0399842262268066, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3581, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1140 }, { "epoch": 4.6, "learning_rate": 9.684576015420277e-08, "logits/chosen": -1.9201631546020508, "logits/rejected": -1.9201631546020508, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3852, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1150 }, { "epoch": 4.64, "learning_rate": 7.854209717842231e-08, "logits/chosen": -1.6203396320343018, "logits/rejected": -1.6199716329574585, "logps/chosen": -2.1602871417999268, "logps/rejected": -3.176687240600586, "loss": 0.3391, "rewards/accuracies": 0.02500000037252903, "rewards/chosen": -0.007410462014377117, "rewards/margins": 0.0085915457457304, "rewards/rejected": -0.016002008691430092, "step": 1160 }, { "epoch": 4.68, "learning_rate": 6.212661423609184e-08, "logits/chosen": -1.7701547145843506, "logits/rejected": -1.7701547145843506, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3655, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1170 }, { "epoch": 4.72, "learning_rate": 4.761211162702117e-08, "logits/chosen": -1.9981458187103271, "logits/rejected": -1.976782202720642, "logps/chosen": -4.941376686096191, "logps/rejected": -3.0533976554870605, "loss": 0.3454, "rewards/accuracies": 0.02500000037252903, "rewards/chosen": -0.0004537239146884531, "rewards/margins": 0.013995639979839325, "rewards/rejected": -0.014449363574385643, "step": 1180 }, { "epoch": 4.76, "learning_rate": 3.5009907323737826e-08, "logits/chosen": -1.8368213176727295, "logits/rejected": -1.8368213176727295, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3334, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1190 }, { "epoch": 4.8, "learning_rate": 2.4329828146074096e-08, "logits/chosen": -1.806884765625, "logits/rejected": -1.806884765625, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3754, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1200 }, { "epoch": 4.8, "eval_logits/chosen": -1.9853414297103882, "eval_logits/rejected": -1.825589895248413, "eval_logps/chosen": -287.1330871582031, "eval_logps/rejected": -271.7890319824219, "eval_loss": 0.09735800325870514, "eval_rewards/accuracies": 0.3440000116825104, "eval_rewards/chosen": -0.20435477793216705, "eval_rewards/margins": -0.03586255759000778, "eval_rewards/rejected": -0.16849222779273987, "eval_runtime": 702.0059, "eval_samples_per_second": 2.849, "eval_steps_per_second": 1.424, "step": 1200 }, { "epoch": 4.84, "learning_rate": 1.5580202098509078e-08, "logits/chosen": -1.985327124595642, "logits/rejected": -1.9865849018096924, "logps/chosen": -6.517402648925781, "logps/rejected": -8.804891586303711, "loss": 0.3148, "rewards/accuracies": 0.02500000037252903, "rewards/chosen": -0.009186786599457264, "rewards/margins": 0.014380457811057568, "rewards/rejected": -0.02356724441051483, "step": 1210 }, { "epoch": 4.88, "learning_rate": 8.767851876239075e-09, "logits/chosen": -1.8783695697784424, "logits/rejected": -1.8783695697784424, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3112, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1220 }, { "epoch": 4.92, "learning_rate": 3.8980895450474455e-09, "logits/chosen": -1.9094947576522827, "logits/rejected": -1.9094947576522827, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.284, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1230 }, { "epoch": 4.96, "learning_rate": 9.747123991141193e-10, "logits/chosen": -1.9814590215682983, "logits/rejected": -1.9814590215682983, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3606, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1240 }, { "epoch": 5.0, "learning_rate": 0.0, "logits/chosen": -1.8839404582977295, "logits/rejected": -1.8839404582977295, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.3383, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1250 }, { "epoch": 5.0, "step": 1250, "total_flos": 0.0, "train_loss": 0.3515237546205521, "train_runtime": 12848.6235, "train_samples_per_second": 0.389, "train_steps_per_second": 0.097 } ], "logging_steps": 10, "max_steps": 1250, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }