{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 1250, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0008, "grad_norm": 1.3400768041610718, "learning_rate": 4e-08, "logits/chosen": -2.951728105545044, "logits/rejected": -3.0115513801574707, "logps/chosen": -261.50799560546875, "logps/rejected": -337.26708984375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.004, "grad_norm": 1.3155320882797241, "learning_rate": 2.0000000000000002e-07, "logits/chosen": -2.8931193351745605, "logits/rejected": -2.8665506839752197, "logps/chosen": -327.18511962890625, "logps/rejected": -271.54595947265625, "loss": 0.6934, "rewards/accuracies": 0.359375, "rewards/chosen": -0.0003679850487969816, "rewards/margins": -0.0005117338732816279, "rewards/rejected": 0.000143748868140392, "step": 5 }, { "epoch": 0.008, "grad_norm": 1.4168583154678345, "learning_rate": 4.0000000000000003e-07, "logits/chosen": -2.8454272747039795, "logits/rejected": -2.8244102001190186, "logps/chosen": -278.81390380859375, "logps/rejected": -225.78091430664062, "loss": 0.6932, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.0003080188180319965, "rewards/margins": -0.00016189362213481218, "rewards/rejected": -0.00014612523955293, "step": 10 }, { "epoch": 0.012, "grad_norm": 1.4461805820465088, "learning_rate": 6.000000000000001e-07, "logits/chosen": -2.941542387008667, "logits/rejected": -2.919604539871216, "logps/chosen": -338.14361572265625, "logps/rejected": -264.4473876953125, "loss": 0.6931, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -9.753620543051511e-05, "rewards/margins": 0.00017976768140215427, "rewards/rejected": -0.0002773039450403303, "step": 15 }, { "epoch": 0.016, "grad_norm": 1.218361735343933, "learning_rate": 8.000000000000001e-07, "logits/chosen": -2.844390392303467, "logits/rejected": -2.8012917041778564, "logps/chosen": -284.53179931640625, "logps/rejected": -265.3224792480469, "loss": 0.693, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.00011823275417555124, "rewards/margins": 0.00027608583332039416, "rewards/rejected": -0.00015785309369675815, "step": 20 }, { "epoch": 0.02, "grad_norm": 1.0622657537460327, "learning_rate": 1.0000000000000002e-06, "logits/chosen": -2.919724941253662, "logits/rejected": -2.8841071128845215, "logps/chosen": -282.7057800292969, "logps/rejected": -250.56005859375, "loss": 0.693, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 3.2701333111617714e-05, "rewards/margins": 0.00029431647271849215, "rewards/rejected": -0.0002616152632981539, "step": 25 }, { "epoch": 0.024, "grad_norm": 1.2840291261672974, "learning_rate": 1.2000000000000002e-06, "logits/chosen": -2.8690571784973145, "logits/rejected": -2.8205409049987793, "logps/chosen": -248.4199981689453, "logps/rejected": -239.7508544921875, "loss": 0.6933, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.0002605341433081776, "rewards/margins": -0.0002968462067656219, "rewards/rejected": 0.0005573804955929518, "step": 30 }, { "epoch": 0.028, "grad_norm": 1.4659631252288818, "learning_rate": 1.4000000000000001e-06, "logits/chosen": -2.819516181945801, "logits/rejected": -2.8284599781036377, "logps/chosen": -260.5746765136719, "logps/rejected": -252.26657104492188, "loss": 0.6933, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.00032235420076176524, "rewards/margins": -0.00027956519625149667, "rewards/rejected": -4.278900451026857e-05, "step": 35 }, { "epoch": 0.032, "grad_norm": 1.641062617301941, "learning_rate": 1.6000000000000001e-06, "logits/chosen": -2.8422532081604004, "logits/rejected": -2.8213276863098145, "logps/chosen": -225.60000610351562, "logps/rejected": -254.83389282226562, "loss": 0.6925, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.0003634319291450083, "rewards/margins": 0.001209324225783348, "rewards/rejected": -0.0008458923548460007, "step": 40 }, { "epoch": 0.036, "grad_norm": 1.2335015535354614, "learning_rate": 1.8000000000000001e-06, "logits/chosen": -2.8927934169769287, "logits/rejected": -2.895987033843994, "logps/chosen": -262.75616455078125, "logps/rejected": -258.01776123046875, "loss": 0.6926, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -8.163524034898728e-05, "rewards/margins": 0.0012018559500575066, "rewards/rejected": -0.0012834911467507482, "step": 45 }, { "epoch": 0.04, "grad_norm": 1.3828603029251099, "learning_rate": 2.0000000000000003e-06, "logits/chosen": -2.8105452060699463, "logits/rejected": -2.766021966934204, "logps/chosen": -246.88064575195312, "logps/rejected": -221.18325805664062, "loss": 0.6929, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.0005901859840378165, "rewards/margins": 0.0004752330423798412, "rewards/rejected": 0.00011495289800222963, "step": 50 }, { "epoch": 0.044, "grad_norm": 1.1620622873306274, "learning_rate": 2.2e-06, "logits/chosen": -2.8504276275634766, "logits/rejected": -2.830573558807373, "logps/chosen": -289.9104919433594, "logps/rejected": -304.9803771972656, "loss": 0.6934, "rewards/accuracies": 0.4375, "rewards/chosen": -0.0006987753440625966, "rewards/margins": -0.0004663577419705689, "rewards/rejected": -0.00023241760209202766, "step": 55 }, { "epoch": 0.048, "grad_norm": 1.2738378047943115, "learning_rate": 2.4000000000000003e-06, "logits/chosen": -2.893800735473633, "logits/rejected": -2.874782085418701, "logps/chosen": -265.0617370605469, "logps/rejected": -274.21246337890625, "loss": 0.6922, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.0009635955793783069, "rewards/margins": 0.0018142672488465905, "rewards/rejected": -0.0008506716112606227, "step": 60 }, { "epoch": 0.052, "grad_norm": 1.0527548789978027, "learning_rate": 2.6e-06, "logits/chosen": -2.8645272254943848, "logits/rejected": -2.840221881866455, "logps/chosen": -242.2287139892578, "logps/rejected": -258.8787536621094, "loss": 0.6922, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.0008619900909252465, "rewards/margins": 0.0019651155453175306, "rewards/rejected": -0.0011031257454305887, "step": 65 }, { "epoch": 0.056, "grad_norm": 1.2068345546722412, "learning_rate": 2.8000000000000003e-06, "logits/chosen": -2.8592796325683594, "logits/rejected": -2.856304168701172, "logps/chosen": -256.22979736328125, "logps/rejected": -239.5323944091797, "loss": 0.6919, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0025476592127233744, "rewards/margins": 0.002555294893682003, "rewards/rejected": -7.636076588823926e-06, "step": 70 }, { "epoch": 0.06, "grad_norm": 1.4119035005569458, "learning_rate": 3e-06, "logits/chosen": -2.922961473464966, "logits/rejected": -2.861196517944336, "logps/chosen": -286.9571838378906, "logps/rejected": -258.0143737792969, "loss": 0.6918, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.0024884731974452734, "rewards/margins": 0.0027703498490154743, "rewards/rejected": -0.00028187656425870955, "step": 75 }, { "epoch": 0.064, "grad_norm": 1.4012054204940796, "learning_rate": 3.2000000000000003e-06, "logits/chosen": -2.8784067630767822, "logits/rejected": -2.8731160163879395, "logps/chosen": -257.916259765625, "logps/rejected": -248.72305297851562, "loss": 0.6912, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.0049202474765479565, "rewards/margins": 0.0038819201290607452, "rewards/rejected": 0.0010383275803178549, "step": 80 }, { "epoch": 0.068, "grad_norm": 1.4234269857406616, "learning_rate": 3.4000000000000005e-06, "logits/chosen": -2.9202146530151367, "logits/rejected": -2.8719019889831543, "logps/chosen": -307.8442077636719, "logps/rejected": -268.5364990234375, "loss": 0.6917, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.005258677992969751, "rewards/margins": 0.0030069469939917326, "rewards/rejected": 0.0022517309989780188, "step": 85 }, { "epoch": 0.072, "grad_norm": 1.1752641201019287, "learning_rate": 3.6000000000000003e-06, "logits/chosen": -2.8795719146728516, "logits/rejected": -2.8445372581481934, "logps/chosen": -238.05691528320312, "logps/rejected": -238.66940307617188, "loss": 0.6924, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.00510067492723465, "rewards/margins": 0.0015365609433501959, "rewards/rejected": 0.003564114449545741, "step": 90 }, { "epoch": 0.076, "grad_norm": 1.273596167564392, "learning_rate": 3.8000000000000005e-06, "logits/chosen": -2.8963799476623535, "logits/rejected": -2.899864673614502, "logps/chosen": -268.2062683105469, "logps/rejected": -242.0111083984375, "loss": 0.6902, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.007857006974518299, "rewards/margins": 0.005892972461879253, "rewards/rejected": 0.0019640345126390457, "step": 95 }, { "epoch": 0.08, "grad_norm": 1.2360827922821045, "learning_rate": 4.000000000000001e-06, "logits/chosen": -2.871992588043213, "logits/rejected": -2.8619043827056885, "logps/chosen": -292.5274353027344, "logps/rejected": -255.6526641845703, "loss": 0.6899, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.009717302396893501, "rewards/margins": 0.006645149551331997, "rewards/rejected": 0.0030721533112227917, "step": 100 }, { "epoch": 0.08, "eval_logits/chosen": -2.889031171798706, "eval_logits/rejected": -2.8468213081359863, "eval_logps/chosen": -282.2605285644531, "eval_logps/rejected": -247.75430297851562, "eval_loss": 0.6897016167640686, "eval_rewards/accuracies": 0.6666666865348816, "eval_rewards/chosen": 0.009775782003998756, "eval_rewards/margins": 0.007023118901997805, "eval_rewards/rejected": 0.0027526640333235264, "eval_runtime": 166.8346, "eval_samples_per_second": 2.997, "eval_steps_per_second": 0.378, "step": 100 }, { "epoch": 0.084, "grad_norm": 1.3187963962554932, "learning_rate": 4.2000000000000004e-06, "logits/chosen": -2.8663318157196045, "logits/rejected": -2.8205113410949707, "logps/chosen": -272.2581481933594, "logps/rejected": -261.42620849609375, "loss": 0.6907, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.00842782761901617, "rewards/margins": 0.005024894140660763, "rewards/rejected": 0.0034029334783554077, "step": 105 }, { "epoch": 0.088, "grad_norm": 1.2544078826904297, "learning_rate": 4.4e-06, "logits/chosen": -2.9372715950012207, "logits/rejected": -2.9057135581970215, "logps/chosen": -251.8219757080078, "logps/rejected": -246.0946044921875, "loss": 0.6914, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.009438835084438324, "rewards/margins": 0.0036361501552164555, "rewards/rejected": 0.0058026849292218685, "step": 110 }, { "epoch": 0.092, "grad_norm": 3.0026137828826904, "learning_rate": 4.600000000000001e-06, "logits/chosen": -2.827087879180908, "logits/rejected": -2.816584348678589, "logps/chosen": -225.01516723632812, "logps/rejected": -294.75274658203125, "loss": 0.6868, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.012459425255656242, "rewards/margins": 0.012965649366378784, "rewards/rejected": -0.0005062236450612545, "step": 115 }, { "epoch": 0.096, "grad_norm": 3.147055149078369, "learning_rate": 4.800000000000001e-06, "logits/chosen": -2.7388131618499756, "logits/rejected": -2.748465061187744, "logps/chosen": -275.8075866699219, "logps/rejected": -249.1244659423828, "loss": 0.6885, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.011224482208490372, "rewards/margins": 0.009682848118245602, "rewards/rejected": 0.0015416343230754137, "step": 120 }, { "epoch": 0.1, "grad_norm": 1.534111738204956, "learning_rate": 5e-06, "logits/chosen": -2.941195487976074, "logits/rejected": -2.924978494644165, "logps/chosen": -310.167236328125, "logps/rejected": -280.0481262207031, "loss": 0.6872, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.011911705136299133, "rewards/margins": 0.012096909806132317, "rewards/rejected": -0.00018520592129789293, "step": 125 }, { "epoch": 0.104, "grad_norm": 1.4911248683929443, "learning_rate": 4.999756310023261e-06, "logits/chosen": -2.8891565799713135, "logits/rejected": -2.896601915359497, "logps/chosen": -286.2426452636719, "logps/rejected": -309.3197021484375, "loss": 0.6869, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.009015440009534359, "rewards/margins": 0.012755987234413624, "rewards/rejected": -0.003740546526387334, "step": 130 }, { "epoch": 0.108, "grad_norm": 1.4180755615234375, "learning_rate": 4.999025287600886e-06, "logits/chosen": -2.8916049003601074, "logits/rejected": -2.9071428775787354, "logps/chosen": -274.48236083984375, "logps/rejected": -265.49786376953125, "loss": 0.6811, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.011777431704103947, "rewards/margins": 0.024640800431370735, "rewards/rejected": -0.012863369658589363, "step": 135 }, { "epoch": 0.112, "grad_norm": 1.3050034046173096, "learning_rate": 4.997807075247147e-06, "logits/chosen": -2.8806416988372803, "logits/rejected": -2.8594279289245605, "logps/chosen": -247.0726776123047, "logps/rejected": -236.9187774658203, "loss": 0.6891, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.005051865242421627, "rewards/margins": 0.008825790137052536, "rewards/rejected": -0.013877655379474163, "step": 140 }, { "epoch": 0.116, "grad_norm": 1.3001933097839355, "learning_rate": 4.996101910454953e-06, "logits/chosen": -2.903634548187256, "logits/rejected": -2.859711170196533, "logps/chosen": -273.8101806640625, "logps/rejected": -244.11074829101562, "loss": 0.6801, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.0007423794595524669, "rewards/margins": 0.026909640058875084, "rewards/rejected": -0.026167264208197594, "step": 145 }, { "epoch": 0.12, "grad_norm": 1.671297550201416, "learning_rate": 4.993910125649561e-06, "logits/chosen": -2.891292095184326, "logits/rejected": -2.856261968612671, "logps/chosen": -293.83563232421875, "logps/rejected": -247.8043975830078, "loss": 0.6803, "rewards/accuracies": 0.6875, "rewards/chosen": 0.01020820252597332, "rewards/margins": 0.026513541117310524, "rewards/rejected": -0.016305336728692055, "step": 150 }, { "epoch": 0.124, "grad_norm": 1.5288795232772827, "learning_rate": 4.9912321481237616e-06, "logits/chosen": -2.778376340866089, "logits/rejected": -2.774121046066284, "logps/chosen": -231.49319458007812, "logps/rejected": -290.89337158203125, "loss": 0.6837, "rewards/accuracies": 0.625, "rewards/chosen": 0.00030907365726307034, "rewards/margins": 0.019828204065561295, "rewards/rejected": -0.019519129768013954, "step": 155 }, { "epoch": 0.128, "grad_norm": 1.5330896377563477, "learning_rate": 4.988068499954578e-06, "logits/chosen": -2.889814853668213, "logits/rejected": -2.888610601425171, "logps/chosen": -316.81927490234375, "logps/rejected": -312.25006103515625, "loss": 0.6715, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.031255967915058136, "rewards/margins": 0.04561912640929222, "rewards/rejected": -0.01436315942555666, "step": 160 }, { "epoch": 0.132, "grad_norm": 1.637596607208252, "learning_rate": 4.984419797901491e-06, "logits/chosen": -2.922788143157959, "logits/rejected": -2.911243438720703, "logps/chosen": -311.63836669921875, "logps/rejected": -282.0634765625, "loss": 0.6705, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.031035322695970535, "rewards/margins": 0.04701067879796028, "rewards/rejected": -0.015975359827280045, "step": 165 }, { "epoch": 0.136, "grad_norm": 1.531761884689331, "learning_rate": 4.980286753286196e-06, "logits/chosen": -2.9153621196746826, "logits/rejected": -2.9075608253479004, "logps/chosen": -275.50396728515625, "logps/rejected": -273.3793029785156, "loss": 0.6759, "rewards/accuracies": 0.6875, "rewards/chosen": 0.0289138313382864, "rewards/margins": 0.0373816192150116, "rewards/rejected": -0.008467786945402622, "step": 170 }, { "epoch": 0.14, "grad_norm": 1.562333106994629, "learning_rate": 4.975670171853926e-06, "logits/chosen": -2.881091833114624, "logits/rejected": -2.8206849098205566, "logps/chosen": -268.7303161621094, "logps/rejected": -241.11801147460938, "loss": 0.6727, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.01539912074804306, "rewards/margins": 0.04456932842731476, "rewards/rejected": -0.02917020581662655, "step": 175 }, { "epoch": 0.144, "grad_norm": 1.5452988147735596, "learning_rate": 4.970570953616383e-06, "logits/chosen": -2.870706558227539, "logits/rejected": -2.846757173538208, "logps/chosen": -271.70098876953125, "logps/rejected": -250.15017700195312, "loss": 0.6579, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.027496661990880966, "rewards/margins": 0.07495806366205215, "rewards/rejected": -0.04746139422059059, "step": 180 }, { "epoch": 0.148, "grad_norm": 1.711881160736084, "learning_rate": 4.964990092676263e-06, "logits/chosen": -2.8256664276123047, "logits/rejected": -2.8229262828826904, "logps/chosen": -272.4619140625, "logps/rejected": -226.0482177734375, "loss": 0.6773, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.000382797239581123, "rewards/margins": 0.03519537299871445, "rewards/rejected": -0.0355781726539135, "step": 185 }, { "epoch": 0.152, "grad_norm": 1.8593279123306274, "learning_rate": 4.958928677033465e-06, "logits/chosen": -2.8317179679870605, "logits/rejected": -2.820038318634033, "logps/chosen": -276.53924560546875, "logps/rejected": -289.26007080078125, "loss": 0.6639, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.008795881643891335, "rewards/margins": 0.06358983367681503, "rewards/rejected": -0.05479395389556885, "step": 190 }, { "epoch": 0.156, "grad_norm": 1.802320957183838, "learning_rate": 4.9523878883729794e-06, "logits/chosen": -2.876426935195923, "logits/rejected": -2.851534128189087, "logps/chosen": -288.3893737792969, "logps/rejected": -255.09683227539062, "loss": 0.6564, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.004248014185577631, "rewards/margins": 0.07926348596811295, "rewards/rejected": -0.07501547038555145, "step": 195 }, { "epoch": 0.16, "grad_norm": 1.8666610717773438, "learning_rate": 4.9453690018345144e-06, "logits/chosen": -2.852238416671753, "logits/rejected": -2.8293018341064453, "logps/chosen": -255.58560180664062, "logps/rejected": -257.3184814453125, "loss": 0.6532, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.0025687548331916332, "rewards/margins": 0.08859384059906006, "rewards/rejected": -0.09116257727146149, "step": 200 }, { "epoch": 0.16, "eval_logits/chosen": -2.878232717514038, "eval_logits/rejected": -2.8385584354400635, "eval_logps/chosen": -284.5143127441406, "eval_logps/rejected": -257.5306091308594, "eval_loss": 0.6568659543991089, "eval_rewards/accuracies": 0.6884920597076416, "eval_rewards/chosen": -0.012762677855789661, "eval_rewards/margins": 0.08224756270647049, "eval_rewards/rejected": -0.09501024335622787, "eval_runtime": 166.7797, "eval_samples_per_second": 2.998, "eval_steps_per_second": 0.378, "step": 200 }, { "epoch": 0.164, "grad_norm": 1.9332078695297241, "learning_rate": 4.937873385763909e-06, "logits/chosen": -2.8655571937561035, "logits/rejected": -2.8335084915161133, "logps/chosen": -287.10076904296875, "logps/rejected": -284.3404846191406, "loss": 0.6582, "rewards/accuracies": 0.6875, "rewards/chosen": -0.033930666744709015, "rewards/margins": 0.07983305305242538, "rewards/rejected": -0.1137637123465538, "step": 205 }, { "epoch": 0.168, "grad_norm": 1.877467393875122, "learning_rate": 4.9299025014463665e-06, "logits/chosen": -2.879312038421631, "logits/rejected": -2.862196445465088, "logps/chosen": -248.899169921875, "logps/rejected": -245.27310180664062, "loss": 0.6704, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.04507671296596527, "rewards/margins": 0.054625023156404495, "rewards/rejected": -0.09970173239707947, "step": 210 }, { "epoch": 0.172, "grad_norm": 1.8854491710662842, "learning_rate": 4.921457902821578e-06, "logits/chosen": -2.8618056774139404, "logits/rejected": -2.8050172328948975, "logps/chosen": -316.2086181640625, "logps/rejected": -286.01239013671875, "loss": 0.6661, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.017241844907402992, "rewards/margins": 0.06764046102762222, "rewards/rejected": -0.08488230407238007, "step": 215 }, { "epoch": 0.176, "grad_norm": 2.2946383953094482, "learning_rate": 4.912541236180779e-06, "logits/chosen": -2.7987911701202393, "logits/rejected": -2.76237154006958, "logps/chosen": -325.50177001953125, "logps/rejected": -316.89739990234375, "loss": 0.642, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.016762247309088707, "rewards/margins": 0.11744923889636993, "rewards/rejected": -0.1342114955186844, "step": 220 }, { "epoch": 0.18, "grad_norm": 1.7292786836624146, "learning_rate": 4.903154239845798e-06, "logits/chosen": -2.8847053050994873, "logits/rejected": -2.825892210006714, "logps/chosen": -271.9214172363281, "logps/rejected": -247.08193969726562, "loss": 0.6482, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.06987594068050385, "rewards/margins": 0.10394857078790665, "rewards/rejected": -0.1738245040178299, "step": 225 }, { "epoch": 0.184, "grad_norm": 2.149097204208374, "learning_rate": 4.893298743830168e-06, "logits/chosen": -2.792332172393799, "logits/rejected": -2.80527925491333, "logps/chosen": -302.56390380859375, "logps/rejected": -302.79840087890625, "loss": 0.6306, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.04528792202472687, "rewards/margins": 0.1434146910905838, "rewards/rejected": -0.18870261311531067, "step": 230 }, { "epoch": 0.188, "grad_norm": 2.5504279136657715, "learning_rate": 4.882976669482368e-06, "logits/chosen": -2.8090176582336426, "logits/rejected": -2.7789652347564697, "logps/chosen": -274.94342041015625, "logps/rejected": -279.92120361328125, "loss": 0.6433, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0906161516904831, "rewards/margins": 0.124458909034729, "rewards/rejected": -0.2150750607252121, "step": 235 }, { "epoch": 0.192, "grad_norm": 2.925840377807617, "learning_rate": 4.8721900291112415e-06, "logits/chosen": -2.8581314086914062, "logits/rejected": -2.837096691131592, "logps/chosen": -290.9739685058594, "logps/rejected": -275.4525451660156, "loss": 0.6432, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.015353793278336525, "rewards/margins": 0.11724452674388885, "rewards/rejected": -0.13259831070899963, "step": 240 }, { "epoch": 0.196, "grad_norm": 2.329665184020996, "learning_rate": 4.860940925593703e-06, "logits/chosen": -2.878603458404541, "logits/rejected": -2.8466429710388184, "logps/chosen": -288.50494384765625, "logps/rejected": -274.239990234375, "loss": 0.6374, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.012451673857867718, "rewards/margins": 0.13850674033164978, "rewards/rejected": -0.1260550618171692, "step": 245 }, { "epoch": 0.2, "grad_norm": 2.1521079540252686, "learning_rate": 4.849231551964771e-06, "logits/chosen": -2.8556203842163086, "logits/rejected": -2.82784104347229, "logps/chosen": -254.6340789794922, "logps/rejected": -242.66726684570312, "loss": 0.6548, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.016969427466392517, "rewards/margins": 0.09558813273906708, "rewards/rejected": -0.112557552754879, "step": 250 }, { "epoch": 0.204, "grad_norm": 3.7580788135528564, "learning_rate": 4.837064190990036e-06, "logits/chosen": -2.7907662391662598, "logits/rejected": -2.8043570518493652, "logps/chosen": -287.1578674316406, "logps/rejected": -284.9073181152344, "loss": 0.6471, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.06225704029202461, "rewards/margins": 0.11668694019317627, "rewards/rejected": -0.17894400656223297, "step": 255 }, { "epoch": 0.208, "grad_norm": 3.2057318687438965, "learning_rate": 4.824441214720629e-06, "logits/chosen": -2.8258254528045654, "logits/rejected": -2.838768720626831, "logps/chosen": -331.3340148925781, "logps/rejected": -295.8611145019531, "loss": 0.6624, "rewards/accuracies": 0.625, "rewards/chosen": -0.12479463964700699, "rewards/margins": 0.08991553634405136, "rewards/rejected": -0.21471016108989716, "step": 260 }, { "epoch": 0.212, "grad_norm": 5.250330448150635, "learning_rate": 4.811365084030784e-06, "logits/chosen": -2.788186550140381, "logits/rejected": -2.737650156021118, "logps/chosen": -240.7392578125, "logps/rejected": -258.4285583496094, "loss": 0.6295, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.1400509476661682, "rewards/margins": 0.14637869596481323, "rewards/rejected": -0.28642964363098145, "step": 265 }, { "epoch": 0.216, "grad_norm": 4.085949420928955, "learning_rate": 4.7978383481380865e-06, "logits/chosen": -2.8263564109802246, "logits/rejected": -2.82792592048645, "logps/chosen": -284.7565002441406, "logps/rejected": -326.1434020996094, "loss": 0.6284, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.12767954170703888, "rewards/margins": 0.17385998368263245, "rewards/rejected": -0.30153951048851013, "step": 270 }, { "epoch": 0.22, "grad_norm": 3.1048779487609863, "learning_rate": 4.783863644106502e-06, "logits/chosen": -2.881112575531006, "logits/rejected": -2.87247896194458, "logps/chosen": -279.97052001953125, "logps/rejected": -273.80780029296875, "loss": 0.6366, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.1319071650505066, "rewards/margins": 0.14382150769233704, "rewards/rejected": -0.2757287323474884, "step": 275 }, { "epoch": 0.224, "grad_norm": 3.2117135524749756, "learning_rate": 4.769443696332272e-06, "logits/chosen": -2.8749935626983643, "logits/rejected": -2.844726085662842, "logps/chosen": -292.9136657714844, "logps/rejected": -294.96185302734375, "loss": 0.629, "rewards/accuracies": 0.625, "rewards/chosen": -0.09047095477581024, "rewards/margins": 0.1656198650598526, "rewards/rejected": -0.25609081983566284, "step": 280 }, { "epoch": 0.228, "grad_norm": 3.0179073810577393, "learning_rate": 4.754581316012785e-06, "logits/chosen": -2.8741941452026367, "logits/rejected": -2.799834728240967, "logps/chosen": -323.2701721191406, "logps/rejected": -299.8788146972656, "loss": 0.5984, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.11364835500717163, "rewards/margins": 0.2376987189054489, "rewards/rejected": -0.35134708881378174, "step": 285 }, { "epoch": 0.232, "grad_norm": 3.1179349422454834, "learning_rate": 4.7392794005985324e-06, "logits/chosen": -2.801036834716797, "logits/rejected": -2.793466329574585, "logps/chosen": -293.9784240722656, "logps/rejected": -272.1210021972656, "loss": 0.5962, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.23425976932048798, "rewards/margins": 0.23078274726867676, "rewards/rejected": -0.4650425314903259, "step": 290 }, { "epoch": 0.236, "grad_norm": 3.5481436252593994, "learning_rate": 4.723540933228245e-06, "logits/chosen": -2.8212785720825195, "logits/rejected": -2.7978832721710205, "logps/chosen": -327.6084899902344, "logps/rejected": -320.15106201171875, "loss": 0.6612, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.32292693853378296, "rewards/margins": 0.1046195775270462, "rewards/rejected": -0.42754650115966797, "step": 295 }, { "epoch": 0.24, "grad_norm": 4.389492034912109, "learning_rate": 4.707368982147318e-06, "logits/chosen": -2.8768062591552734, "logits/rejected": -2.8266239166259766, "logps/chosen": -329.6361083984375, "logps/rejected": -282.6575927734375, "loss": 0.6372, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.2786393463611603, "rewards/margins": 0.155739888548851, "rewards/rejected": -0.4343792498111725, "step": 300 }, { "epoch": 0.24, "eval_logits/chosen": -2.84016752243042, "eval_logits/rejected": -2.803346872329712, "eval_logps/chosen": -307.0444030761719, "eval_logps/rejected": -292.0920715332031, "eval_loss": 0.6181342005729675, "eval_rewards/accuracies": 0.682539701461792, "eval_rewards/chosen": -0.2380632609128952, "eval_rewards/margins": 0.20256145298480988, "eval_rewards/rejected": -0.4406247138977051, "eval_runtime": 166.7743, "eval_samples_per_second": 2.998, "eval_steps_per_second": 0.378, "step": 300 }, { "epoch": 0.244, "grad_norm": 6.237858295440674, "learning_rate": 4.690766700109659e-06, "logits/chosen": -2.813170909881592, "logits/rejected": -2.765450954437256, "logps/chosen": -251.8035430908203, "logps/rejected": -226.10787963867188, "loss": 0.6377, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.28489404916763306, "rewards/margins": 0.15168778598308563, "rewards/rejected": -0.4365817904472351, "step": 305 }, { "epoch": 0.248, "grad_norm": 2.941599130630493, "learning_rate": 4.673737323763048e-06, "logits/chosen": -2.8621535301208496, "logits/rejected": -2.883449077606201, "logps/chosen": -323.72625732421875, "logps/rejected": -309.9552307128906, "loss": 0.5975, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.2496490776538849, "rewards/margins": 0.24661684036254883, "rewards/rejected": -0.4962659478187561, "step": 310 }, { "epoch": 0.252, "grad_norm": 2.81584095954895, "learning_rate": 4.656284173018144e-06, "logits/chosen": -2.7917304039001465, "logits/rejected": -2.771953821182251, "logps/chosen": -305.7306213378906, "logps/rejected": -337.7940979003906, "loss": 0.6245, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.24593646824359894, "rewards/margins": 0.17785824835300446, "rewards/rejected": -0.423794686794281, "step": 315 }, { "epoch": 0.256, "grad_norm": 3.657536029815674, "learning_rate": 4.638410650401267e-06, "logits/chosen": -2.863358974456787, "logits/rejected": -2.8709418773651123, "logps/chosen": -308.0937805175781, "logps/rejected": -322.3440246582031, "loss": 0.6189, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.19167309999465942, "rewards/margins": 0.20849671959877014, "rewards/rejected": -0.40016984939575195, "step": 320 }, { "epoch": 0.26, "grad_norm": 3.2525851726531982, "learning_rate": 4.620120240391065e-06, "logits/chosen": -2.8361878395080566, "logits/rejected": -2.8604865074157715, "logps/chosen": -331.04949951171875, "logps/rejected": -306.60662841796875, "loss": 0.612, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.15142570436000824, "rewards/margins": 0.23161661624908447, "rewards/rejected": -0.3830423355102539, "step": 325 }, { "epoch": 0.264, "grad_norm": 3.2161409854888916, "learning_rate": 4.601416508739211e-06, "logits/chosen": -2.765329360961914, "logits/rejected": -2.731293201446533, "logps/chosen": -294.65509033203125, "logps/rejected": -288.2440185546875, "loss": 0.6113, "rewards/accuracies": 0.6875, "rewards/chosen": -0.1889929473400116, "rewards/margins": 0.23026308417320251, "rewards/rejected": -0.4192560315132141, "step": 330 }, { "epoch": 0.268, "grad_norm": 4.34539270401001, "learning_rate": 4.582303101775249e-06, "logits/chosen": -2.773538112640381, "logits/rejected": -2.750394582748413, "logps/chosen": -301.92291259765625, "logps/rejected": -276.76275634765625, "loss": 0.6137, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.25806164741516113, "rewards/margins": 0.237727090716362, "rewards/rejected": -0.49578872323036194, "step": 335 }, { "epoch": 0.272, "grad_norm": 2.9809610843658447, "learning_rate": 4.562783745695738e-06, "logits/chosen": -2.7601351737976074, "logits/rejected": -2.805574893951416, "logps/chosen": -213.38693237304688, "logps/rejected": -248.6228790283203, "loss": 0.6131, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.2839365601539612, "rewards/margins": 0.2204209268093109, "rewards/rejected": -0.5043575167655945, "step": 340 }, { "epoch": 0.276, "grad_norm": 3.7868945598602295, "learning_rate": 4.542862245837821e-06, "logits/chosen": -2.862086296081543, "logits/rejected": -2.80869722366333, "logps/chosen": -326.58392333984375, "logps/rejected": -329.5889587402344, "loss": 0.5811, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.23613891005516052, "rewards/margins": 0.30714207887649536, "rewards/rejected": -0.5432809591293335, "step": 345 }, { "epoch": 0.28, "grad_norm": 4.723974227905273, "learning_rate": 4.522542485937369e-06, "logits/chosen": -2.723212242126465, "logits/rejected": -2.6936533451080322, "logps/chosen": -267.9949951171875, "logps/rejected": -286.03448486328125, "loss": 0.6194, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.3070460557937622, "rewards/margins": 0.2262255847454071, "rewards/rejected": -0.5332716703414917, "step": 350 }, { "epoch": 0.284, "grad_norm": 3.612205982208252, "learning_rate": 4.501828427371834e-06, "logits/chosen": -2.8160369396209717, "logits/rejected": -2.7678263187408447, "logps/chosen": -276.889892578125, "logps/rejected": -262.567138671875, "loss": 0.6269, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.2459622323513031, "rewards/margins": 0.21559634804725647, "rewards/rejected": -0.4615585207939148, "step": 355 }, { "epoch": 0.288, "grad_norm": 4.156825065612793, "learning_rate": 4.4807241083879774e-06, "logits/chosen": -2.8309903144836426, "logits/rejected": -2.848707914352417, "logps/chosen": -298.7277526855469, "logps/rejected": -328.09912109375, "loss": 0.6177, "rewards/accuracies": 0.6875, "rewards/chosen": -0.27550002932548523, "rewards/margins": 0.23323087394237518, "rewards/rejected": -0.508730947971344, "step": 360 }, { "epoch": 0.292, "grad_norm": 4.314282417297363, "learning_rate": 4.4592336433146e-06, "logits/chosen": -2.811722755432129, "logits/rejected": -2.807515859603882, "logps/chosen": -309.4399719238281, "logps/rejected": -314.6178283691406, "loss": 0.6153, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.3844314217567444, "rewards/margins": 0.22192268073558807, "rewards/rejected": -0.6063541173934937, "step": 365 }, { "epoch": 0.296, "grad_norm": 4.635516166687012, "learning_rate": 4.437361221760449e-06, "logits/chosen": -2.850919485092163, "logits/rejected": -2.8320136070251465, "logps/chosen": -316.4634704589844, "logps/rejected": -295.4265441894531, "loss": 0.5943, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.32734400033950806, "rewards/margins": 0.2893194556236267, "rewards/rejected": -0.61666339635849, "step": 370 }, { "epoch": 0.3, "grad_norm": 4.108780384063721, "learning_rate": 4.415111107797445e-06, "logits/chosen": -2.763192892074585, "logits/rejected": -2.6753077507019043, "logps/chosen": -304.7072448730469, "logps/rejected": -296.2711486816406, "loss": 0.6325, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.277075856924057, "rewards/margins": 0.20892643928527832, "rewards/rejected": -0.48600226640701294, "step": 375 }, { "epoch": 0.304, "grad_norm": 2.956279993057251, "learning_rate": 4.3924876391293915e-06, "logits/chosen": -2.7950615882873535, "logits/rejected": -2.7592384815216064, "logps/chosen": -273.1991271972656, "logps/rejected": -275.79229736328125, "loss": 0.6129, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.13601061701774597, "rewards/margins": 0.22439420223236084, "rewards/rejected": -0.3604048192501068, "step": 380 }, { "epoch": 0.308, "grad_norm": 4.006164073944092, "learning_rate": 4.36949522624633e-06, "logits/chosen": -2.830416202545166, "logits/rejected": -2.8047218322753906, "logps/chosen": -323.8509826660156, "logps/rejected": -308.55230712890625, "loss": 0.5875, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.10837619006633759, "rewards/margins": 0.28125640749931335, "rewards/rejected": -0.38963261246681213, "step": 385 }, { "epoch": 0.312, "grad_norm": 6.139017581939697, "learning_rate": 4.346138351564711e-06, "logits/chosen": -2.8317887783050537, "logits/rejected": -2.7582955360412598, "logps/chosen": -362.7658996582031, "logps/rejected": -310.333984375, "loss": 0.6309, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.20606637001037598, "rewards/margins": 0.2002502977848053, "rewards/rejected": -0.4063166677951813, "step": 390 }, { "epoch": 0.316, "grad_norm": 4.839846134185791, "learning_rate": 4.322421568553529e-06, "logits/chosen": -2.848759174346924, "logits/rejected": -2.7962448596954346, "logps/chosen": -382.81048583984375, "logps/rejected": -339.45672607421875, "loss": 0.6138, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.1714310199022293, "rewards/margins": 0.22842545807361603, "rewards/rejected": -0.39985641837120056, "step": 395 }, { "epoch": 0.32, "grad_norm": 3.8282814025878906, "learning_rate": 4.2983495008466285e-06, "logits/chosen": -2.8639044761657715, "logits/rejected": -2.8238472938537598, "logps/chosen": -317.0664367675781, "logps/rejected": -313.5345458984375, "loss": 0.5699, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.1875368058681488, "rewards/margins": 0.35046523809432983, "rewards/rejected": -0.538002073764801, "step": 400 }, { "epoch": 0.32, "eval_logits/chosen": -2.831890344619751, "eval_logits/rejected": -2.795173168182373, "eval_logps/chosen": -309.8138427734375, "eval_logps/rejected": -301.8563232421875, "eval_loss": 0.6034325957298279, "eval_rewards/accuracies": 0.6964285969734192, "eval_rewards/chosen": -0.2657574713230133, "eval_rewards/margins": 0.27250993251800537, "eval_rewards/rejected": -0.5382674336433411, "eval_runtime": 166.7653, "eval_samples_per_second": 2.998, "eval_steps_per_second": 0.378, "step": 400 }, { "epoch": 0.324, "grad_norm": 4.6939778327941895, "learning_rate": 4.273926841341303e-06, "logits/chosen": -2.8153679370880127, "logits/rejected": -2.797407388687134, "logps/chosen": -267.6861877441406, "logps/rejected": -296.2272033691406, "loss": 0.6146, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.2948915362358093, "rewards/margins": 0.27376502752304077, "rewards/rejected": -0.5686565637588501, "step": 405 }, { "epoch": 0.328, "grad_norm": 5.867495059967041, "learning_rate": 4.249158351283414e-06, "logits/chosen": -2.8131103515625, "logits/rejected": -2.7752747535705566, "logps/chosen": -296.8202209472656, "logps/rejected": -309.84991455078125, "loss": 0.6198, "rewards/accuracies": 0.625, "rewards/chosen": -0.3828127086162567, "rewards/margins": 0.27066582441329956, "rewards/rejected": -0.6534786224365234, "step": 410 }, { "epoch": 0.332, "grad_norm": 3.4944844245910645, "learning_rate": 4.224048859339175e-06, "logits/chosen": -2.7919559478759766, "logits/rejected": -2.7731316089630127, "logps/chosen": -320.2292175292969, "logps/rejected": -313.79827880859375, "loss": 0.5827, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.28928855061531067, "rewards/margins": 0.3156905770301819, "rewards/rejected": -0.6049790978431702, "step": 415 }, { "epoch": 0.336, "grad_norm": 6.125190734863281, "learning_rate": 4.198603260653792e-06, "logits/chosen": -2.8130970001220703, "logits/rejected": -2.7901930809020996, "logps/chosen": -317.51165771484375, "logps/rejected": -293.3788146972656, "loss": 0.6275, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.2976795434951782, "rewards/margins": 0.21939225494861603, "rewards/rejected": -0.5170717239379883, "step": 420 }, { "epoch": 0.34, "grad_norm": 4.455983638763428, "learning_rate": 4.172826515897146e-06, "logits/chosen": -2.8195388317108154, "logits/rejected": -2.780494451522827, "logps/chosen": -283.6789245605469, "logps/rejected": -300.58563232421875, "loss": 0.572, "rewards/accuracies": 0.75, "rewards/chosen": -0.23819419741630554, "rewards/margins": 0.36092180013656616, "rewards/rejected": -0.5991159677505493, "step": 425 }, { "epoch": 0.344, "grad_norm": 3.734440326690674, "learning_rate": 4.146723650296701e-06, "logits/chosen": -2.8214731216430664, "logits/rejected": -2.80680775642395, "logps/chosen": -305.1948547363281, "logps/rejected": -301.62579345703125, "loss": 0.603, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.25884318351745605, "rewards/margins": 0.2811453640460968, "rewards/rejected": -0.5399885773658752, "step": 430 }, { "epoch": 0.348, "grad_norm": 3.1842479705810547, "learning_rate": 4.120299752657828e-06, "logits/chosen": -2.799774169921875, "logits/rejected": -2.790123462677002, "logps/chosen": -309.83477783203125, "logps/rejected": -303.22332763671875, "loss": 0.5892, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.20615491271018982, "rewards/margins": 0.3112487494945526, "rewards/rejected": -0.5174037218093872, "step": 435 }, { "epoch": 0.352, "grad_norm": 6.865662574768066, "learning_rate": 4.093559974371725e-06, "logits/chosen": -2.806478977203369, "logits/rejected": -2.8173699378967285, "logps/chosen": -312.33978271484375, "logps/rejected": -340.7373046875, "loss": 0.5898, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.22585222125053406, "rewards/margins": 0.3283933103084564, "rewards/rejected": -0.5542455911636353, "step": 440 }, { "epoch": 0.356, "grad_norm": 5.557225704193115, "learning_rate": 4.066509528411151e-06, "logits/chosen": -2.7204031944274902, "logits/rejected": -2.679771900177002, "logps/chosen": -277.78057861328125, "logps/rejected": -310.7375183105469, "loss": 0.5563, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.30937278270721436, "rewards/margins": 0.39451706409454346, "rewards/rejected": -0.7038899660110474, "step": 445 }, { "epoch": 0.36, "grad_norm": 6.662594795227051, "learning_rate": 4.039153688314146e-06, "logits/chosen": -2.8505125045776367, "logits/rejected": -2.7928390502929688, "logps/chosen": -344.05902099609375, "logps/rejected": -324.29962158203125, "loss": 0.596, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.43934765458106995, "rewards/margins": 0.31155315041542053, "rewards/rejected": -0.7509008049964905, "step": 450 }, { "epoch": 0.364, "grad_norm": 5.561422348022461, "learning_rate": 4.011497787155938e-06, "logits/chosen": -2.759361743927002, "logits/rejected": -2.697282314300537, "logps/chosen": -333.2874755859375, "logps/rejected": -329.25225830078125, "loss": 0.5838, "rewards/accuracies": 0.625, "rewards/chosen": -0.529011607170105, "rewards/margins": 0.3510381579399109, "rewards/rejected": -0.8800498247146606, "step": 455 }, { "epoch": 0.368, "grad_norm": 4.668981075286865, "learning_rate": 3.983547216509254e-06, "logits/chosen": -2.8310632705688477, "logits/rejected": -2.7877674102783203, "logps/chosen": -384.951904296875, "logps/rejected": -339.9574890136719, "loss": 0.571, "rewards/accuracies": 0.75, "rewards/chosen": -0.5116842985153198, "rewards/margins": 0.3813716769218445, "rewards/rejected": -0.8930560946464539, "step": 460 }, { "epoch": 0.372, "grad_norm": 3.6945154666900635, "learning_rate": 3.955307425393224e-06, "logits/chosen": -2.860947370529175, "logits/rejected": -2.817092180252075, "logps/chosen": -368.18890380859375, "logps/rejected": -373.40155029296875, "loss": 0.5221, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3717382550239563, "rewards/margins": 0.5002824664115906, "rewards/rejected": -0.8720208406448364, "step": 465 }, { "epoch": 0.376, "grad_norm": 5.232949256896973, "learning_rate": 3.92678391921108e-06, "logits/chosen": -2.7127513885498047, "logits/rejected": -2.689349889755249, "logps/chosen": -373.66119384765625, "logps/rejected": -391.1875305175781, "loss": 0.5644, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.5153363347053528, "rewards/margins": 0.4165223240852356, "rewards/rejected": -0.9318585395812988, "step": 470 }, { "epoch": 0.38, "grad_norm": 5.3401899337768555, "learning_rate": 3.897982258676867e-06, "logits/chosen": -2.75883150100708, "logits/rejected": -2.7517054080963135, "logps/chosen": -315.7061462402344, "logps/rejected": -348.54168701171875, "loss": 0.5835, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4366493821144104, "rewards/margins": 0.3299176096916199, "rewards/rejected": -0.7665671110153198, "step": 475 }, { "epoch": 0.384, "grad_norm": 6.390676498413086, "learning_rate": 3.868908058731376e-06, "logits/chosen": -2.77325701713562, "logits/rejected": -2.7217469215393066, "logps/chosen": -355.1070251464844, "logps/rejected": -332.23199462890625, "loss": 0.6723, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.558080792427063, "rewards/margins": 0.1657411754131317, "rewards/rejected": -0.7238219380378723, "step": 480 }, { "epoch": 0.388, "grad_norm": 8.37760066986084, "learning_rate": 3.839566987447492e-06, "logits/chosen": -2.7543792724609375, "logits/rejected": -2.7266902923583984, "logps/chosen": -345.9546813964844, "logps/rejected": -343.42669677734375, "loss": 0.5789, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.584365963935852, "rewards/margins": 0.35950571298599243, "rewards/rejected": -0.9438716173171997, "step": 485 }, { "epoch": 0.392, "grad_norm": 4.165892124176025, "learning_rate": 3.8099647649251984e-06, "logits/chosen": -2.798603057861328, "logits/rejected": -2.746656894683838, "logps/chosen": -339.4115905761719, "logps/rejected": -342.5590515136719, "loss": 0.6252, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5836890935897827, "rewards/margins": 0.2635918855667114, "rewards/rejected": -0.8472809791564941, "step": 490 }, { "epoch": 0.396, "grad_norm": 5.796815872192383, "learning_rate": 3.780107162176429e-06, "logits/chosen": -2.771759510040283, "logits/rejected": -2.7516114711761475, "logps/chosen": -340.46343994140625, "logps/rejected": -309.8275451660156, "loss": 0.5526, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.5660332441329956, "rewards/margins": 0.4165772497653961, "rewards/rejected": -0.9826105237007141, "step": 495 }, { "epoch": 0.4, "grad_norm": 10.051532745361328, "learning_rate": 3.7500000000000005e-06, "logits/chosen": -2.7455780506134033, "logits/rejected": -2.723891496658325, "logps/chosen": -364.03936767578125, "logps/rejected": -377.3976135253906, "loss": 0.5622, "rewards/accuracies": 0.75, "rewards/chosen": -0.5170733332633972, "rewards/margins": 0.44108885526657104, "rewards/rejected": -0.9581623077392578, "step": 500 }, { "epoch": 0.4, "eval_logits/chosen": -2.732027053833008, "eval_logits/rejected": -2.691253662109375, "eval_logps/chosen": -338.8871765136719, "eval_logps/rejected": -345.97265625, "eval_loss": 0.5688419342041016, "eval_rewards/accuracies": 0.7142857313156128, "eval_rewards/chosen": -0.5564908385276794, "eval_rewards/margins": 0.42293980717658997, "eval_rewards/rejected": -0.9794306755065918, "eval_runtime": 166.7904, "eval_samples_per_second": 2.998, "eval_steps_per_second": 0.378, "step": 500 }, { "epoch": 0.404, "grad_norm": 7.153428554534912, "learning_rate": 3.7196491478468322e-06, "logits/chosen": -2.662764549255371, "logits/rejected": -2.6799817085266113, "logps/chosen": -346.8814392089844, "logps/rejected": -386.2395935058594, "loss": 0.5618, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.6629476547241211, "rewards/margins": 0.44316092133522034, "rewards/rejected": -1.106108546257019, "step": 505 }, { "epoch": 0.408, "grad_norm": 7.824460506439209, "learning_rate": 3.689060522675689e-06, "logits/chosen": -2.739622116088867, "logits/rejected": -2.7229952812194824, "logps/chosen": -341.31610107421875, "logps/rejected": -361.16583251953125, "loss": 0.6013, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5365445613861084, "rewards/margins": 0.3803355097770691, "rewards/rejected": -0.9168800115585327, "step": 510 }, { "epoch": 0.412, "grad_norm": 6.43038272857666, "learning_rate": 3.658240087799655e-06, "logits/chosen": -2.6712212562561035, "logits/rejected": -2.6851272583007812, "logps/chosen": -314.61614990234375, "logps/rejected": -371.5428466796875, "loss": 0.5377, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5410436391830444, "rewards/margins": 0.517443835735321, "rewards/rejected": -1.0584874153137207, "step": 515 }, { "epoch": 0.416, "grad_norm": 8.127975463867188, "learning_rate": 3.627193851723577e-06, "logits/chosen": -2.717458724975586, "logits/rejected": -2.690972328186035, "logps/chosen": -355.3538818359375, "logps/rejected": -379.71417236328125, "loss": 0.6306, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.7785177230834961, "rewards/margins": 0.36465466022491455, "rewards/rejected": -1.143172264099121, "step": 520 }, { "epoch": 0.42, "grad_norm": 8.312115669250488, "learning_rate": 3.595927866972694e-06, "logits/chosen": -2.6868720054626465, "logits/rejected": -2.6861917972564697, "logps/chosen": -292.2730407714844, "logps/rejected": -340.75592041015625, "loss": 0.5573, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.6696837544441223, "rewards/margins": 0.5235880613327026, "rewards/rejected": -1.1932718753814697, "step": 525 }, { "epoch": 0.424, "grad_norm": 9.0608491897583, "learning_rate": 3.564448228912682e-06, "logits/chosen": -2.6300408840179443, "logits/rejected": -2.623781204223633, "logps/chosen": -381.3360900878906, "logps/rejected": -383.34161376953125, "loss": 0.6091, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.7431681752204895, "rewards/margins": 0.39449232816696167, "rewards/rejected": -1.1376605033874512, "step": 530 }, { "epoch": 0.428, "grad_norm": 15.011635780334473, "learning_rate": 3.532761074561355e-06, "logits/chosen": -2.6198954582214355, "logits/rejected": -2.571420192718506, "logps/chosen": -389.4923095703125, "logps/rejected": -430.41204833984375, "loss": 0.5782, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.7040005922317505, "rewards/margins": 0.5225512385368347, "rewards/rejected": -1.2265517711639404, "step": 535 }, { "epoch": 0.432, "grad_norm": 6.0668768882751465, "learning_rate": 3.5008725813922383e-06, "logits/chosen": -2.7458198070526123, "logits/rejected": -2.6754350662231445, "logps/chosen": -353.69500732421875, "logps/rejected": -393.26953125, "loss": 0.529, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.5991845726966858, "rewards/margins": 0.5754216313362122, "rewards/rejected": -1.1746060848236084, "step": 540 }, { "epoch": 0.436, "grad_norm": 9.939155578613281, "learning_rate": 3.4687889661302577e-06, "logits/chosen": -2.648597240447998, "logits/rejected": -2.6514642238616943, "logps/chosen": -319.5657653808594, "logps/rejected": -362.01885986328125, "loss": 0.5329, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.6453290581703186, "rewards/margins": 0.6017133593559265, "rewards/rejected": -1.2470424175262451, "step": 545 }, { "epoch": 0.44, "grad_norm": 7.438101768493652, "learning_rate": 3.436516483539781e-06, "logits/chosen": -2.679978847503662, "logits/rejected": -2.667757034301758, "logps/chosen": -354.5327453613281, "logps/rejected": -376.2767333984375, "loss": 0.6251, "rewards/accuracies": 0.625, "rewards/chosen": -0.8374980688095093, "rewards/margins": 0.40907567739486694, "rewards/rejected": -1.246573805809021, "step": 550 }, { "epoch": 0.444, "grad_norm": 6.391237735748291, "learning_rate": 3.4040614252052305e-06, "logits/chosen": -2.672637939453125, "logits/rejected": -2.6665103435516357, "logps/chosen": -380.81195068359375, "logps/rejected": -405.7100524902344, "loss": 0.6038, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.939714252948761, "rewards/margins": 0.42443543672561646, "rewards/rejected": -1.364149808883667, "step": 555 }, { "epoch": 0.448, "grad_norm": 9.154504776000977, "learning_rate": 3.3714301183045382e-06, "logits/chosen": -2.6303160190582275, "logits/rejected": -2.572775363922119, "logps/chosen": -331.26007080078125, "logps/rejected": -356.6570129394531, "loss": 0.6228, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9475865364074707, "rewards/margins": 0.3488699793815613, "rewards/rejected": -1.2964565753936768, "step": 560 }, { "epoch": 0.452, "grad_norm": 5.733785629272461, "learning_rate": 3.338628924375638e-06, "logits/chosen": -2.7566773891448975, "logits/rejected": -2.7174875736236572, "logps/chosen": -315.8184509277344, "logps/rejected": -379.3972473144531, "loss": 0.5181, "rewards/accuracies": 0.75, "rewards/chosen": -0.7764695286750793, "rewards/margins": 0.5681883096694946, "rewards/rejected": -1.3446576595306396, "step": 565 }, { "epoch": 0.456, "grad_norm": 5.354190349578857, "learning_rate": 3.3056642380762783e-06, "logits/chosen": -2.715209484100342, "logits/rejected": -2.703580379486084, "logps/chosen": -289.91387939453125, "logps/rejected": -322.75933837890625, "loss": 0.5887, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.5646545886993408, "rewards/margins": 0.43763160705566406, "rewards/rejected": -1.0022861957550049, "step": 570 }, { "epoch": 0.46, "grad_norm": 6.91750955581665, "learning_rate": 3.272542485937369e-06, "logits/chosen": -2.6926093101501465, "logits/rejected": -2.6333212852478027, "logps/chosen": -300.4584045410156, "logps/rejected": -308.9971008300781, "loss": 0.5684, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.579850435256958, "rewards/margins": 0.424421489238739, "rewards/rejected": -1.0042719841003418, "step": 575 }, { "epoch": 0.464, "grad_norm": 5.240443706512451, "learning_rate": 3.2392701251101172e-06, "logits/chosen": -2.745445966720581, "logits/rejected": -2.6982531547546387, "logps/chosen": -345.88031005859375, "logps/rejected": -365.571533203125, "loss": 0.5121, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.45542454719543457, "rewards/margins": 0.6121553778648376, "rewards/rejected": -1.067579984664917, "step": 580 }, { "epoch": 0.468, "grad_norm": 5.993870258331299, "learning_rate": 3.205853642107192e-06, "logits/chosen": -2.679216146469116, "logits/rejected": -2.660553216934204, "logps/chosen": -300.2437744140625, "logps/rejected": -327.10833740234375, "loss": 0.6046, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5670121312141418, "rewards/margins": 0.38394877314567566, "rewards/rejected": -0.9509609341621399, "step": 585 }, { "epoch": 0.472, "grad_norm": 9.442901611328125, "learning_rate": 3.1722995515381644e-06, "logits/chosen": -2.657310962677002, "logits/rejected": -2.649341344833374, "logps/chosen": -328.50543212890625, "logps/rejected": -353.29852294921875, "loss": 0.5305, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.47631463408470154, "rewards/margins": 0.5425296425819397, "rewards/rejected": -1.0188442468643188, "step": 590 }, { "epoch": 0.476, "grad_norm": 6.075891971588135, "learning_rate": 3.1386143948394764e-06, "logits/chosen": -2.67082142829895, "logits/rejected": -2.663555145263672, "logps/chosen": -311.79595947265625, "logps/rejected": -385.62750244140625, "loss": 0.545, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.5932539701461792, "rewards/margins": 0.5186047554016113, "rewards/rejected": -1.11185884475708, "step": 595 }, { "epoch": 0.48, "grad_norm": 8.881017684936523, "learning_rate": 3.1048047389991693e-06, "logits/chosen": -2.6688761711120605, "logits/rejected": -2.598611354827881, "logps/chosen": -374.6273193359375, "logps/rejected": -324.2472839355469, "loss": 0.5826, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5414284467697144, "rewards/margins": 0.4649595320224762, "rewards/rejected": -1.0063880681991577, "step": 600 }, { "epoch": 0.48, "eval_logits/chosen": -2.690697431564331, "eval_logits/rejected": -2.6522767543792725, "eval_logps/chosen": -337.7991638183594, "eval_logps/rejected": -359.91156005859375, "eval_loss": 0.5457041263580322, "eval_rewards/accuracies": 0.7242063283920288, "eval_rewards/chosen": -0.5456109642982483, "eval_rewards/margins": 0.5732083916664124, "eval_rewards/rejected": -1.1188193559646606, "eval_runtime": 166.7663, "eval_samples_per_second": 2.998, "eval_steps_per_second": 0.378, "step": 600 }, { "epoch": 0.484, "grad_norm": 8.860865592956543, "learning_rate": 3.0708771752766397e-06, "logits/chosen": -2.714979648590088, "logits/rejected": -2.67262601852417, "logps/chosen": -379.689697265625, "logps/rejected": -389.07537841796875, "loss": 0.5436, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.500022292137146, "rewards/margins": 0.5223508477210999, "rewards/rejected": -1.0223733186721802, "step": 605 }, { "epoch": 0.488, "grad_norm": 7.4055280685424805, "learning_rate": 3.0368383179176584e-06, "logits/chosen": -2.648967981338501, "logits/rejected": -2.5849640369415283, "logps/chosen": -349.82244873046875, "logps/rejected": -420.50494384765625, "loss": 0.5099, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.5480636954307556, "rewards/margins": 0.6970816850662231, "rewards/rejected": -1.2451454401016235, "step": 610 }, { "epoch": 0.492, "grad_norm": 8.267292976379395, "learning_rate": 3.002694802864912e-06, "logits/chosen": -2.6305606365203857, "logits/rejected": -2.6106619834899902, "logps/chosen": -341.2841491699219, "logps/rejected": -381.73297119140625, "loss": 0.5663, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.48357224464416504, "rewards/margins": 0.542944073677063, "rewards/rejected": -1.026516318321228, "step": 615 }, { "epoch": 0.496, "grad_norm": 6.144512176513672, "learning_rate": 2.9684532864643123e-06, "logits/chosen": -2.6233174800872803, "logits/rejected": -2.6076509952545166, "logps/chosen": -316.0887756347656, "logps/rejected": -354.6174621582031, "loss": 0.5047, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.4337923526763916, "rewards/margins": 0.6806012392044067, "rewards/rejected": -1.1143935918807983, "step": 620 }, { "epoch": 0.5, "grad_norm": 12.604687690734863, "learning_rate": 2.9341204441673267e-06, "logits/chosen": -2.668950319290161, "logits/rejected": -2.6760809421539307, "logps/chosen": -368.0353698730469, "logps/rejected": -343.40869140625, "loss": 0.658, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.606735110282898, "rewards/margins": 0.32046255469322205, "rewards/rejected": -0.9271975755691528, "step": 625 }, { "epoch": 0.504, "grad_norm": 9.32141399383545, "learning_rate": 2.8997029692295875e-06, "logits/chosen": -2.6581954956054688, "logits/rejected": -2.6276352405548096, "logps/chosen": -291.1399841308594, "logps/rejected": -327.74859619140625, "loss": 0.6077, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5456727743148804, "rewards/margins": 0.44013065099716187, "rewards/rejected": -0.9858034253120422, "step": 630 }, { "epoch": 0.508, "grad_norm": 8.439581871032715, "learning_rate": 2.8652075714060296e-06, "logits/chosen": -2.673593044281006, "logits/rejected": -2.6902694702148438, "logps/chosen": -301.8299560546875, "logps/rejected": -353.7574157714844, "loss": 0.5776, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5664817094802856, "rewards/margins": 0.4610595107078552, "rewards/rejected": -1.027541160583496, "step": 635 }, { "epoch": 0.512, "grad_norm": 6.46388053894043, "learning_rate": 2.8306409756428067e-06, "logits/chosen": -2.6250970363616943, "logits/rejected": -2.595864772796631, "logps/chosen": -283.65887451171875, "logps/rejected": -289.2749938964844, "loss": 0.5845, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.4361953139305115, "rewards/margins": 0.3893643915653229, "rewards/rejected": -0.8255597949028015, "step": 640 }, { "epoch": 0.516, "grad_norm": 7.4935712814331055, "learning_rate": 2.7960099207662535e-06, "logits/chosen": -2.638918161392212, "logits/rejected": -2.606156826019287, "logps/chosen": -298.98980712890625, "logps/rejected": -329.75079345703125, "loss": 0.5492, "rewards/accuracies": 0.75, "rewards/chosen": -0.4644347131252289, "rewards/margins": 0.5126577615737915, "rewards/rejected": -0.9770925641059875, "step": 645 }, { "epoch": 0.52, "grad_norm": 5.561440467834473, "learning_rate": 2.761321158169134e-06, "logits/chosen": -2.6843574047088623, "logits/rejected": -2.687722682952881, "logps/chosen": -330.052001953125, "logps/rejected": -330.7695007324219, "loss": 0.5886, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.48628172278404236, "rewards/margins": 0.3754786252975464, "rewards/rejected": -0.8617603182792664, "step": 650 }, { "epoch": 0.524, "grad_norm": 5.8051862716674805, "learning_rate": 2.726581450494451e-06, "logits/chosen": -2.6416282653808594, "logits/rejected": -2.6334142684936523, "logps/chosen": -323.7715148925781, "logps/rejected": -333.45831298828125, "loss": 0.5329, "rewards/accuracies": 0.75, "rewards/chosen": -0.2815348505973816, "rewards/margins": 0.5408404469490051, "rewards/rejected": -0.8223752975463867, "step": 655 }, { "epoch": 0.528, "grad_norm": 4.751335144042969, "learning_rate": 2.6917975703170466e-06, "logits/chosen": -2.691120147705078, "logits/rejected": -2.6784567832946777, "logps/chosen": -319.2125244140625, "logps/rejected": -377.9074401855469, "loss": 0.4832, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.34588542580604553, "rewards/margins": 0.7417780160903931, "rewards/rejected": -1.0876634120941162, "step": 660 }, { "epoch": 0.532, "grad_norm": 4.525731086730957, "learning_rate": 2.6569762988232838e-06, "logits/chosen": -2.6197690963745117, "logits/rejected": -2.6266191005706787, "logps/chosen": -293.38787841796875, "logps/rejected": -343.3548583984375, "loss": 0.5703, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.4132465720176697, "rewards/margins": 0.46936559677124023, "rewards/rejected": -0.8826121091842651, "step": 665 }, { "epoch": 0.536, "grad_norm": 13.704550743103027, "learning_rate": 2.6221244244890336e-06, "logits/chosen": -2.677114963531494, "logits/rejected": -2.5927250385284424, "logps/chosen": -350.24310302734375, "logps/rejected": -378.822021484375, "loss": 0.5502, "rewards/accuracies": 0.6875, "rewards/chosen": -0.596604585647583, "rewards/margins": 0.5345416069030762, "rewards/rejected": -1.1311461925506592, "step": 670 }, { "epoch": 0.54, "grad_norm": 13.029091835021973, "learning_rate": 2.587248741756253e-06, "logits/chosen": -2.7029571533203125, "logits/rejected": -2.6872310638427734, "logps/chosen": -327.059326171875, "logps/rejected": -376.5028991699219, "loss": 0.5659, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.4712875485420227, "rewards/margins": 0.5444241762161255, "rewards/rejected": -1.0157115459442139, "step": 675 }, { "epoch": 0.544, "grad_norm": 5.572321891784668, "learning_rate": 2.5523560497083927e-06, "logits/chosen": -2.7125682830810547, "logits/rejected": -2.6776933670043945, "logps/chosen": -343.49658203125, "logps/rejected": -386.14215087890625, "loss": 0.5567, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.6705636978149414, "rewards/margins": 0.5794155597686768, "rewards/rejected": -1.2499791383743286, "step": 680 }, { "epoch": 0.548, "grad_norm": 11.680253028869629, "learning_rate": 2.517453150744904e-06, "logits/chosen": -2.708914041519165, "logits/rejected": -2.659080982208252, "logps/chosen": -388.8927307128906, "logps/rejected": -396.7651062011719, "loss": 0.6039, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.7607260942459106, "rewards/margins": 0.5218645334243774, "rewards/rejected": -1.2825905084609985, "step": 685 }, { "epoch": 0.552, "grad_norm": 7.640016555786133, "learning_rate": 2.482546849255096e-06, "logits/chosen": -2.6931045055389404, "logits/rejected": -2.6509275436401367, "logps/chosen": -371.285400390625, "logps/rejected": -441.174560546875, "loss": 0.5044, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.866797924041748, "rewards/margins": 0.8486925363540649, "rewards/rejected": -1.7154903411865234, "step": 690 }, { "epoch": 0.556, "grad_norm": 6.240846157073975, "learning_rate": 2.447643950291608e-06, "logits/chosen": -2.527269124984741, "logits/rejected": -2.4740500450134277, "logps/chosen": -335.0668640136719, "logps/rejected": -334.4136657714844, "loss": 0.5401, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.7494308352470398, "rewards/margins": 0.6046732068061829, "rewards/rejected": -1.3541040420532227, "step": 695 }, { "epoch": 0.56, "grad_norm": 10.70870590209961, "learning_rate": 2.4127512582437486e-06, "logits/chosen": -2.6449031829833984, "logits/rejected": -2.6302623748779297, "logps/chosen": -367.7524719238281, "logps/rejected": -402.79736328125, "loss": 0.5313, "rewards/accuracies": 0.6875, "rewards/chosen": -0.799649178981781, "rewards/margins": 0.6352438926696777, "rewards/rejected": -1.434893012046814, "step": 700 }, { "epoch": 0.56, "eval_logits/chosen": -2.658555030822754, "eval_logits/rejected": -2.617255449295044, "eval_logps/chosen": -354.6570739746094, "eval_logps/rejected": -381.07342529296875, "eval_loss": 0.5387491583824158, "eval_rewards/accuracies": 0.7242063283920288, "eval_rewards/chosen": -0.7141901850700378, "eval_rewards/margins": 0.616247832775116, "eval_rewards/rejected": -1.3304380178451538, "eval_runtime": 165.7154, "eval_samples_per_second": 3.017, "eval_steps_per_second": 0.38, "step": 700 }, { "epoch": 0.564, "grad_norm": 16.324960708618164, "learning_rate": 2.377875575510967e-06, "logits/chosen": -2.5862739086151123, "logits/rejected": -2.5246312618255615, "logps/chosen": -361.58331298828125, "logps/rejected": -368.5226745605469, "loss": 0.5767, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.8630898594856262, "rewards/margins": 0.5553407669067383, "rewards/rejected": -1.4184306859970093, "step": 705 }, { "epoch": 0.568, "grad_norm": 9.928940773010254, "learning_rate": 2.3430237011767166e-06, "logits/chosen": -2.6898465156555176, "logits/rejected": -2.6531174182891846, "logps/chosen": -348.86614990234375, "logps/rejected": -376.40582275390625, "loss": 0.5582, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.7559942007064819, "rewards/margins": 0.5151509642601013, "rewards/rejected": -1.2711451053619385, "step": 710 }, { "epoch": 0.572, "grad_norm": 8.1253023147583, "learning_rate": 2.3082024296829538e-06, "logits/chosen": -2.6176774501800537, "logits/rejected": -2.579152822494507, "logps/chosen": -302.24481201171875, "logps/rejected": -392.26226806640625, "loss": 0.4689, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.6992444396018982, "rewards/margins": 0.8311125040054321, "rewards/rejected": -1.530356764793396, "step": 715 }, { "epoch": 0.576, "grad_norm": 13.087418556213379, "learning_rate": 2.2734185495055503e-06, "logits/chosen": -2.67484974861145, "logits/rejected": -2.5980706214904785, "logps/chosen": -360.56085205078125, "logps/rejected": -359.30743408203125, "loss": 0.5599, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.7009941935539246, "rewards/margins": 0.48396244645118713, "rewards/rejected": -1.184956669807434, "step": 720 }, { "epoch": 0.58, "grad_norm": 8.680456161499023, "learning_rate": 2.238678841830867e-06, "logits/chosen": -2.6140735149383545, "logits/rejected": -2.5830371379852295, "logps/chosen": -364.34942626953125, "logps/rejected": -394.05047607421875, "loss": 0.5686, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.6779341697692871, "rewards/margins": 0.5314933657646179, "rewards/rejected": -1.2094275951385498, "step": 725 }, { "epoch": 0.584, "grad_norm": 10.017196655273438, "learning_rate": 2.2039900792337477e-06, "logits/chosen": -2.644421339035034, "logits/rejected": -2.6203815937042236, "logps/chosen": -375.94244384765625, "logps/rejected": -401.27056884765625, "loss": 0.5818, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8054073452949524, "rewards/margins": 0.5798496007919312, "rewards/rejected": -1.3852570056915283, "step": 730 }, { "epoch": 0.588, "grad_norm": 6.945478439331055, "learning_rate": 2.1693590243571937e-06, "logits/chosen": -2.6634681224823, "logits/rejected": -2.6050820350646973, "logps/chosen": -348.97540283203125, "logps/rejected": -389.2197265625, "loss": 0.5502, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.855374813079834, "rewards/margins": 0.6680639982223511, "rewards/rejected": -1.523438811302185, "step": 735 }, { "epoch": 0.592, "grad_norm": 10.911724090576172, "learning_rate": 2.134792428593971e-06, "logits/chosen": -2.5928680896759033, "logits/rejected": -2.563474416732788, "logps/chosen": -328.8409729003906, "logps/rejected": -379.95599365234375, "loss": 0.5491, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.9474200010299683, "rewards/margins": 0.49488845467567444, "rewards/rejected": -1.4423085451126099, "step": 740 }, { "epoch": 0.596, "grad_norm": 9.961637496948242, "learning_rate": 2.1002970307704134e-06, "logits/chosen": -2.7088496685028076, "logits/rejected": -2.646193027496338, "logps/chosen": -421.77178955078125, "logps/rejected": -460.4210510253906, "loss": 0.5682, "rewards/accuracies": 0.6875, "rewards/chosen": -0.875130295753479, "rewards/margins": 0.6770623922348022, "rewards/rejected": -1.5521926879882812, "step": 745 }, { "epoch": 0.6, "grad_norm": 6.698659420013428, "learning_rate": 2.0658795558326745e-06, "logits/chosen": -2.6276183128356934, "logits/rejected": -2.6407034397125244, "logps/chosen": -371.37225341796875, "logps/rejected": -429.2191467285156, "loss": 0.5026, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.8518401384353638, "rewards/margins": 0.7702382206916809, "rewards/rejected": -1.622078537940979, "step": 750 }, { "epoch": 0.604, "grad_norm": 9.7139310836792, "learning_rate": 2.031546713535688e-06, "logits/chosen": -2.6299374103546143, "logits/rejected": -2.572783946990967, "logps/chosen": -366.63360595703125, "logps/rejected": -413.1153259277344, "loss": 0.5502, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7979942560195923, "rewards/margins": 0.6386594772338867, "rewards/rejected": -1.4366536140441895, "step": 755 }, { "epoch": 0.608, "grad_norm": 12.361163139343262, "learning_rate": 1.997305197135089e-06, "logits/chosen": -2.554405689239502, "logits/rejected": -2.564911365509033, "logps/chosen": -300.50689697265625, "logps/rejected": -360.9603271484375, "loss": 0.5553, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.8539615869522095, "rewards/margins": 0.5313900709152222, "rewards/rejected": -1.3853518962860107, "step": 760 }, { "epoch": 0.612, "grad_norm": 8.68078899383545, "learning_rate": 1.963161682082342e-06, "logits/chosen": -2.5307528972625732, "logits/rejected": -2.5801663398742676, "logps/chosen": -357.2311706542969, "logps/rejected": -383.4229431152344, "loss": 0.567, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.7676454186439514, "rewards/margins": 0.5505531430244446, "rewards/rejected": -1.3181986808776855, "step": 765 }, { "epoch": 0.616, "grad_norm": 4.522902011871338, "learning_rate": 1.9291228247233607e-06, "logits/chosen": -2.5538437366485596, "logits/rejected": -2.5088653564453125, "logps/chosen": -338.6076965332031, "logps/rejected": -382.578369140625, "loss": 0.5448, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.5785464644432068, "rewards/margins": 0.5552471280097961, "rewards/rejected": -1.133793592453003, "step": 770 }, { "epoch": 0.62, "grad_norm": 8.719709396362305, "learning_rate": 1.895195261000831e-06, "logits/chosen": -2.619828462600708, "logits/rejected": -2.574763298034668, "logps/chosen": -361.0684814453125, "logps/rejected": -423.12408447265625, "loss": 0.5232, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.5979502201080322, "rewards/margins": 0.6523123383522034, "rewards/rejected": -1.2502626180648804, "step": 775 }, { "epoch": 0.624, "grad_norm": 7.265892505645752, "learning_rate": 1.8613856051605242e-06, "logits/chosen": -2.4674975872039795, "logits/rejected": -2.4988579750061035, "logps/chosen": -303.31695556640625, "logps/rejected": -350.3982849121094, "loss": 0.5336, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.65043705701828, "rewards/margins": 0.534541666507721, "rewards/rejected": -1.184978723526001, "step": 780 }, { "epoch": 0.628, "grad_norm": 5.215295314788818, "learning_rate": 1.827700448461836e-06, "logits/chosen": -2.6682486534118652, "logits/rejected": -2.595508098602295, "logps/chosen": -380.72271728515625, "logps/rejected": -402.666748046875, "loss": 0.5612, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.8000537157058716, "rewards/margins": 0.5293024182319641, "rewards/rejected": -1.3293559551239014, "step": 785 }, { "epoch": 0.632, "grad_norm": 9.841978073120117, "learning_rate": 1.7941463578928088e-06, "logits/chosen": -2.592263698577881, "logits/rejected": -2.559999942779541, "logps/chosen": -422.0464782714844, "logps/rejected": -429.20831298828125, "loss": 0.58, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.7715168595314026, "rewards/margins": 0.5217434167861938, "rewards/rejected": -1.2932603359222412, "step": 790 }, { "epoch": 0.636, "grad_norm": 7.524374485015869, "learning_rate": 1.7607298748898844e-06, "logits/chosen": -2.6286463737487793, "logits/rejected": -2.6194465160369873, "logps/chosen": -354.8101501464844, "logps/rejected": -398.53948974609375, "loss": 0.588, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.7351819276809692, "rewards/margins": 0.5240973830223083, "rewards/rejected": -1.2592793703079224, "step": 795 }, { "epoch": 0.64, "grad_norm": 7.135324954986572, "learning_rate": 1.7274575140626318e-06, "logits/chosen": -2.50719952583313, "logits/rejected": -2.4550704956054688, "logps/chosen": -336.1716003417969, "logps/rejected": -404.1689147949219, "loss": 0.5332, "rewards/accuracies": 0.75, "rewards/chosen": -0.632702648639679, "rewards/margins": 0.5536705255508423, "rewards/rejected": -1.1863731145858765, "step": 800 }, { "epoch": 0.64, "eval_logits/chosen": -2.6166913509368896, "eval_logits/rejected": -2.5759570598602295, "eval_logps/chosen": -355.7965393066406, "eval_logps/rejected": -381.5441589355469, "eval_loss": 0.5385683178901672, "eval_rewards/accuracies": 0.7182539701461792, "eval_rewards/chosen": -0.7255847454071045, "eval_rewards/margins": 0.6095607876777649, "eval_rewards/rejected": -1.3351454734802246, "eval_runtime": 165.673, "eval_samples_per_second": 3.018, "eval_steps_per_second": 0.38, "step": 800 }, { "epoch": 0.644, "grad_norm": 6.823562145233154, "learning_rate": 1.6943357619237227e-06, "logits/chosen": -2.5743985176086426, "logits/rejected": -2.563049793243408, "logps/chosen": -344.70318603515625, "logps/rejected": -374.0846862792969, "loss": 0.4913, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.7136049866676331, "rewards/margins": 0.67967289686203, "rewards/rejected": -1.393277883529663, "step": 805 }, { "epoch": 0.648, "grad_norm": 9.05685806274414, "learning_rate": 1.661371075624363e-06, "logits/chosen": -2.6020989418029785, "logits/rejected": -2.6505770683288574, "logps/chosen": -347.4692687988281, "logps/rejected": -473.5855407714844, "loss": 0.5726, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.9543269872665405, "rewards/margins": 0.5893380641937256, "rewards/rejected": -1.5436651706695557, "step": 810 }, { "epoch": 0.652, "grad_norm": 10.01281452178955, "learning_rate": 1.6285698816954626e-06, "logits/chosen": -2.6235404014587402, "logits/rejected": -2.5892868041992188, "logps/chosen": -362.3233947753906, "logps/rejected": -383.004638671875, "loss": 0.5152, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.6492418646812439, "rewards/margins": 0.675209641456604, "rewards/rejected": -1.3244515657424927, "step": 815 }, { "epoch": 0.656, "grad_norm": 10.048011779785156, "learning_rate": 1.5959385747947697e-06, "logits/chosen": -2.5589287281036377, "logits/rejected": -2.503087043762207, "logps/chosen": -325.8083801269531, "logps/rejected": -343.68316650390625, "loss": 0.5628, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.9260866045951843, "rewards/margins": 0.5877935886383057, "rewards/rejected": -1.5138801336288452, "step": 820 }, { "epoch": 0.66, "grad_norm": 10.148550987243652, "learning_rate": 1.56348351646022e-06, "logits/chosen": -2.459164619445801, "logits/rejected": -2.4178318977355957, "logps/chosen": -334.05731201171875, "logps/rejected": -384.0740661621094, "loss": 0.5496, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9508736729621887, "rewards/margins": 0.5749450922012329, "rewards/rejected": -1.5258188247680664, "step": 825 }, { "epoch": 0.664, "grad_norm": 11.33963680267334, "learning_rate": 1.5312110338697427e-06, "logits/chosen": -2.5506274700164795, "logits/rejected": -2.476365566253662, "logps/chosen": -354.90130615234375, "logps/rejected": -414.8627014160156, "loss": 0.5309, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.1112221479415894, "rewards/margins": 0.6936079263687134, "rewards/rejected": -1.8048301935195923, "step": 830 }, { "epoch": 0.668, "grad_norm": 9.498885154724121, "learning_rate": 1.4991274186077632e-06, "logits/chosen": -2.5485012531280518, "logits/rejected": -2.539670467376709, "logps/chosen": -386.7198181152344, "logps/rejected": -442.57421875, "loss": 0.524, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.0074347257614136, "rewards/margins": 0.6671037077903748, "rewards/rejected": -1.6745383739471436, "step": 835 }, { "epoch": 0.672, "grad_norm": 10.999540328979492, "learning_rate": 1.467238925438646e-06, "logits/chosen": -2.5577821731567383, "logits/rejected": -2.5113046169281006, "logps/chosen": -419.2740173339844, "logps/rejected": -448.5807189941406, "loss": 0.6084, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.9763299226760864, "rewards/margins": 0.62681645154953, "rewards/rejected": -1.6031463146209717, "step": 840 }, { "epoch": 0.676, "grad_norm": 8.105618476867676, "learning_rate": 1.4355517710873184e-06, "logits/chosen": -2.524392604827881, "logits/rejected": -2.4944310188293457, "logps/chosen": -380.297119140625, "logps/rejected": -396.42138671875, "loss": 0.5097, "rewards/accuracies": 0.75, "rewards/chosen": -0.9465241432189941, "rewards/margins": 0.6743323802947998, "rewards/rejected": -1.6208562850952148, "step": 845 }, { "epoch": 0.68, "grad_norm": 14.872075080871582, "learning_rate": 1.4040721330273063e-06, "logits/chosen": -2.496351957321167, "logits/rejected": -2.5108532905578613, "logps/chosen": -367.57183837890625, "logps/rejected": -420.6947326660156, "loss": 0.6583, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.039666771888733, "rewards/margins": 0.4930298328399658, "rewards/rejected": -1.5326964855194092, "step": 850 }, { "epoch": 0.684, "grad_norm": 9.286355018615723, "learning_rate": 1.3728061482764238e-06, "logits/chosen": -2.626911163330078, "logits/rejected": -2.6237576007843018, "logps/chosen": -398.8912048339844, "logps/rejected": -464.3138732910156, "loss": 0.6356, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.851772129535675, "rewards/margins": 0.5085344910621643, "rewards/rejected": -1.3603065013885498, "step": 855 }, { "epoch": 0.688, "grad_norm": 9.629425048828125, "learning_rate": 1.3417599122003464e-06, "logits/chosen": -2.6108672618865967, "logits/rejected": -2.600440502166748, "logps/chosen": -341.123046875, "logps/rejected": -377.686767578125, "loss": 0.6128, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.8103886842727661, "rewards/margins": 0.4344421327114105, "rewards/rejected": -1.244830846786499, "step": 860 }, { "epoch": 0.692, "grad_norm": 10.558273315429688, "learning_rate": 1.3109394773243117e-06, "logits/chosen": -2.5375044345855713, "logits/rejected": -2.5400888919830322, "logps/chosen": -382.33154296875, "logps/rejected": -431.41455078125, "loss": 0.5468, "rewards/accuracies": 0.75, "rewards/chosen": -0.8447578549385071, "rewards/margins": 0.7139667272567749, "rewards/rejected": -1.5587245225906372, "step": 865 }, { "epoch": 0.696, "grad_norm": 11.613428115844727, "learning_rate": 1.280350852153168e-06, "logits/chosen": -2.610506296157837, "logits/rejected": -2.5400068759918213, "logps/chosen": -361.366943359375, "logps/rejected": -373.01904296875, "loss": 0.5561, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.8189024925231934, "rewards/margins": 0.5556577444076538, "rewards/rejected": -1.3745602369308472, "step": 870 }, { "epoch": 0.7, "grad_norm": 11.380918502807617, "learning_rate": 1.2500000000000007e-06, "logits/chosen": -2.5413687229156494, "logits/rejected": -2.5202109813690186, "logps/chosen": -360.80889892578125, "logps/rejected": -413.19012451171875, "loss": 0.5164, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.7272705435752869, "rewards/margins": 0.6798168420791626, "rewards/rejected": -1.4070874452590942, "step": 875 }, { "epoch": 0.704, "grad_norm": 11.80184555053711, "learning_rate": 1.2198928378235717e-06, "logits/chosen": -2.5873050689697266, "logits/rejected": -2.576911211013794, "logps/chosen": -299.10498046875, "logps/rejected": -388.73211669921875, "loss": 0.5155, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6009725332260132, "rewards/margins": 0.6888505220413208, "rewards/rejected": -1.2898229360580444, "step": 880 }, { "epoch": 0.708, "grad_norm": 5.85650634765625, "learning_rate": 1.1900352350748026e-06, "logits/chosen": -2.560586929321289, "logits/rejected": -2.5254247188568115, "logps/chosen": -374.28692626953125, "logps/rejected": -407.04827880859375, "loss": 0.5088, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.7127686142921448, "rewards/margins": 0.7965149879455566, "rewards/rejected": -1.509283423423767, "step": 885 }, { "epoch": 0.712, "grad_norm": 7.8027448654174805, "learning_rate": 1.160433012552508e-06, "logits/chosen": -2.5033535957336426, "logits/rejected": -2.5112838745117188, "logps/chosen": -330.83880615234375, "logps/rejected": -380.98297119140625, "loss": 0.5379, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.7824558019638062, "rewards/margins": 0.5539994239807129, "rewards/rejected": -1.3364553451538086, "step": 890 }, { "epoch": 0.716, "grad_norm": 8.75863265991211, "learning_rate": 1.1310919412686248e-06, "logits/chosen": -2.5839171409606934, "logits/rejected": -2.5830111503601074, "logps/chosen": -370.30780029296875, "logps/rejected": -396.5429992675781, "loss": 0.5589, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.7587816119194031, "rewards/margins": 0.5284246802330017, "rewards/rejected": -1.2872062921524048, "step": 895 }, { "epoch": 0.72, "grad_norm": 8.253792762756348, "learning_rate": 1.1020177413231334e-06, "logits/chosen": -2.5888657569885254, "logits/rejected": -2.5641961097717285, "logps/chosen": -352.4532775878906, "logps/rejected": -376.51751708984375, "loss": 0.5334, "rewards/accuracies": 0.75, "rewards/chosen": -0.746880054473877, "rewards/margins": 0.5956496000289917, "rewards/rejected": -1.342529535293579, "step": 900 }, { "epoch": 0.72, "eval_logits/chosen": -2.5998997688293457, "eval_logits/rejected": -2.5573904514312744, "eval_logps/chosen": -353.8529357910156, "eval_logps/rejected": -380.3204345703125, "eval_loss": 0.536827027797699, "eval_rewards/accuracies": 0.716269850730896, "eval_rewards/chosen": -0.7061484456062317, "eval_rewards/margins": 0.616759717464447, "eval_rewards/rejected": -1.3229081630706787, "eval_runtime": 165.7628, "eval_samples_per_second": 3.016, "eval_steps_per_second": 0.38, "step": 900 }, { "epoch": 0.724, "grad_norm": 7.283039093017578, "learning_rate": 1.073216080788921e-06, "logits/chosen": -2.6033217906951904, "logits/rejected": -2.5845823287963867, "logps/chosen": -361.44329833984375, "logps/rejected": -374.8320617675781, "loss": 0.6075, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.7317181825637817, "rewards/margins": 0.37635332345962524, "rewards/rejected": -1.1080714464187622, "step": 905 }, { "epoch": 0.728, "grad_norm": 9.450459480285645, "learning_rate": 1.0446925746067768e-06, "logits/chosen": -2.5516154766082764, "logits/rejected": -2.495788097381592, "logps/chosen": -316.8692321777344, "logps/rejected": -324.5566101074219, "loss": 0.5047, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.7124193906784058, "rewards/margins": 0.6433127522468567, "rewards/rejected": -1.3557320833206177, "step": 910 }, { "epoch": 0.732, "grad_norm": 11.512716293334961, "learning_rate": 1.0164527834907468e-06, "logits/chosen": -2.4677295684814453, "logits/rejected": -2.4644956588745117, "logps/chosen": -342.6759948730469, "logps/rejected": -419.4685974121094, "loss": 0.4815, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.7622194886207581, "rewards/margins": 0.7687476277351379, "rewards/rejected": -1.5309669971466064, "step": 915 }, { "epoch": 0.736, "grad_norm": 27.01951789855957, "learning_rate": 9.88502212844063e-07, "logits/chosen": -2.5636465549468994, "logits/rejected": -2.5651931762695312, "logps/chosen": -345.3995666503906, "logps/rejected": -412.34979248046875, "loss": 0.622, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.8238021731376648, "rewards/margins": 0.42956480383872986, "rewards/rejected": -1.2533669471740723, "step": 920 }, { "epoch": 0.74, "grad_norm": 13.12364387512207, "learning_rate": 9.608463116858544e-07, "logits/chosen": -2.5695652961730957, "logits/rejected": -2.5348198413848877, "logps/chosen": -351.7240905761719, "logps/rejected": -388.83319091796875, "loss": 0.5433, "rewards/accuracies": 0.6875, "rewards/chosen": -0.76836758852005, "rewards/margins": 0.6143354773521423, "rewards/rejected": -1.3827030658721924, "step": 925 }, { "epoch": 0.744, "grad_norm": 10.578348159790039, "learning_rate": 9.334904715888496e-07, "logits/chosen": -2.4992146492004395, "logits/rejected": -2.501399517059326, "logps/chosen": -339.5255432128906, "logps/rejected": -395.8268737792969, "loss": 0.5339, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.775671660900116, "rewards/margins": 0.6518876552581787, "rewards/rejected": -1.4275591373443604, "step": 930 }, { "epoch": 0.748, "grad_norm": 7.748569011688232, "learning_rate": 9.064400256282757e-07, "logits/chosen": -2.57441782951355, "logits/rejected": -2.546863079071045, "logps/chosen": -355.07818603515625, "logps/rejected": -380.3440246582031, "loss": 0.559, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.6962515115737915, "rewards/margins": 0.5717657208442688, "rewards/rejected": -1.268017292022705, "step": 935 }, { "epoch": 0.752, "grad_norm": 7.653827667236328, "learning_rate": 8.797002473421729e-07, "logits/chosen": -2.544231653213501, "logits/rejected": -2.553048610687256, "logps/chosen": -380.5497131347656, "logps/rejected": -403.52191162109375, "loss": 0.5081, "rewards/accuracies": 0.75, "rewards/chosen": -0.5373459458351135, "rewards/margins": 0.6796460151672363, "rewards/rejected": -1.216991901397705, "step": 940 }, { "epoch": 0.756, "grad_norm": 14.531281471252441, "learning_rate": 8.532763497032987e-07, "logits/chosen": -2.4647645950317383, "logits/rejected": -2.452423572540283, "logps/chosen": -368.66497802734375, "logps/rejected": -440.90313720703125, "loss": 0.5264, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.7574427127838135, "rewards/margins": 0.6616984605789185, "rewards/rejected": -1.4191412925720215, "step": 945 }, { "epoch": 0.76, "grad_norm": 6.607179164886475, "learning_rate": 8.271734841028553e-07, "logits/chosen": -2.6168630123138428, "logits/rejected": -2.6241250038146973, "logps/chosen": -340.37542724609375, "logps/rejected": -366.9325866699219, "loss": 0.5419, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.7232319712638855, "rewards/margins": 0.5706599950790405, "rewards/rejected": -1.2938919067382812, "step": 950 }, { "epoch": 0.764, "grad_norm": 7.8033528327941895, "learning_rate": 8.013967393462094e-07, "logits/chosen": -2.4783270359039307, "logits/rejected": -2.501206874847412, "logps/chosen": -348.3237609863281, "logps/rejected": -384.16656494140625, "loss": 0.5859, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.7813480496406555, "rewards/margins": 0.5686533451080322, "rewards/rejected": -1.3500014543533325, "step": 955 }, { "epoch": 0.768, "grad_norm": 6.114492893218994, "learning_rate": 7.759511406608255e-07, "logits/chosen": -2.5830774307250977, "logits/rejected": -2.516847848892212, "logps/chosen": -397.07305908203125, "logps/rejected": -403.8395080566406, "loss": 0.4834, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.8136352300643921, "rewards/margins": 0.8824082612991333, "rewards/rejected": -1.6960432529449463, "step": 960 }, { "epoch": 0.772, "grad_norm": 12.286111831665039, "learning_rate": 7.508416487165862e-07, "logits/chosen": -2.4968883991241455, "logits/rejected": -2.5091567039489746, "logps/chosen": -366.52630615234375, "logps/rejected": -400.1545715332031, "loss": 0.5807, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7833856344223022, "rewards/margins": 0.5320286154747009, "rewards/rejected": -1.3154141902923584, "step": 965 }, { "epoch": 0.776, "grad_norm": 12.27044677734375, "learning_rate": 7.260731586586983e-07, "logits/chosen": -2.4706804752349854, "logits/rejected": -2.4732460975646973, "logps/chosen": -339.1402587890625, "logps/rejected": -404.2414245605469, "loss": 0.6221, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.9275779724121094, "rewards/margins": 0.46196335554122925, "rewards/rejected": -1.3895412683486938, "step": 970 }, { "epoch": 0.78, "grad_norm": 7.917988300323486, "learning_rate": 7.016504991533727e-07, "logits/chosen": -2.593116283416748, "logits/rejected": -2.565453290939331, "logps/chosen": -383.8894348144531, "logps/rejected": -424.5870666503906, "loss": 0.4774, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.6088994145393372, "rewards/margins": 0.7229386568069458, "rewards/rejected": -1.3318378925323486, "step": 975 }, { "epoch": 0.784, "grad_norm": 5.051321983337402, "learning_rate": 6.775784314464717e-07, "logits/chosen": -2.4984991550445557, "logits/rejected": -2.5199942588806152, "logps/chosen": -342.84515380859375, "logps/rejected": -421.0189514160156, "loss": 0.4971, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.7955012917518616, "rewards/margins": 0.7193040251731873, "rewards/rejected": -1.5148054361343384, "step": 980 }, { "epoch": 0.788, "grad_norm": 8.092668533325195, "learning_rate": 6.538616484352902e-07, "logits/chosen": -2.5383505821228027, "logits/rejected": -2.526851177215576, "logps/chosen": -345.52655029296875, "logps/rejected": -379.8380432128906, "loss": 0.5156, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.7932868003845215, "rewards/margins": 0.6958837509155273, "rewards/rejected": -1.4891705513000488, "step": 985 }, { "epoch": 0.792, "grad_norm": 9.803926467895508, "learning_rate": 6.305047737536707e-07, "logits/chosen": -2.509049654006958, "logits/rejected": -2.463141679763794, "logps/chosen": -351.3589172363281, "logps/rejected": -371.07281494140625, "loss": 0.5485, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.8914083242416382, "rewards/margins": 0.6496592164039612, "rewards/rejected": -1.5410678386688232, "step": 990 }, { "epoch": 0.796, "grad_norm": 15.167935371398926, "learning_rate": 6.075123608706093e-07, "logits/chosen": -2.5473320484161377, "logits/rejected": -2.5690910816192627, "logps/chosen": -365.46673583984375, "logps/rejected": -389.1126403808594, "loss": 0.5431, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.7901977300643921, "rewards/margins": 0.5910181999206543, "rewards/rejected": -1.381216049194336, "step": 995 }, { "epoch": 0.8, "grad_norm": 7.769952774047852, "learning_rate": 5.848888922025553e-07, "logits/chosen": -2.461652994155884, "logits/rejected": -2.4495410919189453, "logps/chosen": -327.51654052734375, "logps/rejected": -418.86737060546875, "loss": 0.5837, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.9019506573677063, "rewards/margins": 0.5747453570365906, "rewards/rejected": -1.4766958951950073, "step": 1000 }, { "epoch": 0.8, "eval_logits/chosen": -2.5706355571746826, "eval_logits/rejected": -2.527315855026245, "eval_logps/chosen": -362.7657165527344, "eval_logps/rejected": -395.8990783691406, "eval_loss": 0.5301549434661865, "eval_rewards/accuracies": 0.716269850730896, "eval_rewards/chosen": -0.795275866985321, "eval_rewards/margins": 0.6834191083908081, "eval_rewards/rejected": -1.4786947965621948, "eval_runtime": 165.7401, "eval_samples_per_second": 3.017, "eval_steps_per_second": 0.38, "step": 1000 }, { "epoch": 0.804, "grad_norm": 9.499650001525879, "learning_rate": 5.626387782395512e-07, "logits/chosen": -2.570199489593506, "logits/rejected": -2.5388243198394775, "logps/chosen": -386.8207702636719, "logps/rejected": -439.36053466796875, "loss": 0.5546, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.9186019897460938, "rewards/margins": 0.6662044525146484, "rewards/rejected": -1.5848064422607422, "step": 1005 }, { "epoch": 0.808, "grad_norm": 8.864973068237305, "learning_rate": 5.407663566854008e-07, "logits/chosen": -2.514481544494629, "logits/rejected": -2.469686269760132, "logps/chosen": -375.16436767578125, "logps/rejected": -431.52813720703125, "loss": 0.5046, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.7475723028182983, "rewards/margins": 0.7609124779701233, "rewards/rejected": -1.5084848403930664, "step": 1010 }, { "epoch": 0.812, "grad_norm": 17.737668991088867, "learning_rate": 5.192758916120236e-07, "logits/chosen": -2.5291812419891357, "logits/rejected": -2.501344680786133, "logps/chosen": -376.4272766113281, "logps/rejected": -419.0375061035156, "loss": 0.5571, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.8887074589729309, "rewards/margins": 0.6588888168334961, "rewards/rejected": -1.5475962162017822, "step": 1015 }, { "epoch": 0.816, "grad_norm": 9.168149948120117, "learning_rate": 4.981715726281666e-07, "logits/chosen": -2.5210018157958984, "logits/rejected": -2.518200635910034, "logps/chosen": -374.60687255859375, "logps/rejected": -385.31317138671875, "loss": 0.6639, "rewards/accuracies": 0.625, "rewards/chosen": -0.9296348690986633, "rewards/margins": 0.35056614875793457, "rewards/rejected": -1.2802008390426636, "step": 1020 }, { "epoch": 0.82, "grad_norm": 6.545177936553955, "learning_rate": 4.774575140626317e-07, "logits/chosen": -2.553743839263916, "logits/rejected": -2.564492702484131, "logps/chosen": -374.25372314453125, "logps/rejected": -418.3985290527344, "loss": 0.5131, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.7494795918464661, "rewards/margins": 0.7920123338699341, "rewards/rejected": -1.5414918661117554, "step": 1025 }, { "epoch": 0.824, "grad_norm": 10.368010520935059, "learning_rate": 4.5713775416217884e-07, "logits/chosen": -2.5401394367218018, "logits/rejected": -2.5111076831817627, "logps/chosen": -364.0164794921875, "logps/rejected": -398.11749267578125, "loss": 0.493, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7943639159202576, "rewards/margins": 0.7978767156600952, "rewards/rejected": -1.5922406911849976, "step": 1030 }, { "epoch": 0.828, "grad_norm": 12.131779670715332, "learning_rate": 4.372162543042624e-07, "logits/chosen": -2.579563856124878, "logits/rejected": -2.539201259613037, "logps/chosen": -327.2681579589844, "logps/rejected": -347.89068603515625, "loss": 0.6285, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9573305249214172, "rewards/margins": 0.44094863533973694, "rewards/rejected": -1.3982793092727661, "step": 1035 }, { "epoch": 0.832, "grad_norm": 7.402243137359619, "learning_rate": 4.1769689822475147e-07, "logits/chosen": -2.533160924911499, "logits/rejected": -2.514822244644165, "logps/chosen": -332.4081726074219, "logps/rejected": -374.7890930175781, "loss": 0.5243, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.7698723673820496, "rewards/margins": 0.6416595578193665, "rewards/rejected": -1.4115320444107056, "step": 1040 }, { "epoch": 0.836, "grad_norm": 11.70563793182373, "learning_rate": 3.9858349126078945e-07, "logits/chosen": -2.4150428771972656, "logits/rejected": -2.439276933670044, "logps/chosen": -360.05657958984375, "logps/rejected": -423.20184326171875, "loss": 0.5974, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.8564049005508423, "rewards/margins": 0.5383543372154236, "rewards/rejected": -1.394759178161621, "step": 1045 }, { "epoch": 0.84, "grad_norm": 13.589889526367188, "learning_rate": 3.798797596089351e-07, "logits/chosen": -2.5914146900177, "logits/rejected": -2.56174898147583, "logps/chosen": -381.68048095703125, "logps/rejected": -398.01007080078125, "loss": 0.5775, "rewards/accuracies": 0.75, "rewards/chosen": -0.8752404451370239, "rewards/margins": 0.5522125959396362, "rewards/rejected": -1.4274529218673706, "step": 1050 }, { "epoch": 0.844, "grad_norm": 8.736641883850098, "learning_rate": 3.615893495987335e-07, "logits/chosen": -2.4973983764648438, "logits/rejected": -2.51640248298645, "logps/chosen": -355.0185852050781, "logps/rejected": -448.5818786621094, "loss": 0.5172, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.703173816204071, "rewards/margins": 0.7288501858711243, "rewards/rejected": -1.4320241212844849, "step": 1055 }, { "epoch": 0.848, "grad_norm": 7.283242702484131, "learning_rate": 3.4371582698185636e-07, "logits/chosen": -2.510960578918457, "logits/rejected": -2.5245649814605713, "logps/chosen": -381.8621520996094, "logps/rejected": -428.3202209472656, "loss": 0.4451, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.8054243326187134, "rewards/margins": 0.8550852537155151, "rewards/rejected": -1.660509467124939, "step": 1060 }, { "epoch": 0.852, "grad_norm": 11.76474380493164, "learning_rate": 3.262626762369525e-07, "logits/chosen": -2.5506832599639893, "logits/rejected": -2.4736270904541016, "logps/chosen": -330.5300598144531, "logps/rejected": -350.5076599121094, "loss": 0.5339, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.7755425572395325, "rewards/margins": 0.6821144819259644, "rewards/rejected": -1.4576570987701416, "step": 1065 }, { "epoch": 0.856, "grad_norm": 9.793760299682617, "learning_rate": 3.092332998903416e-07, "logits/chosen": -2.554365634918213, "logits/rejected": -2.5592918395996094, "logps/chosen": -383.59014892578125, "logps/rejected": -433.6626892089844, "loss": 0.5678, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.708263635635376, "rewards/margins": 0.541972279548645, "rewards/rejected": -1.250235915184021, "step": 1070 }, { "epoch": 0.86, "grad_norm": 8.709606170654297, "learning_rate": 2.9263101785268253e-07, "logits/chosen": -2.5445055961608887, "logits/rejected": -2.5182182788848877, "logps/chosen": -370.8434753417969, "logps/rejected": -384.3281555175781, "loss": 0.6357, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.878252387046814, "rewards/margins": 0.4590927064418793, "rewards/rejected": -1.3373451232910156, "step": 1075 }, { "epoch": 0.864, "grad_norm": 7.788934230804443, "learning_rate": 2.764590667717562e-07, "logits/chosen": -2.5197861194610596, "logits/rejected": -2.498582363128662, "logps/chosen": -348.8467712402344, "logps/rejected": -429.28936767578125, "loss": 0.4726, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.7739642858505249, "rewards/margins": 0.8543514013290405, "rewards/rejected": -1.6283156871795654, "step": 1080 }, { "epoch": 0.868, "grad_norm": 9.934327125549316, "learning_rate": 2.6072059940146775e-07, "logits/chosen": -2.4858384132385254, "logits/rejected": -2.4607391357421875, "logps/chosen": -357.95025634765625, "logps/rejected": -370.97479248046875, "loss": 0.639, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.9774085283279419, "rewards/margins": 0.36331382393836975, "rewards/rejected": -1.3407223224639893, "step": 1085 }, { "epoch": 0.872, "grad_norm": 12.652565956115723, "learning_rate": 2.454186839872158e-07, "logits/chosen": -2.4667727947235107, "logits/rejected": -2.428893566131592, "logps/chosen": -368.6217346191406, "logps/rejected": -427.3558654785156, "loss": 0.5759, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8502403497695923, "rewards/margins": 0.5524962544441223, "rewards/rejected": -1.4027366638183594, "step": 1090 }, { "epoch": 0.876, "grad_norm": 7.642593860626221, "learning_rate": 2.3055630366772857e-07, "logits/chosen": -2.5572714805603027, "logits/rejected": -2.5431621074676514, "logps/chosen": -356.75775146484375, "logps/rejected": -395.92498779296875, "loss": 0.5148, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.7548877000808716, "rewards/margins": 0.7364410161972046, "rewards/rejected": -1.4913287162780762, "step": 1095 }, { "epoch": 0.88, "grad_norm": 10.851374626159668, "learning_rate": 2.1613635589349756e-07, "logits/chosen": -2.549379825592041, "logits/rejected": -2.547346830368042, "logps/chosen": -347.64202880859375, "logps/rejected": -392.60089111328125, "loss": 0.5144, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.831588625907898, "rewards/margins": 0.7135321497917175, "rewards/rejected": -1.5451208353042603, "step": 1100 }, { "epoch": 0.88, "eval_logits/chosen": -2.55863094329834, "eval_logits/rejected": -2.516242742538452, "eval_logps/chosen": -357.33807373046875, "eval_logps/rejected": -388.2352600097656, "eval_loss": 0.5326837301254272, "eval_rewards/accuracies": 0.7123016119003296, "eval_rewards/chosen": -0.7409996390342712, "eval_rewards/margins": 0.6610568761825562, "eval_rewards/rejected": -1.4020566940307617, "eval_runtime": 166.2233, "eval_samples_per_second": 3.008, "eval_steps_per_second": 0.379, "step": 1100 }, { "epoch": 0.884, "grad_norm": 11.941755294799805, "learning_rate": 2.0216165186191406e-07, "logits/chosen": -2.5250916481018066, "logits/rejected": -2.5078232288360596, "logps/chosen": -360.7425842285156, "logps/rejected": -419.687744140625, "loss": 0.5508, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.7622045874595642, "rewards/margins": 0.6720725297927856, "rewards/rejected": -1.434277057647705, "step": 1105 }, { "epoch": 0.888, "grad_norm": 11.753776550292969, "learning_rate": 1.8863491596921745e-07, "logits/chosen": -2.5271763801574707, "logits/rejected": -2.495025396347046, "logps/chosen": -394.4120788574219, "logps/rejected": -420.91552734375, "loss": 0.6139, "rewards/accuracies": 0.6875, "rewards/chosen": -0.926856517791748, "rewards/margins": 0.48281335830688477, "rewards/rejected": -1.4096698760986328, "step": 1110 }, { "epoch": 0.892, "grad_norm": 12.333569526672363, "learning_rate": 1.7555878527937164e-07, "logits/chosen": -2.6087048053741455, "logits/rejected": -2.5676796436309814, "logps/chosen": -378.8360595703125, "logps/rejected": -399.5335693359375, "loss": 0.4934, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.8667081594467163, "rewards/margins": 0.7776769995689392, "rewards/rejected": -1.6443853378295898, "step": 1115 }, { "epoch": 0.896, "grad_norm": 9.7278413772583, "learning_rate": 1.629358090099639e-07, "logits/chosen": -2.495575428009033, "logits/rejected": -2.489112615585327, "logps/chosen": -391.45159912109375, "logps/rejected": -426.06085205078125, "loss": 0.5054, "rewards/accuracies": 0.75, "rewards/chosen": -0.9123435020446777, "rewards/margins": 0.6857225298881531, "rewards/rejected": -1.5980660915374756, "step": 1120 }, { "epoch": 0.9, "grad_norm": 9.703481674194336, "learning_rate": 1.507684480352292e-07, "logits/chosen": -2.5202584266662598, "logits/rejected": -2.527817964553833, "logps/chosen": -364.148681640625, "logps/rejected": -412.2860412597656, "loss": 0.529, "rewards/accuracies": 0.75, "rewards/chosen": -0.9178013801574707, "rewards/margins": 0.6695634126663208, "rewards/rejected": -1.5873647928237915, "step": 1125 }, { "epoch": 0.904, "grad_norm": 6.87813138961792, "learning_rate": 1.3905907440629752e-07, "logits/chosen": -2.5462465286254883, "logits/rejected": -2.529540777206421, "logps/chosen": -367.54986572265625, "logps/rejected": -395.8585510253906, "loss": 0.5463, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.9065104722976685, "rewards/margins": 0.6257832050323486, "rewards/rejected": -1.532293677330017, "step": 1130 }, { "epoch": 0.908, "grad_norm": 9.895462989807129, "learning_rate": 1.278099708887587e-07, "logits/chosen": -2.552335262298584, "logits/rejected": -2.5324137210845947, "logps/chosen": -345.7575988769531, "logps/rejected": -455.0641174316406, "loss": 0.5316, "rewards/accuracies": 0.75, "rewards/chosen": -0.7876826524734497, "rewards/margins": 0.7177135348320007, "rewards/rejected": -1.5053961277008057, "step": 1135 }, { "epoch": 0.912, "grad_norm": 8.755758285522461, "learning_rate": 1.1702333051763271e-07, "logits/chosen": -2.5616421699523926, "logits/rejected": -2.554831027984619, "logps/chosen": -397.1969909667969, "logps/rejected": -403.56439208984375, "loss": 0.5163, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.8538335561752319, "rewards/margins": 0.7461098432540894, "rewards/rejected": -1.5999435186386108, "step": 1140 }, { "epoch": 0.916, "grad_norm": 12.037848472595215, "learning_rate": 1.067012561698319e-07, "logits/chosen": -2.5323455333709717, "logits/rejected": -2.519660472869873, "logps/chosen": -379.17340087890625, "logps/rejected": -407.46014404296875, "loss": 0.6399, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.895904541015625, "rewards/margins": 0.44356465339660645, "rewards/rejected": -1.3394691944122314, "step": 1145 }, { "epoch": 0.92, "grad_norm": 14.188241958618164, "learning_rate": 9.684576015420277e-08, "logits/chosen": -2.4839751720428467, "logits/rejected": -2.4552297592163086, "logps/chosen": -331.6858215332031, "logps/rejected": -358.20819091796875, "loss": 0.5293, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.7821733355522156, "rewards/margins": 0.6462420225143433, "rewards/rejected": -1.4284155368804932, "step": 1150 }, { "epoch": 0.924, "grad_norm": 21.802221298217773, "learning_rate": 8.745876381922147e-08, "logits/chosen": -2.485172748565674, "logits/rejected": -2.5178401470184326, "logps/chosen": -343.31103515625, "logps/rejected": -370.01336669921875, "loss": 0.577, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8277499079704285, "rewards/margins": 0.6033510565757751, "rewards/rejected": -1.4311010837554932, "step": 1155 }, { "epoch": 0.928, "grad_norm": 11.248420715332031, "learning_rate": 7.854209717842231e-08, "logits/chosen": -2.5530881881713867, "logits/rejected": -2.5298221111297607, "logps/chosen": -387.3213806152344, "logps/rejected": -385.5107421875, "loss": 0.6488, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.9766770601272583, "rewards/margins": 0.35460105538368225, "rewards/rejected": -1.3312779664993286, "step": 1160 }, { "epoch": 0.932, "grad_norm": 6.085402011871338, "learning_rate": 7.009749855363457e-08, "logits/chosen": -2.5276684761047363, "logits/rejected": -2.508495330810547, "logps/chosen": -339.74969482421875, "logps/rejected": -404.6656799316406, "loss": 0.519, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.6988147497177124, "rewards/margins": 0.6438443660736084, "rewards/rejected": -1.3426591157913208, "step": 1165 }, { "epoch": 0.936, "grad_norm": 15.023430824279785, "learning_rate": 6.212661423609184e-08, "logits/chosen": -2.5954625606536865, "logits/rejected": -2.5354666709899902, "logps/chosen": -389.9742736816406, "logps/rejected": -427.60284423828125, "loss": 0.5631, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.9025875926017761, "rewards/margins": 0.6419355869293213, "rewards/rejected": -1.5445232391357422, "step": 1170 }, { "epoch": 0.94, "grad_norm": 12.646740913391113, "learning_rate": 5.463099816548578e-08, "logits/chosen": -2.5129947662353516, "logits/rejected": -2.5076282024383545, "logps/chosen": -355.4842224121094, "logps/rejected": -443.46014404296875, "loss": 0.4861, "rewards/accuracies": 0.75, "rewards/chosen": -0.8773505091667175, "rewards/margins": 0.769055187702179, "rewards/rejected": -1.646405816078186, "step": 1175 }, { "epoch": 0.944, "grad_norm": 8.745574951171875, "learning_rate": 4.761211162702117e-08, "logits/chosen": -2.5645899772644043, "logits/rejected": -2.502182722091675, "logps/chosen": -396.885498046875, "logps/rejected": -444.1766662597656, "loss": 0.5327, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.760084331035614, "rewards/margins": 0.5914161801338196, "rewards/rejected": -1.3515005111694336, "step": 1180 }, { "epoch": 0.948, "grad_norm": 10.453509330749512, "learning_rate": 4.1071322966535487e-08, "logits/chosen": -2.577366590499878, "logits/rejected": -2.5066463947296143, "logps/chosen": -418.02801513671875, "logps/rejected": -403.1604309082031, "loss": 0.4854, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.7425155639648438, "rewards/margins": 0.8614899516105652, "rewards/rejected": -1.6040055751800537, "step": 1185 }, { "epoch": 0.952, "grad_norm": 6.866016864776611, "learning_rate": 3.5009907323737826e-08, "logits/chosen": -2.504338026046753, "logits/rejected": -2.57658052444458, "logps/chosen": -371.8552551269531, "logps/rejected": -480.74066162109375, "loss": 0.4368, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6980635523796082, "rewards/margins": 0.9740827679634094, "rewards/rejected": -1.6721464395523071, "step": 1190 }, { "epoch": 0.956, "grad_norm": 8.07772159576416, "learning_rate": 2.9429046383618042e-08, "logits/chosen": -2.459728717803955, "logits/rejected": -2.4553236961364746, "logps/chosen": -368.6015930175781, "logps/rejected": -395.6241149902344, "loss": 0.4823, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.7073078751564026, "rewards/margins": 0.7206257581710815, "rewards/rejected": -1.4279335737228394, "step": 1195 }, { "epoch": 0.96, "grad_norm": 13.021551132202148, "learning_rate": 2.4329828146074096e-08, "logits/chosen": -2.524336099624634, "logits/rejected": -2.4975974559783936, "logps/chosen": -377.58343505859375, "logps/rejected": -374.9549255371094, "loss": 0.5196, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.8911786079406738, "rewards/margins": 0.713758647441864, "rewards/rejected": -1.6049373149871826, "step": 1200 }, { "epoch": 0.96, "eval_logits/chosen": -2.5477142333984375, "eval_logits/rejected": -2.504517078399658, "eval_logps/chosen": -361.9387512207031, "eval_logps/rejected": -394.47796630859375, "eval_loss": 0.5300799608230591, "eval_rewards/accuracies": 0.7202380895614624, "eval_rewards/chosen": -0.7870069146156311, "eval_rewards/margins": 0.6774766445159912, "eval_rewards/rejected": -1.4644837379455566, "eval_runtime": 166.2408, "eval_samples_per_second": 3.008, "eval_steps_per_second": 0.379, "step": 1200 }, { "epoch": 0.964, "grad_norm": 8.705704689025879, "learning_rate": 1.9713246713805588e-08, "logits/chosen": -2.4079999923706055, "logits/rejected": -2.3863213062286377, "logps/chosen": -336.49639892578125, "logps/rejected": -405.0527648925781, "loss": 0.4696, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.6577492952346802, "rewards/margins": 0.8586214780807495, "rewards/rejected": -1.5163707733154297, "step": 1205 }, { "epoch": 0.968, "grad_norm": 9.633703231811523, "learning_rate": 1.5580202098509078e-08, "logits/chosen": -2.488119602203369, "logits/rejected": -2.446547746658325, "logps/chosen": -409.77557373046875, "logps/rejected": -457.3531188964844, "loss": 0.5975, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.9190298318862915, "rewards/margins": 0.5190376043319702, "rewards/rejected": -1.4380674362182617, "step": 1210 }, { "epoch": 0.972, "grad_norm": 9.208328247070312, "learning_rate": 1.193150004542204e-08, "logits/chosen": -2.523573160171509, "logits/rejected": -2.5186927318573, "logps/chosen": -355.54656982421875, "logps/rejected": -407.33172607421875, "loss": 0.5734, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.683289647102356, "rewards/margins": 0.6039477586746216, "rewards/rejected": -1.2872374057769775, "step": 1215 }, { "epoch": 0.976, "grad_norm": 7.021068096160889, "learning_rate": 8.767851876239075e-09, "logits/chosen": -2.505402088165283, "logits/rejected": -2.454876661300659, "logps/chosen": -327.73358154296875, "logps/rejected": -372.61370849609375, "loss": 0.5824, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8027257919311523, "rewards/margins": 0.5884792804718018, "rewards/rejected": -1.391205072402954, "step": 1220 }, { "epoch": 0.98, "grad_norm": 8.4197416305542, "learning_rate": 6.089874350439507e-09, "logits/chosen": -2.5013089179992676, "logits/rejected": -2.485605239868164, "logps/chosen": -435.61669921875, "logps/rejected": -448.99688720703125, "loss": 0.5037, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.8056790232658386, "rewards/margins": 0.687267005443573, "rewards/rejected": -1.492945909500122, "step": 1225 }, { "epoch": 0.984, "grad_norm": 9.84626293182373, "learning_rate": 3.8980895450474455e-09, "logits/chosen": -2.469447612762451, "logits/rejected": -2.4653396606445312, "logps/chosen": -375.6591796875, "logps/rejected": -485.65179443359375, "loss": 0.4352, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.7044429779052734, "rewards/margins": 0.9541055560112, "rewards/rejected": -1.658548355102539, "step": 1230 }, { "epoch": 0.988, "grad_norm": 10.856142044067383, "learning_rate": 2.192924752854042e-09, "logits/chosen": -2.5709242820739746, "logits/rejected": -2.552412986755371, "logps/chosen": -359.99749755859375, "logps/rejected": -404.10693359375, "loss": 0.5811, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.8072735071182251, "rewards/margins": 0.5389326810836792, "rewards/rejected": -1.3462061882019043, "step": 1235 }, { "epoch": 0.992, "grad_norm": 8.36683464050293, "learning_rate": 9.747123991141193e-10, "logits/chosen": -2.4341177940368652, "logits/rejected": -2.4185235500335693, "logps/chosen": -372.7251892089844, "logps/rejected": -395.2005310058594, "loss": 0.5735, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.8750492334365845, "rewards/margins": 0.6034801006317139, "rewards/rejected": -1.4785292148590088, "step": 1240 }, { "epoch": 0.996, "grad_norm": 9.960768699645996, "learning_rate": 2.43689976739403e-10, "logits/chosen": -2.397348642349243, "logits/rejected": -2.444608688354492, "logps/chosen": -407.68475341796875, "logps/rejected": -409.0362243652344, "loss": 0.5478, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.8263516426086426, "rewards/margins": 0.5675632357597351, "rewards/rejected": -1.3939149379730225, "step": 1245 }, { "epoch": 1.0, "grad_norm": 14.954544067382812, "learning_rate": 0.0, "logits/chosen": -2.471954822540283, "logits/rejected": -2.448702335357666, "logps/chosen": -397.40447998046875, "logps/rejected": -444.6131896972656, "loss": 0.5219, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.9162886738777161, "rewards/margins": 0.6220329999923706, "rewards/rejected": -1.5383217334747314, "step": 1250 }, { "epoch": 1.0, "step": 1250, "total_flos": 0.0, "train_loss": 0.5873338260650635, "train_runtime": 15803.1996, "train_samples_per_second": 1.266, "train_steps_per_second": 0.079 } ], "logging_steps": 5, "max_steps": 1250, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }