{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.99830220713073, "eval_steps": 100, "global_step": 441, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 1.111111111111111e-08, "logits/chosen": -2.669281482696533, "logits/rejected": -2.675659418106079, "logps/chosen": -301.2757873535156, "logps/rejected": -280.8008728027344, "loss": 0.2803, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.02, "learning_rate": 1.111111111111111e-07, "logits/chosen": -2.8107264041900635, "logits/rejected": -2.7811262607574463, "logps/chosen": -320.0479736328125, "logps/rejected": -195.18087768554688, "loss": 0.2792, "rewards/accuracies": 0.4826388955116272, "rewards/chosen": 0.0006166233215481043, "rewards/margins": 0.0009485264890827239, "rewards/rejected": -0.00033190299291163683, "step": 10 }, { "epoch": 0.05, "learning_rate": 2.222222222222222e-07, "logits/chosen": -2.8013899326324463, "logits/rejected": -2.7626068592071533, "logps/chosen": -350.6124572753906, "logps/rejected": -191.4945831298828, "loss": 0.2774, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.0069363838993012905, "rewards/margins": 0.013156639412045479, "rewards/rejected": -0.006220255978405476, "step": 20 }, { "epoch": 0.07, "learning_rate": 3.333333333333333e-07, "logits/chosen": -2.7069194316864014, "logits/rejected": -2.691702365875244, "logps/chosen": -316.24737548828125, "logps/rejected": -201.17063903808594, "loss": 0.2782, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": 0.0192975215613842, "rewards/margins": 0.07039008289575577, "rewards/rejected": -0.05109255388379097, "step": 30 }, { "epoch": 0.09, "learning_rate": 4.444444444444444e-07, "logits/chosen": -2.606764554977417, "logits/rejected": -2.5910491943359375, "logps/chosen": -376.6419982910156, "logps/rejected": -222.21923828125, "loss": 0.2589, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.015041938051581383, "rewards/margins": 0.2519453465938568, "rewards/rejected": -0.23690339922904968, "step": 40 }, { "epoch": 0.11, "learning_rate": 4.998033461515242e-07, "logits/chosen": -2.5254263877868652, "logits/rejected": -2.522778034210205, "logps/chosen": -350.6036682128906, "logps/rejected": -230.76931762695312, "loss": 0.2151, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.16026124358177185, "rewards/margins": 0.36047226190567017, "rewards/rejected": -0.5207335352897644, "step": 50 }, { "epoch": 0.14, "learning_rate": 4.982319711683221e-07, "logits/chosen": -2.509260654449463, "logits/rejected": -2.4819066524505615, "logps/chosen": -343.71575927734375, "logps/rejected": -281.47723388671875, "loss": 0.1652, "rewards/accuracies": 0.703125, "rewards/chosen": -0.3321402072906494, "rewards/margins": 0.506214439868927, "rewards/rejected": -0.8383547067642212, "step": 60 }, { "epoch": 0.16, "learning_rate": 4.950991058546892e-07, "logits/chosen": -2.4952120780944824, "logits/rejected": -2.4609100818634033, "logps/chosen": -381.2798156738281, "logps/rejected": -299.72149658203125, "loss": 0.1193, "rewards/accuracies": 0.7093750238418579, "rewards/chosen": -0.518448531627655, "rewards/margins": 0.5417992472648621, "rewards/rejected": -1.0602478981018066, "step": 70 }, { "epoch": 0.18, "learning_rate": 4.904244573372733e-07, "logits/chosen": -2.4261202812194824, "logits/rejected": -2.4001305103302, "logps/chosen": -423.95233154296875, "logps/rejected": -343.5025939941406, "loss": 0.0926, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.8466840982437134, "rewards/margins": 0.7109834551811218, "rewards/rejected": -1.55766761302948, "step": 80 }, { "epoch": 0.2, "learning_rate": 4.842374312499405e-07, "logits/chosen": -2.4205052852630615, "logits/rejected": -2.3753650188446045, "logps/chosen": -425.88934326171875, "logps/rejected": -375.40478515625, "loss": 0.0715, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.0327210426330566, "rewards/margins": 0.8036619424819946, "rewards/rejected": -1.8363832235336304, "step": 90 }, { "epoch": 0.23, "learning_rate": 4.7657694675916247e-07, "logits/chosen": -2.3601014614105225, "logits/rejected": -2.3345751762390137, "logps/chosen": -415.061279296875, "logps/rejected": -360.3291931152344, "loss": 0.0846, "rewards/accuracies": 0.721875011920929, "rewards/chosen": -0.9150521159172058, "rewards/margins": 0.7828376889228821, "rewards/rejected": -1.6978899240493774, "step": 100 }, { "epoch": 0.23, "eval_logits/chosen": -2.3521652221679688, "eval_logits/rejected": -2.330230236053467, "eval_logps/chosen": -403.4620361328125, "eval_logps/rejected": -446.7535400390625, "eval_loss": 0.08460698276758194, "eval_rewards/accuracies": 0.6484375, "eval_rewards/chosen": -1.4642237424850464, "eval_rewards/margins": 0.42977917194366455, "eval_rewards/rejected": -1.894002914428711, "eval_runtime": 53.5062, "eval_samples_per_second": 37.379, "eval_steps_per_second": 0.598, "step": 100 }, { "epoch": 0.25, "learning_rate": 4.6749119174501973e-07, "logits/chosen": -2.338693857192993, "logits/rejected": -2.274574041366577, "logps/chosen": -452.51953125, "logps/rejected": -401.04058837890625, "loss": 0.0679, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.2087438106536865, "rewards/margins": 0.9248006939888, "rewards/rejected": -2.133544445037842, "step": 110 }, { "epoch": 0.27, "learning_rate": 4.5703731967784265e-07, "logits/chosen": -2.2874975204467773, "logits/rejected": -2.2298505306243896, "logps/chosen": -436.90283203125, "logps/rejected": -372.56170654296875, "loss": 0.0738, "rewards/accuracies": 0.7718750238418579, "rewards/chosen": -0.8776865005493164, "rewards/margins": 0.9758397340774536, "rewards/rejected": -1.8535263538360596, "step": 120 }, { "epoch": 0.29, "learning_rate": 4.4528109009727333e-07, "logits/chosen": -2.297295093536377, "logits/rejected": -2.2529804706573486, "logps/chosen": -418.8360290527344, "logps/rejected": -369.44451904296875, "loss": 0.084, "rewards/accuracies": 0.753125011920929, "rewards/chosen": -0.8223110437393188, "rewards/margins": 0.993333637714386, "rewards/rejected": -1.8156448602676392, "step": 130 }, { "epoch": 0.32, "learning_rate": 4.3229645495529427e-07, "logits/chosen": -2.30572509765625, "logits/rejected": -2.214725971221924, "logps/chosen": -461.4957580566406, "logps/rejected": -416.990234375, "loss": 0.066, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.087278962135315, "rewards/margins": 1.041991949081421, "rewards/rejected": -2.1292710304260254, "step": 140 }, { "epoch": 0.34, "learning_rate": 4.1816509342531317e-07, "logits/chosen": -2.2726972103118896, "logits/rejected": -2.196261405944824, "logps/chosen": -419.60479736328125, "logps/rejected": -361.95831298828125, "loss": 0.0853, "rewards/accuracies": 0.78125, "rewards/chosen": -0.6499409675598145, "rewards/margins": 1.046847939491272, "rewards/rejected": -1.696789026260376, "step": 150 }, { "epoch": 0.36, "learning_rate": 4.0297589810356166e-07, "logits/chosen": -2.188967227935791, "logits/rejected": -2.119161605834961, "logps/chosen": -450.3418884277344, "logps/rejected": -422.260986328125, "loss": 0.0621, "rewards/accuracies": 0.734375, "rewards/chosen": -1.2301867008209229, "rewards/margins": 1.049591064453125, "rewards/rejected": -2.279777765274048, "step": 160 }, { "epoch": 0.38, "learning_rate": 3.868244158348331e-07, "logits/chosen": -2.135490894317627, "logits/rejected": -2.0569894313812256, "logps/chosen": -508.18511962890625, "logps/rejected": -465.81689453125, "loss": 0.0461, "rewards/accuracies": 0.746874988079071, "rewards/chosen": -1.621382474899292, "rewards/margins": 1.0623977184295654, "rewards/rejected": -2.6837801933288574, "step": 170 }, { "epoch": 0.41, "learning_rate": 3.698122466800142e-07, "logits/chosen": -2.1748709678649902, "logits/rejected": -2.0735726356506348, "logps/chosen": -480.3050842285156, "logps/rejected": -428.5189514160156, "loss": 0.0508, "rewards/accuracies": 0.7093750238418579, "rewards/chosen": -1.518842101097107, "rewards/margins": 0.8754765391349792, "rewards/rejected": -2.3943190574645996, "step": 180 }, { "epoch": 0.43, "learning_rate": 3.5204640480617574e-07, "logits/chosen": -2.104471206665039, "logits/rejected": -2.028428077697754, "logps/chosen": -493.2584533691406, "logps/rejected": -451.6133728027344, "loss": 0.0534, "rewards/accuracies": 0.6968749761581421, "rewards/chosen": -1.6701205968856812, "rewards/margins": 0.9617946743965149, "rewards/rejected": -2.63191556930542, "step": 190 }, { "epoch": 0.45, "learning_rate": 3.336386453195088e-07, "logits/chosen": -2.155747890472412, "logits/rejected": -2.0854601860046387, "logps/chosen": -486.59954833984375, "logps/rejected": -467.2601623535156, "loss": 0.0477, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.5419719219207764, "rewards/margins": 1.163585901260376, "rewards/rejected": -2.7055578231811523, "step": 200 }, { "epoch": 0.45, "eval_logits/chosen": -2.1616616249084473, "eval_logits/rejected": -2.128385305404663, "eval_logps/chosen": -436.6204833984375, "eval_logps/rejected": -497.52166748046875, "eval_loss": 0.06721889227628708, "eval_rewards/accuracies": 0.71484375, "eval_rewards/chosen": -1.7958087921142578, "eval_rewards/margins": 0.6058750152587891, "eval_rewards/rejected": -2.401683807373047, "eval_runtime": 53.3708, "eval_samples_per_second": 37.474, "eval_steps_per_second": 0.6, "step": 200 }, { "epoch": 0.48, "learning_rate": 3.147047612756302e-07, "logits/chosen": -2.1386637687683105, "logits/rejected": -2.041881561279297, "logps/chosen": -493.37530517578125, "logps/rejected": -443.93377685546875, "loss": 0.0514, "rewards/accuracies": 0.765625, "rewards/chosen": -1.43257737159729, "rewards/margins": 1.1513398885726929, "rewards/rejected": -2.5839171409606934, "step": 210 }, { "epoch": 0.5, "learning_rate": 2.9536385528937565e-07, "logits/chosen": -2.1365461349487305, "logits/rejected": -2.0607683658599854, "logps/chosen": -505.8017578125, "logps/rejected": -459.7229919433594, "loss": 0.0508, "rewards/accuracies": 0.7718750238418579, "rewards/chosen": -1.424095869064331, "rewards/margins": 1.3351457118988037, "rewards/rejected": -2.7592415809631348, "step": 220 }, { "epoch": 0.52, "learning_rate": 2.7573759032598365e-07, "logits/chosen": -2.12797474861145, "logits/rejected": -2.06542706489563, "logps/chosen": -517.1934814453125, "logps/rejected": -477.3455505371094, "loss": 0.049, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.5975525379180908, "rewards/margins": 1.0971721410751343, "rewards/rejected": -2.6947245597839355, "step": 230 }, { "epoch": 0.54, "learning_rate": 2.5594942438652685e-07, "logits/chosen": -2.052783250808716, "logits/rejected": -1.9777238368988037, "logps/chosen": -483.03057861328125, "logps/rejected": -474.64654541015625, "loss": 0.0412, "rewards/accuracies": 0.7406250238418579, "rewards/chosen": -1.6363245248794556, "rewards/margins": 1.1237612962722778, "rewards/rejected": -2.7600855827331543, "step": 240 }, { "epoch": 0.57, "learning_rate": 2.36123833901765e-07, "logits/chosen": -2.0231640338897705, "logits/rejected": -1.9347641468048096, "logps/chosen": -503.3892517089844, "logps/rejected": -482.2828674316406, "loss": 0.0391, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.7641624212265015, "rewards/margins": 1.1827863454818726, "rewards/rejected": -2.946949005126953, "step": 250 }, { "epoch": 0.59, "learning_rate": 2.1638553071961704e-07, "logits/chosen": -1.9997365474700928, "logits/rejected": -1.919660210609436, "logps/chosen": -549.3283081054688, "logps/rejected": -506.68707275390625, "loss": 0.0364, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.842703104019165, "rewards/margins": 1.4222638607025146, "rewards/rejected": -3.2649669647216797, "step": 260 }, { "epoch": 0.61, "learning_rate": 1.968586776117558e-07, "logits/chosen": -2.023411273956299, "logits/rejected": -1.9109961986541748, "logps/chosen": -555.9891357421875, "logps/rejected": -512.0736083984375, "loss": 0.0427, "rewards/accuracies": 0.765625, "rewards/chosen": -1.7782520055770874, "rewards/margins": 1.3554075956344604, "rewards/rejected": -3.133659839630127, "step": 270 }, { "epoch": 0.63, "learning_rate": 1.7766610723413684e-07, "logits/chosen": -2.0062713623046875, "logits/rejected": -1.9274126291275024, "logps/chosen": -505.69110107421875, "logps/rejected": -496.41864013671875, "loss": 0.0421, "rewards/accuracies": 0.7593749761581421, "rewards/chosen": -1.7307058572769165, "rewards/margins": 1.3143250942230225, "rewards/rejected": -3.0450305938720703, "step": 280 }, { "epoch": 0.66, "learning_rate": 1.589285494545514e-07, "logits/chosen": -2.0006046295166016, "logits/rejected": -1.9192641973495483, "logps/chosen": -496.51055908203125, "logps/rejected": -483.0226135253906, "loss": 0.0435, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.7974494695663452, "rewards/margins": 1.1823097467422485, "rewards/rejected": -2.979759454727173, "step": 290 }, { "epoch": 0.68, "learning_rate": 1.4076387190766014e-07, "logits/chosen": -1.9719831943511963, "logits/rejected": -1.9076999425888062, "logps/chosen": -481.85968017578125, "logps/rejected": -481.77691650390625, "loss": 0.046, "rewards/accuracies": 0.753125011920929, "rewards/chosen": -1.7264801263809204, "rewards/margins": 1.1051851511001587, "rewards/rejected": -2.831665277481079, "step": 300 }, { "epoch": 0.68, "eval_logits/chosen": -1.991421103477478, "eval_logits/rejected": -1.9484151601791382, "eval_logps/chosen": -471.8780517578125, "eval_logps/rejected": -544.56982421875, "eval_loss": 0.055175185203552246, "eval_rewards/accuracies": 0.71484375, "eval_rewards/chosen": -2.1483840942382812, "eval_rewards/margins": 0.7237809896469116, "eval_rewards/rejected": -2.8721652030944824, "eval_runtime": 53.342, "eval_samples_per_second": 37.494, "eval_steps_per_second": 0.6, "step": 300 }, { "epoch": 0.7, "learning_rate": 1.232863385547543e-07, "logits/chosen": -1.9471126794815063, "logits/rejected": -1.8728282451629639, "logps/chosen": -499.8617248535156, "logps/rejected": -500.9742126464844, "loss": 0.0415, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.7383416891098022, "rewards/margins": 1.4027913808822632, "rewards/rejected": -3.1411330699920654, "step": 310 }, { "epoch": 0.72, "learning_rate": 1.0660589091223854e-07, "logits/chosen": -1.9511429071426392, "logits/rejected": -1.8690084218978882, "logps/chosen": -515.9614868164062, "logps/rejected": -489.18853759765625, "loss": 0.0424, "rewards/accuracies": 0.765625, "rewards/chosen": -1.7780258655548096, "rewards/margins": 1.2824206352233887, "rewards/rejected": -3.060446262359619, "step": 320 }, { "epoch": 0.75, "learning_rate": 9.082745647022797e-08, "logits/chosen": -1.9991905689239502, "logits/rejected": -1.9214942455291748, "logps/chosen": -518.4646606445312, "logps/rejected": -501.2618713378906, "loss": 0.0439, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.7493177652359009, "rewards/margins": 1.3284794092178345, "rewards/rejected": -3.0777969360351562, "step": 330 }, { "epoch": 0.77, "learning_rate": 7.605028865161809e-08, "logits/chosen": -1.994573950767517, "logits/rejected": -1.9131485223770142, "logps/chosen": -523.8150024414062, "logps/rejected": -497.4427795410156, "loss": 0.0414, "rewards/accuracies": 0.778124988079071, "rewards/chosen": -1.7702951431274414, "rewards/margins": 1.2257777452468872, "rewards/rejected": -2.996073007583618, "step": 340 }, { "epoch": 0.79, "learning_rate": 6.236734246357947e-08, "logits/chosen": -1.9538482427597046, "logits/rejected": -1.849259614944458, "logps/chosen": -493.80450439453125, "logps/rejected": -493.0531311035156, "loss": 0.0447, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.6410820484161377, "rewards/margins": 1.3493789434432983, "rewards/rejected": -2.9904608726501465, "step": 350 }, { "epoch": 0.81, "learning_rate": 4.986468976890992e-08, "logits/chosen": -1.9808590412139893, "logits/rejected": -1.8965733051300049, "logps/chosen": -507.7171936035156, "logps/rejected": -484.1426696777344, "loss": 0.0423, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.6426427364349365, "rewards/margins": 1.3561906814575195, "rewards/rejected": -2.998833417892456, "step": 360 }, { "epoch": 0.84, "learning_rate": 3.8620977855448936e-08, "logits/chosen": -1.9780423641204834, "logits/rejected": -1.884387731552124, "logps/chosen": -519.645751953125, "logps/rejected": -487.0723571777344, "loss": 0.0454, "rewards/accuracies": 0.71875, "rewards/chosen": -1.856579065322876, "rewards/margins": 1.1135355234146118, "rewards/rejected": -2.9701147079467773, "step": 370 }, { "epoch": 0.86, "learning_rate": 2.8706934709395893e-08, "logits/chosen": -1.99289870262146, "logits/rejected": -1.9017293453216553, "logps/chosen": -520.5479736328125, "logps/rejected": -500.9642028808594, "loss": 0.0424, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.6913812160491943, "rewards/margins": 1.3811187744140625, "rewards/rejected": -3.072500228881836, "step": 380 }, { "epoch": 0.88, "learning_rate": 2.0184924104583612e-08, "logits/chosen": -1.988921880722046, "logits/rejected": -1.891033411026001, "logps/chosen": -521.970458984375, "logps/rejected": -489.20501708984375, "loss": 0.0425, "rewards/accuracies": 0.796875, "rewards/chosen": -1.6584064960479736, "rewards/margins": 1.3673207759857178, "rewards/rejected": -3.0257275104522705, "step": 390 }, { "epoch": 0.91, "learning_rate": 1.3108553306396263e-08, "logits/chosen": -2.0064077377319336, "logits/rejected": -1.9138851165771484, "logps/chosen": -516.9049072265625, "logps/rejected": -482.49859619140625, "loss": 0.0439, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.8265987634658813, "rewards/margins": 1.1020760536193848, "rewards/rejected": -2.9286749362945557, "step": 400 }, { "epoch": 0.91, "eval_logits/chosen": -1.9848593473434448, "eval_logits/rejected": -1.94070303440094, "eval_logps/chosen": -476.7722473144531, "eval_logps/rejected": -549.7876586914062, "eval_loss": 0.054434459656476974, "eval_rewards/accuracies": 0.70703125, "eval_rewards/chosen": -2.1973259449005127, "eval_rewards/margins": 0.7270177602767944, "eval_rewards/rejected": -2.9243435859680176, "eval_runtime": 53.3853, "eval_samples_per_second": 37.463, "eval_steps_per_second": 0.599, "step": 400 }, { "epoch": 0.93, "learning_rate": 7.522335858048705e-09, "logits/chosen": -1.963323950767517, "logits/rejected": -1.8955034017562866, "logps/chosen": -518.6998901367188, "logps/rejected": -522.4810791015625, "loss": 0.0451, "rewards/accuracies": 0.784375011920929, "rewards/chosen": -1.8480857610702515, "rewards/margins": 1.2641350030899048, "rewards/rejected": -3.112220287322998, "step": 410 }, { "epoch": 0.95, "learning_rate": 3.4614115704533766e-09, "logits/chosen": -1.9797197580337524, "logits/rejected": -1.9050052165985107, "logps/chosen": -495.5513610839844, "logps/rejected": -489.12384033203125, "loss": 0.0427, "rewards/accuracies": 0.7718750238418579, "rewards/chosen": -1.9035917520523071, "rewards/margins": 1.1141357421875, "rewards/rejected": -3.0177273750305176, "step": 420 }, { "epoch": 0.97, "learning_rate": 9.513254770636137e-10, "logits/chosen": -1.9577579498291016, "logits/rejected": -1.8832132816314697, "logps/chosen": -509.35406494140625, "logps/rejected": -482.31640625, "loss": 0.0437, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.8531081676483154, "rewards/margins": 1.0920084714889526, "rewards/rejected": -2.9451169967651367, "step": 430 }, { "epoch": 1.0, "learning_rate": 7.867144166728844e-12, "logits/chosen": -1.9990612268447876, "logits/rejected": -1.9326860904693604, "logps/chosen": -519.8091430664062, "logps/rejected": -503.39385986328125, "loss": 0.0409, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.7259622812271118, "rewards/margins": 1.2851760387420654, "rewards/rejected": -3.0111382007598877, "step": 440 }, { "epoch": 1.0, "step": 441, "total_flos": 0.0, "train_loss": 0.07998780592802972, "train_runtime": 7378.9712, "train_samples_per_second": 15.318, "train_steps_per_second": 0.06 } ], "logging_steps": 10, "max_steps": 441, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "trial_name": null, "trial_params": null }