{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 350, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 3067.8628948133914, "learning_rate": 4.9998992904271775e-08, "logits/chosen": -4.185730934143066, "logits/rejected": -4.509836196899414, "logps/chosen": -274.000732421875, "logps/rejected": -205.8054962158203, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03, "grad_norm": 3330.3974170986107, "learning_rate": 4.9899357349880975e-08, "logits/chosen": -4.211880207061768, "logits/rejected": -4.48573637008667, "logps/chosen": -318.31072998046875, "logps/rejected": -257.18267822265625, "loss": 0.7459, "rewards/accuracies": 0.5625, "rewards/chosen": 0.200405091047287, "rewards/margins": 0.10155472159385681, "rewards/rejected": 0.09885036945343018, "step": 10 }, { "epoch": 0.06, "grad_norm": 2932.727170813642, "learning_rate": 4.959823971496574e-08, "logits/chosen": -4.2464704513549805, "logits/rejected": -4.50115966796875, "logps/chosen": -304.53350830078125, "logps/rejected": -244.1282501220703, "loss": 0.6293, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.7030802965164185, "rewards/margins": 0.6052380800247192, "rewards/rejected": 0.09784229844808578, "step": 20 }, { "epoch": 0.09, "grad_norm": 2159.097276891197, "learning_rate": 4.9099071517396326e-08, "logits/chosen": -4.3018364906311035, "logits/rejected": -4.5636820793151855, "logps/chosen": -305.11822509765625, "logps/rejected": -258.89215087890625, "loss": 0.5093, "rewards/accuracies": 0.71875, "rewards/chosen": 1.3964869976043701, "rewards/margins": 0.9537334442138672, "rewards/rejected": 0.44275355339050293, "step": 30 }, { "epoch": 0.11, "grad_norm": 2233.10446662558, "learning_rate": 4.8405871765993426e-08, "logits/chosen": -4.304145812988281, "logits/rejected": -4.571420192718506, "logps/chosen": -293.4151916503906, "logps/rejected": -234.4054412841797, "loss": 0.4371, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 2.119215488433838, "rewards/margins": 1.3193193674087524, "rewards/rejected": 0.7998961806297302, "step": 40 }, { "epoch": 0.14, "grad_norm": 1863.9092640792912, "learning_rate": 4.7524221697560474e-08, "logits/chosen": -4.298985481262207, "logits/rejected": -4.545313835144043, "logps/chosen": -299.71026611328125, "logps/rejected": -252.57339477539062, "loss": 0.4054, "rewards/accuracies": 0.793749988079071, "rewards/chosen": 3.015381336212158, "rewards/margins": 1.8283360004425049, "rewards/rejected": 1.1870452165603638, "step": 50 }, { "epoch": 0.17, "grad_norm": 1861.0742759245438, "learning_rate": 4.646121984004665e-08, "logits/chosen": -4.3018717765808105, "logits/rejected": -4.5299859046936035, "logps/chosen": -308.25457763671875, "logps/rejected": -261.1996154785156, "loss": 0.3815, "rewards/accuracies": 0.793749988079071, "rewards/chosen": 3.097055673599243, "rewards/margins": 1.6846046447753906, "rewards/rejected": 1.412451148033142, "step": 60 }, { "epoch": 0.2, "grad_norm": 2083.1341477087894, "learning_rate": 4.522542485937369e-08, "logits/chosen": -4.417206764221191, "logits/rejected": -4.548245429992676, "logps/chosen": -285.4747009277344, "logps/rejected": -236.24136352539062, "loss": 0.3773, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": 3.4294419288635254, "rewards/margins": 2.4485509395599365, "rewards/rejected": 0.9808910489082336, "step": 70 }, { "epoch": 0.23, "grad_norm": 1999.1118673285923, "learning_rate": 4.3826786650090273e-08, "logits/chosen": -4.271725177764893, "logits/rejected": -4.525103569030762, "logps/chosen": -292.2157897949219, "logps/rejected": -239.5623321533203, "loss": 0.3663, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 3.471898317337036, "rewards/margins": 2.5827386379241943, "rewards/rejected": 0.8891592025756836, "step": 80 }, { "epoch": 0.26, "grad_norm": 1543.0151245523064, "learning_rate": 4.2276566224671614e-08, "logits/chosen": -4.196888446807861, "logits/rejected": -4.430451393127441, "logps/chosen": -303.9364929199219, "logps/rejected": -258.19708251953125, "loss": 0.37, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 3.6267776489257812, "rewards/margins": 2.5005435943603516, "rewards/rejected": 1.1262344121932983, "step": 90 }, { "epoch": 0.29, "grad_norm": 2558.2358091969077, "learning_rate": 4.058724504646834e-08, "logits/chosen": -4.298203468322754, "logits/rejected": -4.51765251159668, "logps/chosen": -291.99151611328125, "logps/rejected": -240.97909545898438, "loss": 0.3573, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 3.8364264965057373, "rewards/margins": 2.6143250465393066, "rewards/rejected": 1.2221016883850098, "step": 100 }, { "epoch": 0.29, "eval_logits/chosen": -3.2259409427642822, "eval_logits/rejected": -3.2259409427642822, "eval_logps/chosen": -157.8415985107422, "eval_logps/rejected": -157.8415985107422, "eval_loss": 0.6931471824645996, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -2.2645912170410156, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -2.2645912170410156, "eval_runtime": 1.5044, "eval_samples_per_second": 0.665, "eval_steps_per_second": 0.665, "step": 100 }, { "epoch": 0.31, "grad_norm": 2075.8470964199623, "learning_rate": 3.8772424536302564e-08, "logits/chosen": -4.3160247802734375, "logits/rejected": -4.557186126708984, "logps/chosen": -299.556640625, "logps/rejected": -250.2120361328125, "loss": 0.3653, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 3.8744053840637207, "rewards/margins": 2.781764268875122, "rewards/rejected": 1.0926413536071777, "step": 110 }, { "epoch": 0.34, "grad_norm": 2129.2578794603846, "learning_rate": 3.6846716561824964e-08, "logits/chosen": -4.358242988586426, "logits/rejected": -4.6036834716796875, "logps/chosen": -288.9602966308594, "logps/rejected": -237.98257446289062, "loss": 0.346, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 3.973881959915161, "rewards/margins": 2.8389506340026855, "rewards/rejected": 1.1349313259124756, "step": 120 }, { "epoch": 0.37, "grad_norm": 1374.3088736284383, "learning_rate": 3.482562579134809e-08, "logits/chosen": -4.360684871673584, "logits/rejected": -4.608490467071533, "logps/chosen": -278.861572265625, "logps/rejected": -218.7367706298828, "loss": 0.3426, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 3.8384926319122314, "rewards/margins": 2.634833812713623, "rewards/rejected": 1.2036586999893188, "step": 130 }, { "epoch": 0.4, "grad_norm": 1741.7465783603645, "learning_rate": 3.272542485937369e-08, "logits/chosen": -4.276978969573975, "logits/rejected": -4.593733787536621, "logps/chosen": -296.0984191894531, "logps/rejected": -240.01248168945312, "loss": 0.3729, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 3.785256862640381, "rewards/margins": 2.9941701889038086, "rewards/rejected": 0.79108726978302, "step": 140 }, { "epoch": 0.43, "grad_norm": 1837.7137132104272, "learning_rate": 3.056302334890786e-08, "logits/chosen": -4.245262622833252, "logits/rejected": -4.510401725769043, "logps/chosen": -295.3984680175781, "logps/rejected": -250.73580932617188, "loss": 0.3235, "rewards/accuracies": 0.84375, "rewards/chosen": 4.011710166931152, "rewards/margins": 3.0462794303894043, "rewards/rejected": 0.9654304385185242, "step": 150 }, { "epoch": 0.46, "grad_norm": 1744.335126050233, "learning_rate": 2.8355831645441387e-08, "logits/chosen": -4.277425765991211, "logits/rejected": -4.570274829864502, "logps/chosen": -296.66839599609375, "logps/rejected": -235.6475372314453, "loss": 0.36, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 4.259499549865723, "rewards/margins": 3.127065658569336, "rewards/rejected": 1.1324341297149658, "step": 160 }, { "epoch": 0.49, "grad_norm": 1875.319827037545, "learning_rate": 2.6121620758762875e-08, "logits/chosen": -4.229983329772949, "logits/rejected": -4.467092990875244, "logps/chosen": -296.31683349609375, "logps/rejected": -241.3401336669922, "loss": 0.3474, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 4.343829154968262, "rewards/margins": 3.233609437942505, "rewards/rejected": 1.1102204322814941, "step": 170 }, { "epoch": 0.51, "grad_norm": 2082.5003671787076, "learning_rate": 2.3878379241237133e-08, "logits/chosen": -4.364750862121582, "logits/rejected": -4.597868919372559, "logps/chosen": -285.72869873046875, "logps/rejected": -241.40652465820312, "loss": 0.3417, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 4.1484293937683105, "rewards/margins": 3.0738511085510254, "rewards/rejected": 1.074578046798706, "step": 180 }, { "epoch": 0.54, "grad_norm": 1597.9774938638957, "learning_rate": 2.164416835455862e-08, "logits/chosen": -4.3281121253967285, "logits/rejected": -4.498069763183594, "logps/chosen": -308.14776611328125, "logps/rejected": -257.7415466308594, "loss": 0.2852, "rewards/accuracies": 0.875, "rewards/chosen": 4.146700859069824, "rewards/margins": 3.202249526977539, "rewards/rejected": 0.9444509744644165, "step": 190 }, { "epoch": 0.57, "grad_norm": 1601.8580723204816, "learning_rate": 1.943697665109214e-08, "logits/chosen": -4.358348846435547, "logits/rejected": -4.601215839385986, "logps/chosen": -292.93658447265625, "logps/rejected": -249.59469604492188, "loss": 0.3184, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 4.194998741149902, "rewards/margins": 2.974621534347534, "rewards/rejected": 1.2203772068023682, "step": 200 }, { "epoch": 0.57, "eval_logits/chosen": -3.2195205688476562, "eval_logits/rejected": -3.2195205688476562, "eval_logps/chosen": -157.37933349609375, "eval_logps/rejected": -157.37933349609375, "eval_loss": 0.6931471824645996, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -1.8023262023925781, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -1.8023262023925781, "eval_runtime": 1.4741, "eval_samples_per_second": 0.678, "eval_steps_per_second": 0.678, "step": 200 }, { "epoch": 0.6, "grad_norm": 1818.1510653253358, "learning_rate": 1.7274575140626317e-08, "logits/chosen": -4.293700218200684, "logits/rejected": -4.587708473205566, "logps/chosen": -306.94647216796875, "logps/rejected": -254.83981323242188, "loss": 0.3169, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 4.274092674255371, "rewards/margins": 3.556690216064453, "rewards/rejected": 0.7174022793769836, "step": 210 }, { "epoch": 0.63, "grad_norm": 2084.9707047014217, "learning_rate": 1.517437420865191e-08, "logits/chosen": -4.2438554763793945, "logits/rejected": -4.590119361877441, "logps/chosen": -297.3277587890625, "logps/rejected": -225.09414672851562, "loss": 0.3117, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 4.186089515686035, "rewards/margins": 3.6873459815979004, "rewards/rejected": 0.4987434446811676, "step": 220 }, { "epoch": 0.66, "grad_norm": 1793.5243127965375, "learning_rate": 1.3153283438175034e-08, "logits/chosen": -4.3719801902771, "logits/rejected": -4.563234806060791, "logps/chosen": -281.373779296875, "logps/rejected": -226.25576782226562, "loss": 0.2879, "rewards/accuracies": 0.831250011920929, "rewards/chosen": 3.584429979324341, "rewards/margins": 2.8807406425476074, "rewards/rejected": 0.7036892771720886, "step": 230 }, { "epoch": 0.69, "grad_norm": 1621.528952660571, "learning_rate": 1.1227575463697438e-08, "logits/chosen": -4.3936567306518555, "logits/rejected": -4.714280128479004, "logps/chosen": -258.6517639160156, "logps/rejected": -215.28759765625, "loss": 0.3042, "rewards/accuracies": 0.831250011920929, "rewards/chosen": 3.862626552581787, "rewards/margins": 3.0625431537628174, "rewards/rejected": 0.8000835180282593, "step": 240 }, { "epoch": 0.71, "grad_norm": 2231.5682374793205, "learning_rate": 9.412754953531663e-09, "logits/chosen": -4.34213924407959, "logits/rejected": -4.6162428855896, "logps/chosen": -278.9085388183594, "logps/rejected": -232.6056365966797, "loss": 0.3109, "rewards/accuracies": 0.831250011920929, "rewards/chosen": 3.6670002937316895, "rewards/margins": 2.8657121658325195, "rewards/rejected": 0.8012881278991699, "step": 250 }, { "epoch": 0.74, "grad_norm": 1668.5476234310504, "learning_rate": 7.723433775328384e-09, "logits/chosen": -4.386145114898682, "logits/rejected": -4.632050037384033, "logps/chosen": -271.8704833984375, "logps/rejected": -240.48257446289062, "loss": 0.3039, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 3.5878806114196777, "rewards/margins": 2.9320101737976074, "rewards/rejected": 0.6558703184127808, "step": 260 }, { "epoch": 0.77, "grad_norm": 1726.7631750123023, "learning_rate": 6.173213349909728e-09, "logits/chosen": -4.517698764801025, "logits/rejected": -4.687317848205566, "logps/chosen": -273.4754943847656, "logps/rejected": -228.2833709716797, "loss": 0.3356, "rewards/accuracies": 0.793749988079071, "rewards/chosen": 3.6022744178771973, "rewards/margins": 2.6589503288269043, "rewards/rejected": 0.9433239698410034, "step": 270 }, { "epoch": 0.8, "grad_norm": 1197.1122441391342, "learning_rate": 4.7745751406263165e-09, "logits/chosen": -4.299304008483887, "logits/rejected": -4.589285850524902, "logps/chosen": -274.9901123046875, "logps/rejected": -229.76449584960938, "loss": 0.2791, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 4.094995021820068, "rewards/margins": 3.0975327491760254, "rewards/rejected": 0.997462272644043, "step": 280 }, { "epoch": 0.83, "grad_norm": 2356.4193384705377, "learning_rate": 3.5387801599533474e-09, "logits/chosen": -4.320891857147217, "logits/rejected": -4.508334636688232, "logps/chosen": -282.45013427734375, "logps/rejected": -236.50424194335938, "loss": 0.3316, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 4.115664958953857, "rewards/margins": 3.2731566429138184, "rewards/rejected": 0.8425084948539734, "step": 290 }, { "epoch": 0.86, "grad_norm": 1485.14332328563, "learning_rate": 2.475778302439524e-09, "logits/chosen": -4.295617580413818, "logits/rejected": -4.5400543212890625, "logps/chosen": -298.4153137207031, "logps/rejected": -240.1478271484375, "loss": 0.3594, "rewards/accuracies": 0.831250011920929, "rewards/chosen": 4.416214942932129, "rewards/margins": 3.1984994411468506, "rewards/rejected": 1.2177152633666992, "step": 300 }, { "epoch": 0.86, "eval_logits/chosen": -3.220174551010132, "eval_logits/rejected": -3.220174551010132, "eval_logps/chosen": -157.367431640625, "eval_logps/rejected": -157.367431640625, "eval_loss": 0.6931471824645996, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -1.7904319763183594, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -1.7904319763183594, "eval_runtime": 1.47, "eval_samples_per_second": 0.68, "eval_steps_per_second": 0.68, "step": 300 }, { "epoch": 0.89, "grad_norm": 2625.0873445651387, "learning_rate": 1.5941282340065698e-09, "logits/chosen": -4.43851900100708, "logits/rejected": -4.580752372741699, "logps/chosen": -262.37445068359375, "logps/rejected": -226.46572875976562, "loss": 0.3007, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 3.741738796234131, "rewards/margins": 2.9144444465637207, "rewards/rejected": 0.8272944688796997, "step": 310 }, { "epoch": 0.91, "grad_norm": 1589.6112135444553, "learning_rate": 9.009284826036689e-10, "logits/chosen": -4.277141094207764, "logits/rejected": -4.5314412117004395, "logps/chosen": -292.65875244140625, "logps/rejected": -243.8509063720703, "loss": 0.3277, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 4.2250542640686035, "rewards/margins": 3.1320207118988037, "rewards/rejected": 1.093034029006958, "step": 320 }, { "epoch": 0.94, "grad_norm": 2192.855370501752, "learning_rate": 4.017602850342583e-10, "logits/chosen": -4.330888271331787, "logits/rejected": -4.536975383758545, "logps/chosen": -305.5764465332031, "logps/rejected": -252.0467529296875, "loss": 0.3203, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 4.009243488311768, "rewards/margins": 3.056270122528076, "rewards/rejected": 0.9529730677604675, "step": 330 }, { "epoch": 0.97, "grad_norm": 2158.7231383937637, "learning_rate": 1.0064265011902328e-10, "logits/chosen": -4.319821357727051, "logits/rejected": -4.614516735076904, "logps/chosen": -285.194091796875, "logps/rejected": -227.5124053955078, "loss": 0.3239, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 3.751185178756714, "rewards/margins": 2.758882522583008, "rewards/rejected": 0.9923027753829956, "step": 340 }, { "epoch": 1.0, "grad_norm": 1350.3403367664616, "learning_rate": 0.0, "logits/chosen": -4.290497779846191, "logits/rejected": -4.4949870109558105, "logps/chosen": -291.93768310546875, "logps/rejected": -244.3520965576172, "loss": 0.3142, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": 4.083470344543457, "rewards/margins": 3.2036800384521484, "rewards/rejected": 0.8797903060913086, "step": 350 }, { "epoch": 1.0, "step": 350, "total_flos": 0.0, "train_loss": 0.36299856867109026, "train_runtime": 5294.123, "train_samples_per_second": 8.454, "train_steps_per_second": 0.066 } ], "logging_steps": 10, "max_steps": 350, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }