{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9987515605493134, "eval_steps": 2000, "global_step": 600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 4.760079834365155, "learning_rate": 8.333333333333334e-08, "logits/chosen": -1.1630980968475342, "logits/rejected": -1.0201224088668823, "logps/chosen": -350.4145812988281, "logps/rejected": -220.30422973632812, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.02, "grad_norm": 3.6284772366587372, "learning_rate": 8.333333333333333e-07, "logits/chosen": -1.0121445655822754, "logits/rejected": -0.9547010064125061, "logps/chosen": -320.40802001953125, "logps/rejected": -186.48419189453125, "loss": 0.6924, "rewards/accuracies": 0.4982638955116272, "rewards/chosen": 0.0024558762088418007, "rewards/margins": 0.0015082670142874122, "rewards/rejected": 0.0009476091363467276, "step": 10 }, { "epoch": 0.03, "grad_norm": 4.24809146375121, "learning_rate": 1.6666666666666667e-06, "logits/chosen": -0.9381664395332336, "logits/rejected": -0.8647511601448059, "logps/chosen": -328.6592102050781, "logps/rejected": -212.4827880859375, "loss": 0.6822, "rewards/accuracies": 0.7953125238418579, "rewards/chosen": 0.034552641212940216, "rewards/margins": 0.022216904908418655, "rewards/rejected": 0.012335737235844135, "step": 20 }, { "epoch": 0.05, "grad_norm": 3.9797438421104276, "learning_rate": 2.5e-06, "logits/chosen": -1.020849585533142, "logits/rejected": -0.9427526593208313, "logps/chosen": -329.3730163574219, "logps/rejected": -215.0742645263672, "loss": 0.6458, "rewards/accuracies": 0.854687511920929, "rewards/chosen": 0.1433950513601303, "rewards/margins": 0.10005545616149902, "rewards/rejected": 0.04333961382508278, "step": 30 }, { "epoch": 0.07, "grad_norm": 2.971987358375742, "learning_rate": 3.3333333333333333e-06, "logits/chosen": -1.1010403633117676, "logits/rejected": -1.0402967929840088, "logps/chosen": -310.91778564453125, "logps/rejected": -205.1094512939453, "loss": 0.5801, "rewards/accuracies": 0.885937511920929, "rewards/chosen": 0.3175004720687866, "rewards/margins": 0.25547632575035095, "rewards/rejected": 0.06202414631843567, "step": 40 }, { "epoch": 0.08, "grad_norm": 2.284422715303357, "learning_rate": 4.166666666666667e-06, "logits/chosen": -1.176831841468811, "logits/rejected": -1.1168550252914429, "logps/chosen": -289.9400634765625, "logps/rejected": -193.466552734375, "loss": 0.5192, "rewards/accuracies": 0.8765624761581421, "rewards/chosen": 0.4821470379829407, "rewards/margins": 0.42469802498817444, "rewards/rejected": 0.057449012994766235, "step": 50 }, { "epoch": 0.1, "grad_norm": 2.693731441358458, "learning_rate": 5e-06, "logits/chosen": -1.1333694458007812, "logits/rejected": -1.0512011051177979, "logps/chosen": -268.7071838378906, "logps/rejected": -216.21939086914062, "loss": 0.4625, "rewards/accuracies": 0.890625, "rewards/chosen": 0.570508599281311, "rewards/margins": 0.6040245890617371, "rewards/rejected": -0.03351598605513573, "step": 60 }, { "epoch": 0.12, "grad_norm": 2.982413579425004, "learning_rate": 4.995770395678171e-06, "logits/chosen": -0.9235696792602539, "logits/rejected": -0.8171814680099487, "logps/chosen": -288.454833984375, "logps/rejected": -243.9117431640625, "loss": 0.3919, "rewards/accuracies": 0.9359375238418579, "rewards/chosen": 0.4965239465236664, "rewards/margins": 0.8616136312484741, "rewards/rejected": -0.36508968472480774, "step": 70 }, { "epoch": 0.13, "grad_norm": 3.0591291304636767, "learning_rate": 4.983095894354858e-06, "logits/chosen": -0.5841406583786011, "logits/rejected": -0.5758659243583679, "logps/chosen": -282.39080810546875, "logps/rejected": -267.4037170410156, "loss": 0.3708, "rewards/accuracies": 0.9234374761581421, "rewards/chosen": 0.3620058298110962, "rewards/margins": 1.001431941986084, "rewards/rejected": -0.639426052570343, "step": 80 }, { "epoch": 0.15, "grad_norm": 2.7852348655427495, "learning_rate": 4.962019382530521e-06, "logits/chosen": -0.3057808578014374, "logits/rejected": -0.08402713388204575, "logps/chosen": -304.71258544921875, "logps/rejected": -306.744873046875, "loss": 0.3332, "rewards/accuracies": 0.934374988079071, "rewards/chosen": 0.34849730134010315, "rewards/margins": 1.234140157699585, "rewards/rejected": -0.8856428861618042, "step": 90 }, { "epoch": 0.17, "grad_norm": 3.904931302273915, "learning_rate": 4.93261217644956e-06, "logits/chosen": -0.3401206433773041, "logits/rejected": -0.006557087413966656, "logps/chosen": -306.9206848144531, "logps/rejected": -352.503173828125, "loss": 0.284, "rewards/accuracies": 0.9375, "rewards/chosen": 0.13179777562618256, "rewards/margins": 1.5720919370651245, "rewards/rejected": -1.4402940273284912, "step": 100 }, { "epoch": 0.18, "grad_norm": 4.332999329515302, "learning_rate": 4.894973780788722e-06, "logits/chosen": -0.3165335953235626, "logits/rejected": 0.0790834054350853, "logps/chosen": -350.21405029296875, "logps/rejected": -433.90643310546875, "loss": 0.2218, "rewards/accuracies": 0.9609375, "rewards/chosen": -0.08845386654138565, "rewards/margins": 2.1888351440429688, "rewards/rejected": -2.2772889137268066, "step": 110 }, { "epoch": 0.2, "grad_norm": 3.8287946526526464, "learning_rate": 4.849231551964771e-06, "logits/chosen": -0.34387272596359253, "logits/rejected": -0.02132757380604744, "logps/chosen": -342.6011047363281, "logps/rejected": -551.7183837890625, "loss": 0.1787, "rewards/accuracies": 0.9468749761581421, "rewards/chosen": -0.15126076340675354, "rewards/margins": 3.2273590564727783, "rewards/rejected": -3.378619432449341, "step": 120 }, { "epoch": 0.22, "grad_norm": 3.7220399456471203, "learning_rate": 4.7955402672006855e-06, "logits/chosen": -0.4739972949028015, "logits/rejected": -0.22087886929512024, "logps/chosen": -355.5228271484375, "logps/rejected": -637.9622192382812, "loss": 0.1651, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.20811741054058075, "rewards/margins": 4.130118370056152, "rewards/rejected": -4.338235378265381, "step": 130 }, { "epoch": 0.23, "grad_norm": 2.216070201202239, "learning_rate": 4.734081600808531e-06, "logits/chosen": -0.5762065649032593, "logits/rejected": -0.37428033351898193, "logps/chosen": -342.99853515625, "logps/rejected": -746.65234375, "loss": 0.1198, "rewards/accuracies": 0.973437488079071, "rewards/chosen": -0.11533623933792114, "rewards/margins": 5.1436309814453125, "rewards/rejected": -5.258967399597168, "step": 140 }, { "epoch": 0.25, "grad_norm": 4.452614644496966, "learning_rate": 4.665063509461098e-06, "logits/chosen": -0.43740949034690857, "logits/rejected": -0.276010662317276, "logps/chosen": -353.4092712402344, "logps/rejected": -838.3703002929688, "loss": 0.145, "rewards/accuracies": 0.9671875238418579, "rewards/chosen": -0.23360753059387207, "rewards/margins": 5.936570167541504, "rewards/rejected": -6.170177459716797, "step": 150 }, { "epoch": 0.27, "grad_norm": 6.909783574695525, "learning_rate": 4.588719528532342e-06, "logits/chosen": 0.02835695818066597, "logits/rejected": 0.3119501769542694, "logps/chosen": -353.52203369140625, "logps/rejected": -654.4629516601562, "loss": 0.1368, "rewards/accuracies": 0.973437488079071, "rewards/chosen": -0.25593429803848267, "rewards/margins": 4.225825786590576, "rewards/rejected": -4.481760501861572, "step": 160 }, { "epoch": 0.28, "grad_norm": 1.7107339593657505, "learning_rate": 4.50530798188761e-06, "logits/chosen": -0.15743690729141235, "logits/rejected": 0.23819151520729065, "logps/chosen": -333.4959716796875, "logps/rejected": -659.2352294921875, "loss": 0.1267, "rewards/accuracies": 0.965624988079071, "rewards/chosen": -0.00615291204303503, "rewards/margins": 4.596193790435791, "rewards/rejected": -4.602346420288086, "step": 170 }, { "epoch": 0.3, "grad_norm": 3.623547042771, "learning_rate": 4.415111107797445e-06, "logits/chosen": -0.31918713450431824, "logits/rejected": 0.03602874279022217, "logps/chosen": -330.2435302734375, "logps/rejected": -757.0585327148438, "loss": 0.1056, "rewards/accuracies": 0.979687511920929, "rewards/chosen": 0.007477378938347101, "rewards/margins": 5.585347652435303, "rewards/rejected": -5.577870845794678, "step": 180 }, { "epoch": 0.32, "grad_norm": 2.1551725469519507, "learning_rate": 4.318434103932622e-06, "logits/chosen": -0.3329532742500305, "logits/rejected": -0.041298139840364456, "logps/chosen": -382.2855529785156, "logps/rejected": -988.2845458984375, "loss": 0.1055, "rewards/accuracies": 0.9703124761581421, "rewards/chosen": -0.40997394919395447, "rewards/margins": 7.356125831604004, "rewards/rejected": -7.766099452972412, "step": 190 }, { "epoch": 0.33, "grad_norm": 2.5178815949434505, "learning_rate": 4.215604094671835e-06, "logits/chosen": -0.4541547894477844, "logits/rejected": -0.1613123118877411, "logps/chosen": -347.7196350097656, "logps/rejected": -880.2561645507812, "loss": 0.1013, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -0.13055315613746643, "rewards/margins": 6.6189093589782715, "rewards/rejected": -6.749462127685547, "step": 200 }, { "epoch": 0.35, "grad_norm": 2.0381257493358285, "learning_rate": 4.106969024216348e-06, "logits/chosen": -0.5205026865005493, "logits/rejected": -0.25759488344192505, "logps/chosen": -393.9586486816406, "logps/rejected": -1019.0203247070312, "loss": 0.0999, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -0.4994255006313324, "rewards/margins": 7.512589931488037, "rewards/rejected": -8.012015342712402, "step": 210 }, { "epoch": 0.37, "grad_norm": 6.3674783354384115, "learning_rate": 3.992896479256966e-06, "logits/chosen": -0.6750475168228149, "logits/rejected": -0.3842785954475403, "logps/chosen": -336.7120666503906, "logps/rejected": -858.0791015625, "loss": 0.1031, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.017676908522844315, "rewards/margins": 6.612088680267334, "rewards/rejected": -6.594411373138428, "step": 220 }, { "epoch": 0.38, "grad_norm": 2.419780694811269, "learning_rate": 3.8737724451770155e-06, "logits/chosen": -0.6326015591621399, "logits/rejected": -0.40925782918930054, "logps/chosen": -375.2301025390625, "logps/rejected": -1089.742919921875, "loss": 0.0811, "rewards/accuracies": 0.984375, "rewards/chosen": -0.42722994089126587, "rewards/margins": 8.390230178833008, "rewards/rejected": -8.817461013793945, "step": 230 }, { "epoch": 0.4, "grad_norm": 17.310687719433773, "learning_rate": 3.7500000000000005e-06, "logits/chosen": -0.5065186023712158, "logits/rejected": -0.2878126800060272, "logps/chosen": -386.92266845703125, "logps/rejected": -1173.471435546875, "loss": 0.0646, "rewards/accuracies": 0.9921875, "rewards/chosen": -0.5754821300506592, "rewards/margins": 9.043745040893555, "rewards/rejected": -9.619227409362793, "step": 240 }, { "epoch": 0.42, "grad_norm": 1.566552488550366, "learning_rate": 3.621997950501156e-06, "logits/chosen": -0.23604285717010498, "logits/rejected": 0.03810877352952957, "logps/chosen": -380.695068359375, "logps/rejected": -1131.797607421875, "loss": 0.0778, "rewards/accuracies": 0.9859374761581421, "rewards/chosen": -0.5175285935401917, "rewards/margins": 8.581127166748047, "rewards/rejected": -9.098657608032227, "step": 250 }, { "epoch": 0.43, "grad_norm": 2.2662266467083074, "learning_rate": 3.4901994150978926e-06, "logits/chosen": -0.14488890767097473, "logits/rejected": 0.2611751854419708, "logps/chosen": -305.7989196777344, "logps/rejected": -813.2493896484375, "loss": 0.1017, "rewards/accuracies": 0.9765625, "rewards/chosen": 0.1130492091178894, "rewards/margins": 6.16524600982666, "rewards/rejected": -6.052196502685547, "step": 260 }, { "epoch": 0.45, "grad_norm": 2.023161637845827, "learning_rate": 3.3550503583141726e-06, "logits/chosen": -0.14377684891223907, "logits/rejected": 0.22304537892341614, "logps/chosen": -372.13165283203125, "logps/rejected": -896.8603515625, "loss": 0.0815, "rewards/accuracies": 0.984375, "rewards/chosen": -0.10399510711431503, "rewards/margins": 6.930342197418213, "rewards/rejected": -7.034337520599365, "step": 270 }, { "epoch": 0.47, "grad_norm": 2.8073955958066197, "learning_rate": 3.217008081777726e-06, "logits/chosen": -0.10699782520532608, "logits/rejected": 0.14858277142047882, "logps/chosen": -438.87823486328125, "logps/rejected": -1203.0933837890625, "loss": 0.0873, "rewards/accuracies": 0.9703124761581421, "rewards/chosen": -1.0280810594558716, "rewards/margins": 8.752517700195312, "rewards/rejected": -9.780599594116211, "step": 280 }, { "epoch": 0.48, "grad_norm": 4.347308476957998, "learning_rate": 3.0765396768561005e-06, "logits/chosen": -0.4454914927482605, "logits/rejected": -0.15811693668365479, "logps/chosen": -349.51531982421875, "logps/rejected": -1020.2342529296875, "loss": 0.0657, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.13168838620185852, "rewards/margins": 7.97658634185791, "rewards/rejected": -8.108274459838867, "step": 290 }, { "epoch": 0.5, "grad_norm": 4.923260234351252, "learning_rate": 2.9341204441673267e-06, "logits/chosen": -0.46575579047203064, "logits/rejected": -0.18623068928718567, "logps/chosen": -382.58807373046875, "logps/rejected": -1112.0509033203125, "loss": 0.0805, "rewards/accuracies": 0.979687511920929, "rewards/chosen": -0.5270034670829773, "rewards/margins": 8.469846725463867, "rewards/rejected": -8.996851921081543, "step": 300 }, { "epoch": 0.52, "grad_norm": 4.035586599115476, "learning_rate": 2.7902322853130758e-06, "logits/chosen": -0.47277918457984924, "logits/rejected": -0.22883549332618713, "logps/chosen": -396.8818054199219, "logps/rejected": -1298.666748046875, "loss": 0.0646, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -0.6961787343025208, "rewards/margins": 10.189523696899414, "rewards/rejected": -10.885702133178711, "step": 310 }, { "epoch": 0.53, "grad_norm": 1.7894983493848236, "learning_rate": 2.6453620722761897e-06, "logits/chosen": -0.5364641547203064, "logits/rejected": -0.28673312067985535, "logps/chosen": -379.64337158203125, "logps/rejected": -1212.439697265625, "loss": 0.0705, "rewards/accuracies": 0.984375, "rewards/chosen": -0.35370904207229614, "rewards/margins": 9.660139083862305, "rewards/rejected": -10.013849258422852, "step": 320 }, { "epoch": 0.55, "grad_norm": 3.3096114791299955, "learning_rate": 2.5e-06, "logits/chosen": -0.5816466212272644, "logits/rejected": -0.35151442885398865, "logps/chosen": -379.9997863769531, "logps/rejected": -1187.98876953125, "loss": 0.0533, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.41831880807876587, "rewards/margins": 9.45728874206543, "rewards/rejected": -9.875606536865234, "step": 330 }, { "epoch": 0.57, "grad_norm": 3.165697815332725, "learning_rate": 2.3546379277238107e-06, "logits/chosen": -0.47588858008384705, "logits/rejected": -0.26532530784606934, "logps/chosen": -454.8421325683594, "logps/rejected": -1527.9405517578125, "loss": 0.0613, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.2038371562957764, "rewards/margins": 11.936580657958984, "rewards/rejected": -13.140419006347656, "step": 340 }, { "epoch": 0.58, "grad_norm": 2.150182775021995, "learning_rate": 2.2097677146869242e-06, "logits/chosen": -0.5888150930404663, "logits/rejected": -0.2789239287376404, "logps/chosen": -349.5444030761719, "logps/rejected": -1127.449951171875, "loss": 0.0722, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.23732244968414307, "rewards/margins": 8.962444305419922, "rewards/rejected": -9.199767112731934, "step": 350 }, { "epoch": 0.6, "grad_norm": 2.370472414334315, "learning_rate": 2.0658795558326745e-06, "logits/chosen": -0.4766604006290436, "logits/rejected": -0.22312171757221222, "logps/chosen": -409.70135498046875, "logps/rejected": -1264.747314453125, "loss": 0.0719, "rewards/accuracies": 0.979687511920929, "rewards/chosen": -0.5473332405090332, "rewards/margins": 9.985517501831055, "rewards/rejected": -10.532853126525879, "step": 360 }, { "epoch": 0.62, "grad_norm": 4.485106240623341, "learning_rate": 1.9234603231439e-06, "logits/chosen": -0.40441417694091797, "logits/rejected": -0.13194730877876282, "logps/chosen": -391.0859069824219, "logps/rejected": -1242.066162109375, "loss": 0.0632, "rewards/accuracies": 0.9859374761581421, "rewards/chosen": -0.6057429909706116, "rewards/margins": 9.785425186157227, "rewards/rejected": -10.391169548034668, "step": 370 }, { "epoch": 0.63, "grad_norm": 1.3547912002584528, "learning_rate": 1.7829919182222752e-06, "logits/chosen": -0.3805684447288513, "logits/rejected": -0.11748667806386948, "logps/chosen": -420.901611328125, "logps/rejected": -1328.623291015625, "loss": 0.0609, "rewards/accuracies": 0.9859374761581421, "rewards/chosen": -0.752387523651123, "rewards/margins": 10.4163236618042, "rewards/rejected": -11.16871166229248, "step": 380 }, { "epoch": 0.65, "grad_norm": 7.41839867309329, "learning_rate": 1.6449496416858285e-06, "logits/chosen": -0.439796507358551, "logits/rejected": -0.18884414434432983, "logps/chosen": -361.68634033203125, "logps/rejected": -1187.6492919921875, "loss": 0.093, "rewards/accuracies": 0.9859374761581421, "rewards/chosen": -0.2755175232887268, "rewards/margins": 9.546114921569824, "rewards/rejected": -9.82163143157959, "step": 390 }, { "epoch": 0.67, "grad_norm": 1.9341085433225174, "learning_rate": 1.509800584902108e-06, "logits/chosen": -0.4470156133174896, "logits/rejected": -0.15826158225536346, "logps/chosen": -325.4810791015625, "logps/rejected": -963.2032470703125, "loss": 0.0697, "rewards/accuracies": 0.9921875, "rewards/chosen": 0.05155152827501297, "rewards/margins": 7.570789337158203, "rewards/rejected": -7.519238471984863, "step": 400 }, { "epoch": 0.68, "grad_norm": 2.2887818701903795, "learning_rate": 1.3780020494988447e-06, "logits/chosen": -0.38168513774871826, "logits/rejected": -0.13108135759830475, "logps/chosen": -338.5289001464844, "logps/rejected": -994.54345703125, "loss": 0.0767, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.13217389583587646, "rewards/margins": 7.651003360748291, "rewards/rejected": -7.783177375793457, "step": 410 }, { "epoch": 0.7, "grad_norm": 10.334424423093157, "learning_rate": 1.2500000000000007e-06, "logits/chosen": -0.32895341515541077, "logits/rejected": -0.11540427058935165, "logps/chosen": -408.0703125, "logps/rejected": -1260.2664794921875, "loss": 0.0555, "rewards/accuracies": 0.984375, "rewards/chosen": -0.7070173025131226, "rewards/margins": 9.677043914794922, "rewards/rejected": -10.384060859680176, "step": 420 }, { "epoch": 0.72, "grad_norm": 3.446631906756751, "learning_rate": 1.1262275548229852e-06, "logits/chosen": -0.45178350806236267, "logits/rejected": -0.1636919528245926, "logps/chosen": -414.2293395996094, "logps/rejected": -1283.5029296875, "loss": 0.0678, "rewards/accuracies": 0.984375, "rewards/chosen": -0.8513700366020203, "rewards/margins": 9.903493881225586, "rewards/rejected": -10.754863739013672, "step": 430 }, { "epoch": 0.73, "grad_norm": 3.1936523282167912, "learning_rate": 1.0071035207430352e-06, "logits/chosen": -0.44514569640159607, "logits/rejected": -0.20628270506858826, "logps/chosen": -388.30224609375, "logps/rejected": -1252.75146484375, "loss": 0.048, "rewards/accuracies": 0.9921875, "rewards/chosen": -0.6398779153823853, "rewards/margins": 9.881658554077148, "rewards/rejected": -10.521535873413086, "step": 440 }, { "epoch": 0.75, "grad_norm": 2.7973345573120514, "learning_rate": 8.930309757836517e-07, "logits/chosen": -0.5846482515335083, "logits/rejected": -0.3184075355529785, "logps/chosen": -380.7689514160156, "logps/rejected": -1142.24365234375, "loss": 0.0603, "rewards/accuracies": 0.995312511920929, "rewards/chosen": -0.4020051956176758, "rewards/margins": 8.967304229736328, "rewards/rejected": -9.36931037902832, "step": 450 }, { "epoch": 0.77, "grad_norm": 1.7683111661015913, "learning_rate": 7.843959053281663e-07, "logits/chosen": -0.4719081521034241, "logits/rejected": -0.248517706990242, "logps/chosen": -379.7674560546875, "logps/rejected": -1191.5079345703125, "loss": 0.0567, "rewards/accuracies": 0.9859374761581421, "rewards/chosen": -0.3425825238227844, "rewards/margins": 9.473920822143555, "rewards/rejected": -9.816503524780273, "step": 460 }, { "epoch": 0.78, "grad_norm": 2.619770880026216, "learning_rate": 6.815658960673782e-07, "logits/chosen": -0.5005173683166504, "logits/rejected": -0.22156552970409393, "logps/chosen": -389.5422668457031, "logps/rejected": -1232.5107421875, "loss": 0.0507, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.6200243830680847, "rewards/margins": 9.514973640441895, "rewards/rejected": -10.134997367858887, "step": 470 }, { "epoch": 0.8, "grad_norm": 2.206190169965742, "learning_rate": 5.848888922025553e-07, "logits/chosen": -0.46861904859542847, "logits/rejected": -0.18907694518566132, "logps/chosen": -388.5583801269531, "logps/rejected": -1196.972900390625, "loss": 0.0648, "rewards/accuracies": 0.9921875, "rewards/chosen": -0.40207749605178833, "rewards/margins": 9.50928783416748, "rewards/rejected": -9.911364555358887, "step": 480 }, { "epoch": 0.82, "grad_norm": 5.22635215684865, "learning_rate": 4.946920181123904e-07, "logits/chosen": -0.49871087074279785, "logits/rejected": -0.23706772923469543, "logps/chosen": -378.83209228515625, "logps/rejected": -1230.994140625, "loss": 0.0594, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.36057573556900024, "rewards/margins": 9.88486099243164, "rewards/rejected": -10.245436668395996, "step": 490 }, { "epoch": 0.83, "grad_norm": 4.399177165103731, "learning_rate": 4.1128047146765936e-07, "logits/chosen": -0.4957138001918793, "logits/rejected": -0.22013764083385468, "logps/chosen": -358.6220703125, "logps/rejected": -1129.539794921875, "loss": 0.0699, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -0.2314276248216629, "rewards/margins": 9.046243667602539, "rewards/rejected": -9.277670860290527, "step": 500 }, { "epoch": 0.85, "grad_norm": 2.4578322691260586, "learning_rate": 3.3493649053890325e-07, "logits/chosen": -0.3916376233100891, "logits/rejected": -0.13147909939289093, "logps/chosen": -376.29583740234375, "logps/rejected": -1182.824462890625, "loss": 0.0487, "rewards/accuracies": 0.995312511920929, "rewards/chosen": -0.43332648277282715, "rewards/margins": 9.195769309997559, "rewards/rejected": -9.629096031188965, "step": 510 }, { "epoch": 0.87, "grad_norm": 2.130490613601597, "learning_rate": 2.6591839919146963e-07, "logits/chosen": -0.36289340257644653, "logits/rejected": -0.13988874852657318, "logps/chosen": -386.994873046875, "logps/rejected": -1234.4404296875, "loss": 0.0544, "rewards/accuracies": 0.9859374761581421, "rewards/chosen": -0.5290244221687317, "rewards/margins": 9.592443466186523, "rewards/rejected": -10.121468544006348, "step": 520 }, { "epoch": 0.88, "grad_norm": 2.5265891239165703, "learning_rate": 2.044597327993153e-07, "logits/chosen": -0.44066888093948364, "logits/rejected": -0.1672184318304062, "logps/chosen": -404.02349853515625, "logps/rejected": -1213.29638671875, "loss": 0.0571, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -0.50049889087677, "rewards/margins": 9.578344345092773, "rewards/rejected": -10.07884407043457, "step": 530 }, { "epoch": 0.9, "grad_norm": 1.3929764183361966, "learning_rate": 1.507684480352292e-07, "logits/chosen": -0.4457983374595642, "logits/rejected": -0.20653533935546875, "logps/chosen": -387.2595520019531, "logps/rejected": -1257.850341796875, "loss": 0.0582, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -0.5553634762763977, "rewards/margins": 9.88083267211914, "rewards/rejected": -10.436195373535156, "step": 540 }, { "epoch": 0.92, "grad_norm": 1.8197547536284149, "learning_rate": 1.0502621921127776e-07, "logits/chosen": -0.4422330856323242, "logits/rejected": -0.1755351722240448, "logps/chosen": -414.64715576171875, "logps/rejected": -1265.412353515625, "loss": 0.052, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.6048385500907898, "rewards/margins": 9.867313385009766, "rewards/rejected": -10.472152709960938, "step": 550 }, { "epoch": 0.93, "grad_norm": 3.101887331185102, "learning_rate": 6.738782355044048e-08, "logits/chosen": -0.4752727448940277, "logits/rejected": -0.20606884360313416, "logps/chosen": -390.14984130859375, "logps/rejected": -1334.9105224609375, "loss": 0.0647, "rewards/accuracies": 0.979687511920929, "rewards/chosen": -0.6308324933052063, "rewards/margins": 10.629243850708008, "rewards/rejected": -11.260076522827148, "step": 560 }, { "epoch": 0.95, "grad_norm": 3.441898096294271, "learning_rate": 3.798061746947995e-08, "logits/chosen": -0.480734646320343, "logits/rejected": -0.21511860191822052, "logps/chosen": -387.4568786621094, "logps/rejected": -1270.74853515625, "loss": 0.0527, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.7548898458480835, "rewards/margins": 9.84511947631836, "rewards/rejected": -10.600008964538574, "step": 570 }, { "epoch": 0.97, "grad_norm": 2.2741744504193826, "learning_rate": 1.6904105645142443e-08, "logits/chosen": -0.44097834825515747, "logits/rejected": -0.2139013707637787, "logps/chosen": -391.39453125, "logps/rejected": -1291.123779296875, "loss": 0.0469, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -0.5810464024543762, "rewards/margins": 10.310724258422852, "rewards/rejected": -10.891772270202637, "step": 580 }, { "epoch": 0.98, "grad_norm": 2.909048473319547, "learning_rate": 4.229604321829561e-09, "logits/chosen": -0.48271116614341736, "logits/rejected": -0.22559651732444763, "logps/chosen": -406.5685119628906, "logps/rejected": -1254.4326171875, "loss": 0.0513, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.6645079851150513, "rewards/margins": 9.709487915039062, "rewards/rejected": -10.373995780944824, "step": 590 }, { "epoch": 1.0, "grad_norm": 2.7152944727872392, "learning_rate": 0.0, "logits/chosen": -0.49268728494644165, "logits/rejected": -0.23753933608531952, "logps/chosen": -411.12884521484375, "logps/rejected": -1246.8544921875, "loss": 0.0566, "rewards/accuracies": 0.9921875, "rewards/chosen": -0.7164795994758606, "rewards/margins": 9.6405668258667, "rewards/rejected": -10.357046127319336, "step": 600 }, { "epoch": 1.0, "step": 600, "total_flos": 0.0, "train_loss": 0.150745850255092, "train_runtime": 48820.8334, "train_samples_per_second": 0.788, "train_steps_per_second": 0.012 } ], "logging_steps": 10, "max_steps": 600, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }