{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 329, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 2.0906090021590473, "learning_rate": 1.5151515151515152e-08, "logits/chosen": -2.6820077896118164, "logits/rejected": -2.6930205821990967, "logps/chosen": -281.2528381347656, "logps/rejected": -258.0622253417969, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/margins_max": 0.0, "rewards/margins_min": 0.0, "rewards/margins_std": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03, "grad_norm": 2.1561337868153565, "learning_rate": 1.5151515151515152e-07, "logits/chosen": -2.7683067321777344, "logits/rejected": -2.7538461685180664, "logps/chosen": -284.59912109375, "logps/rejected": -249.83580017089844, "loss": 0.6931, "rewards/accuracies": 0.3888888955116272, "rewards/chosen": 5.0317983550485224e-05, "rewards/margins": -0.00015015894314274192, "rewards/margins_max": 0.0020335663575679064, "rewards/margins_min": -0.0025187418796122074, "rewards/margins_std": 0.0020784963853657246, "rewards/rejected": 0.00020047693396918476, "step": 10 }, { "epoch": 0.06, "grad_norm": 1.9751920510122447, "learning_rate": 3.0303030303030305e-07, "logits/chosen": -2.8347439765930176, "logits/rejected": -2.7819018363952637, "logps/chosen": -291.50921630859375, "logps/rejected": -270.4449768066406, "loss": 0.693, "rewards/accuracies": 0.5625, "rewards/chosen": 0.00022530628484673798, "rewards/margins": 0.0007139825029298663, "rewards/margins_max": 0.004345210734754801, "rewards/margins_min": -0.002943573985248804, "rewards/margins_std": 0.0032876902259886265, "rewards/rejected": -0.0004886762471869588, "step": 20 }, { "epoch": 0.09, "grad_norm": 1.6654288543237405, "learning_rate": 4.545454545454545e-07, "logits/chosen": -2.8627753257751465, "logits/rejected": -2.8151745796203613, "logps/chosen": -259.2825927734375, "logps/rejected": -227.37350463867188, "loss": 0.6932, "rewards/accuracies": 0.5, "rewards/chosen": -0.00037297833478078246, "rewards/margins": -0.00020202626183163375, "rewards/margins_max": 0.0030375297646969557, "rewards/margins_min": -0.0033374775666743517, "rewards/margins_std": 0.0028792533557862043, "rewards/rejected": -0.00017095205839723349, "step": 30 }, { "epoch": 0.12, "grad_norm": 1.6057052994928989, "learning_rate": 4.993103596812268e-07, "logits/chosen": -2.8291430473327637, "logits/rejected": -2.7638001441955566, "logps/chosen": -317.513916015625, "logps/rejected": -224.7698211669922, "loss": 0.6927, "rewards/accuracies": 0.625, "rewards/chosen": 0.000282805209280923, "rewards/margins": 0.0011336280731484294, "rewards/margins_max": 0.005270844791084528, "rewards/margins_min": -0.002321633044630289, "rewards/margins_std": 0.003387246746569872, "rewards/rejected": -0.000850822776556015, "step": 40 }, { "epoch": 0.15, "grad_norm": 1.75475734119915, "learning_rate": 4.959416858332709e-07, "logits/chosen": -2.79063081741333, "logits/rejected": -2.804368495941162, "logps/chosen": -242.9667510986328, "logps/rejected": -280.0011901855469, "loss": 0.6926, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.00031891773687675595, "rewards/margins": 0.0008532041683793068, "rewards/margins_max": 0.004698004573583603, "rewards/margins_min": -0.002724443329498172, "rewards/margins_std": 0.00329922279343009, "rewards/rejected": -0.0011721218470484018, "step": 50 }, { "epoch": 0.18, "grad_norm": 1.91854731579599, "learning_rate": 4.898051734555674e-07, "logits/chosen": -2.8335373401641846, "logits/rejected": -2.8440303802490234, "logps/chosen": -321.90625, "logps/rejected": -283.37994384765625, "loss": 0.6921, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.00014833270688541234, "rewards/margins": 0.0021868678741157055, "rewards/margins_max": 0.008168894797563553, "rewards/margins_min": -0.0031033740378916264, "rewards/margins_std": 0.005018442869186401, "rewards/rejected": -0.0020385351963341236, "step": 60 }, { "epoch": 0.21, "grad_norm": 1.5964843630528198, "learning_rate": 4.809698831278217e-07, "logits/chosen": -2.748741865158081, "logits/rejected": -2.735199213027954, "logps/chosen": -266.52606201171875, "logps/rejected": -246.6175079345703, "loss": 0.6922, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0004858696775045246, "rewards/margins": 0.0019440820906311274, "rewards/margins_max": 0.008044283837080002, "rewards/margins_min": -0.003816543845459819, "rewards/margins_std": 0.005177702754735947, "rewards/rejected": -0.002429951447993517, "step": 70 }, { "epoch": 0.24, "grad_norm": 2.1014040068240663, "learning_rate": 4.6953524759527053e-07, "logits/chosen": -2.8426356315612793, "logits/rejected": -2.8158562183380127, "logps/chosen": -282.353515625, "logps/rejected": -275.220458984375, "loss": 0.6918, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.000800526118837297, "rewards/margins": 0.0022392510436475277, "rewards/margins_max": 0.00997895933687687, "rewards/margins_min": -0.005192113574594259, "rewards/margins_std": 0.0067059798166155815, "rewards/rejected": -0.0030397772789001465, "step": 80 }, { "epoch": 0.27, "grad_norm": 1.9914287244112958, "learning_rate": 4.5562995274820283e-07, "logits/chosen": -2.7992029190063477, "logits/rejected": -2.746138095855713, "logps/chosen": -295.78399658203125, "logps/rejected": -291.9333190917969, "loss": 0.6919, "rewards/accuracies": 0.5625, "rewards/chosen": -0.002320217899978161, "rewards/margins": 0.001351921702735126, "rewards/margins_max": 0.010480575263500214, "rewards/margins_min": -0.009322223253548145, "rewards/margins_std": 0.008863108232617378, "rewards/rejected": -0.0036721397191286087, "step": 90 }, { "epoch": 0.3, "grad_norm": 1.6705180570693485, "learning_rate": 4.394104893853007e-07, "logits/chosen": -2.896794557571411, "logits/rejected": -2.85756254196167, "logps/chosen": -273.5906982421875, "logps/rejected": -257.73284912109375, "loss": 0.6914, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.0012845676392316818, "rewards/margins": 0.005008908919990063, "rewards/margins_max": 0.013928805477917194, "rewards/margins_min": -0.003106380347162485, "rewards/margins_std": 0.007625125348567963, "rewards/rejected": -0.006293477024883032, "step": 100 }, { "epoch": 0.3, "eval_logits/chosen": -2.806475877761841, "eval_logits/rejected": -2.767702102661133, "eval_logps/chosen": -284.6319274902344, "eval_logps/rejected": -258.9901123046875, "eval_loss": 0.691525399684906, "eval_rewards/accuracies": 0.6460000276565552, "eval_rewards/chosen": -0.00038527295691892505, "eval_rewards/margins": 0.003726556431502104, "eval_rewards/margins_max": 0.01873905211687088, "eval_rewards/margins_min": -0.009459242224693298, "eval_rewards/margins_std": 0.00931489747017622, "eval_rewards/rejected": -0.004111829213798046, "eval_runtime": 428.4684, "eval_samples_per_second": 4.668, "eval_steps_per_second": 0.292, "step": 100 }, { "epoch": 0.33, "grad_norm": 2.1453641236519685, "learning_rate": 4.2105939205932005e-07, "logits/chosen": -2.7631096839904785, "logits/rejected": -2.746663808822632, "logps/chosen": -311.8393249511719, "logps/rejected": -235.84280395507812, "loss": 0.6911, "rewards/accuracies": 0.625, "rewards/chosen": -0.0008311712299473584, "rewards/margins": 0.0033794320188462734, "rewards/margins_max": 0.013278109021484852, "rewards/margins_min": -0.00541637372225523, "rewards/margins_std": 0.008299448527395725, "rewards/rejected": -0.0042106034234166145, "step": 110 }, { "epoch": 0.36, "grad_norm": 2.024896986425123, "learning_rate": 4.0078318482522114e-07, "logits/chosen": -2.7521708011627197, "logits/rejected": -2.750868082046509, "logps/chosen": -323.51666259765625, "logps/rejected": -274.75970458984375, "loss": 0.6909, "rewards/accuracies": 0.75, "rewards/chosen": 0.0004785500350408256, "rewards/margins": 0.004080395679920912, "rewards/margins_max": 0.015328818932175636, "rewards/margins_min": -0.0073760440573096275, "rewards/margins_std": 0.00990099273622036, "rewards/rejected": -0.0036018460523337126, "step": 120 }, { "epoch": 0.4, "grad_norm": 1.6346525930252072, "learning_rate": 3.7881005700938627e-07, "logits/chosen": -2.8206729888916016, "logits/rejected": -2.8308663368225098, "logps/chosen": -266.37469482421875, "logps/rejected": -234.52035522460938, "loss": 0.6906, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.00018907712365034968, "rewards/margins": 0.00421659741550684, "rewards/margins_max": 0.015676384791731834, "rewards/margins_min": -0.007556927390396595, "rewards/margins_std": 0.010246575810015202, "rewards/rejected": -0.004027520306408405, "step": 130 }, { "epoch": 0.43, "grad_norm": 1.9044203185149946, "learning_rate": 3.5538729515692354e-07, "logits/chosen": -2.780360460281372, "logits/rejected": -2.7639012336730957, "logps/chosen": -294.11309814453125, "logps/rejected": -270.84710693359375, "loss": 0.6896, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.0028018890880048275, "rewards/margins": 0.007480897009372711, "rewards/margins_max": 0.021374408155679703, "rewards/margins_min": -0.0061719887889921665, "rewards/margins_std": 0.01222699973732233, "rewards/rejected": -0.004679008387029171, "step": 140 }, { "epoch": 0.46, "grad_norm": 1.4256559970133287, "learning_rate": 3.3077850005803125e-07, "logits/chosen": -2.8410263061523438, "logits/rejected": -2.8195314407348633, "logps/chosen": -270.49615478515625, "logps/rejected": -245.65200805664062, "loss": 0.6903, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.0014524383004754782, "rewards/margins": 0.006768654100596905, "rewards/margins_max": 0.025039460510015488, "rewards/margins_min": -0.01076546311378479, "rewards/margins_std": 0.015858832746744156, "rewards/rejected": -0.0053162164986133575, "step": 150 }, { "epoch": 0.49, "grad_norm": 2.1265057109843077, "learning_rate": 3.0526062017313247e-07, "logits/chosen": -2.79884672164917, "logits/rejected": -2.7815585136413574, "logps/chosen": -255.3964080810547, "logps/rejected": -241.00271606445312, "loss": 0.6909, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.0010771710658445954, "rewards/margins": 0.005134746432304382, "rewards/margins_max": 0.022996146231889725, "rewards/margins_min": -0.009730304591357708, "rewards/margins_std": 0.014861812815070152, "rewards/rejected": -0.004057575948536396, "step": 160 }, { "epoch": 0.52, "grad_norm": 1.59020242230242, "learning_rate": 2.791208348427426e-07, "logits/chosen": -2.814671039581299, "logits/rejected": -2.732504367828369, "logps/chosen": -303.4354553222656, "logps/rejected": -273.4683837890625, "loss": 0.6887, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.002831272780895233, "rewards/margins": 0.008308259770274162, "rewards/margins_max": 0.02398056350648403, "rewards/margins_min": -0.007372391410171986, "rewards/margins_std": 0.014078010804951191, "rewards/rejected": -0.005476987920701504, "step": 170 }, { "epoch": 0.55, "grad_norm": 1.791424187804565, "learning_rate": 2.526533223585641e-07, "logits/chosen": -2.8398988246917725, "logits/rejected": -2.775310754776001, "logps/chosen": -256.0347595214844, "logps/rejected": -229.332763671875, "loss": 0.6897, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.0009899451397359371, "rewards/margins": 0.005457731895148754, "rewards/margins_max": 0.021505217999219894, "rewards/margins_min": -0.008436702191829681, "rewards/margins_std": 0.013385000638663769, "rewards/rejected": -0.00446778628975153, "step": 180 }, { "epoch": 0.58, "grad_norm": 1.7305988753672774, "learning_rate": 2.261559492680755e-07, "logits/chosen": -2.781790256500244, "logits/rejected": -2.7643322944641113, "logps/chosen": -300.09393310546875, "logps/rejected": -271.13116455078125, "loss": 0.6891, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.004870180506259203, "rewards/margins": 0.0101470947265625, "rewards/margins_max": 0.03561341017484665, "rewards/margins_min": -0.00919102318584919, "rewards/margins_std": 0.019990354776382446, "rewards/rejected": -0.005276912357658148, "step": 190 }, { "epoch": 0.61, "grad_norm": 2.169958133205736, "learning_rate": 1.9992691817133024e-07, "logits/chosen": -2.7859396934509277, "logits/rejected": -2.755178213119507, "logps/chosen": -281.18170166015625, "logps/rejected": -288.84930419921875, "loss": 0.6884, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.0041890377178788185, "rewards/margins": 0.009892629459500313, "rewards/margins_max": 0.03310906141996384, "rewards/margins_min": -0.012689237482845783, "rewards/margins_std": 0.02011021040380001, "rewards/rejected": -0.005703592207282782, "step": 200 }, { "epoch": 0.61, "eval_logits/chosen": -2.804927349090576, "eval_logits/rejected": -2.766470432281494, "eval_logps/chosen": -284.3610534667969, "eval_logps/rejected": -259.1879577636719, "eval_loss": 0.6895014643669128, "eval_rewards/accuracies": 0.6850000023841858, "eval_rewards/chosen": 0.0023234861437231302, "eval_rewards/margins": 0.008414038456976414, "eval_rewards/margins_max": 0.038945525884628296, "eval_rewards/margins_min": -0.018883490934967995, "eval_rewards/margins_std": 0.018971558660268784, "eval_rewards/rejected": -0.006090551149100065, "eval_runtime": 427.7798, "eval_samples_per_second": 4.675, "eval_steps_per_second": 0.292, "step": 200 }, { "epoch": 0.64, "grad_norm": 1.9906194761669704, "learning_rate": 1.742614117358029e-07, "logits/chosen": -2.80131196975708, "logits/rejected": -2.7576537132263184, "logps/chosen": -304.849853515625, "logps/rejected": -289.08197021484375, "loss": 0.6877, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.0046735843643546104, "rewards/margins": 0.012557747773826122, "rewards/margins_max": 0.03481978923082352, "rewards/margins_min": -0.00802917592227459, "rewards/margins_std": 0.019201457500457764, "rewards/rejected": -0.007884165272116661, "step": 210 }, { "epoch": 0.67, "grad_norm": 1.9658311065665528, "learning_rate": 1.4944827069769122e-07, "logits/chosen": -2.851292133331299, "logits/rejected": -2.8257217407226562, "logps/chosen": -312.4863586425781, "logps/rejected": -266.73626708984375, "loss": 0.6891, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.004867873154580593, "rewards/margins": 0.008174732327461243, "rewards/margins_max": 0.028449540957808495, "rewards/margins_min": -0.011079356074333191, "rewards/margins_std": 0.01735488697886467, "rewards/rejected": -0.003306858241558075, "step": 220 }, { "epoch": 0.7, "grad_norm": 1.8987994692738805, "learning_rate": 1.2576674323558928e-07, "logits/chosen": -2.821254014968872, "logits/rejected": -2.8421223163604736, "logps/chosen": -288.6875, "logps/rejected": -263.0277099609375, "loss": 0.6906, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.00041991579928435385, "rewards/margins": 0.0022654212079942226, "rewards/margins_max": 0.024668732658028603, "rewards/margins_min": -0.022191215306520462, "rewards/margins_std": 0.020731808617711067, "rewards/rejected": -0.002685337094590068, "step": 230 }, { "epoch": 0.73, "grad_norm": 2.049682090113544, "learning_rate": 1.0348334229922676e-07, "logits/chosen": -2.877260684967041, "logits/rejected": -2.8300554752349854, "logps/chosen": -290.80633544921875, "logps/rejected": -275.846435546875, "loss": 0.6893, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.0021143355406820774, "rewards/margins": 0.00877899769693613, "rewards/margins_max": 0.03138250857591629, "rewards/margins_min": -0.01147426012903452, "rewards/margins_std": 0.019380424171686172, "rewards/rejected": -0.006664662156254053, "step": 240 }, { "epoch": 0.76, "grad_norm": 2.008481756505904, "learning_rate": 8.284884626103164e-08, "logits/chosen": -2.817871570587158, "logits/rejected": -2.786424398422241, "logps/chosen": -300.6135559082031, "logps/rejected": -305.0606994628906, "loss": 0.6882, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.0047633713111281395, "rewards/margins": 0.009853017516434193, "rewards/margins_max": 0.034555986523628235, "rewards/margins_min": -0.011890431866049767, "rewards/margins_std": 0.020635981112718582, "rewards/rejected": -0.0050896452739834785, "step": 250 }, { "epoch": 0.79, "grad_norm": 2.120277542804363, "learning_rate": 6.409547664531733e-08, "logits/chosen": -2.844655752182007, "logits/rejected": -2.811575412750244, "logps/chosen": -333.072265625, "logps/rejected": -312.94317626953125, "loss": 0.6874, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.009096643887460232, "rewards/margins": 0.013471168465912342, "rewards/margins_max": 0.0355917289853096, "rewards/margins_min": -0.005764602217823267, "rewards/margins_std": 0.018313560634851456, "rewards/rejected": -0.00437452457845211, "step": 260 }, { "epoch": 0.82, "grad_norm": 2.015529778175616, "learning_rate": 4.743428469705335e-08, "logits/chosen": -2.7949509620666504, "logits/rejected": -2.7894396781921387, "logps/chosen": -303.4598693847656, "logps/rejected": -308.66522216796875, "loss": 0.6889, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.003480118466541171, "rewards/margins": 0.010316994972527027, "rewards/margins_max": 0.033331625163555145, "rewards/margins_min": -0.010707234963774681, "rewards/margins_std": 0.01953895017504692, "rewards/rejected": -0.006836875341832638, "step": 270 }, { "epoch": 0.85, "grad_norm": 2.1162209644793024, "learning_rate": 3.305277620188826e-08, "logits/chosen": -2.844252347946167, "logits/rejected": -2.8254075050354004, "logps/chosen": -324.8486633300781, "logps/rejected": -270.613037109375, "loss": 0.6865, "rewards/accuracies": 0.75, "rewards/chosen": 0.0071704513393342495, "rewards/margins": 0.015561411157250404, "rewards/margins_max": 0.041363365948200226, "rewards/margins_min": -0.010495706461369991, "rewards/margins_std": 0.0231755543500185, "rewards/rejected": -0.008390960283577442, "step": 280 }, { "epoch": 0.88, "grad_norm": 1.7280929479679055, "learning_rate": 2.1112801287806375e-08, "logits/chosen": -2.783881187438965, "logits/rejected": -2.747999668121338, "logps/chosen": -273.90185546875, "logps/rejected": -246.3704833984375, "loss": 0.6881, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.0028805662877857685, "rewards/margins": 0.011384439654648304, "rewards/margins_max": 0.036729536950588226, "rewards/margins_min": -0.009171558544039726, "rewards/margins_std": 0.021134525537490845, "rewards/rejected": -0.008503873832523823, "step": 290 }, { "epoch": 0.91, "grad_norm": 1.8137366581326853, "learning_rate": 1.1748732956682023e-08, "logits/chosen": -2.878770351409912, "logits/rejected": -2.8104898929595947, "logps/chosen": -323.51312255859375, "logps/rejected": -286.44964599609375, "loss": 0.6873, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.0020009407307952642, "rewards/margins": 0.010605795308947563, "rewards/margins_max": 0.03404298424720764, "rewards/margins_min": -0.010880110785365105, "rewards/margins_std": 0.020114842802286148, "rewards/rejected": -0.008604854345321655, "step": 300 }, { "epoch": 0.91, "eval_logits/chosen": -2.802642822265625, "eval_logits/rejected": -2.7640159130096436, "eval_logps/chosen": -284.2815246582031, "eval_logps/rejected": -259.24383544921875, "eval_loss": 0.6889453530311584, "eval_rewards/accuracies": 0.6759999990463257, "eval_rewards/chosen": 0.0031190679874271154, "eval_rewards/margins": 0.009768038988113403, "eval_rewards/margins_max": 0.044922519475221634, "eval_rewards/margins_min": -0.021590130403637886, "eval_rewards/margins_std": 0.021896740421652794, "eval_rewards/rejected": -0.006648970767855644, "eval_runtime": 427.9336, "eval_samples_per_second": 4.674, "eval_steps_per_second": 0.292, "step": 300 }, { "epoch": 0.94, "grad_norm": 1.5476563917272619, "learning_rate": 5.065954844616721e-09, "logits/chosen": -2.8241655826568604, "logits/rejected": -2.7778286933898926, "logps/chosen": -276.5940856933594, "logps/rejected": -281.5748596191406, "loss": 0.6885, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.005276383366435766, "rewards/margins": 0.010391583666205406, "rewards/margins_max": 0.036186523735523224, "rewards/margins_min": -0.010774780064821243, "rewards/margins_std": 0.02108721435070038, "rewards/rejected": -0.005115201231092215, "step": 310 }, { "epoch": 0.97, "grad_norm": 1.9217088208809332, "learning_rate": 1.1396752298723499e-09, "logits/chosen": -2.8640575408935547, "logits/rejected": -2.8119149208068848, "logps/chosen": -249.0362548828125, "logps/rejected": -258.521484375, "loss": 0.6879, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.0009104462224058807, "rewards/margins": 0.008900880813598633, "rewards/margins_max": 0.02946281060576439, "rewards/margins_min": -0.010788346640765667, "rewards/margins_std": 0.017393799498677254, "rewards/rejected": -0.009811325930058956, "step": 320 }, { "epoch": 1.0, "step": 329, "total_flos": 0.0, "train_loss": 0.6900796745323483, "train_runtime": 3893.3874, "train_samples_per_second": 1.352, "train_steps_per_second": 0.085 } ], "logging_steps": 10, "max_steps": 329, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }