{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 100, "global_step": 395, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005063291139240506, "grad_norm": 520145.53108452284, "learning_rate": 1.4084507042253521e-09, "logits/chosen": -16.270591735839844, "logits/rejected": -16.343984603881836, "logps/chosen": -186.17276000976562, "logps/rejected": -175.8095703125, "loss": 122464.3125, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.05063291139240506, "grad_norm": 501181.05233525805, "learning_rate": 1.408450704225352e-08, "logits/chosen": -17.194263458251953, "logits/rejected": -17.04476547241211, "logps/chosen": -220.64031982421875, "logps/rejected": -220.79531860351562, "loss": 124716.2917, "rewards/accuracies": 0.4583333432674408, "rewards/chosen": 1.5937095554363623e-07, "rewards/margins": 1.5358187738456763e-05, "rewards/rejected": -1.5198814253380988e-05, "step": 10 }, { "epoch": 0.10126582278481013, "grad_norm": 537058.8643033113, "learning_rate": 2.816901408450704e-08, "logits/chosen": -16.468345642089844, "logits/rejected": -16.397050857543945, "logps/chosen": -238.16464233398438, "logps/rejected": -234.13320922851562, "loss": 125132.075, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -7.482715773221571e-06, "rewards/margins": 1.4337347238324583e-05, "rewards/rejected": -2.1820069378009066e-05, "step": 20 }, { "epoch": 0.1518987341772152, "grad_norm": 457257.68659374124, "learning_rate": 4.2253521126760564e-08, "logits/chosen": -16.952747344970703, "logits/rejected": -16.70650863647461, "logps/chosen": -242.9259490966797, "logps/rejected": -242.9457244873047, "loss": 124660.25, "rewards/accuracies": 0.4375, "rewards/chosen": -3.21022052958142e-05, "rewards/margins": 3.5706521885003895e-05, "rewards/rejected": -6.780872354283929e-05, "step": 30 }, { "epoch": 0.20253164556962025, "grad_norm": 520967.9129238899, "learning_rate": 5.633802816901408e-08, "logits/chosen": -16.920284271240234, "logits/rejected": -16.8529052734375, "logps/chosen": -243.7992706298828, "logps/rejected": -244.38906860351562, "loss": 124148.0625, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0001235240779351443, "rewards/margins": 8.850651647662744e-05, "rewards/rejected": -0.00021203060168772936, "step": 40 }, { "epoch": 0.25316455696202533, "grad_norm": 722258.4292859514, "learning_rate": 7.042253521126761e-08, "logits/chosen": -16.24307632446289, "logits/rejected": -16.294937133789062, "logps/chosen": -238.68148803710938, "logps/rejected": -240.46337890625, "loss": 125272.85, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.00024854010553099215, "rewards/margins": -7.368279329966754e-05, "rewards/rejected": -0.00017485734133515507, "step": 50 }, { "epoch": 0.3037974683544304, "grad_norm": 521725.51159479923, "learning_rate": 8.450704225352113e-08, "logits/chosen": -16.547048568725586, "logits/rejected": -16.562244415283203, "logps/chosen": -234.24453735351562, "logps/rejected": -236.03823852539062, "loss": 123692.1, "rewards/accuracies": 0.4375, "rewards/chosen": -0.00036723288940265775, "rewards/margins": 6.122588274592999e-06, "rewards/rejected": -0.00037335552042350173, "step": 60 }, { "epoch": 0.35443037974683544, "grad_norm": 446768.20251500694, "learning_rate": 9.859154929577463e-08, "logits/chosen": -16.514156341552734, "logits/rejected": -16.41303062438965, "logps/chosen": -240.8957061767578, "logps/rejected": -235.2915496826172, "loss": 125937.8, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.00028800699510611594, "rewards/margins": 0.00021416530944406986, "rewards/rejected": -0.0005021723336540163, "step": 70 }, { "epoch": 0.4050632911392405, "grad_norm": 463557.5011981856, "learning_rate": 1.1267605633802817e-07, "logits/chosen": -16.711376190185547, "logits/rejected": -16.489612579345703, "logps/chosen": -243.5523681640625, "logps/rejected": -228.8307342529297, "loss": 125818.525, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0005035396316088736, "rewards/margins": 6.90509841660969e-05, "rewards/rejected": -0.0005725906230509281, "step": 80 }, { "epoch": 0.45569620253164556, "grad_norm": 465137.87035599066, "learning_rate": 1.2676056338028167e-07, "logits/chosen": -17.326900482177734, "logits/rejected": -17.396936416625977, "logps/chosen": -240.1623077392578, "logps/rejected": -234.27578735351562, "loss": 123894.4, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.0008088796166703105, "rewards/margins": -0.00010466824460308999, "rewards/rejected": -0.0007042114739306271, "step": 90 }, { "epoch": 0.5063291139240507, "grad_norm": 505006.4054603859, "learning_rate": 1.4084507042253522e-07, "logits/chosen": -16.5346736907959, "logits/rejected": -16.46234893798828, "logps/chosen": -238.9674530029297, "logps/rejected": -235.36239624023438, "loss": 126640.2125, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.0006280581001192331, "rewards/margins": 2.4443055735900998e-05, "rewards/rejected": -0.0006525011267513037, "step": 100 }, { "epoch": 0.5569620253164557, "grad_norm": 475489.46555727004, "learning_rate": 1.549295774647887e-07, "logits/chosen": -16.67499351501465, "logits/rejected": -16.584075927734375, "logps/chosen": -240.5388946533203, "logps/rejected": -239.03366088867188, "loss": 122706.3, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.0006213908782228827, "rewards/margins": 0.00010261077841278166, "rewards/rejected": -0.0007240016711875796, "step": 110 }, { "epoch": 0.6075949367088608, "grad_norm": 492764.07090207015, "learning_rate": 1.6901408450704225e-07, "logits/chosen": -16.746532440185547, "logits/rejected": -16.617717742919922, "logps/chosen": -227.05398559570312, "logps/rejected": -225.60214233398438, "loss": 126588.925, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.0008889889577403665, "rewards/margins": 2.5076475139940158e-05, "rewards/rejected": -0.0009140653419308364, "step": 120 }, { "epoch": 0.6582278481012658, "grad_norm": 511084.4558498889, "learning_rate": 1.8309859154929577e-07, "logits/chosen": -16.747934341430664, "logits/rejected": -16.733430862426758, "logps/chosen": -240.7227325439453, "logps/rejected": -240.2967529296875, "loss": 125175.5125, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.0010187395382672548, "rewards/margins": 0.002705145161598921, "rewards/rejected": -0.003723885165527463, "step": 130 }, { "epoch": 0.7088607594936709, "grad_norm": 540454.6644647518, "learning_rate": 1.9718309859154927e-07, "logits/chosen": -16.1859073638916, "logits/rejected": -16.264835357666016, "logps/chosen": -231.37173461914062, "logps/rejected": -227.0606689453125, "loss": 126058.6375, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.0008466474828310311, "rewards/margins": -4.7403918870259076e-05, "rewards/rejected": -0.0007992436294443905, "step": 140 }, { "epoch": 0.759493670886076, "grad_norm": 503077.16971538117, "learning_rate": 2.112676056338028e-07, "logits/chosen": -17.280269622802734, "logits/rejected": -17.093780517578125, "logps/chosen": -238.0977325439453, "logps/rejected": -238.93212890625, "loss": 126646.125, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.0008783842204138637, "rewards/margins": 0.00026031016022898257, "rewards/rejected": -0.0011386943515390158, "step": 150 }, { "epoch": 0.810126582278481, "grad_norm": 541715.9624559938, "learning_rate": 2.2535211267605633e-07, "logits/chosen": -16.782550811767578, "logits/rejected": -16.79593276977539, "logps/chosen": -250.48593139648438, "logps/rejected": -249.44924926757812, "loss": 124718.425, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.0012445250758901238, "rewards/margins": 5.686017539119348e-05, "rewards/rejected": -0.0013013852294534445, "step": 160 }, { "epoch": 0.8607594936708861, "grad_norm": 548905.0358445289, "learning_rate": 2.394366197183098e-07, "logits/chosen": -17.04167938232422, "logits/rejected": -16.985572814941406, "logps/chosen": -255.06942749023438, "logps/rejected": -260.38128662109375, "loss": 125650.7625, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0021144188940525055, "rewards/margins": 0.001583110773935914, "rewards/rejected": -0.003697529900819063, "step": 170 }, { "epoch": 0.9113924050632911, "grad_norm": 561549.4959644328, "learning_rate": 2.5352112676056334e-07, "logits/chosen": -16.703407287597656, "logits/rejected": -16.487037658691406, "logps/chosen": -232.529052734375, "logps/rejected": -228.3297576904297, "loss": 127298.1375, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.002397818025201559, "rewards/margins": 0.0011578220874071121, "rewards/rejected": -0.003555640112608671, "step": 180 }, { "epoch": 0.9620253164556962, "grad_norm": 565071.0053763993, "learning_rate": 2.6760563380281686e-07, "logits/chosen": -16.11090660095215, "logits/rejected": -16.053157806396484, "logps/chosen": -239.39205932617188, "logps/rejected": -235.435791015625, "loss": 127009.225, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.0010291519574820995, "rewards/margins": 0.00019036220328416675, "rewards/rejected": -0.0012195140589028597, "step": 190 }, { "epoch": 1.0126582278481013, "grad_norm": 497332.98430491646, "learning_rate": 2.8169014084507043e-07, "logits/chosen": -16.127140045166016, "logits/rejected": -15.988116264343262, "logps/chosen": -225.9070587158203, "logps/rejected": -227.90145874023438, "loss": 126358.875, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0012379485415294766, "rewards/margins": 0.0006675361073575914, "rewards/rejected": -0.001905484707094729, "step": 200 }, { "epoch": 1.0632911392405062, "grad_norm": 935063.760892245, "learning_rate": 2.957746478873239e-07, "logits/chosen": -16.701793670654297, "logits/rejected": -16.669902801513672, "logps/chosen": -230.3677520751953, "logps/rejected": -229.03921508789062, "loss": 124250.775, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.0006614397279918194, "rewards/margins": 0.003238010685890913, "rewards/rejected": -0.003899450646713376, "step": 210 }, { "epoch": 1.1139240506329113, "grad_norm": 517399.2020129059, "learning_rate": 3.098591549295774e-07, "logits/chosen": -16.413972854614258, "logits/rejected": -16.371458053588867, "logps/chosen": -247.8984832763672, "logps/rejected": -249.5322723388672, "loss": 124993.7375, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.0012706981506198645, "rewards/margins": 0.003060612827539444, "rewards/rejected": -0.004331310745328665, "step": 220 }, { "epoch": 1.1645569620253164, "grad_norm": 499036.7717944408, "learning_rate": 3.23943661971831e-07, "logits/chosen": -15.908624649047852, "logits/rejected": -15.847338676452637, "logps/chosen": -236.7013397216797, "logps/rejected": -239.3136749267578, "loss": 122842.5, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0006655483739450574, "rewards/margins": 0.0032406128011643887, "rewards/rejected": -0.0039061610586941242, "step": 230 }, { "epoch": 1.2151898734177216, "grad_norm": 540681.7856619481, "learning_rate": 3.380281690140845e-07, "logits/chosen": -16.052249908447266, "logits/rejected": -15.99653148651123, "logps/chosen": -229.74832153320312, "logps/rejected": -230.9803009033203, "loss": 124587.3625, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.0007962372037582099, "rewards/margins": 0.0025483998470008373, "rewards/rejected": -0.003344637108966708, "step": 240 }, { "epoch": 1.2658227848101267, "grad_norm": 1023950.8355601664, "learning_rate": 3.52112676056338e-07, "logits/chosen": -15.299288749694824, "logits/rejected": -15.215815544128418, "logps/chosen": -231.2301788330078, "logps/rejected": -232.03359985351562, "loss": 121822.4, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -7.512583579227794e-06, "rewards/margins": 0.003883513854816556, "rewards/rejected": -0.0038910270668566227, "step": 250 }, { "epoch": 1.3164556962025316, "grad_norm": 620253.8184950812, "learning_rate": 3.6619718309859155e-07, "logits/chosen": -16.167770385742188, "logits/rejected": -15.915590286254883, "logps/chosen": -238.9904327392578, "logps/rejected": -239.73953247070312, "loss": 123388.8625, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.00017356239550281316, "rewards/margins": 0.0050824107602238655, "rewards/rejected": -0.005255972500890493, "step": 260 }, { "epoch": 1.3670886075949367, "grad_norm": 575104.3218096169, "learning_rate": 3.8028169014084507e-07, "logits/chosen": -15.480558395385742, "logits/rejected": -15.386639595031738, "logps/chosen": -241.60879516601562, "logps/rejected": -250.003173828125, "loss": 123555.7, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.001139859901741147, "rewards/margins": 0.005077657289803028, "rewards/rejected": -0.0062175169587135315, "step": 270 }, { "epoch": 1.4177215189873418, "grad_norm": 601224.4433091934, "learning_rate": 3.9436619718309853e-07, "logits/chosen": -15.266016960144043, "logits/rejected": -15.313554763793945, "logps/chosen": -230.73397827148438, "logps/rejected": -237.3317108154297, "loss": 125556.675, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.0007209269679151475, "rewards/margins": 0.00534270191565156, "rewards/rejected": -0.004621774889528751, "step": 280 }, { "epoch": 1.4683544303797469, "grad_norm": 751936.3077706753, "learning_rate": 4.084507042253521e-07, "logits/chosen": -14.600263595581055, "logits/rejected": -14.538311958312988, "logps/chosen": -224.1177520751953, "logps/rejected": -226.97879028320312, "loss": 123584.675, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.0011863496620208025, "rewards/margins": 0.007649322040379047, "rewards/rejected": -0.006462973542511463, "step": 290 }, { "epoch": 1.518987341772152, "grad_norm": 575660.5828565176, "learning_rate": 4.225352112676056e-07, "logits/chosen": -14.935551643371582, "logits/rejected": -15.062429428100586, "logps/chosen": -235.7123565673828, "logps/rejected": -245.36181640625, "loss": 122562.1375, "rewards/accuracies": 0.75, "rewards/chosen": 0.0014863747637718916, "rewards/margins": 0.0057060932740569115, "rewards/rejected": -0.0042197187431156635, "step": 300 }, { "epoch": 1.5696202531645569, "grad_norm": 619514.1083852616, "learning_rate": 4.366197183098591e-07, "logits/chosen": -14.678690910339355, "logits/rejected": -14.617218017578125, "logps/chosen": -229.6386260986328, "logps/rejected": -234.1474151611328, "loss": 123630.225, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.0006864996394142509, "rewards/margins": 0.004933560267090797, "rewards/rejected": -0.004247060976922512, "step": 310 }, { "epoch": 1.620253164556962, "grad_norm": 738538.1512211321, "learning_rate": 4.5070422535211266e-07, "logits/chosen": -14.131611824035645, "logits/rejected": -14.156657218933105, "logps/chosen": -241.20156860351562, "logps/rejected": -248.2321319580078, "loss": 124158.6, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.0009155808947980404, "rewards/margins": 0.006913213524967432, "rewards/rejected": -0.007828795351088047, "step": 320 }, { "epoch": 1.6708860759493671, "grad_norm": 688317.7143989427, "learning_rate": 4.647887323943662e-07, "logits/chosen": -13.791796684265137, "logits/rejected": -13.970884323120117, "logps/chosen": -228.53079223632812, "logps/rejected": -235.5008087158203, "loss": 123378.175, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.0017698236042633653, "rewards/margins": 0.006004182621836662, "rewards/rejected": -0.004234359599649906, "step": 330 }, { "epoch": 1.721518987341772, "grad_norm": 693314.5034252935, "learning_rate": 4.788732394366196e-07, "logits/chosen": -13.555567741394043, "logits/rejected": -13.32630729675293, "logps/chosen": -227.0249481201172, "logps/rejected": -232.2772216796875, "loss": 122521.475, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.001143553527072072, "rewards/margins": 0.009070896543562412, "rewards/rejected": -0.00792734231799841, "step": 340 }, { "epoch": 1.7721518987341773, "grad_norm": 758709.6120906892, "learning_rate": 4.929577464788733e-07, "logits/chosen": -13.520563125610352, "logits/rejected": -13.633130073547363, "logps/chosen": -234.7182159423828, "logps/rejected": -248.12890625, "loss": 121557.575, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.00047356385039165616, "rewards/margins": 0.00813873577862978, "rewards/rejected": -0.008612299337983131, "step": 350 }, { "epoch": 1.8227848101265822, "grad_norm": 689974.393201542, "learning_rate": 4.992165465371357e-07, "logits/chosen": -12.841153144836426, "logits/rejected": -12.86094856262207, "logps/chosen": -232.314697265625, "logps/rejected": -232.64297485351562, "loss": 121436.65, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.0036194869317114353, "rewards/margins": 0.009506477043032646, "rewards/rejected": -0.005886988714337349, "step": 360 }, { "epoch": 1.8734177215189873, "grad_norm": 883375.543329047, "learning_rate": 4.976496396114071e-07, "logits/chosen": -12.77904224395752, "logits/rejected": -12.76900577545166, "logps/chosen": -239.8730010986328, "logps/rejected": -251.4569549560547, "loss": 122456.925, "rewards/accuracies": 0.75, "rewards/chosen": -0.0006393647054210305, "rewards/margins": 0.008665768429636955, "rewards/rejected": -0.009305133484303951, "step": 370 }, { "epoch": 1.9240506329113924, "grad_norm": 797554.0864386982, "learning_rate": 4.960827326856785e-07, "logits/chosen": -13.028135299682617, "logits/rejected": -13.148831367492676, "logps/chosen": -237.040771484375, "logps/rejected": -244.45181274414062, "loss": 124907.725, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.0025544934906065464, "rewards/margins": 0.008132859133183956, "rewards/rejected": -0.005578366108238697, "step": 380 }, { "epoch": 1.9746835443037973, "grad_norm": 793120.1180084129, "learning_rate": 4.945158257599498e-07, "logits/chosen": -12.312803268432617, "logits/rejected": -12.135167121887207, "logps/chosen": -235.60360717773438, "logps/rejected": -242.9219207763672, "loss": 121583.8, "rewards/accuracies": 0.75, "rewards/chosen": 0.003660207614302635, "rewards/margins": 0.011001082137227058, "rewards/rejected": -0.007340874522924423, "step": 390 } ], "logging_steps": 10, "max_steps": 3546, "num_input_tokens_seen": 0, "num_train_epochs": 18, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }