{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9988623435722411, "eval_steps": 10000000, "global_step": 439, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 33.30332403665468, "learning_rate": 2.2727272727272727e-09, "logits/chosen": -1.6768856048583984, "logits/rejected": -1.7259055376052856, "logps/chosen": -1.2793102264404297, "logps/rejected": -1.2162058353424072, "loss": 1.3133, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.02, "grad_norm": 37.64303926905523, "learning_rate": 2.2727272727272725e-08, "logits/chosen": -1.7033135890960693, "logits/rejected": -1.668673038482666, "logps/chosen": -1.2131016254425049, "logps/rejected": -1.22050142288208, "loss": 1.313, "rewards/accuracies": 0.4513888955116272, "rewards/chosen": 0.00040783319855108857, "rewards/margins": -8.263149356935173e-05, "rewards/rejected": 0.0004904646775685251, "step": 10 }, { "epoch": 0.05, "grad_norm": 38.69260337999141, "learning_rate": 4.545454545454545e-08, "logits/chosen": -1.7795250415802002, "logits/rejected": -1.7348783016204834, "logps/chosen": -1.1448484659194946, "logps/rejected": -1.1852957010269165, "loss": 1.3122, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.003251913469284773, "rewards/margins": 0.0004202231648378074, "rewards/rejected": -0.003672136692330241, "step": 20 }, { "epoch": 0.07, "grad_norm": 44.09359407998382, "learning_rate": 6.818181818181817e-08, "logits/chosen": -1.7442439794540405, "logits/rejected": -1.6752439737319946, "logps/chosen": -1.1954559087753296, "logps/rejected": -1.248280644416809, "loss": 1.3059, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.025108838453888893, "rewards/margins": 0.01119022723287344, "rewards/rejected": -0.03629906848073006, "step": 30 }, { "epoch": 0.09, "grad_norm": 28.883029165176804, "learning_rate": 9.09090909090909e-08, "logits/chosen": -1.7305904626846313, "logits/rejected": -1.6642875671386719, "logps/chosen": -1.2533624172210693, "logps/rejected": -1.3383153676986694, "loss": 1.2922, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.09282750636339188, "rewards/margins": 0.06525905430316925, "rewards/rejected": -0.15808656811714172, "step": 40 }, { "epoch": 0.11, "grad_norm": 36.39900209589975, "learning_rate": 9.994307990108962e-08, "logits/chosen": -1.690720796585083, "logits/rejected": -1.625451683998108, "logps/chosen": -1.3044583797454834, "logps/rejected": -1.3643444776535034, "loss": 1.2643, "rewards/accuracies": 0.71875, "rewards/chosen": -0.19274269044399261, "rewards/margins": 0.0795869454741478, "rewards/rejected": -0.2723296284675598, "step": 50 }, { "epoch": 0.14, "grad_norm": 43.081578827458706, "learning_rate": 9.959570405988094e-08, "logits/chosen": -1.71735417842865, "logits/rejected": -1.6361076831817627, "logps/chosen": -1.3119524717330933, "logps/rejected": -1.4046932458877563, "loss": 1.2541, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.4057086110115051, "rewards/margins": 0.0816243588924408, "rewards/rejected": -0.4873329699039459, "step": 60 }, { "epoch": 0.16, "grad_norm": 33.06897498171632, "learning_rate": 9.893476820924666e-08, "logits/chosen": -1.7922325134277344, "logits/rejected": -1.7017757892608643, "logps/chosen": -1.5047810077667236, "logps/rejected": -1.630091667175293, "loss": 1.2355, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5838777422904968, "rewards/margins": 0.1572917252779007, "rewards/rejected": -0.7411695718765259, "step": 70 }, { "epoch": 0.18, "grad_norm": 37.24284057004877, "learning_rate": 9.796445099843647e-08, "logits/chosen": -1.774518370628357, "logits/rejected": -1.6856935024261475, "logps/chosen": -1.5832115411758423, "logps/rejected": -1.7514270544052124, "loss": 1.232, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.7524863481521606, "rewards/margins": 0.21537098288536072, "rewards/rejected": -0.9678572416305542, "step": 80 }, { "epoch": 0.2, "grad_norm": 45.064021238231845, "learning_rate": 9.669088708527066e-08, "logits/chosen": -1.7184202671051025, "logits/rejected": -1.6467373371124268, "logps/chosen": -1.7363929748535156, "logps/rejected": -1.8083902597427368, "loss": 1.2104, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.0013912916183472, "rewards/margins": 0.12990526854991913, "rewards/rejected": -1.1312966346740723, "step": 90 }, { "epoch": 0.23, "grad_norm": 44.286763175528534, "learning_rate": 9.512212835085849e-08, "logits/chosen": -1.757889986038208, "logits/rejected": -1.6645339727401733, "logps/chosen": -1.779813528060913, "logps/rejected": -1.9353383779525757, "loss": 1.1819, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.2227165699005127, "rewards/margins": 0.2256297618150711, "rewards/rejected": -1.448346495628357, "step": 100 }, { "epoch": 0.25, "grad_norm": 43.36692624974112, "learning_rate": 9.326809299301306e-08, "logits/chosen": -1.761940360069275, "logits/rejected": -1.6550146341323853, "logps/chosen": -1.8854389190673828, "logps/rejected": -2.1229450702667236, "loss": 1.1674, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.369593620300293, "rewards/margins": 0.371805876493454, "rewards/rejected": -1.7413995265960693, "step": 110 }, { "epoch": 0.27, "grad_norm": 41.78554813342914, "learning_rate": 9.114050282021158e-08, "logits/chosen": -1.7491047382354736, "logits/rejected": -1.6867637634277344, "logps/chosen": -1.8475677967071533, "logps/rejected": -2.0627474784851074, "loss": 1.1591, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.400010108947754, "rewards/margins": 0.3355749249458313, "rewards/rejected": -1.7355849742889404, "step": 120 }, { "epoch": 0.3, "grad_norm": 40.42754129950971, "learning_rate": 8.875280914254802e-08, "logits/chosen": -1.737173080444336, "logits/rejected": -1.644561529159546, "logps/chosen": -2.0521700382232666, "logps/rejected": -2.296677827835083, "loss": 1.1348, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.7186797857284546, "rewards/margins": 0.43216562271118164, "rewards/rejected": -2.1508452892303467, "step": 130 }, { "epoch": 0.32, "grad_norm": 39.13812568144021, "learning_rate": 8.612010772821971e-08, "logits/chosen": -1.7612278461456299, "logits/rejected": -1.715679407119751, "logps/chosen": -2.0781049728393555, "logps/rejected": -2.2759194374084473, "loss": 1.127, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.7284520864486694, "rewards/margins": 0.4454485774040222, "rewards/rejected": -2.173900604248047, "step": 140 }, { "epoch": 0.34, "grad_norm": 37.596667789585375, "learning_rate": 8.325904336322055e-08, "logits/chosen": -1.735419511795044, "logits/rejected": -1.6814868450164795, "logps/chosen": -2.305412769317627, "logps/rejected": -2.55448317527771, "loss": 1.1399, "rewards/accuracies": 0.65625, "rewards/chosen": -2.2760961055755615, "rewards/margins": 0.4037933945655823, "rewards/rejected": -2.679889440536499, "step": 150 }, { "epoch": 0.36, "grad_norm": 37.85769539137667, "learning_rate": 8.01877046176447e-08, "logits/chosen": -1.6751991510391235, "logits/rejected": -1.6064836978912354, "logps/chosen": -2.5598020553588867, "logps/rejected": -2.8157076835632324, "loss": 1.09, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.763003349304199, "rewards/margins": 0.40903931856155396, "rewards/rejected": -3.1720428466796875, "step": 160 }, { "epoch": 0.39, "grad_norm": 33.932219318133306, "learning_rate": 7.692550948392249e-08, "logits/chosen": -1.7231628894805908, "logits/rejected": -1.6755987405776978, "logps/chosen": -2.624762535095215, "logps/rejected": -2.9136133193969727, "loss": 1.1053, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.8290135860443115, "rewards/margins": 0.5568121671676636, "rewards/rejected": -3.3858256340026855, "step": 170 }, { "epoch": 0.41, "grad_norm": 48.11500069751816, "learning_rate": 7.349308261002021e-08, "logits/chosen": -1.6858348846435547, "logits/rejected": -1.6378986835479736, "logps/chosen": -2.640817165374756, "logps/rejected": -2.949113368988037, "loss": 1.0837, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.887462615966797, "rewards/margins": 0.5172919034957886, "rewards/rejected": -3.404754638671875, "step": 180 }, { "epoch": 0.43, "grad_norm": 42.33388198011932, "learning_rate": 6.991212490377531e-08, "logits/chosen": -1.7423484325408936, "logits/rejected": -1.7037559747695923, "logps/chosen": -2.6472008228302, "logps/rejected": -3.0077877044677734, "loss": 1.0335, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.834195613861084, "rewards/margins": 0.6780903339385986, "rewards/rejected": -3.5122859477996826, "step": 190 }, { "epoch": 0.46, "grad_norm": 49.65676542149092, "learning_rate": 6.620527633276978e-08, "logits/chosen": -1.6741564273834229, "logits/rejected": -1.6151821613311768, "logps/chosen": -2.735678195953369, "logps/rejected": -3.225632905960083, "loss": 1.0663, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.0824506282806396, "rewards/margins": 0.80633145570755, "rewards/rejected": -3.888781785964966, "step": 200 }, { "epoch": 0.48, "grad_norm": 46.53275655997813, "learning_rate": 6.239597278716581e-08, "logits/chosen": -1.7146323919296265, "logits/rejected": -1.6657183170318604, "logps/chosen": -3.098931312561035, "logps/rejected": -3.467923641204834, "loss": 1.0287, "rewards/accuracies": 0.71875, "rewards/chosen": -3.707202911376953, "rewards/margins": 0.7793115377426147, "rewards/rejected": -4.486514091491699, "step": 210 }, { "epoch": 0.5, "grad_norm": 47.77625681519385, "learning_rate": 5.8508297910462456e-08, "logits/chosen": -1.6560382843017578, "logits/rejected": -1.5879056453704834, "logps/chosen": -3.1243553161621094, "logps/rejected": -3.6013519763946533, "loss": 1.032, "rewards/accuracies": 0.71875, "rewards/chosen": -3.921679735183716, "rewards/margins": 0.8384466171264648, "rewards/rejected": -4.76012659072876, "step": 220 }, { "epoch": 0.52, "grad_norm": 45.3152158322423, "learning_rate": 5.456683083494731e-08, "logits/chosen": -1.6423381567001343, "logits/rejected": -1.6075971126556396, "logps/chosen": -3.002626419067383, "logps/rejected": -3.339411497116089, "loss": 1.068, "rewards/accuracies": 0.65625, "rewards/chosen": -3.6176345348358154, "rewards/margins": 0.6260865330696106, "rewards/rejected": -4.2437214851379395, "step": 230 }, { "epoch": 0.55, "grad_norm": 50.18712381426658, "learning_rate": 5.059649078450834e-08, "logits/chosen": -1.6221996545791626, "logits/rejected": -1.587894082069397, "logps/chosen": -2.9972426891326904, "logps/rejected": -3.4068732261657715, "loss": 1.0045, "rewards/accuracies": 0.65625, "rewards/chosen": -3.657778263092041, "rewards/margins": 0.6951833963394165, "rewards/rejected": -4.352961540222168, "step": 240 }, { "epoch": 0.57, "grad_norm": 42.94625970616266, "learning_rate": 4.6622379527277186e-08, "logits/chosen": -1.6361802816390991, "logits/rejected": -1.5883018970489502, "logps/chosen": -3.0472984313964844, "logps/rejected": -3.4067275524139404, "loss": 1.0159, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -3.816819429397583, "rewards/margins": 0.65269935131073, "rewards/rejected": -4.469518661499023, "step": 250 }, { "epoch": 0.59, "grad_norm": 49.81189434860217, "learning_rate": 4.26696226741691e-08, "logits/chosen": -1.6441590785980225, "logits/rejected": -1.5848346948623657, "logps/chosen": -3.2412009239196777, "logps/rejected": -3.667572021484375, "loss": 1.0333, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.0167555809021, "rewards/margins": 0.8128072619438171, "rewards/rejected": -4.829562664031982, "step": 260 }, { "epoch": 0.61, "grad_norm": 46.698998113891435, "learning_rate": 3.876321082668098e-08, "logits/chosen": -1.6987736225128174, "logits/rejected": -1.6376842260360718, "logps/chosen": -3.1670312881469727, "logps/rejected": -3.625418186187744, "loss": 1.0046, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -3.857081174850464, "rewards/margins": 0.8981560468673706, "rewards/rejected": -4.755237579345703, "step": 270 }, { "epoch": 0.64, "grad_norm": 54.35348471111713, "learning_rate": 3.492784157826244e-08, "logits/chosen": -1.63980233669281, "logits/rejected": -1.552004337310791, "logps/chosen": -3.2830092906951904, "logps/rejected": -3.8152382373809814, "loss": 1.0119, "rewards/accuracies": 0.71875, "rewards/chosen": -4.120265483856201, "rewards/margins": 0.9936790466308594, "rewards/rejected": -5.113945007324219, "step": 280 }, { "epoch": 0.66, "grad_norm": 44.812750561614926, "learning_rate": 3.118776336817812e-08, "logits/chosen": -1.6625276803970337, "logits/rejected": -1.6122783422470093, "logps/chosen": -3.191256284713745, "logps/rejected": -3.756882429122925, "loss": 0.9859, "rewards/accuracies": 0.71875, "rewards/chosen": -3.97601580619812, "rewards/margins": 1.1168193817138672, "rewards/rejected": -5.092835426330566, "step": 290 }, { "epoch": 0.68, "grad_norm": 45.89134253017904, "learning_rate": 2.7566622175067443e-08, "logits/chosen": -1.6413261890411377, "logits/rejected": -1.5825086832046509, "logps/chosen": -3.339484691619873, "logps/rejected": -3.9588654041290283, "loss": 0.994, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -4.261802673339844, "rewards/margins": 1.0827885866165161, "rewards/rejected": -5.3445916175842285, "step": 300 }, { "epoch": 0.71, "grad_norm": 47.840562340740895, "learning_rate": 2.408731201945432e-08, "logits/chosen": -1.64263117313385, "logits/rejected": -1.6013950109481812, "logps/chosen": -3.251277446746826, "logps/rejected": -3.651395082473755, "loss": 1.0008, "rewards/accuracies": 0.71875, "rewards/chosen": -4.11476469039917, "rewards/margins": 0.7599252462387085, "rewards/rejected": -4.874690532684326, "step": 310 }, { "epoch": 0.73, "grad_norm": 51.79356167073485, "learning_rate": 2.0771830220378112e-08, "logits/chosen": -1.5991486310958862, "logits/rejected": -1.5396713018417358, "logps/chosen": -3.2509543895721436, "logps/rejected": -3.6364498138427734, "loss": 1.0066, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -4.056139945983887, "rewards/margins": 0.7866916060447693, "rewards/rejected": -4.842831611633301, "step": 320 }, { "epoch": 0.75, "grad_norm": 43.99284684689101, "learning_rate": 1.7641138321260257e-08, "logits/chosen": -1.6334537267684937, "logits/rejected": -1.5692901611328125, "logps/chosen": -3.158041477203369, "logps/rejected": -3.8241424560546875, "loss": 0.9807, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -3.9483726024627686, "rewards/margins": 1.2771327495574951, "rewards/rejected": -5.225505352020264, "step": 330 }, { "epoch": 0.77, "grad_norm": 50.47520523412627, "learning_rate": 1.4715029564277793e-08, "logits/chosen": -1.6923463344573975, "logits/rejected": -1.6500104665756226, "logps/chosen": -3.0949554443359375, "logps/rejected": -3.6954338550567627, "loss": 1.0051, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -3.7805895805358887, "rewards/margins": 1.1268298625946045, "rewards/rejected": -4.907419681549072, "step": 340 }, { "epoch": 0.8, "grad_norm": 48.63775480340643, "learning_rate": 1.2012003751113343e-08, "logits/chosen": -1.6796951293945312, "logits/rejected": -1.6264684200286865, "logps/chosen": -3.3736748695373535, "logps/rejected": -3.9650447368621826, "loss": 0.9726, "rewards/accuracies": 0.6875, "rewards/chosen": -4.423010349273682, "rewards/margins": 1.0823583602905273, "rewards/rejected": -5.505368709564209, "step": 350 }, { "epoch": 0.82, "grad_norm": 55.26630420954737, "learning_rate": 9.549150281252633e-09, "logits/chosen": -1.6259968280792236, "logits/rejected": -1.5858485698699951, "logps/chosen": -3.211542844772339, "logps/rejected": -3.735614776611328, "loss": 0.9729, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.078815460205078, "rewards/margins": 0.9446828961372375, "rewards/rejected": -5.02349853515625, "step": 360 }, { "epoch": 0.84, "grad_norm": 51.52261591377872, "learning_rate": 7.3420401072985306e-09, "logits/chosen": -1.6755279302597046, "logits/rejected": -1.6221554279327393, "logps/chosen": -3.299112319946289, "logps/rejected": -3.911120653152466, "loss": 0.9649, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.168996810913086, "rewards/margins": 1.1090896129608154, "rewards/rejected": -5.2780866622924805, "step": 370 }, { "epoch": 0.86, "grad_norm": 51.72886520205544, "learning_rate": 5.404627290395369e-09, "logits/chosen": -1.6374752521514893, "logits/rejected": -1.5786619186401367, "logps/chosen": -3.220484972000122, "logps/rejected": -3.803584337234497, "loss": 0.968, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.061758518218994, "rewards/margins": 1.0862071514129639, "rewards/rejected": -5.147965431213379, "step": 380 }, { "epoch": 0.89, "grad_norm": 55.56376010319163, "learning_rate": 3.74916077816162e-09, "logits/chosen": -1.6384235620498657, "logits/rejected": -1.5836341381072998, "logps/chosen": -3.2395005226135254, "logps/rejected": -3.746983051300049, "loss": 1.0011, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.179410934448242, "rewards/margins": 0.982707142829895, "rewards/rejected": -5.162117958068848, "step": 390 }, { "epoch": 0.91, "grad_norm": 54.81143409505458, "learning_rate": 2.386106962899165e-09, "logits/chosen": -1.5698174238204956, "logits/rejected": -1.5115009546279907, "logps/chosen": -3.4176878929138184, "logps/rejected": -3.958037853240967, "loss": 0.9695, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.382534027099609, "rewards/margins": 0.9680202603340149, "rewards/rejected": -5.350554466247559, "step": 400 }, { "epoch": 0.93, "grad_norm": 56.75402221437199, "learning_rate": 1.3240835096913706e-09, "logits/chosen": -1.594696283340454, "logits/rejected": -1.502890944480896, "logps/chosen": -3.213305711746216, "logps/rejected": -3.921264171600342, "loss": 1.0286, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.083509922027588, "rewards/margins": 1.2770874500274658, "rewards/rejected": -5.360597133636475, "step": 410 }, { "epoch": 0.96, "grad_norm": 47.085112169528884, "learning_rate": 5.698048727497462e-10, "logits/chosen": -1.6298091411590576, "logits/rejected": -1.5658090114593506, "logps/chosen": -3.3380351066589355, "logps/rejected": -3.9660251140594482, "loss": 0.983, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.293475151062012, "rewards/margins": 1.1220663785934448, "rewards/rejected": -5.415541648864746, "step": 420 }, { "epoch": 0.98, "grad_norm": 46.70771599324875, "learning_rate": 1.2803984447259387e-10, "logits/chosen": -1.6368719339370728, "logits/rejected": -1.5942411422729492, "logps/chosen": -3.3361122608184814, "logps/rejected": -3.98066782951355, "loss": 0.9434, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.3075456619262695, "rewards/margins": 1.1938055753707886, "rewards/rejected": -5.501351356506348, "step": 430 }, { "epoch": 1.0, "step": 439, "total_flos": 0.0, "train_loss": 1.0809600353240967, "train_runtime": 6838.8864, "train_samples_per_second": 8.223, "train_steps_per_second": 0.064 } ], "logging_steps": 10, "max_steps": 439, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }