{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9982003599280144, "eval_steps": 500, "global_step": 416, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 1.1904761904761906e-07, "logits/chosen": 0.06842132657766342, "logits/rejected": 0.05148967728018761, "logps/chosen": -254.1962432861328, "logps/rejected": -268.0105285644531, "loss": 0.3778, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.02, "learning_rate": 1.1904761904761906e-06, "logits/chosen": 0.14202657341957092, "logits/rejected": 0.2216137945652008, "logps/chosen": -380.2115478515625, "logps/rejected": -331.30743408203125, "loss": 0.3702, "rewards/accuracies": 0.3819444477558136, "rewards/chosen": 5.156885163160041e-05, "rewards/margins": -8.286008232971653e-05, "rewards/rejected": 0.00013442893396131694, "step": 10 }, { "epoch": 0.05, "learning_rate": 2.380952380952381e-06, "logits/chosen": 0.11058167368173599, "logits/rejected": 0.1396401971578598, "logps/chosen": -336.24676513671875, "logps/rejected": -305.8110656738281, "loss": 0.3688, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 2.6639914722181857e-05, "rewards/margins": 8.525094017386436e-07, "rewards/rejected": 2.578740895842202e-05, "step": 20 }, { "epoch": 0.07, "learning_rate": 3.5714285714285718e-06, "logits/chosen": 0.12251333147287369, "logits/rejected": 0.22839057445526123, "logps/chosen": -353.02825927734375, "logps/rejected": -329.4814147949219, "loss": 0.3833, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.00021826496231369674, "rewards/margins": 0.0004608921299222857, "rewards/rejected": -0.00024262710940092802, "step": 30 }, { "epoch": 0.1, "learning_rate": 4.761904761904762e-06, "logits/chosen": 0.10691192001104355, "logits/rejected": 0.16547340154647827, "logps/chosen": -330.2546691894531, "logps/rejected": -321.59222412109375, "loss": 0.3676, "rewards/accuracies": 0.65625, "rewards/chosen": 0.000455420755315572, "rewards/margins": 0.0009521494503132999, "rewards/rejected": -0.0004967286949977279, "step": 40 }, { "epoch": 0.12, "learning_rate": 4.994357350311441e-06, "logits/chosen": 0.11281980574131012, "logits/rejected": 0.1559869647026062, "logps/chosen": -333.9978942871094, "logps/rejected": -305.3361511230469, "loss": 0.3648, "rewards/accuracies": 0.65625, "rewards/chosen": 0.0016132129821926355, "rewards/margins": 0.002727704355493188, "rewards/rejected": -0.0011144911404699087, "step": 50 }, { "epoch": 0.14, "learning_rate": 4.97147773390341e-06, "logits/chosen": 0.1588975489139557, "logits/rejected": 0.22720813751220703, "logps/chosen": -328.2370300292969, "logps/rejected": -326.60650634765625, "loss": 0.3615, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.002356642857193947, "rewards/margins": 0.006414002738893032, "rewards/rejected": -0.004057359881699085, "step": 60 }, { "epoch": 0.17, "learning_rate": 4.931169703639282e-06, "logits/chosen": 0.12001170963048935, "logits/rejected": 0.2352440357208252, "logps/chosen": -363.2486267089844, "logps/rejected": -353.33795166015625, "loss": 0.3571, "rewards/accuracies": 0.71875, "rewards/chosen": 0.001306799822486937, "rewards/margins": 0.013176659122109413, "rewards/rejected": -0.011869858019053936, "step": 70 }, { "epoch": 0.19, "learning_rate": 4.873717504456219e-06, "logits/chosen": 0.12399880588054657, "logits/rejected": 0.13322117924690247, "logps/chosen": -351.8349304199219, "logps/rejected": -346.0111999511719, "loss": 0.3553, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.01495267916470766, "rewards/margins": 0.018836025148630142, "rewards/rejected": -0.03378870338201523, "step": 80 }, { "epoch": 0.22, "learning_rate": 4.7995262788689865e-06, "logits/chosen": 0.15232697129249573, "logits/rejected": 0.16080796718597412, "logps/chosen": -415.25408935546875, "logps/rejected": -472.15838623046875, "loss": 0.3288, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.060072846710681915, "rewards/margins": 0.05840995907783508, "rewards/rejected": -0.118482805788517, "step": 90 }, { "epoch": 0.24, "learning_rate": 4.709119209978242e-06, "logits/chosen": 0.13195478916168213, "logits/rejected": 0.1627262532711029, "logps/chosen": -528.2550659179688, "logps/rejected": -582.0850219726562, "loss": 0.3394, "rewards/accuracies": 0.65625, "rewards/chosen": -0.16017559170722961, "rewards/margins": 0.08082611858844757, "rewards/rejected": -0.241001695394516, "step": 100 }, { "epoch": 0.26, "learning_rate": 4.603133832077953e-06, "logits/chosen": 0.07086379081010818, "logits/rejected": 0.13402250409126282, "logps/chosen": -478.28155517578125, "logps/rejected": -536.5116577148438, "loss": 0.3097, "rewards/accuracies": 0.6875, "rewards/chosen": -0.1429453045129776, "rewards/margins": 0.08558131754398346, "rewards/rejected": -0.22852663695812225, "step": 110 }, { "epoch": 0.29, "learning_rate": 4.482317534878901e-06, "logits/chosen": 0.04285923391580582, "logits/rejected": 0.1404789388179779, "logps/chosen": -488.8091735839844, "logps/rejected": -566.8741455078125, "loss": 0.304, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.165468230843544, "rewards/margins": 0.09716635942459106, "rewards/rejected": -0.2626345753669739, "step": 120 }, { "epoch": 0.31, "learning_rate": 4.3475222930516484e-06, "logits/chosen": 0.0852217823266983, "logits/rejected": 0.12218357622623444, "logps/chosen": -495.8277282714844, "logps/rejected": -630.5374145507812, "loss": 0.305, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.18193328380584717, "rewards/margins": 0.1324920505285263, "rewards/rejected": -0.3144252896308899, "step": 130 }, { "epoch": 0.34, "learning_rate": 4.199698658255298e-06, "logits/chosen": 0.10588987171649933, "logits/rejected": 0.13151074945926666, "logps/chosen": -534.0150146484375, "logps/rejected": -573.1961059570312, "loss": 0.3174, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.1705319881439209, "rewards/margins": 0.08429961651563644, "rewards/rejected": -0.25483161211013794, "step": 140 }, { "epoch": 0.36, "learning_rate": 4.039889056019159e-06, "logits/chosen": 0.14604777097702026, "logits/rejected": 0.17551526427268982, "logps/chosen": -508.1181640625, "logps/rejected": -601.8496704101562, "loss": 0.2995, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.15790864825248718, "rewards/margins": 0.10467412322759628, "rewards/rejected": -0.2625827491283417, "step": 150 }, { "epoch": 0.38, "learning_rate": 3.869220434746509e-06, "logits/chosen": 0.11204711347818375, "logits/rejected": 0.12582647800445557, "logps/chosen": -511.799072265625, "logps/rejected": -630.4769287109375, "loss": 0.3083, "rewards/accuracies": 0.71875, "rewards/chosen": -0.17171703279018402, "rewards/margins": 0.12071826308965683, "rewards/rejected": -0.29243525862693787, "step": 160 }, { "epoch": 0.41, "learning_rate": 3.688896318678322e-06, "logits/chosen": 0.08149586617946625, "logits/rejected": 0.1699526309967041, "logps/chosen": -502.21331787109375, "logps/rejected": -597.2528076171875, "loss": 0.3003, "rewards/accuracies": 0.78125, "rewards/chosen": -0.1475558578968048, "rewards/margins": 0.137506365776062, "rewards/rejected": -0.28506219387054443, "step": 170 }, { "epoch": 0.43, "learning_rate": 3.5001883208580668e-06, "logits/chosen": 0.06874342262744904, "logits/rejected": 0.17765206098556519, "logps/chosen": -517.6918334960938, "logps/rejected": -670.2382202148438, "loss": 0.2989, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.18062789738178253, "rewards/margins": 0.14384225010871887, "rewards/rejected": -0.3244701325893402, "step": 180 }, { "epoch": 0.46, "learning_rate": 3.30442717594657e-06, "logits/chosen": 0.08455907553434372, "logits/rejected": 0.16155509650707245, "logps/chosen": -573.642333984375, "logps/rejected": -693.3612060546875, "loss": 0.2908, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.21603624522686005, "rewards/margins": 0.11141183227300644, "rewards/rejected": -0.3274480700492859, "step": 190 }, { "epoch": 0.48, "learning_rate": 3.102993356121938e-06, "logits/chosen": 0.13803192973136902, "logits/rejected": 0.13260796666145325, "logps/chosen": -504.03070068359375, "logps/rejected": -662.9249267578125, "loss": 0.2837, "rewards/accuracies": 0.65625, "rewards/chosen": -0.20684099197387695, "rewards/margins": 0.132938414812088, "rewards/rejected": -0.33977940678596497, "step": 200 }, { "epoch": 0.5, "learning_rate": 2.8973073362395e-06, "logits/chosen": 0.1248478889465332, "logits/rejected": 0.18150393664836884, "logps/chosen": -556.743896484375, "logps/rejected": -682.061279296875, "loss": 0.3043, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.1968514621257782, "rewards/margins": 0.12893599271774292, "rewards/rejected": -0.3257874846458435, "step": 210 }, { "epoch": 0.53, "learning_rate": 2.6888195769001147e-06, "logits/chosen": 0.12090057134628296, "logits/rejected": 0.16103163361549377, "logps/chosen": -576.2421875, "logps/rejected": -636.4088134765625, "loss": 0.2942, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.17859044671058655, "rewards/margins": 0.11207801103591919, "rewards/rejected": -0.29066842794418335, "step": 220 }, { "epoch": 0.55, "learning_rate": 2.479000296064417e-06, "logits/chosen": 0.11349457502365112, "logits/rejected": 0.17926748096942902, "logps/chosen": -535.904052734375, "logps/rejected": -646.5347900390625, "loss": 0.3018, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.18949127197265625, "rewards/margins": 0.14192432165145874, "rewards/rejected": -0.3314156234264374, "step": 230 }, { "epoch": 0.58, "learning_rate": 2.269329101341745e-06, "logits/chosen": 0.08516987413167953, "logits/rejected": 0.14970402419567108, "logps/chosen": -517.2757568359375, "logps/rejected": -607.4237060546875, "loss": 0.2892, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.17799392342567444, "rewards/margins": 0.11533834785223007, "rewards/rejected": -0.2933322489261627, "step": 240 }, { "epoch": 0.6, "learning_rate": 2.06128455606496e-06, "logits/chosen": 0.08016245067119598, "logits/rejected": 0.16319520771503448, "logps/chosen": -497.9122619628906, "logps/rejected": -589.7242431640625, "loss": 0.3044, "rewards/accuracies": 0.65625, "rewards/chosen": -0.19338397681713104, "rewards/margins": 0.09803850203752518, "rewards/rejected": -0.2914224863052368, "step": 250 }, { "epoch": 0.62, "learning_rate": 1.856333752729311e-06, "logits/chosen": 0.15101440250873566, "logits/rejected": 0.171871617436409, "logps/chosen": -571.2918090820312, "logps/rejected": -696.5802001953125, "loss": 0.3184, "rewards/accuracies": 0.71875, "rewards/chosen": -0.20878608524799347, "rewards/margins": 0.13470368087291718, "rewards/rejected": -0.34348976612091064, "step": 260 }, { "epoch": 0.65, "learning_rate": 1.6559219673215784e-06, "logits/chosen": 0.13930802047252655, "logits/rejected": 0.20976956188678741, "logps/chosen": -531.32177734375, "logps/rejected": -613.5001220703125, "loss": 0.2899, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.17231304943561554, "rewards/margins": 0.12125241756439209, "rewards/rejected": -0.29356545209884644, "step": 270 }, { "epoch": 0.67, "learning_rate": 1.4614624674952843e-06, "logits/chosen": 0.11396761983633041, "logits/rejected": 0.2314365655183792, "logps/chosen": -552.0916748046875, "logps/rejected": -652.0897827148438, "loss": 0.2926, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.18660280108451843, "rewards/margins": 0.11572613567113876, "rewards/rejected": -0.3023289442062378, "step": 280 }, { "epoch": 0.7, "learning_rate": 1.2743265464628787e-06, "logits/chosen": 0.06689377129077911, "logits/rejected": 0.1653607338666916, "logps/chosen": -559.4171142578125, "logps/rejected": -663.8862915039062, "loss": 0.2877, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.19495370984077454, "rewards/margins": 0.13509666919708252, "rewards/rejected": -0.33005034923553467, "step": 290 }, { "epoch": 0.72, "learning_rate": 1.0958338528840893e-06, "logits/chosen": 0.0707981139421463, "logits/rejected": 0.19608157873153687, "logps/chosen": -585.0418090820312, "logps/rejected": -708.06689453125, "loss": 0.2956, "rewards/accuracies": 0.71875, "rewards/chosen": -0.21173438429832458, "rewards/margins": 0.14347293972969055, "rewards/rejected": -0.3552073538303375, "step": 300 }, { "epoch": 0.74, "learning_rate": 9.272430849423175e-07, "logits/chosen": 0.12051234394311905, "logits/rejected": 0.22985529899597168, "logps/chosen": -581.6112670898438, "logps/rejected": -700.2015380859375, "loss": 0.2826, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.1966112107038498, "rewards/margins": 0.1535426378250122, "rewards/rejected": -0.3501538634300232, "step": 310 }, { "epoch": 0.77, "learning_rate": 7.697431142327633e-07, "logits/chosen": 0.14221158623695374, "logits/rejected": 0.16759946942329407, "logps/chosen": -543.4645385742188, "logps/rejected": -653.2728271484375, "loss": 0.2983, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.19766229391098022, "rewards/margins": 0.13313183188438416, "rewards/rejected": -0.33079415559768677, "step": 320 }, { "epoch": 0.79, "learning_rate": 6.244446020550182e-07, "logits/chosen": 0.1523372381925583, "logits/rejected": 0.17464013397693634, "logps/chosen": -559.0565185546875, "logps/rejected": -647.8815307617188, "loss": 0.2835, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.20685505867004395, "rewards/margins": 0.11294861137866974, "rewards/rejected": -0.3198036551475525, "step": 330 }, { "epoch": 0.82, "learning_rate": 4.923721672305148e-07, "logits/chosen": 0.07727678120136261, "logits/rejected": 0.14975441992282867, "logps/chosen": -530.6942138671875, "logps/rejected": -737.3821411132812, "loss": 0.2821, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.1940075010061264, "rewards/margins": 0.18392714858055115, "rewards/rejected": -0.37793463468551636, "step": 340 }, { "epoch": 0.84, "learning_rate": 3.7445716067596506e-07, "logits/chosen": 0.06651227176189423, "logits/rejected": 0.1357874870300293, "logps/chosen": -568.581298828125, "logps/rejected": -677.1904907226562, "loss": 0.2696, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.22338159382343292, "rewards/margins": 0.14005926251411438, "rewards/rejected": -0.3634408414363861, "step": 350 }, { "epoch": 0.86, "learning_rate": 2.7153109768518926e-07, "logits/chosen": 0.12754273414611816, "logits/rejected": 0.18894067406654358, "logps/chosen": -551.6950073242188, "logps/rejected": -662.9625244140625, "loss": 0.2798, "rewards/accuracies": 0.6875, "rewards/chosen": -0.20485155284404755, "rewards/margins": 0.13481572270393372, "rewards/rejected": -0.33966726064682007, "step": 360 }, { "epoch": 0.89, "learning_rate": 1.8431979423369607e-07, "logits/chosen": 0.1290164738893509, "logits/rejected": 0.2077597826719284, "logps/chosen": -539.3775634765625, "logps/rejected": -648.92138671875, "loss": 0.3018, "rewards/accuracies": 0.625, "rewards/chosen": -0.22309288382530212, "rewards/margins": 0.11153991520404816, "rewards/rejected": -0.3346328139305115, "step": 370 }, { "epoch": 0.91, "learning_rate": 1.1343824865573422e-07, "logits/chosen": 0.12593218684196472, "logits/rejected": 0.1743309050798416, "logps/chosen": -534.8792114257812, "logps/rejected": -619.5510864257812, "loss": 0.3046, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.2254379242658615, "rewards/margins": 0.10715726763010025, "rewards/rejected": -0.33259516954421997, "step": 380 }, { "epoch": 0.94, "learning_rate": 5.9386304787299175e-08, "logits/chosen": 0.15326878428459167, "logits/rejected": 0.19667108356952667, "logps/chosen": -596.8463745117188, "logps/rejected": -687.6807861328125, "loss": 0.2838, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.2249801903963089, "rewards/margins": 0.12209514528512955, "rewards/rejected": -0.34707534313201904, "step": 390 }, { "epoch": 0.96, "learning_rate": 2.2545127157831416e-08, "logits/chosen": 0.08709342032670975, "logits/rejected": 0.12705549597740173, "logps/chosen": -551.4010620117188, "logps/rejected": -672.7706298828125, "loss": 0.2969, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.2128075808286667, "rewards/margins": 0.12216651439666748, "rewards/rejected": -0.33497413992881775, "step": 400 }, { "epoch": 0.98, "learning_rate": 3.1745130869123564e-09, "logits/chosen": 0.07737437635660172, "logits/rejected": 0.13851606845855713, "logps/chosen": -584.75, "logps/rejected": -722.1080322265625, "loss": 0.2947, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.22851315140724182, "rewards/margins": 0.13729806244373322, "rewards/rejected": -0.36581122875213623, "step": 410 }, { "epoch": 1.0, "step": 416, "total_flos": 0.0, "train_loss": 0.3107354554276054, "train_runtime": 5919.6286, "train_samples_per_second": 3.379, "train_steps_per_second": 0.07 } ], "logging_steps": 10, "max_steps": 416, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }