{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9984, "eval_steps": 500, "global_step": 468, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 1.0638297872340426e-07, "logits/chosen": 0.1359557956457138, "logits/rejected": 0.030706744641065598, "logps/chosen": -736.0869140625, "logps/rejected": -613.6344604492188, "loss": 2.0331, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.02, "learning_rate": 1.0638297872340427e-06, "logits/chosen": 0.11667777597904205, "logits/rejected": 0.26604601740837097, "logps/chosen": -546.5281982421875, "logps/rejected": -597.5736083984375, "loss": 2.1592, "rewards/accuracies": 0.4583333432674408, "rewards/chosen": 0.0007250224007293582, "rewards/margins": 0.00040180076030083, "rewards/rejected": 0.0003232216986361891, "step": 10 }, { "epoch": 0.04, "learning_rate": 2.1276595744680853e-06, "logits/chosen": 0.16373148560523987, "logits/rejected": 0.2677033543586731, "logps/chosen": -604.6590576171875, "logps/rejected": -649.482177734375, "loss": 2.0972, "rewards/accuracies": 0.4375, "rewards/chosen": -0.0005862273974344134, "rewards/margins": -0.0003054165281355381, "rewards/rejected": -0.0002808108984027058, "step": 20 }, { "epoch": 0.06, "learning_rate": 3.191489361702128e-06, "logits/chosen": 0.14978544414043427, "logits/rejected": 0.1915779411792755, "logps/chosen": -594.8548583984375, "logps/rejected": -588.2429809570312, "loss": 2.122, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.004188057966530323, "rewards/margins": 0.0009490737575106323, "rewards/rejected": -0.0051371315494179726, "step": 30 }, { "epoch": 0.09, "learning_rate": 4.255319148936171e-06, "logits/chosen": 0.16862796247005463, "logits/rejected": 0.23586151003837585, "logps/chosen": -574.7235107421875, "logps/rejected": -631.8544921875, "loss": 2.1863, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.01229151152074337, "rewards/margins": 0.005582691170275211, "rewards/rejected": -0.017874203622341156, "step": 40 }, { "epoch": 0.11, "learning_rate": 4.999373573764188e-06, "logits/chosen": 0.1411871314048767, "logits/rejected": 0.2258455753326416, "logps/chosen": -612.8582763671875, "logps/rejected": -636.5026245117188, "loss": 2.1508, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.04941480979323387, "rewards/margins": 0.019247086718678474, "rewards/rejected": -0.0686618983745575, "step": 50 }, { "epoch": 0.13, "learning_rate": 4.988245838331339e-06, "logits/chosen": 0.17244111001491547, "logits/rejected": 0.17342150211334229, "logps/chosen": -634.6348266601562, "logps/rejected": -667.5384521484375, "loss": 2.0758, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.13025899231433868, "rewards/margins": 0.05111612752079964, "rewards/rejected": -0.18137511610984802, "step": 60 }, { "epoch": 0.15, "learning_rate": 4.963268819535228e-06, "logits/chosen": 0.12650486826896667, "logits/rejected": 0.14093999564647675, "logps/chosen": -608.5107421875, "logps/rejected": -702.1578369140625, "loss": 2.0556, "rewards/accuracies": 0.5625, "rewards/chosen": -0.21341009438037872, "rewards/margins": 0.09893321990966797, "rewards/rejected": -0.3123432993888855, "step": 70 }, { "epoch": 0.17, "learning_rate": 4.9245815365216115e-06, "logits/chosen": 0.19184628129005432, "logits/rejected": 0.2408786565065384, "logps/chosen": -679.4183349609375, "logps/rejected": -609.7093505859375, "loss": 2.1137, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.21463651955127716, "rewards/margins": 0.05772104859352112, "rewards/rejected": -0.2723575234413147, "step": 80 }, { "epoch": 0.19, "learning_rate": 4.872399318152594e-06, "logits/chosen": 0.1250939965248108, "logits/rejected": 0.18045032024383545, "logps/chosen": -622.2333374023438, "logps/rejected": -655.4575805664062, "loss": 2.0044, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.1839352548122406, "rewards/margins": 0.10977420955896378, "rewards/rejected": -0.2937094569206238, "step": 90 }, { "epoch": 0.21, "learning_rate": 4.807012604511542e-06, "logits/chosen": 0.18265239894390106, "logits/rejected": 0.2614283859729767, "logps/chosen": -649.8997802734375, "logps/rejected": -658.8975830078125, "loss": 1.9995, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.16064395010471344, "rewards/margins": 0.08805385231971741, "rewards/rejected": -0.24869783222675323, "step": 100 }, { "epoch": 0.23, "learning_rate": 4.728785330347771e-06, "logits/chosen": 0.2479465901851654, "logits/rejected": 0.2932817339897156, "logps/chosen": -674.0836181640625, "logps/rejected": -645.6417236328125, "loss": 1.895, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.12688389420509338, "rewards/margins": 0.08782283961772919, "rewards/rejected": -0.21470670402050018, "step": 110 }, { "epoch": 0.26, "learning_rate": 4.63815289945858e-06, "logits/chosen": 0.19643843173980713, "logits/rejected": 0.2974274456501007, "logps/chosen": -573.49658203125, "logps/rejected": -666.606689453125, "loss": 1.89, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.09824337065219879, "rewards/margins": 0.13982543349266052, "rewards/rejected": -0.2380688190460205, "step": 120 }, { "epoch": 0.28, "learning_rate": 4.535619761282989e-06, "logits/chosen": 0.23821644484996796, "logits/rejected": 0.288485586643219, "logps/chosen": -590.9158935546875, "logps/rejected": -623.23974609375, "loss": 1.9389, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.14589470624923706, "rewards/margins": 0.12624357640743256, "rewards/rejected": -0.2721382975578308, "step": 130 }, { "epoch": 0.3, "learning_rate": 4.42175660319555e-06, "logits/chosen": 0.2631734013557434, "logits/rejected": 0.2810806632041931, "logps/chosen": -645.8680419921875, "logps/rejected": -654.8004760742188, "loss": 1.8203, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.20492109656333923, "rewards/margins": 0.20386295020580292, "rewards/rejected": -0.40878406167030334, "step": 140 }, { "epoch": 0.32, "learning_rate": 4.297197174127619e-06, "logits/chosen": 0.2586398422718048, "logits/rejected": 0.3086986839771271, "logps/chosen": -619.4220581054688, "logps/rejected": -697.2005615234375, "loss": 1.7553, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.24971242249011993, "rewards/margins": 0.2221045196056366, "rewards/rejected": -0.4718169569969177, "step": 150 }, { "epoch": 0.34, "learning_rate": 4.162634757195418e-06, "logits/chosen": 0.2681664526462555, "logits/rejected": 0.2807798683643341, "logps/chosen": -630.39306640625, "logps/rejected": -645.6117553710938, "loss": 1.8404, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.19384464621543884, "rewards/margins": 0.1983100175857544, "rewards/rejected": -0.39215466380119324, "step": 160 }, { "epoch": 0.36, "learning_rate": 4.018818310967843e-06, "logits/chosen": 0.27496370673179626, "logits/rejected": 0.30781346559524536, "logps/chosen": -559.2887573242188, "logps/rejected": -601.3917846679688, "loss": 1.8382, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.0747746005654335, "rewards/margins": 0.19791939854621887, "rewards/rejected": -0.2726939916610718, "step": 170 }, { "epoch": 0.38, "learning_rate": 3.866548300851254e-06, "logits/chosen": 0.2482290267944336, "logits/rejected": 0.2852781414985657, "logps/chosen": -620.8068237304688, "logps/rejected": -665.9005737304688, "loss": 1.8229, "rewards/accuracies": 0.625, "rewards/chosen": -0.10497160255908966, "rewards/margins": 0.20543234050273895, "rewards/rejected": -0.3104039430618286, "step": 180 }, { "epoch": 0.41, "learning_rate": 3.706672243793271e-06, "logits/chosen": 0.2958913743495941, "logits/rejected": 0.3795389235019684, "logps/chosen": -611.8587646484375, "logps/rejected": -658.9635009765625, "loss": 1.7752, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.0870656967163086, "rewards/margins": 0.23995642364025116, "rewards/rejected": -0.32702213525772095, "step": 190 }, { "epoch": 0.43, "learning_rate": 3.5400799911032357e-06, "logits/chosen": 0.2935205101966858, "logits/rejected": 0.3416239321231842, "logps/chosen": -660.2877197265625, "logps/rejected": -730.04541015625, "loss": 1.7351, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.1828436255455017, "rewards/margins": 0.3010478913784027, "rewards/rejected": -0.4838915765285492, "step": 200 }, { "epoch": 0.45, "learning_rate": 3.3676987756445894e-06, "logits/chosen": 0.24807122349739075, "logits/rejected": 0.32862648367881775, "logps/chosen": -605.8773193359375, "logps/rejected": -641.6677856445312, "loss": 1.8245, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.13868902623653412, "rewards/margins": 0.2735101878643036, "rewards/rejected": -0.4121991991996765, "step": 210 }, { "epoch": 0.47, "learning_rate": 3.1904880509659397e-06, "logits/chosen": 0.270724892616272, "logits/rejected": 0.3151053786277771, "logps/chosen": -650.7314453125, "logps/rejected": -708.2312622070312, "loss": 1.735, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.16527561843395233, "rewards/margins": 0.2484448254108429, "rewards/rejected": -0.4137204587459564, "step": 220 }, { "epoch": 0.49, "learning_rate": 3.0094341510955697e-06, "logits/chosen": 0.19233042001724243, "logits/rejected": 0.29483872652053833, "logps/chosen": -663.5474243164062, "logps/rejected": -743.0173950195312, "loss": 1.7378, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.14797742664813995, "rewards/margins": 0.3706679344177246, "rewards/rejected": -0.5186454057693481, "step": 230 }, { "epoch": 0.51, "learning_rate": 2.825544800722376e-06, "logits/chosen": 0.2124979943037033, "logits/rejected": 0.3365432620048523, "logps/chosen": -619.9740600585938, "logps/rejected": -700.7166748046875, "loss": 1.8168, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.1956629902124405, "rewards/margins": 0.2987174093723297, "rewards/rejected": -0.494380384683609, "step": 240 }, { "epoch": 0.53, "learning_rate": 2.639843506318899e-06, "logits/chosen": 0.2796134054660797, "logits/rejected": 0.2740449607372284, "logps/chosen": -582.3416748046875, "logps/rejected": -674.327880859375, "loss": 1.8901, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.19822832942008972, "rewards/margins": 0.19228845834732056, "rewards/rejected": -0.3905167877674103, "step": 250 }, { "epoch": 0.55, "learning_rate": 2.4533638594248094e-06, "logits/chosen": 0.25897207856178284, "logits/rejected": 0.31485193967819214, "logps/chosen": -604.8118896484375, "logps/rejected": -667.9144897460938, "loss": 1.8606, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.14205999672412872, "rewards/margins": 0.28450149297714233, "rewards/rejected": -0.42656150460243225, "step": 260 }, { "epoch": 0.58, "learning_rate": 2.2671437837980943e-06, "logits/chosen": 0.22259187698364258, "logits/rejected": 0.22855930030345917, "logps/chosen": -593.6612548828125, "logps/rejected": -673.6566162109375, "loss": 1.7486, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.14823240041732788, "rewards/margins": 0.2802043557167053, "rewards/rejected": -0.4284366965293884, "step": 270 }, { "epoch": 0.6, "learning_rate": 2.082219758453629e-06, "logits/chosen": 0.2169434130191803, "logits/rejected": 0.2703471779823303, "logps/chosen": -611.6048583984375, "logps/rejected": -682.5806884765625, "loss": 1.6556, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.11671599000692368, "rewards/margins": 0.26952022314071655, "rewards/rejected": -0.3862362205982208, "step": 280 }, { "epoch": 0.62, "learning_rate": 1.899621048743019e-06, "logits/chosen": 0.22146745026111603, "logits/rejected": 0.34733515977859497, "logps/chosen": -603.9933471679688, "logps/rejected": -673.3649291992188, "loss": 1.7238, "rewards/accuracies": 0.65625, "rewards/chosen": -0.20439250767230988, "rewards/margins": 0.2682177424430847, "rewards/rejected": -0.4726102352142334, "step": 290 }, { "epoch": 0.64, "learning_rate": 1.7203639775848423e-06, "logits/chosen": 0.19099445641040802, "logits/rejected": 0.3011043667793274, "logps/chosen": -606.6263427734375, "logps/rejected": -639.6136474609375, "loss": 1.8381, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.17796705663204193, "rewards/margins": 0.23042461276054382, "rewards/rejected": -0.40839165449142456, "step": 300 }, { "epoch": 0.66, "learning_rate": 1.5454462687309445e-06, "logits/chosen": 0.2036764919757843, "logits/rejected": 0.26239025592803955, "logps/chosen": -602.3845825195312, "logps/rejected": -666.4627075195312, "loss": 1.8042, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.1518932580947876, "rewards/margins": 0.2536298632621765, "rewards/rejected": -0.4055231511592865, "step": 310 }, { "epoch": 0.68, "learning_rate": 1.3758414935535147e-06, "logits/chosen": 0.21739721298217773, "logits/rejected": 0.2840099334716797, "logps/chosen": -636.0455322265625, "logps/rejected": -709.1137084960938, "loss": 1.65, "rewards/accuracies": 0.65625, "rewards/chosen": -0.16815349459648132, "rewards/margins": 0.29733169078826904, "rewards/rejected": -0.465485155582428, "step": 320 }, { "epoch": 0.7, "learning_rate": 1.2124936522614622e-06, "logits/chosen": 0.20938508212566376, "logits/rejected": 0.22490420937538147, "logps/chosen": -615.7994995117188, "logps/rejected": -669.2200927734375, "loss": 1.7098, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.18394342064857483, "rewards/margins": 0.31033387780189514, "rewards/rejected": -0.49427732825279236, "step": 330 }, { "epoch": 0.73, "learning_rate": 1.0563119197063934e-06, "logits/chosen": 0.23827771842479706, "logits/rejected": 0.2663131356239319, "logps/chosen": -612.7750244140625, "logps/rejected": -685.60107421875, "loss": 1.7109, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.19161880016326904, "rewards/margins": 0.26392242312431335, "rewards/rejected": -0.4555412232875824, "step": 340 }, { "epoch": 0.75, "learning_rate": 9.081655850224449e-07, "logits/chosen": 0.19827114045619965, "logits/rejected": 0.2343660295009613, "logps/chosen": -628.5892333984375, "logps/rejected": -699.3311767578125, "loss": 1.6981, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.23514249920845032, "rewards/margins": 0.30311545729637146, "rewards/rejected": -0.5382579565048218, "step": 350 }, { "epoch": 0.77, "learning_rate": 7.688792132653111e-07, "logits/chosen": 0.19120459258556366, "logits/rejected": 0.2861759066581726, "logps/chosen": -659.7528076171875, "logps/rejected": -748.490234375, "loss": 1.6967, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.19031907618045807, "rewards/margins": 0.34352895617485046, "rewards/rejected": -0.533847987651825, "step": 360 }, { "epoch": 0.79, "learning_rate": 6.392280559802341e-07, "logits/chosen": 0.2406836450099945, "logits/rejected": 0.23908407986164093, "logps/chosen": -658.35400390625, "logps/rejected": -720.8883666992188, "loss": 1.7368, "rewards/accuracies": 0.625, "rewards/chosen": -0.23702308535575867, "rewards/margins": 0.24957367777824402, "rewards/rejected": -0.48659682273864746, "step": 370 }, { "epoch": 0.81, "learning_rate": 5.199337362431792e-07, "logits/chosen": 0.26719361543655396, "logits/rejected": 0.1743316501379013, "logps/chosen": -621.3897094726562, "logps/rejected": -680.0, "loss": 1.7425, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.22942551970481873, "rewards/margins": 0.26667481660842896, "rewards/rejected": -0.49610036611557007, "step": 380 }, { "epoch": 0.83, "learning_rate": 4.1166023219176176e-07, "logits/chosen": 0.21561181545257568, "logits/rejected": 0.286629855632782, "logps/chosen": -654.0867919921875, "logps/rejected": -668.467529296875, "loss": 1.6798, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.21663355827331543, "rewards/margins": 0.2600599527359009, "rewards/rejected": -0.4766935408115387, "step": 390 }, { "epoch": 0.85, "learning_rate": 3.150101814011136e-07, "logits/chosen": 0.16323356330394745, "logits/rejected": 0.21500280499458313, "logps/chosen": -600.4713134765625, "logps/rejected": -730.5057983398438, "loss": 1.7084, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.19050315022468567, "rewards/margins": 0.28324562311172485, "rewards/rejected": -0.47374874353408813, "step": 400 }, { "epoch": 0.87, "learning_rate": 2.3052152667409289e-07, "logits/chosen": 0.1962326616048813, "logits/rejected": 0.22506949305534363, "logps/chosen": -614.2760009765625, "logps/rejected": -675.3383178710938, "loss": 1.7679, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.1459427773952484, "rewards/margins": 0.3252793252468109, "rewards/rejected": -0.4712221026420593, "step": 410 }, { "epoch": 0.9, "learning_rate": 1.5866452191498488e-07, "logits/chosen": 0.20015636086463928, "logits/rejected": 0.25162121653556824, "logps/chosen": -651.9236450195312, "logps/rejected": -707.2882080078125, "loss": 1.7514, "rewards/accuracies": 0.59375, "rewards/chosen": -0.23218846321105957, "rewards/margins": 0.2290785312652588, "rewards/rejected": -0.46126699447631836, "step": 420 }, { "epoch": 0.92, "learning_rate": 9.983911475163727e-08, "logits/chosen": 0.16698592901229858, "logits/rejected": 0.2591376304626465, "logps/chosen": -590.045166015625, "logps/rejected": -642.6705322265625, "loss": 1.8093, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.17988340556621552, "rewards/margins": 0.23005299270153046, "rewards/rejected": -0.4099363684654236, "step": 430 }, { "epoch": 0.94, "learning_rate": 5.437272047405712e-08, "logits/chosen": 0.1858983337879181, "logits/rejected": 0.3158418536186218, "logps/chosen": -559.8682250976562, "logps/rejected": -648.7040405273438, "loss": 1.7686, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.19649046659469604, "rewards/margins": 0.26454511284828186, "rewards/rejected": -0.4610355794429779, "step": 440 }, { "epoch": 0.96, "learning_rate": 2.251839967945535e-08, "logits/chosen": 0.13786078989505768, "logits/rejected": 0.2333669662475586, "logps/chosen": -645.2703857421875, "logps/rejected": -707.0418090820312, "loss": 1.6172, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.17300908267498016, "rewards/margins": 0.3292023241519928, "rewards/rejected": -0.5022113919258118, "step": 450 }, { "epoch": 0.98, "learning_rate": 4.453449766758933e-09, "logits/chosen": 0.1742466688156128, "logits/rejected": 0.2268284559249878, "logps/chosen": -576.7985229492188, "logps/rejected": -652.7803344726562, "loss": 1.7297, "rewards/accuracies": 0.625, "rewards/chosen": -0.24071533977985382, "rewards/margins": 0.18981412053108215, "rewards/rejected": -0.4305294454097748, "step": 460 }, { "epoch": 1.0, "step": 468, "total_flos": 0.0, "train_loss": 1.8394590188295414, "train_runtime": 15861.5475, "train_samples_per_second": 1.891, "train_steps_per_second": 0.03 } ], "logging_steps": 10, "max_steps": 468, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }