{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9984, "eval_steps": 500, "global_step": 468, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 1.0638297872340426e-07, "logits/chosen": 0.18604117631912231, "logits/rejected": 0.34631967544555664, "logps/chosen": -460.0769348144531, "logps/rejected": -351.57135009765625, "loss": 0.3612, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.02, "learning_rate": 1.0638297872340427e-06, "logits/chosen": 0.05747946724295616, "logits/rejected": 0.16539901494979858, "logps/chosen": -330.9287414550781, "logps/rejected": -328.71575927734375, "loss": 0.3819, "rewards/accuracies": 0.3888888955116272, "rewards/chosen": 2.3913149561849423e-05, "rewards/margins": -1.7764228687155992e-05, "rewards/rejected": 4.167737643001601e-05, "step": 10 }, { "epoch": 0.04, "learning_rate": 2.1276595744680853e-06, "logits/chosen": 0.15373219549655914, "logits/rejected": 0.19493858516216278, "logps/chosen": -325.29803466796875, "logps/rejected": -315.1011047363281, "loss": 0.3718, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": -1.974666338355746e-05, "rewards/margins": 1.2878153654583002e-07, "rewards/rejected": -1.9875456928275526e-05, "step": 20 }, { "epoch": 0.06, "learning_rate": 3.191489361702128e-06, "logits/chosen": 0.11452794075012207, "logits/rejected": 0.1842522919178009, "logps/chosen": -371.60943603515625, "logps/rejected": -327.19366455078125, "loss": 0.3676, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -5.7096302043646574e-05, "rewards/margins": 3.7131747376406565e-05, "rewards/rejected": -9.422805305803195e-05, "step": 30 }, { "epoch": 0.09, "learning_rate": 4.255319148936171e-06, "logits/chosen": 0.18893569707870483, "logits/rejected": 0.2084759920835495, "logps/chosen": -337.7607727050781, "logps/rejected": -353.72503662109375, "loss": 0.3678, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": 0.00010834180284291506, "rewards/margins": 0.00026063303812406957, "rewards/rejected": -0.0001522912352811545, "step": 40 }, { "epoch": 0.11, "learning_rate": 4.999373573764188e-06, "logits/chosen": 0.08207504451274872, "logits/rejected": 0.20786412060260773, "logps/chosen": -384.91656494140625, "logps/rejected": -347.6080627441406, "loss": 0.3741, "rewards/accuracies": 0.625, "rewards/chosen": 0.0010863704374060035, "rewards/margins": 0.0014893051702529192, "rewards/rejected": -0.0004029346746392548, "step": 50 }, { "epoch": 0.13, "learning_rate": 4.988245838331339e-06, "logits/chosen": 0.13469652831554413, "logits/rejected": 0.17153413593769073, "logps/chosen": -372.34124755859375, "logps/rejected": -329.98626708984375, "loss": 0.3646, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.0023965700529515743, "rewards/margins": 0.0026933744084089994, "rewards/rejected": -0.0002968042972497642, "step": 60 }, { "epoch": 0.15, "learning_rate": 4.963268819535228e-06, "logits/chosen": 0.15689334273338318, "logits/rejected": 0.20978419482707977, "logps/chosen": -364.29888916015625, "logps/rejected": -377.21087646484375, "loss": 0.3705, "rewards/accuracies": 0.71875, "rewards/chosen": 0.0023818810004740953, "rewards/margins": 0.0072565278969705105, "rewards/rejected": -0.0048746466636657715, "step": 70 }, { "epoch": 0.17, "learning_rate": 4.9245815365216115e-06, "logits/chosen": 0.11595060676336288, "logits/rejected": 0.18608702719211578, "logps/chosen": -328.6256103515625, "logps/rejected": -365.93896484375, "loss": 0.3663, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.002104334533214569, "rewards/margins": 0.011472588405013084, "rewards/rejected": -0.013576922006905079, "step": 80 }, { "epoch": 0.19, "learning_rate": 4.872399318152594e-06, "logits/chosen": 0.08527339994907379, "logits/rejected": 0.11540959030389786, "logps/chosen": -352.0487060546875, "logps/rejected": -361.36578369140625, "loss": 0.3456, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.011672710999846458, "rewards/margins": 0.01901327446103096, "rewards/rejected": -0.03068598173558712, "step": 90 }, { "epoch": 0.21, "learning_rate": 4.807012604511542e-06, "logits/chosen": 0.14592930674552917, "logits/rejected": 0.18039533495903015, "logps/chosen": -414.8818359375, "logps/rejected": -435.3639221191406, "loss": 0.34, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.04601982235908508, "rewards/margins": 0.034013133496046066, "rewards/rejected": -0.08003295958042145, "step": 100 }, { "epoch": 0.23, "learning_rate": 4.728785330347771e-06, "logits/chosen": 0.13395074009895325, "logits/rejected": 0.2120208740234375, "logps/chosen": -432.98486328125, "logps/rejected": -553.6598510742188, "loss": 0.305, "rewards/accuracies": 0.6875, "rewards/chosen": -0.11783289909362793, "rewards/margins": 0.08183668553829193, "rewards/rejected": -0.19966959953308105, "step": 110 }, { "epoch": 0.26, "learning_rate": 4.63815289945858e-06, "logits/chosen": 0.14807412028312683, "logits/rejected": 0.21360798180103302, "logps/chosen": -565.0419311523438, "logps/rejected": -595.3816528320312, "loss": 0.3434, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.18253007531166077, "rewards/margins": 0.060289014130830765, "rewards/rejected": -0.24281907081604004, "step": 120 }, { "epoch": 0.28, "learning_rate": 4.535619761282989e-06, "logits/chosen": 0.10818658024072647, "logits/rejected": 0.18194182217121124, "logps/chosen": -462.3949279785156, "logps/rejected": -513.8490600585938, "loss": 0.3173, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.1326686441898346, "rewards/margins": 0.0804656594991684, "rewards/rejected": -0.2131342887878418, "step": 130 }, { "epoch": 0.3, "learning_rate": 4.42175660319555e-06, "logits/chosen": 0.13493295013904572, "logits/rejected": 0.22737479209899902, "logps/chosen": -532.4095458984375, "logps/rejected": -585.3455200195312, "loss": 0.3196, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.16629299521446228, "rewards/margins": 0.08414360880851746, "rewards/rejected": -0.25043657422065735, "step": 140 }, { "epoch": 0.32, "learning_rate": 4.297197174127619e-06, "logits/chosen": 0.17478415369987488, "logits/rejected": 0.24990789592266083, "logps/chosen": -479.77862548828125, "logps/rejected": -553.7377319335938, "loss": 0.3207, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.14474426209926605, "rewards/margins": 0.09155549854040146, "rewards/rejected": -0.2362997829914093, "step": 150 }, { "epoch": 0.34, "learning_rate": 4.162634757195418e-06, "logits/chosen": 0.13402113318443298, "logits/rejected": 0.2551622688770294, "logps/chosen": -496.40081787109375, "logps/rejected": -558.84326171875, "loss": 0.3187, "rewards/accuracies": 0.65625, "rewards/chosen": -0.13809171319007874, "rewards/margins": 0.08086591213941574, "rewards/rejected": -0.21895763278007507, "step": 160 }, { "epoch": 0.36, "learning_rate": 4.018818310967843e-06, "logits/chosen": 0.12252243608236313, "logits/rejected": 0.16481925547122955, "logps/chosen": -484.56353759765625, "logps/rejected": -554.2274780273438, "loss": 0.3255, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.14209917187690735, "rewards/margins": 0.09574152529239655, "rewards/rejected": -0.2378406971693039, "step": 170 }, { "epoch": 0.38, "learning_rate": 3.866548300851254e-06, "logits/chosen": 0.08216498792171478, "logits/rejected": 0.17952165007591248, "logps/chosen": -482.576171875, "logps/rejected": -579.781982421875, "loss": 0.3047, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.14626096189022064, "rewards/margins": 0.08570893108844757, "rewards/rejected": -0.23196987807750702, "step": 180 }, { "epoch": 0.41, "learning_rate": 3.706672243793271e-06, "logits/chosen": 0.07855963706970215, "logits/rejected": 0.07844971120357513, "logps/chosen": -468.69061279296875, "logps/rejected": -544.0549926757812, "loss": 0.2935, "rewards/accuracies": 0.6875, "rewards/chosen": -0.1354297697544098, "rewards/margins": 0.09144213050603867, "rewards/rejected": -0.22687189280986786, "step": 190 }, { "epoch": 0.43, "learning_rate": 3.5400799911032357e-06, "logits/chosen": 0.10545216500759125, "logits/rejected": 0.1882828176021576, "logps/chosen": -501.1815490722656, "logps/rejected": -601.7250366210938, "loss": 0.3041, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.15607957541942596, "rewards/margins": 0.10436417162418365, "rewards/rejected": -0.2604437470436096, "step": 200 }, { "epoch": 0.45, "learning_rate": 3.3676987756445894e-06, "logits/chosen": 0.10487540811300278, "logits/rejected": 0.11818546056747437, "logps/chosen": -470.6344299316406, "logps/rejected": -565.8145751953125, "loss": 0.3148, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.16682696342468262, "rewards/margins": 0.09452913701534271, "rewards/rejected": -0.26135605573654175, "step": 210 }, { "epoch": 0.47, "learning_rate": 3.1904880509659397e-06, "logits/chosen": 0.13482534885406494, "logits/rejected": 0.20024776458740234, "logps/chosen": -510.10528564453125, "logps/rejected": -605.6468505859375, "loss": 0.3172, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.16119703650474548, "rewards/margins": 0.1006912812590599, "rewards/rejected": -0.261888325214386, "step": 220 }, { "epoch": 0.49, "learning_rate": 3.0094341510955697e-06, "logits/chosen": 0.10005593299865723, "logits/rejected": 0.1791614145040512, "logps/chosen": -532.1925048828125, "logps/rejected": -624.0726318359375, "loss": 0.3106, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.17168700695037842, "rewards/margins": 0.09216924011707306, "rewards/rejected": -0.26385626196861267, "step": 230 }, { "epoch": 0.51, "learning_rate": 2.825544800722376e-06, "logits/chosen": 0.10918021202087402, "logits/rejected": 0.18382051587104797, "logps/chosen": -508.88494873046875, "logps/rejected": -571.0011596679688, "loss": 0.309, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.15795882046222687, "rewards/margins": 0.09128745645284653, "rewards/rejected": -0.2492462694644928, "step": 240 }, { "epoch": 0.53, "learning_rate": 2.639843506318899e-06, "logits/chosen": 0.12444597482681274, "logits/rejected": 0.16049379110336304, "logps/chosen": -509.65631103515625, "logps/rejected": -609.0398559570312, "loss": 0.2924, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.1560695320367813, "rewards/margins": 0.11202052980661392, "rewards/rejected": -0.26809003949165344, "step": 250 }, { "epoch": 0.55, "learning_rate": 2.4533638594248094e-06, "logits/chosen": 0.11506851017475128, "logits/rejected": 0.1052849292755127, "logps/chosen": -535.1851806640625, "logps/rejected": -609.4058837890625, "loss": 0.3139, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.18579542636871338, "rewards/margins": 0.10177616029977798, "rewards/rejected": -0.28757157921791077, "step": 260 }, { "epoch": 0.58, "learning_rate": 2.2671437837980943e-06, "logits/chosen": 0.11827238649129868, "logits/rejected": 0.12189098447561264, "logps/chosen": -542.7433471679688, "logps/rejected": -633.5802612304688, "loss": 0.2867, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.19100908935070038, "rewards/margins": 0.09670811891555786, "rewards/rejected": -0.28771719336509705, "step": 270 }, { "epoch": 0.6, "learning_rate": 2.082219758453629e-06, "logits/chosen": 0.0904841274023056, "logits/rejected": 0.20232203602790833, "logps/chosen": -575.5374755859375, "logps/rejected": -672.4290161132812, "loss": 0.3105, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.2171885222196579, "rewards/margins": 0.10769243538379669, "rewards/rejected": -0.3248809278011322, "step": 280 }, { "epoch": 0.62, "learning_rate": 1.899621048743019e-06, "logits/chosen": 0.07801838964223862, "logits/rejected": 0.16570156812667847, "logps/chosen": -556.6237182617188, "logps/rejected": -643.4578857421875, "loss": 0.3145, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.20063337683677673, "rewards/margins": 0.1107235923409462, "rewards/rejected": -0.31135696172714233, "step": 290 }, { "epoch": 0.64, "learning_rate": 1.7203639775848423e-06, "logits/chosen": 0.07458348572254181, "logits/rejected": 0.08251482248306274, "logps/chosen": -524.4200439453125, "logps/rejected": -666.1168823242188, "loss": 0.2934, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.18233875930309296, "rewards/margins": 0.13505366444587708, "rewards/rejected": -0.3173924386501312, "step": 300 }, { "epoch": 0.66, "learning_rate": 1.5454462687309445e-06, "logits/chosen": 0.06379405409097672, "logits/rejected": 0.14568018913269043, "logps/chosen": -504.7530822753906, "logps/rejected": -650.941650390625, "loss": 0.2814, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.17345736920833588, "rewards/margins": 0.1400579959154129, "rewards/rejected": -0.3135153353214264, "step": 310 }, { "epoch": 0.68, "learning_rate": 1.3758414935535147e-06, "logits/chosen": 0.0773477703332901, "logits/rejected": 0.12101063877344131, "logps/chosen": -536.0679931640625, "logps/rejected": -659.2662353515625, "loss": 0.3014, "rewards/accuracies": 0.71875, "rewards/chosen": -0.18032298982143402, "rewards/margins": 0.11869337409734726, "rewards/rejected": -0.29901641607284546, "step": 320 }, { "epoch": 0.7, "learning_rate": 1.2124936522614622e-06, "logits/chosen": 0.09862785786390305, "logits/rejected": 0.14109982550144196, "logps/chosen": -487.79071044921875, "logps/rejected": -610.7267456054688, "loss": 0.2935, "rewards/accuracies": 0.65625, "rewards/chosen": -0.18224193155765533, "rewards/margins": 0.10070188343524933, "rewards/rejected": -0.2829437851905823, "step": 330 }, { "epoch": 0.73, "learning_rate": 1.0563119197063934e-06, "logits/chosen": 0.09601452201604843, "logits/rejected": 0.13355228304862976, "logps/chosen": -478.78143310546875, "logps/rejected": -622.6370239257812, "loss": 0.31, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.17518463730812073, "rewards/margins": 0.11871640384197235, "rewards/rejected": -0.2939010262489319, "step": 340 }, { "epoch": 0.75, "learning_rate": 9.081655850224449e-07, "logits/chosen": 0.14273716509342194, "logits/rejected": 0.12122112512588501, "logps/chosen": -535.0431518554688, "logps/rejected": -643.0104370117188, "loss": 0.3196, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.19643327593803406, "rewards/margins": 0.10987784713506699, "rewards/rejected": -0.30631113052368164, "step": 350 }, { "epoch": 0.77, "learning_rate": 7.688792132653111e-07, "logits/chosen": 0.1597367525100708, "logits/rejected": 0.18071278929710388, "logps/chosen": -597.0608520507812, "logps/rejected": -692.2689208984375, "loss": 0.2965, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.20307877659797668, "rewards/margins": 0.12918424606323242, "rewards/rejected": -0.3322630524635315, "step": 360 }, { "epoch": 0.79, "learning_rate": 6.392280559802341e-07, "logits/chosen": 0.1371072232723236, "logits/rejected": 0.16206106543540955, "logps/chosen": -573.9052124023438, "logps/rejected": -700.1799926757812, "loss": 0.2689, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.2099401205778122, "rewards/margins": 0.13856378197669983, "rewards/rejected": -0.34850388765335083, "step": 370 }, { "epoch": 0.81, "learning_rate": 5.199337362431792e-07, "logits/chosen": 0.10267746448516846, "logits/rejected": 0.1483933925628662, "logps/chosen": -554.7241821289062, "logps/rejected": -637.0789184570312, "loss": 0.2999, "rewards/accuracies": 0.71875, "rewards/chosen": -0.19773730635643005, "rewards/margins": 0.11398313194513321, "rewards/rejected": -0.31172046065330505, "step": 380 }, { "epoch": 0.83, "learning_rate": 4.1166023219176176e-07, "logits/chosen": 0.11247100681066513, "logits/rejected": 0.1309679001569748, "logps/chosen": -557.1490478515625, "logps/rejected": -678.8016357421875, "loss": 0.2945, "rewards/accuracies": 0.6875, "rewards/chosen": -0.1877584308385849, "rewards/margins": 0.12902548909187317, "rewards/rejected": -0.3167839050292969, "step": 390 }, { "epoch": 0.85, "learning_rate": 3.150101814011136e-07, "logits/chosen": 0.16857033967971802, "logits/rejected": 0.18106935918331146, "logps/chosen": -571.0828857421875, "logps/rejected": -638.4654541015625, "loss": 0.3045, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.19992712140083313, "rewards/margins": 0.10413169860839844, "rewards/rejected": -0.30405884981155396, "step": 400 }, { "epoch": 0.87, "learning_rate": 2.3052152667409289e-07, "logits/chosen": 0.09823437035083771, "logits/rejected": 0.23346427083015442, "logps/chosen": -537.1575317382812, "logps/rejected": -657.79931640625, "loss": 0.3101, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.19983352720737457, "rewards/margins": 0.11520209163427353, "rewards/rejected": -0.3150356113910675, "step": 410 }, { "epoch": 0.9, "learning_rate": 1.5866452191498488e-07, "logits/chosen": 0.13151055574417114, "logits/rejected": 0.17384907603263855, "logps/chosen": -562.8436279296875, "logps/rejected": -688.2677612304688, "loss": 0.2892, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.21342253684997559, "rewards/margins": 0.1261134147644043, "rewards/rejected": -0.3395359516143799, "step": 420 }, { "epoch": 0.92, "learning_rate": 9.983911475163727e-08, "logits/chosen": 0.11608059704303741, "logits/rejected": 0.13708294928073883, "logps/chosen": -503.0716857910156, "logps/rejected": -606.0911865234375, "loss": 0.2983, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.18081562221050262, "rewards/margins": 0.11758317053318024, "rewards/rejected": -0.29839879274368286, "step": 430 }, { "epoch": 0.94, "learning_rate": 5.437272047405712e-08, "logits/chosen": 0.11100079119205475, "logits/rejected": 0.13695240020751953, "logps/chosen": -530.9528198242188, "logps/rejected": -668.3643798828125, "loss": 0.3095, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.2087739259004593, "rewards/margins": 0.13447535037994385, "rewards/rejected": -0.34324929118156433, "step": 440 }, { "epoch": 0.96, "learning_rate": 2.251839967945535e-08, "logits/chosen": 0.022059569135308266, "logits/rejected": 0.12616530060768127, "logps/chosen": -492.80181884765625, "logps/rejected": -643.1315307617188, "loss": 0.2834, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.18623578548431396, "rewards/margins": 0.14532844722270966, "rewards/rejected": -0.3315642178058624, "step": 450 }, { "epoch": 0.98, "learning_rate": 4.453449766758933e-09, "logits/chosen": 0.10801200568675995, "logits/rejected": 0.11458040773868561, "logps/chosen": -531.0709228515625, "logps/rejected": -645.5343627929688, "loss": 0.3043, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.1950865238904953, "rewards/margins": 0.11565764993429184, "rewards/rejected": -0.3107442259788513, "step": 460 }, { "epoch": 1.0, "step": 468, "total_flos": 0.0, "train_loss": 0.2399272450015076, "train_runtime": 4417.3525, "train_samples_per_second": 3.396, "train_steps_per_second": 0.106 } ], "logging_steps": 10, "max_steps": 468, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }