{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.998691442030882, "eval_steps": 10000, "global_step": 477, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "learning_rate": 2.0833333333333333e-07, "logits/chosen": 0.17655496299266815, "logits/rejected": 0.2531452775001526, "logps/chosen": -354.29669189453125, "logps/rejected": -305.259765625, "loss": 0.5, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": 0.0010361697059124708, "rewards/margins": 0.0014542521676048636, "rewards/rejected": -0.00041808263631537557, "step": 10 }, { "epoch": 0.04, "learning_rate": 4.1666666666666667e-07, "logits/chosen": 0.07140998542308807, "logits/rejected": 0.19915328919887543, "logps/chosen": -316.61407470703125, "logps/rejected": -276.1783142089844, "loss": 0.4997, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.001211934955790639, "rewards/margins": 0.00264042429625988, "rewards/rejected": -0.0014284893404692411, "step": 20 }, { "epoch": 0.06, "learning_rate": 6.249999999999999e-07, "logits/chosen": 0.1830858290195465, "logits/rejected": 0.25493288040161133, "logps/chosen": -294.3023376464844, "logps/rejected": -298.47430419921875, "loss": 0.4979, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.00664560217410326, "rewards/margins": 0.008408578112721443, "rewards/rejected": -0.0017629768699407578, "step": 30 }, { "epoch": 0.08, "learning_rate": 8.333333333333333e-07, "logits/chosen": 0.1198926791548729, "logits/rejected": 0.2388772964477539, "logps/chosen": -343.3688659667969, "logps/rejected": -318.56866455078125, "loss": 0.4944, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.04203338176012039, "rewards/margins": 0.023049216717481613, "rewards/rejected": 0.01898416317999363, "step": 40 }, { "epoch": 0.1, "learning_rate": 9.999463737538052e-07, "logits/chosen": 0.19016575813293457, "logits/rejected": 0.2768324613571167, "logps/chosen": -305.9139709472656, "logps/rejected": -285.70263671875, "loss": 0.4888, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.0463864728808403, "rewards/margins": 0.06659023463726044, "rewards/rejected": -0.02020375430583954, "step": 50 }, { "epoch": 0.13, "learning_rate": 9.980706626858607e-07, "logits/chosen": 0.1583642065525055, "logits/rejected": 0.2964373230934143, "logps/chosen": -292.2091979980469, "logps/rejected": -283.33062744140625, "loss": 0.4823, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.006695735268294811, "rewards/margins": 0.08554854989051819, "rewards/rejected": -0.0788528248667717, "step": 60 }, { "epoch": 0.15, "learning_rate": 9.935251313189563e-07, "logits/chosen": 0.1668189913034439, "logits/rejected": 0.25383955240249634, "logps/chosen": -330.51483154296875, "logps/rejected": -332.74249267578125, "loss": 0.476, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.007911129854619503, "rewards/margins": 0.13003569841384888, "rewards/rejected": -0.13794682919979095, "step": 70 }, { "epoch": 0.17, "learning_rate": 9.86334145175542e-07, "logits/chosen": 0.22892770171165466, "logits/rejected": 0.32262876629829407, "logps/chosen": -326.62847900390625, "logps/rejected": -321.47064208984375, "loss": 0.4678, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.07964827120304108, "rewards/margins": 0.2643834054470062, "rewards/rejected": -0.3440317213535309, "step": 80 }, { "epoch": 0.19, "learning_rate": 9.765362502737097e-07, "logits/chosen": 0.12489993870258331, "logits/rejected": 0.2657889425754547, "logps/chosen": -358.5821838378906, "logps/rejected": -333.71466064453125, "loss": 0.4612, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.21640650928020477, "rewards/margins": 0.4499947130680084, "rewards/rejected": -0.6664012670516968, "step": 90 }, { "epoch": 0.21, "learning_rate": 9.641839665080363e-07, "logits/chosen": 0.2374851256608963, "logits/rejected": 0.4098134934902191, "logps/chosen": -378.7792053222656, "logps/rejected": -408.1399841308594, "loss": 0.4512, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.44217753410339355, "rewards/margins": 0.715401291847229, "rewards/rejected": -1.157578706741333, "step": 100 }, { "epoch": 0.23, "learning_rate": 9.493435061259129e-07, "logits/chosen": 0.29897215962409973, "logits/rejected": 0.34014248847961426, "logps/chosen": -395.0293884277344, "logps/rejected": -461.2764587402344, "loss": 0.4418, "rewards/accuracies": 0.65625, "rewards/chosen": -0.8240998983383179, "rewards/margins": 0.7941638231277466, "rewards/rejected": -1.618263602256775, "step": 110 }, { "epoch": 0.25, "learning_rate": 9.320944188084241e-07, "logits/chosen": 0.18543429672718048, "logits/rejected": 0.282682329416275, "logps/chosen": -440.6853942871094, "logps/rejected": -526.3844604492188, "loss": 0.4495, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.201317548751831, "rewards/margins": 0.8505627512931824, "rewards/rejected": -2.051880359649658, "step": 120 }, { "epoch": 0.27, "learning_rate": 9.125291652582547e-07, "logits/chosen": 0.10988249629735947, "logits/rejected": 0.2532512843608856, "logps/chosen": -429.30322265625, "logps/rejected": -460.0655822753906, "loss": 0.4407, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.9080715179443359, "rewards/margins": 0.8440803289413452, "rewards/rejected": -1.7521518468856812, "step": 130 }, { "epoch": 0.29, "learning_rate": 8.90752621580335e-07, "logits/chosen": 0.05259154364466667, "logits/rejected": 0.20351815223693848, "logps/chosen": -478.1226501464844, "logps/rejected": -552.33154296875, "loss": 0.4381, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.4607925415039062, "rewards/margins": 1.3634538650512695, "rewards/rejected": -2.8242461681365967, "step": 140 }, { "epoch": 0.31, "learning_rate": 8.668815171119019e-07, "logits/chosen": 0.1267194300889969, "logits/rejected": 0.16065822541713715, "logps/chosen": -432.47418212890625, "logps/rejected": -556.4413452148438, "loss": 0.4373, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.9717355966567993, "rewards/margins": 1.443182110786438, "rewards/rejected": -2.4149177074432373, "step": 150 }, { "epoch": 0.33, "learning_rate": 8.410438087153911e-07, "logits/chosen": 0.05742305517196655, "logits/rejected": 0.03335579112172127, "logps/chosen": -386.4638366699219, "logps/rejected": -537.6171264648438, "loss": 0.4335, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8440232276916504, "rewards/margins": 1.7251598834991455, "rewards/rejected": -2.569182872772217, "step": 160 }, { "epoch": 0.36, "learning_rate": 8.133779948881513e-07, "logits/chosen": 0.04388447850942612, "logits/rejected": 0.06478340178728104, "logps/chosen": -450.94049072265625, "logps/rejected": -571.2717895507812, "loss": 0.4268, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.1457209587097168, "rewards/margins": 1.4885038137435913, "rewards/rejected": -2.6342251300811768, "step": 170 }, { "epoch": 0.38, "learning_rate": 7.840323733655778e-07, "logits/chosen": 0.03801240772008896, "logits/rejected": 0.0668804943561554, "logps/chosen": -415.9105529785156, "logps/rejected": -594.4246826171875, "loss": 0.426, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.8987852931022644, "rewards/margins": 2.0467095375061035, "rewards/rejected": -2.9454948902130127, "step": 180 }, { "epoch": 0.4, "learning_rate": 7.531642461971514e-07, "logits/chosen": 0.12394122779369354, "logits/rejected": 0.07622597366571426, "logps/chosen": -482.99774169921875, "logps/rejected": -617.9317626953125, "loss": 0.4148, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.4833831787109375, "rewards/margins": 1.5686841011047363, "rewards/rejected": -3.052067279815674, "step": 190 }, { "epoch": 0.42, "learning_rate": 7.209390765564318e-07, "logits/chosen": 0.12547728419303894, "logits/rejected": 0.039741553366184235, "logps/chosen": -470.0662536621094, "logps/rejected": -810.3030395507812, "loss": 0.4152, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.2957651615142822, "rewards/margins": 3.8659985065460205, "rewards/rejected": -5.1617631912231445, "step": 200 }, { "epoch": 0.44, "learning_rate": 6.875296018047809e-07, "logits/chosen": 0.20153549313545227, "logits/rejected": 0.1317548155784607, "logps/chosen": -447.82562255859375, "logps/rejected": -725.8985595703125, "loss": 0.4249, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.3366836309432983, "rewards/margins": 3.223564863204956, "rewards/rejected": -4.560248374938965, "step": 210 }, { "epoch": 0.46, "learning_rate": 6.531149075630796e-07, "logits/chosen": -0.017775116488337517, "logits/rejected": 0.05367380380630493, "logps/chosen": -476.78790283203125, "logps/rejected": -663.9365844726562, "loss": 0.4167, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6010878086090088, "rewards/margins": 2.401573419570923, "rewards/rejected": -4.002661228179932, "step": 220 }, { "epoch": 0.48, "learning_rate": 6.178794677547137e-07, "logits/chosen": 0.07326556742191315, "logits/rejected": -0.006058653350919485, "logps/chosen": -590.01123046875, "logps/rejected": -870.9129028320312, "loss": 0.4193, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.6394991874694824, "rewards/margins": 3.238422393798828, "rewards/rejected": -5.8779215812683105, "step": 230 }, { "epoch": 0.5, "learning_rate": 5.820121557655108e-07, "logits/chosen": 0.13632330298423767, "logits/rejected": 0.12085568904876709, "logps/chosen": -450.1314392089844, "logps/rejected": -587.374267578125, "loss": 0.425, "rewards/accuracies": 0.65625, "rewards/chosen": -1.3817965984344482, "rewards/margins": 1.4728713035583496, "rewards/rejected": -2.854668140411377, "step": 240 }, { "epoch": 0.52, "learning_rate": 5.457052320211339e-07, "logits/chosen": 0.09744735062122345, "logits/rejected": -0.04311475530266762, "logps/chosen": -561.7251586914062, "logps/rejected": -1082.66064453125, "loss": 0.4126, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.4967703819274902, "rewards/margins": 5.509397029876709, "rewards/rejected": -8.006166458129883, "step": 250 }, { "epoch": 0.54, "learning_rate": 5.091533134088387e-07, "logits/chosen": 0.007685136049985886, "logits/rejected": -0.026540469378232956, "logps/chosen": -681.2808837890625, "logps/rejected": -1102.198486328125, "loss": 0.4237, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -3.266371250152588, "rewards/margins": 4.754992485046387, "rewards/rejected": -8.021364212036133, "step": 260 }, { "epoch": 0.57, "learning_rate": 4.7255233006783624e-07, "logits/chosen": 0.24146917462348938, "logits/rejected": 0.05772332474589348, "logps/chosen": -437.0887756347656, "logps/rejected": -754.1742553710938, "loss": 0.409, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.0795494318008423, "rewards/margins": 3.357706069946289, "rewards/rejected": -4.437255859375, "step": 270 }, { "epoch": 0.59, "learning_rate": 4.3609847514019763e-07, "logits/chosen": 0.15583154559135437, "logits/rejected": -0.01679980382323265, "logps/chosen": -622.4188232421875, "logps/rejected": -1143.203857421875, "loss": 0.4172, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -3.2593586444854736, "rewards/margins": 5.262009143829346, "rewards/rejected": -8.521368980407715, "step": 280 }, { "epoch": 0.61, "learning_rate": 3.9998715311197783e-07, "logits/chosen": 0.12384140491485596, "logits/rejected": -0.03689634054899216, "logps/chosen": -612.9854736328125, "logps/rejected": -1161.8275146484375, "loss": 0.4065, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.665544033050537, "rewards/margins": 5.724797248840332, "rewards/rejected": -8.390340805053711, "step": 290 }, { "epoch": 0.63, "learning_rate": 3.6441193238179146e-07, "logits/chosen": 0.23247964680194855, "logits/rejected": 0.08442293107509613, "logps/chosen": -644.8258056640625, "logps/rejected": -1333.277099609375, "loss": 0.4067, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -3.5307083129882812, "rewards/margins": 6.568638801574707, "rewards/rejected": -10.099346160888672, "step": 300 }, { "epoch": 0.65, "learning_rate": 3.295635076714144e-07, "logits/chosen": 0.21653930842876434, "logits/rejected": -0.010667298920452595, "logps/chosen": -576.2736206054688, "logps/rejected": -1167.0555419921875, "loss": 0.4003, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.805418014526367, "rewards/margins": 5.748055458068848, "rewards/rejected": -8.553472518920898, "step": 310 }, { "epoch": 0.67, "learning_rate": 2.956286778402226e-07, "logits/chosen": 0.14956721663475037, "logits/rejected": -0.00617391150444746, "logps/chosen": -499.51556396484375, "logps/rejected": -1073.225830078125, "loss": 0.4081, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.8388452529907227, "rewards/margins": 5.99139928817749, "rewards/rejected": -7.830244541168213, "step": 320 }, { "epoch": 0.69, "learning_rate": 2.6278934458271996e-07, "logits/chosen": 0.20027479529380798, "logits/rejected": 0.06552217900753021, "logps/chosen": -461.4195861816406, "logps/rejected": -1150.258544921875, "loss": 0.4027, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.6265113353729248, "rewards/margins": 6.768563270568848, "rewards/rejected": -8.395073890686035, "step": 330 }, { "epoch": 0.71, "learning_rate": 2.312215373764551e-07, "logits/chosen": 0.1772742122411728, "logits/rejected": 0.058857548981904984, "logps/chosen": -519.1689453125, "logps/rejected": -1075.103759765625, "loss": 0.4056, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.324723720550537, "rewards/margins": 5.8179826736450195, "rewards/rejected": -8.142705917358398, "step": 340 }, { "epoch": 0.73, "learning_rate": 2.0109446990692963e-07, "logits/chosen": 0.09322932362556458, "logits/rejected": -0.021080341190099716, "logps/chosen": -524.8082275390625, "logps/rejected": -1263.429443359375, "loss": 0.404, "rewards/accuracies": 0.65625, "rewards/chosen": -2.233060121536255, "rewards/margins": 7.2954888343811035, "rewards/rejected": -9.528549194335938, "step": 350 }, { "epoch": 0.75, "learning_rate": 1.725696330273575e-07, "logits/chosen": 0.12329642474651337, "logits/rejected": -0.045363299548625946, "logps/chosen": -477.84747314453125, "logps/rejected": -1159.287353515625, "loss": 0.3987, "rewards/accuracies": 0.71875, "rewards/chosen": -1.4969019889831543, "rewards/margins": 7.274144172668457, "rewards/rejected": -8.77104663848877, "step": 360 }, { "epoch": 0.77, "learning_rate": 1.4579992911531496e-07, "logits/chosen": 0.13813820481300354, "logits/rejected": 0.06726070493459702, "logps/chosen": -596.8673706054688, "logps/rejected": -1229.910888671875, "loss": 0.3989, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.484358787536621, "rewards/margins": 6.708567142486572, "rewards/rejected": -9.192926406860352, "step": 370 }, { "epoch": 0.8, "learning_rate": 1.209288524664029e-07, "logits/chosen": 0.2262219935655594, "logits/rejected": 0.04883592948317528, "logps/chosen": -571.9241333007812, "logps/rejected": -1147.636474609375, "loss": 0.3965, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.7885093688964844, "rewards/margins": 5.966012954711914, "rewards/rejected": -8.754522323608398, "step": 380 }, { "epoch": 0.82, "learning_rate": 9.808972011828054e-08, "logits/chosen": 0.13919615745544434, "logits/rejected": 0.08005174249410629, "logps/chosen": -603.2689208984375, "logps/rejected": -1278.978271484375, "loss": 0.3993, "rewards/accuracies": 0.75, "rewards/chosen": -2.6157753467559814, "rewards/margins": 7.164151668548584, "rewards/rejected": -9.779927253723145, "step": 390 }, { "epoch": 0.84, "learning_rate": 7.740495722810269e-08, "logits/chosen": 0.1855761706829071, "logits/rejected": 0.03339262679219246, "logps/chosen": -554.6050415039062, "logps/rejected": -1247.11474609375, "loss": 0.4064, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.375999927520752, "rewards/margins": 7.147269248962402, "rewards/rejected": -9.523270606994629, "step": 400 }, { "epoch": 0.86, "learning_rate": 5.898544083397e-08, "logits/chosen": 0.10612723976373672, "logits/rejected": -0.03204170614480972, "logps/chosen": -598.8375244140625, "logps/rejected": -1218.921142578125, "loss": 0.4009, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -3.0963997840881348, "rewards/margins": 6.186778545379639, "rewards/rejected": -9.283178329467773, "step": 410 }, { "epoch": 0.88, "learning_rate": 4.292990551804171e-08, "logits/chosen": 0.3134514391422272, "logits/rejected": 0.1133495420217514, "logps/chosen": -560.297607421875, "logps/rejected": -1385.083251953125, "loss": 0.3991, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.669637680053711, "rewards/margins": 8.3246488571167, "rewards/rejected": -10.994285583496094, "step": 420 }, { "epoch": 0.9, "learning_rate": 2.9324414157151367e-08, "logits/chosen": 0.14708609879016876, "logits/rejected": 0.05113764852285385, "logps/chosen": -646.3408203125, "logps/rejected": -1521.79345703125, "loss": 0.3999, "rewards/accuracies": 0.75, "rewards/chosen": -3.122638702392578, "rewards/margins": 8.8574800491333, "rewards/rejected": -11.980117797851562, "step": 430 }, { "epoch": 0.92, "learning_rate": 1.824189659787284e-08, "logits/chosen": 0.19891302287578583, "logits/rejected": 0.057393454015254974, "logps/chosen": -530.86865234375, "logps/rejected": -1372.778076171875, "loss": 0.3979, "rewards/accuracies": 0.6875, "rewards/chosen": -2.4631145000457764, "rewards/margins": 8.486894607543945, "rewards/rejected": -10.950007438659668, "step": 440 }, { "epoch": 0.94, "learning_rate": 9.741758728888217e-09, "logits/chosen": 0.20876403152942657, "logits/rejected": 0.052755843847990036, "logps/chosen": -683.3274536132812, "logps/rejected": -1404.552978515625, "loss": 0.3915, "rewards/accuracies": 0.71875, "rewards/chosen": -3.3337600231170654, "rewards/margins": 7.872265815734863, "rewards/rejected": -11.206026077270508, "step": 450 }, { "epoch": 0.96, "learning_rate": 3.869564046156459e-09, "logits/chosen": 0.2985457181930542, "logits/rejected": 0.15650448203086853, "logps/chosen": -468.8932189941406, "logps/rejected": -1197.56201171875, "loss": 0.3987, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.7095565795898438, "rewards/margins": 7.608504295349121, "rewards/rejected": -9.318059921264648, "step": 460 }, { "epoch": 0.98, "learning_rate": 6.567894177967325e-10, "logits/chosen": 0.17393910884857178, "logits/rejected": 0.02789122983813286, "logps/chosen": -607.3438720703125, "logps/rejected": -1505.235595703125, "loss": 0.3978, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.6819469928741455, "rewards/margins": 9.39558219909668, "rewards/rejected": -12.07752799987793, "step": 470 }, { "epoch": 1.0, "step": 477, "total_flos": 0.0, "train_loss": 0.42718374404267445, "train_runtime": 6325.1171, "train_samples_per_second": 9.665, "train_steps_per_second": 0.075 } ], "logging_steps": 10, "max_steps": 477, "num_train_epochs": 1, "save_steps": 10000, "total_flos": 0.0, "trial_name": null, "trial_params": null }