{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9968602825745683, "eval_steps": 100, "global_step": 954, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "learning_rate": 1.0416666666666667e-07, "logits/chosen": 0.2709607779979706, "logits/rejected": 0.36084669828414917, "logps/chosen": -304.1212463378906, "logps/rejected": -281.92694091796875, "loss": 0.1836, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.00025563794770278037, "rewards/margins": -4.445898957783356e-05, "rewards/rejected": -0.00021117893629707396, "step": 10 }, { "epoch": 0.04, "learning_rate": 2.0833333333333333e-07, "logits/chosen": 0.3387250602245331, "logits/rejected": 0.365884006023407, "logps/chosen": -287.37677001953125, "logps/rejected": -261.12213134765625, "loss": 0.1853, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -0.0006877075065858662, "rewards/margins": -0.0008078098180703819, "rewards/rejected": 0.00012010247155558318, "step": 20 }, { "epoch": 0.06, "learning_rate": 3.1249999999999997e-07, "logits/chosen": 0.19636110961437225, "logits/rejected": 0.2971157133579254, "logps/chosen": -355.48052978515625, "logps/rejected": -307.60101318359375, "loss": 0.1858, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0008693916606716812, "rewards/margins": 0.0018057005945593119, "rewards/rejected": -0.002675092313438654, "step": 30 }, { "epoch": 0.08, "learning_rate": 4.1666666666666667e-07, "logits/chosen": 0.2009788304567337, "logits/rejected": 0.2732384204864502, "logps/chosen": -320.2412414550781, "logps/rejected": -295.5198059082031, "loss": 0.1786, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.0026089001912623644, "rewards/margins": 0.005245196167379618, "rewards/rejected": -0.007854094728827477, "step": 40 }, { "epoch": 0.1, "learning_rate": 5.208333333333334e-07, "logits/chosen": 0.30105799436569214, "logits/rejected": 0.3381732106208801, "logps/chosen": -329.18377685546875, "logps/rejected": -330.98297119140625, "loss": 0.1792, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.004522019065916538, "rewards/margins": 0.024444926530122757, "rewards/rejected": -0.02896694466471672, "step": 50 }, { "epoch": 0.13, "learning_rate": 6.249999999999999e-07, "logits/chosen": 0.2789975106716156, "logits/rejected": 0.37255367636680603, "logps/chosen": -289.91961669921875, "logps/rejected": -293.2933044433594, "loss": 0.1882, "rewards/accuracies": 0.65625, "rewards/chosen": -0.022119298577308655, "rewards/margins": 0.04484058916568756, "rewards/rejected": -0.06695988774299622, "step": 60 }, { "epoch": 0.15, "learning_rate": 7.291666666666666e-07, "logits/chosen": 0.40268006920814514, "logits/rejected": 0.4395596981048584, "logps/chosen": -272.30242919921875, "logps/rejected": -299.4195861816406, "loss": 0.1725, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.05775861814618111, "rewards/margins": 0.07561491429805756, "rewards/rejected": -0.13337352871894836, "step": 70 }, { "epoch": 0.17, "learning_rate": 8.333333333333333e-07, "logits/chosen": 0.38377270102500916, "logits/rejected": 0.427955687046051, "logps/chosen": -305.79461669921875, "logps/rejected": -306.46527099609375, "loss": 0.1546, "rewards/accuracies": 0.6875, "rewards/chosen": -0.11612454801797867, "rewards/margins": 0.15051202476024628, "rewards/rejected": -0.26663655042648315, "step": 80 }, { "epoch": 0.19, "learning_rate": 9.374999999999999e-07, "logits/chosen": 0.40008336305618286, "logits/rejected": 0.5159471035003662, "logps/chosen": -304.7159423828125, "logps/rejected": -290.49200439453125, "loss": 0.1296, "rewards/accuracies": 0.75, "rewards/chosen": -0.20290033519268036, "rewards/margins": 0.20165178179740906, "rewards/rejected": -0.4045521318912506, "step": 90 }, { "epoch": 0.21, "learning_rate": 9.999463737538052e-07, "logits/chosen": 0.3695070147514343, "logits/rejected": 0.4569215774536133, "logps/chosen": -343.1157531738281, "logps/rejected": -321.3959655761719, "loss": 0.111, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.3429110646247864, "rewards/margins": 0.19276174902915955, "rewards/rejected": -0.5356727838516235, "step": 100 }, { "epoch": 0.21, "eval_logits/chosen": 0.39762556552886963, "eval_logits/rejected": 0.4520445764064789, "eval_logps/chosen": -336.5851135253906, "eval_logps/rejected": -375.96063232421875, "eval_loss": 0.10800629109144211, "eval_rewards/accuracies": 0.71484375, "eval_rewards/chosen": -0.3300043046474457, "eval_rewards/margins": 0.3133509159088135, "eval_rewards/rejected": -0.6433552503585815, "eval_runtime": 74.5651, "eval_samples_per_second": 26.822, "eval_steps_per_second": 0.429, "step": 100 }, { "epoch": 0.23, "learning_rate": 9.993432105822034e-07, "logits/chosen": 0.29598233103752136, "logits/rejected": 0.3596528172492981, "logps/chosen": -346.12603759765625, "logps/rejected": -347.3135681152344, "loss": 0.1045, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.37233370542526245, "rewards/margins": 0.2192118912935257, "rewards/rejected": -0.5915456414222717, "step": 110 }, { "epoch": 0.25, "learning_rate": 9.980706626858607e-07, "logits/chosen": 0.22473303973674774, "logits/rejected": 0.2900647521018982, "logps/chosen": -369.64788818359375, "logps/rejected": -380.33404541015625, "loss": 0.0935, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4193580746650696, "rewards/margins": 0.33016690611839294, "rewards/rejected": -0.7495249509811401, "step": 120 }, { "epoch": 0.27, "learning_rate": 9.961304359538434e-07, "logits/chosen": 0.28685927391052246, "logits/rejected": 0.34184715151786804, "logps/chosen": -355.6005554199219, "logps/rejected": -348.94989013671875, "loss": 0.0953, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.49181294441223145, "rewards/margins": 0.21913418173789978, "rewards/rejected": -0.7109471559524536, "step": 130 }, { "epoch": 0.29, "learning_rate": 9.935251313189563e-07, "logits/chosen": 0.24738028645515442, "logits/rejected": 0.3477911353111267, "logps/chosen": -349.7526550292969, "logps/rejected": -339.65081787109375, "loss": 0.0842, "rewards/accuracies": 0.65625, "rewards/chosen": -0.5085957646369934, "rewards/margins": 0.2822812795639038, "rewards/rejected": -0.7908770442008972, "step": 140 }, { "epoch": 0.31, "learning_rate": 9.902582412711118e-07, "logits/chosen": 0.23527678847312927, "logits/rejected": 0.297168105840683, "logps/chosen": -385.29278564453125, "logps/rejected": -388.8484191894531, "loss": 0.0829, "rewards/accuracies": 0.65625, "rewards/chosen": -0.5571666955947876, "rewards/margins": 0.3536582887172699, "rewards/rejected": -0.9108250737190247, "step": 150 }, { "epoch": 0.33, "learning_rate": 9.86334145175542e-07, "logits/chosen": 0.2127149999141693, "logits/rejected": 0.3407444953918457, "logps/chosen": -391.6556701660156, "logps/rejected": -385.64373779296875, "loss": 0.0831, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.6610974073410034, "rewards/margins": 0.4304705560207367, "rewards/rejected": -1.0915679931640625, "step": 160 }, { "epoch": 0.36, "learning_rate": 9.817581034021272e-07, "logits/chosen": 0.10886111110448837, "logits/rejected": 0.23057182133197784, "logps/chosen": -425.691650390625, "logps/rejected": -426.1463928222656, "loss": 0.0777, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6238452792167664, "rewards/margins": 0.35001182556152344, "rewards/rejected": -0.9738571047782898, "step": 170 }, { "epoch": 0.38, "learning_rate": 9.765362502737097e-07, "logits/chosen": 0.28310832381248474, "logits/rejected": 0.34999170899391174, "logps/chosen": -386.44647216796875, "logps/rejected": -417.8492736816406, "loss": 0.0723, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.615284264087677, "rewards/margins": 0.5002199411392212, "rewards/rejected": -1.115504264831543, "step": 180 }, { "epoch": 0.4, "learning_rate": 9.706755858428485e-07, "logits/chosen": 0.2382032573223114, "logits/rejected": 0.3148980438709259, "logps/chosen": -376.91082763671875, "logps/rejected": -408.2652282714844, "loss": 0.0709, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.6280232667922974, "rewards/margins": 0.39571380615234375, "rewards/rejected": -1.0237369537353516, "step": 190 }, { "epoch": 0.42, "learning_rate": 9.641839665080363e-07, "logits/chosen": 0.2947675287723541, "logits/rejected": 0.44535762071609497, "logps/chosen": -357.99951171875, "logps/rejected": -366.5459899902344, "loss": 0.0697, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5609619617462158, "rewards/margins": 0.4246044158935547, "rewards/rejected": -0.9855663180351257, "step": 200 }, { "epoch": 0.42, "eval_logits/chosen": 0.3266603350639343, "eval_logits/rejected": 0.4101351499557495, "eval_logps/chosen": -362.02423095703125, "eval_logps/rejected": -433.75665283203125, "eval_loss": 0.07280407100915909, "eval_rewards/accuracies": 0.7421875, "eval_rewards/chosen": -0.5843959450721741, "eval_rewards/margins": 0.6369195580482483, "eval_rewards/rejected": -1.221315622329712, "eval_runtime": 75.214, "eval_samples_per_second": 26.591, "eval_steps_per_second": 0.425, "step": 200 }, { "epoch": 0.44, "learning_rate": 9.570700944819582e-07, "logits/chosen": 0.29511094093322754, "logits/rejected": 0.4236629605293274, "logps/chosen": -369.3717346191406, "logps/rejected": -388.28240966796875, "loss": 0.075, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.6854770183563232, "rewards/margins": 0.5033689141273499, "rewards/rejected": -1.1888458728790283, "step": 210 }, { "epoch": 0.46, "learning_rate": 9.493435061259129e-07, "logits/chosen": 0.24741777777671814, "logits/rejected": 0.3947208523750305, "logps/chosen": -389.66033935546875, "logps/rejected": -376.7333984375, "loss": 0.0737, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.6762610077857971, "rewards/margins": 0.43233251571655273, "rewards/rejected": -1.1085935831069946, "step": 220 }, { "epoch": 0.48, "learning_rate": 9.4101455916603e-07, "logits/chosen": 0.20855531096458435, "logits/rejected": 0.32445111870765686, "logps/chosen": -379.79962158203125, "logps/rejected": -424.62109375, "loss": 0.0673, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.7091845273971558, "rewards/margins": 0.5963308215141296, "rewards/rejected": -1.3055154085159302, "step": 230 }, { "epoch": 0.5, "learning_rate": 9.320944188084241e-07, "logits/chosen": 0.1453891396522522, "logits/rejected": 0.3285972774028778, "logps/chosen": -462.5914001464844, "logps/rejected": -419.4541015625, "loss": 0.0613, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.7956157922744751, "rewards/margins": 0.49665746092796326, "rewards/rejected": -1.2922732830047607, "step": 240 }, { "epoch": 0.52, "learning_rate": 9.225950427718974e-07, "logits/chosen": 0.19092246890068054, "logits/rejected": 0.27189213037490845, "logps/chosen": -375.6587219238281, "logps/rejected": -393.5346984863281, "loss": 0.0586, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.6252821683883667, "rewards/margins": 0.4895978569984436, "rewards/rejected": -1.114880084991455, "step": 250 }, { "epoch": 0.54, "learning_rate": 9.125291652582547e-07, "logits/chosen": 0.2094411551952362, "logits/rejected": 0.25953131914138794, "logps/chosen": -379.9848937988281, "logps/rejected": -423.44891357421875, "loss": 0.0649, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.7326905727386475, "rewards/margins": 0.668548047542572, "rewards/rejected": -1.4012387990951538, "step": 260 }, { "epoch": 0.57, "learning_rate": 9.019102798817195e-07, "logits/chosen": 0.20080764591693878, "logits/rejected": 0.2263043224811554, "logps/chosen": -374.03680419921875, "logps/rejected": -410.4359436035156, "loss": 0.0584, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.8592589497566223, "rewards/margins": 0.481633722782135, "rewards/rejected": -1.3408926725387573, "step": 270 }, { "epoch": 0.59, "learning_rate": 8.90752621580335e-07, "logits/chosen": 0.16465748846530914, "logits/rejected": 0.2313542366027832, "logps/chosen": -432.99884033203125, "logps/rejected": -439.2601013183594, "loss": 0.0529, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.04286527633667, "rewards/margins": 0.4970209002494812, "rewards/rejected": -1.5398861169815063, "step": 280 }, { "epoch": 0.61, "learning_rate": 8.79071147533597e-07, "logits/chosen": 0.17371919751167297, "logits/rejected": 0.25447744131088257, "logps/chosen": -404.51141357421875, "logps/rejected": -469.1675720214844, "loss": 0.0592, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.7758570313453674, "rewards/margins": 0.7041818499565125, "rewards/rejected": -1.4800388813018799, "step": 290 }, { "epoch": 0.63, "learning_rate": 8.668815171119019e-07, "logits/chosen": 0.20719440281391144, "logits/rejected": 0.22149357199668884, "logps/chosen": -375.27703857421875, "logps/rejected": -434.1705627441406, "loss": 0.055, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7744671106338501, "rewards/margins": 0.6087144613265991, "rewards/rejected": -1.3831814527511597, "step": 300 }, { "epoch": 0.63, "eval_logits/chosen": 0.245079904794693, "eval_logits/rejected": 0.2779832184314728, "eval_logps/chosen": -383.036865234375, "eval_logps/rejected": -465.8376159667969, "eval_loss": 0.06104155629873276, "eval_rewards/accuracies": 0.7265625, "eval_rewards/chosen": -0.7945222854614258, "eval_rewards/margins": 0.7476030588150024, "eval_rewards/rejected": -1.5421253442764282, "eval_runtime": 75.177, "eval_samples_per_second": 26.604, "eval_steps_per_second": 0.426, "step": 300 }, { "epoch": 0.65, "learning_rate": 8.54200070884685e-07, "logits/chosen": 0.2089851200580597, "logits/rejected": 0.27757978439331055, "logps/chosen": -364.8197021484375, "logps/rejected": -426.005615234375, "loss": 0.0519, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.8241626620292664, "rewards/margins": 0.7324913740158081, "rewards/rejected": -1.5566540956497192, "step": 310 }, { "epoch": 0.67, "learning_rate": 8.410438087153911e-07, "logits/chosen": 0.19529737532138824, "logits/rejected": 0.22135886549949646, "logps/chosen": -390.7099304199219, "logps/rejected": -456.1578063964844, "loss": 0.0485, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.9577051401138306, "rewards/margins": 0.7674695253372192, "rewards/rejected": -1.7251746654510498, "step": 320 }, { "epoch": 0.69, "learning_rate": 8.274303669726426e-07, "logits/chosen": 0.10426706075668335, "logits/rejected": 0.09881766140460968, "logps/chosen": -433.2498474121094, "logps/rejected": -481.43218994140625, "loss": 0.0523, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0801985263824463, "rewards/margins": 0.6202031373977661, "rewards/rejected": -1.7004016637802124, "step": 330 }, { "epoch": 0.71, "learning_rate": 8.133779948881513e-07, "logits/chosen": 0.18205437064170837, "logits/rejected": 0.2083740234375, "logps/chosen": -362.75665283203125, "logps/rejected": -403.1224060058594, "loss": 0.058, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.767187237739563, "rewards/margins": 0.5591500401496887, "rewards/rejected": -1.3263373374938965, "step": 340 }, { "epoch": 0.73, "learning_rate": 7.989055300930704e-07, "logits/chosen": 0.17904943227767944, "logits/rejected": 0.1928117871284485, "logps/chosen": -392.50665283203125, "logps/rejected": -460.57049560546875, "loss": 0.0581, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.9408512115478516, "rewards/margins": 0.6105338335037231, "rewards/rejected": -1.5513849258422852, "step": 350 }, { "epoch": 0.75, "learning_rate": 7.840323733655778e-07, "logits/chosen": 0.11943835020065308, "logits/rejected": 0.14566612243652344, "logps/chosen": -408.4542541503906, "logps/rejected": -477.89556884765625, "loss": 0.0586, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.9370508193969727, "rewards/margins": 0.6635792851448059, "rewards/rejected": -1.6006300449371338, "step": 360 }, { "epoch": 0.77, "learning_rate": 7.687784626235447e-07, "logits/chosen": 0.08894483745098114, "logits/rejected": 0.21349970996379852, "logps/chosen": -427.32806396484375, "logps/rejected": -448.68115234375, "loss": 0.0583, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.9731753468513489, "rewards/margins": 0.594713032245636, "rewards/rejected": -1.5678884983062744, "step": 370 }, { "epoch": 0.8, "learning_rate": 7.531642461971514e-07, "logits/chosen": 0.1658913791179657, "logits/rejected": 0.1944103091955185, "logps/chosen": -356.7992858886719, "logps/rejected": -428.2513732910156, "loss": 0.0597, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.8277195692062378, "rewards/margins": 0.6664320826530457, "rewards/rejected": -1.4941515922546387, "step": 380 }, { "epoch": 0.82, "learning_rate": 7.372106554172801e-07, "logits/chosen": 0.17445510625839233, "logits/rejected": 0.24218544363975525, "logps/chosen": -364.82818603515625, "logps/rejected": -411.1913146972656, "loss": 0.0667, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.7496371865272522, "rewards/margins": 0.4808691143989563, "rewards/rejected": -1.2305063009262085, "step": 390 }, { "epoch": 0.84, "learning_rate": 7.209390765564318e-07, "logits/chosen": 0.21217799186706543, "logits/rejected": 0.19523081183433533, "logps/chosen": -384.03082275390625, "logps/rejected": -439.64697265625, "loss": 0.0573, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.9415151476860046, "rewards/margins": 0.5813928842544556, "rewards/rejected": -1.5229079723358154, "step": 400 }, { "epoch": 0.84, "eval_logits/chosen": 0.23484691977500916, "eval_logits/rejected": 0.25605612993240356, "eval_logps/chosen": -386.639404296875, "eval_logps/rejected": -471.1476745605469, "eval_loss": 0.05661754682660103, "eval_rewards/accuracies": 0.73828125, "eval_rewards/chosen": -0.8305472135543823, "eval_rewards/margins": 0.7646786570549011, "eval_rewards/rejected": -1.5952258110046387, "eval_runtime": 73.9447, "eval_samples_per_second": 27.047, "eval_steps_per_second": 0.433, "step": 400 }, { "epoch": 0.86, "learning_rate": 7.043713221597773e-07, "logits/chosen": 0.13619688153266907, "logits/rejected": 0.24737751483917236, "logps/chosen": -439.60595703125, "logps/rejected": -467.98974609375, "loss": 0.0498, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.987277626991272, "rewards/margins": 0.6264899373054504, "rewards/rejected": -1.6137676239013672, "step": 410 }, { "epoch": 0.88, "learning_rate": 6.875296018047809e-07, "logits/chosen": 0.08895771205425262, "logits/rejected": 0.10534010827541351, "logps/chosen": -437.58575439453125, "logps/rejected": -504.79962158203125, "loss": 0.0508, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.197588562965393, "rewards/margins": 0.6411014795303345, "rewards/rejected": -1.8386898040771484, "step": 420 }, { "epoch": 0.9, "learning_rate": 6.704364923285857e-07, "logits/chosen": 0.13031154870986938, "logits/rejected": 0.22820834815502167, "logps/chosen": -406.86474609375, "logps/rejected": -403.8560485839844, "loss": 0.0488, "rewards/accuracies": 0.65625, "rewards/chosen": -1.0081294775009155, "rewards/margins": 0.5856183171272278, "rewards/rejected": -1.593747854232788, "step": 430 }, { "epoch": 0.92, "learning_rate": 6.531149075630796e-07, "logits/chosen": 0.06417986750602722, "logits/rejected": 0.1644040048122406, "logps/chosen": -426.49859619140625, "logps/rejected": -444.4884338378906, "loss": 0.0587, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0372142791748047, "rewards/margins": 0.5977233052253723, "rewards/rejected": -1.6349375247955322, "step": 440 }, { "epoch": 0.94, "learning_rate": 6.355880676182085e-07, "logits/chosen": 0.1278570294380188, "logits/rejected": 0.10543633997440338, "logps/chosen": -385.71832275390625, "logps/rejected": -487.64495849609375, "loss": 0.0594, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.880386471748352, "rewards/margins": 0.8574458360671997, "rewards/rejected": -1.7378323078155518, "step": 450 }, { "epoch": 0.96, "learning_rate": 6.178794677547137e-07, "logits/chosen": 0.14232680201530457, "logits/rejected": 0.19507645070552826, "logps/chosen": -429.20794677734375, "logps/rejected": -448.43621826171875, "loss": 0.0586, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.8559433817863464, "rewards/margins": 0.6609224677085876, "rewards/rejected": -1.5168659687042236, "step": 460 }, { "epoch": 0.98, "learning_rate": 6.000128468880222e-07, "logits/chosen": 0.17209979891777039, "logits/rejected": 0.20294690132141113, "logps/chosen": -434.7554626464844, "logps/rejected": -495.654541015625, "loss": 0.0564, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.0413404703140259, "rewards/margins": 0.7894097566604614, "rewards/rejected": -1.8307502269744873, "step": 470 }, { "epoch": 1.0, "learning_rate": 5.820121557655108e-07, "logits/chosen": 0.1130753755569458, "logits/rejected": 0.17763587832450867, "logps/chosen": -447.583251953125, "logps/rejected": -536.4083862304688, "loss": 0.042, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.053792953491211, "rewards/margins": 0.9330266714096069, "rewards/rejected": -1.9868196249008179, "step": 480 }, { "epoch": 1.03, "learning_rate": 5.639015248598023e-07, "logits/chosen": 0.173425555229187, "logits/rejected": 0.1563911885023117, "logps/chosen": -396.56976318359375, "logps/rejected": -509.23760986328125, "loss": 0.0272, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0132352113723755, "rewards/margins": 1.2193310260772705, "rewards/rejected": -2.2325661182403564, "step": 490 }, { "epoch": 1.05, "learning_rate": 5.457052320211339e-07, "logits/chosen": 0.1425987184047699, "logits/rejected": 0.1953365057706833, "logps/chosen": -486.52484130859375, "logps/rejected": -575.6693115234375, "loss": 0.0215, "rewards/accuracies": 0.75, "rewards/chosen": -1.450165033340454, "rewards/margins": 1.2124006748199463, "rewards/rejected": -2.6625657081604004, "step": 500 }, { "epoch": 1.05, "eval_logits/chosen": 0.22210484743118286, "eval_logits/rejected": 0.2419252097606659, "eval_logps/chosen": -465.0879821777344, "eval_logps/rejected": -598.30078125, "eval_loss": 0.03267505019903183, "eval_rewards/accuracies": 0.73046875, "eval_rewards/chosen": -1.6150331497192383, "eval_rewards/margins": 1.2517237663269043, "eval_rewards/rejected": -2.8667569160461426, "eval_runtime": 74.1502, "eval_samples_per_second": 26.972, "eval_steps_per_second": 0.432, "step": 500 }, { "epoch": 1.07, "learning_rate": 5.274476699321637e-07, "logits/chosen": 0.10247495025396347, "logits/rejected": 0.18925973773002625, "logps/chosen": -478.4522399902344, "logps/rejected": -538.3016967773438, "loss": 0.019, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.6425974369049072, "rewards/margins": 0.9050165414810181, "rewards/rejected": -2.5476138591766357, "step": 510 }, { "epoch": 1.09, "learning_rate": 5.091533134088387e-07, "logits/chosen": 0.15197055041790009, "logits/rejected": 0.21326705813407898, "logps/chosen": -460.1466369628906, "logps/rejected": -564.4393310546875, "loss": 0.0199, "rewards/accuracies": 0.78125, "rewards/chosen": -1.4866502285003662, "rewards/margins": 1.329404354095459, "rewards/rejected": -2.8160548210144043, "step": 520 }, { "epoch": 1.11, "learning_rate": 4.908466865911614e-07, "logits/chosen": 0.13690608739852905, "logits/rejected": 0.24023446440696716, "logps/chosen": -495.3097229003906, "logps/rejected": -585.8782958984375, "loss": 0.0185, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.5163190364837646, "rewards/margins": 1.2400487661361694, "rewards/rejected": -2.7563679218292236, "step": 530 }, { "epoch": 1.13, "learning_rate": 4.7255233006783624e-07, "logits/chosen": 0.21338143944740295, "logits/rejected": 0.24035005271434784, "logps/chosen": -469.50958251953125, "logps/rejected": -574.6529541015625, "loss": 0.0153, "rewards/accuracies": 0.71875, "rewards/chosen": -1.6547876596450806, "rewards/margins": 1.1869592666625977, "rewards/rejected": -2.8417468070983887, "step": 540 }, { "epoch": 1.15, "learning_rate": 4.5429476797886617e-07, "logits/chosen": 0.17387095093727112, "logits/rejected": 0.20873236656188965, "logps/chosen": -454.8916015625, "logps/rejected": -569.7607421875, "loss": 0.0162, "rewards/accuracies": 0.75, "rewards/chosen": -1.4260327816009521, "rewards/margins": 1.1895036697387695, "rewards/rejected": -2.6155364513397217, "step": 550 }, { "epoch": 1.17, "learning_rate": 4.3609847514019763e-07, "logits/chosen": 0.03245037421584129, "logits/rejected": 0.15740999579429626, "logps/chosen": -468.34393310546875, "logps/rejected": -583.3796997070312, "loss": 0.0162, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.6325418949127197, "rewards/margins": 1.2155063152313232, "rewards/rejected": -2.848047971725464, "step": 560 }, { "epoch": 1.19, "learning_rate": 4.179878442344892e-07, "logits/chosen": 0.19224026799201965, "logits/rejected": 0.25054025650024414, "logps/chosen": -458.1896057128906, "logps/rejected": -582.621337890625, "loss": 0.0151, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.6655352115631104, "rewards/margins": 1.4305517673492432, "rewards/rejected": -3.0960865020751953, "step": 570 }, { "epoch": 1.21, "learning_rate": 3.9998715311197783e-07, "logits/chosen": 0.11674971878528595, "logits/rejected": 0.1621953547000885, "logps/chosen": -469.97454833984375, "logps/rejected": -602.2510375976562, "loss": 0.0159, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.8368819952011108, "rewards/margins": 1.2762649059295654, "rewards/rejected": -3.113147258758545, "step": 580 }, { "epoch": 1.23, "learning_rate": 3.821205322452863e-07, "logits/chosen": 0.20661136507987976, "logits/rejected": 0.2854346036911011, "logps/chosen": -462.8573303222656, "logps/rejected": -592.9844970703125, "loss": 0.0131, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.7403570413589478, "rewards/margins": 1.424896478652954, "rewards/rejected": -3.1652536392211914, "step": 590 }, { "epoch": 1.26, "learning_rate": 3.6441193238179146e-07, "logits/chosen": 0.2347058355808258, "logits/rejected": 0.30280107259750366, "logps/chosen": -493.71484375, "logps/rejected": -621.8485107421875, "loss": 0.0139, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.6602065563201904, "rewards/margins": 1.4890201091766357, "rewards/rejected": -3.149226665496826, "step": 600 }, { "epoch": 1.26, "eval_logits/chosen": 0.2600603699684143, "eval_logits/rejected": 0.2915884852409363, "eval_logps/chosen": -484.3870849609375, "eval_logps/rejected": -620.5768432617188, "eval_loss": 0.025975177064538002, "eval_rewards/accuracies": 0.72265625, "eval_rewards/chosen": -1.8080239295959473, "eval_rewards/margins": 1.2814933061599731, "eval_rewards/rejected": -3.08951735496521, "eval_runtime": 74.1333, "eval_samples_per_second": 26.978, "eval_steps_per_second": 0.432, "step": 600 }, { "epoch": 1.28, "learning_rate": 3.4688509243692034e-07, "logits/chosen": 0.11437401920557022, "logits/rejected": 0.2602505087852478, "logps/chosen": -514.426513671875, "logps/rejected": -637.3177490234375, "loss": 0.0139, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.734244704246521, "rewards/margins": 1.4157501459121704, "rewards/rejected": -3.1499948501586914, "step": 610 }, { "epoch": 1.3, "learning_rate": 3.295635076714144e-07, "logits/chosen": 0.07244641333818436, "logits/rejected": 0.16178789734840393, "logps/chosen": -518.953857421875, "logps/rejected": -616.7998657226562, "loss": 0.0141, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.6754181385040283, "rewards/margins": 1.4035594463348389, "rewards/rejected": -3.078977346420288, "step": 620 }, { "epoch": 1.32, "learning_rate": 3.12470398195219e-07, "logits/chosen": 0.1367851048707962, "logits/rejected": 0.3572950065135956, "logps/chosen": -585.3605346679688, "logps/rejected": -622.2881469726562, "loss": 0.0145, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.1314330101013184, "rewards/margins": 1.1666393280029297, "rewards/rejected": -3.298072099685669, "step": 630 }, { "epoch": 1.34, "learning_rate": 2.956286778402226e-07, "logits/chosen": 0.23215535283088684, "logits/rejected": 0.308633416891098, "logps/chosen": -464.2928771972656, "logps/rejected": -603.6142578125, "loss": 0.0119, "rewards/accuracies": 0.71875, "rewards/chosen": -1.798147201538086, "rewards/margins": 1.2844369411468506, "rewards/rejected": -3.0825843811035156, "step": 640 }, { "epoch": 1.36, "learning_rate": 2.7906092344356826e-07, "logits/chosen": 0.19899992644786835, "logits/rejected": 0.3110192120075226, "logps/chosen": -524.2318115234375, "logps/rejected": -611.1939697265625, "loss": 0.0135, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.8059593439102173, "rewards/margins": 1.16557776927948, "rewards/rejected": -2.9715373516082764, "step": 650 }, { "epoch": 1.38, "learning_rate": 2.6278934458271996e-07, "logits/chosen": 0.11150866746902466, "logits/rejected": 0.20857281982898712, "logps/chosen": -505.08135986328125, "logps/rejected": -640.5789184570312, "loss": 0.0138, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.8031425476074219, "rewards/margins": 1.359779953956604, "rewards/rejected": -3.1629223823547363, "step": 660 }, { "epoch": 1.4, "learning_rate": 2.468357538028487e-07, "logits/chosen": 0.09730945527553558, "logits/rejected": 0.19464361667633057, "logps/chosen": -528.5618896484375, "logps/rejected": -603.7526245117188, "loss": 0.0133, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.9411073923110962, "rewards/margins": 1.2485072612762451, "rewards/rejected": -3.189614772796631, "step": 670 }, { "epoch": 1.42, "learning_rate": 2.312215373764551e-07, "logits/chosen": 0.14483553171157837, "logits/rejected": 0.1836375743150711, "logps/chosen": -516.8216552734375, "logps/rejected": -620.4552001953125, "loss": 0.0138, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.9225925207138062, "rewards/margins": 1.22107994556427, "rewards/rejected": -3.143672466278076, "step": 680 }, { "epoch": 1.44, "learning_rate": 2.1596762663442213e-07, "logits/chosen": 0.16792774200439453, "logits/rejected": 0.24698173999786377, "logps/chosen": -488.84234619140625, "logps/rejected": -614.4656372070312, "loss": 0.0129, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.8360084295272827, "rewards/margins": 1.531054973602295, "rewards/rejected": -3.367063045501709, "step": 690 }, { "epoch": 1.47, "learning_rate": 2.0109446990692963e-07, "logits/chosen": 0.1680019199848175, "logits/rejected": 0.29429227113723755, "logps/chosen": -520.464599609375, "logps/rejected": -605.1278076171875, "loss": 0.0125, "rewards/accuracies": 0.75, "rewards/chosen": -1.9141952991485596, "rewards/margins": 1.3153671026229858, "rewards/rejected": -3.229562282562256, "step": 700 }, { "epoch": 1.47, "eval_logits/chosen": 0.26143062114715576, "eval_logits/rejected": 0.29468628764152527, "eval_logps/chosen": -494.7950134277344, "eval_logps/rejected": -630.4850463867188, "eval_loss": 0.02471703477203846, "eval_rewards/accuracies": 0.73046875, "eval_rewards/chosen": -1.9121036529541016, "eval_rewards/margins": 1.2764959335327148, "eval_rewards/rejected": -3.188599109649658, "eval_runtime": 74.3821, "eval_samples_per_second": 26.888, "eval_steps_per_second": 0.43, "step": 700 }, { "epoch": 1.49, "learning_rate": 1.8662200511184872e-07, "logits/chosen": 0.12966138124465942, "logits/rejected": 0.19625753164291382, "logps/chosen": -487.907470703125, "logps/rejected": -615.3556518554688, "loss": 0.0139, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.759558081626892, "rewards/margins": 1.376070261001587, "rewards/rejected": -3.1356282234191895, "step": 710 }, { "epoch": 1.51, "learning_rate": 1.725696330273575e-07, "logits/chosen": 0.1202569380402565, "logits/rejected": 0.177236407995224, "logps/chosen": -529.9605712890625, "logps/rejected": -626.4281005859375, "loss": 0.0139, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.9097445011138916, "rewards/margins": 1.0962101221084595, "rewards/rejected": -3.0059542655944824, "step": 720 }, { "epoch": 1.53, "learning_rate": 1.589561912846089e-07, "logits/chosen": 0.19967588782310486, "logits/rejected": 0.3316526710987091, "logps/chosen": -481.29779052734375, "logps/rejected": -603.3109130859375, "loss": 0.0123, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.8021833896636963, "rewards/margins": 1.3467209339141846, "rewards/rejected": -3.148904323577881, "step": 730 }, { "epoch": 1.55, "learning_rate": 1.4579992911531496e-07, "logits/chosen": 0.20930282771587372, "logits/rejected": 0.27201521396636963, "logps/chosen": -432.78070068359375, "logps/rejected": -560.7802124023438, "loss": 0.0121, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.5994914770126343, "rewards/margins": 1.341997742652893, "rewards/rejected": -2.9414889812469482, "step": 740 }, { "epoch": 1.57, "learning_rate": 1.3311848288809813e-07, "logits/chosen": 0.10060323774814606, "logits/rejected": 0.20884795486927032, "logps/chosen": -488.33123779296875, "logps/rejected": -580.7650756835938, "loss": 0.0137, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6071643829345703, "rewards/margins": 1.2730271816253662, "rewards/rejected": -2.8801915645599365, "step": 750 }, { "epoch": 1.59, "learning_rate": 1.209288524664029e-07, "logits/chosen": 0.13590273261070251, "logits/rejected": 0.2782810628414154, "logps/chosen": -511.728271484375, "logps/rejected": -611.2279052734375, "loss": 0.0122, "rewards/accuracies": 0.71875, "rewards/chosen": -1.938084363937378, "rewards/margins": 1.1513842344284058, "rewards/rejected": -3.0894687175750732, "step": 760 }, { "epoch": 1.61, "learning_rate": 1.0924737841966497e-07, "logits/chosen": 0.2217625081539154, "logits/rejected": 0.317230761051178, "logps/chosen": -485.08624267578125, "logps/rejected": -596.3556518554688, "loss": 0.0113, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.7839618921279907, "rewards/margins": 1.4122188091278076, "rewards/rejected": -3.196180820465088, "step": 770 }, { "epoch": 1.63, "learning_rate": 9.808972011828054e-08, "logits/chosen": 0.15653935074806213, "logits/rejected": 0.288215696811676, "logps/chosen": -521.5089111328125, "logps/rejected": -634.9436645507812, "loss": 0.0125, "rewards/accuracies": 0.75, "rewards/chosen": -2.080339193344116, "rewards/margins": 1.2963542938232422, "rewards/rejected": -3.3766937255859375, "step": 780 }, { "epoch": 1.65, "learning_rate": 8.747083474174527e-08, "logits/chosen": 0.18242642283439636, "logits/rejected": 0.33485209941864014, "logps/chosen": -511.490966796875, "logps/rejected": -598.623046875, "loss": 0.0118, "rewards/accuracies": 0.6875, "rewards/chosen": -1.951550841331482, "rewards/margins": 1.2288819551467896, "rewards/rejected": -3.1804327964782715, "step": 790 }, { "epoch": 1.67, "learning_rate": 7.740495722810269e-08, "logits/chosen": 0.21551553905010223, "logits/rejected": 0.30063092708587646, "logps/chosen": -491.99810791015625, "logps/rejected": -625.0213012695312, "loss": 0.0107, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.9443299770355225, "rewards/margins": 1.5316195487976074, "rewards/rejected": -3.47594952583313, "step": 800 }, { "epoch": 1.67, "eval_logits/chosen": 0.28409868478775024, "eval_logits/rejected": 0.3196317255496979, "eval_logps/chosen": -503.05755615234375, "eval_logps/rejected": -641.1343994140625, "eval_loss": 0.022644678130745888, "eval_rewards/accuracies": 0.71875, "eval_rewards/chosen": -1.9947288036346436, "eval_rewards/margins": 1.3003644943237305, "eval_rewards/rejected": -3.295093536376953, "eval_runtime": 75.2508, "eval_samples_per_second": 26.578, "eval_steps_per_second": 0.425, "step": 800 }, { "epoch": 1.7, "learning_rate": 6.790558119157597e-08, "logits/chosen": 0.12448444217443466, "logits/rejected": 0.15070387721061707, "logps/chosen": -556.6755981445312, "logps/rejected": -664.4570922851562, "loss": 0.012, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.042440414428711, "rewards/margins": 1.2739028930664062, "rewards/rejected": -3.316342830657959, "step": 810 }, { "epoch": 1.72, "learning_rate": 5.898544083397e-08, "logits/chosen": 0.16522815823554993, "logits/rejected": 0.24165570735931396, "logps/chosen": -488.497314453125, "logps/rejected": -626.2318115234375, "loss": 0.0114, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.8827340602874756, "rewards/margins": 1.2979196310043335, "rewards/rejected": -3.1806535720825195, "step": 820 }, { "epoch": 1.74, "learning_rate": 5.065649387408705e-08, "logits/chosen": 0.1387493908405304, "logits/rejected": 0.18901556730270386, "logps/chosen": -501.7972717285156, "logps/rejected": -649.2764892578125, "loss": 0.0109, "rewards/accuracies": 0.78125, "rewards/chosen": -1.9313652515411377, "rewards/margins": 1.6679942607879639, "rewards/rejected": -3.5993595123291016, "step": 830 }, { "epoch": 1.76, "learning_rate": 4.292990551804171e-08, "logits/chosen": 0.12224831432104111, "logits/rejected": 0.2817748785018921, "logps/chosen": -532.9338989257812, "logps/rejected": -598.0217895507812, "loss": 0.0119, "rewards/accuracies": 0.75, "rewards/chosen": -1.9113423824310303, "rewards/margins": 1.2608517408370972, "rewards/rejected": -3.172194242477417, "step": 840 }, { "epoch": 1.78, "learning_rate": 3.581603349196371e-08, "logits/chosen": 0.13969172537326813, "logits/rejected": 0.24790000915527344, "logps/chosen": -473.35235595703125, "logps/rejected": -589.666259765625, "loss": 0.0108, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.7080204486846924, "rewards/margins": 1.3214609622955322, "rewards/rejected": -3.0294814109802246, "step": 850 }, { "epoch": 1.8, "learning_rate": 2.9324414157151367e-08, "logits/chosen": 0.2230512797832489, "logits/rejected": 0.22629483044147491, "logps/chosen": -498.7433166503906, "logps/rejected": -645.7354736328125, "loss": 0.0118, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.8642832040786743, "rewards/margins": 1.4582536220550537, "rewards/rejected": -3.3225369453430176, "step": 860 }, { "epoch": 1.82, "learning_rate": 2.3463749726290284e-08, "logits/chosen": 0.13460347056388855, "logits/rejected": 0.25947511196136475, "logps/chosen": -519.2232666015625, "logps/rejected": -617.5545654296875, "loss": 0.0107, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.8353309631347656, "rewards/margins": 1.3836863040924072, "rewards/rejected": -3.219017505645752, "step": 870 }, { "epoch": 1.84, "learning_rate": 1.824189659787284e-08, "logits/chosen": 0.2357769012451172, "logits/rejected": 0.23074205219745636, "logps/chosen": -469.6444396972656, "logps/rejected": -621.1527709960938, "loss": 0.0103, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.836211919784546, "rewards/margins": 1.5233814716339111, "rewards/rejected": -3.359593152999878, "step": 880 }, { "epoch": 1.86, "learning_rate": 1.3665854824458035e-08, "logits/chosen": 0.210123211145401, "logits/rejected": 0.2904731333255768, "logps/chosen": -509.2762756347656, "logps/rejected": -632.9801025390625, "loss": 0.0112, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.9698455333709717, "rewards/margins": 1.3619072437286377, "rewards/rejected": -3.3317527770996094, "step": 890 }, { "epoch": 1.88, "learning_rate": 9.741758728888217e-09, "logits/chosen": 0.12742657959461212, "logits/rejected": 0.16876272857189178, "logps/chosen": -529.4017333984375, "logps/rejected": -672.1700439453125, "loss": 0.0106, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.8726167678833008, "rewards/margins": 1.5384435653686523, "rewards/rejected": -3.411060333251953, "step": 900 }, { "epoch": 1.88, "eval_logits/chosen": 0.28414925932884216, "eval_logits/rejected": 0.32150793075561523, "eval_logps/chosen": -503.032470703125, "eval_logps/rejected": -640.8138427734375, "eval_loss": 0.022440288215875626, "eval_rewards/accuracies": 0.71484375, "eval_rewards/chosen": -1.9944782257080078, "eval_rewards/margins": 1.2974092960357666, "eval_rewards/rejected": -3.2918872833251953, "eval_runtime": 75.3961, "eval_samples_per_second": 26.527, "eval_steps_per_second": 0.424, "step": 900 }, { "epoch": 1.9, "learning_rate": 6.474868681043577e-09, "logits/chosen": 0.28579333424568176, "logits/rejected": 0.3222460150718689, "logps/chosen": -456.7623596191406, "logps/rejected": -624.6663208007812, "loss": 0.0108, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.772883415222168, "rewards/margins": 1.6853721141815186, "rewards/rejected": -3.4582557678222656, "step": 910 }, { "epoch": 1.93, "learning_rate": 3.869564046156459e-09, "logits/chosen": 0.12005837261676788, "logits/rejected": 0.25674593448638916, "logps/chosen": -503.75482177734375, "logps/rejected": -651.0211181640625, "loss": 0.0101, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.934744119644165, "rewards/margins": 1.5219500064849854, "rewards/rejected": -3.4566943645477295, "step": 920 }, { "epoch": 1.95, "learning_rate": 1.929337314139412e-09, "logits/chosen": 0.17854854464530945, "logits/rejected": 0.25535306334495544, "logps/chosen": -509.5010681152344, "logps/rejected": -665.1165771484375, "loss": 0.0115, "rewards/accuracies": 0.84375, "rewards/chosen": -1.8284565210342407, "rewards/margins": 1.7975488901138306, "rewards/rejected": -3.626005172729492, "step": 930 }, { "epoch": 1.97, "learning_rate": 6.567894177967325e-10, "logits/chosen": 0.17056016623973846, "logits/rejected": 0.19406890869140625, "logps/chosen": -532.8721923828125, "logps/rejected": -619.7807006835938, "loss": 0.0099, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.9198890924453735, "rewards/margins": 1.3293774127960205, "rewards/rejected": -3.2492668628692627, "step": 940 }, { "epoch": 1.99, "learning_rate": 5.3626246194704575e-11, "logits/chosen": 0.18597963452339172, "logits/rejected": 0.2593163549900055, "logps/chosen": -524.7967529296875, "logps/rejected": -610.752685546875, "loss": 0.0124, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.9720103740692139, "rewards/margins": 1.214902639389038, "rewards/rejected": -3.186913013458252, "step": 950 }, { "epoch": 2.0, "step": 954, "total_flos": 0.0, "train_loss": 0.049936374161290924, "train_runtime": 8881.7089, "train_samples_per_second": 13.766, "train_steps_per_second": 0.107 } ], "logging_steps": 10, "max_steps": 954, "num_train_epochs": 2, "save_steps": 1000, "total_flos": 0.0, "trial_name": null, "trial_params": null }