{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9984301412872841, "eval_steps": 100, "global_step": 477, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 1.0416666666666666e-08, "logits/chosen": 0.01849743165075779, "logits/rejected": 0.013860300183296204, "logps/chosen": -318.92303466796875, "logps/rejected": -327.4117126464844, "loss": 0.0872, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.02, "learning_rate": 1.0416666666666667e-07, "logits/chosen": 0.0165844839066267, "logits/rejected": 0.029045505449175835, "logps/chosen": -380.119384765625, "logps/rejected": -372.70452880859375, "loss": 0.0916, "rewards/accuracies": 0.4930555522441864, "rewards/chosen": 0.00031676876824349165, "rewards/margins": 0.0008045767317526042, "rewards/rejected": -0.00048780813813209534, "step": 10 }, { "epoch": 0.04, "learning_rate": 2.0833333333333333e-07, "logits/chosen": -0.01443287543952465, "logits/rejected": 0.01765434443950653, "logps/chosen": -396.4976501464844, "logps/rejected": -366.0671691894531, "loss": 0.0929, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.000257034320384264, "rewards/margins": 0.0013006285298615694, "rewards/rejected": -0.0010435942094773054, "step": 20 }, { "epoch": 0.06, "learning_rate": 3.1249999999999997e-07, "logits/chosen": 0.037671297788619995, "logits/rejected": 0.06698160618543625, "logps/chosen": -374.0677795410156, "logps/rejected": -360.3742370605469, "loss": 0.0849, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.0024321433156728745, "rewards/margins": 0.003862987505272031, "rewards/rejected": -0.006295130588114262, "step": 30 }, { "epoch": 0.08, "learning_rate": 4.1666666666666667e-07, "logits/chosen": -0.016021814197301865, "logits/rejected": 0.040130265057086945, "logps/chosen": -384.62115478515625, "logps/rejected": -369.37591552734375, "loss": 0.0899, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.005014514084905386, "rewards/margins": 0.00654798885807395, "rewards/rejected": -0.01156250387430191, "step": 40 }, { "epoch": 0.1, "learning_rate": 4.999731868769026e-07, "logits/chosen": 0.021576542407274246, "logits/rejected": 0.04092331975698471, "logps/chosen": -395.0044860839844, "logps/rejected": -385.6026306152344, "loss": 0.0905, "rewards/accuracies": 0.65625, "rewards/chosen": -0.011928597465157509, "rewards/margins": 0.01728428527712822, "rewards/rejected": -0.02921288087964058, "step": 50 }, { "epoch": 0.13, "learning_rate": 4.990353313429303e-07, "logits/chosen": 0.09396852552890778, "logits/rejected": 0.177364319562912, "logps/chosen": -373.46978759765625, "logps/rejected": -350.2561950683594, "loss": 0.0896, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.022122707217931747, "rewards/margins": 0.04510267823934555, "rewards/rejected": -0.067225381731987, "step": 60 }, { "epoch": 0.15, "learning_rate": 4.967625656594781e-07, "logits/chosen": 0.09231746941804886, "logits/rejected": 0.10504136979579926, "logps/chosen": -380.4566955566406, "logps/rejected": -384.76495361328125, "loss": 0.0895, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.020214151591062546, "rewards/margins": 0.044125162065029144, "rewards/rejected": -0.06433931738138199, "step": 70 }, { "epoch": 0.17, "learning_rate": 4.93167072587771e-07, "logits/chosen": 0.1812177449464798, "logits/rejected": 0.2344866693019867, "logps/chosen": -373.54779052734375, "logps/rejected": -344.9815673828125, "loss": 0.0887, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.003499386366456747, "rewards/margins": 0.11121924966573715, "rewards/rejected": -0.11471863090991974, "step": 80 }, { "epoch": 0.19, "learning_rate": 4.882681251368548e-07, "logits/chosen": 0.23078179359436035, "logits/rejected": 0.3160688281059265, "logps/chosen": -398.22735595703125, "logps/rejected": -354.7359619140625, "loss": 0.0854, "rewards/accuracies": 0.6875, "rewards/chosen": -0.03982505947351456, "rewards/margins": 0.12135788053274155, "rewards/rejected": -0.1611829400062561, "step": 90 }, { "epoch": 0.21, "learning_rate": 4.820919832540181e-07, "logits/chosen": 0.33522385358810425, "logits/rejected": 0.34693339467048645, "logps/chosen": -373.6068115234375, "logps/rejected": -393.63311767578125, "loss": 0.09, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.05229802802205086, "rewards/margins": 0.1304590255022049, "rewards/rejected": -0.18275703489780426, "step": 100 }, { "epoch": 0.21, "eval_logits/chosen": 0.49261584877967834, "eval_logits/rejected": 0.5302599668502808, "eval_logps/chosen": -392.5748291015625, "eval_logps/rejected": -418.8423767089844, "eval_loss": 0.08443526923656464, "eval_rewards/accuracies": 0.69921875, "eval_rewards/chosen": -0.09445539116859436, "eval_rewards/margins": 0.20123936235904694, "eval_rewards/rejected": -0.2956947684288025, "eval_runtime": 75.5045, "eval_samples_per_second": 26.488, "eval_steps_per_second": 0.424, "step": 100 }, { "epoch": 0.23, "learning_rate": 4.7467175306295647e-07, "logits/chosen": 0.5233359336853027, "logits/rejected": 0.5924205780029297, "logps/chosen": -409.8135681152344, "logps/rejected": -400.6418151855469, "loss": 0.0775, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.17791931331157684, "rewards/margins": 0.2254853993654251, "rewards/rejected": -0.40340471267700195, "step": 110 }, { "epoch": 0.25, "learning_rate": 4.6604720940421207e-07, "logits/chosen": 0.6610409021377563, "logits/rejected": 0.8009072542190552, "logps/chosen": -459.3719787597656, "logps/rejected": -480.128662109375, "loss": 0.0697, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.2562519609928131, "rewards/margins": 0.2973101735115051, "rewards/rejected": -0.5535621643066406, "step": 120 }, { "epoch": 0.27, "learning_rate": 4.5626458262912735e-07, "logits/chosen": 0.8142817616462708, "logits/rejected": 1.0136159658432007, "logps/chosen": -453.57037353515625, "logps/rejected": -438.6094665527344, "loss": 0.0557, "rewards/accuracies": 0.65625, "rewards/chosen": -0.45035696029663086, "rewards/margins": 0.2075636386871338, "rewards/rejected": -0.6579206585884094, "step": 130 }, { "epoch": 0.29, "learning_rate": 4.453763107901675e-07, "logits/chosen": 0.9267638325691223, "logits/rejected": 0.9543718099594116, "logps/chosen": -426.4134826660156, "logps/rejected": -436.49261474609375, "loss": 0.06, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.33891427516937256, "rewards/margins": 0.302972674369812, "rewards/rejected": -0.6418868899345398, "step": 140 }, { "epoch": 0.31, "learning_rate": 4.3344075855595097e-07, "logits/chosen": 0.834929347038269, "logits/rejected": 1.0096248388290405, "logps/chosen": -383.9637756347656, "logps/rejected": -392.84912109375, "loss": 0.0588, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.2390960454940796, "rewards/margins": 0.35297515988349915, "rewards/rejected": -0.5920711755752563, "step": 150 }, { "epoch": 0.33, "learning_rate": 4.2052190435769554e-07, "logits/chosen": 1.0894076824188232, "logits/rejected": 1.2157137393951416, "logps/chosen": -429.09857177734375, "logps/rejected": -461.9745178222656, "loss": 0.0509, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5552287101745605, "rewards/margins": 0.3786623775959015, "rewards/rejected": -0.9338911175727844, "step": 160 }, { "epoch": 0.36, "learning_rate": 4.0668899744407567e-07, "logits/chosen": 0.9078506231307983, "logits/rejected": 1.0372017621994019, "logps/chosen": -482.3373107910156, "logps/rejected": -479.88916015625, "loss": 0.0479, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.505352795124054, "rewards/margins": 0.26132458448410034, "rewards/rejected": -0.7666773796081543, "step": 170 }, { "epoch": 0.38, "learning_rate": 3.920161866827889e-07, "logits/chosen": 0.80833500623703, "logits/rejected": 0.8488121032714844, "logps/chosen": -413.3409118652344, "logps/rejected": -438.3705139160156, "loss": 0.0476, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4049296975135803, "rewards/margins": 0.3900560736656189, "rewards/rejected": -0.7949857115745544, "step": 180 }, { "epoch": 0.4, "learning_rate": 3.765821230985757e-07, "logits/chosen": 0.9091412425041199, "logits/rejected": 1.0051593780517578, "logps/chosen": -395.74383544921875, "logps/rejected": -402.8367919921875, "loss": 0.0478, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.4961649775505066, "rewards/margins": 0.3637959361076355, "rewards/rejected": -0.8599609136581421, "step": 190 }, { "epoch": 0.42, "learning_rate": 3.604695382782159e-07, "logits/chosen": 1.0421111583709717, "logits/rejected": 1.1686071157455444, "logps/chosen": -422.24224853515625, "logps/rejected": -469.1251525878906, "loss": 0.0405, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.7005800008773804, "rewards/margins": 0.46449971199035645, "rewards/rejected": -1.1650797128677368, "step": 200 }, { "epoch": 0.42, "eval_logits/chosen": 1.1859312057495117, "eval_logits/rejected": 1.2733540534973145, "eval_logps/chosen": -449.3788757324219, "eval_logps/rejected": -505.84661865234375, "eval_loss": 0.045209601521492004, "eval_rewards/accuracies": 0.75390625, "eval_rewards/chosen": -0.6624964475631714, "eval_rewards/margins": 0.5032405257225037, "eval_rewards/rejected": -1.1657369136810303, "eval_runtime": 75.0855, "eval_samples_per_second": 26.636, "eval_steps_per_second": 0.426, "step": 200 }, { "epoch": 0.44, "learning_rate": 3.4376480090239047e-07, "logits/chosen": 0.9289053082466125, "logits/rejected": 1.0322377681732178, "logps/chosen": -454.09521484375, "logps/rejected": -484.48956298828125, "loss": 0.0428, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.5330354571342468, "rewards/margins": 0.47441625595092773, "rewards/rejected": -1.0074517726898193, "step": 210 }, { "epoch": 0.46, "learning_rate": 3.265574537815398e-07, "logits/chosen": 0.6325788497924805, "logits/rejected": 0.8454742431640625, "logps/chosen": -443.6888732910156, "logps/rejected": -444.2510681152344, "loss": 0.051, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.4299241006374359, "rewards/margins": 0.41193485260009766, "rewards/rejected": -0.8418590426445007, "step": 220 }, { "epoch": 0.48, "learning_rate": 3.0893973387735683e-07, "logits/chosen": 0.8997888565063477, "logits/rejected": 0.9853512048721313, "logps/chosen": -413.89520263671875, "logps/rejected": -458.99676513671875, "loss": 0.0525, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.5308324694633484, "rewards/margins": 0.4597201943397522, "rewards/rejected": -0.9905527830123901, "step": 230 }, { "epoch": 0.5, "learning_rate": 2.910060778827554e-07, "logits/chosen": 1.0547417402267456, "logits/rejected": 1.1306800842285156, "logps/chosen": -493.91790771484375, "logps/rejected": -539.1799926757812, "loss": 0.0471, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.6113244295120239, "rewards/margins": 0.5182110667228699, "rewards/rejected": -1.1295355558395386, "step": 240 }, { "epoch": 0.52, "learning_rate": 2.7285261601056697e-07, "logits/chosen": 1.2281643152236938, "logits/rejected": 1.359076976776123, "logps/chosen": -466.77001953125, "logps/rejected": -483.91259765625, "loss": 0.0419, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.7258759140968323, "rewards/margins": 0.42711353302001953, "rewards/rejected": -1.152989387512207, "step": 250 }, { "epoch": 0.54, "learning_rate": 2.5457665670441937e-07, "logits/chosen": 1.2255347967147827, "logits/rejected": 1.462003469467163, "logps/chosen": -491.76190185546875, "logps/rejected": -505.47161865234375, "loss": 0.0451, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.7059242725372314, "rewards/margins": 0.6359472274780273, "rewards/rejected": -1.3418715000152588, "step": 260 }, { "epoch": 0.57, "learning_rate": 2.3627616503391812e-07, "logits/chosen": 1.3674428462982178, "logits/rejected": 1.578064203262329, "logps/chosen": -486.397216796875, "logps/rejected": -492.1827087402344, "loss": 0.0472, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.7773429155349731, "rewards/margins": 0.38945746421813965, "rewards/rejected": -1.1668003797531128, "step": 270 }, { "epoch": 0.59, "learning_rate": 2.1804923757009882e-07, "logits/chosen": 1.366081953048706, "logits/rejected": 1.5207383632659912, "logps/chosen": -477.0743103027344, "logps/rejected": -530.8953857421875, "loss": 0.0445, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.8941423296928406, "rewards/margins": 0.4790104925632477, "rewards/rejected": -1.3731528520584106, "step": 280 }, { "epoch": 0.61, "learning_rate": 1.9999357655598891e-07, "logits/chosen": 1.2689809799194336, "logits/rejected": 1.4011085033416748, "logps/chosen": -438.982421875, "logps/rejected": -469.45703125, "loss": 0.0464, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.62468022108078, "rewards/margins": 0.513271689414978, "rewards/rejected": -1.1379519701004028, "step": 290 }, { "epoch": 0.63, "learning_rate": 1.8220596619089573e-07, "logits/chosen": 1.1505718231201172, "logits/rejected": 1.4240622520446777, "logps/chosen": -458.03631591796875, "logps/rejected": -443.11712646484375, "loss": 0.0479, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.6710134148597717, "rewards/margins": 0.39567166566848755, "rewards/rejected": -1.0666849613189697, "step": 300 }, { "epoch": 0.63, "eval_logits/chosen": 1.2982094287872314, "eval_logits/rejected": 1.409311056137085, "eval_logps/chosen": -435.2132568359375, "eval_logps/rejected": -501.30841064453125, "eval_loss": 0.047696553170681, "eval_rewards/accuracies": 0.73828125, "eval_rewards/chosen": -0.5208398699760437, "eval_rewards/margins": 0.5995149612426758, "eval_rewards/rejected": -1.1203548908233643, "eval_runtime": 75.296, "eval_samples_per_second": 26.562, "eval_steps_per_second": 0.425, "step": 300 }, { "epoch": 0.65, "learning_rate": 1.647817538357072e-07, "logits/chosen": 1.2780801057815552, "logits/rejected": 1.3399560451507568, "logps/chosen": -475.42413330078125, "logps/rejected": -517.4520263671875, "loss": 0.0478, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.7529923319816589, "rewards/margins": 0.4368392825126648, "rewards/rejected": -1.1898316144943237, "step": 310 }, { "epoch": 0.67, "learning_rate": 1.478143389201113e-07, "logits/chosen": 1.198677897453308, "logits/rejected": 1.4085700511932373, "logps/chosen": -498.35711669921875, "logps/rejected": -497.4380798339844, "loss": 0.0424, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6401562690734863, "rewards/margins": 0.48012202978134155, "rewards/rejected": -1.1202783584594727, "step": 320 }, { "epoch": 0.69, "learning_rate": 1.3139467229135998e-07, "logits/chosen": 1.2183105945587158, "logits/rejected": 1.2747819423675537, "logps/chosen": -442.5284118652344, "logps/rejected": -533.216796875, "loss": 0.0454, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.5910875201225281, "rewards/margins": 0.5799761414527893, "rewards/rejected": -1.1710636615753174, "step": 330 }, { "epoch": 0.71, "learning_rate": 1.1561076868822755e-07, "logits/chosen": 1.203604817390442, "logits/rejected": 1.1832085847854614, "logps/chosen": -441.4521484375, "logps/rejected": -512.8982543945312, "loss": 0.0428, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.7571262121200562, "rewards/margins": 0.4640630781650543, "rewards/rejected": -1.221189260482788, "step": 340 }, { "epoch": 0.73, "learning_rate": 1.0054723495346482e-07, "logits/chosen": 1.3052194118499756, "logits/rejected": 1.382683515548706, "logps/chosen": -465.3661193847656, "logps/rejected": -528.7847290039062, "loss": 0.0412, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.7757940292358398, "rewards/margins": 0.4744884967803955, "rewards/rejected": -1.2502825260162354, "step": 350 }, { "epoch": 0.75, "learning_rate": 8.628481651367875e-08, "logits/chosen": 1.1976938247680664, "logits/rejected": 1.432969331741333, "logps/chosen": -491.15771484375, "logps/rejected": -515.0520629882812, "loss": 0.0446, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.6491819620132446, "rewards/margins": 0.6244359612464905, "rewards/rejected": -1.2736178636550903, "step": 360 }, { "epoch": 0.77, "learning_rate": 7.289996455765748e-08, "logits/chosen": 1.192779541015625, "logits/rejected": 1.324210524559021, "logps/chosen": -504.5486755371094, "logps/rejected": -508.7030334472656, "loss": 0.0435, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.7155844569206238, "rewards/margins": 0.5151349306106567, "rewards/rejected": -1.2307194471359253, "step": 370 }, { "epoch": 0.8, "learning_rate": 6.046442623320145e-08, "logits/chosen": 1.223356008529663, "logits/rejected": 1.4434764385223389, "logps/chosen": -474.7169494628906, "logps/rejected": -518.0782470703125, "loss": 0.0476, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.6729675531387329, "rewards/margins": 0.6387326717376709, "rewards/rejected": -1.3117002248764038, "step": 380 }, { "epoch": 0.82, "learning_rate": 4.904486005914027e-08, "logits/chosen": 1.3060978651046753, "logits/rejected": 1.4896109104156494, "logps/chosen": -470.46661376953125, "logps/rejected": -502.4981384277344, "loss": 0.0482, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.6783354878425598, "rewards/margins": 0.5045996904373169, "rewards/rejected": -1.1829349994659424, "step": 390 }, { "epoch": 0.84, "learning_rate": 3.8702478614051345e-08, "logits/chosen": 1.3413165807724, "logits/rejected": 1.4800562858581543, "logps/chosen": -450.84844970703125, "logps/rejected": -509.7266540527344, "loss": 0.0457, "rewards/accuracies": 0.65625, "rewards/chosen": -0.7674819231033325, "rewards/margins": 0.4173991084098816, "rewards/rejected": -1.1848809719085693, "step": 400 }, { "epoch": 0.84, "eval_logits/chosen": 1.491492748260498, "eval_logits/rejected": 1.6154029369354248, "eval_logps/chosen": -448.419677734375, "eval_logps/rejected": -518.3443603515625, "eval_loss": 0.044891636818647385, "eval_rewards/accuracies": 0.73828125, "eval_rewards/chosen": -0.6529037952423096, "eval_rewards/margins": 0.6378109455108643, "eval_rewards/rejected": -1.2907147407531738, "eval_runtime": 74.6873, "eval_samples_per_second": 26.778, "eval_steps_per_second": 0.428, "step": 400 }, { "epoch": 0.86, "learning_rate": 2.9492720416985e-08, "logits/chosen": 1.3658090829849243, "logits/rejected": 1.523946762084961, "logps/chosen": -461.0426330566406, "logps/rejected": -491.6429138183594, "loss": 0.045, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.6857269406318665, "rewards/margins": 0.5137700438499451, "rewards/rejected": -1.1994969844818115, "step": 410 }, { "epoch": 0.88, "learning_rate": 2.1464952759020856e-08, "logits/chosen": 1.3796783685684204, "logits/rejected": 1.5178402662277222, "logps/chosen": -454.60455322265625, "logps/rejected": -483.65704345703125, "loss": 0.0418, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.6705530285835266, "rewards/margins": 0.604373574256897, "rewards/rejected": -1.2749265432357788, "step": 420 }, { "epoch": 0.9, "learning_rate": 1.4662207078575684e-08, "logits/chosen": 1.334680199623108, "logits/rejected": 1.4741976261138916, "logps/chosen": -504.280029296875, "logps/rejected": -529.8871459960938, "loss": 0.0453, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.7305961847305298, "rewards/margins": 0.5881385207176208, "rewards/rejected": -1.3187347650527954, "step": 430 }, { "epoch": 0.92, "learning_rate": 9.12094829893642e-09, "logits/chosen": 1.3827157020568848, "logits/rejected": 1.5478546619415283, "logps/chosen": -453.01171875, "logps/rejected": -480.3030700683594, "loss": 0.0414, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.7755357027053833, "rewards/margins": 0.5378071069717407, "rewards/rejected": -1.313342809677124, "step": 440 }, { "epoch": 0.94, "learning_rate": 4.8708793644441086e-09, "logits/chosen": 1.2280631065368652, "logits/rejected": 1.454526662826538, "logps/chosen": -487.4305114746094, "logps/rejected": -500.71087646484375, "loss": 0.0425, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6377500295639038, "rewards/margins": 0.5590785145759583, "rewards/rejected": -1.1968284845352173, "step": 450 }, { "epoch": 0.96, "learning_rate": 1.9347820230782295e-09, "logits/chosen": 1.336721658706665, "logits/rejected": 1.4986612796783447, "logps/chosen": -455.5997619628906, "logps/rejected": -474.46038818359375, "loss": 0.0425, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.7689257264137268, "rewards/margins": 0.4747004508972168, "rewards/rejected": -1.243626356124878, "step": 460 }, { "epoch": 0.98, "learning_rate": 3.2839470889836627e-10, "logits/chosen": 1.2109500169754028, "logits/rejected": 1.3351854085922241, "logps/chosen": -490.6439514160156, "logps/rejected": -541.4273681640625, "loss": 0.0421, "rewards/accuracies": 0.75, "rewards/chosen": -0.7248164415359497, "rewards/margins": 0.548802375793457, "rewards/rejected": -1.2736186981201172, "step": 470 }, { "epoch": 1.0, "step": 477, "total_flos": 0.0, "train_loss": 0.0564979040210352, "train_runtime": 4410.0999, "train_samples_per_second": 13.862, "train_steps_per_second": 0.108 } ], "logging_steps": 10, "max_steps": 477, "num_train_epochs": 1, "save_steps": 1000, "total_flos": 0.0, "trial_name": null, "trial_params": null }