{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9984301412872841, "eval_steps": 100, "global_step": 477, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 1.0416666666666666e-08, "logits/chosen": -2.6597137451171875, "logits/rejected": -2.5902962684631348, "logps/chosen": -296.01092529296875, "logps/rejected": -290.09039306640625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.02, "learning_rate": 1.0416666666666667e-07, "logits/chosen": -2.624401807785034, "logits/rejected": -2.58382511138916, "logps/chosen": -278.5157775878906, "logps/rejected": -242.1708984375, "loss": 0.6931, "rewards/accuracies": 0.4618055522441864, "rewards/chosen": 0.00044631207128986716, "rewards/margins": 0.0005090843187645078, "rewards/rejected": -6.277219654293731e-05, "step": 10 }, { "epoch": 0.04, "learning_rate": 2.0833333333333333e-07, "logits/chosen": -2.6496779918670654, "logits/rejected": -2.5882985591888428, "logps/chosen": -276.2167663574219, "logps/rejected": -246.029052734375, "loss": 0.6924, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0011385921388864517, "rewards/margins": 0.001740106614306569, "rewards/rejected": -0.0006015143590047956, "step": 20 }, { "epoch": 0.06, "learning_rate": 3.1249999999999997e-07, "logits/chosen": -2.6365559101104736, "logits/rejected": -2.5856800079345703, "logps/chosen": -291.7817077636719, "logps/rejected": -274.3924255371094, "loss": 0.6887, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": 0.0060002789832651615, "rewards/margins": 0.009898515418171883, "rewards/rejected": -0.0038982369005680084, "step": 30 }, { "epoch": 0.08, "learning_rate": 4.1666666666666667e-07, "logits/chosen": -2.5962817668914795, "logits/rejected": -2.533491611480713, "logps/chosen": -304.540283203125, "logps/rejected": -277.9544677734375, "loss": 0.6757, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.02992216870188713, "rewards/margins": 0.03784631937742233, "rewards/rejected": -0.007924148812890053, "step": 40 }, { "epoch": 0.1, "learning_rate": 4.999731868769026e-07, "logits/chosen": -2.550187110900879, "logits/rejected": -2.5181498527526855, "logps/chosen": -298.8063049316406, "logps/rejected": -287.06878662109375, "loss": 0.6604, "rewards/accuracies": 0.6968749761581421, "rewards/chosen": -0.0016534685855731368, "rewards/margins": 0.07625629007816315, "rewards/rejected": -0.07790975272655487, "step": 50 }, { "epoch": 0.13, "learning_rate": 4.990353313429303e-07, "logits/chosen": -2.5104401111602783, "logits/rejected": -2.409574031829834, "logps/chosen": -286.75982666015625, "logps/rejected": -255.2505340576172, "loss": 0.6393, "rewards/accuracies": 0.6968749761581421, "rewards/chosen": -0.038099173456430435, "rewards/margins": 0.16247370839118958, "rewards/rejected": -0.2005728781223297, "step": 60 }, { "epoch": 0.15, "learning_rate": 4.967625656594781e-07, "logits/chosen": -2.5491867065429688, "logits/rejected": -2.4775824546813965, "logps/chosen": -305.6695556640625, "logps/rejected": -294.116455078125, "loss": 0.6222, "rewards/accuracies": 0.690625011920929, "rewards/chosen": -0.25629425048828125, "rewards/margins": 0.2100234478712082, "rewards/rejected": -0.466317743062973, "step": 70 }, { "epoch": 0.17, "learning_rate": 4.93167072587771e-07, "logits/chosen": -2.512619972229004, "logits/rejected": -2.459233522415161, "logps/chosen": -313.1577453613281, "logps/rejected": -333.73199462890625, "loss": 0.6115, "rewards/accuracies": 0.690625011920929, "rewards/chosen": -0.26731860637664795, "rewards/margins": 0.19400887191295624, "rewards/rejected": -0.4613274931907654, "step": 80 }, { "epoch": 0.19, "learning_rate": 4.882681251368548e-07, "logits/chosen": -2.453090190887451, "logits/rejected": -2.3753929138183594, "logps/chosen": -286.51220703125, "logps/rejected": -299.60174560546875, "loss": 0.5834, "rewards/accuracies": 0.6968749761581421, "rewards/chosen": -0.2537994682788849, "rewards/margins": 0.3461161255836487, "rewards/rejected": -0.5999155044555664, "step": 90 }, { "epoch": 0.21, "learning_rate": 4.820919832540181e-07, "logits/chosen": -2.481619358062744, "logits/rejected": -2.3836333751678467, "logps/chosen": -316.19061279296875, "logps/rejected": -341.30596923828125, "loss": 0.5736, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.3343900740146637, "rewards/margins": 0.4206913113594055, "rewards/rejected": -0.755081295967102, "step": 100 }, { "epoch": 0.21, "eval_logits/chosen": -2.432966470718384, "eval_logits/rejected": -2.3509771823883057, "eval_logps/chosen": -322.4753723144531, "eval_logps/rejected": -346.4595947265625, "eval_loss": 0.584247350692749, "eval_rewards/accuracies": 0.7242063283920288, "eval_rewards/chosen": -0.3836560547351837, "eval_rewards/margins": 0.46315449476242065, "eval_rewards/rejected": -0.846810519695282, "eval_runtime": 210.4195, "eval_samples_per_second": 9.505, "eval_steps_per_second": 0.299, "step": 100 }, { "epoch": 0.23, "learning_rate": 4.7467175306295647e-07, "logits/chosen": -2.392451763153076, "logits/rejected": -2.2609496116638184, "logps/chosen": -329.49127197265625, "logps/rejected": -338.2959289550781, "loss": 0.5785, "rewards/accuracies": 0.721875011920929, "rewards/chosen": -0.32086434960365295, "rewards/margins": 0.4109874665737152, "rewards/rejected": -0.7318518161773682, "step": 110 }, { "epoch": 0.25, "learning_rate": 4.6604720940421207e-07, "logits/chosen": -1.5522091388702393, "logits/rejected": -1.2740141153335571, "logps/chosen": -365.6708984375, "logps/rejected": -378.64117431640625, "loss": 0.5441, "rewards/accuracies": 0.7281249761581421, "rewards/chosen": -0.8018746376037598, "rewards/margins": 0.6111066341400146, "rewards/rejected": -1.4129812717437744, "step": 120 }, { "epoch": 0.27, "learning_rate": 4.5626458262912735e-07, "logits/chosen": -1.4644163846969604, "logits/rejected": -1.1872146129608154, "logps/chosen": -324.7268371582031, "logps/rejected": -374.5579833984375, "loss": 0.549, "rewards/accuracies": 0.71875, "rewards/chosen": -0.4890199601650238, "rewards/margins": 0.5601330995559692, "rewards/rejected": -1.0491530895233154, "step": 130 }, { "epoch": 0.29, "learning_rate": 4.453763107901675e-07, "logits/chosen": -0.9067952036857605, "logits/rejected": -0.45260438323020935, "logps/chosen": -365.0186462402344, "logps/rejected": -414.08526611328125, "loss": 0.5265, "rewards/accuracies": 0.784375011920929, "rewards/chosen": -0.6849763989448547, "rewards/margins": 0.7878872752189636, "rewards/rejected": -1.4728636741638184, "step": 140 }, { "epoch": 0.31, "learning_rate": 4.3344075855595097e-07, "logits/chosen": -0.6669697761535645, "logits/rejected": -0.12073500454425812, "logps/chosen": -363.00494384765625, "logps/rejected": -401.21533203125, "loss": 0.5248, "rewards/accuracies": 0.734375, "rewards/chosen": -0.7877362966537476, "rewards/margins": 0.7080036401748657, "rewards/rejected": -1.4957398176193237, "step": 150 }, { "epoch": 0.33, "learning_rate": 4.2052190435769554e-07, "logits/chosen": -0.41461697220802307, "logits/rejected": 0.06870967149734497, "logps/chosen": -365.81103515625, "logps/rejected": -428.4544982910156, "loss": 0.5073, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.8387645483016968, "rewards/margins": 0.7532976269721985, "rewards/rejected": -1.59206223487854, "step": 160 }, { "epoch": 0.36, "learning_rate": 4.0668899744407567e-07, "logits/chosen": -0.7587008476257324, "logits/rejected": -0.3191295862197876, "logps/chosen": -342.67425537109375, "logps/rejected": -367.0728759765625, "loss": 0.5653, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -0.8000960350036621, "rewards/margins": 0.5834600329399109, "rewards/rejected": -1.3835561275482178, "step": 170 }, { "epoch": 0.38, "learning_rate": 3.920161866827889e-07, "logits/chosen": -0.17186331748962402, "logits/rejected": 0.25430601835250854, "logps/chosen": -365.8907165527344, "logps/rejected": -429.11688232421875, "loss": 0.5225, "rewards/accuracies": 0.734375, "rewards/chosen": -0.884574294090271, "rewards/margins": 0.7979339361190796, "rewards/rejected": -1.6825082302093506, "step": 180 }, { "epoch": 0.4, "learning_rate": 3.765821230985757e-07, "logits/chosen": 0.03502330183982849, "logits/rejected": 0.6186638474464417, "logps/chosen": -348.1022644042969, "logps/rejected": -428.8955993652344, "loss": 0.5105, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.7972752451896667, "rewards/margins": 0.9095140695571899, "rewards/rejected": -1.706789255142212, "step": 190 }, { "epoch": 0.42, "learning_rate": 3.604695382782159e-07, "logits/chosen": -0.39503103494644165, "logits/rejected": 0.1367359161376953, "logps/chosen": -365.77545166015625, "logps/rejected": -400.91998291015625, "loss": 0.5116, "rewards/accuracies": 0.7281249761581421, "rewards/chosen": -0.7570338249206543, "rewards/margins": 0.7489927411079407, "rewards/rejected": -1.5060265064239502, "step": 200 }, { "epoch": 0.42, "eval_logits/chosen": 0.04591968283057213, "eval_logits/rejected": 0.5012260675430298, "eval_logps/chosen": -374.5287780761719, "eval_logps/rejected": -435.0859375, "eval_loss": 0.5307875871658325, "eval_rewards/accuracies": 0.7519841194152832, "eval_rewards/chosen": -0.904190182685852, "eval_rewards/margins": 0.8288835883140564, "eval_rewards/rejected": -1.7330738306045532, "eval_runtime": 211.7151, "eval_samples_per_second": 9.447, "eval_steps_per_second": 0.298, "step": 200 }, { "epoch": 0.44, "learning_rate": 3.4376480090239047e-07, "logits/chosen": 0.21950086951255798, "logits/rejected": 0.7312016487121582, "logps/chosen": -394.24078369140625, "logps/rejected": -439.376220703125, "loss": 0.5279, "rewards/accuracies": 0.715624988079071, "rewards/chosen": -1.0525823831558228, "rewards/margins": 0.7364203929901123, "rewards/rejected": -1.789002776145935, "step": 210 }, { "epoch": 0.46, "learning_rate": 3.265574537815398e-07, "logits/chosen": 0.034196797758340836, "logits/rejected": 0.35187411308288574, "logps/chosen": -332.6381530761719, "logps/rejected": -382.64434814453125, "loss": 0.5384, "rewards/accuracies": 0.7281249761581421, "rewards/chosen": -0.7204285860061646, "rewards/margins": 0.6646216511726379, "rewards/rejected": -1.3850500583648682, "step": 220 }, { "epoch": 0.48, "learning_rate": 3.0893973387735683e-07, "logits/chosen": 0.3270489573478699, "logits/rejected": 0.8940129280090332, "logps/chosen": -341.1247253417969, "logps/rejected": -408.52337646484375, "loss": 0.5218, "rewards/accuracies": 0.75, "rewards/chosen": -0.765308141708374, "rewards/margins": 0.8177844882011414, "rewards/rejected": -1.583092451095581, "step": 230 }, { "epoch": 0.5, "learning_rate": 2.910060778827554e-07, "logits/chosen": 0.8036887049674988, "logits/rejected": 1.329053521156311, "logps/chosen": -384.66473388671875, "logps/rejected": -448.1686096191406, "loss": 0.5285, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.1919479370117188, "rewards/margins": 0.8430492281913757, "rewards/rejected": -2.0349972248077393, "step": 240 }, { "epoch": 0.52, "learning_rate": 2.7285261601056697e-07, "logits/chosen": 0.51897132396698, "logits/rejected": 1.1455881595611572, "logps/chosen": -373.64044189453125, "logps/rejected": -425.75494384765625, "loss": 0.519, "rewards/accuracies": 0.715624988079071, "rewards/chosen": -0.9999877214431763, "rewards/margins": 0.7631025314331055, "rewards/rejected": -1.7630901336669922, "step": 250 }, { "epoch": 0.54, "learning_rate": 2.5457665670441937e-07, "logits/chosen": 0.5749747157096863, "logits/rejected": 1.220221757888794, "logps/chosen": -371.4618225097656, "logps/rejected": -424.69439697265625, "loss": 0.4969, "rewards/accuracies": 0.765625, "rewards/chosen": -0.9122276306152344, "rewards/margins": 0.8183482885360718, "rewards/rejected": -1.7305759191513062, "step": 260 }, { "epoch": 0.57, "learning_rate": 2.3627616503391812e-07, "logits/chosen": 0.8650910258293152, "logits/rejected": 1.3838642835617065, "logps/chosen": -398.350341796875, "logps/rejected": -484.79547119140625, "loss": 0.5051, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.091962456703186, "rewards/margins": 0.9698148965835571, "rewards/rejected": -2.061777353286743, "step": 270 }, { "epoch": 0.59, "learning_rate": 2.1804923757009882e-07, "logits/chosen": 0.9545858502388, "logits/rejected": 1.6210914850234985, "logps/chosen": -365.74053955078125, "logps/rejected": -422.39300537109375, "loss": 0.5248, "rewards/accuracies": 0.7281249761581421, "rewards/chosen": -0.9749383926391602, "rewards/margins": 0.841883659362793, "rewards/rejected": -1.8168220520019531, "step": 280 }, { "epoch": 0.61, "learning_rate": 1.9999357655598891e-07, "logits/chosen": 1.363952875137329, "logits/rejected": 2.0388360023498535, "logps/chosen": -385.5817565917969, "logps/rejected": -436.33209228515625, "loss": 0.5155, "rewards/accuracies": 0.7281249761581421, "rewards/chosen": -1.1055189371109009, "rewards/margins": 0.77390056848526, "rewards/rejected": -1.8794195652008057, "step": 290 }, { "epoch": 0.63, "learning_rate": 1.8220596619089573e-07, "logits/chosen": 1.3142567873001099, "logits/rejected": 1.9236018657684326, "logps/chosen": -394.31964111328125, "logps/rejected": -467.43756103515625, "loss": 0.5027, "rewards/accuracies": 0.734375, "rewards/chosen": -0.9480918049812317, "rewards/margins": 0.7692841291427612, "rewards/rejected": -1.7173759937286377, "step": 300 }, { "epoch": 0.63, "eval_logits/chosen": 1.1385366916656494, "eval_logits/rejected": 1.822389006614685, "eval_logps/chosen": -372.8834228515625, "eval_logps/rejected": -436.4477844238281, "eval_loss": 0.508425235748291, "eval_rewards/accuracies": 0.7638888955116272, "eval_rewards/chosen": -0.887736976146698, "eval_rewards/margins": 0.8589555621147156, "eval_rewards/rejected": -1.746692419052124, "eval_runtime": 211.1789, "eval_samples_per_second": 9.471, "eval_steps_per_second": 0.298, "step": 300 }, { "epoch": 0.65, "learning_rate": 1.647817538357072e-07, "logits/chosen": 1.0226597785949707, "logits/rejected": 1.96248459815979, "logps/chosen": -367.84130859375, "logps/rejected": -404.55963134765625, "loss": 0.4985, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.9431735277175903, "rewards/margins": 0.8295876383781433, "rewards/rejected": -1.7727611064910889, "step": 310 }, { "epoch": 0.67, "learning_rate": 1.478143389201113e-07, "logits/chosen": 1.6019757986068726, "logits/rejected": 2.321258068084717, "logps/chosen": -374.2918701171875, "logps/rejected": -456.11602783203125, "loss": 0.5035, "rewards/accuracies": 0.7718750238418579, "rewards/chosen": -1.0117809772491455, "rewards/margins": 0.9862319231033325, "rewards/rejected": -1.998012900352478, "step": 320 }, { "epoch": 0.69, "learning_rate": 1.3139467229135998e-07, "logits/chosen": 1.089303970336914, "logits/rejected": 1.724713921546936, "logps/chosen": -422.41314697265625, "logps/rejected": -462.42999267578125, "loss": 0.5162, "rewards/accuracies": 0.75, "rewards/chosen": -1.184761881828308, "rewards/margins": 0.7901886701583862, "rewards/rejected": -1.9749505519866943, "step": 330 }, { "epoch": 0.71, "learning_rate": 1.1561076868822755e-07, "logits/chosen": 1.2003899812698364, "logits/rejected": 1.7042887210845947, "logps/chosen": -374.2175598144531, "logps/rejected": -423.61114501953125, "loss": 0.5141, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.0190467834472656, "rewards/margins": 0.6825306415557861, "rewards/rejected": -1.7015774250030518, "step": 340 }, { "epoch": 0.73, "learning_rate": 1.0054723495346482e-07, "logits/chosen": 1.183173418045044, "logits/rejected": 2.0356898307800293, "logps/chosen": -359.0115661621094, "logps/rejected": -438.88336181640625, "loss": 0.4766, "rewards/accuracies": 0.7718750238418579, "rewards/chosen": -1.0118539333343506, "rewards/margins": 0.9436109662055969, "rewards/rejected": -1.9554650783538818, "step": 350 }, { "epoch": 0.75, "learning_rate": 8.628481651367875e-08, "logits/chosen": 1.1831741333007812, "logits/rejected": 1.9693758487701416, "logps/chosen": -408.16705322265625, "logps/rejected": -451.48370361328125, "loss": 0.4836, "rewards/accuracies": 0.746874988079071, "rewards/chosen": -1.0999749898910522, "rewards/margins": 0.8795393705368042, "rewards/rejected": -1.9795143604278564, "step": 360 }, { "epoch": 0.77, "learning_rate": 7.289996455765748e-08, "logits/chosen": 1.1307913064956665, "logits/rejected": 2.096768856048584, "logps/chosen": -413.69805908203125, "logps/rejected": -458.0303649902344, "loss": 0.499, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.0460116863250732, "rewards/margins": 0.9790540933609009, "rewards/rejected": -2.0250658988952637, "step": 370 }, { "epoch": 0.8, "learning_rate": 6.046442623320145e-08, "logits/chosen": 0.9909412264823914, "logits/rejected": 1.6467933654785156, "logps/chosen": -381.5520324707031, "logps/rejected": -453.1272888183594, "loss": 0.4727, "rewards/accuracies": 0.75, "rewards/chosen": -1.0082350969314575, "rewards/margins": 0.8770313262939453, "rewards/rejected": -1.8852663040161133, "step": 380 }, { "epoch": 0.82, "learning_rate": 4.904486005914027e-08, "logits/chosen": 1.356707215309143, "logits/rejected": 1.9564104080200195, "logps/chosen": -383.09393310546875, "logps/rejected": -455.9044494628906, "loss": 0.5061, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.0799481868743896, "rewards/margins": 0.7995551228523254, "rewards/rejected": -1.879503607749939, "step": 390 }, { "epoch": 0.84, "learning_rate": 3.8702478614051345e-08, "logits/chosen": 1.2665964365005493, "logits/rejected": 1.9937832355499268, "logps/chosen": -441.0684509277344, "logps/rejected": -508.45269775390625, "loss": 0.4823, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.202941656112671, "rewards/margins": 1.0496307611465454, "rewards/rejected": -2.2525722980499268, "step": 400 }, { "epoch": 0.84, "eval_logits/chosen": 1.3074696063995361, "eval_logits/rejected": 1.9977563619613647, "eval_logps/chosen": -403.6374816894531, "eval_logps/rejected": -476.98516845703125, "eval_loss": 0.5037237405776978, "eval_rewards/accuracies": 0.761904776096344, "eval_rewards/chosen": -1.195277214050293, "eval_rewards/margins": 0.9567888975143433, "eval_rewards/rejected": -2.152066230773926, "eval_runtime": 212.3326, "eval_samples_per_second": 9.419, "eval_steps_per_second": 0.297, "step": 400 }, { "epoch": 0.86, "learning_rate": 2.9492720416985e-08, "logits/chosen": 1.6189842224121094, "logits/rejected": 2.477853775024414, "logps/chosen": -386.3858947753906, "logps/rejected": -442.0225524902344, "loss": 0.5095, "rewards/accuracies": 0.7593749761581421, "rewards/chosen": -1.2031550407409668, "rewards/margins": 0.845988929271698, "rewards/rejected": -2.0491440296173096, "step": 410 }, { "epoch": 0.88, "learning_rate": 2.1464952759020856e-08, "logits/chosen": 1.4740936756134033, "logits/rejected": 1.9605319499969482, "logps/chosen": -398.62274169921875, "logps/rejected": -460.81201171875, "loss": 0.5112, "rewards/accuracies": 0.715624988079071, "rewards/chosen": -1.1762058734893799, "rewards/margins": 0.8007314801216125, "rewards/rejected": -1.9769372940063477, "step": 420 }, { "epoch": 0.9, "learning_rate": 1.4662207078575684e-08, "logits/chosen": 1.1320947408676147, "logits/rejected": 1.8564393520355225, "logps/chosen": -397.1152648925781, "logps/rejected": -444.3277282714844, "loss": 0.5032, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.07761549949646, "rewards/margins": 0.886116623878479, "rewards/rejected": -1.963732361793518, "step": 430 }, { "epoch": 0.92, "learning_rate": 9.12094829893642e-09, "logits/chosen": 1.1670544147491455, "logits/rejected": 1.9005542993545532, "logps/chosen": -362.5390319824219, "logps/rejected": -448.8984375, "loss": 0.4901, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.019899606704712, "rewards/margins": 0.9710233807563782, "rewards/rejected": -1.9909226894378662, "step": 440 }, { "epoch": 0.94, "learning_rate": 4.8708793644441086e-09, "logits/chosen": 1.165825605392456, "logits/rejected": 1.7851331233978271, "logps/chosen": -393.0291442871094, "logps/rejected": -466.2897033691406, "loss": 0.5112, "rewards/accuracies": 0.71875, "rewards/chosen": -1.1060134172439575, "rewards/margins": 0.8082348704338074, "rewards/rejected": -1.9142482280731201, "step": 450 }, { "epoch": 0.96, "learning_rate": 1.9347820230782295e-09, "logits/chosen": 1.1381809711456299, "logits/rejected": 2.048168897628784, "logps/chosen": -380.725830078125, "logps/rejected": -454.0580139160156, "loss": 0.4878, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.0911556482315063, "rewards/margins": 0.9514607191085815, "rewards/rejected": -2.042616367340088, "step": 460 }, { "epoch": 0.98, "learning_rate": 3.2839470889836627e-10, "logits/chosen": 0.9774567484855652, "logits/rejected": 1.967919111251831, "logps/chosen": -411.60504150390625, "logps/rejected": -474.35650634765625, "loss": 0.4835, "rewards/accuracies": 0.796875, "rewards/chosen": -1.1035670042037964, "rewards/margins": 0.9632620811462402, "rewards/rejected": -2.066829204559326, "step": 470 }, { "epoch": 1.0, "step": 477, "total_flos": 0.0, "train_loss": 0.540120158305458, "train_runtime": 18392.9814, "train_samples_per_second": 3.324, "train_steps_per_second": 0.026 } ], "logging_steps": 10, "max_steps": 477, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }