{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 478, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 45.900680687932606, "learning_rate": 1.0416666666666666e-08, "logits/chosen": -2.7660439014434814, "logits/rejected": -2.717564582824707, "logps/chosen": -269.8568420410156, "logps/rejected": -360.52459716796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.02, "grad_norm": 42.914729941865076, "learning_rate": 1.0416666666666667e-07, "logits/chosen": -2.592682361602783, "logits/rejected": -2.5630006790161133, "logps/chosen": -264.6473388671875, "logps/rejected": -251.43508911132812, "loss": 0.6923, "rewards/accuracies": 0.4444444477558136, "rewards/chosen": 0.15051230788230896, "rewards/margins": 0.03262672945857048, "rewards/rejected": 0.11788560450077057, "step": 10 }, { "epoch": 0.04, "grad_norm": 38.452837409916924, "learning_rate": 2.0833333333333333e-07, "logits/chosen": -2.6565070152282715, "logits/rejected": -2.609382390975952, "logps/chosen": -278.57049560546875, "logps/rejected": -293.88580322265625, "loss": 0.675, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 2.982862949371338, "rewards/margins": 0.061783939599990845, "rewards/rejected": 2.92107892036438, "step": 20 }, { "epoch": 0.06, "grad_norm": 33.8019211383058, "learning_rate": 3.1249999999999997e-07, "logits/chosen": -2.682241916656494, "logits/rejected": -2.6154792308807373, "logps/chosen": -288.4586181640625, "logps/rejected": -252.54623413085938, "loss": 0.6263, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 12.213181495666504, "rewards/margins": 0.8092762231826782, "rewards/rejected": 11.403905868530273, "step": 30 }, { "epoch": 0.08, "grad_norm": 31.31645871670535, "learning_rate": 4.1666666666666667e-07, "logits/chosen": -2.6231982707977295, "logits/rejected": -2.59993052482605, "logps/chosen": -250.78829956054688, "logps/rejected": -231.2698974609375, "loss": 0.5948, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 16.337059020996094, "rewards/margins": 3.214967727661133, "rewards/rejected": 13.122090339660645, "step": 40 }, { "epoch": 0.1, "grad_norm": 28.69978279384899, "learning_rate": 4.999733114418725e-07, "logits/chosen": -2.62522554397583, "logits/rejected": -2.5902233123779297, "logps/chosen": -264.43536376953125, "logps/rejected": -280.06646728515625, "loss": 0.5678, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 15.29652214050293, "rewards/margins": 2.578859806060791, "rewards/rejected": 12.717663764953613, "step": 50 }, { "epoch": 0.13, "grad_norm": 31.624733594392392, "learning_rate": 4.990398100856366e-07, "logits/chosen": -2.714506149291992, "logits/rejected": -2.663816213607788, "logps/chosen": -253.0142364501953, "logps/rejected": -284.220703125, "loss": 0.5655, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 18.592296600341797, "rewards/margins": 4.731665134429932, "rewards/rejected": 13.860631942749023, "step": 60 }, { "epoch": 0.15, "grad_norm": 31.114783845109457, "learning_rate": 4.967775735898179e-07, "logits/chosen": -2.659493923187256, "logits/rejected": -2.6703927516937256, "logps/chosen": -245.8984375, "logps/rejected": -243.549560546875, "loss": 0.5399, "rewards/accuracies": 0.75, "rewards/chosen": 19.718013763427734, "rewards/margins": 6.550329685211182, "rewards/rejected": 13.167686462402344, "step": 70 }, { "epoch": 0.17, "grad_norm": 41.80005183977875, "learning_rate": 4.931986719649298e-07, "logits/chosen": -2.816267490386963, "logits/rejected": -2.7780513763427734, "logps/chosen": -303.81060791015625, "logps/rejected": -252.63510131835938, "loss": 0.5406, "rewards/accuracies": 0.71875, "rewards/chosen": 19.00731658935547, "rewards/margins": 7.583803653717041, "rewards/rejected": 11.42351245880127, "step": 80 }, { "epoch": 0.19, "grad_norm": 29.109112028609335, "learning_rate": 4.883222001996351e-07, "logits/chosen": -2.7375311851501465, "logits/rejected": -2.7109663486480713, "logps/chosen": -239.97073364257812, "logps/rejected": -243.8080596923828, "loss": 0.5261, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 19.140594482421875, "rewards/margins": 10.550561904907227, "rewards/rejected": 8.590032577514648, "step": 90 }, { "epoch": 0.21, "grad_norm": 27.121615894160126, "learning_rate": 4.821741763807186e-07, "logits/chosen": -2.7122435569763184, "logits/rejected": -2.710378646850586, "logps/chosen": -242.0121307373047, "logps/rejected": -249.0485382080078, "loss": 0.5187, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 18.346372604370117, "rewards/margins": 10.366473197937012, "rewards/rejected": 7.979898929595947, "step": 100 }, { "epoch": 0.21, "eval_logits/chosen": -2.7638356685638428, "eval_logits/rejected": -2.7384395599365234, "eval_logps/chosen": -243.52903747558594, "eval_logps/rejected": -253.63619995117188, "eval_loss": 0.5296456813812256, "eval_rewards/accuracies": 0.72265625, "eval_rewards/chosen": 19.06442642211914, "eval_rewards/margins": 10.033439636230469, "eval_rewards/rejected": 9.030986785888672, "eval_runtime": 97.082, "eval_samples_per_second": 20.601, "eval_steps_per_second": 0.33, "step": 100 }, { "epoch": 0.23, "grad_norm": 34.19751869568068, "learning_rate": 4.747874028753375e-07, "logits/chosen": -2.7722010612487793, "logits/rejected": -2.725037097930908, "logps/chosen": -285.4516906738281, "logps/rejected": -248.623046875, "loss": 0.5339, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 19.128398895263672, "rewards/margins": 9.91553783416748, "rewards/rejected": 9.212862014770508, "step": 110 }, { "epoch": 0.25, "grad_norm": 31.322721824556634, "learning_rate": 4.662012913161997e-07, "logits/chosen": -2.70879864692688, "logits/rejected": -2.7064578533172607, "logps/chosen": -262.66522216796875, "logps/rejected": -249.9930419921875, "loss": 0.5035, "rewards/accuracies": 0.71875, "rewards/chosen": 18.501953125, "rewards/margins": 8.638445854187012, "rewards/rejected": 9.863507270812988, "step": 120 }, { "epoch": 0.27, "grad_norm": 29.14222659930629, "learning_rate": 4.5646165232345103e-07, "logits/chosen": -2.7430055141448975, "logits/rejected": -2.7424817085266113, "logps/chosen": -258.7245788574219, "logps/rejected": -257.83563232421875, "loss": 0.5095, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 23.278545379638672, "rewards/margins": 12.678305625915527, "rewards/rejected": 10.600237846374512, "step": 130 }, { "epoch": 0.29, "grad_norm": 31.79008837992398, "learning_rate": 4.456204510851956e-07, "logits/chosen": -2.768493175506592, "logits/rejected": -2.759028911590576, "logps/chosen": -295.2568359375, "logps/rejected": -290.87103271484375, "loss": 0.5009, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 20.70852279663086, "rewards/margins": 10.902701377868652, "rewards/rejected": 9.80582332611084, "step": 140 }, { "epoch": 0.31, "grad_norm": 30.12291595525387, "learning_rate": 4.337355301007335e-07, "logits/chosen": -2.757780075073242, "logits/rejected": -2.7286086082458496, "logps/chosen": -264.67327880859375, "logps/rejected": -267.2817077636719, "loss": 0.5095, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 18.550952911376953, "rewards/margins": 9.623664855957031, "rewards/rejected": 8.927289009094238, "step": 150 }, { "epoch": 0.33, "grad_norm": 36.61347768159374, "learning_rate": 4.2087030056579986e-07, "logits/chosen": -2.8047165870666504, "logits/rejected": -2.7731175422668457, "logps/chosen": -251.7733612060547, "logps/rejected": -248.71923828125, "loss": 0.5234, "rewards/accuracies": 0.71875, "rewards/chosen": 21.294713973999023, "rewards/margins": 12.590093612670898, "rewards/rejected": 8.704621315002441, "step": 160 }, { "epoch": 0.36, "grad_norm": 35.61122377710651, "learning_rate": 4.070934040463998e-07, "logits/chosen": -2.7708938121795654, "logits/rejected": -2.7386412620544434, "logps/chosen": -230.99172973632812, "logps/rejected": -221.28964233398438, "loss": 0.5068, "rewards/accuracies": 0.6875, "rewards/chosen": 18.34353256225586, "rewards/margins": 8.508430480957031, "rewards/rejected": 9.835103988647461, "step": 170 }, { "epoch": 0.38, "grad_norm": 38.25232870150566, "learning_rate": 3.9247834624635404e-07, "logits/chosen": -2.73944091796875, "logits/rejected": -2.7309627532958984, "logps/chosen": -237.39566040039062, "logps/rejected": -219.0083770751953, "loss": 0.4872, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 18.96334457397461, "rewards/margins": 10.400626182556152, "rewards/rejected": 8.562715530395508, "step": 180 }, { "epoch": 0.4, "grad_norm": 33.97866980743485, "learning_rate": 3.7710310482256523e-07, "logits/chosen": -2.7516446113586426, "logits/rejected": -2.7251124382019043, "logps/chosen": -251.7504425048828, "logps/rejected": -249.9357147216797, "loss": 0.5079, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 19.492395401000977, "rewards/margins": 9.673317909240723, "rewards/rejected": 9.819077491760254, "step": 190 }, { "epoch": 0.42, "grad_norm": 134.1578718379133, "learning_rate": 3.610497133404795e-07, "logits/chosen": -2.702623128890991, "logits/rejected": -2.6973912715911865, "logps/chosen": -240.9090118408203, "logps/rejected": -245.4873046875, "loss": 0.508, "rewards/accuracies": 0.6875, "rewards/chosen": 18.98764419555664, "rewards/margins": 12.425970077514648, "rewards/rejected": 6.561669826507568, "step": 200 }, { "epoch": 0.42, "eval_logits/chosen": -2.782581090927124, "eval_logits/rejected": -2.756884813308716, "eval_logps/chosen": -241.94308471679688, "eval_logps/rejected": -255.6435089111328, "eval_loss": 0.5005597472190857, "eval_rewards/accuracies": 0.7265625, "eval_rewards/chosen": 20.650381088256836, "eval_rewards/margins": 13.626703262329102, "eval_rewards/rejected": 7.02367639541626, "eval_runtime": 96.3977, "eval_samples_per_second": 20.747, "eval_steps_per_second": 0.332, "step": 200 }, { "epoch": 0.44, "grad_norm": 29.796012700680272, "learning_rate": 3.4440382358952115e-07, "logits/chosen": -2.7070722579956055, "logits/rejected": -2.683690071105957, "logps/chosen": -269.78045654296875, "logps/rejected": -245.2332763671875, "loss": 0.5103, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 19.911705017089844, "rewards/margins": 13.897372245788574, "rewards/rejected": 6.0143327713012695, "step": 210 }, { "epoch": 0.46, "grad_norm": 30.457889800742976, "learning_rate": 3.272542485937368e-07, "logits/chosen": -2.6613078117370605, "logits/rejected": -2.6541290283203125, "logps/chosen": -245.76773071289062, "logps/rejected": -238.1407470703125, "loss": 0.5084, "rewards/accuracies": 0.75, "rewards/chosen": 19.60938835144043, "rewards/margins": 12.086370468139648, "rewards/rejected": 7.523016452789307, "step": 220 }, { "epoch": 0.48, "grad_norm": 32.04472046652176, "learning_rate": 3.096924887558854e-07, "logits/chosen": -2.682553768157959, "logits/rejected": -2.6649653911590576, "logps/chosen": -228.4560089111328, "logps/rejected": -240.5241241455078, "loss": 0.5079, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 20.409427642822266, "rewards/margins": 13.195713996887207, "rewards/rejected": 7.2137131690979, "step": 230 }, { "epoch": 0.5, "grad_norm": 33.84679720475086, "learning_rate": 2.9181224366319943e-07, "logits/chosen": -2.7205722332000732, "logits/rejected": -2.701112985610962, "logps/chosen": -243.3650665283203, "logps/rejected": -238.9823760986328, "loss": 0.4938, "rewards/accuracies": 0.6875, "rewards/chosen": 20.33513069152832, "rewards/margins": 10.249124526977539, "rewards/rejected": 10.086007118225098, "step": 240 }, { "epoch": 0.52, "grad_norm": 31.858527336933598, "learning_rate": 2.7370891215954565e-07, "logits/chosen": -2.6577446460723877, "logits/rejected": -2.6170592308044434, "logps/chosen": -275.0615234375, "logps/rejected": -250.7952880859375, "loss": 0.486, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 23.128969192504883, "rewards/margins": 14.401697158813477, "rewards/rejected": 8.727272987365723, "step": 250 }, { "epoch": 0.54, "grad_norm": 33.909893139050666, "learning_rate": 2.55479083351317e-07, "logits/chosen": -2.7196850776672363, "logits/rejected": -2.708618640899658, "logps/chosen": -272.7483825683594, "logps/rejected": -246.864013671875, "loss": 0.4929, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 21.56315803527832, "rewards/margins": 14.446769714355469, "rewards/rejected": 7.116389274597168, "step": 260 }, { "epoch": 0.56, "grad_norm": 32.106664429608195, "learning_rate": 2.3722002126275822e-07, "logits/chosen": -2.7111871242523193, "logits/rejected": -2.69014310836792, "logps/chosen": -257.98822021484375, "logps/rejected": -246.855712890625, "loss": 0.5003, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 19.00775718688965, "rewards/margins": 8.219191551208496, "rewards/rejected": 10.788566589355469, "step": 270 }, { "epoch": 0.59, "grad_norm": 36.46282639590835, "learning_rate": 2.19029145890313e-07, "logits/chosen": -2.6879024505615234, "logits/rejected": -2.6575160026550293, "logps/chosen": -241.15469360351562, "logps/rejected": -236.71749877929688, "loss": 0.5092, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 20.52240753173828, "rewards/margins": 15.053570747375488, "rewards/rejected": 5.468836784362793, "step": 280 }, { "epoch": 0.61, "grad_norm": 32.938385675504726, "learning_rate": 2.0100351342479216e-07, "logits/chosen": -2.730700969696045, "logits/rejected": -2.7169148921966553, "logps/chosen": -231.1471710205078, "logps/rejected": -233.15213012695312, "loss": 0.4989, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 19.9401798248291, "rewards/margins": 11.136396408081055, "rewards/rejected": 8.803783416748047, "step": 290 }, { "epoch": 0.63, "grad_norm": 40.46462898598811, "learning_rate": 1.8323929841460178e-07, "logits/chosen": -2.699219226837158, "logits/rejected": -2.6627821922302246, "logps/chosen": -282.5068054199219, "logps/rejected": -253.93002319335938, "loss": 0.4808, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 18.48929214477539, "rewards/margins": 12.313825607299805, "rewards/rejected": 6.175467491149902, "step": 300 }, { "epoch": 0.63, "eval_logits/chosen": -2.737816095352173, "eval_logits/rejected": -2.7115261554718018, "eval_logps/chosen": -241.775146484375, "eval_logps/rejected": -255.71316528320312, "eval_loss": 0.4966064989566803, "eval_rewards/accuracies": 0.72265625, "eval_rewards/chosen": 20.818317413330078, "eval_rewards/margins": 13.864299774169922, "eval_rewards/rejected": 6.954016208648682, "eval_runtime": 96.5223, "eval_samples_per_second": 20.721, "eval_steps_per_second": 0.332, "step": 300 }, { "epoch": 0.65, "grad_norm": 34.221593161714, "learning_rate": 1.6583128063291573e-07, "logits/chosen": -2.650965452194214, "logits/rejected": -2.6548705101013184, "logps/chosen": -275.9816589355469, "logps/rejected": -252.0779571533203, "loss": 0.4758, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 19.354223251342773, "rewards/margins": 12.915657043457031, "rewards/rejected": 6.438567161560059, "step": 310 }, { "epoch": 0.67, "grad_norm": 33.001844259909745, "learning_rate": 1.488723393865766e-07, "logits/chosen": -2.6921844482421875, "logits/rejected": -2.691749095916748, "logps/chosen": -272.9283142089844, "logps/rejected": -237.22213745117188, "loss": 0.4701, "rewards/accuracies": 0.75, "rewards/chosen": 20.741744995117188, "rewards/margins": 13.140623092651367, "rewards/rejected": 7.6011223793029785, "step": 320 }, { "epoch": 0.69, "grad_norm": 29.49781014497435, "learning_rate": 1.3245295796480788e-07, "logits/chosen": -2.7120227813720703, "logits/rejected": -2.677337169647217, "logps/chosen": -243.09988403320312, "logps/rejected": -251.96121215820312, "loss": 0.4909, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 19.110301971435547, "rewards/margins": 11.57287883758545, "rewards/rejected": 7.537426948547363, "step": 330 }, { "epoch": 0.71, "grad_norm": 41.44497406324907, "learning_rate": 1.1666074087171627e-07, "logits/chosen": -2.723417043685913, "logits/rejected": -2.691898822784424, "logps/chosen": -269.8048095703125, "logps/rejected": -273.60491943359375, "loss": 0.4884, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 20.189619064331055, "rewards/margins": 13.752766609191895, "rewards/rejected": 6.43685245513916, "step": 340 }, { "epoch": 0.73, "grad_norm": 34.6350412976571, "learning_rate": 1.0157994641835734e-07, "logits/chosen": -2.700378179550171, "logits/rejected": -2.6705689430236816, "logps/chosen": -238.4877166748047, "logps/rejected": -235.2138214111328, "loss": 0.4772, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 19.723506927490234, "rewards/margins": 14.228517532348633, "rewards/rejected": 5.494990825653076, "step": 350 }, { "epoch": 0.75, "grad_norm": 42.44685611987456, "learning_rate": 8.729103716819111e-08, "logits/chosen": -2.732093334197998, "logits/rejected": -2.685842990875244, "logps/chosen": -282.49151611328125, "logps/rejected": -255.60482788085938, "loss": 0.5041, "rewards/accuracies": 0.75, "rewards/chosen": 20.200531005859375, "rewards/margins": 14.500404357910156, "rewards/rejected": 5.700125694274902, "step": 360 }, { "epoch": 0.77, "grad_norm": 33.10027222498208, "learning_rate": 7.387025063449081e-08, "logits/chosen": -2.69976544380188, "logits/rejected": -2.6724953651428223, "logps/chosen": -256.20074462890625, "logps/rejected": -229.0190887451172, "loss": 0.5081, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 19.13229751586914, "rewards/margins": 9.917332649230957, "rewards/rejected": 9.2149658203125, "step": 370 }, { "epoch": 0.79, "grad_norm": 30.0028500984578, "learning_rate": 6.138919252022435e-08, "logits/chosen": -2.6289491653442383, "logits/rejected": -2.6343891620635986, "logps/chosen": -221.06399536132812, "logps/rejected": -254.60464477539062, "loss": 0.4847, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 18.420679092407227, "rewards/margins": 14.3089017868042, "rewards/rejected": 4.111776828765869, "step": 380 }, { "epoch": 0.82, "grad_norm": 41.20295613426392, "learning_rate": 4.991445467064689e-08, "logits/chosen": -2.673283338546753, "logits/rejected": -2.668842315673828, "logps/chosen": -285.0795593261719, "logps/rejected": -274.74163818359375, "loss": 0.4871, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 20.55733871459961, "rewards/margins": 11.32852840423584, "rewards/rejected": 9.228808403015137, "step": 390 }, { "epoch": 0.84, "grad_norm": 32.56833882197615, "learning_rate": 3.9507259776993954e-08, "logits/chosen": -2.676893949508667, "logits/rejected": -2.6494295597076416, "logps/chosen": -250.10324096679688, "logps/rejected": -261.3594055175781, "loss": 0.4835, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 20.431964874267578, "rewards/margins": 14.77888011932373, "rewards/rejected": 5.653082370758057, "step": 400 }, { "epoch": 0.84, "eval_logits/chosen": -2.731470823287964, "eval_logits/rejected": -2.703744649887085, "eval_logps/chosen": -241.37046813964844, "eval_logps/rejected": -256.2980041503906, "eval_loss": 0.4916878044605255, "eval_rewards/accuracies": 0.734375, "eval_rewards/chosen": 21.223026275634766, "eval_rewards/margins": 14.853860855102539, "eval_rewards/rejected": 6.369164943695068, "eval_runtime": 96.4786, "eval_samples_per_second": 20.73, "eval_steps_per_second": 0.332, "step": 400 }, { "epoch": 0.86, "grad_norm": 33.3145791190938, "learning_rate": 3.022313472693447e-08, "logits/chosen": -2.7175421714782715, "logits/rejected": -2.6865715980529785, "logps/chosen": -276.17120361328125, "logps/rejected": -262.1912841796875, "loss": 0.4963, "rewards/accuracies": 0.78125, "rewards/chosen": 22.746294021606445, "rewards/margins": 16.12038803100586, "rewards/rejected": 6.625903129577637, "step": 410 }, { "epoch": 0.88, "grad_norm": 38.26789970081439, "learning_rate": 2.2111614344599684e-08, "logits/chosen": -2.6725165843963623, "logits/rejected": -2.6631102561950684, "logps/chosen": -278.31524658203125, "logps/rejected": -268.63787841796875, "loss": 0.497, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 18.758575439453125, "rewards/margins": 12.9342041015625, "rewards/rejected": 5.824368953704834, "step": 420 }, { "epoch": 0.9, "grad_norm": 29.576447372480786, "learning_rate": 1.521597710086439e-08, "logits/chosen": -2.622992515563965, "logits/rejected": -2.5860610008239746, "logps/chosen": -260.331787109375, "logps/rejected": -252.45510864257812, "loss": 0.4834, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 18.70633316040039, "rewards/margins": 13.38142204284668, "rewards/rejected": 5.3249101638793945, "step": 430 }, { "epoch": 0.92, "grad_norm": 31.865322800676886, "learning_rate": 9.57301420397924e-09, "logits/chosen": -2.696772813796997, "logits/rejected": -2.6611225605010986, "logps/chosen": -262.6088562011719, "logps/rejected": -258.1268005371094, "loss": 0.485, "rewards/accuracies": 0.6875, "rewards/chosen": 21.72829246520996, "rewards/margins": 10.743375778198242, "rewards/rejected": 10.984918594360352, "step": 440 }, { "epoch": 0.94, "grad_norm": 29.684061550729, "learning_rate": 5.212833302556258e-09, "logits/chosen": -2.6331818103790283, "logits/rejected": -2.646833658218384, "logps/chosen": -273.6771240234375, "logps/rejected": -299.3607482910156, "loss": 0.4824, "rewards/accuracies": 0.6875, "rewards/chosen": 20.00701904296875, "rewards/margins": 12.293313980102539, "rewards/rejected": 7.7137041091918945, "step": 450 }, { "epoch": 0.96, "grad_norm": 35.062338944611746, "learning_rate": 2.158697848236607e-09, "logits/chosen": -2.686453342437744, "logits/rejected": -2.6655733585357666, "logps/chosen": -252.51998901367188, "logps/rejected": -234.3843231201172, "loss": 0.4908, "rewards/accuracies": 0.6875, "rewards/chosen": 19.44460105895996, "rewards/margins": 11.141111373901367, "rewards/rejected": 8.303489685058594, "step": 460 }, { "epoch": 0.98, "grad_norm": 32.367340987762965, "learning_rate": 4.269029751107489e-10, "logits/chosen": -2.6681036949157715, "logits/rejected": -2.640652656555176, "logps/chosen": -258.857177734375, "logps/rejected": -274.1594543457031, "loss": 0.4653, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 19.84146499633789, "rewards/margins": 11.649839401245117, "rewards/rejected": 8.191625595092773, "step": 470 }, { "epoch": 1.0, "step": 478, "total_flos": 0.0, "train_loss": 0.5147256711536871, "train_runtime": 7551.132, "train_samples_per_second": 8.096, "train_steps_per_second": 0.063 } ], "logging_steps": 10, "max_steps": 478, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }