{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 1724, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 0.43359375, "learning_rate": 2.890173410404624e-09, "logits/chosen": 0.1325806975364685, "logits/rejected": 0.3077998757362366, "logps/chosen": -239.35935974121094, "logps/rejected": -304.581298828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/margins_max": 0.0, "rewards/margins_min": 0.0, "rewards/margins_std": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.01, "grad_norm": 0.416015625, "learning_rate": 2.890173410404624e-08, "logits/chosen": -0.010774746537208557, "logits/rejected": 0.23452165722846985, "logps/chosen": -243.3074493408203, "logps/rejected": -304.1199035644531, "loss": 0.6932, "rewards/accuracies": 0.5, "rewards/chosen": 0.00028879166347905993, "rewards/margins": 0.0006378353573381901, "rewards/margins_max": 0.0028404404874891043, "rewards/margins_min": -0.0015647696563974023, "rewards/margins_std": 0.0031149541027843952, "rewards/rejected": -0.00034904375206679106, "step": 10 }, { "epoch": 0.01, "grad_norm": 0.400390625, "learning_rate": 5.780346820809248e-08, "logits/chosen": -0.05719061568379402, "logits/rejected": 0.5148837566375732, "logps/chosen": -272.7169494628906, "logps/rejected": -216.58859252929688, "loss": 0.6931, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.0008704366046003997, "rewards/margins": 0.0001740378502290696, "rewards/margins_max": 0.0022189407609403133, "rewards/margins_min": -0.0018708650022745132, "rewards/margins_std": 0.002891929354518652, "rewards/rejected": -0.0010444745421409607, "step": 20 }, { "epoch": 0.02, "grad_norm": 0.4921875, "learning_rate": 8.670520231213872e-08, "logits/chosen": 0.05507341027259827, "logits/rejected": 0.5646872520446777, "logps/chosen": -272.96728515625, "logps/rejected": -252.10733032226562, "loss": 0.6932, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.0014279346214607358, "rewards/margins": -0.001033178297802806, "rewards/margins_max": 0.002007028553634882, "rewards/margins_min": -0.004073385149240494, "rewards/margins_std": 0.00429950188845396, "rewards/rejected": -0.00039475635276176035, "step": 30 }, { "epoch": 0.02, "grad_norm": 0.447265625, "learning_rate": 1.1560693641618496e-07, "logits/chosen": -0.08530770242214203, "logits/rejected": 0.37523841857910156, "logps/chosen": -256.03692626953125, "logps/rejected": -224.8648223876953, "loss": 0.6932, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.0013576907804235816, "rewards/margins": -0.0014004515251144767, "rewards/margins_max": 0.0015217246254906058, "rewards/margins_min": -0.004322628024965525, "rewards/margins_std": 0.0041325814090669155, "rewards/rejected": 4.276079198461957e-05, "step": 40 }, { "epoch": 0.03, "grad_norm": 0.45703125, "learning_rate": 1.445086705202312e-07, "logits/chosen": 0.10976707935333252, "logits/rejected": 0.40187758207321167, "logps/chosen": -205.61318969726562, "logps/rejected": -214.9802703857422, "loss": 0.693, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.0007841205224394798, "rewards/margins": 0.0018329259473830462, "rewards/margins_max": 0.004336017183959484, "rewards/margins_min": -0.0006701658712700009, "rewards/margins_std": 0.0035399063490331173, "rewards/rejected": -0.0010488051921129227, "step": 50 }, { "epoch": 0.03, "grad_norm": 0.39453125, "learning_rate": 1.7341040462427744e-07, "logits/chosen": 0.2901094853878021, "logits/rejected": 0.4794164299964905, "logps/chosen": -207.44509887695312, "logps/rejected": -231.39382934570312, "loss": 0.693, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -0.001270442851819098, "rewards/margins": -0.0007280521094799042, "rewards/margins_max": 0.0019893264397978783, "rewards/margins_min": -0.0034454308915883303, "rewards/margins_std": 0.0038429535925388336, "rewards/rejected": -0.0005423908005468547, "step": 60 }, { "epoch": 0.04, "grad_norm": 0.435546875, "learning_rate": 2.023121387283237e-07, "logits/chosen": 0.035371266305446625, "logits/rejected": 0.4755796492099762, "logps/chosen": -259.833740234375, "logps/rejected": -226.2167205810547, "loss": 0.6929, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0010710505302995443, "rewards/margins": 0.0011786860413849354, "rewards/margins_max": 0.004792899824678898, "rewards/margins_min": -0.002435527741909027, "rewards/margins_std": 0.005111270118504763, "rewards/rejected": -0.0022497368045151234, "step": 70 }, { "epoch": 0.05, "grad_norm": 0.4609375, "learning_rate": 2.3121387283236991e-07, "logits/chosen": 0.27303510904312134, "logits/rejected": 0.7382463216781616, "logps/chosen": -217.78671264648438, "logps/rejected": -208.35910034179688, "loss": 0.6928, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.2639263988821767e-05, "rewards/margins": 0.0014770211419090629, "rewards/margins_max": 0.0042491876520216465, "rewards/margins_min": -0.0012951450189575553, "rewards/margins_std": 0.003920434974133968, "rewards/rejected": -0.0014996604295447469, "step": 80 }, { "epoch": 0.05, "grad_norm": 0.6640625, "learning_rate": 2.601156069364162e-07, "logits/chosen": -0.20650863647460938, "logits/rejected": 0.17405006289482117, "logps/chosen": -226.12808227539062, "logps/rejected": -233.56381225585938, "loss": 0.692, "rewards/accuracies": 0.625, "rewards/chosen": 0.000633719377219677, "rewards/margins": 0.0017947215819731355, "rewards/margins_max": 0.004501459188759327, "rewards/margins_min": -0.0009120159666053951, "rewards/margins_std": 0.0038279048167169094, "rewards/rejected": -0.0011610020883381367, "step": 90 }, { "epoch": 0.06, "grad_norm": 0.431640625, "learning_rate": 2.890173410404624e-07, "logits/chosen": -0.019260473549365997, "logits/rejected": 0.5504380464553833, "logps/chosen": -292.51995849609375, "logps/rejected": -235.86843872070312, "loss": 0.6919, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.001650218851864338, "rewards/margins": 0.002649242291226983, "rewards/margins_max": 0.005218566861003637, "rewards/margins_min": 7.99179106252268e-05, "rewards/margins_std": 0.0036335731856524944, "rewards/rejected": -0.0009990233229473233, "step": 100 }, { "epoch": 0.06, "grad_norm": 0.53125, "learning_rate": 3.1791907514450865e-07, "logits/chosen": -0.06840448081493378, "logits/rejected": 0.6899427175521851, "logps/chosen": -252.0308380126953, "logps/rejected": -199.84799194335938, "loss": 0.6918, "rewards/accuracies": 0.75, "rewards/chosen": 0.0018273231107741594, "rewards/margins": 0.00415054801851511, "rewards/margins_max": 0.0076604606583714485, "rewards/margins_min": 0.0006406344473361969, "rewards/margins_std": 0.004963767249137163, "rewards/rejected": -0.0023232249077409506, "step": 110 }, { "epoch": 0.07, "grad_norm": 0.36328125, "learning_rate": 3.468208092485549e-07, "logits/chosen": 0.09203040599822998, "logits/rejected": 0.5125548243522644, "logps/chosen": -256.213623046875, "logps/rejected": -232.49942016601562, "loss": 0.6915, "rewards/accuracies": 0.75, "rewards/chosen": 0.0007183876005001366, "rewards/margins": 0.004233072511851788, "rewards/margins_max": 0.007029411382973194, "rewards/margins_min": 0.0014367332914844155, "rewards/margins_std": 0.003954620566219091, "rewards/rejected": -0.0035146852023899555, "step": 120 }, { "epoch": 0.08, "grad_norm": 0.462890625, "learning_rate": 3.757225433526011e-07, "logits/chosen": -0.027632858604192734, "logits/rejected": 0.39557844400405884, "logps/chosen": -266.2771911621094, "logps/rejected": -271.76116943359375, "loss": 0.6907, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.002352924318984151, "rewards/margins": 0.005208231043070555, "rewards/margins_max": 0.008825947530567646, "rewards/margins_min": 0.001590514904819429, "rewards/margins_std": 0.005116222891956568, "rewards/rejected": -0.00285530649125576, "step": 130 }, { "epoch": 0.08, "grad_norm": 0.40625, "learning_rate": 4.046242774566474e-07, "logits/chosen": 0.06764040887355804, "logits/rejected": 0.3966519236564636, "logps/chosen": -178.83749389648438, "logps/rejected": -188.39877319335938, "loss": 0.6908, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.0029165446758270264, "rewards/margins": 0.006306161172688007, "rewards/margins_max": 0.009462257847189903, "rewards/margins_min": 0.0031500644981861115, "rewards/margins_std": 0.004463394172489643, "rewards/rejected": -0.0033896160311996937, "step": 140 }, { "epoch": 0.09, "grad_norm": 0.447265625, "learning_rate": 4.3352601156069365e-07, "logits/chosen": 0.011811649426817894, "logits/rejected": 0.4984157979488373, "logps/chosen": -268.1231994628906, "logps/rejected": -223.78799438476562, "loss": 0.6899, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.002369340742006898, "rewards/margins": 0.006674068979918957, "rewards/margins_max": 0.013764929957687855, "rewards/margins_min": -0.0004167918232269585, "rewards/margins_std": 0.010027991607785225, "rewards/rejected": -0.0043047284707427025, "step": 150 }, { "epoch": 0.09, "grad_norm": 0.322265625, "learning_rate": 4.6242774566473983e-07, "logits/chosen": -0.03828499838709831, "logits/rejected": 0.3794795870780945, "logps/chosen": -245.52865600585938, "logps/rejected": -234.1727752685547, "loss": 0.689, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.004552280530333519, "rewards/margins": 0.008487861603498459, "rewards/margins_max": 0.012918056920170784, "rewards/margins_min": 0.004057666752487421, "rewards/margins_std": 0.006265241652727127, "rewards/rejected": -0.003935581538826227, "step": 160 }, { "epoch": 0.1, "grad_norm": 0.49609375, "learning_rate": 4.913294797687861e-07, "logits/chosen": -0.0168992280960083, "logits/rejected": 0.500325620174408, "logps/chosen": -296.49517822265625, "logps/rejected": -248.3328094482422, "loss": 0.6887, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.003083079354837537, "rewards/margins": 0.006065175868570805, "rewards/margins_max": 0.011483820155262947, "rewards/margins_min": 0.0006465300684794784, "rewards/margins_std": 0.0076631223782896996, "rewards/rejected": -0.002982096979394555, "step": 170 }, { "epoch": 0.1, "grad_norm": 0.40625, "learning_rate": 4.999748710138438e-07, "logits/chosen": 0.14815935492515564, "logits/rejected": 0.5510139465332031, "logps/chosen": -233.9811553955078, "logps/rejected": -228.5449676513672, "loss": 0.688, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.003167560789734125, "rewards/margins": 0.007796141318976879, "rewards/margins_max": 0.012642833404242992, "rewards/margins_min": 0.002949449699372053, "rewards/margins_std": 0.006854257546365261, "rewards/rejected": -0.004628580994904041, "step": 180 }, { "epoch": 0.11, "grad_norm": 0.416015625, "learning_rate": 4.998518024263461e-07, "logits/chosen": 0.19040322303771973, "logits/rejected": 0.6236617565155029, "logps/chosen": -230.96762084960938, "logps/rejected": -211.4745330810547, "loss": 0.6871, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.006373309530317783, "rewards/margins": 0.012960617430508137, "rewards/margins_max": 0.01996336504817009, "rewards/margins_min": 0.0059578740037977695, "rewards/margins_std": 0.0099033759906888, "rewards/rejected": -0.006587309297174215, "step": 190 }, { "epoch": 0.12, "grad_norm": 0.416015625, "learning_rate": 4.996262291366814e-07, "logits/chosen": 0.054732900112867355, "logits/rejected": 0.22424785792827606, "logps/chosen": -210.0012664794922, "logps/rejected": -233.76388549804688, "loss": 0.6873, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.004412280861288309, "rewards/margins": 0.011961949989199638, "rewards/margins_max": 0.017657486721873283, "rewards/margins_min": 0.006266415119171143, "rewards/margins_std": 0.0080547034740448, "rewards/rejected": -0.007549669593572617, "step": 200 }, { "epoch": 0.12, "grad_norm": 0.498046875, "learning_rate": 4.992982436890003e-07, "logits/chosen": 0.09016792476177216, "logits/rejected": 0.45956069231033325, "logps/chosen": -226.3985595703125, "logps/rejected": -221.092529296875, "loss": 0.6868, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.005489318631589413, "rewards/margins": 0.013238553889095783, "rewards/margins_max": 0.018587926402688026, "rewards/margins_min": 0.00788918323814869, "rewards/margins_std": 0.007565152831375599, "rewards/rejected": -0.007749234326183796, "step": 210 }, { "epoch": 0.13, "grad_norm": 0.458984375, "learning_rate": 4.988679806432711e-07, "logits/chosen": -0.08951343595981598, "logits/rejected": 0.46994414925575256, "logps/chosen": -264.4379577636719, "logps/rejected": -236.77346801757812, "loss": 0.6853, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.007678179536014795, "rewards/margins": 0.01784335821866989, "rewards/margins_max": 0.025632936507463455, "rewards/margins_min": 0.010053779929876328, "rewards/margins_std": 0.011016124859452248, "rewards/rejected": -0.010165175423026085, "step": 220 }, { "epoch": 0.13, "grad_norm": 0.474609375, "learning_rate": 4.983356165200751e-07, "logits/chosen": 0.07358375936746597, "logits/rejected": 0.617803692817688, "logps/chosen": -276.56536865234375, "logps/rejected": -237.3117218017578, "loss": 0.6848, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.0074386284686625, "rewards/margins": 0.01824963092803955, "rewards/margins_max": 0.026552444323897362, "rewards/margins_min": 0.00994681753218174, "rewards/margins_std": 0.01174195110797882, "rewards/rejected": -0.010811002925038338, "step": 230 }, { "epoch": 0.14, "grad_norm": 0.4296875, "learning_rate": 4.977013697281864e-07, "logits/chosen": 0.23069170117378235, "logits/rejected": 0.546830952167511, "logps/chosen": -229.92764282226562, "logps/rejected": -231.63357543945312, "loss": 0.6848, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.005361995659768581, "rewards/margins": 0.015256190672516823, "rewards/margins_max": 0.022752556949853897, "rewards/margins_min": 0.007759819272905588, "rewards/margins_std": 0.010601467452943325, "rewards/rejected": -0.009894194081425667, "step": 240 }, { "epoch": 0.15, "grad_norm": 0.412109375, "learning_rate": 4.969655004749673e-07, "logits/chosen": 0.05646086856722832, "logits/rejected": 0.3687281012535095, "logps/chosen": -203.8467559814453, "logps/rejected": -216.0234375, "loss": 0.6846, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.002810864243656397, "rewards/margins": 0.014029537327587605, "rewards/margins_max": 0.019475888460874557, "rewards/margins_min": 0.008583188988268375, "rewards/margins_std": 0.007702300790697336, "rewards/rejected": -0.011218673549592495, "step": 250 }, { "epoch": 0.15, "grad_norm": 0.490234375, "learning_rate": 4.961283106596155e-07, "logits/chosen": 0.1512751430273056, "logits/rejected": 0.5323320627212524, "logps/chosen": -256.96673583984375, "logps/rejected": -265.65509033203125, "loss": 0.6829, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.011281570419669151, "rewards/margins": 0.0202823244035244, "rewards/margins_max": 0.02979358099400997, "rewards/margins_min": 0.010771063156425953, "rewards/margins_std": 0.013450953178107738, "rewards/rejected": -0.009000752121210098, "step": 260 }, { "epoch": 0.16, "grad_norm": 0.447265625, "learning_rate": 4.951901437493054e-07, "logits/chosen": 0.08749596029520035, "logits/rejected": 0.47565847635269165, "logps/chosen": -252.97323608398438, "logps/rejected": -220.1329803466797, "loss": 0.6826, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.005718126427382231, "rewards/margins": 0.019988398998975754, "rewards/margins_max": 0.025959456339478493, "rewards/margins_min": 0.014017338864505291, "rewards/margins_std": 0.008444352075457573, "rewards/rejected": -0.014270270243287086, "step": 270 }, { "epoch": 0.16, "grad_norm": 0.453125, "learning_rate": 4.941513846382779e-07, "logits/chosen": 0.31170374155044556, "logits/rejected": 0.6478020548820496, "logps/chosen": -207.89794921875, "logps/rejected": -225.51791381835938, "loss": 0.6828, "rewards/accuracies": 1.0, "rewards/chosen": 0.010051739402115345, "rewards/margins": 0.019436318427324295, "rewards/margins_max": 0.025176430121064186, "rewards/margins_min": 0.013696206733584404, "rewards/margins_std": 0.008117742836475372, "rewards/rejected": -0.009384581819176674, "step": 280 }, { "epoch": 0.17, "grad_norm": 0.431640625, "learning_rate": 4.930124594899313e-07, "logits/chosen": 0.14136287569999695, "logits/rejected": 0.5530031323432922, "logps/chosen": -244.9897918701172, "logps/rejected": -244.90457153320312, "loss": 0.6814, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.0166664756834507, "rewards/margins": 0.02829556167125702, "rewards/margins_max": 0.037106942385435104, "rewards/margins_min": 0.019484177231788635, "rewards/margins_std": 0.012461178004741669, "rewards/rejected": -0.011629085056483746, "step": 290 }, { "epoch": 0.17, "grad_norm": 0.494140625, "learning_rate": 4.917738355619842e-07, "logits/chosen": 0.2040259838104248, "logits/rejected": 0.6138412356376648, "logps/chosen": -193.21507263183594, "logps/rejected": -194.8699188232422, "loss": 0.6796, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.012191513553261757, "rewards/margins": 0.026244569569826126, "rewards/margins_max": 0.036748819053173065, "rewards/margins_min": 0.015740320086479187, "rewards/margins_std": 0.014855247922241688, "rewards/rejected": -0.01405305415391922, "step": 300 }, { "epoch": 0.18, "grad_norm": 0.453125, "learning_rate": 4.904360210147762e-07, "logits/chosen": 0.1507195234298706, "logits/rejected": 0.5720406174659729, "logps/chosen": -242.0141143798828, "logps/rejected": -216.76132202148438, "loss": 0.6791, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.010296806693077087, "rewards/margins": 0.02473880909383297, "rewards/margins_max": 0.036660365760326385, "rewards/margins_min": 0.012817250564694405, "rewards/margins_std": 0.0168596301227808, "rewards/rejected": -0.014442001469433308, "step": 310 }, { "epoch": 0.19, "grad_norm": 0.41796875, "learning_rate": 4.8899956470279e-07, "logits/chosen": -0.03488525375723839, "logits/rejected": 0.40159520506858826, "logps/chosen": -218.23812866210938, "logps/rejected": -190.8876953125, "loss": 0.679, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.014135973528027534, "rewards/margins": 0.02363484725356102, "rewards/margins_max": 0.036806877702474594, "rewards/margins_min": 0.010462815873324871, "rewards/margins_std": 0.018628064543008804, "rewards/rejected": -0.00949887465685606, "step": 320 }, { "epoch": 0.19, "grad_norm": 0.4375, "learning_rate": 4.874650559494765e-07, "logits/chosen": 0.10674601793289185, "logits/rejected": 0.5667238831520081, "logps/chosen": -242.5848388671875, "logps/rejected": -212.60922241210938, "loss": 0.6782, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.008991287089884281, "rewards/margins": 0.02689727023243904, "rewards/margins_max": 0.03854988515377045, "rewards/margins_min": 0.015244655311107635, "rewards/margins_std": 0.016479285433888435, "rewards/rejected": -0.017905984073877335, "step": 330 }, { "epoch": 0.2, "grad_norm": 0.357421875, "learning_rate": 4.858331243054782e-07, "logits/chosen": 0.09378918260335922, "logits/rejected": 0.42793530225753784, "logps/chosen": -282.80413818359375, "logps/rejected": -245.1541748046875, "loss": 0.6796, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.004886592272669077, "rewards/margins": 0.021504424512386322, "rewards/margins_max": 0.03542860597372055, "rewards/margins_min": 0.007580241654068232, "rewards/margins_std": 0.019691769033670425, "rewards/rejected": -0.016617832705378532, "step": 340 }, { "epoch": 0.2, "grad_norm": 0.486328125, "learning_rate": 4.841044392903481e-07, "logits/chosen": 0.1290682703256607, "logits/rejected": 0.6047347784042358, "logps/chosen": -232.40908813476562, "logps/rejected": -181.57228088378906, "loss": 0.6783, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.008800150826573372, "rewards/margins": 0.028118547052145004, "rewards/margins_max": 0.04057111591100693, "rewards/margins_min": 0.015665989369153976, "rewards/margins_std": 0.0176105834543705, "rewards/rejected": -0.01931839995086193, "step": 350 }, { "epoch": 0.21, "grad_norm": 0.435546875, "learning_rate": 4.822797101178718e-07, "logits/chosen": -0.10504484176635742, "logits/rejected": 0.437595933675766, "logps/chosen": -256.3827209472656, "logps/rejected": -231.28836059570312, "loss": 0.6777, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.014989467337727547, "rewards/margins": 0.03444572165608406, "rewards/margins_max": 0.04873298108577728, "rewards/margins_min": 0.02015846036374569, "rewards/margins_std": 0.020205235108733177, "rewards/rejected": -0.019456254318356514, "step": 360 }, { "epoch": 0.21, "grad_norm": 0.390625, "learning_rate": 4.803596854051038e-07, "logits/chosen": -0.0018104672199115157, "logits/rejected": 0.5270112752914429, "logps/chosen": -251.33740234375, "logps/rejected": -203.73886108398438, "loss": 0.6749, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.010898159816861153, "rewards/margins": 0.02897489070892334, "rewards/margins_max": 0.041702691465616226, "rewards/margins_min": 0.016247089952230453, "rewards/margins_std": 0.01799982599914074, "rewards/rejected": -0.018076732754707336, "step": 370 }, { "epoch": 0.22, "grad_norm": 0.3671875, "learning_rate": 4.783451528652382e-07, "logits/chosen": 0.03281222656369209, "logits/rejected": 0.3939230740070343, "logps/chosen": -203.0167694091797, "logps/rejected": -197.302490234375, "loss": 0.6775, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.01019463874399662, "rewards/margins": 0.030594149604439735, "rewards/margins_max": 0.041967082768678665, "rewards/margins_min": 0.019221220165491104, "rewards/margins_std": 0.01608375459909439, "rewards/rejected": -0.020399510860443115, "step": 380 }, { "epoch": 0.23, "grad_norm": 0.4140625, "learning_rate": 4.7623693898443963e-07, "logits/chosen": 0.06993720680475235, "logits/rejected": 0.44206172227859497, "logps/chosen": -185.37237548828125, "logps/rejected": -187.4385986328125, "loss": 0.6751, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.009011445567011833, "rewards/margins": 0.03231946378946304, "rewards/margins_max": 0.04668620228767395, "rewards/margins_min": 0.017952727153897285, "rewards/margins_std": 0.02031763456761837, "rewards/rejected": -0.02330802008509636, "step": 390 }, { "epoch": 0.23, "grad_norm": 0.44140625, "learning_rate": 4.740359086827685e-07, "logits/chosen": -0.0161175187677145, "logits/rejected": 0.4163980484008789, "logps/chosen": -239.71432495117188, "logps/rejected": -241.2501678466797, "loss": 0.6737, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.018473349511623383, "rewards/margins": 0.04534245282411575, "rewards/margins_max": 0.06162145733833313, "rewards/margins_min": 0.02906343713402748, "rewards/margins_std": 0.0230219978839159, "rewards/rejected": -0.026869099587202072, "step": 400 }, { "epoch": 0.24, "grad_norm": 0.359375, "learning_rate": 4.7174296495933593e-07, "logits/chosen": -0.04076371714472771, "logits/rejected": 0.20715077221393585, "logps/chosen": -188.3863525390625, "logps/rejected": -203.01266479492188, "loss": 0.6749, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.011351143009960651, "rewards/margins": 0.03776105120778084, "rewards/margins_max": 0.05341630056500435, "rewards/margins_min": 0.022105801850557327, "rewards/margins_std": 0.022139865905046463, "rewards/rejected": -0.026409905403852463, "step": 410 }, { "epoch": 0.24, "grad_norm": 0.478515625, "learning_rate": 4.6935904852183805e-07, "logits/chosen": 0.29291218519210815, "logits/rejected": 0.5505505800247192, "logps/chosen": -203.9456024169922, "logps/rejected": -217.8910369873047, "loss": 0.6712, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.012085825204849243, "rewards/margins": 0.038635291159152985, "rewards/margins_max": 0.059398896992206573, "rewards/margins_min": 0.017871689051389694, "rewards/margins_std": 0.029364168643951416, "rewards/rejected": -0.02654946781694889, "step": 420 }, { "epoch": 0.25, "grad_norm": 0.431640625, "learning_rate": 4.6688513740061965e-07, "logits/chosen": 0.12483358383178711, "logits/rejected": 0.46587473154067993, "logps/chosen": -264.0867004394531, "logps/rejected": -292.27685546875, "loss": 0.6731, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.019537176936864853, "rewards/margins": 0.040542975068092346, "rewards/margins_max": 0.05839340761303902, "rewards/margins_min": 0.022692536935210228, "rewards/margins_std": 0.02524433098733425, "rewards/rejected": -0.021005798131227493, "step": 430 }, { "epoch": 0.26, "grad_norm": 0.4296875, "learning_rate": 4.6432224654742475e-07, "logits/chosen": -0.0027520388830453157, "logits/rejected": 0.48325324058532715, "logps/chosen": -231.2857208251953, "logps/rejected": -221.3975372314453, "loss": 0.6719, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.017787110060453415, "rewards/margins": 0.04569714143872261, "rewards/margins_max": 0.06507585942745209, "rewards/margins_min": 0.026318421587347984, "rewards/margins_std": 0.027405640110373497, "rewards/rejected": -0.027910029515624046, "step": 440 }, { "epoch": 0.26, "grad_norm": 0.4375, "learning_rate": 4.616714274190011e-07, "logits/chosen": 0.3332589566707611, "logits/rejected": 0.5584608316421509, "logps/chosen": -211.74325561523438, "logps/rejected": -225.31689453125, "loss": 0.6705, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.010198825970292091, "rewards/margins": 0.04217001795768738, "rewards/margins_max": 0.0582113042473793, "rewards/margins_min": 0.026128727942705154, "rewards/margins_std": 0.022685810923576355, "rewards/rejected": -0.031971193850040436, "step": 450 }, { "epoch": 0.27, "grad_norm": 0.435546875, "learning_rate": 4.589337675457273e-07, "logits/chosen": 0.10014849901199341, "logits/rejected": 0.564907431602478, "logps/chosen": -217.19985961914062, "logps/rejected": -214.29440307617188, "loss": 0.6713, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.018607165664434433, "rewards/margins": 0.05433148890733719, "rewards/margins_max": 0.07488565146923065, "rewards/margins_min": 0.033777330070734024, "rewards/margins_std": 0.02906796894967556, "rewards/rejected": -0.03572431951761246, "step": 460 }, { "epoch": 0.27, "grad_norm": 0.4609375, "learning_rate": 4.5611039008544007e-07, "logits/chosen": 0.13153567910194397, "logits/rejected": 0.652635931968689, "logps/chosen": -261.8456726074219, "logps/rejected": -231.66531372070312, "loss": 0.671, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.013766567222774029, "rewards/margins": 0.04572372883558273, "rewards/margins_max": 0.06320376694202423, "rewards/margins_min": 0.028243690729141235, "rewards/margins_std": 0.024720508605241776, "rewards/rejected": -0.03195716068148613, "step": 470 }, { "epoch": 0.28, "grad_norm": 0.419921875, "learning_rate": 4.532024533626457e-07, "logits/chosen": 0.0050893365405499935, "logits/rejected": 0.3075583577156067, "logps/chosen": -214.87033081054688, "logps/rejected": -231.591064453125, "loss": 0.6694, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.012458743527531624, "rewards/margins": 0.046287618577480316, "rewards/margins_max": 0.06574501842260361, "rewards/margins_min": 0.026830215007066727, "rewards/margins_std": 0.02751692570745945, "rewards/rejected": -0.03382887691259384, "step": 480 }, { "epoch": 0.28, "grad_norm": 0.435546875, "learning_rate": 4.502111503933032e-07, "logits/chosen": 0.16573339700698853, "logits/rejected": 0.5059231519699097, "logps/chosen": -214.00900268554688, "logps/rejected": -226.75070190429688, "loss": 0.6705, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.011546745896339417, "rewards/margins": 0.03893359750509262, "rewards/margins_max": 0.0571872778236866, "rewards/margins_min": 0.020679913461208344, "rewards/margins_std": 0.0258146021515131, "rewards/rejected": -0.027386849746108055, "step": 490 }, { "epoch": 0.29, "grad_norm": 0.42578125, "learning_rate": 4.471377083953753e-07, "logits/chosen": 0.19767063856124878, "logits/rejected": 0.6161295175552368, "logps/chosen": -211.5915985107422, "logps/rejected": -231.336669921875, "loss": 0.6672, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.021602794528007507, "rewards/margins": 0.05690021067857742, "rewards/margins_max": 0.08022460341453552, "rewards/margins_min": 0.03357581049203873, "rewards/margins_std": 0.032985687255859375, "rewards/rejected": -0.03529741242527962, "step": 500 }, { "epoch": 0.3, "grad_norm": 0.4609375, "learning_rate": 4.4398338828534766e-07, "logits/chosen": 0.051334965974092484, "logits/rejected": 0.5114815831184387, "logps/chosen": -252.36349487304688, "logps/rejected": -253.6934051513672, "loss": 0.67, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.021400339901447296, "rewards/margins": 0.05237139016389847, "rewards/margins_max": 0.07569600641727448, "rewards/margins_min": 0.029046764597296715, "rewards/margins_std": 0.03298599272966385, "rewards/rejected": -0.030971046537160873, "step": 510 }, { "epoch": 0.3, "grad_norm": 0.40234375, "learning_rate": 4.407494841609224e-07, "logits/chosen": 0.16097505390644073, "logits/rejected": 0.503351092338562, "logps/chosen": -187.7499542236328, "logps/rejected": -182.64669799804688, "loss": 0.6691, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.015485493466258049, "rewards/margins": 0.039487432688474655, "rewards/margins_max": 0.0597788468003273, "rewards/margins_min": 0.019196024164557457, "rewards/margins_std": 0.028696388006210327, "rewards/rejected": -0.024001937359571457, "step": 520 }, { "epoch": 0.31, "grad_norm": 0.462890625, "learning_rate": 4.374373227700993e-07, "logits/chosen": 0.03560265153646469, "logits/rejected": 0.5799299478530884, "logps/chosen": -273.8843688964844, "logps/rejected": -234.033935546875, "loss": 0.6673, "rewards/accuracies": 1.0, "rewards/chosen": 0.007162511348724365, "rewards/margins": 0.0483052022755146, "rewards/margins_max": 0.06804867088794708, "rewards/margins_min": 0.028561726212501526, "rewards/margins_std": 0.027921488508582115, "rewards/rejected": -0.04114269092679024, "step": 530 }, { "epoch": 0.31, "grad_norm": 0.408203125, "learning_rate": 4.340482629668615e-07, "logits/chosen": 0.027306120842695236, "logits/rejected": 0.671806812286377, "logps/chosen": -259.85015869140625, "logps/rejected": -201.55807495117188, "loss": 0.6673, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.02854643389582634, "rewards/margins": 0.0538957342505455, "rewards/margins_max": 0.0864059180021286, "rewards/margins_min": 0.0213855542242527, "rewards/margins_std": 0.045976340770721436, "rewards/rejected": -0.025349300354719162, "step": 540 }, { "epoch": 0.32, "grad_norm": 0.3515625, "learning_rate": 4.30583695153689e-07, "logits/chosen": 0.04380347207188606, "logits/rejected": 0.4509994089603424, "logps/chosen": -273.69775390625, "logps/rejected": -259.96966552734375, "loss": 0.6693, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.022089816629886627, "rewards/margins": 0.056071024388074875, "rewards/margins_max": 0.08100839704275131, "rewards/margins_min": 0.031133651733398438, "rewards/margins_std": 0.035266775637865067, "rewards/rejected": -0.033981211483478546, "step": 550 }, { "epoch": 0.32, "grad_norm": 0.4140625, "learning_rate": 4.2704504071112986e-07, "logits/chosen": 0.10579466819763184, "logits/rejected": 0.5407041311264038, "logps/chosen": -240.98483276367188, "logps/rejected": -211.9040985107422, "loss": 0.6687, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.017832906916737556, "rewards/margins": 0.05916459485888481, "rewards/margins_max": 0.08200596272945404, "rewards/margins_min": 0.036323241889476776, "rewards/margins_std": 0.03230256214737892, "rewards/rejected": -0.041331697255373, "step": 560 }, { "epoch": 0.33, "grad_norm": 0.376953125, "learning_rate": 4.234337514146612e-07, "logits/chosen": 0.11410923302173615, "logits/rejected": 0.6912606954574585, "logps/chosen": -251.16793823242188, "logps/rejected": -229.26553344726562, "loss": 0.6663, "rewards/accuracies": 1.0, "rewards/chosen": 0.019808156415820122, "rewards/margins": 0.05665863677859306, "rewards/margins_max": 0.08191566169261932, "rewards/margins_min": 0.0314016118645668, "rewards/margins_std": 0.03571882098913193, "rewards/rejected": -0.036850474774837494, "step": 570 }, { "epoch": 0.34, "grad_norm": 0.357421875, "learning_rate": 4.197513088390813e-07, "logits/chosen": -0.013543277978897095, "logits/rejected": 0.37492939829826355, "logps/chosen": -232.13333129882812, "logps/rejected": -223.6721954345703, "loss": 0.6657, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.014923980459570885, "rewards/margins": 0.05013802647590637, "rewards/margins_max": 0.07493571937084198, "rewards/margins_min": 0.025340333580970764, "rewards/margins_std": 0.03506923094391823, "rewards/rejected": -0.03521404415369034, "step": 580 }, { "epoch": 0.34, "grad_norm": 0.51171875, "learning_rate": 4.1599922375067554e-07, "logits/chosen": -0.03167729452252388, "logits/rejected": 0.535004734992981, "logps/chosen": -325.4375915527344, "logps/rejected": -253.494873046875, "loss": 0.6668, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.01660420373082161, "rewards/margins": 0.059089016169309616, "rewards/margins_max": 0.08827444911003113, "rewards/margins_min": 0.029903585091233253, "rewards/margins_std": 0.041274432092905045, "rewards/rejected": -0.04248481243848801, "step": 590 }, { "epoch": 0.35, "grad_norm": 0.380859375, "learning_rate": 4.121790354874065e-07, "logits/chosen": 0.05303360894322395, "logits/rejected": 0.40770038962364197, "logps/chosen": -202.06549072265625, "logps/rejected": -214.628173828125, "loss": 0.6649, "rewards/accuracies": 1.0, "rewards/chosen": 0.005082354880869389, "rewards/margins": 0.05396551638841629, "rewards/margins_max": 0.07737747579813004, "rewards/margins_min": 0.03055354580283165, "rewards/margins_std": 0.03310951590538025, "rewards/rejected": -0.04888315126299858, "step": 600 }, { "epoch": 0.35, "grad_norm": 0.369140625, "learning_rate": 4.082923113273822e-07, "logits/chosen": 0.11870566755533218, "logits/rejected": 0.464911550283432, "logps/chosen": -231.35336303710938, "logps/rejected": -234.9374237060547, "loss": 0.6666, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.01106190960854292, "rewards/margins": 0.0625653862953186, "rewards/margins_max": 0.08917935192584991, "rewards/margins_min": 0.03595142811536789, "rewards/margins_std": 0.037637822329998016, "rewards/rejected": -0.05150347948074341, "step": 610 }, { "epoch": 0.36, "grad_norm": 0.443359375, "learning_rate": 4.043406458458609e-07, "logits/chosen": 0.09034819900989532, "logits/rejected": 0.5873952507972717, "logps/chosen": -265.25396728515625, "logps/rejected": -214.2862548828125, "loss": 0.6628, "rewards/accuracies": 1.0, "rewards/chosen": 0.0020419310312718153, "rewards/margins": 0.06574475765228271, "rewards/margins_max": 0.08710642158985138, "rewards/margins_min": 0.04438310116529465, "rewards/margins_std": 0.030209947377443314, "rewards/rejected": -0.06370283663272858, "step": 620 }, { "epoch": 0.37, "grad_norm": 0.4921875, "learning_rate": 4.0032566026105806e-07, "logits/chosen": 0.008516276255249977, "logits/rejected": 0.6535265445709229, "logps/chosen": -260.87298583984375, "logps/rejected": -267.5401916503906, "loss": 0.663, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.03661227226257324, "rewards/margins": 0.07144369184970856, "rewards/margins_max": 0.09834811091423035, "rewards/margins_min": 0.044539276510477066, "rewards/margins_std": 0.03804859146475792, "rewards/rejected": -0.03483142331242561, "step": 630 }, { "epoch": 0.37, "grad_norm": 0.474609375, "learning_rate": 3.9624900176902184e-07, "logits/chosen": 0.013054514303803444, "logits/rejected": 0.3652392029762268, "logps/chosen": -235.1199493408203, "logps/rejected": -248.31411743164062, "loss": 0.6656, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.014549237675964832, "rewards/margins": 0.05561714246869087, "rewards/margins_max": 0.08446307480335236, "rewards/margins_min": 0.026771211996674538, "rewards/margins_std": 0.040794309228658676, "rewards/rejected": -0.041067905724048615, "step": 640 }, { "epoch": 0.38, "grad_norm": 0.41015625, "learning_rate": 3.921123428678511e-07, "logits/chosen": 0.022506317123770714, "logits/rejected": 0.6284270882606506, "logps/chosen": -305.97674560546875, "logps/rejected": -239.0786590576172, "loss": 0.666, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.020474497228860855, "rewards/margins": 0.06788565218448639, "rewards/margins_max": 0.09115969389677048, "rewards/margins_min": 0.044611603021621704, "rewards/margins_std": 0.03291446715593338, "rewards/rejected": -0.047411151230335236, "step": 650 }, { "epoch": 0.38, "grad_norm": 0.478515625, "learning_rate": 3.8791738067153314e-07, "logits/chosen": 0.07077694684267044, "logits/rejected": 0.5682755708694458, "logps/chosen": -231.22695922851562, "logps/rejected": -227.6490478515625, "loss": 0.6622, "rewards/accuracies": 1.0, "rewards/chosen": 0.03146480768918991, "rewards/margins": 0.06544210761785507, "rewards/margins_max": 0.0967545360326767, "rewards/margins_min": 0.034129686653614044, "rewards/margins_std": 0.044282447546720505, "rewards/rejected": -0.03397729992866516, "step": 660 }, { "epoch": 0.39, "grad_norm": 0.41796875, "learning_rate": 3.83665836213682e-07, "logits/chosen": 0.12142015993595123, "logits/rejected": 0.5390751957893372, "logps/chosen": -207.6114501953125, "logps/rejected": -215.29849243164062, "loss": 0.6636, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.011886438354849815, "rewards/margins": 0.05365458130836487, "rewards/margins_max": 0.07296213507652283, "rewards/margins_min": 0.03434702754020691, "rewards/margins_std": 0.027305006980895996, "rewards/rejected": -0.0417681448161602, "step": 670 }, { "epoch": 0.39, "grad_norm": 0.46875, "learning_rate": 3.7935945374146417e-07, "logits/chosen": 0.007061509881168604, "logits/rejected": 0.3642507493495941, "logps/chosen": -236.29788208007812, "logps/rejected": -242.33544921875, "loss": 0.6631, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.02563950978219509, "rewards/margins": 0.05955478549003601, "rewards/margins_max": 0.08539506047964096, "rewards/margins_min": 0.03371449559926987, "rewards/margins_std": 0.036543674767017365, "rewards/rejected": -0.03391526639461517, "step": 680 }, { "epoch": 0.4, "grad_norm": 0.5234375, "learning_rate": 3.75e-07, "logits/chosen": 0.08328167349100113, "logits/rejected": 0.5527598857879639, "logps/chosen": -239.66159057617188, "logps/rejected": -235.6712188720703, "loss": 0.6622, "rewards/accuracies": 1.0, "rewards/chosen": 0.023291967809200287, "rewards/margins": 0.07459411025047302, "rewards/margins_max": 0.1087113469839096, "rewards/margins_min": 0.04047687351703644, "rewards/margins_std": 0.04824905842542648, "rewards/rejected": -0.051302142441272736, "step": 690 }, { "epoch": 0.41, "grad_norm": 0.40625, "learning_rate": 3.7058926350753517e-07, "logits/chosen": 0.04602205008268356, "logits/rejected": 0.6276509165763855, "logps/chosen": -247.14205932617188, "logps/rejected": -208.6519775390625, "loss": 0.6614, "rewards/accuracies": 1.0, "rewards/chosen": 0.022474488243460655, "rewards/margins": 0.07001164555549622, "rewards/margins_max": 0.09704446792602539, "rewards/margins_min": 0.04297882691025734, "rewards/margins_std": 0.038230184465646744, "rewards/rejected": -0.04753715917468071, "step": 700 }, { "epoch": 0.41, "grad_norm": 0.4453125, "learning_rate": 3.661290538216798e-07, "logits/chosen": 0.291398823261261, "logits/rejected": 0.6808168292045593, "logps/chosen": -224.65090942382812, "logps/rejected": -205.6571807861328, "loss": 0.6632, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.0020084187854081392, "rewards/margins": 0.05480460077524185, "rewards/margins_max": 0.0770978108048439, "rewards/margins_min": 0.0325113907456398, "rewards/margins_std": 0.031527359038591385, "rewards/rejected": -0.05279617756605148, "step": 710 }, { "epoch": 0.42, "grad_norm": 0.4375, "learning_rate": 3.616212007970159e-07, "logits/chosen": 0.05395558476448059, "logits/rejected": 0.29135066270828247, "logps/chosen": -189.52139282226562, "logps/rejected": -215.48080444335938, "loss": 0.6633, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.008078034035861492, "rewards/margins": 0.05178927257657051, "rewards/margins_max": 0.0689278393983841, "rewards/margins_min": 0.034650713205337524, "rewards/margins_std": 0.024237588047981262, "rewards/rejected": -0.043711237609386444, "step": 720 }, { "epoch": 0.42, "grad_norm": 0.4609375, "learning_rate": 3.5706755383437703e-07, "logits/chosen": 0.09721295535564423, "logits/rejected": 0.5186147689819336, "logps/chosen": -302.69482421875, "logps/rejected": -258.5033874511719, "loss": 0.6646, "rewards/accuracies": 1.0, "rewards/chosen": 0.020449183881282806, "rewards/margins": 0.052381712943315506, "rewards/margins_max": 0.07583948969841003, "rewards/margins_min": 0.02892393246293068, "rewards/margins_std": 0.0331743024289608, "rewards/rejected": -0.0319325253367424, "step": 730 }, { "epoch": 0.43, "grad_norm": 0.443359375, "learning_rate": 3.5246998112210993e-07, "logits/chosen": 0.13969309628009796, "logits/rejected": 0.6499422192573547, "logps/chosen": -262.07000732421875, "logps/rejected": -253.33364868164062, "loss": 0.6583, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.020577292889356613, "rewards/margins": 0.08194496482610703, "rewards/margins_max": 0.10924677550792694, "rewards/margins_min": 0.05464313551783562, "rewards/margins_std": 0.038610607385635376, "rewards/rejected": -0.061367668211460114, "step": 740 }, { "epoch": 0.44, "grad_norm": 0.39453125, "learning_rate": 3.4783036886962736e-07, "logits/chosen": 0.15751202404499054, "logits/rejected": 0.583830714225769, "logps/chosen": -232.4749298095703, "logps/rejected": -251.43881225585938, "loss": 0.6642, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.013448268175125122, "rewards/margins": 0.06021388620138168, "rewards/margins_max": 0.08211688697338104, "rewards/margins_min": 0.03831087797880173, "rewards/margins_std": 0.030975526198744774, "rewards/rejected": -0.04676561802625656, "step": 750 }, { "epoch": 0.44, "grad_norm": 0.451171875, "learning_rate": 3.4315062053356847e-07, "logits/chosen": -0.02616945281624794, "logits/rejected": 0.5470731854438782, "logps/chosen": -247.7039031982422, "logps/rejected": -204.8767547607422, "loss": 0.6635, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.02075277827680111, "rewards/margins": 0.06478811800479889, "rewards/margins_max": 0.09738490730524063, "rewards/margins_min": 0.03219131752848625, "rewards/margins_std": 0.04609883576631546, "rewards/rejected": -0.04403533786535263, "step": 760 }, { "epoch": 0.45, "grad_norm": 0.515625, "learning_rate": 3.384326560368826e-07, "logits/chosen": 0.040539853274822235, "logits/rejected": 0.5014762878417969, "logps/chosen": -249.2455596923828, "logps/rejected": -242.47781372070312, "loss": 0.662, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.02245604246854782, "rewards/margins": 0.05939044803380966, "rewards/margins_max": 0.08405659347772598, "rewards/margins_min": 0.03472430631518364, "rewards/margins_std": 0.03488319739699364, "rewards/rejected": -0.03693440556526184, "step": 770 }, { "epoch": 0.45, "grad_norm": 0.5, "learning_rate": 3.3367841098115777e-07, "logits/chosen": 0.05805939435958862, "logits/rejected": 0.47922706604003906, "logps/chosen": -286.8292541503906, "logps/rejected": -230.5067138671875, "loss": 0.6653, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.004244116134941578, "rewards/margins": 0.0571456179022789, "rewards/margins_max": 0.08360336720943451, "rewards/margins_min": 0.030687877908349037, "rewards/margins_std": 0.03741690143942833, "rewards/rejected": -0.052901506423950195, "step": 780 }, { "epoch": 0.46, "grad_norm": 0.40234375, "learning_rate": 3.2888983585251713e-07, "logits/chosen": 0.11492130905389786, "logits/rejected": 0.3956727087497711, "logps/chosen": -204.6266632080078, "logps/rejected": -208.7443084716797, "loss": 0.6606, "rewards/accuracies": 1.0, "rewards/chosen": 0.011013984680175781, "rewards/margins": 0.057107020169496536, "rewards/margins_max": 0.07711775600910187, "rewards/margins_min": 0.037096280604600906, "rewards/margins_std": 0.02829946205019951, "rewards/rejected": -0.046093035489320755, "step": 790 }, { "epoch": 0.46, "grad_norm": 0.466796875, "learning_rate": 3.240688952214085e-07, "logits/chosen": -0.019520867615938187, "logits/rejected": 0.34635210037231445, "logps/chosen": -278.4693298339844, "logps/rejected": -257.54986572265625, "loss": 0.6607, "rewards/accuracies": 1.0, "rewards/chosen": 0.020895112305879593, "rewards/margins": 0.08000204712152481, "rewards/margins_max": 0.1040647029876709, "rewards/margins_min": 0.05593939870595932, "rewards/margins_std": 0.034029725939035416, "rewards/rejected": -0.05910693481564522, "step": 800 }, { "epoch": 0.47, "grad_norm": 0.365234375, "learning_rate": 3.192175669366156e-07, "logits/chosen": 0.08061734586954117, "logits/rejected": 0.440199077129364, "logps/chosen": -216.41323852539062, "logps/rejected": -240.26333618164062, "loss": 0.6611, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.011639273725450039, "rewards/margins": 0.061767347157001495, "rewards/margins_max": 0.09113974124193192, "rewards/margins_min": 0.03239493444561958, "rewards/margins_std": 0.04153885692358017, "rewards/rejected": -0.050128065049648285, "step": 810 }, { "epoch": 0.48, "grad_norm": 0.435546875, "learning_rate": 3.14337841313822e-07, "logits/chosen": 0.2162504643201828, "logits/rejected": 0.6251672506332397, "logps/chosen": -249.9015655517578, "logps/rejected": -198.54403686523438, "loss": 0.6629, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.008589675650000572, "rewards/margins": 0.05789928883314133, "rewards/margins_max": 0.07874341309070587, "rewards/margins_min": 0.03705517202615738, "rewards/margins_std": 0.029478034004569054, "rewards/rejected": -0.0493096187710762, "step": 820 }, { "epoch": 0.48, "grad_norm": 0.443359375, "learning_rate": 3.094317203190603e-07, "logits/chosen": -0.0029448375571519136, "logits/rejected": 0.4555005431175232, "logps/chosen": -240.8060760498047, "logps/rejected": -222.56246948242188, "loss": 0.6561, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.022363275289535522, "rewards/margins": 0.08168495446443558, "rewards/margins_max": 0.11077789962291718, "rewards/margins_min": 0.052591998130083084, "rewards/margins_std": 0.04114364832639694, "rewards/rejected": -0.059321679174900055, "step": 830 }, { "epoch": 0.49, "grad_norm": 0.38671875, "learning_rate": 3.045012167473814e-07, "logits/chosen": 0.1808149516582489, "logits/rejected": 0.5233570337295532, "logps/chosen": -263.43255615234375, "logps/rejected": -270.8913269042969, "loss": 0.6616, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.02442259155213833, "rewards/margins": 0.0733276903629303, "rewards/margins_max": 0.104800745844841, "rewards/margins_min": 0.041854631155729294, "rewards/margins_std": 0.04450962692499161, "rewards/rejected": -0.04890510439872742, "step": 840 }, { "epoch": 0.49, "grad_norm": 0.4140625, "learning_rate": 2.995483533970809e-07, "logits/chosen": 0.2622363269329071, "logits/rejected": 0.7754552960395813, "logps/chosen": -228.362060546875, "logps/rejected": -187.44383239746094, "loss": 0.6618, "rewards/accuracies": 1.0, "rewards/chosen": 0.011710538528859615, "rewards/margins": 0.06277038902044296, "rewards/margins_max": 0.08341649174690247, "rewards/margins_min": 0.04212428256869316, "rewards/margins_std": 0.029198000207543373, "rewards/rejected": -0.05105985328555107, "step": 850 }, { "epoch": 0.5, "grad_norm": 0.453125, "learning_rate": 2.9457516223982235e-07, "logits/chosen": 0.11260411888360977, "logits/rejected": 0.47127556800842285, "logps/chosen": -251.4638214111328, "logps/rejected": -251.6316680908203, "loss": 0.6609, "rewards/accuracies": 1.0, "rewards/chosen": 0.009782608598470688, "rewards/margins": 0.07295442372560501, "rewards/margins_max": 0.10423107445240021, "rewards/margins_min": 0.04167778044939041, "rewards/margins_std": 0.044231854379177094, "rewards/rejected": -0.06317181885242462, "step": 860 }, { "epoch": 0.5, "grad_norm": 0.44921875, "learning_rate": 2.895836835869962e-07, "logits/chosen": 0.03560788184404373, "logits/rejected": 0.4069921374320984, "logps/chosen": -228.38876342773438, "logps/rejected": -221.29638671875, "loss": 0.662, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.009866083040833473, "rewards/margins": 0.06033489108085632, "rewards/margins_max": 0.09506522119045258, "rewards/margins_min": 0.02560456469655037, "rewards/margins_std": 0.0491160973906517, "rewards/rejected": -0.050468809902668, "step": 870 }, { "epoch": 0.51, "grad_norm": 0.48046875, "learning_rate": 2.845759652526574e-07, "logits/chosen": 0.07124204933643341, "logits/rejected": 0.5192992687225342, "logps/chosen": -234.10836791992188, "logps/rejected": -189.55230712890625, "loss": 0.66, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.01570773683488369, "rewards/margins": 0.05234966799616814, "rewards/margins_max": 0.07433562725782394, "rewards/margins_min": 0.030363699421286583, "rewards/margins_std": 0.031092852354049683, "rewards/rejected": -0.036641925573349, "step": 880 }, { "epoch": 0.52, "grad_norm": 0.427734375, "learning_rate": 2.795540617133853e-07, "logits/chosen": 0.24306873977184296, "logits/rejected": 0.4881308674812317, "logps/chosen": -233.5541534423828, "logps/rejected": -271.29119873046875, "loss": 0.6601, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.0062574222683906555, "rewards/margins": 0.06694331020116806, "rewards/margins_max": 0.0913429707288742, "rewards/margins_min": 0.04254365712404251, "rewards/margins_std": 0.03450632840394974, "rewards/rejected": -0.060685895383358, "step": 890 }, { "epoch": 0.52, "grad_norm": 0.40234375, "learning_rate": 2.7452003326540995e-07, "logits/chosen": 0.1885126382112503, "logits/rejected": 0.6096329689025879, "logps/chosen": -223.55380249023438, "logps/rejected": -210.834716796875, "loss": 0.6613, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.01565275713801384, "rewards/margins": 0.0681251734495163, "rewards/margins_max": 0.0929432287812233, "rewards/margins_min": 0.043307114392519, "rewards/margins_std": 0.035098038613796234, "rewards/rejected": -0.05247241258621216, "step": 900 }, { "epoch": 0.53, "grad_norm": 0.369140625, "learning_rate": 2.694759451793508e-07, "logits/chosen": 0.3056187033653259, "logits/rejected": 0.5238193273544312, "logps/chosen": -180.62220764160156, "logps/rejected": -202.76705932617188, "loss": 0.6628, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.005610722117125988, "rewards/margins": 0.053133320063352585, "rewards/margins_max": 0.0700041875243187, "rewards/margins_min": 0.03626246377825737, "rewards/margins_std": 0.023858997970819473, "rewards/rejected": -0.04752260446548462, "step": 910 }, { "epoch": 0.53, "grad_norm": 0.48828125, "learning_rate": 2.644238668529146e-07, "logits/chosen": 0.21234102547168732, "logits/rejected": 0.48591142892837524, "logps/chosen": -223.54971313476562, "logps/rejected": -248.9346466064453, "loss": 0.6607, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.017756493762135506, "rewards/margins": 0.07771660387516022, "rewards/margins_max": 0.11433382332324982, "rewards/margins_min": 0.04109939560294151, "rewards/margins_std": 0.05178455635905266, "rewards/rejected": -0.05996011570096016, "step": 920 }, { "epoch": 0.54, "grad_norm": 0.396484375, "learning_rate": 2.593658709619001e-07, "logits/chosen": 0.11299429088830948, "logits/rejected": 0.5906545519828796, "logps/chosen": -222.49609375, "logps/rejected": -204.37290954589844, "loss": 0.6601, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.02080368809401989, "rewards/margins": 0.07051359862089157, "rewards/margins_max": 0.10480418056249619, "rewards/margins_min": 0.03622300922870636, "rewards/margins_std": 0.048494212329387665, "rewards/rejected": -0.04970990866422653, "step": 930 }, { "epoch": 0.55, "grad_norm": 0.423828125, "learning_rate": 2.5430403260985807e-07, "logits/chosen": 0.11868913471698761, "logits/rejected": 0.5508742332458496, "logps/chosen": -212.3166961669922, "logps/rejected": -219.1356658935547, "loss": 0.6583, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.021529385820031166, "rewards/margins": 0.06332559883594513, "rewards/margins_max": 0.0937047004699707, "rewards/margins_min": 0.03294649347662926, "rewards/margins_std": 0.042962536215782166, "rewards/rejected": -0.04179621487855911, "step": 940 }, { "epoch": 0.55, "grad_norm": 0.470703125, "learning_rate": 2.4924042847675503e-07, "logits/chosen": 0.06126406043767929, "logits/rejected": 0.5420705080032349, "logps/chosen": -294.85845947265625, "logps/rejected": -215.2727813720703, "loss": 0.661, "rewards/accuracies": 0.875, "rewards/chosen": 0.007373870350420475, "rewards/margins": 0.05419896915555, "rewards/margins_max": 0.08067617565393448, "rewards/margins_min": 0.02772175334393978, "rewards/margins_std": 0.03744443506002426, "rewards/rejected": -0.0468250997364521, "step": 950 }, { "epoch": 0.56, "grad_norm": 0.47265625, "learning_rate": 2.441771359669902e-07, "logits/chosen": 0.13893456757068634, "logits/rejected": 0.4921324849128723, "logps/chosen": -235.5193634033203, "logps/rejected": -225.794189453125, "loss": 0.6607, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.012106789276003838, "rewards/margins": 0.06842382997274399, "rewards/margins_max": 0.100715771317482, "rewards/margins_min": 0.03613189607858658, "rewards/margins_std": 0.045667704194784164, "rewards/rejected": -0.056317038834095, "step": 960 }, { "epoch": 0.56, "grad_norm": 0.443359375, "learning_rate": 2.391162323571161e-07, "logits/chosen": 0.07089251279830933, "logits/rejected": 0.48170119524002075, "logps/chosen": -230.9342498779297, "logps/rejected": -226.3340301513672, "loss": 0.6617, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.010878843255341053, "rewards/margins": 0.06217268109321594, "rewards/margins_max": 0.08883620798587799, "rewards/margins_min": 0.03550915792584419, "rewards/margins_std": 0.037707917392253876, "rewards/rejected": -0.051293838769197464, "step": 970 }, { "epoch": 0.57, "grad_norm": 0.42578125, "learning_rate": 2.340597939436097e-07, "logits/chosen": 0.03681742399930954, "logits/rejected": 0.5955736041069031, "logps/chosen": -234.0045166015625, "logps/rejected": -216.2124786376953, "loss": 0.6614, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.0253006462007761, "rewards/margins": 0.06550078094005585, "rewards/margins_max": 0.0953935831785202, "rewards/margins_min": 0.035607993602752686, "rewards/margins_std": 0.04227479174733162, "rewards/rejected": -0.0402001328766346, "step": 980 }, { "epoch": 0.57, "grad_norm": 0.42578125, "learning_rate": 2.2900989519104796e-07, "logits/chosen": 0.1664225161075592, "logits/rejected": 0.4196982979774475, "logps/chosen": -182.28829956054688, "logps/rejected": -211.08865356445312, "loss": 0.6625, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.0058049350045621395, "rewards/margins": 0.06564933061599731, "rewards/margins_max": 0.09529349207878113, "rewards/margins_min": 0.036005161702632904, "rewards/margins_std": 0.04192318022251129, "rewards/rejected": -0.05984439328312874, "step": 990 }, { "epoch": 0.58, "grad_norm": 0.4375, "learning_rate": 2.2396860788103353e-07, "logits/chosen": -0.04069889336824417, "logits/rejected": 0.4455093741416931, "logps/chosen": -208.73477172851562, "logps/rejected": -199.85501098632812, "loss": 0.6608, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.015201890841126442, "rewards/margins": 0.08097913861274719, "rewards/margins_max": 0.11325138807296753, "rewards/margins_min": 0.04870688170194626, "rewards/margins_std": 0.04563985764980316, "rewards/rejected": -0.0657772421836853, "step": 1000 }, { "epoch": 0.59, "grad_norm": 0.451171875, "learning_rate": 2.1893800026222083e-07, "logits/chosen": 0.24370861053466797, "logits/rejected": 0.655241847038269, "logps/chosen": -239.9451446533203, "logps/rejected": -255.0171356201172, "loss": 0.6612, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.01818387396633625, "rewards/margins": 0.06645138561725616, "rewards/margins_max": 0.0944729745388031, "rewards/margins_min": 0.03842979669570923, "rewards/margins_std": 0.039628516882658005, "rewards/rejected": -0.048267509788274765, "step": 1010 }, { "epoch": 0.59, "grad_norm": 0.376953125, "learning_rate": 2.1392013620179336e-07, "logits/chosen": -0.15726599097251892, "logits/rejected": 0.27727076411247253, "logps/chosen": -208.62881469726562, "logps/rejected": -205.62429809570312, "loss": 0.6593, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.012712801806628704, "rewards/margins": 0.07130307704210281, "rewards/margins_max": 0.09740529954433441, "rewards/margins_min": 0.04520086199045181, "rewards/margins_std": 0.03691411018371582, "rewards/rejected": -0.05859028175473213, "step": 1020 }, { "epoch": 0.6, "grad_norm": 0.373046875, "learning_rate": 2.0891707433873623e-07, "logits/chosen": 0.2577076256275177, "logits/rejected": 0.5587279796600342, "logps/chosen": -232.6507568359375, "logps/rejected": -236.791015625, "loss": 0.6608, "rewards/accuracies": 1.0, "rewards/chosen": 0.007417677901685238, "rewards/margins": 0.06323407590389252, "rewards/margins_max": 0.09169165790081024, "rewards/margins_min": 0.03477650135755539, "rewards/margins_std": 0.040245089679956436, "rewards/rejected": -0.055816400796175, "step": 1030 }, { "epoch": 0.6, "grad_norm": 0.4609375, "learning_rate": 2.039308672392556e-07, "logits/chosen": 0.09692186862230301, "logits/rejected": 0.5365327000617981, "logps/chosen": -220.7172393798828, "logps/rejected": -204.85055541992188, "loss": 0.6567, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.016125962138175964, "rewards/margins": 0.06824339926242828, "rewards/margins_max": 0.10508973896503448, "rewards/margins_min": 0.03139704838395119, "rewards/margins_std": 0.052108604460954666, "rewards/rejected": -0.05211742967367172, "step": 1040 }, { "epoch": 0.61, "grad_norm": 0.36328125, "learning_rate": 1.9896356055468845e-07, "logits/chosen": 0.24312233924865723, "logits/rejected": 0.5007752180099487, "logps/chosen": -217.9171600341797, "logps/rejected": -255.72866821289062, "loss": 0.6605, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.015429767780005932, "rewards/margins": 0.06471355259418488, "rewards/margins_max": 0.09141434729099274, "rewards/margins_min": 0.03801275044679642, "rewards/margins_std": 0.03776064142584801, "rewards/rejected": -0.04928378015756607, "step": 1050 }, { "epoch": 0.61, "grad_norm": 0.359375, "learning_rate": 1.940171921822496e-07, "logits/chosen": 0.007707296404987574, "logits/rejected": 0.3314017653465271, "logps/chosen": -218.86654663085938, "logps/rejected": -214.7074737548828, "loss": 0.6625, "rewards/accuracies": 1.0, "rewards/chosen": 0.010595353320240974, "rewards/margins": 0.05604109913110733, "rewards/margins_max": 0.08353577554225922, "rewards/margins_min": 0.028546428307890892, "rewards/margins_std": 0.03888333961367607, "rewards/rejected": -0.045445747673511505, "step": 1060 }, { "epoch": 0.62, "grad_norm": 0.421875, "learning_rate": 1.8909379142895977e-07, "logits/chosen": 0.08975931257009506, "logits/rejected": 0.49662691354751587, "logps/chosen": -243.73941040039062, "logps/rejected": -218.0565643310547, "loss": 0.6628, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.017341626808047295, "rewards/margins": 0.06548301875591278, "rewards/margins_max": 0.10044316947460175, "rewards/margins_min": 0.030522847548127174, "rewards/margins_std": 0.0494411401450634, "rewards/rejected": -0.04814138263463974, "step": 1070 }, { "epoch": 0.63, "grad_norm": 0.419921875, "learning_rate": 1.841953781790983e-07, "logits/chosen": 0.14877240359783173, "logits/rejected": 0.32807669043540955, "logps/chosen": -201.35398864746094, "logps/rejected": -237.98403930664062, "loss": 0.6614, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.011331291869282722, "rewards/margins": 0.05169866234064102, "rewards/margins_max": 0.08101126551628113, "rewards/margins_min": 0.02238604798913002, "rewards/margins_std": 0.041454292833805084, "rewards/rejected": -0.04036737233400345, "step": 1080 }, { "epoch": 0.63, "grad_norm": 0.5234375, "learning_rate": 1.793239620655211e-07, "logits/chosen": 0.10640072822570801, "logits/rejected": 0.5526248812675476, "logps/chosen": -198.35403442382812, "logps/rejected": -196.8388671875, "loss": 0.6604, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.0263301283121109, "rewards/margins": 0.07441949844360352, "rewards/margins_max": 0.1034015566110611, "rewards/margins_min": 0.045437444001436234, "rewards/margins_std": 0.040986817330121994, "rewards/rejected": -0.04808937385678291, "step": 1090 }, { "epoch": 0.64, "grad_norm": 0.390625, "learning_rate": 1.744815416451847e-07, "logits/chosen": 0.1694943606853485, "logits/rejected": 0.6004883050918579, "logps/chosen": -255.3223114013672, "logps/rejected": -243.01541137695312, "loss": 0.6625, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.01719365268945694, "rewards/margins": 0.06180461123585701, "rewards/margins_max": 0.08655586838722229, "rewards/margins_min": 0.03705335780978203, "rewards/margins_std": 0.03500355780124664, "rewards/rejected": -0.04461096227169037, "step": 1100 }, { "epoch": 0.64, "grad_norm": 0.4453125, "learning_rate": 1.6967010357921446e-07, "logits/chosen": 0.11355743557214737, "logits/rejected": 0.4874862730503082, "logps/chosen": -210.58767700195312, "logps/rejected": -219.46701049804688, "loss": 0.6618, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.005143271759152412, "rewards/margins": 0.061519283801317215, "rewards/margins_max": 0.0864943265914917, "rewards/margins_min": 0.036544252187013626, "rewards/margins_std": 0.035320036113262177, "rewards/rejected": -0.05637601017951965, "step": 1110 }, { "epoch": 0.65, "grad_norm": 0.439453125, "learning_rate": 1.6489162181785255e-07, "logits/chosen": 0.15795719623565674, "logits/rejected": 0.5425394773483276, "logps/chosen": -245.29562377929688, "logps/rejected": -233.9000244140625, "loss": 0.6602, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.021811651065945625, "rewards/margins": 0.07487231492996216, "rewards/margins_max": 0.09871380031108856, "rewards/margins_min": 0.051030855625867844, "rewards/margins_std": 0.03371693566441536, "rewards/rejected": -0.05306067317724228, "step": 1120 }, { "epoch": 0.66, "grad_norm": 0.361328125, "learning_rate": 1.6014805679062183e-07, "logits/chosen": -0.04248831048607826, "logits/rejected": 0.36503881216049194, "logps/chosen": -204.58383178710938, "logps/rejected": -203.0003204345703, "loss": 0.6607, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.020199652761220932, "rewards/margins": 0.08475508540868759, "rewards/margins_max": 0.11757893860340118, "rewards/margins_min": 0.051931243389844894, "rewards/margins_std": 0.046419933438301086, "rewards/rejected": -0.06455543637275696, "step": 1130 }, { "epoch": 0.66, "grad_norm": 0.482421875, "learning_rate": 1.5544135460203527e-07, "logits/chosen": 0.250204861164093, "logits/rejected": 0.5448838472366333, "logps/chosen": -212.43508911132812, "logps/rejected": -247.50747680664062, "loss": 0.6601, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.013406927697360516, "rewards/margins": 0.07055126130580902, "rewards/margins_max": 0.09891954064369202, "rewards/margins_min": 0.04218297451734543, "rewards/margins_std": 0.04011881351470947, "rewards/rejected": -0.05714433267712593, "step": 1140 }, { "epoch": 0.67, "grad_norm": 0.408203125, "learning_rate": 1.5077344623318388e-07, "logits/chosen": 0.08146306127309799, "logits/rejected": 0.5028539896011353, "logps/chosen": -244.5470733642578, "logps/rejected": -203.9750213623047, "loss": 0.6622, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.00543981185182929, "rewards/margins": 0.0606420524418354, "rewards/margins_max": 0.09149619191884995, "rewards/margins_min": 0.029787922278046608, "rewards/margins_std": 0.043634332716464996, "rewards/rejected": -0.05520225316286087, "step": 1150 }, { "epoch": 0.67, "grad_norm": 0.4921875, "learning_rate": 1.461462467495284e-07, "logits/chosen": 0.09238779544830322, "logits/rejected": 0.5282326340675354, "logps/chosen": -239.08853149414062, "logps/rejected": -234.31228637695312, "loss": 0.6582, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.020727628841996193, "rewards/margins": 0.07139938324689865, "rewards/margins_max": 0.0972491055727005, "rewards/margins_min": 0.045549679547548294, "rewards/margins_std": 0.036557018756866455, "rewards/rejected": -0.0506717674434185, "step": 1160 }, { "epoch": 0.68, "grad_norm": 0.400390625, "learning_rate": 1.4156165451522028e-07, "logits/chosen": 0.08472833782434464, "logits/rejected": 0.5027869939804077, "logps/chosen": -205.4404754638672, "logps/rejected": -202.98440551757812, "loss": 0.663, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.011948509141802788, "rewards/margins": 0.06199117749929428, "rewards/margins_max": 0.08956360816955566, "rewards/margins_min": 0.03441876173019409, "rewards/margins_std": 0.038993291556835175, "rewards/rejected": -0.05004267022013664, "step": 1170 }, { "epoch": 0.68, "grad_norm": 0.470703125, "learning_rate": 1.3702155041427543e-07, "logits/chosen": 0.1654224544763565, "logits/rejected": 0.39103928208351135, "logps/chosen": -221.5464630126953, "logps/rejected": -246.1484832763672, "loss": 0.6611, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.008782127872109413, "rewards/margins": 0.05567712336778641, "rewards/margins_max": 0.07324758917093277, "rewards/margins_min": 0.038106657564640045, "rewards/margins_std": 0.024848390370607376, "rewards/rejected": -0.046894993633031845, "step": 1180 }, { "epoch": 0.69, "grad_norm": 0.4375, "learning_rate": 1.3252779707891902e-07, "logits/chosen": 0.009541223756968975, "logits/rejected": 0.48217493295669556, "logps/chosen": -272.9510192871094, "logps/rejected": -204.46435546875, "loss": 0.6611, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.009134182706475258, "rewards/margins": 0.05944829061627388, "rewards/margins_max": 0.08002766221761703, "rewards/margins_min": 0.03886892646551132, "rewards/margins_std": 0.02910362184047699, "rewards/rejected": -0.05031410977244377, "step": 1190 }, { "epoch": 0.7, "grad_norm": 0.462890625, "learning_rate": 1.2808223812541774e-07, "logits/chosen": 0.07254563271999359, "logits/rejected": 0.47662535309791565, "logps/chosen": -241.54336547851562, "logps/rejected": -211.88424682617188, "loss": 0.6606, "rewards/accuracies": 1.0, "rewards/chosen": 0.0020990788470953703, "rewards/margins": 0.05149079114198685, "rewards/margins_max": 0.08034542202949524, "rewards/margins_min": 0.022636160254478455, "rewards/margins_std": 0.040806613862514496, "rewards/rejected": -0.04939170926809311, "step": 1200 }, { "epoch": 0.7, "grad_norm": 0.4375, "learning_rate": 1.2368669739771469e-07, "logits/chosen": 0.07886068522930145, "logits/rejected": 0.4947189390659332, "logps/chosen": -206.33993530273438, "logps/rejected": -212.7965850830078, "loss": 0.6578, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.009903495199978352, "rewards/margins": 0.0682389959692955, "rewards/margins_max": 0.09637950360774994, "rewards/margins_min": 0.04009848088026047, "rewards/margins_std": 0.03979669511318207, "rewards/rejected": -0.058335501700639725, "step": 1210 }, { "epoch": 0.71, "grad_norm": 0.439453125, "learning_rate": 1.1934297821917497e-07, "logits/chosen": -0.18527595698833466, "logits/rejected": 0.35417476296424866, "logps/chosen": -271.8248291015625, "logps/rejected": -208.87966918945312, "loss": 0.6619, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.014687316492199898, "rewards/margins": 0.05254317447543144, "rewards/margins_max": 0.0765123963356018, "rewards/margins_min": 0.028573954477906227, "rewards/margins_std": 0.03389759734272957, "rewards/rejected": -0.03785586357116699, "step": 1220 }, { "epoch": 0.71, "grad_norm": 0.40234375, "learning_rate": 1.1505286265275094e-07, "logits/chosen": 0.09351782500743866, "logits/rejected": 0.5304566621780396, "logps/chosen": -217.6367645263672, "logps/rejected": -209.18603515625, "loss": 0.666, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.01146542839705944, "rewards/margins": 0.07028119266033173, "rewards/margins_max": 0.10538403689861298, "rewards/margins_min": 0.03517835959792137, "rewards/margins_std": 0.0496429018676281, "rewards/rejected": -0.05881576985120773, "step": 1230 }, { "epoch": 0.72, "grad_norm": 0.390625, "learning_rate": 1.1081811076986963e-07, "logits/chosen": 0.026241421699523926, "logits/rejected": 0.6041153073310852, "logps/chosen": -228.3728790283203, "logps/rejected": -190.1019287109375, "loss": 0.6596, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.016418198123574257, "rewards/margins": 0.0706411749124527, "rewards/margins_max": 0.09941698610782623, "rewards/margins_min": 0.041865330189466476, "rewards/margins_std": 0.04069516435265541, "rewards/rejected": -0.054222963750362396, "step": 1240 }, { "epoch": 0.73, "grad_norm": 0.427734375, "learning_rate": 1.0664045992834184e-07, "logits/chosen": 0.19840288162231445, "logits/rejected": 0.5584182143211365, "logps/chosen": -254.10147094726562, "logps/rejected": -256.0483703613281, "loss": 0.6583, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.012557362206280231, "rewards/margins": 0.06964166462421417, "rewards/margins_max": 0.09085742384195328, "rewards/margins_min": 0.04842590540647507, "rewards/margins_std": 0.030003610998392105, "rewards/rejected": -0.057084303349256516, "step": 1250 }, { "epoch": 0.73, "grad_norm": 0.484375, "learning_rate": 1.0252162405959042e-07, "logits/chosen": -0.029180001467466354, "logits/rejected": 0.4648149609565735, "logps/chosen": -273.28375244140625, "logps/rejected": -244.730712890625, "loss": 0.6602, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.02007482200860977, "rewards/margins": 0.06700652837753296, "rewards/margins_max": 0.10410724580287933, "rewards/margins_min": 0.029905814677476883, "rewards/margins_std": 0.05246833711862564, "rewards/rejected": -0.04693170636892319, "step": 1260 }, { "epoch": 0.74, "grad_norm": 0.494140625, "learning_rate": 9.846329296548963e-08, "logits/chosen": -0.017562460154294968, "logits/rejected": 0.4763096868991852, "logps/chosen": -269.8515625, "logps/rejected": -263.83148193359375, "loss": 0.6598, "rewards/accuracies": 1.0, "rewards/chosen": 0.010733803734183311, "rewards/margins": 0.07448114454746246, "rewards/margins_max": 0.10118886083364487, "rewards/margins_min": 0.04777342826128006, "rewards/margins_std": 0.03777041286230087, "rewards/rejected": -0.0637473464012146, "step": 1270 }, { "epoch": 0.74, "grad_norm": 0.486328125, "learning_rate": 9.446713162510341e-08, "logits/chosen": 0.22771111130714417, "logits/rejected": 0.7621752023696899, "logps/chosen": -266.06390380859375, "logps/rejected": -250.635498046875, "loss": 0.6584, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.030348753556609154, "rewards/margins": 0.07343067973852158, "rewards/margins_max": 0.10677297413349152, "rewards/margins_min": 0.040088407695293427, "rewards/margins_std": 0.04715309664607048, "rewards/rejected": -0.04308192804455757, "step": 1280 }, { "epoch": 0.75, "grad_norm": 0.515625, "learning_rate": 9.053477951160737e-08, "logits/chosen": 0.015399669297039509, "logits/rejected": 0.7483765482902527, "logps/chosen": -276.5067443847656, "logps/rejected": -227.33761596679688, "loss": 0.6579, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.026790842413902283, "rewards/margins": 0.08279003202915192, "rewards/margins_max": 0.11221597343683243, "rewards/margins_min": 0.05336407572031021, "rewards/margins_std": 0.04161457344889641, "rewards/rejected": -0.05599917098879814, "step": 1290 }, { "epoch": 0.75, "grad_norm": 0.396484375, "learning_rate": 8.666784991967596e-08, "logits/chosen": 0.010845961980521679, "logits/rejected": 0.42500224709510803, "logps/chosen": -213.1592254638672, "logps/rejected": -199.2817840576172, "loss": 0.6613, "rewards/accuracies": 1.0, "rewards/chosen": 0.014592917636036873, "rewards/margins": 0.0668349340558052, "rewards/margins_max": 0.09872870147228241, "rewards/margins_min": 0.03494114801287651, "rewards/margins_std": 0.04510461539030075, "rewards/rejected": -0.05224201828241348, "step": 1300 }, { "epoch": 0.76, "grad_norm": 0.4921875, "learning_rate": 8.286792930360823e-08, "logits/chosen": 0.25165149569511414, "logits/rejected": 0.6992672681808472, "logps/chosen": -217.0974884033203, "logps/rejected": -202.47030639648438, "loss": 0.6599, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.011730032041668892, "rewards/margins": 0.0590001717209816, "rewards/margins_max": 0.07914995402097702, "rewards/margins_min": 0.03885037824511528, "rewards/margins_std": 0.02849610149860382, "rewards/rejected": -0.04727013781666756, "step": 1310 }, { "epoch": 0.77, "grad_norm": 0.52734375, "learning_rate": 7.91365766264665e-08, "logits/chosen": 0.20514824986457825, "logits/rejected": 0.5356392860412598, "logps/chosen": -248.6316680908203, "logps/rejected": -240.5338134765625, "loss": 0.6591, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.010535435751080513, "rewards/margins": 0.06282900273799896, "rewards/margins_max": 0.09407368302345276, "rewards/margins_min": 0.031584326177835464, "rewards/margins_std": 0.04418665170669556, "rewards/rejected": -0.052293576300144196, "step": 1320 }, { "epoch": 0.77, "grad_norm": 0.455078125, "learning_rate": 7.547532272049264e-08, "logits/chosen": 0.25605538487434387, "logits/rejected": 0.6374403238296509, "logps/chosen": -255.80410766601562, "logps/rejected": -255.73764038085938, "loss": 0.6619, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.013418711721897125, "rewards/margins": 0.06125851348042488, "rewards/margins_max": 0.08139893412590027, "rewards/margins_min": 0.04111810773611069, "rewards/margins_std": 0.028482843190431595, "rewards/rejected": -0.047839801758527756, "step": 1330 }, { "epoch": 0.78, "grad_norm": 0.4140625, "learning_rate": 7.188566965906584e-08, "logits/chosen": 0.10137088596820831, "logits/rejected": 0.5515474081039429, "logps/chosen": -271.2210693359375, "logps/rejected": -272.3622131347656, "loss": 0.6598, "rewards/accuracies": 1.0, "rewards/chosen": -0.00015007219917606562, "rewards/margins": 0.06623668223619461, "rewards/margins_max": 0.10004226863384247, "rewards/margins_min": 0.03243108466267586, "rewards/margins_std": 0.04780833050608635, "rewards/rejected": -0.06638675183057785, "step": 1340 }, { "epoch": 0.78, "grad_norm": 0.412109375, "learning_rate": 6.836909014045924e-08, "logits/chosen": 0.005819192621856928, "logits/rejected": 0.38501212000846863, "logps/chosen": -247.23056030273438, "logps/rejected": -238.4652557373047, "loss": 0.6607, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.01672416180372238, "rewards/margins": 0.07304920256137848, "rewards/margins_max": 0.10092739760875702, "rewards/margins_min": 0.04517098516225815, "rewards/margins_std": 0.039425741881132126, "rewards/rejected": -0.0563250370323658, "step": 1350 }, { "epoch": 0.79, "grad_norm": 0.4609375, "learning_rate": 6.492702688364737e-08, "logits/chosen": -0.07613168656826019, "logits/rejected": 0.20295462012290955, "logps/chosen": -203.92233276367188, "logps/rejected": -247.69277954101562, "loss": 0.6604, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.014894701540470123, "rewards/margins": 0.06641440093517303, "rewards/margins_max": 0.09283626079559326, "rewards/margins_min": 0.039992526173591614, "rewards/margins_std": 0.037366170436143875, "rewards/rejected": -0.05151969939470291, "step": 1360 }, { "epoch": 0.79, "grad_norm": 0.4375, "learning_rate": 6.156089203641373e-08, "logits/chosen": -0.014948748052120209, "logits/rejected": 0.4398605227470398, "logps/chosen": -247.429931640625, "logps/rejected": -251.06826782226562, "loss": 0.6571, "rewards/accuracies": 1.0, "rewards/chosen": 0.0333079919219017, "rewards/margins": 0.08266235888004303, "rewards/margins_max": 0.10667815059423447, "rewards/margins_min": 0.0586465522646904, "rewards/margins_std": 0.03396347165107727, "rewards/rejected": -0.04935435950756073, "step": 1370 }, { "epoch": 0.8, "grad_norm": 0.427734375, "learning_rate": 5.827206659599987e-08, "logits/chosen": 0.28106218576431274, "logits/rejected": 0.7749143242835999, "logps/chosen": -222.03665161132812, "logps/rejected": -200.11221313476562, "loss": 0.6576, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.017674388363957405, "rewards/margins": 0.07599468529224396, "rewards/margins_max": 0.11385379731655121, "rewards/margins_min": 0.038135576993227005, "rewards/margins_std": 0.05354086682200432, "rewards/rejected": -0.058320302516222, "step": 1380 }, { "epoch": 0.81, "grad_norm": 0.50390625, "learning_rate": 5.506189984253501e-08, "logits/chosen": 0.16949541866779327, "logits/rejected": 0.4548502564430237, "logps/chosen": -205.447265625, "logps/rejected": -221.4696044921875, "loss": 0.6611, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.003050294006243348, "rewards/margins": 0.06650832295417786, "rewards/margins_max": 0.09234586358070374, "rewards/margins_min": 0.040670786052942276, "rewards/margins_std": 0.036539800465106964, "rewards/rejected": -0.06345803290605545, "step": 1390 }, { "epoch": 0.81, "grad_norm": 0.482421875, "learning_rate": 5.1931708785477506e-08, "logits/chosen": 0.11355874687433243, "logits/rejected": 0.6481127738952637, "logps/chosen": -216.15432739257812, "logps/rejected": -187.30389404296875, "loss": 0.6592, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.015445582568645477, "rewards/margins": 0.05808136984705925, "rewards/margins_max": 0.08922155201435089, "rewards/margins_min": 0.026941198855638504, "rewards/margins_std": 0.04403885826468468, "rewards/rejected": -0.04263579100370407, "step": 1400 }, { "epoch": 0.82, "grad_norm": 0.380859375, "learning_rate": 4.888277762329582e-08, "logits/chosen": 0.11872565746307373, "logits/rejected": 0.5771151185035706, "logps/chosen": -215.25442504882812, "logps/rejected": -214.4876251220703, "loss": 0.6619, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.01657172292470932, "rewards/margins": 0.06676243245601654, "rewards/margins_max": 0.0983147844672203, "rewards/margins_min": 0.03521009162068367, "rewards/margins_std": 0.04462175816297531, "rewards/rejected": -0.05019070953130722, "step": 1410 }, { "epoch": 0.82, "grad_norm": 0.439453125, "learning_rate": 4.591635721661072e-08, "logits/chosen": 0.1136382669210434, "logits/rejected": 0.5482941269874573, "logps/chosen": -243.9540557861328, "logps/rejected": -231.51473999023438, "loss": 0.6606, "rewards/accuracies": 1.0, "rewards/chosen": 0.01714186929166317, "rewards/margins": 0.07303180545568466, "rewards/margins_max": 0.10039409250020981, "rewards/margins_min": 0.045669522136449814, "rewards/margins_std": 0.03869611397385597, "rewards/rejected": -0.05588993430137634, "step": 1420 }, { "epoch": 0.83, "grad_norm": 0.431640625, "learning_rate": 4.3033664575015005e-08, "logits/chosen": 0.24127981066703796, "logits/rejected": 0.6273223161697388, "logps/chosen": -258.4788818359375, "logps/rejected": -255.1360321044922, "loss": 0.6591, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.0254741869866848, "rewards/margins": 0.0617264024913311, "rewards/margins_max": 0.08791927993297577, "rewards/margins_min": 0.035533517599105835, "rewards/margins_std": 0.03704233095049858, "rewards/rejected": -0.036252211779356, "step": 1430 }, { "epoch": 0.84, "grad_norm": 0.4453125, "learning_rate": 4.023588235778019e-08, "logits/chosen": 0.048088885843753815, "logits/rejected": 0.4085961878299713, "logps/chosen": -235.32763671875, "logps/rejected": -246.94937133789062, "loss": 0.6625, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.017656199634075165, "rewards/margins": 0.07100087404251099, "rewards/margins_max": 0.09923985600471497, "rewards/margins_min": 0.042761895805597305, "rewards/margins_std": 0.039935946464538574, "rewards/rejected": -0.05334467440843582, "step": 1440 }, { "epoch": 0.84, "grad_norm": 0.4609375, "learning_rate": 3.752415838865664e-08, "logits/chosen": -0.09887398779392242, "logits/rejected": 0.5310045480728149, "logps/chosen": -245.59951782226562, "logps/rejected": -266.8290100097656, "loss": 0.6586, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.018602244555950165, "rewards/margins": 0.08193326741456985, "rewards/margins_max": 0.11139090359210968, "rewards/margins_min": 0.05247562378644943, "rewards/margins_std": 0.041659384965896606, "rewards/rejected": -0.06333102285861969, "step": 1450 }, { "epoch": 0.85, "grad_norm": 0.439453125, "learning_rate": 3.4899605184965206e-08, "logits/chosen": 0.03019891306757927, "logits/rejected": 0.44324207305908203, "logps/chosen": -225.20443725585938, "logps/rejected": -183.06094360351562, "loss": 0.6609, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.0028962846845388412, "rewards/margins": 0.0560896173119545, "rewards/margins_max": 0.07679092139005661, "rewards/margins_min": 0.035388313233852386, "rewards/margins_std": 0.02927606739103794, "rewards/rejected": -0.05319333076477051, "step": 1460 }, { "epoch": 0.85, "grad_norm": 0.439453125, "learning_rate": 3.23632995011732e-08, "logits/chosen": -0.06648756563663483, "logits/rejected": 0.29680854082107544, "logps/chosen": -226.04983520507812, "logps/rejected": -258.3298034667969, "loss": 0.6587, "rewards/accuracies": 1.0, "rewards/chosen": 0.03215508535504341, "rewards/margins": 0.08979654312133789, "rewards/margins_max": 0.12097585201263428, "rewards/margins_min": 0.058617234230041504, "rewards/margins_std": 0.044094208627939224, "rewards/rejected": -0.057641465216875076, "step": 1470 }, { "epoch": 0.86, "grad_norm": 0.455078125, "learning_rate": 2.991628188714351e-08, "logits/chosen": 0.00623916694894433, "logits/rejected": 0.48251962661743164, "logps/chosen": -313.39935302734375, "logps/rejected": -245.91720581054688, "loss": 0.6596, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.004381291568279266, "rewards/margins": 0.07124367356300354, "rewards/margins_max": 0.09969727694988251, "rewards/margins_min": 0.04279007390141487, "rewards/margins_std": 0.04023946821689606, "rewards/rejected": -0.06686238944530487, "step": 1480 }, { "epoch": 0.86, "grad_norm": 0.4375, "learning_rate": 2.755955626123596e-08, "logits/chosen": 0.12439896166324615, "logits/rejected": 0.6011586785316467, "logps/chosen": -250.7643585205078, "logps/rejected": -217.0757293701172, "loss": 0.6624, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.018308712169528008, "rewards/margins": 0.05787688493728638, "rewards/margins_max": 0.09185748547315598, "rewards/margins_min": 0.023896273225545883, "rewards/margins_std": 0.04805583506822586, "rewards/rejected": -0.03956816717982292, "step": 1490 }, { "epoch": 0.87, "grad_norm": 0.42578125, "learning_rate": 2.5294089498438225e-08, "logits/chosen": 0.024487819522619247, "logits/rejected": 0.5533932447433472, "logps/chosen": -245.57492065429688, "logps/rejected": -220.93258666992188, "loss": 0.6584, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.010946778580546379, "rewards/margins": 0.06493957340717316, "rewards/margins_max": 0.0981217697262764, "rewards/margins_min": 0.03175736218690872, "rewards/margins_std": 0.046926725655794144, "rewards/rejected": -0.05399278551340103, "step": 1500 }, { "epoch": 0.88, "grad_norm": 0.48046875, "learning_rate": 2.312081103369354e-08, "logits/chosen": 0.10629892349243164, "logits/rejected": 0.5729449987411499, "logps/chosen": -227.0969696044922, "logps/rejected": -209.62841796875, "loss": 0.659, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.013625606894493103, "rewards/margins": 0.05797373503446579, "rewards/margins_max": 0.0893624946475029, "rewards/margins_min": 0.02658497728407383, "rewards/margins_std": 0.04439040273427963, "rewards/rejected": -0.04434812813997269, "step": 1510 }, { "epoch": 0.88, "grad_norm": 0.48046875, "learning_rate": 2.104061248058872e-08, "logits/chosen": 0.10214777290821075, "logits/rejected": 0.4200982451438904, "logps/chosen": -213.7083740234375, "logps/rejected": -225.8516845703125, "loss": 0.6666, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.018484923988580704, "rewards/margins": 0.058260779827833176, "rewards/margins_max": 0.08636601269245148, "rewards/margins_min": 0.030155545100569725, "rewards/margins_std": 0.03974680230021477, "rewards/rejected": -0.03977585583925247, "step": 1520 }, { "epoch": 0.89, "grad_norm": 0.44921875, "learning_rate": 1.9054347265559213e-08, "logits/chosen": 0.1583404242992401, "logits/rejected": 0.6649370193481445, "logps/chosen": -259.9563903808594, "logps/rejected": -223.4931640625, "loss": 0.6565, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.014935478568077087, "rewards/margins": 0.07356850802898407, "rewards/margins_max": 0.10868100821971893, "rewards/margins_min": 0.0384560152888298, "rewards/margins_std": 0.049656566232442856, "rewards/rejected": -0.058633022010326385, "step": 1530 }, { "epoch": 0.89, "grad_norm": 0.498046875, "learning_rate": 1.716283027776061e-08, "logits/chosen": 0.2019151747226715, "logits/rejected": 0.8282853364944458, "logps/chosen": -291.37066650390625, "logps/rejected": -222.61831665039062, "loss": 0.6634, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.016527246683835983, "rewards/margins": 0.07255034148693085, "rewards/margins_max": 0.1086968407034874, "rewards/margins_min": 0.036403849720954895, "rewards/margins_std": 0.05111886188387871, "rewards/rejected": -0.05602309852838516, "step": 1540 }, { "epoch": 0.9, "grad_norm": 0.4296875, "learning_rate": 1.536683753475043e-08, "logits/chosen": 0.22870250046253204, "logits/rejected": 0.4174967408180237, "logps/chosen": -219.11306762695312, "logps/rejected": -241.36563110351562, "loss": 0.6615, "rewards/accuracies": 1.0, "rewards/chosen": -0.0025456459261476994, "rewards/margins": 0.059264473617076874, "rewards/margins_max": 0.08250005543231964, "rewards/margins_min": 0.036028891801834106, "rewards/margins_std": 0.032860077917575836, "rewards/rejected": -0.061810124665498734, "step": 1550 }, { "epoch": 0.9, "grad_norm": 0.390625, "learning_rate": 1.3667105864117873e-08, "logits/chosen": 0.21612632274627686, "logits/rejected": 0.39824485778808594, "logps/chosen": -200.84498596191406, "logps/rejected": -228.2679901123047, "loss": 0.6605, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.008642548695206642, "rewards/margins": 0.0651601254940033, "rewards/margins_max": 0.10423406213521957, "rewards/margins_min": 0.026086175814270973, "rewards/margins_std": 0.05525890737771988, "rewards/rejected": -0.0565175786614418, "step": 1560 }, { "epoch": 0.91, "grad_norm": 0.41796875, "learning_rate": 1.2064332601191163e-08, "logits/chosen": -0.04893340915441513, "logits/rejected": 0.339263916015625, "logps/chosen": -222.4666748046875, "logps/rejected": -217.02999877929688, "loss": 0.6612, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.0008535057422704995, "rewards/margins": 0.05954117700457573, "rewards/margins_max": 0.0829622894525528, "rewards/margins_min": 0.03612007200717926, "rewards/margins_std": 0.03312245011329651, "rewards/rejected": -0.06039468199014664, "step": 1570 }, { "epoch": 0.92, "grad_norm": 0.39453125, "learning_rate": 1.0559175302947476e-08, "logits/chosen": 0.012552693486213684, "logits/rejected": 0.5173078775405884, "logps/chosen": -260.0834045410156, "logps/rejected": -247.43447875976562, "loss": 0.6595, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.011661765165627003, "rewards/margins": 0.06366874277591705, "rewards/margins_max": 0.09778660535812378, "rewards/margins_min": 0.029550885781645775, "rewards/margins_std": 0.04824993759393692, "rewards/rejected": -0.052006978541612625, "step": 1580 }, { "epoch": 0.92, "grad_norm": 0.349609375, "learning_rate": 9.152251478242417e-09, "logits/chosen": -0.02594194933772087, "logits/rejected": 0.4399421215057373, "logps/chosen": -212.4099578857422, "logps/rejected": -199.73458862304688, "loss": 0.6594, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.007081738207489252, "rewards/margins": 0.06215248256921768, "rewards/margins_max": 0.08854631334543228, "rewards/margins_min": 0.03575865179300308, "rewards/margins_std": 0.03732650727033615, "rewards/rejected": -0.055070746690034866, "step": 1590 }, { "epoch": 0.93, "grad_norm": 0.427734375, "learning_rate": 7.844138334469425e-09, "logits/chosen": 0.4558231234550476, "logits/rejected": 0.8965223431587219, "logps/chosen": -201.3118438720703, "logps/rejected": -192.5732421875, "loss": 0.6628, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.009340132586658001, "rewards/margins": 0.0616113655269146, "rewards/margins_max": 0.09181926399469376, "rewards/margins_min": 0.03140346333384514, "rewards/margins_std": 0.04272041842341423, "rewards/rejected": -0.05227123573422432, "step": 1600 }, { "epoch": 0.93, "grad_norm": 0.37109375, "learning_rate": 6.635372540753498e-09, "logits/chosen": 0.11258337646722794, "logits/rejected": 0.6999211311340332, "logps/chosen": -240.33975219726562, "logps/rejected": -214.0699920654297, "loss": 0.6577, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.027147358283400536, "rewards/margins": 0.0817473754286766, "rewards/margins_max": 0.12004182487726212, "rewards/margins_min": 0.0434529110789299, "rewards/margins_std": 0.05415653437376022, "rewards/rejected": -0.05460001155734062, "step": 1610 }, { "epoch": 0.94, "grad_norm": 0.4609375, "learning_rate": 5.526450007776435e-09, "logits/chosen": 0.1300087720155716, "logits/rejected": 0.5238357782363892, "logps/chosen": -292.7140197753906, "logps/rejected": -246.2644805908203, "loss": 0.6611, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.0037552430294454098, "rewards/margins": 0.05609096214175224, "rewards/margins_max": 0.07447664439678192, "rewards/margins_min": 0.03770528361201286, "rewards/margins_std": 0.026001274585723877, "rewards/rejected": -0.052335720509290695, "step": 1620 }, { "epoch": 0.95, "grad_norm": 0.431640625, "learning_rate": 4.517825684323323e-09, "logits/chosen": 0.18602465093135834, "logits/rejected": 0.5172281861305237, "logps/chosen": -223.3422088623047, "logps/rejected": -241.034912109375, "loss": 0.6596, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.00845097191631794, "rewards/margins": 0.06410791724920273, "rewards/margins_max": 0.09119440615177155, "rewards/margins_min": 0.037021439522504807, "rewards/margins_std": 0.03830606862902641, "rewards/rejected": -0.05565694719552994, "step": 1630 }, { "epoch": 0.95, "grad_norm": 0.474609375, "learning_rate": 3.6099133706344044e-09, "logits/chosen": 0.13008326292037964, "logits/rejected": 0.6074930429458618, "logps/chosen": -223.1219940185547, "logps/rejected": -207.696044921875, "loss": 0.6569, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.02304968610405922, "rewards/margins": 0.07380314916372299, "rewards/margins_max": 0.09590893238782883, "rewards/margins_min": 0.05169736221432686, "rewards/margins_std": 0.03126230835914612, "rewards/rejected": -0.05075346678495407, "step": 1640 }, { "epoch": 0.96, "grad_norm": 0.4921875, "learning_rate": 2.8030855486386174e-09, "logits/chosen": 0.28828924894332886, "logits/rejected": 0.6710017919540405, "logps/chosen": -256.94903564453125, "logps/rejected": -281.40411376953125, "loss": 0.6586, "rewards/accuracies": 1.0, "rewards/chosen": 0.023295782506465912, "rewards/margins": 0.071876659989357, "rewards/margins_max": 0.09554243832826614, "rewards/margins_min": 0.048210885375738144, "rewards/margins_std": 0.03346845880150795, "rewards/rejected": -0.04858088120818138, "step": 1650 }, { "epoch": 0.96, "grad_norm": 0.515625, "learning_rate": 2.097673229138286e-09, "logits/chosen": 0.16988131403923035, "logits/rejected": 0.47897881269454956, "logps/chosen": -224.6415557861328, "logps/rejected": -232.2594451904297, "loss": 0.6587, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.012618700973689556, "rewards/margins": 0.07099349051713943, "rewards/margins_max": 0.10776303708553314, "rewards/margins_min": 0.03422392159700394, "rewards/margins_std": 0.0520000159740448, "rewards/rejected": -0.05837478116154671, "step": 1660 }, { "epoch": 0.97, "grad_norm": 0.44921875, "learning_rate": 1.493965816008136e-09, "logits/chosen": -0.009510600939393044, "logits/rejected": 0.3807966113090515, "logps/chosen": -211.14254760742188, "logps/rejected": -236.635498046875, "loss": 0.6601, "rewards/accuracies": 1.0, "rewards/chosen": 0.00740268686786294, "rewards/margins": 0.07398059964179993, "rewards/margins_max": 0.10376466810703278, "rewards/margins_min": 0.04419652372598648, "rewards/margins_std": 0.0421210452914238, "rewards/rejected": -0.06657791137695312, "step": 1670 }, { "epoch": 0.97, "grad_norm": 0.447265625, "learning_rate": 9.922109874636875e-10, "logits/chosen": 0.19054090976715088, "logits/rejected": 0.557522177696228, "logps/chosen": -233.7532501220703, "logps/rejected": -239.6273651123047, "loss": 0.6579, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.015365364961326122, "rewards/margins": 0.08128596842288971, "rewards/margins_max": 0.11999186128377914, "rewards/margins_min": 0.04258008301258087, "rewards/margins_std": 0.05473839119076729, "rewards/rejected": -0.06592060625553131, "step": 1680 }, { "epoch": 0.98, "grad_norm": 0.416015625, "learning_rate": 5.926145944483984e-10, "logits/chosen": 0.04970569908618927, "logits/rejected": 0.41454869508743286, "logps/chosen": -197.70941162109375, "logps/rejected": -207.9854278564453, "loss": 0.6625, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.009294511750340462, "rewards/margins": 0.05480729788541794, "rewards/margins_max": 0.08153598010540009, "rewards/margins_min": 0.02807862125337124, "rewards/margins_std": 0.03780006244778633, "rewards/rejected": -0.04551279544830322, "step": 1690 }, { "epoch": 0.99, "grad_norm": 0.42578125, "learning_rate": 2.9534057618091356e-10, "logits/chosen": 0.1366875320672989, "logits/rejected": 0.4813140034675598, "logps/chosen": -195.55368041992188, "logps/rejected": -211.63711547851562, "loss": 0.6599, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.014302869327366352, "rewards/margins": 0.0652112141251564, "rewards/margins_max": 0.09685875475406647, "rewards/margins_min": 0.03356366977095604, "rewards/margins_std": 0.04475637897849083, "rewards/rejected": -0.05090833827853203, "step": 1700 }, { "epoch": 0.99, "grad_norm": 0.4453125, "learning_rate": 1.0051089289686565e-10, "logits/chosen": 0.20965194702148438, "logits/rejected": 0.5980690121650696, "logps/chosen": -218.3548583984375, "logps/rejected": -252.60159301757812, "loss": 0.6601, "rewards/accuracies": 1.0, "rewards/chosen": 0.01929156482219696, "rewards/margins": 0.06570716202259064, "rewards/margins_max": 0.09711313247680664, "rewards/margins_min": 0.03430120274424553, "rewards/margins_std": 0.044414736330509186, "rewards/rejected": -0.04641559720039368, "step": 1710 }, { "epoch": 1.0, "grad_norm": 0.404296875, "learning_rate": 8.205475813372054e-12, "logits/chosen": 0.07036467641592026, "logits/rejected": 0.6885267496109009, "logps/chosen": -334.186279296875, "logps/rejected": -232.6072998046875, "loss": 0.6604, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.015851657837629318, "rewards/margins": 0.06690393388271332, "rewards/margins_max": 0.0959465354681015, "rewards/margins_min": 0.037861332297325134, "rewards/margins_std": 0.041072435677051544, "rewards/rejected": -0.0510522723197937, "step": 1720 }, { "epoch": 1.0, "eval_logits/chosen": 0.7297662496566772, "eval_logits/rejected": 0.8997808694839478, "eval_logps/chosen": -337.8507080078125, "eval_logps/rejected": -318.01556396484375, "eval_loss": 0.6928703784942627, "eval_rewards/accuracies": 0.5364999771118164, "eval_rewards/chosen": 0.002909434260800481, "eval_rewards/margins": 0.0005662557086907327, "eval_rewards/margins_max": 0.07228709012269974, "eval_rewards/margins_min": -0.08225506544113159, "eval_rewards/margins_std": 0.050406549125909805, "eval_rewards/rejected": 0.002343178726732731, "eval_runtime": 864.7602, "eval_samples_per_second": 9.251, "eval_steps_per_second": 0.289, "step": 1724 }, { "epoch": 1.0, "step": 1724, "total_flos": 0.0, "train_loss": 0.6676546893927447, "train_runtime": 9120.8228, "train_samples_per_second": 3.024, "train_steps_per_second": 0.189 } ], "logging_steps": 10, "max_steps": 1724, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }