diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,3163 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 100, + "global_step": 1724, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 0.43359375, + "learning_rate": 2.890173410404624e-09, + "logits/chosen": 0.1325806975364685, + "logits/rejected": 0.3077998757362366, + "logps/chosen": -239.35935974121094, + "logps/rejected": -304.581298828125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/margins_max": 0.0, + "rewards/margins_min": 0.0, + "rewards/margins_std": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.01, + "grad_norm": 0.416015625, + "learning_rate": 2.890173410404624e-08, + "logits/chosen": -0.010774746537208557, + "logits/rejected": 0.23452165722846985, + "logps/chosen": -243.3074493408203, + "logps/rejected": -304.1199035644531, + "loss": 0.6932, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.00028879166347905993, + "rewards/margins": 0.0006378353573381901, + "rewards/margins_max": 0.0028404404874891043, + "rewards/margins_min": -0.0015647696563974023, + "rewards/margins_std": 0.0031149541027843952, + "rewards/rejected": -0.00034904375206679106, + "step": 10 + }, + { + "epoch": 0.01, + "grad_norm": 0.400390625, + "learning_rate": 5.780346820809248e-08, + "logits/chosen": -0.05719061568379402, + "logits/rejected": 0.5148837566375732, + "logps/chosen": -272.7169494628906, + "logps/rejected": -216.58859252929688, + "loss": 0.6931, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.0008704366046003997, + "rewards/margins": 0.0001740378502290696, + "rewards/margins_max": 0.0022189407609403133, + "rewards/margins_min": -0.0018708650022745132, + "rewards/margins_std": 0.002891929354518652, + "rewards/rejected": -0.0010444745421409607, + "step": 20 + }, + { + "epoch": 0.02, + "grad_norm": 0.4921875, + "learning_rate": 8.670520231213872e-08, + "logits/chosen": 0.05507341027259827, + "logits/rejected": 0.5646872520446777, + "logps/chosen": -272.96728515625, + "logps/rejected": -252.10733032226562, + "loss": 0.6932, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.0014279346214607358, + "rewards/margins": -0.001033178297802806, + "rewards/margins_max": 0.002007028553634882, + "rewards/margins_min": -0.004073385149240494, + "rewards/margins_std": 0.00429950188845396, + "rewards/rejected": -0.00039475635276176035, + "step": 30 + }, + { + "epoch": 0.02, + "grad_norm": 0.447265625, + "learning_rate": 1.1560693641618496e-07, + "logits/chosen": -0.08530770242214203, + "logits/rejected": 0.37523841857910156, + "logps/chosen": -256.03692626953125, + "logps/rejected": -224.8648223876953, + "loss": 0.6932, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.0013576907804235816, + "rewards/margins": -0.0014004515251144767, + "rewards/margins_max": 0.0015217246254906058, + "rewards/margins_min": -0.004322628024965525, + "rewards/margins_std": 0.0041325814090669155, + "rewards/rejected": 4.276079198461957e-05, + "step": 40 + }, + { + "epoch": 0.03, + "grad_norm": 0.45703125, + "learning_rate": 1.445086705202312e-07, + "logits/chosen": 0.10976707935333252, + "logits/rejected": 0.40187758207321167, + "logps/chosen": -205.61318969726562, + "logps/rejected": -214.9802703857422, + "loss": 0.693, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.0007841205224394798, + "rewards/margins": 0.0018329259473830462, + "rewards/margins_max": 0.004336017183959484, + "rewards/margins_min": -0.0006701658712700009, + "rewards/margins_std": 0.0035399063490331173, + "rewards/rejected": -0.0010488051921129227, + "step": 50 + }, + { + "epoch": 0.03, + "grad_norm": 0.39453125, + "learning_rate": 1.7341040462427744e-07, + "logits/chosen": 0.2901094853878021, + "logits/rejected": 0.4794164299964905, + "logps/chosen": -207.44509887695312, + "logps/rejected": -231.39382934570312, + "loss": 0.693, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": -0.001270442851819098, + "rewards/margins": -0.0007280521094799042, + "rewards/margins_max": 0.0019893264397978783, + "rewards/margins_min": -0.0034454308915883303, + "rewards/margins_std": 0.0038429535925388336, + "rewards/rejected": -0.0005423908005468547, + "step": 60 + }, + { + "epoch": 0.04, + "grad_norm": 0.435546875, + "learning_rate": 2.023121387283237e-07, + "logits/chosen": 0.035371266305446625, + "logits/rejected": 0.4755796492099762, + "logps/chosen": -259.833740234375, + "logps/rejected": -226.2167205810547, + "loss": 0.6929, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.0010710505302995443, + "rewards/margins": 0.0011786860413849354, + "rewards/margins_max": 0.004792899824678898, + "rewards/margins_min": -0.002435527741909027, + "rewards/margins_std": 0.005111270118504763, + "rewards/rejected": -0.0022497368045151234, + "step": 70 + }, + { + "epoch": 0.05, + "grad_norm": 0.4609375, + "learning_rate": 2.3121387283236991e-07, + "logits/chosen": 0.27303510904312134, + "logits/rejected": 0.7382463216781616, + "logps/chosen": -217.78671264648438, + "logps/rejected": -208.35910034179688, + "loss": 0.6928, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.2639263988821767e-05, + "rewards/margins": 0.0014770211419090629, + "rewards/margins_max": 0.0042491876520216465, + "rewards/margins_min": -0.0012951450189575553, + "rewards/margins_std": 0.003920434974133968, + "rewards/rejected": -0.0014996604295447469, + "step": 80 + }, + { + "epoch": 0.05, + "grad_norm": 0.6640625, + "learning_rate": 2.601156069364162e-07, + "logits/chosen": -0.20650863647460938, + "logits/rejected": 0.17405006289482117, + "logps/chosen": -226.12808227539062, + "logps/rejected": -233.56381225585938, + "loss": 0.692, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.000633719377219677, + "rewards/margins": 0.0017947215819731355, + "rewards/margins_max": 0.004501459188759327, + "rewards/margins_min": -0.0009120159666053951, + "rewards/margins_std": 0.0038279048167169094, + "rewards/rejected": -0.0011610020883381367, + "step": 90 + }, + { + "epoch": 0.06, + "grad_norm": 0.431640625, + "learning_rate": 2.890173410404624e-07, + "logits/chosen": -0.019260473549365997, + "logits/rejected": 0.5504380464553833, + "logps/chosen": -292.51995849609375, + "logps/rejected": -235.86843872070312, + "loss": 0.6919, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.001650218851864338, + "rewards/margins": 0.002649242291226983, + "rewards/margins_max": 0.005218566861003637, + "rewards/margins_min": 7.99179106252268e-05, + "rewards/margins_std": 0.0036335731856524944, + "rewards/rejected": -0.0009990233229473233, + "step": 100 + }, + { + "epoch": 0.06, + "grad_norm": 0.53125, + "learning_rate": 3.1791907514450865e-07, + "logits/chosen": -0.06840448081493378, + "logits/rejected": 0.6899427175521851, + "logps/chosen": -252.0308380126953, + "logps/rejected": -199.84799194335938, + "loss": 0.6918, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0018273231107741594, + "rewards/margins": 0.00415054801851511, + "rewards/margins_max": 0.0076604606583714485, + "rewards/margins_min": 0.0006406344473361969, + "rewards/margins_std": 0.004963767249137163, + "rewards/rejected": -0.0023232249077409506, + "step": 110 + }, + { + "epoch": 0.07, + "grad_norm": 0.36328125, + "learning_rate": 3.468208092485549e-07, + "logits/chosen": 0.09203040599822998, + "logits/rejected": 0.5125548243522644, + "logps/chosen": -256.213623046875, + "logps/rejected": -232.49942016601562, + "loss": 0.6915, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0007183876005001366, + "rewards/margins": 0.004233072511851788, + "rewards/margins_max": 0.007029411382973194, + "rewards/margins_min": 0.0014367332914844155, + "rewards/margins_std": 0.003954620566219091, + "rewards/rejected": -0.0035146852023899555, + "step": 120 + }, + { + "epoch": 0.08, + "grad_norm": 0.462890625, + "learning_rate": 3.757225433526011e-07, + "logits/chosen": -0.027632858604192734, + "logits/rejected": 0.39557844400405884, + "logps/chosen": -266.2771911621094, + "logps/rejected": -271.76116943359375, + "loss": 0.6907, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.002352924318984151, + "rewards/margins": 0.005208231043070555, + "rewards/margins_max": 0.008825947530567646, + "rewards/margins_min": 0.001590514904819429, + "rewards/margins_std": 0.005116222891956568, + "rewards/rejected": -0.00285530649125576, + "step": 130 + }, + { + "epoch": 0.08, + "grad_norm": 0.40625, + "learning_rate": 4.046242774566474e-07, + "logits/chosen": 0.06764040887355804, + "logits/rejected": 0.3966519236564636, + "logps/chosen": -178.83749389648438, + "logps/rejected": -188.39877319335938, + "loss": 0.6908, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.0029165446758270264, + "rewards/margins": 0.006306161172688007, + "rewards/margins_max": 0.009462257847189903, + "rewards/margins_min": 0.0031500644981861115, + "rewards/margins_std": 0.004463394172489643, + "rewards/rejected": -0.0033896160311996937, + "step": 140 + }, + { + "epoch": 0.09, + "grad_norm": 0.447265625, + "learning_rate": 4.3352601156069365e-07, + "logits/chosen": 0.011811649426817894, + "logits/rejected": 0.4984157979488373, + "logps/chosen": -268.1231994628906, + "logps/rejected": -223.78799438476562, + "loss": 0.6899, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.002369340742006898, + "rewards/margins": 0.006674068979918957, + "rewards/margins_max": 0.013764929957687855, + "rewards/margins_min": -0.0004167918232269585, + "rewards/margins_std": 0.010027991607785225, + "rewards/rejected": -0.0043047284707427025, + "step": 150 + }, + { + "epoch": 0.09, + "grad_norm": 0.322265625, + "learning_rate": 4.6242774566473983e-07, + "logits/chosen": -0.03828499838709831, + "logits/rejected": 0.3794795870780945, + "logps/chosen": -245.52865600585938, + "logps/rejected": -234.1727752685547, + "loss": 0.689, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.004552280530333519, + "rewards/margins": 0.008487861603498459, + "rewards/margins_max": 0.012918056920170784, + "rewards/margins_min": 0.004057666752487421, + "rewards/margins_std": 0.006265241652727127, + "rewards/rejected": -0.003935581538826227, + "step": 160 + }, + { + "epoch": 0.1, + "grad_norm": 0.49609375, + "learning_rate": 4.913294797687861e-07, + "logits/chosen": -0.0168992280960083, + "logits/rejected": 0.500325620174408, + "logps/chosen": -296.49517822265625, + "logps/rejected": -248.3328094482422, + "loss": 0.6887, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.003083079354837537, + "rewards/margins": 0.006065175868570805, + "rewards/margins_max": 0.011483820155262947, + "rewards/margins_min": 0.0006465300684794784, + "rewards/margins_std": 0.0076631223782896996, + "rewards/rejected": -0.002982096979394555, + "step": 170 + }, + { + "epoch": 0.1, + "grad_norm": 0.40625, + "learning_rate": 4.999748710138438e-07, + "logits/chosen": 0.14815935492515564, + "logits/rejected": 0.5510139465332031, + "logps/chosen": -233.9811553955078, + "logps/rejected": -228.5449676513672, + "loss": 0.688, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.003167560789734125, + "rewards/margins": 0.007796141318976879, + "rewards/margins_max": 0.012642833404242992, + "rewards/margins_min": 0.002949449699372053, + "rewards/margins_std": 0.006854257546365261, + "rewards/rejected": -0.004628580994904041, + "step": 180 + }, + { + "epoch": 0.11, + "grad_norm": 0.416015625, + "learning_rate": 4.998518024263461e-07, + "logits/chosen": 0.19040322303771973, + "logits/rejected": 0.6236617565155029, + "logps/chosen": -230.96762084960938, + "logps/rejected": -211.4745330810547, + "loss": 0.6871, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.006373309530317783, + "rewards/margins": 0.012960617430508137, + "rewards/margins_max": 0.01996336504817009, + "rewards/margins_min": 0.0059578740037977695, + "rewards/margins_std": 0.0099033759906888, + "rewards/rejected": -0.006587309297174215, + "step": 190 + }, + { + "epoch": 0.12, + "grad_norm": 0.416015625, + "learning_rate": 4.996262291366814e-07, + "logits/chosen": 0.054732900112867355, + "logits/rejected": 0.22424785792827606, + "logps/chosen": -210.0012664794922, + "logps/rejected": -233.76388549804688, + "loss": 0.6873, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.004412280861288309, + "rewards/margins": 0.011961949989199638, + "rewards/margins_max": 0.017657486721873283, + "rewards/margins_min": 0.006266415119171143, + "rewards/margins_std": 0.0080547034740448, + "rewards/rejected": -0.007549669593572617, + "step": 200 + }, + { + "epoch": 0.12, + "grad_norm": 0.498046875, + "learning_rate": 4.992982436890003e-07, + "logits/chosen": 0.09016792476177216, + "logits/rejected": 0.45956069231033325, + "logps/chosen": -226.3985595703125, + "logps/rejected": -221.092529296875, + "loss": 0.6868, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.005489318631589413, + "rewards/margins": 0.013238553889095783, + "rewards/margins_max": 0.018587926402688026, + "rewards/margins_min": 0.00788918323814869, + "rewards/margins_std": 0.007565152831375599, + "rewards/rejected": -0.007749234326183796, + "step": 210 + }, + { + "epoch": 0.13, + "grad_norm": 0.458984375, + "learning_rate": 4.988679806432711e-07, + "logits/chosen": -0.08951343595981598, + "logits/rejected": 0.46994414925575256, + "logps/chosen": -264.4379577636719, + "logps/rejected": -236.77346801757812, + "loss": 0.6853, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.007678179536014795, + "rewards/margins": 0.01784335821866989, + "rewards/margins_max": 0.025632936507463455, + "rewards/margins_min": 0.010053779929876328, + "rewards/margins_std": 0.011016124859452248, + "rewards/rejected": -0.010165175423026085, + "step": 220 + }, + { + "epoch": 0.13, + "grad_norm": 0.474609375, + "learning_rate": 4.983356165200751e-07, + "logits/chosen": 0.07358375936746597, + "logits/rejected": 0.617803692817688, + "logps/chosen": -276.56536865234375, + "logps/rejected": -237.3117218017578, + "loss": 0.6848, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.0074386284686625, + "rewards/margins": 0.01824963092803955, + "rewards/margins_max": 0.026552444323897362, + "rewards/margins_min": 0.00994681753218174, + "rewards/margins_std": 0.01174195110797882, + "rewards/rejected": -0.010811002925038338, + "step": 230 + }, + { + "epoch": 0.14, + "grad_norm": 0.4296875, + "learning_rate": 4.977013697281864e-07, + "logits/chosen": 0.23069170117378235, + "logits/rejected": 0.546830952167511, + "logps/chosen": -229.92764282226562, + "logps/rejected": -231.63357543945312, + "loss": 0.6848, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.005361995659768581, + "rewards/margins": 0.015256190672516823, + "rewards/margins_max": 0.022752556949853897, + "rewards/margins_min": 0.007759819272905588, + "rewards/margins_std": 0.010601467452943325, + "rewards/rejected": -0.009894194081425667, + "step": 240 + }, + { + "epoch": 0.15, + "grad_norm": 0.412109375, + "learning_rate": 4.969655004749673e-07, + "logits/chosen": 0.05646086856722832, + "logits/rejected": 0.3687281012535095, + "logps/chosen": -203.8467559814453, + "logps/rejected": -216.0234375, + "loss": 0.6846, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.002810864243656397, + "rewards/margins": 0.014029537327587605, + "rewards/margins_max": 0.019475888460874557, + "rewards/margins_min": 0.008583188988268375, + "rewards/margins_std": 0.007702300790697336, + "rewards/rejected": -0.011218673549592495, + "step": 250 + }, + { + "epoch": 0.15, + "grad_norm": 0.490234375, + "learning_rate": 4.961283106596155e-07, + "logits/chosen": 0.1512751430273056, + "logits/rejected": 0.5323320627212524, + "logps/chosen": -256.96673583984375, + "logps/rejected": -265.65509033203125, + "loss": 0.6829, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.011281570419669151, + "rewards/margins": 0.0202823244035244, + "rewards/margins_max": 0.02979358099400997, + "rewards/margins_min": 0.010771063156425953, + "rewards/margins_std": 0.013450953178107738, + "rewards/rejected": -0.009000752121210098, + "step": 260 + }, + { + "epoch": 0.16, + "grad_norm": 0.447265625, + "learning_rate": 4.951901437493054e-07, + "logits/chosen": 0.08749596029520035, + "logits/rejected": 0.47565847635269165, + "logps/chosen": -252.97323608398438, + "logps/rejected": -220.1329803466797, + "loss": 0.6826, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.005718126427382231, + "rewards/margins": 0.019988398998975754, + "rewards/margins_max": 0.025959456339478493, + "rewards/margins_min": 0.014017338864505291, + "rewards/margins_std": 0.008444352075457573, + "rewards/rejected": -0.014270270243287086, + "step": 270 + }, + { + "epoch": 0.16, + "grad_norm": 0.453125, + "learning_rate": 4.941513846382779e-07, + "logits/chosen": 0.31170374155044556, + "logits/rejected": 0.6478020548820496, + "logps/chosen": -207.89794921875, + "logps/rejected": -225.51791381835938, + "loss": 0.6828, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.010051739402115345, + "rewards/margins": 0.019436318427324295, + "rewards/margins_max": 0.025176430121064186, + "rewards/margins_min": 0.013696206733584404, + "rewards/margins_std": 0.008117742836475372, + "rewards/rejected": -0.009384581819176674, + "step": 280 + }, + { + "epoch": 0.17, + "grad_norm": 0.431640625, + "learning_rate": 4.930124594899313e-07, + "logits/chosen": 0.14136287569999695, + "logits/rejected": 0.5530031323432922, + "logps/chosen": -244.9897918701172, + "logps/rejected": -244.90457153320312, + "loss": 0.6814, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.0166664756834507, + "rewards/margins": 0.02829556167125702, + "rewards/margins_max": 0.037106942385435104, + "rewards/margins_min": 0.019484177231788635, + "rewards/margins_std": 0.012461178004741669, + "rewards/rejected": -0.011629085056483746, + "step": 290 + }, + { + "epoch": 0.17, + "grad_norm": 0.494140625, + "learning_rate": 4.917738355619842e-07, + "logits/chosen": 0.2040259838104248, + "logits/rejected": 0.6138412356376648, + "logps/chosen": -193.21507263183594, + "logps/rejected": -194.8699188232422, + "loss": 0.6796, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.012191513553261757, + "rewards/margins": 0.026244569569826126, + "rewards/margins_max": 0.036748819053173065, + "rewards/margins_min": 0.015740320086479187, + "rewards/margins_std": 0.014855247922241688, + "rewards/rejected": -0.01405305415391922, + "step": 300 + }, + { + "epoch": 0.18, + "grad_norm": 0.453125, + "learning_rate": 4.904360210147762e-07, + "logits/chosen": 0.1507195234298706, + "logits/rejected": 0.5720406174659729, + "logps/chosen": -242.0141143798828, + "logps/rejected": -216.76132202148438, + "loss": 0.6791, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.010296806693077087, + "rewards/margins": 0.02473880909383297, + "rewards/margins_max": 0.036660365760326385, + "rewards/margins_min": 0.012817250564694405, + "rewards/margins_std": 0.0168596301227808, + "rewards/rejected": -0.014442001469433308, + "step": 310 + }, + { + "epoch": 0.19, + "grad_norm": 0.41796875, + "learning_rate": 4.8899956470279e-07, + "logits/chosen": -0.03488525375723839, + "logits/rejected": 0.40159520506858826, + "logps/chosen": -218.23812866210938, + "logps/rejected": -190.8876953125, + "loss": 0.679, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.014135973528027534, + "rewards/margins": 0.02363484725356102, + "rewards/margins_max": 0.036806877702474594, + "rewards/margins_min": 0.010462815873324871, + "rewards/margins_std": 0.018628064543008804, + "rewards/rejected": -0.00949887465685606, + "step": 320 + }, + { + "epoch": 0.19, + "grad_norm": 0.4375, + "learning_rate": 4.874650559494765e-07, + "logits/chosen": 0.10674601793289185, + "logits/rejected": 0.5667238831520081, + "logps/chosen": -242.5848388671875, + "logps/rejected": -212.60922241210938, + "loss": 0.6782, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.008991287089884281, + "rewards/margins": 0.02689727023243904, + "rewards/margins_max": 0.03854988515377045, + "rewards/margins_min": 0.015244655311107635, + "rewards/margins_std": 0.016479285433888435, + "rewards/rejected": -0.017905984073877335, + "step": 330 + }, + { + "epoch": 0.2, + "grad_norm": 0.357421875, + "learning_rate": 4.858331243054782e-07, + "logits/chosen": 0.09378918260335922, + "logits/rejected": 0.42793530225753784, + "logps/chosen": -282.80413818359375, + "logps/rejected": -245.1541748046875, + "loss": 0.6796, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.004886592272669077, + "rewards/margins": 0.021504424512386322, + "rewards/margins_max": 0.03542860597372055, + "rewards/margins_min": 0.007580241654068232, + "rewards/margins_std": 0.019691769033670425, + "rewards/rejected": -0.016617832705378532, + "step": 340 + }, + { + "epoch": 0.2, + "grad_norm": 0.486328125, + "learning_rate": 4.841044392903481e-07, + "logits/chosen": 0.1290682703256607, + "logits/rejected": 0.6047347784042358, + "logps/chosen": -232.40908813476562, + "logps/rejected": -181.57228088378906, + "loss": 0.6783, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.008800150826573372, + "rewards/margins": 0.028118547052145004, + "rewards/margins_max": 0.04057111591100693, + "rewards/margins_min": 0.015665989369153976, + "rewards/margins_std": 0.0176105834543705, + "rewards/rejected": -0.01931839995086193, + "step": 350 + }, + { + "epoch": 0.21, + "grad_norm": 0.435546875, + "learning_rate": 4.822797101178718e-07, + "logits/chosen": -0.10504484176635742, + "logits/rejected": 0.437595933675766, + "logps/chosen": -256.3827209472656, + "logps/rejected": -231.28836059570312, + "loss": 0.6777, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.014989467337727547, + "rewards/margins": 0.03444572165608406, + "rewards/margins_max": 0.04873298108577728, + "rewards/margins_min": 0.02015846036374569, + "rewards/margins_std": 0.020205235108733177, + "rewards/rejected": -0.019456254318356514, + "step": 360 + }, + { + "epoch": 0.21, + "grad_norm": 0.390625, + "learning_rate": 4.803596854051038e-07, + "logits/chosen": -0.0018104672199115157, + "logits/rejected": 0.5270112752914429, + "logps/chosen": -251.33740234375, + "logps/rejected": -203.73886108398438, + "loss": 0.6749, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.010898159816861153, + "rewards/margins": 0.02897489070892334, + "rewards/margins_max": 0.041702691465616226, + "rewards/margins_min": 0.016247089952230453, + "rewards/margins_std": 0.01799982599914074, + "rewards/rejected": -0.018076732754707336, + "step": 370 + }, + { + "epoch": 0.22, + "grad_norm": 0.3671875, + "learning_rate": 4.783451528652382e-07, + "logits/chosen": 0.03281222656369209, + "logits/rejected": 0.3939230740070343, + "logps/chosen": -203.0167694091797, + "logps/rejected": -197.302490234375, + "loss": 0.6775, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.01019463874399662, + "rewards/margins": 0.030594149604439735, + "rewards/margins_max": 0.041967082768678665, + "rewards/margins_min": 0.019221220165491104, + "rewards/margins_std": 0.01608375459909439, + "rewards/rejected": -0.020399510860443115, + "step": 380 + }, + { + "epoch": 0.23, + "grad_norm": 0.4140625, + "learning_rate": 4.7623693898443963e-07, + "logits/chosen": 0.06993720680475235, + "logits/rejected": 0.44206172227859497, + "logps/chosen": -185.37237548828125, + "logps/rejected": -187.4385986328125, + "loss": 0.6751, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.009011445567011833, + "rewards/margins": 0.03231946378946304, + "rewards/margins_max": 0.04668620228767395, + "rewards/margins_min": 0.017952727153897285, + "rewards/margins_std": 0.02031763456761837, + "rewards/rejected": -0.02330802008509636, + "step": 390 + }, + { + "epoch": 0.23, + "grad_norm": 0.44140625, + "learning_rate": 4.740359086827685e-07, + "logits/chosen": -0.0161175187677145, + "logits/rejected": 0.4163980484008789, + "logps/chosen": -239.71432495117188, + "logps/rejected": -241.2501678466797, + "loss": 0.6737, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.018473349511623383, + "rewards/margins": 0.04534245282411575, + "rewards/margins_max": 0.06162145733833313, + "rewards/margins_min": 0.02906343713402748, + "rewards/margins_std": 0.0230219978839159, + "rewards/rejected": -0.026869099587202072, + "step": 400 + }, + { + "epoch": 0.24, + "grad_norm": 0.359375, + "learning_rate": 4.7174296495933593e-07, + "logits/chosen": -0.04076371714472771, + "logits/rejected": 0.20715077221393585, + "logps/chosen": -188.3863525390625, + "logps/rejected": -203.01266479492188, + "loss": 0.6749, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.011351143009960651, + "rewards/margins": 0.03776105120778084, + "rewards/margins_max": 0.05341630056500435, + "rewards/margins_min": 0.022105801850557327, + "rewards/margins_std": 0.022139865905046463, + "rewards/rejected": -0.026409905403852463, + "step": 410 + }, + { + "epoch": 0.24, + "grad_norm": 0.478515625, + "learning_rate": 4.6935904852183805e-07, + "logits/chosen": 0.29291218519210815, + "logits/rejected": 0.5505505800247192, + "logps/chosen": -203.9456024169922, + "logps/rejected": -217.8910369873047, + "loss": 0.6712, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.012085825204849243, + "rewards/margins": 0.038635291159152985, + "rewards/margins_max": 0.059398896992206573, + "rewards/margins_min": 0.017871689051389694, + "rewards/margins_std": 0.029364168643951416, + "rewards/rejected": -0.02654946781694889, + "step": 420 + }, + { + "epoch": 0.25, + "grad_norm": 0.431640625, + "learning_rate": 4.6688513740061965e-07, + "logits/chosen": 0.12483358383178711, + "logits/rejected": 0.46587473154067993, + "logps/chosen": -264.0867004394531, + "logps/rejected": -292.27685546875, + "loss": 0.6731, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.019537176936864853, + "rewards/margins": 0.040542975068092346, + "rewards/margins_max": 0.05839340761303902, + "rewards/margins_min": 0.022692536935210228, + "rewards/margins_std": 0.02524433098733425, + "rewards/rejected": -0.021005798131227493, + "step": 430 + }, + { + "epoch": 0.26, + "grad_norm": 0.4296875, + "learning_rate": 4.6432224654742475e-07, + "logits/chosen": -0.0027520388830453157, + "logits/rejected": 0.48325324058532715, + "logps/chosen": -231.2857208251953, + "logps/rejected": -221.3975372314453, + "loss": 0.6719, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.017787110060453415, + "rewards/margins": 0.04569714143872261, + "rewards/margins_max": 0.06507585942745209, + "rewards/margins_min": 0.026318421587347984, + "rewards/margins_std": 0.027405640110373497, + "rewards/rejected": -0.027910029515624046, + "step": 440 + }, + { + "epoch": 0.26, + "grad_norm": 0.4375, + "learning_rate": 4.616714274190011e-07, + "logits/chosen": 0.3332589566707611, + "logits/rejected": 0.5584608316421509, + "logps/chosen": -211.74325561523438, + "logps/rejected": -225.31689453125, + "loss": 0.6705, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.010198825970292091, + "rewards/margins": 0.04217001795768738, + "rewards/margins_max": 0.0582113042473793, + "rewards/margins_min": 0.026128727942705154, + "rewards/margins_std": 0.022685810923576355, + "rewards/rejected": -0.031971193850040436, + "step": 450 + }, + { + "epoch": 0.27, + "grad_norm": 0.435546875, + "learning_rate": 4.589337675457273e-07, + "logits/chosen": 0.10014849901199341, + "logits/rejected": 0.564907431602478, + "logps/chosen": -217.19985961914062, + "logps/rejected": -214.29440307617188, + "loss": 0.6713, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.018607165664434433, + "rewards/margins": 0.05433148890733719, + "rewards/margins_max": 0.07488565146923065, + "rewards/margins_min": 0.033777330070734024, + "rewards/margins_std": 0.02906796894967556, + "rewards/rejected": -0.03572431951761246, + "step": 460 + }, + { + "epoch": 0.27, + "grad_norm": 0.4609375, + "learning_rate": 4.5611039008544007e-07, + "logits/chosen": 0.13153567910194397, + "logits/rejected": 0.652635931968689, + "logps/chosen": -261.8456726074219, + "logps/rejected": -231.66531372070312, + "loss": 0.671, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.013766567222774029, + "rewards/margins": 0.04572372883558273, + "rewards/margins_max": 0.06320376694202423, + "rewards/margins_min": 0.028243690729141235, + "rewards/margins_std": 0.024720508605241776, + "rewards/rejected": -0.03195716068148613, + "step": 470 + }, + { + "epoch": 0.28, + "grad_norm": 0.419921875, + "learning_rate": 4.532024533626457e-07, + "logits/chosen": 0.0050893365405499935, + "logits/rejected": 0.3075583577156067, + "logps/chosen": -214.87033081054688, + "logps/rejected": -231.591064453125, + "loss": 0.6694, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.012458743527531624, + "rewards/margins": 0.046287618577480316, + "rewards/margins_max": 0.06574501842260361, + "rewards/margins_min": 0.026830215007066727, + "rewards/margins_std": 0.02751692570745945, + "rewards/rejected": -0.03382887691259384, + "step": 480 + }, + { + "epoch": 0.28, + "grad_norm": 0.435546875, + "learning_rate": 4.502111503933032e-07, + "logits/chosen": 0.16573339700698853, + "logits/rejected": 0.5059231519699097, + "logps/chosen": -214.00900268554688, + "logps/rejected": -226.75070190429688, + "loss": 0.6705, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.011546745896339417, + "rewards/margins": 0.03893359750509262, + "rewards/margins_max": 0.0571872778236866, + "rewards/margins_min": 0.020679913461208344, + "rewards/margins_std": 0.0258146021515131, + "rewards/rejected": -0.027386849746108055, + "step": 490 + }, + { + "epoch": 0.29, + "grad_norm": 0.42578125, + "learning_rate": 4.471377083953753e-07, + "logits/chosen": 0.19767063856124878, + "logits/rejected": 0.6161295175552368, + "logps/chosen": -211.5915985107422, + "logps/rejected": -231.336669921875, + "loss": 0.6672, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.021602794528007507, + "rewards/margins": 0.05690021067857742, + "rewards/margins_max": 0.08022460341453552, + "rewards/margins_min": 0.03357581049203873, + "rewards/margins_std": 0.032985687255859375, + "rewards/rejected": -0.03529741242527962, + "step": 500 + }, + { + "epoch": 0.3, + "grad_norm": 0.4609375, + "learning_rate": 4.4398338828534766e-07, + "logits/chosen": 0.051334965974092484, + "logits/rejected": 0.5114815831184387, + "logps/chosen": -252.36349487304688, + "logps/rejected": -253.6934051513672, + "loss": 0.67, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.021400339901447296, + "rewards/margins": 0.05237139016389847, + "rewards/margins_max": 0.07569600641727448, + "rewards/margins_min": 0.029046764597296715, + "rewards/margins_std": 0.03298599272966385, + "rewards/rejected": -0.030971046537160873, + "step": 510 + }, + { + "epoch": 0.3, + "grad_norm": 0.40234375, + "learning_rate": 4.407494841609224e-07, + "logits/chosen": 0.16097505390644073, + "logits/rejected": 0.503351092338562, + "logps/chosen": -187.7499542236328, + "logps/rejected": -182.64669799804688, + "loss": 0.6691, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.015485493466258049, + "rewards/margins": 0.039487432688474655, + "rewards/margins_max": 0.0597788468003273, + "rewards/margins_min": 0.019196024164557457, + "rewards/margins_std": 0.028696388006210327, + "rewards/rejected": -0.024001937359571457, + "step": 520 + }, + { + "epoch": 0.31, + "grad_norm": 0.462890625, + "learning_rate": 4.374373227700993e-07, + "logits/chosen": 0.03560265153646469, + "logits/rejected": 0.5799299478530884, + "logps/chosen": -273.8843688964844, + "logps/rejected": -234.033935546875, + "loss": 0.6673, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.007162511348724365, + "rewards/margins": 0.0483052022755146, + "rewards/margins_max": 0.06804867088794708, + "rewards/margins_min": 0.028561726212501526, + "rewards/margins_std": 0.027921488508582115, + "rewards/rejected": -0.04114269092679024, + "step": 530 + }, + { + "epoch": 0.31, + "grad_norm": 0.408203125, + "learning_rate": 4.340482629668615e-07, + "logits/chosen": 0.027306120842695236, + "logits/rejected": 0.671806812286377, + "logps/chosen": -259.85015869140625, + "logps/rejected": -201.55807495117188, + "loss": 0.6673, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.02854643389582634, + "rewards/margins": 0.0538957342505455, + "rewards/margins_max": 0.0864059180021286, + "rewards/margins_min": 0.0213855542242527, + "rewards/margins_std": 0.045976340770721436, + "rewards/rejected": -0.025349300354719162, + "step": 540 + }, + { + "epoch": 0.32, + "grad_norm": 0.3515625, + "learning_rate": 4.30583695153689e-07, + "logits/chosen": 0.04380347207188606, + "logits/rejected": 0.4509994089603424, + "logps/chosen": -273.69775390625, + "logps/rejected": -259.96966552734375, + "loss": 0.6693, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.022089816629886627, + "rewards/margins": 0.056071024388074875, + "rewards/margins_max": 0.08100839704275131, + "rewards/margins_min": 0.031133651733398438, + "rewards/margins_std": 0.035266775637865067, + "rewards/rejected": -0.033981211483478546, + "step": 550 + }, + { + "epoch": 0.32, + "grad_norm": 0.4140625, + "learning_rate": 4.2704504071112986e-07, + "logits/chosen": 0.10579466819763184, + "logits/rejected": 0.5407041311264038, + "logps/chosen": -240.98483276367188, + "logps/rejected": -211.9040985107422, + "loss": 0.6687, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.017832906916737556, + "rewards/margins": 0.05916459485888481, + "rewards/margins_max": 0.08200596272945404, + "rewards/margins_min": 0.036323241889476776, + "rewards/margins_std": 0.03230256214737892, + "rewards/rejected": -0.041331697255373, + "step": 560 + }, + { + "epoch": 0.33, + "grad_norm": 0.376953125, + "learning_rate": 4.234337514146612e-07, + "logits/chosen": 0.11410923302173615, + "logits/rejected": 0.6912606954574585, + "logps/chosen": -251.16793823242188, + "logps/rejected": -229.26553344726562, + "loss": 0.6663, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.019808156415820122, + "rewards/margins": 0.05665863677859306, + "rewards/margins_max": 0.08191566169261932, + "rewards/margins_min": 0.0314016118645668, + "rewards/margins_std": 0.03571882098913193, + "rewards/rejected": -0.036850474774837494, + "step": 570 + }, + { + "epoch": 0.34, + "grad_norm": 0.357421875, + "learning_rate": 4.197513088390813e-07, + "logits/chosen": -0.013543277978897095, + "logits/rejected": 0.37492939829826355, + "logps/chosen": -232.13333129882812, + "logps/rejected": -223.6721954345703, + "loss": 0.6657, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.014923980459570885, + "rewards/margins": 0.05013802647590637, + "rewards/margins_max": 0.07493571937084198, + "rewards/margins_min": 0.025340333580970764, + "rewards/margins_std": 0.03506923094391823, + "rewards/rejected": -0.03521404415369034, + "step": 580 + }, + { + "epoch": 0.34, + "grad_norm": 0.51171875, + "learning_rate": 4.1599922375067554e-07, + "logits/chosen": -0.03167729452252388, + "logits/rejected": 0.535004734992981, + "logps/chosen": -325.4375915527344, + "logps/rejected": -253.494873046875, + "loss": 0.6668, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.01660420373082161, + "rewards/margins": 0.059089016169309616, + "rewards/margins_max": 0.08827444911003113, + "rewards/margins_min": 0.029903585091233253, + "rewards/margins_std": 0.041274432092905045, + "rewards/rejected": -0.04248481243848801, + "step": 590 + }, + { + "epoch": 0.35, + "grad_norm": 0.380859375, + "learning_rate": 4.121790354874065e-07, + "logits/chosen": 0.05303360894322395, + "logits/rejected": 0.40770038962364197, + "logps/chosen": -202.06549072265625, + "logps/rejected": -214.628173828125, + "loss": 0.6649, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.005082354880869389, + "rewards/margins": 0.05396551638841629, + "rewards/margins_max": 0.07737747579813004, + "rewards/margins_min": 0.03055354580283165, + "rewards/margins_std": 0.03310951590538025, + "rewards/rejected": -0.04888315126299858, + "step": 600 + }, + { + "epoch": 0.35, + "grad_norm": 0.369140625, + "learning_rate": 4.082923113273822e-07, + "logits/chosen": 0.11870566755533218, + "logits/rejected": 0.464911550283432, + "logps/chosen": -231.35336303710938, + "logps/rejected": -234.9374237060547, + "loss": 0.6666, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.01106190960854292, + "rewards/margins": 0.0625653862953186, + "rewards/margins_max": 0.08917935192584991, + "rewards/margins_min": 0.03595142811536789, + "rewards/margins_std": 0.037637822329998016, + "rewards/rejected": -0.05150347948074341, + "step": 610 + }, + { + "epoch": 0.36, + "grad_norm": 0.443359375, + "learning_rate": 4.043406458458609e-07, + "logits/chosen": 0.09034819900989532, + "logits/rejected": 0.5873952507972717, + "logps/chosen": -265.25396728515625, + "logps/rejected": -214.2862548828125, + "loss": 0.6628, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0020419310312718153, + "rewards/margins": 0.06574475765228271, + "rewards/margins_max": 0.08710642158985138, + "rewards/margins_min": 0.04438310116529465, + "rewards/margins_std": 0.030209947377443314, + "rewards/rejected": -0.06370283663272858, + "step": 620 + }, + { + "epoch": 0.37, + "grad_norm": 0.4921875, + "learning_rate": 4.0032566026105806e-07, + "logits/chosen": 0.008516276255249977, + "logits/rejected": 0.6535265445709229, + "logps/chosen": -260.87298583984375, + "logps/rejected": -267.5401916503906, + "loss": 0.663, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.03661227226257324, + "rewards/margins": 0.07144369184970856, + "rewards/margins_max": 0.09834811091423035, + "rewards/margins_min": 0.044539276510477066, + "rewards/margins_std": 0.03804859146475792, + "rewards/rejected": -0.03483142331242561, + "step": 630 + }, + { + "epoch": 0.37, + "grad_norm": 0.474609375, + "learning_rate": 3.9624900176902184e-07, + "logits/chosen": 0.013054514303803444, + "logits/rejected": 0.3652392029762268, + "logps/chosen": -235.1199493408203, + "logps/rejected": -248.31411743164062, + "loss": 0.6656, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.014549237675964832, + "rewards/margins": 0.05561714246869087, + "rewards/margins_max": 0.08446307480335236, + "rewards/margins_min": 0.026771211996674538, + "rewards/margins_std": 0.040794309228658676, + "rewards/rejected": -0.041067905724048615, + "step": 640 + }, + { + "epoch": 0.38, + "grad_norm": 0.41015625, + "learning_rate": 3.921123428678511e-07, + "logits/chosen": 0.022506317123770714, + "logits/rejected": 0.6284270882606506, + "logps/chosen": -305.97674560546875, + "logps/rejected": -239.0786590576172, + "loss": 0.666, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.020474497228860855, + "rewards/margins": 0.06788565218448639, + "rewards/margins_max": 0.09115969389677048, + "rewards/margins_min": 0.044611603021621704, + "rewards/margins_std": 0.03291446715593338, + "rewards/rejected": -0.047411151230335236, + "step": 650 + }, + { + "epoch": 0.38, + "grad_norm": 0.478515625, + "learning_rate": 3.8791738067153314e-07, + "logits/chosen": 0.07077694684267044, + "logits/rejected": 0.5682755708694458, + "logps/chosen": -231.22695922851562, + "logps/rejected": -227.6490478515625, + "loss": 0.6622, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03146480768918991, + "rewards/margins": 0.06544210761785507, + "rewards/margins_max": 0.0967545360326767, + "rewards/margins_min": 0.034129686653614044, + "rewards/margins_std": 0.044282447546720505, + "rewards/rejected": -0.03397729992866516, + "step": 660 + }, + { + "epoch": 0.39, + "grad_norm": 0.41796875, + "learning_rate": 3.83665836213682e-07, + "logits/chosen": 0.12142015993595123, + "logits/rejected": 0.5390751957893372, + "logps/chosen": -207.6114501953125, + "logps/rejected": -215.29849243164062, + "loss": 0.6636, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.011886438354849815, + "rewards/margins": 0.05365458130836487, + "rewards/margins_max": 0.07296213507652283, + "rewards/margins_min": 0.03434702754020691, + "rewards/margins_std": 0.027305006980895996, + "rewards/rejected": -0.0417681448161602, + "step": 670 + }, + { + "epoch": 0.39, + "grad_norm": 0.46875, + "learning_rate": 3.7935945374146417e-07, + "logits/chosen": 0.007061509881168604, + "logits/rejected": 0.3642507493495941, + "logps/chosen": -236.29788208007812, + "logps/rejected": -242.33544921875, + "loss": 0.6631, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.02563950978219509, + "rewards/margins": 0.05955478549003601, + "rewards/margins_max": 0.08539506047964096, + "rewards/margins_min": 0.03371449559926987, + "rewards/margins_std": 0.036543674767017365, + "rewards/rejected": -0.03391526639461517, + "step": 680 + }, + { + "epoch": 0.4, + "grad_norm": 0.5234375, + "learning_rate": 3.75e-07, + "logits/chosen": 0.08328167349100113, + "logits/rejected": 0.5527598857879639, + "logps/chosen": -239.66159057617188, + "logps/rejected": -235.6712188720703, + "loss": 0.6622, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.023291967809200287, + "rewards/margins": 0.07459411025047302, + "rewards/margins_max": 0.1087113469839096, + "rewards/margins_min": 0.04047687351703644, + "rewards/margins_std": 0.04824905842542648, + "rewards/rejected": -0.051302142441272736, + "step": 690 + }, + { + "epoch": 0.41, + "grad_norm": 0.40625, + "learning_rate": 3.7058926350753517e-07, + "logits/chosen": 0.04602205008268356, + "logits/rejected": 0.6276509165763855, + "logps/chosen": -247.14205932617188, + "logps/rejected": -208.6519775390625, + "loss": 0.6614, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.022474488243460655, + "rewards/margins": 0.07001164555549622, + "rewards/margins_max": 0.09704446792602539, + "rewards/margins_min": 0.04297882691025734, + "rewards/margins_std": 0.038230184465646744, + "rewards/rejected": -0.04753715917468071, + "step": 700 + }, + { + "epoch": 0.41, + "grad_norm": 0.4453125, + "learning_rate": 3.661290538216798e-07, + "logits/chosen": 0.291398823261261, + "logits/rejected": 0.6808168292045593, + "logps/chosen": -224.65090942382812, + "logps/rejected": -205.6571807861328, + "loss": 0.6632, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.0020084187854081392, + "rewards/margins": 0.05480460077524185, + "rewards/margins_max": 0.0770978108048439, + "rewards/margins_min": 0.0325113907456398, + "rewards/margins_std": 0.031527359038591385, + "rewards/rejected": -0.05279617756605148, + "step": 710 + }, + { + "epoch": 0.42, + "grad_norm": 0.4375, + "learning_rate": 3.616212007970159e-07, + "logits/chosen": 0.05395558476448059, + "logits/rejected": 0.29135066270828247, + "logps/chosen": -189.52139282226562, + "logps/rejected": -215.48080444335938, + "loss": 0.6633, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.008078034035861492, + "rewards/margins": 0.05178927257657051, + "rewards/margins_max": 0.0689278393983841, + "rewards/margins_min": 0.034650713205337524, + "rewards/margins_std": 0.024237588047981262, + "rewards/rejected": -0.043711237609386444, + "step": 720 + }, + { + "epoch": 0.42, + "grad_norm": 0.4609375, + "learning_rate": 3.5706755383437703e-07, + "logits/chosen": 0.09721295535564423, + "logits/rejected": 0.5186147689819336, + "logps/chosen": -302.69482421875, + "logps/rejected": -258.5033874511719, + "loss": 0.6646, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.020449183881282806, + "rewards/margins": 0.052381712943315506, + "rewards/margins_max": 0.07583948969841003, + "rewards/margins_min": 0.02892393246293068, + "rewards/margins_std": 0.0331743024289608, + "rewards/rejected": -0.0319325253367424, + "step": 730 + }, + { + "epoch": 0.43, + "grad_norm": 0.443359375, + "learning_rate": 3.5246998112210993e-07, + "logits/chosen": 0.13969309628009796, + "logits/rejected": 0.6499422192573547, + "logps/chosen": -262.07000732421875, + "logps/rejected": -253.33364868164062, + "loss": 0.6583, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.020577292889356613, + "rewards/margins": 0.08194496482610703, + "rewards/margins_max": 0.10924677550792694, + "rewards/margins_min": 0.05464313551783562, + "rewards/margins_std": 0.038610607385635376, + "rewards/rejected": -0.061367668211460114, + "step": 740 + }, + { + "epoch": 0.44, + "grad_norm": 0.39453125, + "learning_rate": 3.4783036886962736e-07, + "logits/chosen": 0.15751202404499054, + "logits/rejected": 0.583830714225769, + "logps/chosen": -232.4749298095703, + "logps/rejected": -251.43881225585938, + "loss": 0.6642, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.013448268175125122, + "rewards/margins": 0.06021388620138168, + "rewards/margins_max": 0.08211688697338104, + "rewards/margins_min": 0.03831087797880173, + "rewards/margins_std": 0.030975526198744774, + "rewards/rejected": -0.04676561802625656, + "step": 750 + }, + { + "epoch": 0.44, + "grad_norm": 0.451171875, + "learning_rate": 3.4315062053356847e-07, + "logits/chosen": -0.02616945281624794, + "logits/rejected": 0.5470731854438782, + "logps/chosen": -247.7039031982422, + "logps/rejected": -204.8767547607422, + "loss": 0.6635, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.02075277827680111, + "rewards/margins": 0.06478811800479889, + "rewards/margins_max": 0.09738490730524063, + "rewards/margins_min": 0.03219131752848625, + "rewards/margins_std": 0.04609883576631546, + "rewards/rejected": -0.04403533786535263, + "step": 760 + }, + { + "epoch": 0.45, + "grad_norm": 0.515625, + "learning_rate": 3.384326560368826e-07, + "logits/chosen": 0.040539853274822235, + "logits/rejected": 0.5014762878417969, + "logps/chosen": -249.2455596923828, + "logps/rejected": -242.47781372070312, + "loss": 0.662, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.02245604246854782, + "rewards/margins": 0.05939044803380966, + "rewards/margins_max": 0.08405659347772598, + "rewards/margins_min": 0.03472430631518364, + "rewards/margins_std": 0.03488319739699364, + "rewards/rejected": -0.03693440556526184, + "step": 770 + }, + { + "epoch": 0.45, + "grad_norm": 0.5, + "learning_rate": 3.3367841098115777e-07, + "logits/chosen": 0.05805939435958862, + "logits/rejected": 0.47922706604003906, + "logps/chosen": -286.8292541503906, + "logps/rejected": -230.5067138671875, + "loss": 0.6653, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.004244116134941578, + "rewards/margins": 0.0571456179022789, + "rewards/margins_max": 0.08360336720943451, + "rewards/margins_min": 0.030687877908349037, + "rewards/margins_std": 0.03741690143942833, + "rewards/rejected": -0.052901506423950195, + "step": 780 + }, + { + "epoch": 0.46, + "grad_norm": 0.40234375, + "learning_rate": 3.2888983585251713e-07, + "logits/chosen": 0.11492130905389786, + "logits/rejected": 0.3956727087497711, + "logps/chosen": -204.6266632080078, + "logps/rejected": -208.7443084716797, + "loss": 0.6606, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.011013984680175781, + "rewards/margins": 0.057107020169496536, + "rewards/margins_max": 0.07711775600910187, + "rewards/margins_min": 0.037096280604600906, + "rewards/margins_std": 0.02829946205019951, + "rewards/rejected": -0.046093035489320755, + "step": 790 + }, + { + "epoch": 0.46, + "grad_norm": 0.466796875, + "learning_rate": 3.240688952214085e-07, + "logits/chosen": -0.019520867615938187, + "logits/rejected": 0.34635210037231445, + "logps/chosen": -278.4693298339844, + "logps/rejected": -257.54986572265625, + "loss": 0.6607, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.020895112305879593, + "rewards/margins": 0.08000204712152481, + "rewards/margins_max": 0.1040647029876709, + "rewards/margins_min": 0.05593939870595932, + "rewards/margins_std": 0.034029725939035416, + "rewards/rejected": -0.05910693481564522, + "step": 800 + }, + { + "epoch": 0.47, + "grad_norm": 0.365234375, + "learning_rate": 3.192175669366156e-07, + "logits/chosen": 0.08061734586954117, + "logits/rejected": 0.440199077129364, + "logps/chosen": -216.41323852539062, + "logps/rejected": -240.26333618164062, + "loss": 0.6611, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.011639273725450039, + "rewards/margins": 0.061767347157001495, + "rewards/margins_max": 0.09113974124193192, + "rewards/margins_min": 0.03239493444561958, + "rewards/margins_std": 0.04153885692358017, + "rewards/rejected": -0.050128065049648285, + "step": 810 + }, + { + "epoch": 0.48, + "grad_norm": 0.435546875, + "learning_rate": 3.14337841313822e-07, + "logits/chosen": 0.2162504643201828, + "logits/rejected": 0.6251672506332397, + "logps/chosen": -249.9015655517578, + "logps/rejected": -198.54403686523438, + "loss": 0.6629, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.008589675650000572, + "rewards/margins": 0.05789928883314133, + "rewards/margins_max": 0.07874341309070587, + "rewards/margins_min": 0.03705517202615738, + "rewards/margins_std": 0.029478034004569054, + "rewards/rejected": -0.0493096187710762, + "step": 820 + }, + { + "epoch": 0.48, + "grad_norm": 0.443359375, + "learning_rate": 3.094317203190603e-07, + "logits/chosen": -0.0029448375571519136, + "logits/rejected": 0.4555005431175232, + "logps/chosen": -240.8060760498047, + "logps/rejected": -222.56246948242188, + "loss": 0.6561, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.022363275289535522, + "rewards/margins": 0.08168495446443558, + "rewards/margins_max": 0.11077789962291718, + "rewards/margins_min": 0.052591998130083084, + "rewards/margins_std": 0.04114364832639694, + "rewards/rejected": -0.059321679174900055, + "step": 830 + }, + { + "epoch": 0.49, + "grad_norm": 0.38671875, + "learning_rate": 3.045012167473814e-07, + "logits/chosen": 0.1808149516582489, + "logits/rejected": 0.5233570337295532, + "logps/chosen": -263.43255615234375, + "logps/rejected": -270.8913269042969, + "loss": 0.6616, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.02442259155213833, + "rewards/margins": 0.0733276903629303, + "rewards/margins_max": 0.104800745844841, + "rewards/margins_min": 0.041854631155729294, + "rewards/margins_std": 0.04450962692499161, + "rewards/rejected": -0.04890510439872742, + "step": 840 + }, + { + "epoch": 0.49, + "grad_norm": 0.4140625, + "learning_rate": 2.995483533970809e-07, + "logits/chosen": 0.2622363269329071, + "logits/rejected": 0.7754552960395813, + "logps/chosen": -228.362060546875, + "logps/rejected": -187.44383239746094, + "loss": 0.6618, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.011710538528859615, + "rewards/margins": 0.06277038902044296, + "rewards/margins_max": 0.08341649174690247, + "rewards/margins_min": 0.04212428256869316, + "rewards/margins_std": 0.029198000207543373, + "rewards/rejected": -0.05105985328555107, + "step": 850 + }, + { + "epoch": 0.5, + "grad_norm": 0.453125, + "learning_rate": 2.9457516223982235e-07, + "logits/chosen": 0.11260411888360977, + "logits/rejected": 0.47127556800842285, + "logps/chosen": -251.4638214111328, + "logps/rejected": -251.6316680908203, + "loss": 0.6609, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.009782608598470688, + "rewards/margins": 0.07295442372560501, + "rewards/margins_max": 0.10423107445240021, + "rewards/margins_min": 0.04167778044939041, + "rewards/margins_std": 0.044231854379177094, + "rewards/rejected": -0.06317181885242462, + "step": 860 + }, + { + "epoch": 0.5, + "grad_norm": 0.44921875, + "learning_rate": 2.895836835869962e-07, + "logits/chosen": 0.03560788184404373, + "logits/rejected": 0.4069921374320984, + "logps/chosen": -228.38876342773438, + "logps/rejected": -221.29638671875, + "loss": 0.662, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.009866083040833473, + "rewards/margins": 0.06033489108085632, + "rewards/margins_max": 0.09506522119045258, + "rewards/margins_min": 0.02560456469655037, + "rewards/margins_std": 0.0491160973906517, + "rewards/rejected": -0.050468809902668, + "step": 870 + }, + { + "epoch": 0.51, + "grad_norm": 0.48046875, + "learning_rate": 2.845759652526574e-07, + "logits/chosen": 0.07124204933643341, + "logits/rejected": 0.5192992687225342, + "logps/chosen": -234.10836791992188, + "logps/rejected": -189.55230712890625, + "loss": 0.66, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.01570773683488369, + "rewards/margins": 0.05234966799616814, + "rewards/margins_max": 0.07433562725782394, + "rewards/margins_min": 0.030363699421286583, + "rewards/margins_std": 0.031092852354049683, + "rewards/rejected": -0.036641925573349, + "step": 880 + }, + { + "epoch": 0.52, + "grad_norm": 0.427734375, + "learning_rate": 2.795540617133853e-07, + "logits/chosen": 0.24306873977184296, + "logits/rejected": 0.4881308674812317, + "logps/chosen": -233.5541534423828, + "logps/rejected": -271.29119873046875, + "loss": 0.6601, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.0062574222683906555, + "rewards/margins": 0.06694331020116806, + "rewards/margins_max": 0.0913429707288742, + "rewards/margins_min": 0.04254365712404251, + "rewards/margins_std": 0.03450632840394974, + "rewards/rejected": -0.060685895383358, + "step": 890 + }, + { + "epoch": 0.52, + "grad_norm": 0.40234375, + "learning_rate": 2.7452003326540995e-07, + "logits/chosen": 0.1885126382112503, + "logits/rejected": 0.6096329689025879, + "logps/chosen": -223.55380249023438, + "logps/rejected": -210.834716796875, + "loss": 0.6613, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.01565275713801384, + "rewards/margins": 0.0681251734495163, + "rewards/margins_max": 0.0929432287812233, + "rewards/margins_min": 0.043307114392519, + "rewards/margins_std": 0.035098038613796234, + "rewards/rejected": -0.05247241258621216, + "step": 900 + }, + { + "epoch": 0.53, + "grad_norm": 0.369140625, + "learning_rate": 2.694759451793508e-07, + "logits/chosen": 0.3056187033653259, + "logits/rejected": 0.5238193273544312, + "logps/chosen": -180.62220764160156, + "logps/rejected": -202.76705932617188, + "loss": 0.6628, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.005610722117125988, + "rewards/margins": 0.053133320063352585, + "rewards/margins_max": 0.0700041875243187, + "rewards/margins_min": 0.03626246377825737, + "rewards/margins_std": 0.023858997970819473, + "rewards/rejected": -0.04752260446548462, + "step": 910 + }, + { + "epoch": 0.53, + "grad_norm": 0.48828125, + "learning_rate": 2.644238668529146e-07, + "logits/chosen": 0.21234102547168732, + "logits/rejected": 0.48591142892837524, + "logps/chosen": -223.54971313476562, + "logps/rejected": -248.9346466064453, + "loss": 0.6607, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.017756493762135506, + "rewards/margins": 0.07771660387516022, + "rewards/margins_max": 0.11433382332324982, + "rewards/margins_min": 0.04109939560294151, + "rewards/margins_std": 0.05178455635905266, + "rewards/rejected": -0.05996011570096016, + "step": 920 + }, + { + "epoch": 0.54, + "grad_norm": 0.396484375, + "learning_rate": 2.593658709619001e-07, + "logits/chosen": 0.11299429088830948, + "logits/rejected": 0.5906545519828796, + "logps/chosen": -222.49609375, + "logps/rejected": -204.37290954589844, + "loss": 0.6601, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.02080368809401989, + "rewards/margins": 0.07051359862089157, + "rewards/margins_max": 0.10480418056249619, + "rewards/margins_min": 0.03622300922870636, + "rewards/margins_std": 0.048494212329387665, + "rewards/rejected": -0.04970990866422653, + "step": 930 + }, + { + "epoch": 0.55, + "grad_norm": 0.423828125, + "learning_rate": 2.5430403260985807e-07, + "logits/chosen": 0.11868913471698761, + "logits/rejected": 0.5508742332458496, + "logps/chosen": -212.3166961669922, + "logps/rejected": -219.1356658935547, + "loss": 0.6583, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.021529385820031166, + "rewards/margins": 0.06332559883594513, + "rewards/margins_max": 0.0937047004699707, + "rewards/margins_min": 0.03294649347662926, + "rewards/margins_std": 0.042962536215782166, + "rewards/rejected": -0.04179621487855911, + "step": 940 + }, + { + "epoch": 0.55, + "grad_norm": 0.470703125, + "learning_rate": 2.4924042847675503e-07, + "logits/chosen": 0.06126406043767929, + "logits/rejected": 0.5420705080032349, + "logps/chosen": -294.85845947265625, + "logps/rejected": -215.2727813720703, + "loss": 0.661, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.007373870350420475, + "rewards/margins": 0.05419896915555, + "rewards/margins_max": 0.08067617565393448, + "rewards/margins_min": 0.02772175334393978, + "rewards/margins_std": 0.03744443506002426, + "rewards/rejected": -0.0468250997364521, + "step": 950 + }, + { + "epoch": 0.56, + "grad_norm": 0.47265625, + "learning_rate": 2.441771359669902e-07, + "logits/chosen": 0.13893456757068634, + "logits/rejected": 0.4921324849128723, + "logps/chosen": -235.5193634033203, + "logps/rejected": -225.794189453125, + "loss": 0.6607, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.012106789276003838, + "rewards/margins": 0.06842382997274399, + "rewards/margins_max": 0.100715771317482, + "rewards/margins_min": 0.03613189607858658, + "rewards/margins_std": 0.045667704194784164, + "rewards/rejected": -0.056317038834095, + "step": 960 + }, + { + "epoch": 0.56, + "grad_norm": 0.443359375, + "learning_rate": 2.391162323571161e-07, + "logits/chosen": 0.07089251279830933, + "logits/rejected": 0.48170119524002075, + "logps/chosen": -230.9342498779297, + "logps/rejected": -226.3340301513672, + "loss": 0.6617, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.010878843255341053, + "rewards/margins": 0.06217268109321594, + "rewards/margins_max": 0.08883620798587799, + "rewards/margins_min": 0.03550915792584419, + "rewards/margins_std": 0.037707917392253876, + "rewards/rejected": -0.051293838769197464, + "step": 970 + }, + { + "epoch": 0.57, + "grad_norm": 0.42578125, + "learning_rate": 2.340597939436097e-07, + "logits/chosen": 0.03681742399930954, + "logits/rejected": 0.5955736041069031, + "logps/chosen": -234.0045166015625, + "logps/rejected": -216.2124786376953, + "loss": 0.6614, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.0253006462007761, + "rewards/margins": 0.06550078094005585, + "rewards/margins_max": 0.0953935831785202, + "rewards/margins_min": 0.035607993602752686, + "rewards/margins_std": 0.04227479174733162, + "rewards/rejected": -0.0402001328766346, + "step": 980 + }, + { + "epoch": 0.57, + "grad_norm": 0.42578125, + "learning_rate": 2.2900989519104796e-07, + "logits/chosen": 0.1664225161075592, + "logits/rejected": 0.4196982979774475, + "logps/chosen": -182.28829956054688, + "logps/rejected": -211.08865356445312, + "loss": 0.6625, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.0058049350045621395, + "rewards/margins": 0.06564933061599731, + "rewards/margins_max": 0.09529349207878113, + "rewards/margins_min": 0.036005161702632904, + "rewards/margins_std": 0.04192318022251129, + "rewards/rejected": -0.05984439328312874, + "step": 990 + }, + { + "epoch": 0.58, + "grad_norm": 0.4375, + "learning_rate": 2.2396860788103353e-07, + "logits/chosen": -0.04069889336824417, + "logits/rejected": 0.4455093741416931, + "logps/chosen": -208.73477172851562, + "logps/rejected": -199.85501098632812, + "loss": 0.6608, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.015201890841126442, + "rewards/margins": 0.08097913861274719, + "rewards/margins_max": 0.11325138807296753, + "rewards/margins_min": 0.04870688170194626, + "rewards/margins_std": 0.04563985764980316, + "rewards/rejected": -0.0657772421836853, + "step": 1000 + }, + { + "epoch": 0.59, + "grad_norm": 0.451171875, + "learning_rate": 2.1893800026222083e-07, + "logits/chosen": 0.24370861053466797, + "logits/rejected": 0.655241847038269, + "logps/chosen": -239.9451446533203, + "logps/rejected": -255.0171356201172, + "loss": 0.6612, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.01818387396633625, + "rewards/margins": 0.06645138561725616, + "rewards/margins_max": 0.0944729745388031, + "rewards/margins_min": 0.03842979669570923, + "rewards/margins_std": 0.039628516882658005, + "rewards/rejected": -0.048267509788274765, + "step": 1010 + }, + { + "epoch": 0.59, + "grad_norm": 0.376953125, + "learning_rate": 2.1392013620179336e-07, + "logits/chosen": -0.15726599097251892, + "logits/rejected": 0.27727076411247253, + "logps/chosen": -208.62881469726562, + "logps/rejected": -205.62429809570312, + "loss": 0.6593, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.012712801806628704, + "rewards/margins": 0.07130307704210281, + "rewards/margins_max": 0.09740529954433441, + "rewards/margins_min": 0.04520086199045181, + "rewards/margins_std": 0.03691411018371582, + "rewards/rejected": -0.05859028175473213, + "step": 1020 + }, + { + "epoch": 0.6, + "grad_norm": 0.373046875, + "learning_rate": 2.0891707433873623e-07, + "logits/chosen": 0.2577076256275177, + "logits/rejected": 0.5587279796600342, + "logps/chosen": -232.6507568359375, + "logps/rejected": -236.791015625, + "loss": 0.6608, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.007417677901685238, + "rewards/margins": 0.06323407590389252, + "rewards/margins_max": 0.09169165790081024, + "rewards/margins_min": 0.03477650135755539, + "rewards/margins_std": 0.040245089679956436, + "rewards/rejected": -0.055816400796175, + "step": 1030 + }, + { + "epoch": 0.6, + "grad_norm": 0.4609375, + "learning_rate": 2.039308672392556e-07, + "logits/chosen": 0.09692186862230301, + "logits/rejected": 0.5365327000617981, + "logps/chosen": -220.7172393798828, + "logps/rejected": -204.85055541992188, + "loss": 0.6567, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.016125962138175964, + "rewards/margins": 0.06824339926242828, + "rewards/margins_max": 0.10508973896503448, + "rewards/margins_min": 0.03139704838395119, + "rewards/margins_std": 0.052108604460954666, + "rewards/rejected": -0.05211742967367172, + "step": 1040 + }, + { + "epoch": 0.61, + "grad_norm": 0.36328125, + "learning_rate": 1.9896356055468845e-07, + "logits/chosen": 0.24312233924865723, + "logits/rejected": 0.5007752180099487, + "logps/chosen": -217.9171600341797, + "logps/rejected": -255.72866821289062, + "loss": 0.6605, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.015429767780005932, + "rewards/margins": 0.06471355259418488, + "rewards/margins_max": 0.09141434729099274, + "rewards/margins_min": 0.03801275044679642, + "rewards/margins_std": 0.03776064142584801, + "rewards/rejected": -0.04928378015756607, + "step": 1050 + }, + { + "epoch": 0.61, + "grad_norm": 0.359375, + "learning_rate": 1.940171921822496e-07, + "logits/chosen": 0.007707296404987574, + "logits/rejected": 0.3314017653465271, + "logps/chosen": -218.86654663085938, + "logps/rejected": -214.7074737548828, + "loss": 0.6625, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.010595353320240974, + "rewards/margins": 0.05604109913110733, + "rewards/margins_max": 0.08353577554225922, + "rewards/margins_min": 0.028546428307890892, + "rewards/margins_std": 0.03888333961367607, + "rewards/rejected": -0.045445747673511505, + "step": 1060 + }, + { + "epoch": 0.62, + "grad_norm": 0.421875, + "learning_rate": 1.8909379142895977e-07, + "logits/chosen": 0.08975931257009506, + "logits/rejected": 0.49662691354751587, + "logps/chosen": -243.73941040039062, + "logps/rejected": -218.0565643310547, + "loss": 0.6628, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.017341626808047295, + "rewards/margins": 0.06548301875591278, + "rewards/margins_max": 0.10044316947460175, + "rewards/margins_min": 0.030522847548127174, + "rewards/margins_std": 0.0494411401450634, + "rewards/rejected": -0.04814138263463974, + "step": 1070 + }, + { + "epoch": 0.63, + "grad_norm": 0.419921875, + "learning_rate": 1.841953781790983e-07, + "logits/chosen": 0.14877240359783173, + "logits/rejected": 0.32807669043540955, + "logps/chosen": -201.35398864746094, + "logps/rejected": -237.98403930664062, + "loss": 0.6614, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.011331291869282722, + "rewards/margins": 0.05169866234064102, + "rewards/margins_max": 0.08101126551628113, + "rewards/margins_min": 0.02238604798913002, + "rewards/margins_std": 0.041454292833805084, + "rewards/rejected": -0.04036737233400345, + "step": 1080 + }, + { + "epoch": 0.63, + "grad_norm": 0.5234375, + "learning_rate": 1.793239620655211e-07, + "logits/chosen": 0.10640072822570801, + "logits/rejected": 0.5526248812675476, + "logps/chosen": -198.35403442382812, + "logps/rejected": -196.8388671875, + "loss": 0.6604, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.0263301283121109, + "rewards/margins": 0.07441949844360352, + "rewards/margins_max": 0.1034015566110611, + "rewards/margins_min": 0.045437444001436234, + "rewards/margins_std": 0.040986817330121994, + "rewards/rejected": -0.04808937385678291, + "step": 1090 + }, + { + "epoch": 0.64, + "grad_norm": 0.390625, + "learning_rate": 1.744815416451847e-07, + "logits/chosen": 0.1694943606853485, + "logits/rejected": 0.6004883050918579, + "logps/chosen": -255.3223114013672, + "logps/rejected": -243.01541137695312, + "loss": 0.6625, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.01719365268945694, + "rewards/margins": 0.06180461123585701, + "rewards/margins_max": 0.08655586838722229, + "rewards/margins_min": 0.03705335780978203, + "rewards/margins_std": 0.03500355780124664, + "rewards/rejected": -0.04461096227169037, + "step": 1100 + }, + { + "epoch": 0.64, + "grad_norm": 0.4453125, + "learning_rate": 1.6967010357921446e-07, + "logits/chosen": 0.11355743557214737, + "logits/rejected": 0.4874862730503082, + "logps/chosen": -210.58767700195312, + "logps/rejected": -219.46701049804688, + "loss": 0.6618, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.005143271759152412, + "rewards/margins": 0.061519283801317215, + "rewards/margins_max": 0.0864943265914917, + "rewards/margins_min": 0.036544252187013626, + "rewards/margins_std": 0.035320036113262177, + "rewards/rejected": -0.05637601017951965, + "step": 1110 + }, + { + "epoch": 0.65, + "grad_norm": 0.439453125, + "learning_rate": 1.6489162181785255e-07, + "logits/chosen": 0.15795719623565674, + "logits/rejected": 0.5425394773483276, + "logps/chosen": -245.29562377929688, + "logps/rejected": -233.9000244140625, + "loss": 0.6602, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.021811651065945625, + "rewards/margins": 0.07487231492996216, + "rewards/margins_max": 0.09871380031108856, + "rewards/margins_min": 0.051030855625867844, + "rewards/margins_std": 0.03371693566441536, + "rewards/rejected": -0.05306067317724228, + "step": 1120 + }, + { + "epoch": 0.66, + "grad_norm": 0.361328125, + "learning_rate": 1.6014805679062183e-07, + "logits/chosen": -0.04248831048607826, + "logits/rejected": 0.36503881216049194, + "logps/chosen": -204.58383178710938, + "logps/rejected": -203.0003204345703, + "loss": 0.6607, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.020199652761220932, + "rewards/margins": 0.08475508540868759, + "rewards/margins_max": 0.11757893860340118, + "rewards/margins_min": 0.051931243389844894, + "rewards/margins_std": 0.046419933438301086, + "rewards/rejected": -0.06455543637275696, + "step": 1130 + }, + { + "epoch": 0.66, + "grad_norm": 0.482421875, + "learning_rate": 1.5544135460203527e-07, + "logits/chosen": 0.250204861164093, + "logits/rejected": 0.5448838472366333, + "logps/chosen": -212.43508911132812, + "logps/rejected": -247.50747680664062, + "loss": 0.6601, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.013406927697360516, + "rewards/margins": 0.07055126130580902, + "rewards/margins_max": 0.09891954064369202, + "rewards/margins_min": 0.04218297451734543, + "rewards/margins_std": 0.04011881351470947, + "rewards/rejected": -0.05714433267712593, + "step": 1140 + }, + { + "epoch": 0.67, + "grad_norm": 0.408203125, + "learning_rate": 1.5077344623318388e-07, + "logits/chosen": 0.08146306127309799, + "logits/rejected": 0.5028539896011353, + "logps/chosen": -244.5470733642578, + "logps/rejected": -203.9750213623047, + "loss": 0.6622, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.00543981185182929, + "rewards/margins": 0.0606420524418354, + "rewards/margins_max": 0.09149619191884995, + "rewards/margins_min": 0.029787922278046608, + "rewards/margins_std": 0.043634332716464996, + "rewards/rejected": -0.05520225316286087, + "step": 1150 + }, + { + "epoch": 0.67, + "grad_norm": 0.4921875, + "learning_rate": 1.461462467495284e-07, + "logits/chosen": 0.09238779544830322, + "logits/rejected": 0.5282326340675354, + "logps/chosen": -239.08853149414062, + "logps/rejected": -234.31228637695312, + "loss": 0.6582, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.020727628841996193, + "rewards/margins": 0.07139938324689865, + "rewards/margins_max": 0.0972491055727005, + "rewards/margins_min": 0.045549679547548294, + "rewards/margins_std": 0.036557018756866455, + "rewards/rejected": -0.0506717674434185, + "step": 1160 + }, + { + "epoch": 0.68, + "grad_norm": 0.400390625, + "learning_rate": 1.4156165451522028e-07, + "logits/chosen": 0.08472833782434464, + "logits/rejected": 0.5027869939804077, + "logps/chosen": -205.4404754638672, + "logps/rejected": -202.98440551757812, + "loss": 0.663, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.011948509141802788, + "rewards/margins": 0.06199117749929428, + "rewards/margins_max": 0.08956360816955566, + "rewards/margins_min": 0.03441876173019409, + "rewards/margins_std": 0.038993291556835175, + "rewards/rejected": -0.05004267022013664, + "step": 1170 + }, + { + "epoch": 0.68, + "grad_norm": 0.470703125, + "learning_rate": 1.3702155041427543e-07, + "logits/chosen": 0.1654224544763565, + "logits/rejected": 0.39103928208351135, + "logps/chosen": -221.5464630126953, + "logps/rejected": -246.1484832763672, + "loss": 0.6611, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.008782127872109413, + "rewards/margins": 0.05567712336778641, + "rewards/margins_max": 0.07324758917093277, + "rewards/margins_min": 0.038106657564640045, + "rewards/margins_std": 0.024848390370607376, + "rewards/rejected": -0.046894993633031845, + "step": 1180 + }, + { + "epoch": 0.69, + "grad_norm": 0.4375, + "learning_rate": 1.3252779707891902e-07, + "logits/chosen": 0.009541223756968975, + "logits/rejected": 0.48217493295669556, + "logps/chosen": -272.9510192871094, + "logps/rejected": -204.46435546875, + "loss": 0.6611, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.009134182706475258, + "rewards/margins": 0.05944829061627388, + "rewards/margins_max": 0.08002766221761703, + "rewards/margins_min": 0.03886892646551132, + "rewards/margins_std": 0.02910362184047699, + "rewards/rejected": -0.05031410977244377, + "step": 1190 + }, + { + "epoch": 0.7, + "grad_norm": 0.462890625, + "learning_rate": 1.2808223812541774e-07, + "logits/chosen": 0.07254563271999359, + "logits/rejected": 0.47662535309791565, + "logps/chosen": -241.54336547851562, + "logps/rejected": -211.88424682617188, + "loss": 0.6606, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0020990788470953703, + "rewards/margins": 0.05149079114198685, + "rewards/margins_max": 0.08034542202949524, + "rewards/margins_min": 0.022636160254478455, + "rewards/margins_std": 0.040806613862514496, + "rewards/rejected": -0.04939170926809311, + "step": 1200 + }, + { + "epoch": 0.7, + "grad_norm": 0.4375, + "learning_rate": 1.2368669739771469e-07, + "logits/chosen": 0.07886068522930145, + "logits/rejected": 0.4947189390659332, + "logps/chosen": -206.33993530273438, + "logps/rejected": -212.7965850830078, + "loss": 0.6578, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.009903495199978352, + "rewards/margins": 0.0682389959692955, + "rewards/margins_max": 0.09637950360774994, + "rewards/margins_min": 0.04009848088026047, + "rewards/margins_std": 0.03979669511318207, + "rewards/rejected": -0.058335501700639725, + "step": 1210 + }, + { + "epoch": 0.71, + "grad_norm": 0.439453125, + "learning_rate": 1.1934297821917497e-07, + "logits/chosen": -0.18527595698833466, + "logits/rejected": 0.35417476296424866, + "logps/chosen": -271.8248291015625, + "logps/rejected": -208.87966918945312, + "loss": 0.6619, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.014687316492199898, + "rewards/margins": 0.05254317447543144, + "rewards/margins_max": 0.0765123963356018, + "rewards/margins_min": 0.028573954477906227, + "rewards/margins_std": 0.03389759734272957, + "rewards/rejected": -0.03785586357116699, + "step": 1220 + }, + { + "epoch": 0.71, + "grad_norm": 0.40234375, + "learning_rate": 1.1505286265275094e-07, + "logits/chosen": 0.09351782500743866, + "logits/rejected": 0.5304566621780396, + "logps/chosen": -217.6367645263672, + "logps/rejected": -209.18603515625, + "loss": 0.666, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.01146542839705944, + "rewards/margins": 0.07028119266033173, + "rewards/margins_max": 0.10538403689861298, + "rewards/margins_min": 0.03517835959792137, + "rewards/margins_std": 0.0496429018676281, + "rewards/rejected": -0.05881576985120773, + "step": 1230 + }, + { + "epoch": 0.72, + "grad_norm": 0.390625, + "learning_rate": 1.1081811076986963e-07, + "logits/chosen": 0.026241421699523926, + "logits/rejected": 0.6041153073310852, + "logps/chosen": -228.3728790283203, + "logps/rejected": -190.1019287109375, + "loss": 0.6596, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.016418198123574257, + "rewards/margins": 0.0706411749124527, + "rewards/margins_max": 0.09941698610782623, + "rewards/margins_min": 0.041865330189466476, + "rewards/margins_std": 0.04069516435265541, + "rewards/rejected": -0.054222963750362396, + "step": 1240 + }, + { + "epoch": 0.73, + "grad_norm": 0.427734375, + "learning_rate": 1.0664045992834184e-07, + "logits/chosen": 0.19840288162231445, + "logits/rejected": 0.5584182143211365, + "logps/chosen": -254.10147094726562, + "logps/rejected": -256.0483703613281, + "loss": 0.6583, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.012557362206280231, + "rewards/margins": 0.06964166462421417, + "rewards/margins_max": 0.09085742384195328, + "rewards/margins_min": 0.04842590540647507, + "rewards/margins_std": 0.030003610998392105, + "rewards/rejected": -0.057084303349256516, + "step": 1250 + }, + { + "epoch": 0.73, + "grad_norm": 0.484375, + "learning_rate": 1.0252162405959042e-07, + "logits/chosen": -0.029180001467466354, + "logits/rejected": 0.4648149609565735, + "logps/chosen": -273.28375244140625, + "logps/rejected": -244.730712890625, + "loss": 0.6602, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.02007482200860977, + "rewards/margins": 0.06700652837753296, + "rewards/margins_max": 0.10410724580287933, + "rewards/margins_min": 0.029905814677476883, + "rewards/margins_std": 0.05246833711862564, + "rewards/rejected": -0.04693170636892319, + "step": 1260 + }, + { + "epoch": 0.74, + "grad_norm": 0.494140625, + "learning_rate": 9.846329296548963e-08, + "logits/chosen": -0.017562460154294968, + "logits/rejected": 0.4763096868991852, + "logps/chosen": -269.8515625, + "logps/rejected": -263.83148193359375, + "loss": 0.6598, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.010733803734183311, + "rewards/margins": 0.07448114454746246, + "rewards/margins_max": 0.10118886083364487, + "rewards/margins_min": 0.04777342826128006, + "rewards/margins_std": 0.03777041286230087, + "rewards/rejected": -0.0637473464012146, + "step": 1270 + }, + { + "epoch": 0.74, + "grad_norm": 0.486328125, + "learning_rate": 9.446713162510341e-08, + "logits/chosen": 0.22771111130714417, + "logits/rejected": 0.7621752023696899, + "logps/chosen": -266.06390380859375, + "logps/rejected": -250.635498046875, + "loss": 0.6584, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.030348753556609154, + "rewards/margins": 0.07343067973852158, + "rewards/margins_max": 0.10677297413349152, + "rewards/margins_min": 0.040088407695293427, + "rewards/margins_std": 0.04715309664607048, + "rewards/rejected": -0.04308192804455757, + "step": 1280 + }, + { + "epoch": 0.75, + "grad_norm": 0.515625, + "learning_rate": 9.053477951160737e-08, + "logits/chosen": 0.015399669297039509, + "logits/rejected": 0.7483765482902527, + "logps/chosen": -276.5067443847656, + "logps/rejected": -227.33761596679688, + "loss": 0.6579, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.026790842413902283, + "rewards/margins": 0.08279003202915192, + "rewards/margins_max": 0.11221597343683243, + "rewards/margins_min": 0.05336407572031021, + "rewards/margins_std": 0.04161457344889641, + "rewards/rejected": -0.05599917098879814, + "step": 1290 + }, + { + "epoch": 0.75, + "grad_norm": 0.396484375, + "learning_rate": 8.666784991967596e-08, + "logits/chosen": 0.010845961980521679, + "logits/rejected": 0.42500224709510803, + "logps/chosen": -213.1592254638672, + "logps/rejected": -199.2817840576172, + "loss": 0.6613, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.014592917636036873, + "rewards/margins": 0.0668349340558052, + "rewards/margins_max": 0.09872870147228241, + "rewards/margins_min": 0.03494114801287651, + "rewards/margins_std": 0.04510461539030075, + "rewards/rejected": -0.05224201828241348, + "step": 1300 + }, + { + "epoch": 0.76, + "grad_norm": 0.4921875, + "learning_rate": 8.286792930360823e-08, + "logits/chosen": 0.25165149569511414, + "logits/rejected": 0.6992672681808472, + "logps/chosen": -217.0974884033203, + "logps/rejected": -202.47030639648438, + "loss": 0.6599, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.011730032041668892, + "rewards/margins": 0.0590001717209816, + "rewards/margins_max": 0.07914995402097702, + "rewards/margins_min": 0.03885037824511528, + "rewards/margins_std": 0.02849610149860382, + "rewards/rejected": -0.04727013781666756, + "step": 1310 + }, + { + "epoch": 0.77, + "grad_norm": 0.52734375, + "learning_rate": 7.91365766264665e-08, + "logits/chosen": 0.20514824986457825, + "logits/rejected": 0.5356392860412598, + "logps/chosen": -248.6316680908203, + "logps/rejected": -240.5338134765625, + "loss": 0.6591, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.010535435751080513, + "rewards/margins": 0.06282900273799896, + "rewards/margins_max": 0.09407368302345276, + "rewards/margins_min": 0.031584326177835464, + "rewards/margins_std": 0.04418665170669556, + "rewards/rejected": -0.052293576300144196, + "step": 1320 + }, + { + "epoch": 0.77, + "grad_norm": 0.455078125, + "learning_rate": 7.547532272049264e-08, + "logits/chosen": 0.25605538487434387, + "logits/rejected": 0.6374403238296509, + "logps/chosen": -255.80410766601562, + "logps/rejected": -255.73764038085938, + "loss": 0.6619, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.013418711721897125, + "rewards/margins": 0.06125851348042488, + "rewards/margins_max": 0.08139893412590027, + "rewards/margins_min": 0.04111810773611069, + "rewards/margins_std": 0.028482843190431595, + "rewards/rejected": -0.047839801758527756, + "step": 1330 + }, + { + "epoch": 0.78, + "grad_norm": 0.4140625, + "learning_rate": 7.188566965906584e-08, + "logits/chosen": 0.10137088596820831, + "logits/rejected": 0.5515474081039429, + "logps/chosen": -271.2210693359375, + "logps/rejected": -272.3622131347656, + "loss": 0.6598, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00015007219917606562, + "rewards/margins": 0.06623668223619461, + "rewards/margins_max": 0.10004226863384247, + "rewards/margins_min": 0.03243108466267586, + "rewards/margins_std": 0.04780833050608635, + "rewards/rejected": -0.06638675183057785, + "step": 1340 + }, + { + "epoch": 0.78, + "grad_norm": 0.412109375, + "learning_rate": 6.836909014045924e-08, + "logits/chosen": 0.005819192621856928, + "logits/rejected": 0.38501212000846863, + "logps/chosen": -247.23056030273438, + "logps/rejected": -238.4652557373047, + "loss": 0.6607, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.01672416180372238, + "rewards/margins": 0.07304920256137848, + "rewards/margins_max": 0.10092739760875702, + "rewards/margins_min": 0.04517098516225815, + "rewards/margins_std": 0.039425741881132126, + "rewards/rejected": -0.0563250370323658, + "step": 1350 + }, + { + "epoch": 0.79, + "grad_norm": 0.4609375, + "learning_rate": 6.492702688364737e-08, + "logits/chosen": -0.07613168656826019, + "logits/rejected": 0.20295462012290955, + "logps/chosen": -203.92233276367188, + "logps/rejected": -247.69277954101562, + "loss": 0.6604, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.014894701540470123, + "rewards/margins": 0.06641440093517303, + "rewards/margins_max": 0.09283626079559326, + "rewards/margins_min": 0.039992526173591614, + "rewards/margins_std": 0.037366170436143875, + "rewards/rejected": -0.05151969939470291, + "step": 1360 + }, + { + "epoch": 0.79, + "grad_norm": 0.4375, + "learning_rate": 6.156089203641373e-08, + "logits/chosen": -0.014948748052120209, + "logits/rejected": 0.4398605227470398, + "logps/chosen": -247.429931640625, + "logps/rejected": -251.06826782226562, + "loss": 0.6571, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0333079919219017, + "rewards/margins": 0.08266235888004303, + "rewards/margins_max": 0.10667815059423447, + "rewards/margins_min": 0.0586465522646904, + "rewards/margins_std": 0.03396347165107727, + "rewards/rejected": -0.04935435950756073, + "step": 1370 + }, + { + "epoch": 0.8, + "grad_norm": 0.427734375, + "learning_rate": 5.827206659599987e-08, + "logits/chosen": 0.28106218576431274, + "logits/rejected": 0.7749143242835999, + "logps/chosen": -222.03665161132812, + "logps/rejected": -200.11221313476562, + "loss": 0.6576, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.017674388363957405, + "rewards/margins": 0.07599468529224396, + "rewards/margins_max": 0.11385379731655121, + "rewards/margins_min": 0.038135576993227005, + "rewards/margins_std": 0.05354086682200432, + "rewards/rejected": -0.058320302516222, + "step": 1380 + }, + { + "epoch": 0.81, + "grad_norm": 0.50390625, + "learning_rate": 5.506189984253501e-08, + "logits/chosen": 0.16949541866779327, + "logits/rejected": 0.4548502564430237, + "logps/chosen": -205.447265625, + "logps/rejected": -221.4696044921875, + "loss": 0.6611, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.003050294006243348, + "rewards/margins": 0.06650832295417786, + "rewards/margins_max": 0.09234586358070374, + "rewards/margins_min": 0.040670786052942276, + "rewards/margins_std": 0.036539800465106964, + "rewards/rejected": -0.06345803290605545, + "step": 1390 + }, + { + "epoch": 0.81, + "grad_norm": 0.482421875, + "learning_rate": 5.1931708785477506e-08, + "logits/chosen": 0.11355874687433243, + "logits/rejected": 0.6481127738952637, + "logps/chosen": -216.15432739257812, + "logps/rejected": -187.30389404296875, + "loss": 0.6592, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.015445582568645477, + "rewards/margins": 0.05808136984705925, + "rewards/margins_max": 0.08922155201435089, + "rewards/margins_min": 0.026941198855638504, + "rewards/margins_std": 0.04403885826468468, + "rewards/rejected": -0.04263579100370407, + "step": 1400 + }, + { + "epoch": 0.82, + "grad_norm": 0.380859375, + "learning_rate": 4.888277762329582e-08, + "logits/chosen": 0.11872565746307373, + "logits/rejected": 0.5771151185035706, + "logps/chosen": -215.25442504882812, + "logps/rejected": -214.4876251220703, + "loss": 0.6619, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.01657172292470932, + "rewards/margins": 0.06676243245601654, + "rewards/margins_max": 0.0983147844672203, + "rewards/margins_min": 0.03521009162068367, + "rewards/margins_std": 0.04462175816297531, + "rewards/rejected": -0.05019070953130722, + "step": 1410 + }, + { + "epoch": 0.82, + "grad_norm": 0.439453125, + "learning_rate": 4.591635721661072e-08, + "logits/chosen": 0.1136382669210434, + "logits/rejected": 0.5482941269874573, + "logps/chosen": -243.9540557861328, + "logps/rejected": -231.51473999023438, + "loss": 0.6606, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.01714186929166317, + "rewards/margins": 0.07303180545568466, + "rewards/margins_max": 0.10039409250020981, + "rewards/margins_min": 0.045669522136449814, + "rewards/margins_std": 0.03869611397385597, + "rewards/rejected": -0.05588993430137634, + "step": 1420 + }, + { + "epoch": 0.83, + "grad_norm": 0.431640625, + "learning_rate": 4.3033664575015005e-08, + "logits/chosen": 0.24127981066703796, + "logits/rejected": 0.6273223161697388, + "logps/chosen": -258.4788818359375, + "logps/rejected": -255.1360321044922, + "loss": 0.6591, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.0254741869866848, + "rewards/margins": 0.0617264024913311, + "rewards/margins_max": 0.08791927993297577, + "rewards/margins_min": 0.035533517599105835, + "rewards/margins_std": 0.03704233095049858, + "rewards/rejected": -0.036252211779356, + "step": 1430 + }, + { + "epoch": 0.84, + "grad_norm": 0.4453125, + "learning_rate": 4.023588235778019e-08, + "logits/chosen": 0.048088885843753815, + "logits/rejected": 0.4085961878299713, + "logps/chosen": -235.32763671875, + "logps/rejected": -246.94937133789062, + "loss": 0.6625, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.017656199634075165, + "rewards/margins": 0.07100087404251099, + "rewards/margins_max": 0.09923985600471497, + "rewards/margins_min": 0.042761895805597305, + "rewards/margins_std": 0.039935946464538574, + "rewards/rejected": -0.05334467440843582, + "step": 1440 + }, + { + "epoch": 0.84, + "grad_norm": 0.4609375, + "learning_rate": 3.752415838865664e-08, + "logits/chosen": -0.09887398779392242, + "logits/rejected": 0.5310045480728149, + "logps/chosen": -245.59951782226562, + "logps/rejected": -266.8290100097656, + "loss": 0.6586, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.018602244555950165, + "rewards/margins": 0.08193326741456985, + "rewards/margins_max": 0.11139090359210968, + "rewards/margins_min": 0.05247562378644943, + "rewards/margins_std": 0.041659384965896606, + "rewards/rejected": -0.06333102285861969, + "step": 1450 + }, + { + "epoch": 0.85, + "grad_norm": 0.439453125, + "learning_rate": 3.4899605184965206e-08, + "logits/chosen": 0.03019891306757927, + "logits/rejected": 0.44324207305908203, + "logps/chosen": -225.20443725585938, + "logps/rejected": -183.06094360351562, + "loss": 0.6609, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.0028962846845388412, + "rewards/margins": 0.0560896173119545, + "rewards/margins_max": 0.07679092139005661, + "rewards/margins_min": 0.035388313233852386, + "rewards/margins_std": 0.02927606739103794, + "rewards/rejected": -0.05319333076477051, + "step": 1460 + }, + { + "epoch": 0.85, + "grad_norm": 0.439453125, + "learning_rate": 3.23632995011732e-08, + "logits/chosen": -0.06648756563663483, + "logits/rejected": 0.29680854082107544, + "logps/chosen": -226.04983520507812, + "logps/rejected": -258.3298034667969, + "loss": 0.6587, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03215508535504341, + "rewards/margins": 0.08979654312133789, + "rewards/margins_max": 0.12097585201263428, + "rewards/margins_min": 0.058617234230041504, + "rewards/margins_std": 0.044094208627939224, + "rewards/rejected": -0.057641465216875076, + "step": 1470 + }, + { + "epoch": 0.86, + "grad_norm": 0.455078125, + "learning_rate": 2.991628188714351e-08, + "logits/chosen": 0.00623916694894433, + "logits/rejected": 0.48251962661743164, + "logps/chosen": -313.39935302734375, + "logps/rejected": -245.91720581054688, + "loss": 0.6596, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.004381291568279266, + "rewards/margins": 0.07124367356300354, + "rewards/margins_max": 0.09969727694988251, + "rewards/margins_min": 0.04279007390141487, + "rewards/margins_std": 0.04023946821689606, + "rewards/rejected": -0.06686238944530487, + "step": 1480 + }, + { + "epoch": 0.86, + "grad_norm": 0.4375, + "learning_rate": 2.755955626123596e-08, + "logits/chosen": 0.12439896166324615, + "logits/rejected": 0.6011586785316467, + "logps/chosen": -250.7643585205078, + "logps/rejected": -217.0757293701172, + "loss": 0.6624, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.018308712169528008, + "rewards/margins": 0.05787688493728638, + "rewards/margins_max": 0.09185748547315598, + "rewards/margins_min": 0.023896273225545883, + "rewards/margins_std": 0.04805583506822586, + "rewards/rejected": -0.03956816717982292, + "step": 1490 + }, + { + "epoch": 0.87, + "grad_norm": 0.42578125, + "learning_rate": 2.5294089498438225e-08, + "logits/chosen": 0.024487819522619247, + "logits/rejected": 0.5533932447433472, + "logps/chosen": -245.57492065429688, + "logps/rejected": -220.93258666992188, + "loss": 0.6584, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.010946778580546379, + "rewards/margins": 0.06493957340717316, + "rewards/margins_max": 0.0981217697262764, + "rewards/margins_min": 0.03175736218690872, + "rewards/margins_std": 0.046926725655794144, + "rewards/rejected": -0.05399278551340103, + "step": 1500 + }, + { + "epoch": 0.88, + "grad_norm": 0.48046875, + "learning_rate": 2.312081103369354e-08, + "logits/chosen": 0.10629892349243164, + "logits/rejected": 0.5729449987411499, + "logps/chosen": -227.0969696044922, + "logps/rejected": -209.62841796875, + "loss": 0.659, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.013625606894493103, + "rewards/margins": 0.05797373503446579, + "rewards/margins_max": 0.0893624946475029, + "rewards/margins_min": 0.02658497728407383, + "rewards/margins_std": 0.04439040273427963, + "rewards/rejected": -0.04434812813997269, + "step": 1510 + }, + { + "epoch": 0.88, + "grad_norm": 0.48046875, + "learning_rate": 2.104061248058872e-08, + "logits/chosen": 0.10214777290821075, + "logits/rejected": 0.4200982451438904, + "logps/chosen": -213.7083740234375, + "logps/rejected": -225.8516845703125, + "loss": 0.6666, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.018484923988580704, + "rewards/margins": 0.058260779827833176, + "rewards/margins_max": 0.08636601269245148, + "rewards/margins_min": 0.030155545100569725, + "rewards/margins_std": 0.03974680230021477, + "rewards/rejected": -0.03977585583925247, + "step": 1520 + }, + { + "epoch": 0.89, + "grad_norm": 0.44921875, + "learning_rate": 1.9054347265559213e-08, + "logits/chosen": 0.1583404242992401, + "logits/rejected": 0.6649370193481445, + "logps/chosen": -259.9563903808594, + "logps/rejected": -223.4931640625, + "loss": 0.6565, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.014935478568077087, + "rewards/margins": 0.07356850802898407, + "rewards/margins_max": 0.10868100821971893, + "rewards/margins_min": 0.0384560152888298, + "rewards/margins_std": 0.049656566232442856, + "rewards/rejected": -0.058633022010326385, + "step": 1530 + }, + { + "epoch": 0.89, + "grad_norm": 0.498046875, + "learning_rate": 1.716283027776061e-08, + "logits/chosen": 0.2019151747226715, + "logits/rejected": 0.8282853364944458, + "logps/chosen": -291.37066650390625, + "logps/rejected": -222.61831665039062, + "loss": 0.6634, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.016527246683835983, + "rewards/margins": 0.07255034148693085, + "rewards/margins_max": 0.1086968407034874, + "rewards/margins_min": 0.036403849720954895, + "rewards/margins_std": 0.05111886188387871, + "rewards/rejected": -0.05602309852838516, + "step": 1540 + }, + { + "epoch": 0.9, + "grad_norm": 0.4296875, + "learning_rate": 1.536683753475043e-08, + "logits/chosen": 0.22870250046253204, + "logits/rejected": 0.4174967408180237, + "logps/chosen": -219.11306762695312, + "logps/rejected": -241.36563110351562, + "loss": 0.6615, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0025456459261476994, + "rewards/margins": 0.059264473617076874, + "rewards/margins_max": 0.08250005543231964, + "rewards/margins_min": 0.036028891801834106, + "rewards/margins_std": 0.032860077917575836, + "rewards/rejected": -0.061810124665498734, + "step": 1550 + }, + { + "epoch": 0.9, + "grad_norm": 0.390625, + "learning_rate": 1.3667105864117873e-08, + "logits/chosen": 0.21612632274627686, + "logits/rejected": 0.39824485778808594, + "logps/chosen": -200.84498596191406, + "logps/rejected": -228.2679901123047, + "loss": 0.6605, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.008642548695206642, + "rewards/margins": 0.0651601254940033, + "rewards/margins_max": 0.10423406213521957, + "rewards/margins_min": 0.026086175814270973, + "rewards/margins_std": 0.05525890737771988, + "rewards/rejected": -0.0565175786614418, + "step": 1560 + }, + { + "epoch": 0.91, + "grad_norm": 0.41796875, + "learning_rate": 1.2064332601191163e-08, + "logits/chosen": -0.04893340915441513, + "logits/rejected": 0.339263916015625, + "logps/chosen": -222.4666748046875, + "logps/rejected": -217.02999877929688, + "loss": 0.6612, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.0008535057422704995, + "rewards/margins": 0.05954117700457573, + "rewards/margins_max": 0.0829622894525528, + "rewards/margins_min": 0.03612007200717926, + "rewards/margins_std": 0.03312245011329651, + "rewards/rejected": -0.06039468199014664, + "step": 1570 + }, + { + "epoch": 0.92, + "grad_norm": 0.39453125, + "learning_rate": 1.0559175302947476e-08, + "logits/chosen": 0.012552693486213684, + "logits/rejected": 0.5173078775405884, + "logps/chosen": -260.0834045410156, + "logps/rejected": -247.43447875976562, + "loss": 0.6595, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.011661765165627003, + "rewards/margins": 0.06366874277591705, + "rewards/margins_max": 0.09778660535812378, + "rewards/margins_min": 0.029550885781645775, + "rewards/margins_std": 0.04824993759393692, + "rewards/rejected": -0.052006978541612625, + "step": 1580 + }, + { + "epoch": 0.92, + "grad_norm": 0.349609375, + "learning_rate": 9.152251478242417e-09, + "logits/chosen": -0.02594194933772087, + "logits/rejected": 0.4399421215057373, + "logps/chosen": -212.4099578857422, + "logps/rejected": -199.73458862304688, + "loss": 0.6594, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.007081738207489252, + "rewards/margins": 0.06215248256921768, + "rewards/margins_max": 0.08854631334543228, + "rewards/margins_min": 0.03575865179300308, + "rewards/margins_std": 0.03732650727033615, + "rewards/rejected": -0.055070746690034866, + "step": 1590 + }, + { + "epoch": 0.93, + "grad_norm": 0.427734375, + "learning_rate": 7.844138334469425e-09, + "logits/chosen": 0.4558231234550476, + "logits/rejected": 0.8965223431587219, + "logps/chosen": -201.3118438720703, + "logps/rejected": -192.5732421875, + "loss": 0.6628, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.009340132586658001, + "rewards/margins": 0.0616113655269146, + "rewards/margins_max": 0.09181926399469376, + "rewards/margins_min": 0.03140346333384514, + "rewards/margins_std": 0.04272041842341423, + "rewards/rejected": -0.05227123573422432, + "step": 1600 + }, + { + "epoch": 0.93, + "grad_norm": 0.37109375, + "learning_rate": 6.635372540753498e-09, + "logits/chosen": 0.11258337646722794, + "logits/rejected": 0.6999211311340332, + "logps/chosen": -240.33975219726562, + "logps/rejected": -214.0699920654297, + "loss": 0.6577, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.027147358283400536, + "rewards/margins": 0.0817473754286766, + "rewards/margins_max": 0.12004182487726212, + "rewards/margins_min": 0.0434529110789299, + "rewards/margins_std": 0.05415653437376022, + "rewards/rejected": -0.05460001155734062, + "step": 1610 + }, + { + "epoch": 0.94, + "grad_norm": 0.4609375, + "learning_rate": 5.526450007776435e-09, + "logits/chosen": 0.1300087720155716, + "logits/rejected": 0.5238357782363892, + "logps/chosen": -292.7140197753906, + "logps/rejected": -246.2644805908203, + "loss": 0.6611, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.0037552430294454098, + "rewards/margins": 0.05609096214175224, + "rewards/margins_max": 0.07447664439678192, + "rewards/margins_min": 0.03770528361201286, + "rewards/margins_std": 0.026001274585723877, + "rewards/rejected": -0.052335720509290695, + "step": 1620 + }, + { + "epoch": 0.95, + "grad_norm": 0.431640625, + "learning_rate": 4.517825684323323e-09, + "logits/chosen": 0.18602465093135834, + "logits/rejected": 0.5172281861305237, + "logps/chosen": -223.3422088623047, + "logps/rejected": -241.034912109375, + "loss": 0.6596, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.00845097191631794, + "rewards/margins": 0.06410791724920273, + "rewards/margins_max": 0.09119440615177155, + "rewards/margins_min": 0.037021439522504807, + "rewards/margins_std": 0.03830606862902641, + "rewards/rejected": -0.05565694719552994, + "step": 1630 + }, + { + "epoch": 0.95, + "grad_norm": 0.474609375, + "learning_rate": 3.6099133706344044e-09, + "logits/chosen": 0.13008326292037964, + "logits/rejected": 0.6074930429458618, + "logps/chosen": -223.1219940185547, + "logps/rejected": -207.696044921875, + "loss": 0.6569, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.02304968610405922, + "rewards/margins": 0.07380314916372299, + "rewards/margins_max": 0.09590893238782883, + "rewards/margins_min": 0.05169736221432686, + "rewards/margins_std": 0.03126230835914612, + "rewards/rejected": -0.05075346678495407, + "step": 1640 + }, + { + "epoch": 0.96, + "grad_norm": 0.4921875, + "learning_rate": 2.8030855486386174e-09, + "logits/chosen": 0.28828924894332886, + "logits/rejected": 0.6710017919540405, + "logps/chosen": -256.94903564453125, + "logps/rejected": -281.40411376953125, + "loss": 0.6586, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.023295782506465912, + "rewards/margins": 0.071876659989357, + "rewards/margins_max": 0.09554243832826614, + "rewards/margins_min": 0.048210885375738144, + "rewards/margins_std": 0.03346845880150795, + "rewards/rejected": -0.04858088120818138, + "step": 1650 + }, + { + "epoch": 0.96, + "grad_norm": 0.515625, + "learning_rate": 2.097673229138286e-09, + "logits/chosen": 0.16988131403923035, + "logits/rejected": 0.47897881269454956, + "logps/chosen": -224.6415557861328, + "logps/rejected": -232.2594451904297, + "loss": 0.6587, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.012618700973689556, + "rewards/margins": 0.07099349051713943, + "rewards/margins_max": 0.10776303708553314, + "rewards/margins_min": 0.03422392159700394, + "rewards/margins_std": 0.0520000159740448, + "rewards/rejected": -0.05837478116154671, + "step": 1660 + }, + { + "epoch": 0.97, + "grad_norm": 0.44921875, + "learning_rate": 1.493965816008136e-09, + "logits/chosen": -0.009510600939393044, + "logits/rejected": 0.3807966113090515, + "logps/chosen": -211.14254760742188, + "logps/rejected": -236.635498046875, + "loss": 0.6601, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.00740268686786294, + "rewards/margins": 0.07398059964179993, + "rewards/margins_max": 0.10376466810703278, + "rewards/margins_min": 0.04419652372598648, + "rewards/margins_std": 0.0421210452914238, + "rewards/rejected": -0.06657791137695312, + "step": 1670 + }, + { + "epoch": 0.97, + "grad_norm": 0.447265625, + "learning_rate": 9.922109874636875e-10, + "logits/chosen": 0.19054090976715088, + "logits/rejected": 0.557522177696228, + "logps/chosen": -233.7532501220703, + "logps/rejected": -239.6273651123047, + "loss": 0.6579, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.015365364961326122, + "rewards/margins": 0.08128596842288971, + "rewards/margins_max": 0.11999186128377914, + "rewards/margins_min": 0.04258008301258087, + "rewards/margins_std": 0.05473839119076729, + "rewards/rejected": -0.06592060625553131, + "step": 1680 + }, + { + "epoch": 0.98, + "grad_norm": 0.416015625, + "learning_rate": 5.926145944483984e-10, + "logits/chosen": 0.04970569908618927, + "logits/rejected": 0.41454869508743286, + "logps/chosen": -197.70941162109375, + "logps/rejected": -207.9854278564453, + "loss": 0.6625, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.009294511750340462, + "rewards/margins": 0.05480729788541794, + "rewards/margins_max": 0.08153598010540009, + "rewards/margins_min": 0.02807862125337124, + "rewards/margins_std": 0.03780006244778633, + "rewards/rejected": -0.04551279544830322, + "step": 1690 + }, + { + "epoch": 0.99, + "grad_norm": 0.42578125, + "learning_rate": 2.9534057618091356e-10, + "logits/chosen": 0.1366875320672989, + "logits/rejected": 0.4813140034675598, + "logps/chosen": -195.55368041992188, + "logps/rejected": -211.63711547851562, + "loss": 0.6599, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.014302869327366352, + "rewards/margins": 0.0652112141251564, + "rewards/margins_max": 0.09685875475406647, + "rewards/margins_min": 0.03356366977095604, + "rewards/margins_std": 0.04475637897849083, + "rewards/rejected": -0.05090833827853203, + "step": 1700 + }, + { + "epoch": 0.99, + "grad_norm": 0.4453125, + "learning_rate": 1.0051089289686565e-10, + "logits/chosen": 0.20965194702148438, + "logits/rejected": 0.5980690121650696, + "logps/chosen": -218.3548583984375, + "logps/rejected": -252.60159301757812, + "loss": 0.6601, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.01929156482219696, + "rewards/margins": 0.06570716202259064, + "rewards/margins_max": 0.09711313247680664, + "rewards/margins_min": 0.03430120274424553, + "rewards/margins_std": 0.044414736330509186, + "rewards/rejected": -0.04641559720039368, + "step": 1710 + }, + { + "epoch": 1.0, + "grad_norm": 0.404296875, + "learning_rate": 8.205475813372054e-12, + "logits/chosen": 0.07036467641592026, + "logits/rejected": 0.6885267496109009, + "logps/chosen": -334.186279296875, + "logps/rejected": -232.6072998046875, + "loss": 0.6604, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.015851657837629318, + "rewards/margins": 0.06690393388271332, + "rewards/margins_max": 0.0959465354681015, + "rewards/margins_min": 0.037861332297325134, + "rewards/margins_std": 0.041072435677051544, + "rewards/rejected": -0.0510522723197937, + "step": 1720 + }, + { + "epoch": 1.0, + "eval_logits/chosen": 0.7297662496566772, + "eval_logits/rejected": 0.8997808694839478, + "eval_logps/chosen": -337.8507080078125, + "eval_logps/rejected": -318.01556396484375, + "eval_loss": 0.6928703784942627, + "eval_rewards/accuracies": 0.5364999771118164, + "eval_rewards/chosen": 0.002909434260800481, + "eval_rewards/margins": 0.0005662557086907327, + "eval_rewards/margins_max": 0.07228709012269974, + "eval_rewards/margins_min": -0.08225506544113159, + "eval_rewards/margins_std": 0.050406549125909805, + "eval_rewards/rejected": 0.002343178726732731, + "eval_runtime": 864.7602, + "eval_samples_per_second": 9.251, + "eval_steps_per_second": 0.289, + "step": 1724 + }, + { + "epoch": 1.0, + "step": 1724, + "total_flos": 0.0, + "train_loss": 0.6676546893927447, + "train_runtime": 9120.8228, + "train_samples_per_second": 3.024, + "train_steps_per_second": 0.189 + } + ], + "logging_steps": 10, + "max_steps": 1724, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}