{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5156854318865493, "eval_steps": 50, "global_step": 600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008594757198109154, "grad_norm": 0.05167795345187187, "learning_rate": 4.999451708687114e-06, "logits/chosen": 15.084823608398438, "logits/rejected": 15.218259811401367, "logps/chosen": -0.3124043345451355, "logps/rejected": -0.31854626536369324, "loss": 0.9405, "rewards/accuracies": 0.4375, "rewards/chosen": -0.46860653162002563, "rewards/margins": 0.009212849661707878, "rewards/rejected": -0.47781938314437866, "step": 10 }, { "epoch": 0.017189514396218308, "grad_norm": 0.06444549560546875, "learning_rate": 4.997807075247147e-06, "logits/chosen": 14.565855026245117, "logits/rejected": 14.914319038391113, "logps/chosen": -0.28220412135124207, "logps/rejected": -0.3605547249317169, "loss": 0.9294, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.4233061671257019, "rewards/margins": 0.11752591282129288, "rewards/rejected": -0.5408320426940918, "step": 20 }, { "epoch": 0.02578427159432746, "grad_norm": 0.059900399297475815, "learning_rate": 4.9950668210706795e-06, "logits/chosen": 14.878230094909668, "logits/rejected": 15.334558486938477, "logps/chosen": -0.2837519347667694, "logps/rejected": -0.320808470249176, "loss": 0.9338, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.4256278872489929, "rewards/margins": 0.05558476969599724, "rewards/rejected": -0.48121267557144165, "step": 30 }, { "epoch": 0.034379028792436615, "grad_norm": 0.05459418520331383, "learning_rate": 4.9912321481237616e-06, "logits/chosen": 14.800946235656738, "logits/rejected": 15.134121894836426, "logps/chosen": -0.2971518635749817, "logps/rejected": -0.3476788401603699, "loss": 0.9202, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.4457278251647949, "rewards/margins": 0.07579050213098526, "rewards/rejected": -0.521518349647522, "step": 40 }, { "epoch": 0.042973785990545764, "grad_norm": 0.05792691186070442, "learning_rate": 4.986304738420684e-06, "logits/chosen": 14.62980842590332, "logits/rejected": 14.848493576049805, "logps/chosen": -0.27511823177337646, "logps/rejected": -0.32557612657546997, "loss": 0.9213, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.4126773774623871, "rewards/margins": 0.07568677514791489, "rewards/rejected": -0.48836421966552734, "step": 50 }, { "epoch": 0.042973785990545764, "eval_logits/chosen": 14.195974349975586, "eval_logits/rejected": 15.046167373657227, "eval_logps/chosen": -0.27934810519218445, "eval_logps/rejected": -0.3643363118171692, "eval_loss": 0.9250189065933228, "eval_rewards/accuracies": 0.557894766330719, "eval_rewards/chosen": -0.4190221428871155, "eval_rewards/margins": 0.1274823397397995, "eval_rewards/rejected": -0.5465044379234314, "eval_runtime": 26.0506, "eval_samples_per_second": 28.905, "eval_steps_per_second": 3.647, "step": 50 }, { "epoch": 0.05156854318865492, "grad_norm": 0.08806851506233215, "learning_rate": 4.980286753286196e-06, "logits/chosen": 14.311370849609375, "logits/rejected": 15.19476318359375, "logps/chosen": -0.26153135299682617, "logps/rejected": -0.34108471870422363, "loss": 0.9255, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.39229699969291687, "rewards/margins": 0.11933007091283798, "rewards/rejected": -0.5116270780563354, "step": 60 }, { "epoch": 0.060163300386764075, "grad_norm": 0.10536951571702957, "learning_rate": 4.973180832407471e-06, "logits/chosen": 14.646909713745117, "logits/rejected": 15.134190559387207, "logps/chosen": -0.2928832173347473, "logps/rejected": -0.37275972962379456, "loss": 0.9155, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.4393247961997986, "rewards/margins": 0.11981481313705444, "rewards/rejected": -0.559139609336853, "step": 70 }, { "epoch": 0.06875805758487323, "grad_norm": 0.07452531903982162, "learning_rate": 4.964990092676263e-06, "logits/chosen": 14.383807182312012, "logits/rejected": 14.806958198547363, "logps/chosen": -0.2724239230155945, "logps/rejected": -0.33048146963119507, "loss": 0.9191, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.4086359143257141, "rewards/margins": 0.08708634227514267, "rewards/rejected": -0.495722234249115, "step": 80 }, { "epoch": 0.07735281478298238, "grad_norm": 0.06996195018291473, "learning_rate": 4.9557181268217225e-06, "logits/chosen": 14.557902336120605, "logits/rejected": 15.043550491333008, "logps/chosen": -0.3053165078163147, "logps/rejected": -0.36941051483154297, "loss": 0.9255, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.45797473192214966, "rewards/margins": 0.0961410254240036, "rewards/rejected": -0.5541157126426697, "step": 90 }, { "epoch": 0.08594757198109153, "grad_norm": 0.09053988754749298, "learning_rate": 4.9453690018345144e-06, "logits/chosen": 13.747509956359863, "logits/rejected": 14.678106307983398, "logps/chosen": -0.2453141212463379, "logps/rejected": -0.36430835723876953, "loss": 0.9022, "rewards/accuracies": 0.625, "rewards/chosen": -0.36797118186950684, "rewards/margins": 0.17849135398864746, "rewards/rejected": -0.5464625358581543, "step": 100 }, { "epoch": 0.08594757198109153, "eval_logits/chosen": 14.017444610595703, "eval_logits/rejected": 14.885564804077148, "eval_logps/chosen": -0.2685285806655884, "eval_logps/rejected": -0.3654690384864807, "eval_loss": 0.9166209697723389, "eval_rewards/accuracies": 0.557894766330719, "eval_rewards/chosen": -0.4027928411960602, "eval_rewards/margins": 0.14541073143482208, "eval_rewards/rejected": -0.5482036471366882, "eval_runtime": 26.0431, "eval_samples_per_second": 28.914, "eval_steps_per_second": 3.648, "step": 100 }, { "epoch": 0.09454232917920069, "grad_norm": 0.07788874208927155, "learning_rate": 4.933947257182901e-06, "logits/chosen": 14.805160522460938, "logits/rejected": 14.767298698425293, "logps/chosen": -0.30586495995521545, "logps/rejected": -0.3159794211387634, "loss": 0.9128, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -0.45879751443862915, "rewards/margins": 0.015171671286225319, "rewards/rejected": -0.47396916151046753, "step": 110 }, { "epoch": 0.10313708637730984, "grad_norm": 0.07691823691129684, "learning_rate": 4.921457902821578e-06, "logits/chosen": 13.761972427368164, "logits/rejected": 14.64726448059082, "logps/chosen": -0.2784760296344757, "logps/rejected": -0.34076255559921265, "loss": 0.9179, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.41771402955055237, "rewards/margins": 0.09342982620000839, "rewards/rejected": -0.5111438632011414, "step": 120 }, { "epoch": 0.11173184357541899, "grad_norm": 0.08534488826990128, "learning_rate": 4.907906416994146e-06, "logits/chosen": 13.837780952453613, "logits/rejected": 14.767657279968262, "logps/chosen": -0.26367664337158203, "logps/rejected": -0.3845904469490051, "loss": 0.8978, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.39551490545272827, "rewards/margins": 0.18137072026729584, "rewards/rejected": -0.5768855810165405, "step": 130 }, { "epoch": 0.12032660077352815, "grad_norm": 0.08117899298667908, "learning_rate": 4.893298743830168e-06, "logits/chosen": 13.270025253295898, "logits/rejected": 14.128207206726074, "logps/chosen": -0.24728116393089294, "logps/rejected": -0.3510771095752716, "loss": 0.9117, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.370921790599823, "rewards/margins": 0.1556939035654068, "rewards/rejected": -0.5266156196594238, "step": 140 }, { "epoch": 0.1289213579716373, "grad_norm": 0.1263500601053238, "learning_rate": 4.8776412907378845e-06, "logits/chosen": 13.525009155273438, "logits/rejected": 14.163309097290039, "logps/chosen": -0.24874648451805115, "logps/rejected": -0.38132259249687195, "loss": 0.9007, "rewards/accuracies": 0.625, "rewards/chosen": -0.3731197714805603, "rewards/margins": 0.1988641768693924, "rewards/rejected": -0.5719839334487915, "step": 150 }, { "epoch": 0.1289213579716373, "eval_logits/chosen": 12.438652992248535, "eval_logits/rejected": 13.519843101501465, "eval_logps/chosen": -0.2689361274242401, "eval_logps/rejected": -0.3897271454334259, "eval_loss": 0.8991575241088867, "eval_rewards/accuracies": 0.5894736647605896, "eval_rewards/chosen": -0.40340420603752136, "eval_rewards/margins": 0.1811865121126175, "eval_rewards/rejected": -0.5845907330513, "eval_runtime": 26.0482, "eval_samples_per_second": 28.908, "eval_steps_per_second": 3.647, "step": 150 }, { "epoch": 0.13751611516974646, "grad_norm": 0.11390316486358643, "learning_rate": 4.860940925593703e-06, "logits/chosen": 12.494891166687012, "logits/rejected": 13.346384048461914, "logps/chosen": -0.26858460903167725, "logps/rejected": -0.4170496463775635, "loss": 0.8854, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4028768539428711, "rewards/margins": 0.22269758582115173, "rewards/rejected": -0.6255744695663452, "step": 160 }, { "epoch": 0.1461108723678556, "grad_norm": 0.14250700175762177, "learning_rate": 4.84320497372973e-06, "logits/chosen": 11.637483596801758, "logits/rejected": 12.72177505493164, "logps/chosen": -0.2967775762081146, "logps/rejected": -0.440357506275177, "loss": 0.8884, "rewards/accuracies": 0.625, "rewards/chosen": -0.4451664090156555, "rewards/margins": 0.21536986529827118, "rewards/rejected": -0.6605362892150879, "step": 170 }, { "epoch": 0.15470562956596476, "grad_norm": 0.174351766705513, "learning_rate": 4.824441214720629e-06, "logits/chosen": 11.577589988708496, "logits/rejected": 12.179681777954102, "logps/chosen": -0.29397666454315186, "logps/rejected": -0.4009665548801422, "loss": 0.8756, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.44096502661705017, "rewards/margins": 0.16048480570316315, "rewards/rejected": -0.6014498472213745, "step": 180 }, { "epoch": 0.1633003867640739, "grad_norm": 0.22877676784992218, "learning_rate": 4.804657878971252e-06, "logits/chosen": 9.352752685546875, "logits/rejected": 10.27645206451416, "logps/chosen": -0.30452457070350647, "logps/rejected": -0.4765443205833435, "loss": 0.8781, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.4567868113517761, "rewards/margins": 0.25802966952323914, "rewards/rejected": -0.7148164510726929, "step": 190 }, { "epoch": 0.17189514396218306, "grad_norm": 0.2517675459384918, "learning_rate": 4.783863644106502e-06, "logits/chosen": 8.136419296264648, "logits/rejected": 9.26432991027832, "logps/chosen": -0.3416380286216736, "logps/rejected": -0.4680122435092926, "loss": 0.8531, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.5124570727348328, "rewards/margins": 0.18956127762794495, "rewards/rejected": -0.7020183801651001, "step": 200 }, { "epoch": 0.17189514396218306, "eval_logits/chosen": 7.26609992980957, "eval_logits/rejected": 8.391904830932617, "eval_logps/chosen": -0.31862083077430725, "eval_logps/rejected": -0.5189473032951355, "eval_loss": 0.8484573364257812, "eval_rewards/accuracies": 0.6315789222717285, "eval_rewards/chosen": -0.47793126106262207, "eval_rewards/margins": 0.30048972368240356, "eval_rewards/rejected": -0.7784210443496704, "eval_runtime": 26.0496, "eval_samples_per_second": 28.906, "eval_steps_per_second": 3.647, "step": 200 }, { "epoch": 0.18048990116029223, "grad_norm": 0.28971683979034424, "learning_rate": 4.762067631165049e-06, "logits/chosen": 7.321592807769775, "logits/rejected": 7.871228218078613, "logps/chosen": -0.3311695158481598, "logps/rejected": -0.4879254400730133, "loss": 0.8211, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.4967542588710785, "rewards/margins": 0.23513388633728027, "rewards/rejected": -0.7318881750106812, "step": 210 }, { "epoch": 0.18908465835840138, "grad_norm": 0.568050742149353, "learning_rate": 4.7392794005985324e-06, "logits/chosen": 5.077876091003418, "logits/rejected": 5.706583499908447, "logps/chosen": -0.3127230405807495, "logps/rejected": -0.5744297504425049, "loss": 0.8331, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.46908459067344666, "rewards/margins": 0.39256006479263306, "rewards/rejected": -0.8616446256637573, "step": 220 }, { "epoch": 0.19767941555651053, "grad_norm": 0.32453760504722595, "learning_rate": 4.715508948078037e-06, "logits/chosen": 4.265925407409668, "logits/rejected": 4.2006964683532715, "logps/chosen": -0.4032830595970154, "logps/rejected": -0.6459742784500122, "loss": 0.7986, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.6049246191978455, "rewards/margins": 0.3640367388725281, "rewards/rejected": -0.9689614176750183, "step": 230 }, { "epoch": 0.20627417275461968, "grad_norm": 0.448809951543808, "learning_rate": 4.690766700109659e-06, "logits/chosen": 3.3534884452819824, "logits/rejected": 3.4250903129577637, "logps/chosen": -0.3817242383956909, "logps/rejected": -0.7190496921539307, "loss": 0.7708, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5725863575935364, "rewards/margins": 0.5059882402420044, "rewards/rejected": -1.078574538230896, "step": 240 }, { "epoch": 0.21486892995272883, "grad_norm": 0.4277574419975281, "learning_rate": 4.665063509461098e-06, "logits/chosen": 3.151397228240967, "logits/rejected": 2.8183228969573975, "logps/chosen": -0.44173598289489746, "logps/rejected": -0.8323748707771301, "loss": 0.7722, "rewards/accuracies": 0.625, "rewards/chosen": -0.6626039743423462, "rewards/margins": 0.5859583616256714, "rewards/rejected": -1.248562216758728, "step": 250 }, { "epoch": 0.21486892995272883, "eval_logits/chosen": 2.520007848739624, "eval_logits/rejected": 1.9197090864181519, "eval_logps/chosen": -0.4703753888607025, "eval_logps/rejected": -0.90553879737854, "eval_loss": 0.7410055994987488, "eval_rewards/accuracies": 0.6631578803062439, "eval_rewards/chosen": -0.7055630087852478, "eval_rewards/margins": 0.6527453064918518, "eval_rewards/rejected": -1.3583083152770996, "eval_runtime": 26.0441, "eval_samples_per_second": 28.912, "eval_steps_per_second": 3.648, "step": 250 }, { "epoch": 0.22346368715083798, "grad_norm": 0.5626497268676758, "learning_rate": 4.638410650401267e-06, "logits/chosen": 1.2351257801055908, "logits/rejected": 0.5925868153572083, "logps/chosen": -0.46581563353538513, "logps/rejected": -0.9673674702644348, "loss": 0.6933, "rewards/accuracies": 0.75, "rewards/chosen": -0.6987233757972717, "rewards/margins": 0.7523276209831238, "rewards/rejected": -1.451051115989685, "step": 260 }, { "epoch": 0.23205844434894715, "grad_norm": 0.7433231472969055, "learning_rate": 4.610819813755038e-06, "logits/chosen": 3.1690659523010254, "logits/rejected": 2.0423803329467773, "logps/chosen": -0.506645679473877, "logps/rejected": -1.0180162191390991, "loss": 0.7265, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.7599684596061707, "rewards/margins": 0.767055869102478, "rewards/rejected": -1.527024507522583, "step": 270 }, { "epoch": 0.2406532015470563, "grad_norm": 1.4220589399337769, "learning_rate": 4.582303101775249e-06, "logits/chosen": 2.8173985481262207, "logits/rejected": 1.5537467002868652, "logps/chosen": -0.5869659185409546, "logps/rejected": -1.1085975170135498, "loss": 0.6725, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.8804486989974976, "rewards/margins": 0.7824474573135376, "rewards/rejected": -1.6628963947296143, "step": 280 }, { "epoch": 0.24924795874516545, "grad_norm": 0.6397098898887634, "learning_rate": 4.55287302283426e-06, "logits/chosen": 2.734229564666748, "logits/rejected": 1.9948323965072632, "logps/chosen": -0.6540845036506653, "logps/rejected": -1.451608419418335, "loss": 0.571, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.9811266660690308, "rewards/margins": 1.1962860822677612, "rewards/rejected": -2.177412748336792, "step": 290 }, { "epoch": 0.2578427159432746, "grad_norm": 0.4591177701950073, "learning_rate": 4.522542485937369e-06, "logits/chosen": 2.2491040229797363, "logits/rejected": 1.345014214515686, "logps/chosen": -0.6877793073654175, "logps/rejected": -1.6054528951644897, "loss": 0.5782, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.0316689014434814, "rewards/margins": 1.3765103816986084, "rewards/rejected": -2.408179521560669, "step": 300 }, { "epoch": 0.2578427159432746, "eval_logits/chosen": 1.661840796470642, "eval_logits/rejected": 0.6246702671051025, "eval_logps/chosen": -0.7322248816490173, "eval_logps/rejected": -2.272771120071411, "eval_loss": 0.563686728477478, "eval_rewards/accuracies": 0.7157894968986511, "eval_rewards/chosen": -1.0983372926712036, "eval_rewards/margins": 2.310819387435913, "eval_rewards/rejected": -3.409156560897827, "eval_runtime": 26.0455, "eval_samples_per_second": 28.911, "eval_steps_per_second": 3.647, "step": 300 }, { "epoch": 0.2664374731413838, "grad_norm": 0.786809504032135, "learning_rate": 4.491324795060491e-06, "logits/chosen": 1.3445788621902466, "logits/rejected": 0.4989510178565979, "logps/chosen": -0.7276264429092407, "logps/rejected": -2.3235878944396973, "loss": 0.5253, "rewards/accuracies": 0.75, "rewards/chosen": -1.0914397239685059, "rewards/margins": 2.393942356109619, "rewards/rejected": -3.485382080078125, "step": 310 }, { "epoch": 0.2750322303394929, "grad_norm": 0.3913320004940033, "learning_rate": 4.4592336433146e-06, "logits/chosen": 2.61965012550354, "logits/rejected": 1.9477211236953735, "logps/chosen": -0.7146936655044556, "logps/rejected": -1.9647115468978882, "loss": 0.5294, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.0720404386520386, "rewards/margins": 1.8750267028808594, "rewards/rejected": -2.9470672607421875, "step": 320 }, { "epoch": 0.28362698753760207, "grad_norm": 0.4867005944252014, "learning_rate": 4.426283106939474e-06, "logits/chosen": 2.500439167022705, "logits/rejected": 1.6413562297821045, "logps/chosen": -0.8710287809371948, "logps/rejected": -2.36894154548645, "loss": 0.548, "rewards/accuracies": 0.625, "rewards/chosen": -1.306543231010437, "rewards/margins": 2.246868848800659, "rewards/rejected": -3.5534119606018066, "step": 330 }, { "epoch": 0.2922217447357112, "grad_norm": 0.8009849786758423, "learning_rate": 4.3924876391293915e-06, "logits/chosen": 1.3847177028656006, "logits/rejected": 0.8994542360305786, "logps/chosen": -0.8447234034538269, "logps/rejected": -2.800283908843994, "loss": 0.4797, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2670851945877075, "rewards/margins": 2.9333412647247314, "rewards/rejected": -4.2004265785217285, "step": 340 }, { "epoch": 0.30081650193382037, "grad_norm": 2.0202796459198, "learning_rate": 4.357862063693486e-06, "logits/chosen": 2.3197357654571533, "logits/rejected": 1.37326180934906, "logps/chosen": -0.8590717315673828, "logps/rejected": -2.1532845497131348, "loss": 0.5126, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2886077165603638, "rewards/margins": 1.941319465637207, "rewards/rejected": -3.2299270629882812, "step": 350 }, { "epoch": 0.30081650193382037, "eval_logits/chosen": 2.0864102840423584, "eval_logits/rejected": 1.2036340236663818, "eval_logps/chosen": -0.9554746150970459, "eval_logps/rejected": -3.0601954460144043, "eval_loss": 0.5108997821807861, "eval_rewards/accuracies": 0.7368420958518982, "eval_rewards/chosen": -1.4332119226455688, "eval_rewards/margins": 3.15708065032959, "eval_rewards/rejected": -4.590292930603027, "eval_runtime": 26.0503, "eval_samples_per_second": 28.906, "eval_steps_per_second": 3.647, "step": 350 }, { "epoch": 0.3094112591319295, "grad_norm": 1.0668681859970093, "learning_rate": 4.322421568553529e-06, "logits/chosen": 1.6770871877670288, "logits/rejected": 1.073407530784607, "logps/chosen": -1.1393296718597412, "logps/rejected": -2.886169910430908, "loss": 0.5031, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.7089945077896118, "rewards/margins": 2.620260238647461, "rewards/rejected": -4.329255104064941, "step": 360 }, { "epoch": 0.31800601633003867, "grad_norm": 0.5015287399291992, "learning_rate": 4.286181699082008e-06, "logits/chosen": 2.156587600708008, "logits/rejected": 1.371209979057312, "logps/chosen": -0.9851818084716797, "logps/rejected": -3.2286324501037598, "loss": 0.4662, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.47777259349823, "rewards/margins": 3.3651764392852783, "rewards/rejected": -4.842948913574219, "step": 370 }, { "epoch": 0.3266007735281478, "grad_norm": 0.9893808960914612, "learning_rate": 4.249158351283414e-06, "logits/chosen": 2.6184191703796387, "logits/rejected": 2.212998390197754, "logps/chosen": -0.9414733052253723, "logps/rejected": -2.940886974334717, "loss": 0.4829, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.4122098684310913, "rewards/margins": 2.9991202354431152, "rewards/rejected": -4.411330223083496, "step": 380 }, { "epoch": 0.33519553072625696, "grad_norm": 0.7588702440261841, "learning_rate": 4.211367764821722e-06, "logits/chosen": 3.257941484451294, "logits/rejected": 2.5362088680267334, "logps/chosen": -1.182255744934082, "logps/rejected": -2.8621151447296143, "loss": 0.4538, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.7733834981918335, "rewards/margins": 2.5197887420654297, "rewards/rejected": -4.293172359466553, "step": 390 }, { "epoch": 0.3437902879243661, "grad_norm": 0.6317985653877258, "learning_rate": 4.172826515897146e-06, "logits/chosen": 3.057791233062744, "logits/rejected": 2.4121367931365967, "logps/chosen": -1.0847463607788086, "logps/rejected": -3.3152599334716797, "loss": 0.4847, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6271196603775024, "rewards/margins": 3.3457705974578857, "rewards/rejected": -4.9728899002075195, "step": 400 }, { "epoch": 0.3437902879243661, "eval_logits/chosen": 2.9584426879882812, "eval_logits/rejected": 2.292771577835083, "eval_logps/chosen": -1.202886939048767, "eval_logps/rejected": -3.6770312786102295, "eval_loss": 0.47303518652915955, "eval_rewards/accuracies": 0.7473683953285217, "eval_rewards/chosen": -1.8043304681777954, "eval_rewards/margins": 3.711216688156128, "eval_rewards/rejected": -5.515547275543213, "eval_runtime": 26.0247, "eval_samples_per_second": 28.934, "eval_steps_per_second": 3.65, "step": 400 }, { "epoch": 0.3523850451224753, "grad_norm": 1.0523916482925415, "learning_rate": 4.133551509975264e-06, "logits/chosen": 2.9360365867614746, "logits/rejected": 2.330521583557129, "logps/chosen": -1.3002166748046875, "logps/rejected": -3.2887542247772217, "loss": 0.4398, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.9503250122070312, "rewards/margins": 2.9828057289123535, "rewards/rejected": -4.933130741119385, "step": 410 }, { "epoch": 0.36097980232058446, "grad_norm": 0.6079875826835632, "learning_rate": 4.093559974371725e-06, "logits/chosen": 3.1500794887542725, "logits/rejected": 2.329282283782959, "logps/chosen": -1.23466157913208, "logps/rejected": -3.291548252105713, "loss": 0.4774, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.8519923686981201, "rewards/margins": 3.085329532623291, "rewards/rejected": -4.93732213973999, "step": 420 }, { "epoch": 0.3695745595186936, "grad_norm": 1.3175437450408936, "learning_rate": 4.052869450695776e-06, "logits/chosen": 3.4488296508789062, "logits/rejected": 2.6282899379730225, "logps/chosen": -1.380877137184143, "logps/rejected": -4.005017280578613, "loss": 0.4158, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.0713157653808594, "rewards/margins": 3.9362099170684814, "rewards/rejected": -6.007525444030762, "step": 430 }, { "epoch": 0.37816931671680276, "grad_norm": 3.7249863147735596, "learning_rate": 4.011497787155938e-06, "logits/chosen": 2.5173678398132324, "logits/rejected": 1.943926215171814, "logps/chosen": -1.7800304889678955, "logps/rejected": -4.422289848327637, "loss": 0.3916, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.6700453758239746, "rewards/margins": 3.9633898735046387, "rewards/rejected": -6.633435249328613, "step": 440 }, { "epoch": 0.3867640739149119, "grad_norm": 2.9776103496551514, "learning_rate": 3.969463130731183e-06, "logits/chosen": 3.2318034172058105, "logits/rejected": 2.5253517627716064, "logps/chosen": -2.309701442718506, "logps/rejected": -4.725776672363281, "loss": 0.368, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -3.464552640914917, "rewards/margins": 3.624112606048584, "rewards/rejected": -7.0886640548706055, "step": 450 }, { "epoch": 0.3867640739149119, "eval_logits/chosen": 2.397157907485962, "eval_logits/rejected": 2.0492196083068848, "eval_logps/chosen": -2.6244213581085205, "eval_logps/rejected": -5.247391700744629, "eval_loss": 0.3982011079788208, "eval_rewards/accuracies": 0.8842105269432068, "eval_rewards/chosen": -3.936631917953491, "eval_rewards/margins": 3.934455633163452, "eval_rewards/rejected": -7.87108850479126, "eval_runtime": 26.0501, "eval_samples_per_second": 28.906, "eval_steps_per_second": 3.647, "step": 450 }, { "epoch": 0.39535883111302106, "grad_norm": 2.3925623893737793, "learning_rate": 3.92678391921108e-06, "logits/chosen": 3.0329971313476562, "logits/rejected": 2.67683482170105, "logps/chosen": -2.4644994735717773, "logps/rejected": -4.755246162414551, "loss": 0.3584, "rewards/accuracies": 0.8125, "rewards/chosen": -3.696749210357666, "rewards/margins": 3.436119794845581, "rewards/rejected": -7.132868766784668, "step": 460 }, { "epoch": 0.4039535883111302, "grad_norm": 3.1981327533721924, "learning_rate": 3.88347887310836e-06, "logits/chosen": 2.219741106033325, "logits/rejected": 1.8649622201919556, "logps/chosen": -2.2890329360961914, "logps/rejected": -5.124932289123535, "loss": 0.3709, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.433549404144287, "rewards/margins": 4.253849029541016, "rewards/rejected": -7.687398433685303, "step": 470 }, { "epoch": 0.41254834550923936, "grad_norm": 2.0272741317749023, "learning_rate": 3.839566987447492e-06, "logits/chosen": 3.6659038066864014, "logits/rejected": 3.202749252319336, "logps/chosen": -2.5729193687438965, "logps/rejected": -4.992354393005371, "loss": 0.3837, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -3.859379529953003, "rewards/margins": 3.629152297973633, "rewards/rejected": -7.488531589508057, "step": 480 }, { "epoch": 0.4211431027073485, "grad_norm": 2.5182268619537354, "learning_rate": 3.795067523432826e-06, "logits/chosen": 3.327012538909912, "logits/rejected": 3.1205530166625977, "logps/chosen": -3.016247510910034, "logps/rejected": -5.566779136657715, "loss": 0.3112, "rewards/accuracies": 0.875, "rewards/chosen": -4.524371147155762, "rewards/margins": 3.8257980346679688, "rewards/rejected": -8.35016918182373, "step": 490 }, { "epoch": 0.42973785990545765, "grad_norm": 2.990694046020508, "learning_rate": 3.7500000000000005e-06, "logits/chosen": 2.7793381214141846, "logits/rejected": 2.7330098152160645, "logps/chosen": -2.7836732864379883, "logps/rejected": -5.60109806060791, "loss": 0.3069, "rewards/accuracies": 0.875, "rewards/chosen": -4.175509929656982, "rewards/margins": 4.226136684417725, "rewards/rejected": -8.401647567749023, "step": 500 }, { "epoch": 0.42973785990545765, "eval_logits/chosen": 2.5767242908477783, "eval_logits/rejected": 2.1918540000915527, "eval_logps/chosen": -3.1751770973205566, "eval_logps/rejected": -6.361191749572754, "eval_loss": 0.35469338297843933, "eval_rewards/accuracies": 0.9157894849777222, "eval_rewards/chosen": -4.762764930725098, "eval_rewards/margins": 4.779022693634033, "eval_rewards/rejected": -9.541787147521973, "eval_runtime": 26.0483, "eval_samples_per_second": 28.908, "eval_steps_per_second": 3.647, "step": 500 }, { "epoch": 0.4383326171035668, "grad_norm": 3.1177096366882324, "learning_rate": 3.7043841852542884e-06, "logits/chosen": 3.4840216636657715, "logits/rejected": 2.871774196624756, "logps/chosen": -2.739344596862793, "logps/rejected": -5.363945960998535, "loss": 0.3468, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -4.1090168952941895, "rewards/margins": 3.9369025230407715, "rewards/rejected": -8.045918464660645, "step": 510 }, { "epoch": 0.44692737430167595, "grad_norm": 2.212597131729126, "learning_rate": 3.658240087799655e-06, "logits/chosen": 2.8667449951171875, "logits/rejected": 2.463776111602783, "logps/chosen": -3.17940092086792, "logps/rejected": -6.375420570373535, "loss": 0.3092, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.769101619720459, "rewards/margins": 4.794029235839844, "rewards/rejected": -9.563131332397461, "step": 520 }, { "epoch": 0.45552213149978515, "grad_norm": 4.475163459777832, "learning_rate": 3.611587947962319e-06, "logits/chosen": 3.234764814376831, "logits/rejected": 2.6656813621520996, "logps/chosen": -3.0503814220428467, "logps/rejected": -5.525468826293945, "loss": 0.3044, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -4.5755720138549805, "rewards/margins": 3.7126305103302, "rewards/rejected": -8.288202285766602, "step": 530 }, { "epoch": 0.4641168886978943, "grad_norm": 1.8678548336029053, "learning_rate": 3.564448228912682e-06, "logits/chosen": 2.1433145999908447, "logits/rejected": 2.1412692070007324, "logps/chosen": -2.6177189350128174, "logps/rejected": -5.8179192543029785, "loss": 0.3376, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -3.9265785217285156, "rewards/margins": 4.800299644470215, "rewards/rejected": -8.72687816619873, "step": 540 }, { "epoch": 0.47271164589600345, "grad_norm": 2.3289716243743896, "learning_rate": 3.516841607689501e-06, "logits/chosen": 2.7216885089874268, "logits/rejected": 2.549870729446411, "logps/chosen": -2.7370285987854004, "logps/rejected": -5.929703712463379, "loss": 0.2937, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -4.1055426597595215, "rewards/margins": 4.7890119552612305, "rewards/rejected": -8.894556045532227, "step": 550 }, { "epoch": 0.47271164589600345, "eval_logits/chosen": 2.7431576251983643, "eval_logits/rejected": 2.386326789855957, "eval_logps/chosen": -3.3791866302490234, "eval_logps/rejected": -6.955687999725342, "eval_loss": 0.33076339960098267, "eval_rewards/accuracies": 0.9157894849777222, "eval_rewards/chosen": -5.068779945373535, "eval_rewards/margins": 5.364751815795898, "eval_rewards/rejected": -10.433531761169434, "eval_runtime": 26.0558, "eval_samples_per_second": 28.899, "eval_steps_per_second": 3.646, "step": 550 }, { "epoch": 0.4813064030941126, "grad_norm": 2.7705740928649902, "learning_rate": 3.4687889661302577e-06, "logits/chosen": 2.2392983436584473, "logits/rejected": 1.9859422445297241, "logps/chosen": -3.14917254447937, "logps/rejected": -6.809067726135254, "loss": 0.2983, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.723758697509766, "rewards/margins": 5.489841938018799, "rewards/rejected": -10.213602066040039, "step": 560 }, { "epoch": 0.48990116029222175, "grad_norm": 2.1203205585479736, "learning_rate": 3.4203113817116955e-06, "logits/chosen": 2.5817489624023438, "logits/rejected": 2.54498291015625, "logps/chosen": -3.4195308685302734, "logps/rejected": -7.411266326904297, "loss": 0.3014, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.129295349121094, "rewards/margins": 5.987602710723877, "rewards/rejected": -11.116899490356445, "step": 570 }, { "epoch": 0.4984959174903309, "grad_norm": 1.7489718198776245, "learning_rate": 3.3714301183045382e-06, "logits/chosen": 2.1257646083831787, "logits/rejected": 2.1210994720458984, "logps/chosen": -2.9680445194244385, "logps/rejected": -6.824588775634766, "loss": 0.2752, "rewards/accuracies": 0.9375, "rewards/chosen": -4.452066898345947, "rewards/margins": 5.784815788269043, "rewards/rejected": -10.236883163452148, "step": 580 }, { "epoch": 0.50709067468844, "grad_norm": 2.1680099964141846, "learning_rate": 3.3221666168464584e-06, "logits/chosen": 2.5764970779418945, "logits/rejected": 2.2523038387298584, "logps/chosen": -3.667435884475708, "logps/rejected": -7.162708282470703, "loss": 0.2968, "rewards/accuracies": 0.9375, "rewards/chosen": -5.501153945922852, "rewards/margins": 5.242908954620361, "rewards/rejected": -10.744061470031738, "step": 590 }, { "epoch": 0.5156854318865493, "grad_norm": 1.7536494731903076, "learning_rate": 3.272542485937369e-06, "logits/chosen": 2.2658116817474365, "logits/rejected": 1.980126142501831, "logps/chosen": -3.5995922088623047, "logps/rejected": -7.158552646636963, "loss": 0.2971, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.399388313293457, "rewards/margins": 5.338440418243408, "rewards/rejected": -10.737829208374023, "step": 600 }, { "epoch": 0.5156854318865493, "eval_logits/chosen": 2.6781415939331055, "eval_logits/rejected": 2.508939027786255, "eval_logps/chosen": -3.80741548538208, "eval_logps/rejected": -7.577634334564209, "eval_loss": 0.3210188150405884, "eval_rewards/accuracies": 0.9368420839309692, "eval_rewards/chosen": -5.711122989654541, "eval_rewards/margins": 5.655328273773193, "eval_rewards/rejected": -11.366451263427734, "eval_runtime": 26.0494, "eval_samples_per_second": 28.907, "eval_steps_per_second": 3.647, "step": 600 } ], "logging_steps": 10, "max_steps": 1500, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.4077101809126605e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }