diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,11956 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9998854993048172, + "eval_steps": 100, + "global_step": 7641, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 6.535947712418301e-09, + "logits/chosen": -2.896247386932373, + "logits/rejected": -2.8002498149871826, + "logps/chosen": -240.16311645507812, + "logps/rejected": -260.468994140625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0, + "learning_rate": 6.535947712418302e-08, + "logits/chosen": -2.7081668376922607, + "logits/rejected": -2.7046549320220947, + "logps/chosen": -287.52117919921875, + "logps/rejected": -263.2371520996094, + "loss": 0.6935, + "rewards/accuracies": 0.3194444477558136, + "rewards/chosen": -0.00040327783790417016, + "rewards/margins": -0.0007606567232869565, + "rewards/rejected": 0.00035737885627895594, + "step": 10 + }, + { + "epoch": 0.0, + "learning_rate": 1.3071895424836603e-07, + "logits/chosen": -2.7486777305603027, + "logits/rejected": -2.735520839691162, + "logps/chosen": -281.30926513671875, + "logps/rejected": -285.0191955566406, + "loss": 0.6925, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.0007181967375800014, + "rewards/margins": 0.0012460026191547513, + "rewards/rejected": -0.0005278057651594281, + "step": 20 + }, + { + "epoch": 0.0, + "learning_rate": 1.9607843137254904e-07, + "logits/chosen": -2.735398769378662, + "logits/rejected": -2.6820080280303955, + "logps/chosen": -251.34585571289062, + "logps/rejected": -225.1924285888672, + "loss": 0.693, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.00032737970468588173, + "rewards/margins": 0.0003834707895293832, + "rewards/rejected": -5.609105573967099e-05, + "step": 30 + }, + { + "epoch": 0.01, + "learning_rate": 2.6143790849673207e-07, + "logits/chosen": -2.8098397254943848, + "logits/rejected": -2.707000494003296, + "logps/chosen": -281.877197265625, + "logps/rejected": -259.3277587890625, + "loss": 0.6937, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.0002893812779802829, + "rewards/margins": -0.0011306366650387645, + "rewards/rejected": 0.000841255416162312, + "step": 40 + }, + { + "epoch": 0.01, + "learning_rate": 3.267973856209151e-07, + "logits/chosen": -2.8089194297790527, + "logits/rejected": -2.823747396469116, + "logps/chosen": -242.1394805908203, + "logps/rejected": -241.009521484375, + "loss": 0.6932, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.00046527199447155, + "rewards/margins": -0.0001682665169937536, + "rewards/rejected": 0.000633538409601897, + "step": 50 + }, + { + "epoch": 0.01, + "learning_rate": 3.921568627450981e-07, + "logits/chosen": -2.7848849296569824, + "logits/rejected": -2.7923500537872314, + "logps/chosen": -283.9176330566406, + "logps/rejected": -254.57522583007812, + "loss": 0.6931, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.0006139302277006209, + "rewards/margins": 0.00016295790555886924, + "rewards/rejected": 0.00045097232214175165, + "step": 60 + }, + { + "epoch": 0.01, + "learning_rate": 4.5751633986928105e-07, + "logits/chosen": -2.775444746017456, + "logits/rejected": -2.773970127105713, + "logps/chosen": -264.4169616699219, + "logps/rejected": -220.95669555664062, + "loss": 0.6931, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": 0.0009447624906897545, + "rewards/margins": 2.7601974579738453e-05, + "rewards/rejected": 0.0009171604178845882, + "step": 70 + }, + { + "epoch": 0.01, + "learning_rate": 5.228758169934641e-07, + "logits/chosen": -2.7369792461395264, + "logits/rejected": -2.719099760055542, + "logps/chosen": -305.594482421875, + "logps/rejected": -296.13848876953125, + "loss": 0.693, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.00130544847343117, + "rewards/margins": 0.0002942857681773603, + "rewards/rejected": 0.0010111627634614706, + "step": 80 + }, + { + "epoch": 0.01, + "learning_rate": 5.882352941176471e-07, + "logits/chosen": -2.8638012409210205, + "logits/rejected": -2.7611491680145264, + "logps/chosen": -269.5434265136719, + "logps/rejected": -238.3434295654297, + "loss": 0.6931, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0009442205773666501, + "rewards/margins": 0.00013440940529108047, + "rewards/rejected": 0.0008098110556602478, + "step": 90 + }, + { + "epoch": 0.01, + "learning_rate": 6.535947712418302e-07, + "logits/chosen": -2.756626605987549, + "logits/rejected": -2.762937068939209, + "logps/chosen": -247.02676391601562, + "logps/rejected": -251.1614990234375, + "loss": 0.6934, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.0012332749320194125, + "rewards/margins": -0.00041856098687276244, + "rewards/rejected": 0.001651835860684514, + "step": 100 + }, + { + "epoch": 0.01, + "eval_logits/chosen": -2.608107089996338, + "eval_logits/rejected": -2.576101303100586, + "eval_logps/chosen": -285.0846862792969, + "eval_logps/rejected": -263.6873474121094, + "eval_loss": 0.6930220723152161, + "eval_rewards/accuracies": 0.5120000243186951, + "eval_rewards/chosen": 0.002080138074234128, + "eval_rewards/margins": 0.00025725935120135546, + "eval_rewards/rejected": 0.0018228788394480944, + "eval_runtime": 1173.2958, + "eval_samples_per_second": 1.705, + "eval_steps_per_second": 0.852, + "step": 100 + }, + { + "epoch": 0.01, + "learning_rate": 7.189542483660131e-07, + "logits/chosen": -2.756783962249756, + "logits/rejected": -2.710714101791382, + "logps/chosen": -289.2729797363281, + "logps/rejected": -269.6139221191406, + "loss": 0.6928, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.002697740215808153, + "rewards/margins": 0.0007278420380316675, + "rewards/rejected": 0.0019698983523994684, + "step": 110 + }, + { + "epoch": 0.02, + "learning_rate": 7.843137254901962e-07, + "logits/chosen": -2.792996406555176, + "logits/rejected": -2.717787504196167, + "logps/chosen": -303.8896789550781, + "logps/rejected": -243.03982543945312, + "loss": 0.6927, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.001842958852648735, + "rewards/margins": 0.0008399001089856029, + "rewards/rejected": 0.0010030587436631322, + "step": 120 + }, + { + "epoch": 0.02, + "learning_rate": 8.496732026143792e-07, + "logits/chosen": -2.7940175533294678, + "logits/rejected": -2.771402359008789, + "logps/chosen": -265.6170349121094, + "logps/rejected": -236.84207153320312, + "loss": 0.693, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.00197789934463799, + "rewards/margins": 0.00039560170262120664, + "rewards/rejected": 0.0015822972636669874, + "step": 130 + }, + { + "epoch": 0.02, + "learning_rate": 9.150326797385621e-07, + "logits/chosen": -2.7395403385162354, + "logits/rejected": -2.6864776611328125, + "logps/chosen": -279.4371032714844, + "logps/rejected": -249.57913208007812, + "loss": 0.6924, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.003136344952508807, + "rewards/margins": 0.0014974649529904127, + "rewards/rejected": 0.0016388805815950036, + "step": 140 + }, + { + "epoch": 0.02, + "learning_rate": 9.80392156862745e-07, + "logits/chosen": -2.8249659538269043, + "logits/rejected": -2.7408270835876465, + "logps/chosen": -335.9043884277344, + "logps/rejected": -297.19586181640625, + "loss": 0.6926, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.0033657816238701344, + "rewards/margins": 0.0010477075120434165, + "rewards/rejected": 0.002318073995411396, + "step": 150 + }, + { + "epoch": 0.02, + "learning_rate": 1.0457516339869283e-06, + "logits/chosen": -2.8694610595703125, + "logits/rejected": -2.738370656967163, + "logps/chosen": -262.63018798828125, + "logps/rejected": -240.7825469970703, + "loss": 0.6931, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.003096712287515402, + "rewards/margins": 0.00016818303265608847, + "rewards/rejected": 0.0029285294003784657, + "step": 160 + }, + { + "epoch": 0.02, + "learning_rate": 1.111111111111111e-06, + "logits/chosen": -2.7636828422546387, + "logits/rejected": -2.7123844623565674, + "logps/chosen": -245.6793975830078, + "logps/rejected": -231.6672821044922, + "loss": 0.6927, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.0035004685632884502, + "rewards/margins": 0.0009301775135099888, + "rewards/rejected": 0.0025702915154397488, + "step": 170 + }, + { + "epoch": 0.02, + "learning_rate": 1.1764705882352942e-06, + "logits/chosen": -2.8375208377838135, + "logits/rejected": -2.639805316925049, + "logps/chosen": -346.8587341308594, + "logps/rejected": -268.3854675292969, + "loss": 0.6917, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.005277971271425486, + "rewards/margins": 0.0029592241626232862, + "rewards/rejected": 0.0023187468759715557, + "step": 180 + }, + { + "epoch": 0.02, + "learning_rate": 1.2418300653594772e-06, + "logits/chosen": -2.6728103160858154, + "logits/rejected": -2.654719829559326, + "logps/chosen": -266.8128662109375, + "logps/rejected": -227.26626586914062, + "loss": 0.6927, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.006007979158312082, + "rewards/margins": 0.0008784201927483082, + "rewards/rejected": 0.005129558499902487, + "step": 190 + }, + { + "epoch": 0.03, + "learning_rate": 1.3071895424836604e-06, + "logits/chosen": -2.8150954246520996, + "logits/rejected": -2.7260398864746094, + "logps/chosen": -261.29449462890625, + "logps/rejected": -262.1349182128906, + "loss": 0.6921, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.006078018341213465, + "rewards/margins": 0.0021574501879513264, + "rewards/rejected": 0.003920567687600851, + "step": 200 + }, + { + "epoch": 0.03, + "eval_logits/chosen": -2.6088781356811523, + "eval_logits/rejected": -2.576608657836914, + "eval_logps/chosen": -284.6488037109375, + "eval_logps/rejected": -263.39703369140625, + "eval_loss": 0.6922996640205383, + "eval_rewards/accuracies": 0.5820000171661377, + "eval_rewards/chosen": 0.00643900316208601, + "eval_rewards/margins": 0.0017128386534750462, + "eval_rewards/rejected": 0.0047261640429496765, + "eval_runtime": 1173.3945, + "eval_samples_per_second": 1.704, + "eval_steps_per_second": 0.852, + "step": 200 + }, + { + "epoch": 0.03, + "learning_rate": 1.3725490196078434e-06, + "logits/chosen": -2.8238754272460938, + "logits/rejected": -2.802713632583618, + "logps/chosen": -283.5692443847656, + "logps/rejected": -258.04443359375, + "loss": 0.6918, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.00797777995467186, + "rewards/margins": 0.0026797554455697536, + "rewards/rejected": 0.005298024974763393, + "step": 210 + }, + { + "epoch": 0.03, + "learning_rate": 1.4379084967320261e-06, + "logits/chosen": -2.720637083053589, + "logits/rejected": -2.683905839920044, + "logps/chosen": -285.0937194824219, + "logps/rejected": -259.00048828125, + "loss": 0.6916, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.007448518183082342, + "rewards/margins": 0.003169125411659479, + "rewards/rejected": 0.004279392305761576, + "step": 220 + }, + { + "epoch": 0.03, + "learning_rate": 1.5032679738562091e-06, + "logits/chosen": -2.6920247077941895, + "logits/rejected": -2.685770034790039, + "logps/chosen": -236.8917694091797, + "logps/rejected": -278.4407958984375, + "loss": 0.6929, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.006391332484781742, + "rewards/margins": 0.0004684348532464355, + "rewards/rejected": 0.005922897718846798, + "step": 230 + }, + { + "epoch": 0.03, + "learning_rate": 1.5686274509803923e-06, + "logits/chosen": -2.7400903701782227, + "logits/rejected": -2.697659969329834, + "logps/chosen": -250.9820098876953, + "logps/rejected": -268.54437255859375, + "loss": 0.6931, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.005527496337890625, + "rewards/margins": 8.923767745727673e-05, + "rewards/rejected": 0.0054382579401135445, + "step": 240 + }, + { + "epoch": 0.03, + "learning_rate": 1.6339869281045753e-06, + "logits/chosen": -2.7918593883514404, + "logits/rejected": -2.662835121154785, + "logps/chosen": -315.0617980957031, + "logps/rejected": -275.05499267578125, + "loss": 0.6917, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.007765919901430607, + "rewards/margins": 0.002958547091111541, + "rewards/rejected": 0.00480737304314971, + "step": 250 + }, + { + "epoch": 0.03, + "learning_rate": 1.6993464052287585e-06, + "logits/chosen": -2.8143398761749268, + "logits/rejected": -2.777773857116699, + "logps/chosen": -297.04248046875, + "logps/rejected": -261.8209228515625, + "loss": 0.691, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.008783089928328991, + "rewards/margins": 0.004354340024292469, + "rewards/rejected": 0.0044287508353590965, + "step": 260 + }, + { + "epoch": 0.04, + "learning_rate": 1.7647058823529414e-06, + "logits/chosen": -2.741014003753662, + "logits/rejected": -2.662790060043335, + "logps/chosen": -254.10281372070312, + "logps/rejected": -228.9877166748047, + "loss": 0.6912, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.007920559495687485, + "rewards/margins": 0.0039781928062438965, + "rewards/rejected": 0.003942367620766163, + "step": 270 + }, + { + "epoch": 0.04, + "learning_rate": 1.8300653594771242e-06, + "logits/chosen": -2.8170313835144043, + "logits/rejected": -2.7090656757354736, + "logps/chosen": -317.5547790527344, + "logps/rejected": -283.766845703125, + "loss": 0.6911, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.009623361751437187, + "rewards/margins": 0.004096606746315956, + "rewards/rejected": 0.005526755005121231, + "step": 280 + }, + { + "epoch": 0.04, + "learning_rate": 1.8954248366013072e-06, + "logits/chosen": -2.7824151515960693, + "logits/rejected": -2.7288193702697754, + "logps/chosen": -291.642333984375, + "logps/rejected": -274.235107421875, + "loss": 0.6902, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.011028922162950039, + "rewards/margins": 0.005957460962235928, + "rewards/rejected": 0.005071460269391537, + "step": 290 + }, + { + "epoch": 0.04, + "learning_rate": 1.96078431372549e-06, + "logits/chosen": -2.8855767250061035, + "logits/rejected": -2.769399642944336, + "logps/chosen": -304.3223876953125, + "logps/rejected": -249.46762084960938, + "loss": 0.6913, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.01247901190072298, + "rewards/margins": 0.0038229345809668303, + "rewards/rejected": 0.008656077086925507, + "step": 300 + }, + { + "epoch": 0.04, + "eval_logits/chosen": -2.6104612350463867, + "eval_logits/rejected": -2.5774478912353516, + "eval_logps/chosen": -284.02532958984375, + "eval_logps/rejected": -263.038330078125, + "eval_loss": 0.6910020709037781, + "eval_rewards/accuracies": 0.6194999814033508, + "eval_rewards/chosen": 0.012673730961978436, + "eval_rewards/margins": 0.004360521212220192, + "eval_rewards/rejected": 0.008313210681080818, + "eval_runtime": 1173.212, + "eval_samples_per_second": 1.705, + "eval_steps_per_second": 0.852, + "step": 300 + }, + { + "epoch": 0.04, + "learning_rate": 2.0261437908496734e-06, + "logits/chosen": -2.726591110229492, + "logits/rejected": -2.7057387828826904, + "logps/chosen": -290.943359375, + "logps/rejected": -283.51690673828125, + "loss": 0.6918, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.013583977706730366, + "rewards/margins": 0.002739064861088991, + "rewards/rejected": 0.010844913311302662, + "step": 310 + }, + { + "epoch": 0.04, + "learning_rate": 2.0915032679738565e-06, + "logits/chosen": -2.77579927444458, + "logits/rejected": -2.6547343730926514, + "logps/chosen": -264.1826477050781, + "logps/rejected": -255.95779418945312, + "loss": 0.6916, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.013339856639504433, + "rewards/margins": 0.0032375603914260864, + "rewards/rejected": 0.010102294385433197, + "step": 320 + }, + { + "epoch": 0.04, + "learning_rate": 2.1568627450980393e-06, + "logits/chosen": -2.8468410968780518, + "logits/rejected": -2.7400074005126953, + "logps/chosen": -303.87384033203125, + "logps/rejected": -253.5267791748047, + "loss": 0.6912, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.014395073056221008, + "rewards/margins": 0.003921784460544586, + "rewards/rejected": 0.010473288595676422, + "step": 330 + }, + { + "epoch": 0.04, + "learning_rate": 2.222222222222222e-06, + "logits/chosen": -2.795454978942871, + "logits/rejected": -2.681230306625366, + "logps/chosen": -272.3973693847656, + "logps/rejected": -230.4618682861328, + "loss": 0.6904, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.015390843152999878, + "rewards/margins": 0.0054797702468931675, + "rewards/rejected": 0.009911073371767998, + "step": 340 + }, + { + "epoch": 0.05, + "learning_rate": 2.2875816993464053e-06, + "logits/chosen": -2.806203603744507, + "logits/rejected": -2.747692108154297, + "logps/chosen": -321.1043395996094, + "logps/rejected": -270.4984130859375, + "loss": 0.6891, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.018319781869649887, + "rewards/margins": 0.008197757415473461, + "rewards/rejected": 0.010122022591531277, + "step": 350 + }, + { + "epoch": 0.05, + "learning_rate": 2.3529411764705885e-06, + "logits/chosen": -2.84110426902771, + "logits/rejected": -2.7785239219665527, + "logps/chosen": -265.24749755859375, + "logps/rejected": -260.6332092285156, + "loss": 0.6908, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.018875570967793465, + "rewards/margins": 0.004776433110237122, + "rewards/rejected": 0.014099137857556343, + "step": 360 + }, + { + "epoch": 0.05, + "learning_rate": 2.4183006535947716e-06, + "logits/chosen": -2.7692885398864746, + "logits/rejected": -2.760307788848877, + "logps/chosen": -269.5130920410156, + "logps/rejected": -244.1840362548828, + "loss": 0.6889, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.01895751990377903, + "rewards/margins": 0.008716282434761524, + "rewards/rejected": 0.010241237469017506, + "step": 370 + }, + { + "epoch": 0.05, + "learning_rate": 2.4836601307189544e-06, + "logits/chosen": -2.8237204551696777, + "logits/rejected": -2.781487464904785, + "logps/chosen": -289.22412109375, + "logps/rejected": -254.5786590576172, + "loss": 0.6886, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.020854527130723, + "rewards/margins": 0.00919763371348381, + "rewards/rejected": 0.01165689341723919, + "step": 380 + }, + { + "epoch": 0.05, + "learning_rate": 2.549019607843137e-06, + "logits/chosen": -2.7682974338531494, + "logits/rejected": -2.6853373050689697, + "logps/chosen": -305.08367919921875, + "logps/rejected": -246.4702606201172, + "loss": 0.6882, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.023305263370275497, + "rewards/margins": 0.010024776682257652, + "rewards/rejected": 0.013280488550662994, + "step": 390 + }, + { + "epoch": 0.05, + "learning_rate": 2.6143790849673208e-06, + "logits/chosen": -2.813638925552368, + "logits/rejected": -2.840217113494873, + "logps/chosen": -289.2650451660156, + "logps/rejected": -301.24688720703125, + "loss": 0.6888, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.026052230969071388, + "rewards/margins": 0.008794544264674187, + "rewards/rejected": 0.0172576867043972, + "step": 400 + }, + { + "epoch": 0.05, + "eval_logits/chosen": -2.6114237308502197, + "eval_logits/rejected": -2.5777738094329834, + "eval_logps/chosen": -282.9473876953125, + "eval_logps/rejected": -262.29913330078125, + "eval_loss": 0.6893764734268188, + "eval_rewards/accuracies": 0.6209999918937683, + "eval_rewards/chosen": 0.02345350943505764, + "eval_rewards/margins": 0.007748373784124851, + "eval_rewards/rejected": 0.015705134719610214, + "eval_runtime": 1173.1663, + "eval_samples_per_second": 1.705, + "eval_steps_per_second": 0.852, + "step": 400 + }, + { + "epoch": 0.05, + "learning_rate": 2.6797385620915036e-06, + "logits/chosen": -2.747955083847046, + "logits/rejected": -2.6892735958099365, + "logps/chosen": -263.50299072265625, + "logps/rejected": -218.99850463867188, + "loss": 0.69, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.022609690204262733, + "rewards/margins": 0.006450247950851917, + "rewards/rejected": 0.01615944132208824, + "step": 410 + }, + { + "epoch": 0.05, + "learning_rate": 2.7450980392156867e-06, + "logits/chosen": -2.7311861515045166, + "logits/rejected": -2.6678895950317383, + "logps/chosen": -280.86883544921875, + "logps/rejected": -280.25823974609375, + "loss": 0.6894, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.022379932925105095, + "rewards/margins": 0.007762663997709751, + "rewards/rejected": 0.014617268927395344, + "step": 420 + }, + { + "epoch": 0.06, + "learning_rate": 2.8104575163398695e-06, + "logits/chosen": -2.8040578365325928, + "logits/rejected": -2.7093348503112793, + "logps/chosen": -287.013671875, + "logps/rejected": -269.5040588378906, + "loss": 0.6896, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.024479230865836143, + "rewards/margins": 0.0074189514853060246, + "rewards/rejected": 0.017060281708836555, + "step": 430 + }, + { + "epoch": 0.06, + "learning_rate": 2.8758169934640523e-06, + "logits/chosen": -2.806948661804199, + "logits/rejected": -2.7780017852783203, + "logps/chosen": -272.99945068359375, + "logps/rejected": -245.48623657226562, + "loss": 0.6872, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.026299938559532166, + "rewards/margins": 0.012182589620351791, + "rewards/rejected": 0.014117350801825523, + "step": 440 + }, + { + "epoch": 0.06, + "learning_rate": 2.9411764705882355e-06, + "logits/chosen": -2.7656569480895996, + "logits/rejected": -2.783059597015381, + "logps/chosen": -290.8000183105469, + "logps/rejected": -304.024169921875, + "loss": 0.6886, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.027255898341536522, + "rewards/margins": 0.009453834965825081, + "rewards/rejected": 0.01780206523835659, + "step": 450 + }, + { + "epoch": 0.06, + "learning_rate": 3.0065359477124182e-06, + "logits/chosen": -2.6999731063842773, + "logits/rejected": -2.675001859664917, + "logps/chosen": -257.7070007324219, + "logps/rejected": -248.4289093017578, + "loss": 0.6869, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.032838813960552216, + "rewards/margins": 0.012781324796378613, + "rewards/rejected": 0.02005748823285103, + "step": 460 + }, + { + "epoch": 0.06, + "learning_rate": 3.071895424836602e-06, + "logits/chosen": -2.782094717025757, + "logits/rejected": -2.7665746212005615, + "logps/chosen": -279.497314453125, + "logps/rejected": -260.2268981933594, + "loss": 0.6857, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.037720657885074615, + "rewards/margins": 0.01530242245644331, + "rewards/rejected": 0.02241823635995388, + "step": 470 + }, + { + "epoch": 0.06, + "learning_rate": 3.1372549019607846e-06, + "logits/chosen": -2.7836313247680664, + "logits/rejected": -2.7629621028900146, + "logps/chosen": -286.4947204589844, + "logps/rejected": -240.4879608154297, + "loss": 0.6843, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.03812272474169731, + "rewards/margins": 0.018193546682596207, + "rewards/rejected": 0.019929179921746254, + "step": 480 + }, + { + "epoch": 0.06, + "learning_rate": 3.2026143790849674e-06, + "logits/chosen": -2.810976266860962, + "logits/rejected": -2.720935821533203, + "logps/chosen": -276.6731262207031, + "logps/rejected": -224.75808715820312, + "loss": 0.686, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.03245459124445915, + "rewards/margins": 0.014776689000427723, + "rewards/rejected": 0.017677903175354004, + "step": 490 + }, + { + "epoch": 0.07, + "learning_rate": 3.2679738562091506e-06, + "logits/chosen": -2.761012315750122, + "logits/rejected": -2.6800663471221924, + "logps/chosen": -254.37905883789062, + "logps/rejected": -234.1659698486328, + "loss": 0.6881, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.025483015924692154, + "rewards/margins": 0.01065473910421133, + "rewards/rejected": 0.014828277751803398, + "step": 500 + }, + { + "epoch": 0.07, + "eval_logits/chosen": -2.601078987121582, + "eval_logits/rejected": -2.5648090839385986, + "eval_logps/chosen": -282.0685119628906, + "eval_logps/rejected": -262.00579833984375, + "eval_loss": 0.6866379380226135, + "eval_rewards/accuracies": 0.621999979019165, + "eval_rewards/chosen": 0.03224240243434906, + "eval_rewards/margins": 0.013603860512375832, + "eval_rewards/rejected": 0.01863854192197323, + "eval_runtime": 1173.0502, + "eval_samples_per_second": 1.705, + "eval_steps_per_second": 0.852, + "step": 500 + }, + { + "epoch": 0.07, + "learning_rate": 3.3333333333333333e-06, + "logits/chosen": -2.649364709854126, + "logits/rejected": -2.695103645324707, + "logps/chosen": -263.63897705078125, + "logps/rejected": -262.0106201171875, + "loss": 0.6882, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.03161431476473808, + "rewards/margins": 0.010357612743973732, + "rewards/rejected": 0.0212567001581192, + "step": 510 + }, + { + "epoch": 0.07, + "learning_rate": 3.398692810457517e-06, + "logits/chosen": -2.7090542316436768, + "logits/rejected": -2.6017651557922363, + "logps/chosen": -256.1787109375, + "logps/rejected": -256.99066162109375, + "loss": 0.6846, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.03309843689203262, + "rewards/margins": 0.017676886171102524, + "rewards/rejected": 0.015421552583575249, + "step": 520 + }, + { + "epoch": 0.07, + "learning_rate": 3.4640522875816997e-06, + "logits/chosen": -2.7257823944091797, + "logits/rejected": -2.676396369934082, + "logps/chosen": -262.1148681640625, + "logps/rejected": -240.3489990234375, + "loss": 0.685, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.031875304877758026, + "rewards/margins": 0.016705092042684555, + "rewards/rejected": 0.01517021656036377, + "step": 530 + }, + { + "epoch": 0.07, + "learning_rate": 3.529411764705883e-06, + "logits/chosen": -2.7843520641326904, + "logits/rejected": -2.6656417846679688, + "logps/chosen": -286.06494140625, + "logps/rejected": -263.2212829589844, + "loss": 0.6845, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.03619309142231941, + "rewards/margins": 0.01780475489795208, + "rewards/rejected": 0.018388336524367332, + "step": 540 + }, + { + "epoch": 0.07, + "learning_rate": 3.5947712418300657e-06, + "logits/chosen": -2.7776148319244385, + "logits/rejected": -2.716484546661377, + "logps/chosen": -255.5173797607422, + "logps/rejected": -238.3636474609375, + "loss": 0.6779, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.04090049862861633, + "rewards/margins": 0.031408317387104034, + "rewards/rejected": 0.00949217937886715, + "step": 550 + }, + { + "epoch": 0.07, + "learning_rate": 3.6601307189542484e-06, + "logits/chosen": -2.7813169956207275, + "logits/rejected": -2.696591854095459, + "logps/chosen": -288.11907958984375, + "logps/rejected": -301.1776428222656, + "loss": 0.6838, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.02972649596631527, + "rewards/margins": 0.019590891897678375, + "rewards/rejected": 0.010135604068636894, + "step": 560 + }, + { + "epoch": 0.07, + "learning_rate": 3.7254901960784316e-06, + "logits/chosen": -2.866082191467285, + "logits/rejected": -2.8021016120910645, + "logps/chosen": -271.49017333984375, + "logps/rejected": -257.3333435058594, + "loss": 0.6859, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.02512175962328911, + "rewards/margins": 0.015149513259530067, + "rewards/rejected": 0.009972251020371914, + "step": 570 + }, + { + "epoch": 0.08, + "learning_rate": 3.7908496732026144e-06, + "logits/chosen": -2.798537015914917, + "logits/rejected": -2.652083396911621, + "logps/chosen": -316.0807189941406, + "logps/rejected": -284.3338623046875, + "loss": 0.683, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.03139640763401985, + "rewards/margins": 0.021447105333209038, + "rewards/rejected": 0.009949302300810814, + "step": 580 + }, + { + "epoch": 0.08, + "learning_rate": 3.856209150326798e-06, + "logits/chosen": -2.79559588432312, + "logits/rejected": -2.657886028289795, + "logps/chosen": -294.9044189453125, + "logps/rejected": -254.8910675048828, + "loss": 0.6833, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.03273575007915497, + "rewards/margins": 0.020848657935857773, + "rewards/rejected": 0.011887092143297195, + "step": 590 + }, + { + "epoch": 0.08, + "learning_rate": 3.92156862745098e-06, + "logits/chosen": -2.746609687805176, + "logits/rejected": -2.6399028301239014, + "logps/chosen": -243.6225128173828, + "logps/rejected": -247.07846069335938, + "loss": 0.6848, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.028709720820188522, + "rewards/margins": 0.01827172189950943, + "rewards/rejected": 0.010438000783324242, + "step": 600 + }, + { + "epoch": 0.08, + "eval_logits/chosen": -2.600626230239868, + "eval_logits/rejected": -2.5620691776275635, + "eval_logps/chosen": -281.3836364746094, + "eval_logps/rejected": -262.1441955566406, + "eval_loss": 0.6829456090927124, + "eval_rewards/accuracies": 0.6230000257492065, + "eval_rewards/chosen": 0.039090972393751144, + "eval_rewards/margins": 0.021836327388882637, + "eval_rewards/rejected": 0.017254654318094254, + "eval_runtime": 1173.2709, + "eval_samples_per_second": 1.705, + "eval_steps_per_second": 0.852, + "step": 600 + }, + { + "epoch": 0.08, + "learning_rate": 3.986928104575164e-06, + "logits/chosen": -2.7640035152435303, + "logits/rejected": -2.650482654571533, + "logps/chosen": -239.72201538085938, + "logps/rejected": -207.37960815429688, + "loss": 0.6815, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.05101596564054489, + "rewards/margins": 0.02423090860247612, + "rewards/rejected": 0.02678506076335907, + "step": 610 + }, + { + "epoch": 0.08, + "learning_rate": 4.052287581699347e-06, + "logits/chosen": -2.7376561164855957, + "logits/rejected": -2.6810410022735596, + "logps/chosen": -288.5543212890625, + "logps/rejected": -267.516845703125, + "loss": 0.6799, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.0445859432220459, + "rewards/margins": 0.027907002717256546, + "rewards/rejected": 0.016678940504789352, + "step": 620 + }, + { + "epoch": 0.08, + "learning_rate": 4.11764705882353e-06, + "logits/chosen": -2.727701425552368, + "logits/rejected": -2.6993868350982666, + "logps/chosen": -274.822509765625, + "logps/rejected": -256.6311340332031, + "loss": 0.6745, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.0639568492770195, + "rewards/margins": 0.03929334878921509, + "rewards/rejected": 0.02466350421309471, + "step": 630 + }, + { + "epoch": 0.08, + "learning_rate": 4.183006535947713e-06, + "logits/chosen": -2.822457790374756, + "logits/rejected": -2.756417751312256, + "logps/chosen": -271.39703369140625, + "logps/rejected": -258.93212890625, + "loss": 0.6856, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.04032561182975769, + "rewards/margins": 0.01741691306233406, + "rewards/rejected": 0.02290869876742363, + "step": 640 + }, + { + "epoch": 0.09, + "learning_rate": 4.2483660130718954e-06, + "logits/chosen": -2.7424185276031494, + "logits/rejected": -2.7166240215301514, + "logps/chosen": -269.97467041015625, + "logps/rejected": -256.1397399902344, + "loss": 0.6842, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.03450140729546547, + "rewards/margins": 0.021146176382899284, + "rewards/rejected": 0.013355230912566185, + "step": 650 + }, + { + "epoch": 0.09, + "learning_rate": 4.313725490196079e-06, + "logits/chosen": -2.773705005645752, + "logits/rejected": -2.6584715843200684, + "logps/chosen": -306.5790100097656, + "logps/rejected": -254.39697265625, + "loss": 0.6757, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.05402520298957825, + "rewards/margins": 0.03739168122410774, + "rewards/rejected": 0.016633519902825356, + "step": 660 + }, + { + "epoch": 0.09, + "learning_rate": 4.379084967320262e-06, + "logits/chosen": -2.8280460834503174, + "logits/rejected": -2.7472095489501953, + "logps/chosen": -293.5730285644531, + "logps/rejected": -309.07196044921875, + "loss": 0.685, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.0446760393679142, + "rewards/margins": 0.019286952912807465, + "rewards/rejected": 0.025389084592461586, + "step": 670 + }, + { + "epoch": 0.09, + "learning_rate": 4.444444444444444e-06, + "logits/chosen": -2.8368771076202393, + "logits/rejected": -2.781674861907959, + "logps/chosen": -280.62738037109375, + "logps/rejected": -264.23553466796875, + "loss": 0.6849, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.053720176219940186, + "rewards/margins": 0.01869060844182968, + "rewards/rejected": 0.035029567778110504, + "step": 680 + }, + { + "epoch": 0.09, + "learning_rate": 4.509803921568628e-06, + "logits/chosen": -2.809265613555908, + "logits/rejected": -2.7491776943206787, + "logps/chosen": -304.00579833984375, + "logps/rejected": -263.46368408203125, + "loss": 0.6759, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.05731926113367081, + "rewards/margins": 0.03783116117119789, + "rewards/rejected": 0.01948809251189232, + "step": 690 + }, + { + "epoch": 0.09, + "learning_rate": 4.5751633986928105e-06, + "logits/chosen": -2.769301414489746, + "logits/rejected": -2.728238105773926, + "logps/chosen": -280.30340576171875, + "logps/rejected": -268.40081787109375, + "loss": 0.6706, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.07635831832885742, + "rewards/margins": 0.0484110489487648, + "rewards/rejected": 0.027947265654802322, + "step": 700 + }, + { + "epoch": 0.09, + "eval_logits/chosen": -2.5860841274261475, + "eval_logits/rejected": -2.543696403503418, + "eval_logps/chosen": -280.1424560546875, + "eval_logps/rejected": -262.17578125, + "eval_loss": 0.6776183843612671, + "eval_rewards/accuracies": 0.6134999990463257, + "eval_rewards/chosen": 0.05150264874100685, + "eval_rewards/margins": 0.03456386178731918, + "eval_rewards/rejected": 0.01693878136575222, + "eval_runtime": 1173.3642, + "eval_samples_per_second": 1.705, + "eval_steps_per_second": 0.852, + "step": 700 + }, + { + "epoch": 0.09, + "learning_rate": 4.640522875816994e-06, + "logits/chosen": -2.8206183910369873, + "logits/rejected": -2.7675373554229736, + "logps/chosen": -285.945068359375, + "logps/rejected": -273.6590576171875, + "loss": 0.6718, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.06139975041151047, + "rewards/margins": 0.047468554228544235, + "rewards/rejected": 0.013931198045611382, + "step": 710 + }, + { + "epoch": 0.09, + "learning_rate": 4.705882352941177e-06, + "logits/chosen": -2.7912356853485107, + "logits/rejected": -2.746445655822754, + "logps/chosen": -325.3434753417969, + "logps/rejected": -304.920654296875, + "loss": 0.6697, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.047881126403808594, + "rewards/margins": 0.05247528478503227, + "rewards/rejected": -0.004594164900481701, + "step": 720 + }, + { + "epoch": 0.1, + "learning_rate": 4.77124183006536e-06, + "logits/chosen": -2.786642551422119, + "logits/rejected": -2.7573294639587402, + "logps/chosen": -289.21734619140625, + "logps/rejected": -281.2444152832031, + "loss": 0.6692, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.052878547459840775, + "rewards/margins": 0.05296853929758072, + "rewards/rejected": -8.999630517791957e-05, + "step": 730 + }, + { + "epoch": 0.1, + "learning_rate": 4.836601307189543e-06, + "logits/chosen": -2.795292377471924, + "logits/rejected": -2.666905641555786, + "logps/chosen": -305.7247314453125, + "logps/rejected": -264.77825927734375, + "loss": 0.6652, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.05517961457371712, + "rewards/margins": 0.06262197345495224, + "rewards/rejected": -0.007442360278218985, + "step": 740 + }, + { + "epoch": 0.1, + "learning_rate": 4.901960784313726e-06, + "logits/chosen": -2.666839361190796, + "logits/rejected": -2.665914297103882, + "logps/chosen": -250.14108276367188, + "logps/rejected": -241.32608032226562, + "loss": 0.656, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.03899381682276726, + "rewards/margins": 0.08067650347948074, + "rewards/rejected": -0.04168267175555229, + "step": 750 + }, + { + "epoch": 0.1, + "learning_rate": 4.967320261437909e-06, + "logits/chosen": -2.7957959175109863, + "logits/rejected": -2.714207172393799, + "logps/chosen": -285.6178894042969, + "logps/rejected": -246.8248291015625, + "loss": 0.6672, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.007369598839432001, + "rewards/margins": 0.05723084881901741, + "rewards/rejected": -0.04986124485731125, + "step": 760 + }, + { + "epoch": 0.1, + "learning_rate": 4.999993476542427e-06, + "logits/chosen": -2.751208782196045, + "logits/rejected": -2.791194438934326, + "logps/chosen": -294.9483642578125, + "logps/rejected": -281.4031066894531, + "loss": 0.6549, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.021667104214429855, + "rewards/margins": 0.08953282982110977, + "rewards/rejected": -0.06786571443080902, + "step": 770 + }, + { + "epoch": 0.1, + "learning_rate": 4.999941289086112e-06, + "logits/chosen": -2.803582191467285, + "logits/rejected": -2.6576297283172607, + "logps/chosen": -308.5997619628906, + "logps/rejected": -281.09771728515625, + "loss": 0.6482, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.021102851256728172, + "rewards/margins": 0.10389794409275055, + "rewards/rejected": -0.12500080466270447, + "step": 780 + }, + { + "epoch": 0.1, + "learning_rate": 4.999836915262896e-06, + "logits/chosen": -2.6909666061401367, + "logits/rejected": -2.71191668510437, + "logps/chosen": -300.32122802734375, + "logps/rejected": -303.9979553222656, + "loss": 0.6562, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.08398447185754776, + "rewards/margins": 0.09393363445997238, + "rewards/rejected": -0.17791810631752014, + "step": 790 + }, + { + "epoch": 0.1, + "learning_rate": 4.999680357251587e-06, + "logits/chosen": -2.566818952560425, + "logits/rejected": -2.565382242202759, + "logps/chosen": -273.0006103515625, + "logps/rejected": -289.88714599609375, + "loss": 0.6544, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.060392219573259354, + "rewards/margins": 0.09415189921855927, + "rewards/rejected": -0.15454411506652832, + "step": 800 + }, + { + "epoch": 0.1, + "eval_logits/chosen": -2.5676047801971436, + "eval_logits/rejected": -2.5207531452178955, + "eval_logps/chosen": -293.7215576171875, + "eval_logps/rejected": -279.89556884765625, + "eval_loss": 0.6649993658065796, + "eval_rewards/accuracies": 0.6065000295639038, + "eval_rewards/chosen": -0.0842883288860321, + "eval_rewards/margins": 0.0759705975651741, + "eval_rewards/rejected": -0.160258948802948, + "eval_runtime": 1173.6251, + "eval_samples_per_second": 1.704, + "eval_steps_per_second": 0.852, + "step": 800 + }, + { + "epoch": 0.11, + "learning_rate": 4.999471618320339e-06, + "logits/chosen": -2.781917095184326, + "logits/rejected": -2.65069317817688, + "logps/chosen": -305.3197326660156, + "logps/rejected": -281.01947021484375, + "loss": 0.6586, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.10152751207351685, + "rewards/margins": 0.0869065374135971, + "rewards/rejected": -0.18843403458595276, + "step": 810 + }, + { + "epoch": 0.11, + "learning_rate": 4.999210702826586e-06, + "logits/chosen": -2.884624719619751, + "logits/rejected": -2.808931827545166, + "logps/chosen": -329.1521301269531, + "logps/rejected": -292.8260498046875, + "loss": 0.6533, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.09741269052028656, + "rewards/margins": 0.09646307677030563, + "rewards/rejected": -0.1938757747411728, + "step": 820 + }, + { + "epoch": 0.11, + "learning_rate": 4.998897616216947e-06, + "logits/chosen": -2.705528974533081, + "logits/rejected": -2.759491205215454, + "logps/chosen": -258.15032958984375, + "logps/rejected": -292.4070129394531, + "loss": 0.6459, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.11559978872537613, + "rewards/margins": 0.11973357200622559, + "rewards/rejected": -0.23533335328102112, + "step": 830 + }, + { + "epoch": 0.11, + "learning_rate": 4.998532365027117e-06, + "logits/chosen": -2.665297269821167, + "logits/rejected": -2.577087879180908, + "logps/chosen": -315.3207092285156, + "logps/rejected": -264.4286804199219, + "loss": 0.654, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.14258244633674622, + "rewards/margins": 0.10882475227117538, + "rewards/rejected": -0.2514071762561798, + "step": 840 + }, + { + "epoch": 0.11, + "learning_rate": 4.9981149568817275e-06, + "logits/chosen": -2.703603506088257, + "logits/rejected": -2.700209140777588, + "logps/chosen": -307.2548522949219, + "logps/rejected": -332.7950439453125, + "loss": 0.6573, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.19128301739692688, + "rewards/margins": 0.09529605507850647, + "rewards/rejected": -0.28657907247543335, + "step": 850 + }, + { + "epoch": 0.11, + "learning_rate": 4.997645400494192e-06, + "logits/chosen": -2.7560465335845947, + "logits/rejected": -2.7364706993103027, + "logps/chosen": -284.1697692871094, + "logps/rejected": -289.9046325683594, + "loss": 0.6815, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.31733840703964233, + "rewards/margins": 0.0750778540968895, + "rewards/rejected": -0.3924162685871124, + "step": 860 + }, + { + "epoch": 0.11, + "learning_rate": 4.997123705666514e-06, + "logits/chosen": -2.727036237716675, + "logits/rejected": -2.6650333404541016, + "logps/chosen": -322.28765869140625, + "logps/rejected": -320.51031494140625, + "loss": 0.6651, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.2925903797149658, + "rewards/margins": 0.10269337892532349, + "rewards/rejected": -0.3952837586402893, + "step": 870 + }, + { + "epoch": 0.12, + "learning_rate": 4.996549883289093e-06, + "logits/chosen": -2.7099690437316895, + "logits/rejected": -2.6788439750671387, + "logps/chosen": -287.4695129394531, + "logps/rejected": -311.8743591308594, + "loss": 0.6602, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.24767592549324036, + "rewards/margins": 0.09456877410411835, + "rewards/rejected": -0.3422446846961975, + "step": 880 + }, + { + "epoch": 0.12, + "learning_rate": 4.995923945340495e-06, + "logits/chosen": -2.7683117389678955, + "logits/rejected": -2.7528557777404785, + "logps/chosen": -283.42852783203125, + "logps/rejected": -299.6573486328125, + "loss": 0.6573, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.15794335305690765, + "rewards/margins": 0.10221956670284271, + "rewards/rejected": -0.26016291975975037, + "step": 890 + }, + { + "epoch": 0.12, + "learning_rate": 4.995245904887195e-06, + "logits/chosen": -2.7562127113342285, + "logits/rejected": -2.6816959381103516, + "logps/chosen": -278.16796875, + "logps/rejected": -251.93893432617188, + "loss": 0.668, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.21272382140159607, + "rewards/margins": 0.08459267765283585, + "rewards/rejected": -0.2973165512084961, + "step": 900 + }, + { + "epoch": 0.12, + "eval_logits/chosen": -2.56126070022583, + "eval_logits/rejected": -2.5179994106292725, + "eval_logps/chosen": -302.1818542480469, + "eval_logps/rejected": -291.8527526855469, + "eval_loss": 0.655211329460144, + "eval_rewards/accuracies": 0.6169999837875366, + "eval_rewards/chosen": -0.16889113187789917, + "eval_rewards/margins": 0.11093967407941818, + "eval_rewards/rejected": -0.27983081340789795, + "eval_runtime": 1173.1095, + "eval_samples_per_second": 1.705, + "eval_steps_per_second": 0.852, + "step": 900 + }, + { + "epoch": 0.12, + "learning_rate": 4.994515776083313e-06, + "logits/chosen": -2.6447596549987793, + "logits/rejected": -2.7301154136657715, + "logps/chosen": -299.1502685546875, + "logps/rejected": -334.7135925292969, + "loss": 0.6481, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.1342255175113678, + "rewards/margins": 0.13406458497047424, + "rewards/rejected": -0.26829007267951965, + "step": 910 + }, + { + "epoch": 0.12, + "learning_rate": 4.993733574170316e-06, + "logits/chosen": -2.722386360168457, + "logits/rejected": -2.6634514331817627, + "logps/chosen": -253.35986328125, + "logps/rejected": -266.76409912109375, + "loss": 0.6321, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.11939827352762222, + "rewards/margins": 0.1619001179933548, + "rewards/rejected": -0.28129833936691284, + "step": 920 + }, + { + "epoch": 0.12, + "learning_rate": 4.992899315476696e-06, + "logits/chosen": -2.738288402557373, + "logits/rejected": -2.7077269554138184, + "logps/chosen": -335.08160400390625, + "logps/rejected": -319.16583251953125, + "loss": 0.6579, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.18058130145072937, + "rewards/margins": 0.11753213405609131, + "rewards/rejected": -0.2981134355068207, + "step": 930 + }, + { + "epoch": 0.12, + "learning_rate": 4.9920130174176354e-06, + "logits/chosen": -2.735929489135742, + "logits/rejected": -2.6511847972869873, + "logps/chosen": -314.0692443847656, + "logps/rejected": -319.5186462402344, + "loss": 0.6098, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.20987017452716827, + "rewards/margins": 0.21007618308067322, + "rewards/rejected": -0.4199463725090027, + "step": 940 + }, + { + "epoch": 0.12, + "learning_rate": 4.991074698494638e-06, + "logits/chosen": -2.791724681854248, + "logits/rejected": -2.652005434036255, + "logps/chosen": -311.9523620605469, + "logps/rejected": -291.4970703125, + "loss": 0.6306, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.2346021831035614, + "rewards/margins": 0.16210028529167175, + "rewards/rejected": -0.39670246839523315, + "step": 950 + }, + { + "epoch": 0.13, + "learning_rate": 4.990084378295148e-06, + "logits/chosen": -2.7419540882110596, + "logits/rejected": -2.7098803520202637, + "logps/chosen": -277.81427001953125, + "logps/rejected": -267.3054504394531, + "loss": 0.6452, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.23132053017616272, + "rewards/margins": 0.14752307534217834, + "rewards/rejected": -0.37884360551834106, + "step": 960 + }, + { + "epoch": 0.13, + "learning_rate": 4.989042077492135e-06, + "logits/chosen": -2.708400249481201, + "logits/rejected": -2.6803619861602783, + "logps/chosen": -316.9068298339844, + "logps/rejected": -321.396728515625, + "loss": 0.6019, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.2604824900627136, + "rewards/margins": 0.22696319222450256, + "rewards/rejected": -0.4874456822872162, + "step": 970 + }, + { + "epoch": 0.13, + "learning_rate": 4.987947817843665e-06, + "logits/chosen": -2.5916519165039062, + "logits/rejected": -2.6380982398986816, + "logps/chosen": -310.9526672363281, + "logps/rejected": -296.34222412109375, + "loss": 0.6496, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.4472068250179291, + "rewards/margins": 0.1646738201379776, + "rewards/rejected": -0.6118806600570679, + "step": 980 + }, + { + "epoch": 0.13, + "learning_rate": 4.986801622192453e-06, + "logits/chosen": -2.671600580215454, + "logits/rejected": -2.6104674339294434, + "logps/chosen": -275.8227233886719, + "logps/rejected": -276.8636474609375, + "loss": 0.6118, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.3716748356819153, + "rewards/margins": 0.22761638462543488, + "rewards/rejected": -0.599291205406189, + "step": 990 + }, + { + "epoch": 0.13, + "learning_rate": 4.985603514465372e-06, + "logits/chosen": -2.6798784732818604, + "logits/rejected": -2.6702020168304443, + "logps/chosen": -330.8289489746094, + "logps/rejected": -341.70623779296875, + "loss": 0.6285, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.4649590849876404, + "rewards/margins": 0.22645731270313263, + "rewards/rejected": -0.6914163827896118, + "step": 1000 + }, + { + "epoch": 0.13, + "eval_logits/chosen": -2.4939186573028564, + "eval_logits/rejected": -2.4563186168670654, + "eval_logps/chosen": -338.1634521484375, + "eval_logps/rejected": -335.08056640625, + "eval_loss": 0.6456961035728455, + "eval_rewards/accuracies": 0.6290000081062317, + "eval_rewards/chosen": -0.5287072062492371, + "eval_rewards/margins": 0.18340161442756653, + "eval_rewards/rejected": -0.7121089100837708, + "eval_runtime": 1172.976, + "eval_samples_per_second": 1.705, + "eval_steps_per_second": 0.853, + "step": 1000 + }, + { + "epoch": 0.13, + "learning_rate": 4.984353519672966e-06, + "logits/chosen": -2.6857194900512695, + "logits/rejected": -2.5498063564300537, + "logps/chosen": -330.83270263671875, + "logps/rejected": -315.6231994628906, + "loss": 0.6602, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.5528852939605713, + "rewards/margins": 0.13348236680030823, + "rewards/rejected": -0.6863676309585571, + "step": 1010 + }, + { + "epoch": 0.13, + "learning_rate": 4.9830516639089226e-06, + "logits/chosen": -2.660613536834717, + "logits/rejected": -2.6461730003356934, + "logps/chosen": -377.82635498046875, + "logps/rejected": -328.1082458496094, + "loss": 0.592, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5226212739944458, + "rewards/margins": 0.2594411373138428, + "rewards/rejected": -0.7820624113082886, + "step": 1020 + }, + { + "epoch": 0.13, + "learning_rate": 4.9816979743495296e-06, + "logits/chosen": -2.6877143383026123, + "logits/rejected": -2.6519954204559326, + "logps/chosen": -380.0502624511719, + "logps/rejected": -382.59869384765625, + "loss": 0.5997, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.6410335302352905, + "rewards/margins": 0.30625757575035095, + "rewards/rejected": -0.9472910761833191, + "step": 1030 + }, + { + "epoch": 0.14, + "learning_rate": 4.980292479253105e-06, + "logits/chosen": -2.7361416816711426, + "logits/rejected": -2.6469337940216064, + "logps/chosen": -386.8782958984375, + "logps/rejected": -373.96405029296875, + "loss": 0.5811, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.6625531911849976, + "rewards/margins": 0.34235090017318726, + "rewards/rejected": -1.00490403175354, + "step": 1040 + }, + { + "epoch": 0.14, + "learning_rate": 4.978835207959414e-06, + "logits/chosen": -2.579270839691162, + "logits/rejected": -2.5607879161834717, + "logps/chosen": -343.03533935546875, + "logps/rejected": -352.88916015625, + "loss": 0.6196, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.702850878238678, + "rewards/margins": 0.24091625213623047, + "rewards/rejected": -0.9437670707702637, + "step": 1050 + }, + { + "epoch": 0.14, + "learning_rate": 4.977326190889046e-06, + "logits/chosen": -2.577333927154541, + "logits/rejected": -2.3452649116516113, + "logps/chosen": -359.04669189453125, + "logps/rejected": -328.72467041015625, + "loss": 0.6402, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7241743803024292, + "rewards/margins": 0.2433900088071823, + "rewards/rejected": -0.9675644040107727, + "step": 1060 + }, + { + "epoch": 0.14, + "learning_rate": 4.975765459542788e-06, + "logits/chosen": -2.4829294681549072, + "logits/rejected": -2.470693826675415, + "logps/chosen": -329.3133239746094, + "logps/rejected": -346.67462158203125, + "loss": 0.5942, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.5526672601699829, + "rewards/margins": 0.3274114727973938, + "rewards/rejected": -0.8800787925720215, + "step": 1070 + }, + { + "epoch": 0.14, + "learning_rate": 4.9741530465009665e-06, + "logits/chosen": -2.445230007171631, + "logits/rejected": -2.4143424034118652, + "logps/chosen": -319.7929382324219, + "logps/rejected": -325.70654296875, + "loss": 0.6218, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5758613348007202, + "rewards/margins": 0.2517636716365814, + "rewards/rejected": -0.8276249766349792, + "step": 1080 + }, + { + "epoch": 0.14, + "learning_rate": 4.972488985422763e-06, + "logits/chosen": -2.496365547180176, + "logits/rejected": -2.4725289344787598, + "logps/chosen": -324.9794006347656, + "logps/rejected": -326.0638122558594, + "loss": 0.5809, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.608519971370697, + "rewards/margins": 0.37370020151138306, + "rewards/rejected": -0.9822202920913696, + "step": 1090 + }, + { + "epoch": 0.14, + "learning_rate": 4.970773311045514e-06, + "logits/chosen": -2.509723663330078, + "logits/rejected": -2.404963970184326, + "logps/chosen": -345.8732604980469, + "logps/rejected": -341.27020263671875, + "loss": 0.6741, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6360703110694885, + "rewards/margins": 0.16143734753131866, + "rewards/rejected": -0.7975075840950012, + "step": 1100 + }, + { + "epoch": 0.14, + "eval_logits/chosen": -2.322707414627075, + "eval_logits/rejected": -2.2815117835998535, + "eval_logps/chosen": -355.5893249511719, + "eval_logps/rejected": -358.6846618652344, + "eval_loss": 0.6395881772041321, + "eval_rewards/accuracies": 0.6305000185966492, + "eval_rewards/chosen": -0.7029657959938049, + "eval_rewards/margins": 0.24518389999866486, + "eval_rewards/rejected": -0.9481497406959534, + "eval_runtime": 1173.8672, + "eval_samples_per_second": 1.704, + "eval_steps_per_second": 0.852, + "step": 1100 + }, + { + "epoch": 0.15, + "learning_rate": 4.969006059183984e-06, + "logits/chosen": -2.5231308937072754, + "logits/rejected": -2.448873996734619, + "logps/chosen": -348.31427001953125, + "logps/rejected": -335.48345947265625, + "loss": 0.69, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.6456478238105774, + "rewards/margins": 0.12208940088748932, + "rewards/rejected": -0.7677371501922607, + "step": 1110 + }, + { + "epoch": 0.15, + "learning_rate": 4.967187266729623e-06, + "logits/chosen": -2.678138256072998, + "logits/rejected": -2.5693306922912598, + "logps/chosen": -338.0236511230469, + "logps/rejected": -344.2812194824219, + "loss": 0.6281, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.43570685386657715, + "rewards/margins": 0.19579991698265076, + "rewards/rejected": -0.6315068006515503, + "step": 1120 + }, + { + "epoch": 0.15, + "learning_rate": 4.965316971649791e-06, + "logits/chosen": -2.6538443565368652, + "logits/rejected": -2.5499930381774902, + "logps/chosen": -344.49639892578125, + "logps/rejected": -328.1519470214844, + "loss": 0.5859, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.3683456480503082, + "rewards/margins": 0.3128353953361511, + "rewards/rejected": -0.6811810731887817, + "step": 1130 + }, + { + "epoch": 0.15, + "learning_rate": 4.963395212986964e-06, + "logits/chosen": -2.6395363807678223, + "logits/rejected": -2.521418333053589, + "logps/chosen": -297.9776916503906, + "logps/rejected": -293.7052917480469, + "loss": 0.6067, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.4740704894065857, + "rewards/margins": 0.2535014748573303, + "rewards/rejected": -0.7275720238685608, + "step": 1140 + }, + { + "epoch": 0.15, + "learning_rate": 4.9614220308579285e-06, + "logits/chosen": -2.5091240406036377, + "logits/rejected": -2.5847342014312744, + "logps/chosen": -349.41888427734375, + "logps/rejected": -358.28790283203125, + "loss": 0.6277, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5796940922737122, + "rewards/margins": 0.2202250212430954, + "rewards/rejected": -0.799919068813324, + "step": 1150 + }, + { + "epoch": 0.15, + "learning_rate": 4.9593974664529325e-06, + "logits/chosen": -2.5793509483337402, + "logits/rejected": -2.458042621612549, + "logps/chosen": -345.2643127441406, + "logps/rejected": -371.79388427734375, + "loss": 0.6192, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.6415513753890991, + "rewards/margins": 0.27423813939094543, + "rewards/rejected": -0.9157896041870117, + "step": 1160 + }, + { + "epoch": 0.15, + "learning_rate": 4.957321562034833e-06, + "logits/chosen": -2.5987672805786133, + "logits/rejected": -2.5602588653564453, + "logps/chosen": -367.10552978515625, + "logps/rejected": -363.56866455078125, + "loss": 0.5996, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.6459072828292847, + "rewards/margins": 0.30040204524993896, + "rewards/rejected": -0.9463094472885132, + "step": 1170 + }, + { + "epoch": 0.15, + "learning_rate": 4.955194360938214e-06, + "logits/chosen": -2.5837607383728027, + "logits/rejected": -2.5810608863830566, + "logps/chosen": -349.5048828125, + "logps/rejected": -340.4737243652344, + "loss": 0.6318, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.7418169975280762, + "rewards/margins": 0.25447434186935425, + "rewards/rejected": -0.9962912797927856, + "step": 1180 + }, + { + "epoch": 0.16, + "learning_rate": 4.9530159075684735e-06, + "logits/chosen": -2.541612148284912, + "logits/rejected": -2.5287868976593018, + "logps/chosen": -328.6512451171875, + "logps/rejected": -410.9795837402344, + "loss": 0.6613, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.8162468671798706, + "rewards/margins": 0.17309090495109558, + "rewards/rejected": -0.9893378019332886, + "step": 1190 + }, + { + "epoch": 0.16, + "learning_rate": 4.950786247400908e-06, + "logits/chosen": -2.456754446029663, + "logits/rejected": -2.5195212364196777, + "logps/chosen": -329.7606201171875, + "logps/rejected": -354.2069091796875, + "loss": 0.605, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.7051799893379211, + "rewards/margins": 0.27783241868019104, + "rewards/rejected": -0.983012318611145, + "step": 1200 + }, + { + "epoch": 0.16, + "eval_logits/chosen": -2.2607674598693848, + "eval_logits/rejected": -2.2197513580322266, + "eval_logps/chosen": -356.06011962890625, + "eval_logps/rejected": -360.99627685546875, + "eval_loss": 0.6279371380805969, + "eval_rewards/accuracies": 0.637499988079071, + "eval_rewards/chosen": -0.7076742053031921, + "eval_rewards/margins": 0.26359233260154724, + "eval_rewards/rejected": -0.971266508102417, + "eval_runtime": 1173.3612, + "eval_samples_per_second": 1.705, + "eval_steps_per_second": 0.852, + "step": 1200 + }, + { + "epoch": 0.16, + "learning_rate": 4.948505426979756e-06, + "logits/chosen": -2.5039706230163574, + "logits/rejected": -2.4758121967315674, + "logps/chosen": -349.3768615722656, + "logps/rejected": -365.09967041015625, + "loss": 0.5967, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.7029610872268677, + "rewards/margins": 0.3289002776145935, + "rewards/rejected": -1.031861424446106, + "step": 1210 + }, + { + "epoch": 0.16, + "learning_rate": 4.946173493917228e-06, + "logits/chosen": -2.492128849029541, + "logits/rejected": -2.4254136085510254, + "logps/chosen": -362.53436279296875, + "logps/rejected": -328.02490234375, + "loss": 0.7295, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.8030228614807129, + "rewards/margins": 0.019256453961133957, + "rewards/rejected": -0.822279155254364, + "step": 1220 + }, + { + "epoch": 0.16, + "learning_rate": 4.943790496892513e-06, + "logits/chosen": -2.600513458251953, + "logits/rejected": -2.4696056842803955, + "logps/chosen": -316.8155212402344, + "logps/rejected": -307.95892333984375, + "loss": 0.5983, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.43885356187820435, + "rewards/margins": 0.3040347695350647, + "rewards/rejected": -0.7428882718086243, + "step": 1230 + }, + { + "epoch": 0.16, + "learning_rate": 4.941356485650762e-06, + "logits/chosen": -2.6881508827209473, + "logits/rejected": -2.6177926063537598, + "logps/chosen": -367.1802062988281, + "logps/rejected": -365.7328186035156, + "loss": 0.6168, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.4106849133968353, + "rewards/margins": 0.2384069859981537, + "rewards/rejected": -0.649091899394989, + "step": 1240 + }, + { + "epoch": 0.16, + "learning_rate": 4.93887151100205e-06, + "logits/chosen": -2.733797550201416, + "logits/rejected": -2.6489291191101074, + "logps/chosen": -366.79193115234375, + "logps/rejected": -345.00640869140625, + "loss": 0.6567, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.35443398356437683, + "rewards/margins": 0.1330409049987793, + "rewards/rejected": -0.48747485876083374, + "step": 1250 + }, + { + "epoch": 0.16, + "learning_rate": 4.936335624820313e-06, + "logits/chosen": -2.7169675827026367, + "logits/rejected": -2.6466755867004395, + "logps/chosen": -302.01953125, + "logps/rejected": -282.923828125, + "loss": 0.6326, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.24352756142616272, + "rewards/margins": 0.18360894918441772, + "rewards/rejected": -0.42713651061058044, + "step": 1260 + }, + { + "epoch": 0.17, + "learning_rate": 4.933748880042271e-06, + "logits/chosen": -2.7060999870300293, + "logits/rejected": -2.638232469558716, + "logps/chosen": -310.6798400878906, + "logps/rejected": -311.3609619140625, + "loss": 0.6077, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.3422698378562927, + "rewards/margins": 0.24755015969276428, + "rewards/rejected": -0.5898200273513794, + "step": 1270 + }, + { + "epoch": 0.17, + "learning_rate": 4.931111330666317e-06, + "logits/chosen": -2.575082540512085, + "logits/rejected": -2.4545376300811768, + "logps/chosen": -317.52264404296875, + "logps/rejected": -297.6214294433594, + "loss": 0.6191, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.47772637009620667, + "rewards/margins": 0.21179227530956268, + "rewards/rejected": -0.6895186901092529, + "step": 1280 + }, + { + "epoch": 0.17, + "learning_rate": 4.9284230317513906e-06, + "logits/chosen": -2.5893354415893555, + "logits/rejected": -2.5187082290649414, + "logps/chosen": -379.6683654785156, + "logps/rejected": -363.97015380859375, + "loss": 0.6043, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.6415658593177795, + "rewards/margins": 0.3069884777069092, + "rewards/rejected": -0.9485543370246887, + "step": 1290 + }, + { + "epoch": 0.17, + "learning_rate": 4.9256840394158325e-06, + "logits/chosen": -2.4693050384521484, + "logits/rejected": -2.432520627975464, + "logps/chosen": -361.9473571777344, + "logps/rejected": -426.77685546875, + "loss": 0.5844, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.6800118684768677, + "rewards/margins": 0.3689848780632019, + "rewards/rejected": -1.0489966869354248, + "step": 1300 + }, + { + "epoch": 0.17, + "eval_logits/chosen": -2.0743157863616943, + "eval_logits/rejected": -2.0337142944335938, + "eval_logps/chosen": -370.3147277832031, + "eval_logps/rejected": -378.0120544433594, + "eval_loss": 0.6228457093238831, + "eval_rewards/accuracies": 0.640999972820282, + "eval_rewards/chosen": -0.8502199649810791, + "eval_rewards/margins": 0.29120388627052307, + "eval_rewards/rejected": -1.1414238214492798, + "eval_runtime": 1172.7082, + "eval_samples_per_second": 1.705, + "eval_steps_per_second": 0.853, + "step": 1300 + }, + { + "epoch": 0.17, + "learning_rate": 4.922894410836207e-06, + "logits/chosen": -2.494260787963867, + "logits/rejected": -2.3333301544189453, + "logps/chosen": -394.7021789550781, + "logps/rejected": -361.4925842285156, + "loss": 0.635, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8935394287109375, + "rewards/margins": 0.2934940457344055, + "rewards/rejected": -1.1870336532592773, + "step": 1310 + }, + { + "epoch": 0.17, + "learning_rate": 4.920054204246116e-06, + "logits/chosen": -2.525690793991089, + "logits/rejected": -2.4263322353363037, + "logps/chosen": -370.66790771484375, + "logps/rejected": -344.0418395996094, + "loss": 0.634, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7464545965194702, + "rewards/margins": 0.2129075974225998, + "rewards/rejected": -0.9593623280525208, + "step": 1320 + }, + { + "epoch": 0.17, + "learning_rate": 4.9171634789349744e-06, + "logits/chosen": -2.4831061363220215, + "logits/rejected": -2.4301085472106934, + "logps/chosen": -344.4041748046875, + "logps/rejected": -388.18572998046875, + "loss": 0.5445, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.6461294293403625, + "rewards/margins": 0.44957056641578674, + "rewards/rejected": -1.0957000255584717, + "step": 1330 + }, + { + "epoch": 0.18, + "learning_rate": 4.914222295246782e-06, + "logits/chosen": -2.527456521987915, + "logits/rejected": -2.4752743244171143, + "logps/chosen": -345.05767822265625, + "logps/rejected": -359.32391357421875, + "loss": 0.6608, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.6638450622558594, + "rewards/margins": 0.15090003609657288, + "rewards/rejected": -0.8147451281547546, + "step": 1340 + }, + { + "epoch": 0.18, + "learning_rate": 4.911230714578858e-06, + "logits/chosen": -2.441866636276245, + "logits/rejected": -2.462878704071045, + "logps/chosen": -284.39227294921875, + "logps/rejected": -344.8713073730469, + "loss": 0.5629, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.500754177570343, + "rewards/margins": 0.40834903717041016, + "rewards/rejected": -0.9091032147407532, + "step": 1350 + }, + { + "epoch": 0.18, + "learning_rate": 4.908188799380558e-06, + "logits/chosen": -2.4615304470062256, + "logits/rejected": -2.425814390182495, + "logps/chosen": -331.05267333984375, + "logps/rejected": -332.3231201171875, + "loss": 0.5829, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.6858211755752563, + "rewards/margins": 0.3280574083328247, + "rewards/rejected": -1.013878583908081, + "step": 1360 + }, + { + "epoch": 0.18, + "learning_rate": 4.905096613151975e-06, + "logits/chosen": -2.3768835067749023, + "logits/rejected": -2.288085460662842, + "logps/chosen": -415.037109375, + "logps/rejected": -410.47821044921875, + "loss": 0.6679, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.0296709537506104, + "rewards/margins": 0.2172650396823883, + "rewards/rejected": -1.2469360828399658, + "step": 1370 + }, + { + "epoch": 0.18, + "learning_rate": 4.90195422044261e-06, + "logits/chosen": -2.4731457233428955, + "logits/rejected": -2.4283947944641113, + "logps/chosen": -387.6423645019531, + "logps/rejected": -397.9414367675781, + "loss": 0.5595, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.753621518611908, + "rewards/margins": 0.45417460799217224, + "rewards/rejected": -1.2077960968017578, + "step": 1380 + }, + { + "epoch": 0.18, + "learning_rate": 4.898761686850028e-06, + "logits/chosen": -2.353015422821045, + "logits/rejected": -2.2464370727539062, + "logps/chosen": -369.61572265625, + "logps/rejected": -382.53082275390625, + "loss": 0.6584, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.8926342725753784, + "rewards/margins": 0.2712360918521881, + "rewards/rejected": -1.1638704538345337, + "step": 1390 + }, + { + "epoch": 0.18, + "learning_rate": 4.895519079018485e-06, + "logits/chosen": -2.401740312576294, + "logits/rejected": -2.185915470123291, + "logps/chosen": -335.4225158691406, + "logps/rejected": -341.16680908203125, + "loss": 0.6085, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.6165248155593872, + "rewards/margins": 0.3225622773170471, + "rewards/rejected": -0.9390870332717896, + "step": 1400 + }, + { + "epoch": 0.18, + "eval_logits/chosen": -1.9741628170013428, + "eval_logits/rejected": -1.927571415901184, + "eval_logps/chosen": -346.9267883300781, + "eval_logps/rejected": -353.4969787597656, + "eval_loss": 0.6157093644142151, + "eval_rewards/accuracies": 0.656499981880188, + "eval_rewards/chosen": -0.6163406372070312, + "eval_rewards/margins": 0.27993249893188477, + "eval_rewards/rejected": -0.896273136138916, + "eval_runtime": 1173.3158, + "eval_samples_per_second": 1.705, + "eval_steps_per_second": 0.852, + "step": 1400 + }, + { + "epoch": 0.18, + "learning_rate": 4.89222646463754e-06, + "logits/chosen": -2.4668169021606445, + "logits/rejected": -2.419532060623169, + "logps/chosen": -343.31878662109375, + "logps/rejected": -360.8763427734375, + "loss": 0.63, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.6849584579467773, + "rewards/margins": 0.2665075957775116, + "rewards/rejected": -0.9514660835266113, + "step": 1410 + }, + { + "epoch": 0.19, + "learning_rate": 4.888883912440642e-06, + "logits/chosen": -2.4510014057159424, + "logits/rejected": -2.3843305110931396, + "logps/chosen": -395.0271911621094, + "logps/rejected": -407.986572265625, + "loss": 0.6226, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.68365079164505, + "rewards/margins": 0.29661715030670166, + "rewards/rejected": -0.9802680015563965, + "step": 1420 + }, + { + "epoch": 0.19, + "learning_rate": 4.885491492203688e-06, + "logits/chosen": -2.329678773880005, + "logits/rejected": -2.3171207904815674, + "logps/chosen": -349.471435546875, + "logps/rejected": -345.58251953125, + "loss": 0.6077, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.6296406388282776, + "rewards/margins": 0.24890565872192383, + "rewards/rejected": -0.8785463571548462, + "step": 1430 + }, + { + "epoch": 0.19, + "learning_rate": 4.882049274743578e-06, + "logits/chosen": -2.446080446243286, + "logits/rejected": -2.4080348014831543, + "logps/chosen": -392.8956298828125, + "logps/rejected": -388.02960205078125, + "loss": 0.5971, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6403361558914185, + "rewards/margins": 0.33641594648361206, + "rewards/rejected": -0.9767521023750305, + "step": 1440 + }, + { + "epoch": 0.19, + "learning_rate": 4.878557331916729e-06, + "logits/chosen": -2.3418636322021484, + "logits/rejected": -2.364252805709839, + "logps/chosen": -342.7167663574219, + "logps/rejected": -350.45538330078125, + "loss": 0.5703, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.7152738571166992, + "rewards/margins": 0.36363348364830017, + "rewards/rejected": -1.0789072513580322, + "step": 1450 + }, + { + "epoch": 0.19, + "learning_rate": 4.875015736617576e-06, + "logits/chosen": -2.3957436084747314, + "logits/rejected": -2.3168387413024902, + "logps/chosen": -424.3904724121094, + "logps/rejected": -406.2117614746094, + "loss": 0.5892, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7364000082015991, + "rewards/margins": 0.3339517116546631, + "rewards/rejected": -1.0703517198562622, + "step": 1460 + }, + { + "epoch": 0.19, + "learning_rate": 4.8714245627770515e-06, + "logits/chosen": -2.3729703426361084, + "logits/rejected": -2.184260129928589, + "logps/chosen": -353.2168273925781, + "logps/rejected": -339.24261474609375, + "loss": 0.6548, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.8984875679016113, + "rewards/margins": 0.20593862235546112, + "rewards/rejected": -1.1044261455535889, + "step": 1470 + }, + { + "epoch": 0.19, + "learning_rate": 4.8677838853610445e-06, + "logits/chosen": -2.2903103828430176, + "logits/rejected": -2.1878256797790527, + "logps/chosen": -370.137451171875, + "logps/rejected": -356.63104248046875, + "loss": 0.6219, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.9454523324966431, + "rewards/margins": 0.3196515440940857, + "rewards/rejected": -1.265103816986084, + "step": 1480 + }, + { + "epoch": 0.19, + "learning_rate": 4.864093780368828e-06, + "logits/chosen": -2.435291290283203, + "logits/rejected": -2.2713615894317627, + "logps/chosen": -396.46807861328125, + "logps/rejected": -377.0063781738281, + "loss": 0.5697, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.8792732357978821, + "rewards/margins": 0.3991313874721527, + "rewards/rejected": -1.2784045934677124, + "step": 1490 + }, + { + "epoch": 0.2, + "learning_rate": 4.860354324831482e-06, + "logits/chosen": -2.388760566711426, + "logits/rejected": -2.385594129562378, + "logps/chosen": -379.97344970703125, + "logps/rejected": -420.51934814453125, + "loss": 0.5887, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.9489212036132812, + "rewards/margins": 0.3839309811592102, + "rewards/rejected": -1.3328521251678467, + "step": 1500 + }, + { + "epoch": 0.2, + "eval_logits/chosen": -1.54761803150177, + "eval_logits/rejected": -1.5129750967025757, + "eval_logps/chosen": -390.6337890625, + "eval_logps/rejected": -404.723388671875, + "eval_loss": 0.6093257665634155, + "eval_rewards/accuracies": 0.6585000157356262, + "eval_rewards/chosen": -1.0534104108810425, + "eval_rewards/margins": 0.35512715578079224, + "eval_rewards/rejected": -1.40853750705719, + "eval_runtime": 1173.7666, + "eval_samples_per_second": 1.704, + "eval_steps_per_second": 0.852, + "step": 1500 + }, + { + "epoch": 0.2, + "learning_rate": 4.856565596810279e-06, + "logits/chosen": -2.269038438796997, + "logits/rejected": -2.240347385406494, + "logps/chosen": -350.50933837890625, + "logps/rejected": -381.20684814453125, + "loss": 0.6657, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.1637290716171265, + "rewards/margins": 0.2023475170135498, + "rewards/rejected": -1.3660767078399658, + "step": 1510 + }, + { + "epoch": 0.2, + "learning_rate": 4.852727675395056e-06, + "logits/chosen": -2.311117172241211, + "logits/rejected": -2.18780517578125, + "logps/chosen": -383.8539123535156, + "logps/rejected": -395.37872314453125, + "loss": 0.5408, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.0786935091018677, + "rewards/margins": 0.4756636619567871, + "rewards/rejected": -1.5543569326400757, + "step": 1520 + }, + { + "epoch": 0.2, + "learning_rate": 4.848840640702565e-06, + "logits/chosen": -2.2907707691192627, + "logits/rejected": -2.2663180828094482, + "logps/chosen": -388.94317626953125, + "logps/rejected": -380.520263671875, + "loss": 0.67, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.2756744623184204, + "rewards/margins": 0.2061425894498825, + "rewards/rejected": -1.4818170070648193, + "step": 1530 + }, + { + "epoch": 0.2, + "learning_rate": 4.844904573874798e-06, + "logits/chosen": -2.2316396236419678, + "logits/rejected": -2.1963393688201904, + "logps/chosen": -392.61492919921875, + "logps/rejected": -379.75390625, + "loss": 0.6004, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9927800297737122, + "rewards/margins": 0.39371055364608765, + "rewards/rejected": -1.3864905834197998, + "step": 1540 + }, + { + "epoch": 0.2, + "learning_rate": 4.840919557077297e-06, + "logits/chosen": -2.321735382080078, + "logits/rejected": -2.1763479709625244, + "logps/chosen": -388.51239013671875, + "logps/rejected": -379.34027099609375, + "loss": 0.6354, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0403954982757568, + "rewards/margins": 0.2822137475013733, + "rewards/rejected": -1.3226091861724854, + "step": 1550 + }, + { + "epoch": 0.2, + "learning_rate": 4.836885673497435e-06, + "logits/chosen": -2.3030014038085938, + "logits/rejected": -2.185213565826416, + "logps/chosen": -373.89276123046875, + "logps/rejected": -391.1092529296875, + "loss": 0.5617, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.8299756050109863, + "rewards/margins": 0.43793243169784546, + "rewards/rejected": -1.267907977104187, + "step": 1560 + }, + { + "epoch": 0.21, + "learning_rate": 4.832803007342679e-06, + "logits/chosen": -2.268263339996338, + "logits/rejected": -2.30707049369812, + "logps/chosen": -338.6516418457031, + "logps/rejected": -377.26324462890625, + "loss": 0.6173, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.8366347551345825, + "rewards/margins": 0.321827232837677, + "rewards/rejected": -1.1584619283676147, + "step": 1570 + }, + { + "epoch": 0.21, + "learning_rate": 4.828671643838839e-06, + "logits/chosen": -2.2386622428894043, + "logits/rejected": -2.1988167762756348, + "logps/chosen": -355.00958251953125, + "logps/rejected": -341.5582580566406, + "loss": 0.6171, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.7135469913482666, + "rewards/margins": 0.30205848813056946, + "rewards/rejected": -1.0156054496765137, + "step": 1580 + }, + { + "epoch": 0.21, + "learning_rate": 4.824491669228279e-06, + "logits/chosen": -2.232062816619873, + "logits/rejected": -2.1673014163970947, + "logps/chosen": -351.5841369628906, + "logps/rejected": -344.68939208984375, + "loss": 0.6685, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.8179405331611633, + "rewards/margins": 0.19087883830070496, + "rewards/rejected": -1.008819341659546, + "step": 1590 + }, + { + "epoch": 0.21, + "learning_rate": 4.8202631707681245e-06, + "logits/chosen": -2.2642502784729004, + "logits/rejected": -2.138714551925659, + "logps/chosen": -342.4280700683594, + "logps/rejected": -366.55609130859375, + "loss": 0.5585, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8875049352645874, + "rewards/margins": 0.4244559407234192, + "rewards/rejected": -1.3119609355926514, + "step": 1600 + }, + { + "epoch": 0.21, + "eval_logits/chosen": -1.4652458429336548, + "eval_logits/rejected": -1.4216023683547974, + "eval_logps/chosen": -370.8767395019531, + "eval_logps/rejected": -387.5893249511719, + "eval_loss": 0.6019625663757324, + "eval_rewards/accuracies": 0.6644999980926514, + "eval_rewards/chosen": -0.8558406829833984, + "eval_rewards/margins": 0.3813556730747223, + "eval_rewards/rejected": -1.2371965646743774, + "eval_runtime": 1173.7745, + "eval_samples_per_second": 1.704, + "eval_steps_per_second": 0.852, + "step": 1600 + }, + { + "epoch": 0.21, + "learning_rate": 4.815986236728437e-06, + "logits/chosen": -2.184940814971924, + "logits/rejected": -2.126659870147705, + "logps/chosen": -356.41668701171875, + "logps/rejected": -386.04779052734375, + "loss": 0.6132, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.8487674593925476, + "rewards/margins": 0.3206823468208313, + "rewards/rejected": -1.1694496870040894, + "step": 1610 + }, + { + "epoch": 0.21, + "learning_rate": 4.811660956390372e-06, + "logits/chosen": -2.3122637271881104, + "logits/rejected": -2.2805025577545166, + "logps/chosen": -394.89727783203125, + "logps/rejected": -388.9953918457031, + "loss": 0.6314, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.7255429029464722, + "rewards/margins": 0.28325191140174866, + "rewards/rejected": -1.0087947845458984, + "step": 1620 + }, + { + "epoch": 0.21, + "learning_rate": 4.807287420044319e-06, + "logits/chosen": -2.3865654468536377, + "logits/rejected": -2.369770050048828, + "logps/chosen": -314.7702331542969, + "logps/rejected": -348.518310546875, + "loss": 0.5655, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6648892164230347, + "rewards/margins": 0.48127803206443787, + "rewards/rejected": -1.1461671590805054, + "step": 1630 + }, + { + "epoch": 0.21, + "learning_rate": 4.802865718988008e-06, + "logits/chosen": -2.205326795578003, + "logits/rejected": -2.1454052925109863, + "logps/chosen": -348.91082763671875, + "logps/rejected": -410.8165588378906, + "loss": 0.6301, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.0370396375656128, + "rewards/margins": 0.2929023504257202, + "rewards/rejected": -1.329941987991333, + "step": 1640 + }, + { + "epoch": 0.22, + "learning_rate": 4.798395945524615e-06, + "logits/chosen": -2.2590596675872803, + "logits/rejected": -2.1765456199645996, + "logps/chosen": -394.92132568359375, + "logps/rejected": -415.81707763671875, + "loss": 0.5635, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.1871960163116455, + "rewards/margins": 0.4527507424354553, + "rewards/rejected": -1.6399469375610352, + "step": 1650 + }, + { + "epoch": 0.22, + "learning_rate": 4.793878192960823e-06, + "logits/chosen": -2.2939252853393555, + "logits/rejected": -2.1937241554260254, + "logps/chosen": -469.12811279296875, + "logps/rejected": -500.58837890625, + "loss": 0.6277, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.3709437847137451, + "rewards/margins": 0.4458431303501129, + "rewards/rejected": -1.8167870044708252, + "step": 1660 + }, + { + "epoch": 0.22, + "learning_rate": 4.789312555604887e-06, + "logits/chosen": -2.2132697105407715, + "logits/rejected": -2.1623330116271973, + "logps/chosen": -370.47039794921875, + "logps/rejected": -391.04534912109375, + "loss": 0.5848, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.0416100025177002, + "rewards/margins": 0.44926396012306213, + "rewards/rejected": -1.49087393283844, + "step": 1670 + }, + { + "epoch": 0.22, + "learning_rate": 4.784699128764654e-06, + "logits/chosen": -2.3140299320220947, + "logits/rejected": -2.226590394973755, + "logps/chosen": -355.25970458984375, + "logps/rejected": -377.73211669921875, + "loss": 0.5963, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.9214965105056763, + "rewards/margins": 0.4505564272403717, + "rewards/rejected": -1.3720529079437256, + "step": 1680 + }, + { + "epoch": 0.22, + "learning_rate": 4.780038008745581e-06, + "logits/chosen": -2.3037924766540527, + "logits/rejected": -2.20536732673645, + "logps/chosen": -400.2337646484375, + "logps/rejected": -388.82733154296875, + "loss": 0.6463, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.9450837969779968, + "rewards/margins": 0.2872735857963562, + "rewards/rejected": -1.2323572635650635, + "step": 1690 + }, + { + "epoch": 0.22, + "learning_rate": 4.775329292848721e-06, + "logits/chosen": -2.296189069747925, + "logits/rejected": -2.2350804805755615, + "logps/chosen": -370.3669738769531, + "logps/rejected": -406.692626953125, + "loss": 0.5417, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.7108799815177917, + "rewards/margins": 0.5038576126098633, + "rewards/rejected": -1.2147375345230103, + "step": 1700 + }, + { + "epoch": 0.22, + "eval_logits/chosen": -1.3614152669906616, + "eval_logits/rejected": -1.3190244436264038, + "eval_logps/chosen": -363.167236328125, + "eval_logps/rejected": -380.348876953125, + "eval_loss": 0.5937426686286926, + "eval_rewards/accuracies": 0.6639999747276306, + "eval_rewards/chosen": -0.7787453532218933, + "eval_rewards/margins": 0.38604700565338135, + "eval_rewards/rejected": -1.1647922992706299, + "eval_runtime": 1173.3172, + "eval_samples_per_second": 1.705, + "eval_steps_per_second": 0.852, + "step": 1700 + }, + { + "epoch": 0.22, + "learning_rate": 4.770573079368691e-06, + "logits/chosen": -2.308875560760498, + "logits/rejected": -2.3057169914245605, + "logps/chosen": -360.26898193359375, + "logps/rejected": -362.5707702636719, + "loss": 0.6155, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.765152633190155, + "rewards/margins": 0.3400669991970062, + "rewards/rejected": -1.1052196025848389, + "step": 1710 + }, + { + "epoch": 0.23, + "learning_rate": 4.765769467591626e-06, + "logits/chosen": -2.417853832244873, + "logits/rejected": -2.3791589736938477, + "logps/chosen": -382.55157470703125, + "logps/rejected": -393.84710693359375, + "loss": 0.5753, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.7473410367965698, + "rewards/margins": 0.3952387273311615, + "rewards/rejected": -1.1425797939300537, + "step": 1720 + }, + { + "epoch": 0.23, + "learning_rate": 4.760918557793096e-06, + "logits/chosen": -2.3585808277130127, + "logits/rejected": -2.375034809112549, + "logps/chosen": -345.61383056640625, + "logps/rejected": -390.66583251953125, + "loss": 0.5851, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7573005557060242, + "rewards/margins": 0.37787926197052, + "rewards/rejected": -1.1351797580718994, + "step": 1730 + }, + { + "epoch": 0.23, + "learning_rate": 4.756020451236025e-06, + "logits/chosen": -2.3663887977600098, + "logits/rejected": -2.2452588081359863, + "logps/chosen": -395.936767578125, + "logps/rejected": -402.03155517578125, + "loss": 0.6093, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.7393473386764526, + "rewards/margins": 0.37452763319015503, + "rewards/rejected": -1.113874912261963, + "step": 1740 + }, + { + "epoch": 0.23, + "learning_rate": 4.751075250168569e-06, + "logits/chosen": -2.3883743286132812, + "logits/rejected": -2.1911792755126953, + "logps/chosen": -386.3553161621094, + "logps/rejected": -410.297119140625, + "loss": 0.5629, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.0602279901504517, + "rewards/margins": 0.5666171908378601, + "rewards/rejected": -1.626845121383667, + "step": 1750 + }, + { + "epoch": 0.23, + "learning_rate": 4.746083057821981e-06, + "logits/chosen": -2.1893460750579834, + "logits/rejected": -2.053621530532837, + "logps/chosen": -363.1799621582031, + "logps/rejected": -383.117431640625, + "loss": 0.5724, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.9518973231315613, + "rewards/margins": 0.5612205266952515, + "rewards/rejected": -1.513117790222168, + "step": 1760 + }, + { + "epoch": 0.23, + "learning_rate": 4.741043978408463e-06, + "logits/chosen": -2.1911866664886475, + "logits/rejected": -2.0612337589263916, + "logps/chosen": -371.76446533203125, + "logps/rejected": -427.1809997558594, + "loss": 0.5255, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.9364175796508789, + "rewards/margins": 0.6334894895553589, + "rewards/rejected": -1.5699070692062378, + "step": 1770 + }, + { + "epoch": 0.23, + "learning_rate": 4.735958117118983e-06, + "logits/chosen": -2.347450017929077, + "logits/rejected": -2.147848129272461, + "logps/chosen": -394.55975341796875, + "logps/rejected": -418.2250061035156, + "loss": 0.5623, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.8392922282218933, + "rewards/margins": 0.5399720072746277, + "rewards/rejected": -1.3792643547058105, + "step": 1780 + }, + { + "epoch": 0.23, + "learning_rate": 4.730825580121084e-06, + "logits/chosen": -2.3108162879943848, + "logits/rejected": -2.1813712120056152, + "logps/chosen": -350.3599853515625, + "logps/rejected": -411.48944091796875, + "loss": 0.545, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.8638598322868347, + "rewards/margins": 0.5645357966423035, + "rewards/rejected": -1.4283956289291382, + "step": 1790 + }, + { + "epoch": 0.24, + "learning_rate": 4.725646474556666e-06, + "logits/chosen": -2.17897367477417, + "logits/rejected": -2.144803285598755, + "logps/chosen": -335.24713134765625, + "logps/rejected": -410.5194396972656, + "loss": 0.5691, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.9080871343612671, + "rewards/margins": 0.6468304395675659, + "rewards/rejected": -1.554917573928833, + "step": 1800 + }, + { + "epoch": 0.24, + "eval_logits/chosen": -0.7890856862068176, + "eval_logits/rejected": -0.7432680726051331, + "eval_logps/chosen": -392.1944885253906, + "eval_logps/rejected": -420.14715576171875, + "eval_loss": 0.5964349508285522, + "eval_rewards/accuracies": 0.6704999804496765, + "eval_rewards/chosen": -1.0690178871154785, + "eval_rewards/margins": 0.49375709891319275, + "eval_rewards/rejected": -1.5627750158309937, + "eval_runtime": 1172.7485, + "eval_samples_per_second": 1.705, + "eval_steps_per_second": 0.853, + "step": 1800 + }, + { + "epoch": 0.24, + "learning_rate": 4.720420908539748e-06, + "logits/chosen": -2.155564785003662, + "logits/rejected": -2.0796492099761963, + "logps/chosen": -371.0989685058594, + "logps/rejected": -411.58270263671875, + "loss": 0.6435, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.069402813911438, + "rewards/margins": 0.37863826751708984, + "rewards/rejected": -1.4480410814285278, + "step": 1810 + }, + { + "epoch": 0.24, + "learning_rate": 4.715148991154216e-06, + "logits/chosen": -2.2921173572540283, + "logits/rejected": -2.2819952964782715, + "logps/chosen": -461.7276916503906, + "logps/rejected": -498.3104553222656, + "loss": 0.6099, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.0293859243392944, + "rewards/margins": 0.44142985343933105, + "rewards/rejected": -1.470815896987915, + "step": 1820 + }, + { + "epoch": 0.24, + "learning_rate": 4.709830832451538e-06, + "logits/chosen": -2.1879446506500244, + "logits/rejected": -2.137216091156006, + "logps/chosen": -426.2359924316406, + "logps/rejected": -461.81439208984375, + "loss": 0.597, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.1083770990371704, + "rewards/margins": 0.5203989148139954, + "rewards/rejected": -1.6287761926651, + "step": 1830 + }, + { + "epoch": 0.24, + "learning_rate": 4.704466543448477e-06, + "logits/chosen": -2.1278553009033203, + "logits/rejected": -1.9929275512695312, + "logps/chosen": -473.5394592285156, + "logps/rejected": -482.0958557128906, + "loss": 0.5253, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.1090824604034424, + "rewards/margins": 0.6798272132873535, + "rewards/rejected": -1.788909673690796, + "step": 1840 + }, + { + "epoch": 0.24, + "learning_rate": 4.699056236124762e-06, + "logits/chosen": -2.1284070014953613, + "logits/rejected": -2.0913150310516357, + "logps/chosen": -384.767333984375, + "logps/rejected": -431.7093811035156, + "loss": 0.592, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0932021141052246, + "rewards/margins": 0.46525582671165466, + "rewards/rejected": -1.5584577322006226, + "step": 1850 + }, + { + "epoch": 0.24, + "learning_rate": 4.693600023420758e-06, + "logits/chosen": -2.176835536956787, + "logits/rejected": -2.0453624725341797, + "logps/chosen": -439.25469970703125, + "logps/rejected": -430.12017822265625, + "loss": 0.5293, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.2489891052246094, + "rewards/margins": 0.7063229084014893, + "rewards/rejected": -1.9553120136260986, + "step": 1860 + }, + { + "epoch": 0.24, + "learning_rate": 4.688098019235108e-06, + "logits/chosen": -2.1578125953674316, + "logits/rejected": -2.0287883281707764, + "logps/chosen": -433.19635009765625, + "logps/rejected": -473.2142028808594, + "loss": 0.5775, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2516090869903564, + "rewards/margins": 0.5367294549942017, + "rewards/rejected": -1.7883384227752686, + "step": 1870 + }, + { + "epoch": 0.25, + "learning_rate": 4.682550338422353e-06, + "logits/chosen": -2.096961498260498, + "logits/rejected": -1.9933475255966187, + "logps/chosen": -422.03643798828125, + "logps/rejected": -432.7709045410156, + "loss": 0.5831, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3931233882904053, + "rewards/margins": 0.4907165467739105, + "rewards/rejected": -1.8838398456573486, + "step": 1880 + }, + { + "epoch": 0.25, + "learning_rate": 4.676957096790536e-06, + "logits/chosen": -2.0234227180480957, + "logits/rejected": -1.9713634252548218, + "logps/chosen": -406.87890625, + "logps/rejected": -406.08074951171875, + "loss": 0.6289, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2355217933654785, + "rewards/margins": 0.37370064854621887, + "rewards/rejected": -1.609222650527954, + "step": 1890 + }, + { + "epoch": 0.25, + "learning_rate": 4.671318411098782e-06, + "logits/chosen": -2.08906626701355, + "logits/rejected": -2.1200289726257324, + "logps/chosen": -437.5709533691406, + "logps/rejected": -503.25091552734375, + "loss": 0.5869, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.3465156555175781, + "rewards/margins": 0.6033599376678467, + "rewards/rejected": -1.9498755931854248, + "step": 1900 + }, + { + "epoch": 0.25, + "eval_logits/chosen": -0.5963050127029419, + "eval_logits/rejected": -0.575740396976471, + "eval_logps/chosen": -427.6318054199219, + "eval_logps/rejected": -450.0478210449219, + "eval_loss": 0.5930874347686768, + "eval_rewards/accuracies": 0.6700000166893005, + "eval_rewards/chosen": -1.4233906269073486, + "eval_rewards/margins": 0.4383908212184906, + "eval_rewards/rejected": -1.8617814779281616, + "eval_runtime": 1173.2085, + "eval_samples_per_second": 1.705, + "eval_steps_per_second": 0.852, + "step": 1900 + }, + { + "epoch": 0.25, + "learning_rate": 4.665634399054864e-06, + "logits/chosen": -2.063363552093506, + "logits/rejected": -2.0153086185455322, + "logps/chosen": -403.9264831542969, + "logps/rejected": -435.4012756347656, + "loss": 0.636, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5465749502182007, + "rewards/margins": 0.37581175565719604, + "rewards/rejected": -1.9223867654800415, + "step": 1910 + }, + { + "epoch": 0.25, + "learning_rate": 4.659905179312743e-06, + "logits/chosen": -2.292100429534912, + "logits/rejected": -2.212569236755371, + "logps/chosen": -456.89447021484375, + "logps/rejected": -429.1956481933594, + "loss": 0.5992, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.3993576765060425, + "rewards/margins": 0.3659273087978363, + "rewards/rejected": -1.7652848958969116, + "step": 1920 + }, + { + "epoch": 0.25, + "learning_rate": 4.654130871470093e-06, + "logits/chosen": -2.2021963596343994, + "logits/rejected": -2.076413869857788, + "logps/chosen": -391.838134765625, + "logps/rejected": -380.90032958984375, + "loss": 0.6364, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.1989585161209106, + "rewards/margins": 0.24625608325004578, + "rewards/rejected": -1.4452146291732788, + "step": 1930 + }, + { + "epoch": 0.25, + "learning_rate": 4.6483115960658045e-06, + "logits/chosen": -2.333247423171997, + "logits/rejected": -2.182128429412842, + "logps/chosen": -402.03070068359375, + "logps/rejected": -364.1950988769531, + "loss": 0.5966, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.1187512874603271, + "rewards/margins": 0.3161240220069885, + "rewards/rejected": -1.434875249862671, + "step": 1940 + }, + { + "epoch": 0.26, + "learning_rate": 4.642447474577466e-06, + "logits/chosen": -2.1036577224731445, + "logits/rejected": -2.1021203994750977, + "logps/chosen": -362.9808349609375, + "logps/rejected": -400.50506591796875, + "loss": 0.5646, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.1177865266799927, + "rewards/margins": 0.47713446617126465, + "rewards/rejected": -1.5949208736419678, + "step": 1950 + }, + { + "epoch": 0.26, + "learning_rate": 4.636538629418832e-06, + "logits/chosen": -2.1588752269744873, + "logits/rejected": -2.1076531410217285, + "logps/chosen": -413.4895935058594, + "logps/rejected": -445.3883361816406, + "loss": 0.5649, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.1384721994400024, + "rewards/margins": 0.5193332433700562, + "rewards/rejected": -1.6578052043914795, + "step": 1960 + }, + { + "epoch": 0.26, + "learning_rate": 4.630585183937263e-06, + "logits/chosen": -2.1655097007751465, + "logits/rejected": -2.0573620796203613, + "logps/chosen": -405.8871765136719, + "logps/rejected": -407.2527770996094, + "loss": 0.6204, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0278955698013306, + "rewards/margins": 0.3120744824409485, + "rewards/rejected": -1.3399698734283447, + "step": 1970 + }, + { + "epoch": 0.26, + "learning_rate": 4.6245872624111535e-06, + "logits/chosen": -2.13643217086792, + "logits/rejected": -2.1128616333007812, + "logps/chosen": -335.54150390625, + "logps/rejected": -354.4369812011719, + "loss": 0.6132, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.9324430227279663, + "rewards/margins": 0.3190438747406006, + "rewards/rejected": -1.2514870166778564, + "step": 1980 + }, + { + "epoch": 0.26, + "learning_rate": 4.618544990047336e-06, + "logits/chosen": -2.114036798477173, + "logits/rejected": -2.04970121383667, + "logps/chosen": -415.504638671875, + "logps/rejected": -439.928955078125, + "loss": 0.6247, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.0088666677474976, + "rewards/margins": 0.38996371626853943, + "rewards/rejected": -1.3988304138183594, + "step": 1990 + }, + { + "epoch": 0.26, + "learning_rate": 4.612458492978473e-06, + "logits/chosen": -2.297130823135376, + "logits/rejected": -2.232883930206299, + "logps/chosen": -365.77850341796875, + "logps/rejected": -401.78826904296875, + "loss": 0.6732, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.0754987001419067, + "rewards/margins": 0.30849093198776245, + "rewards/rejected": -1.383989691734314, + "step": 2000 + }, + { + "epoch": 0.26, + "eval_logits/chosen": -0.959629237651825, + "eval_logits/rejected": -0.892835795879364, + "eval_logps/chosen": -358.4945068359375, + "eval_logps/rejected": -377.0960693359375, + "eval_loss": 0.5928361415863037, + "eval_rewards/accuracies": 0.6765000224113464, + "eval_rewards/chosen": -0.7320181131362915, + "eval_rewards/margins": 0.4002459943294525, + "eval_rewards/rejected": -1.1322641372680664, + "eval_runtime": 1173.4466, + "eval_samples_per_second": 1.704, + "eval_steps_per_second": 0.852, + "step": 2000 + }, + { + "epoch": 0.26, + "learning_rate": 4.606327898260413e-06, + "logits/chosen": -2.1415932178497314, + "logits/rejected": -2.069800615310669, + "logps/chosen": -369.90728759765625, + "logps/rejected": -384.2721862792969, + "loss": 0.6042, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.7192890048027039, + "rewards/margins": 0.40543490648269653, + "rewards/rejected": -1.1247237920761108, + "step": 2010 + }, + { + "epoch": 0.26, + "learning_rate": 4.600153333869549e-06, + "logits/chosen": -2.299431324005127, + "logits/rejected": -2.22072172164917, + "logps/chosen": -354.39117431640625, + "logps/rejected": -355.1102294921875, + "loss": 0.5805, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.6067346930503845, + "rewards/margins": 0.40614938735961914, + "rewards/rejected": -1.0128841400146484, + "step": 2020 + }, + { + "epoch": 0.27, + "learning_rate": 4.593934928700141e-06, + "logits/chosen": -2.2791316509246826, + "logits/rejected": -2.1022727489471436, + "logps/chosen": -375.39031982421875, + "logps/rejected": -386.8539733886719, + "loss": 0.6022, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.8547514081001282, + "rewards/margins": 0.4911293089389801, + "rewards/rejected": -1.3458807468414307, + "step": 2030 + }, + { + "epoch": 0.27, + "learning_rate": 4.587672812561626e-06, + "logits/chosen": -2.088589668273926, + "logits/rejected": -2.056277275085449, + "logps/chosen": -356.75152587890625, + "logps/rejected": -447.05322265625, + "loss": 0.529, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.013441801071167, + "rewards/margins": 0.5791908502578735, + "rewards/rejected": -1.592632532119751, + "step": 2040 + }, + { + "epoch": 0.27, + "learning_rate": 4.581367116175911e-06, + "logits/chosen": -2.0325567722320557, + "logits/rejected": -1.9162721633911133, + "logps/chosen": -429.95745849609375, + "logps/rejected": -434.22979736328125, + "loss": 0.5999, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1933940649032593, + "rewards/margins": 0.44283047318458557, + "rewards/rejected": -1.6362245082855225, + "step": 2050 + }, + { + "epoch": 0.27, + "learning_rate": 4.5750179711746416e-06, + "logits/chosen": -2.023871898651123, + "logits/rejected": -1.9521923065185547, + "logps/chosen": -420.52508544921875, + "logps/rejected": -465.43798828125, + "loss": 0.5524, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5283615589141846, + "rewards/margins": 0.5009020566940308, + "rewards/rejected": -2.029263496398926, + "step": 2060 + }, + { + "epoch": 0.27, + "learning_rate": 4.5686255100964535e-06, + "logits/chosen": -2.0695993900299072, + "logits/rejected": -1.992722749710083, + "logps/chosen": -445.06683349609375, + "logps/rejected": -456.1000061035156, + "loss": 0.5776, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.6490529775619507, + "rewards/margins": 0.45439139008522034, + "rewards/rejected": -2.1034445762634277, + "step": 2070 + }, + { + "epoch": 0.27, + "learning_rate": 4.562189866384209e-06, + "logits/chosen": -1.94949209690094, + "logits/rejected": -1.9335947036743164, + "logps/chosen": -411.20379638671875, + "logps/rejected": -485.4537048339844, + "loss": 0.5876, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.5798475742340088, + "rewards/margins": 0.5429893136024475, + "rewards/rejected": -2.1228365898132324, + "step": 2080 + }, + { + "epoch": 0.27, + "learning_rate": 4.555711174382209e-06, + "logits/chosen": -2.0153870582580566, + "logits/rejected": -1.9438400268554688, + "logps/chosen": -401.353271484375, + "logps/rejected": -415.864501953125, + "loss": 0.6126, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.529844045639038, + "rewards/margins": 0.366629958152771, + "rewards/rejected": -1.8964741230010986, + "step": 2090 + }, + { + "epoch": 0.27, + "learning_rate": 4.549189569333387e-06, + "logits/chosen": -2.0757155418395996, + "logits/rejected": -1.9299837350845337, + "logps/chosen": -366.15838623046875, + "logps/rejected": -385.17474365234375, + "loss": 0.5453, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.1265515089035034, + "rewards/margins": 0.49831241369247437, + "rewards/rejected": -1.6248639822006226, + "step": 2100 + }, + { + "epoch": 0.27, + "eval_logits/chosen": -0.33254069089889526, + "eval_logits/rejected": -0.3057098388671875, + "eval_logps/chosen": -407.44610595703125, + "eval_logps/rejected": -431.1004943847656, + "eval_loss": 0.5811684131622314, + "eval_rewards/accuracies": 0.6769999861717224, + "eval_rewards/chosen": -1.2215332984924316, + "eval_rewards/margins": 0.450775146484375, + "eval_rewards/rejected": -1.6723084449768066, + "eval_runtime": 1173.1346, + "eval_samples_per_second": 1.705, + "eval_steps_per_second": 0.852, + "step": 2100 + }, + { + "epoch": 0.28, + "learning_rate": 4.542625187376491e-06, + "logits/chosen": -2.260960578918457, + "logits/rejected": -2.1369497776031494, + "logps/chosen": -435.5047912597656, + "logps/rejected": -424.6029357910156, + "loss": 0.6178, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.2315149307250977, + "rewards/margins": 0.343548059463501, + "rewards/rejected": -1.5750629901885986, + "step": 2110 + }, + { + "epoch": 0.28, + "learning_rate": 4.536018165543239e-06, + "logits/chosen": -2.270226240158081, + "logits/rejected": -2.1414380073547363, + "logps/chosen": -436.93499755859375, + "logps/rejected": -462.13232421875, + "loss": 0.6148, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.2013145685195923, + "rewards/margins": 0.37624213099479675, + "rewards/rejected": -1.5775566101074219, + "step": 2120 + }, + { + "epoch": 0.28, + "learning_rate": 4.529368641755453e-06, + "logits/chosen": -2.187873125076294, + "logits/rejected": -2.1502745151519775, + "logps/chosen": -373.24639892578125, + "logps/rejected": -407.1181640625, + "loss": 0.6436, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.308226466178894, + "rewards/margins": 0.40959835052490234, + "rewards/rejected": -1.7178246974945068, + "step": 2130 + }, + { + "epoch": 0.28, + "learning_rate": 4.522676754822189e-06, + "logits/chosen": -2.077741861343384, + "logits/rejected": -1.9873619079589844, + "logps/chosen": -417.1329650878906, + "logps/rejected": -381.8856201171875, + "loss": 0.5885, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2676500082015991, + "rewards/margins": 0.38986071944236755, + "rewards/rejected": -1.65751051902771, + "step": 2140 + }, + { + "epoch": 0.28, + "learning_rate": 4.515942644436836e-06, + "logits/chosen": -2.181087017059326, + "logits/rejected": -2.0420756340026855, + "logps/chosen": -428.8570251464844, + "logps/rejected": -457.95330810546875, + "loss": 0.5893, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.3216568231582642, + "rewards/margins": 0.5574394464492798, + "rewards/rejected": -1.879096269607544, + "step": 2150 + }, + { + "epoch": 0.28, + "learning_rate": 4.509166451174194e-06, + "logits/chosen": -2.1448304653167725, + "logits/rejected": -2.118425130844116, + "logps/chosen": -446.98272705078125, + "logps/rejected": -475.9813537597656, + "loss": 0.5674, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.2321898937225342, + "rewards/margins": 0.5356115698814392, + "rewards/rejected": -1.7678015232086182, + "step": 2160 + }, + { + "epoch": 0.28, + "learning_rate": 4.502348316487552e-06, + "logits/chosen": -2.1516335010528564, + "logits/rejected": -1.9779102802276611, + "logps/chosen": -437.4042053222656, + "logps/rejected": -434.9273376464844, + "loss": 0.6678, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.4476085901260376, + "rewards/margins": 0.2950034439563751, + "rewards/rejected": -1.7426121234893799, + "step": 2170 + }, + { + "epoch": 0.29, + "learning_rate": 4.495488382705722e-06, + "logits/chosen": -2.2639286518096924, + "logits/rejected": -2.0572140216827393, + "logps/chosen": -442.319580078125, + "logps/rejected": -407.50311279296875, + "loss": 0.5469, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.8552709817886353, + "rewards/margins": 0.5348329544067383, + "rewards/rejected": -1.390103816986084, + "step": 2180 + }, + { + "epoch": 0.29, + "learning_rate": 4.488586793030075e-06, + "logits/chosen": -2.1855459213256836, + "logits/rejected": -2.064626455307007, + "logps/chosen": -321.4360046386719, + "logps/rejected": -400.38995361328125, + "loss": 0.5179, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.8480280041694641, + "rewards/margins": 0.6384379863739014, + "rewards/rejected": -1.4864661693572998, + "step": 2190 + }, + { + "epoch": 0.29, + "learning_rate": 4.481643691531551e-06, + "logits/chosen": -2.194056272506714, + "logits/rejected": -2.1267337799072266, + "logps/chosen": -382.1920166015625, + "logps/rejected": -397.70758056640625, + "loss": 0.5521, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.9316880106925964, + "rewards/margins": 0.5789372324943542, + "rewards/rejected": -1.5106254816055298, + "step": 2200 + }, + { + "epoch": 0.29, + "eval_logits/chosen": -0.10590755194425583, + "eval_logits/rejected": -0.08353219926357269, + "eval_logps/chosen": -383.8439025878906, + "eval_logps/rejected": -412.94171142578125, + "eval_loss": 0.5773423314094543, + "eval_rewards/accuracies": 0.6775000095367432, + "eval_rewards/chosen": -0.9855120778083801, + "eval_rewards/margins": 0.5052086114883423, + "eval_rewards/rejected": -1.4907207489013672, + "eval_runtime": 1173.8424, + "eval_samples_per_second": 1.704, + "eval_steps_per_second": 0.852, + "step": 2200 + }, + { + "epoch": 0.29, + "learning_rate": 4.474659223147652e-06, + "logits/chosen": -2.1210954189300537, + "logits/rejected": -2.0790343284606934, + "logps/chosen": -388.6036682128906, + "logps/rejected": -412.49755859375, + "loss": 0.6111, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.0008533000946045, + "rewards/margins": 0.5308166742324829, + "rewards/rejected": -1.5316702127456665, + "step": 2210 + }, + { + "epoch": 0.29, + "learning_rate": 4.4676335336794125e-06, + "logits/chosen": -2.1499674320220947, + "logits/rejected": -2.042468786239624, + "logps/chosen": -427.4583435058594, + "logps/rejected": -433.139404296875, + "loss": 0.6276, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.9962937235832214, + "rewards/margins": 0.33339327573776245, + "rewards/rejected": -1.3296869993209839, + "step": 2220 + }, + { + "epoch": 0.29, + "learning_rate": 4.46056676978836e-06, + "logits/chosen": -2.1695902347564697, + "logits/rejected": -2.1840529441833496, + "logps/chosen": -363.1017150878906, + "logps/rejected": -432.8349609375, + "loss": 0.6198, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.8799940347671509, + "rewards/margins": 0.37018856406211853, + "rewards/rejected": -1.2501827478408813, + "step": 2230 + }, + { + "epoch": 0.29, + "learning_rate": 4.453459078993453e-06, + "logits/chosen": -2.105926036834717, + "logits/rejected": -2.1373894214630127, + "logps/chosen": -356.4091796875, + "logps/rejected": -404.40716552734375, + "loss": 0.516, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.8220658302307129, + "rewards/margins": 0.567711353302002, + "rewards/rejected": -1.3897771835327148, + "step": 2240 + }, + { + "epoch": 0.29, + "learning_rate": 4.446310609668001e-06, + "logits/chosen": -2.105323553085327, + "logits/rejected": -2.037275791168213, + "logps/chosen": -361.72833251953125, + "logps/rejected": -442.4640197753906, + "loss": 0.5989, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.1395691633224487, + "rewards/margins": 0.46317243576049805, + "rewards/rejected": -1.6027415990829468, + "step": 2250 + }, + { + "epoch": 0.3, + "learning_rate": 4.439121511036562e-06, + "logits/chosen": -2.156818389892578, + "logits/rejected": -2.057016611099243, + "logps/chosen": -418.86090087890625, + "logps/rejected": -436.59210205078125, + "loss": 0.5761, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.1772918701171875, + "rewards/margins": 0.5623257160186768, + "rewards/rejected": -1.739617943763733, + "step": 2260 + }, + { + "epoch": 0.3, + "learning_rate": 4.431891933171839e-06, + "logits/chosen": -2.0695672035217285, + "logits/rejected": -1.941693663597107, + "logps/chosen": -397.0233154296875, + "logps/rejected": -432.2896423339844, + "loss": 0.6226, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.2106586694717407, + "rewards/margins": 0.47111576795578003, + "rewards/rejected": -1.681774377822876, + "step": 2270 + }, + { + "epoch": 0.3, + "learning_rate": 4.424622026991536e-06, + "logits/chosen": -2.049593925476074, + "logits/rejected": -1.9610483646392822, + "logps/chosen": -415.3938903808594, + "logps/rejected": -431.11236572265625, + "loss": 0.6462, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3287193775177002, + "rewards/margins": 0.3927326202392578, + "rewards/rejected": -1.721451759338379, + "step": 2280 + }, + { + "epoch": 0.3, + "learning_rate": 4.417311944255215e-06, + "logits/chosen": -2.1346962451934814, + "logits/rejected": -2.1600146293640137, + "logps/chosen": -380.8770446777344, + "logps/rejected": -421.3860778808594, + "loss": 0.7034, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.2351090908050537, + "rewards/margins": 0.2262163609266281, + "rewards/rejected": -1.4613254070281982, + "step": 2290 + }, + { + "epoch": 0.3, + "learning_rate": 4.409961837561122e-06, + "logits/chosen": -2.099057912826538, + "logits/rejected": -2.1106457710266113, + "logps/chosen": -447.09356689453125, + "logps/rejected": -506.8944396972656, + "loss": 0.5352, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.1697063446044922, + "rewards/margins": 0.6214213371276855, + "rewards/rejected": -1.7911275625228882, + "step": 2300 + }, + { + "epoch": 0.3, + "eval_logits/chosen": -0.24320349097251892, + "eval_logits/rejected": -0.21170783042907715, + "eval_logps/chosen": -393.0879821777344, + "eval_logps/rejected": -416.6599426269531, + "eval_loss": 0.5820898413658142, + "eval_rewards/accuracies": 0.6884999871253967, + "eval_rewards/chosen": -1.07795250415802, + "eval_rewards/margins": 0.449950248003006, + "eval_rewards/rejected": -1.5279029607772827, + "eval_runtime": 1172.9381, + "eval_samples_per_second": 1.705, + "eval_steps_per_second": 0.853, + "step": 2300 + }, + { + "epoch": 0.3, + "learning_rate": 4.402571860343006e-06, + "logits/chosen": -2.2147767543792725, + "logits/rejected": -2.037105083465576, + "logps/chosen": -402.79998779296875, + "logps/rejected": -373.8688659667969, + "loss": 0.591, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.0390770435333252, + "rewards/margins": 0.3865988552570343, + "rewards/rejected": -1.4256759881973267, + "step": 2310 + }, + { + "epoch": 0.3, + "learning_rate": 4.3951421668669165e-06, + "logits/chosen": -2.2005527019500732, + "logits/rejected": -2.104384183883667, + "logps/chosen": -391.08050537109375, + "logps/rejected": -424.8851623535156, + "loss": 0.5589, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.9733511209487915, + "rewards/margins": 0.4935951828956604, + "rewards/rejected": -1.4669463634490967, + "step": 2320 + }, + { + "epoch": 0.3, + "learning_rate": 4.3876729122279784e-06, + "logits/chosen": -2.1870036125183105, + "logits/rejected": -2.1664469242095947, + "logps/chosen": -309.8889465332031, + "logps/rejected": -373.26312255859375, + "loss": 0.5234, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.9488767385482788, + "rewards/margins": 0.6144660115242004, + "rewards/rejected": -1.563342809677124, + "step": 2330 + }, + { + "epoch": 0.31, + "learning_rate": 4.3801642523471585e-06, + "logits/chosen": -2.2839043140411377, + "logits/rejected": -2.0562546253204346, + "logps/chosen": -403.57757568359375, + "logps/rejected": -419.24639892578125, + "loss": 0.526, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.102151870727539, + "rewards/margins": 0.5933542251586914, + "rewards/rejected": -1.6955060958862305, + "step": 2340 + }, + { + "epoch": 0.31, + "learning_rate": 4.37261634396801e-06, + "logits/chosen": -2.0778768062591553, + "logits/rejected": -1.971123456954956, + "logps/chosen": -422.4281311035156, + "logps/rejected": -454.2814025878906, + "loss": 0.5549, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4297865629196167, + "rewards/margins": 0.5288795232772827, + "rewards/rejected": -1.958666205406189, + "step": 2350 + }, + { + "epoch": 0.31, + "learning_rate": 4.365029344653401e-06, + "logits/chosen": -2.1634223461151123, + "logits/rejected": -2.0877110958099365, + "logps/chosen": -486.320068359375, + "logps/rejected": -487.45159912109375, + "loss": 0.4957, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3157932758331299, + "rewards/margins": 0.8328288197517395, + "rewards/rejected": -2.1486220359802246, + "step": 2360 + }, + { + "epoch": 0.31, + "learning_rate": 4.35740341278222e-06, + "logits/chosen": -2.2238452434539795, + "logits/rejected": -2.188713550567627, + "logps/chosen": -474.1236267089844, + "logps/rejected": -508.4130859375, + "loss": 0.6364, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.3115034103393555, + "rewards/margins": 0.4928676187992096, + "rewards/rejected": -1.8043708801269531, + "step": 2370 + }, + { + "epoch": 0.31, + "learning_rate": 4.349738707546079e-06, + "logits/chosen": -2.0244901180267334, + "logits/rejected": -1.984893798828125, + "logps/chosen": -433.35693359375, + "logps/rejected": -430.234130859375, + "loss": 0.6442, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.4878586530685425, + "rewards/margins": 0.4103633761405945, + "rewards/rejected": -1.8982219696044922, + "step": 2380 + }, + { + "epoch": 0.31, + "learning_rate": 4.3420353889459835e-06, + "logits/chosen": -2.2906548976898193, + "logits/rejected": -2.105910539627075, + "logps/chosen": -462.30682373046875, + "logps/rejected": -467.44293212890625, + "loss": 0.5388, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3121368885040283, + "rewards/margins": 0.6499633193016052, + "rewards/rejected": -1.9621002674102783, + "step": 2390 + }, + { + "epoch": 0.31, + "learning_rate": 4.334293617788992e-06, + "logits/chosen": -2.157517433166504, + "logits/rejected": -1.953137755393982, + "logps/chosen": -398.0602111816406, + "logps/rejected": -423.69854736328125, + "loss": 0.4291, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.269965410232544, + "rewards/margins": 1.029221534729004, + "rewards/rejected": -2.2991867065429688, + "step": 2400 + }, + { + "epoch": 0.31, + "eval_logits/chosen": 0.1741051822900772, + "eval_logits/rejected": 0.18015538156032562, + "eval_logps/chosen": -423.0900573730469, + "eval_logps/rejected": -462.5804748535156, + "eval_loss": 0.5799766778945923, + "eval_rewards/accuracies": 0.6784999966621399, + "eval_rewards/chosen": -1.3779734373092651, + "eval_rewards/margins": 0.6091340780258179, + "eval_rewards/rejected": -1.9871076345443726, + "eval_runtime": 1173.1683, + "eval_samples_per_second": 1.705, + "eval_steps_per_second": 0.852, + "step": 2400 + }, + { + "epoch": 0.32, + "learning_rate": 4.326513555684867e-06, + "logits/chosen": -2.1554830074310303, + "logits/rejected": -1.9456875324249268, + "logps/chosen": -452.01861572265625, + "logps/rejected": -421.59033203125, + "loss": 0.644, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.4219211339950562, + "rewards/margins": 0.4033273160457611, + "rewards/rejected": -1.82524836063385, + "step": 2410 + }, + { + "epoch": 0.32, + "learning_rate": 4.31869536504269e-06, + "logits/chosen": -2.084606647491455, + "logits/rejected": -2.057734966278076, + "logps/chosen": -394.4380187988281, + "logps/rejected": -445.02484130859375, + "loss": 0.5474, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.1819837093353271, + "rewards/margins": 0.6775622367858887, + "rewards/rejected": -1.8595459461212158, + "step": 2420 + }, + { + "epoch": 0.32, + "learning_rate": 4.310839209067482e-06, + "logits/chosen": -2.264970302581787, + "logits/rejected": -2.110339879989624, + "logps/chosen": -405.0823669433594, + "logps/rejected": -430.5166015625, + "loss": 0.5699, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1619272232055664, + "rewards/margins": 0.5353389978408813, + "rewards/rejected": -1.6972662210464478, + "step": 2430 + }, + { + "epoch": 0.32, + "learning_rate": 4.302945251756788e-06, + "logits/chosen": -2.028538227081299, + "logits/rejected": -2.029587507247925, + "logps/chosen": -395.78619384765625, + "logps/rejected": -437.18133544921875, + "loss": 0.5098, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.175262212753296, + "rewards/margins": 0.7358410954475403, + "rewards/rejected": -1.9111032485961914, + "step": 2440 + }, + { + "epoch": 0.32, + "learning_rate": 4.29501365789726e-06, + "logits/chosen": -2.0059101581573486, + "logits/rejected": -1.9005285501480103, + "logps/chosen": -351.9297180175781, + "logps/rejected": -391.07647705078125, + "loss": 0.5735, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.0925757884979248, + "rewards/margins": 0.6294921040534973, + "rewards/rejected": -1.7220680713653564, + "step": 2450 + }, + { + "epoch": 0.32, + "learning_rate": 4.2870445930612135e-06, + "logits/chosen": -2.1147594451904297, + "logits/rejected": -2.009192705154419, + "logps/chosen": -425.51934814453125, + "logps/rejected": -462.2431640625, + "loss": 0.5077, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9345242381095886, + "rewards/margins": 0.8426654934883118, + "rewards/rejected": -1.77718985080719, + "step": 2460 + }, + { + "epoch": 0.32, + "learning_rate": 4.279038223603171e-06, + "logits/chosen": -2.177354097366333, + "logits/rejected": -2.006502866744995, + "logps/chosen": -370.07952880859375, + "logps/rejected": -402.7530822753906, + "loss": 0.5482, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.9241195917129517, + "rewards/margins": 0.6823036670684814, + "rewards/rejected": -1.6064231395721436, + "step": 2470 + }, + { + "epoch": 0.32, + "learning_rate": 4.2709947166563906e-06, + "logits/chosen": -1.9542591571807861, + "logits/rejected": -1.957780122756958, + "logps/chosen": -391.3414611816406, + "logps/rejected": -465.82244873046875, + "loss": 0.5407, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.1169626712799072, + "rewards/margins": 0.6581896543502808, + "rewards/rejected": -1.7751522064208984, + "step": 2480 + }, + { + "epoch": 0.33, + "learning_rate": 4.262914240129379e-06, + "logits/chosen": -2.082587242126465, + "logits/rejected": -1.9763103723526, + "logps/chosen": -411.59649658203125, + "logps/rejected": -450.15313720703125, + "loss": 0.5524, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.0453674793243408, + "rewards/margins": 0.8182929754257202, + "rewards/rejected": -1.8636605739593506, + "step": 2490 + }, + { + "epoch": 0.33, + "learning_rate": 4.254796962702382e-06, + "logits/chosen": -2.1096723079681396, + "logits/rejected": -2.033006191253662, + "logps/chosen": -407.79656982421875, + "logps/rejected": -441.3284606933594, + "loss": 0.5324, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.9739041328430176, + "rewards/margins": 0.6023324728012085, + "rewards/rejected": -1.5762364864349365, + "step": 2500 + }, + { + "epoch": 0.33, + "eval_logits/chosen": 0.07505005598068237, + "eval_logits/rejected": 0.09037268161773682, + "eval_logps/chosen": -388.1980285644531, + "eval_logps/rejected": -422.61712646484375, + "eval_loss": 0.5709013938903809, + "eval_rewards/accuracies": 0.6765000224113464, + "eval_rewards/chosen": -1.0290535688400269, + "eval_rewards/margins": 0.5584208965301514, + "eval_rewards/rejected": -1.5874744653701782, + "eval_runtime": 1173.0849, + "eval_samples_per_second": 1.705, + "eval_steps_per_second": 0.852, + "step": 2500 + }, + { + "epoch": 0.33, + "learning_rate": 4.246643053823864e-06, + "logits/chosen": -2.1814770698547363, + "logits/rejected": -2.039039134979248, + "logps/chosen": -309.3426513671875, + "logps/rejected": -398.8958740234375, + "loss": 0.5441, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8469230532646179, + "rewards/margins": 0.632138729095459, + "rewards/rejected": -1.4790617227554321, + "step": 2510 + }, + { + "epoch": 0.33, + "learning_rate": 4.238452683706979e-06, + "logits/chosen": -2.0768649578094482, + "logits/rejected": -2.069648027420044, + "logps/chosen": -340.1851501464844, + "logps/rejected": -352.81805419921875, + "loss": 0.5393, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.98432856798172, + "rewards/margins": 0.5381032228469849, + "rewards/rejected": -1.52243173122406, + "step": 2520 + }, + { + "epoch": 0.33, + "learning_rate": 4.2302260233260025e-06, + "logits/chosen": -2.135117292404175, + "logits/rejected": -2.150930404663086, + "logps/chosen": -380.9312438964844, + "logps/rejected": -437.70672607421875, + "loss": 0.5252, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.8901141285896301, + "rewards/margins": 0.7313874363899231, + "rewards/rejected": -1.6215015649795532, + "step": 2530 + }, + { + "epoch": 0.33, + "learning_rate": 4.2219632444127766e-06, + "logits/chosen": -1.9879133701324463, + "logits/rejected": -1.93001389503479, + "logps/chosen": -401.8074035644531, + "logps/rejected": -424.042724609375, + "loss": 0.6447, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.190601110458374, + "rewards/margins": 0.33566969633102417, + "rewards/rejected": -1.5262706279754639, + "step": 2540 + }, + { + "epoch": 0.33, + "learning_rate": 4.213664519453115e-06, + "logits/chosen": -2.1339848041534424, + "logits/rejected": -2.0666513442993164, + "logps/chosen": -367.1988525390625, + "logps/rejected": -413.453369140625, + "loss": 0.5879, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.175606608390808, + "rewards/margins": 0.463453471660614, + "rewards/rejected": -1.6390600204467773, + "step": 2550 + }, + { + "epoch": 0.33, + "learning_rate": 4.205330021683208e-06, + "logits/chosen": -1.9215284585952759, + "logits/rejected": -1.9162276983261108, + "logps/chosen": -332.72454833984375, + "logps/rejected": -360.2316589355469, + "loss": 0.627, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.0344467163085938, + "rewards/margins": 0.313593327999115, + "rewards/rejected": -1.348039984703064, + "step": 2560 + }, + { + "epoch": 0.34, + "learning_rate": 4.196959925086008e-06, + "logits/chosen": -2.032893419265747, + "logits/rejected": -2.0280256271362305, + "logps/chosen": -393.7264709472656, + "logps/rejected": -445.7867126464844, + "loss": 0.6322, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.13155198097229, + "rewards/margins": 0.30951356887817383, + "rewards/rejected": -1.4410655498504639, + "step": 2570 + }, + { + "epoch": 0.34, + "learning_rate": 4.188554404387588e-06, + "logits/chosen": -2.1735775470733643, + "logits/rejected": -2.09834623336792, + "logps/chosen": -401.140625, + "logps/rejected": -413.3356018066406, + "loss": 0.5872, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.045619249343872, + "rewards/margins": 0.4095796048641205, + "rewards/rejected": -1.4551990032196045, + "step": 2580 + }, + { + "epoch": 0.34, + "learning_rate": 4.180113635053504e-06, + "logits/chosen": -2.1379377841949463, + "logits/rejected": -2.1078503131866455, + "logps/chosen": -378.58740234375, + "logps/rejected": -440.969482421875, + "loss": 0.5702, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.0751326084136963, + "rewards/margins": 0.5110452175140381, + "rewards/rejected": -1.5861778259277344, + "step": 2590 + }, + { + "epoch": 0.34, + "learning_rate": 4.17163779328513e-06, + "logits/chosen": -2.0438780784606934, + "logits/rejected": -1.9390064477920532, + "logps/chosen": -394.52532958984375, + "logps/rejected": -429.409423828125, + "loss": 0.5659, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.087465763092041, + "rewards/margins": 0.590351939201355, + "rewards/rejected": -1.677817702293396, + "step": 2600 + }, + { + "epoch": 0.34, + "eval_logits/chosen": 0.3240658640861511, + "eval_logits/rejected": 0.3280923366546631, + "eval_logps/chosen": -410.6242980957031, + "eval_logps/rejected": -446.1897888183594, + "eval_loss": 0.5639599561691284, + "eval_rewards/accuracies": 0.6984999775886536, + "eval_rewards/chosen": -1.2533156871795654, + "eval_rewards/margins": 0.5698856115341187, + "eval_rewards/rejected": -1.823201298713684, + "eval_runtime": 1173.6285, + "eval_samples_per_second": 1.704, + "eval_steps_per_second": 0.852, + "step": 2600 + }, + { + "epoch": 0.34, + "learning_rate": 4.163127056015975e-06, + "logits/chosen": -2.0112125873565674, + "logits/rejected": -1.8717600107192993, + "logps/chosen": -421.8485412597656, + "logps/rejected": -459.86590576171875, + "loss": 0.6169, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.2108803987503052, + "rewards/margins": 0.524616539478302, + "rewards/rejected": -1.7354971170425415, + "step": 2610 + }, + { + "epoch": 0.34, + "learning_rate": 4.154581600907994e-06, + "logits/chosen": -2.0410170555114746, + "logits/rejected": -1.8681037425994873, + "logps/chosen": -370.8006286621094, + "logps/rejected": -413.893310546875, + "loss": 0.4689, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.0415531396865845, + "rewards/margins": 0.7762254476547241, + "rewards/rejected": -1.8177785873413086, + "step": 2620 + }, + { + "epoch": 0.34, + "learning_rate": 4.14600160634788e-06, + "logits/chosen": -1.9654664993286133, + "logits/rejected": -1.8224735260009766, + "logps/chosen": -377.9241943359375, + "logps/rejected": -453.4291076660156, + "loss": 0.5093, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.213224172592163, + "rewards/margins": 0.7400587797164917, + "rewards/rejected": -1.9532829523086548, + "step": 2630 + }, + { + "epoch": 0.35, + "learning_rate": 4.137387251443335e-06, + "logits/chosen": -2.0160083770751953, + "logits/rejected": -1.8280494213104248, + "logps/chosen": -392.6662902832031, + "logps/rejected": -413.31341552734375, + "loss": 0.5341, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.2335208654403687, + "rewards/margins": 0.6282380223274231, + "rewards/rejected": -1.8617589473724365, + "step": 2640 + }, + { + "epoch": 0.35, + "learning_rate": 4.128738716019338e-06, + "logits/chosen": -1.9816757440567017, + "logits/rejected": -1.965356469154358, + "logps/chosen": -434.535400390625, + "logps/rejected": -474.57421875, + "loss": 0.5544, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2539236545562744, + "rewards/margins": 0.6901840567588806, + "rewards/rejected": -1.9441077709197998, + "step": 2650 + }, + { + "epoch": 0.35, + "learning_rate": 4.120056180614386e-06, + "logits/chosen": -1.8631175756454468, + "logits/rejected": -1.7722257375717163, + "logps/chosen": -396.16748046875, + "logps/rejected": -475.76214599609375, + "loss": 0.5704, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.3676929473876953, + "rewards/margins": 0.6798584461212158, + "rewards/rejected": -2.047551393508911, + "step": 2660 + }, + { + "epoch": 0.35, + "learning_rate": 4.111339826476725e-06, + "logits/chosen": -1.8956613540649414, + "logits/rejected": -1.8707530498504639, + "logps/chosen": -393.42095947265625, + "logps/rejected": -448.44744873046875, + "loss": 0.6406, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4871103763580322, + "rewards/margins": 0.5370699167251587, + "rewards/rejected": -2.0241801738739014, + "step": 2670 + }, + { + "epoch": 0.35, + "learning_rate": 4.102589835560572e-06, + "logits/chosen": -1.9932838678359985, + "logits/rejected": -1.8022167682647705, + "logps/chosen": -469.27490234375, + "logps/rejected": -475.10028076171875, + "loss": 0.5753, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4100916385650635, + "rewards/margins": 0.57780522108078, + "rewards/rejected": -1.9878966808319092, + "step": 2680 + }, + { + "epoch": 0.35, + "learning_rate": 4.09380639052231e-06, + "logits/chosen": -1.9947681427001953, + "logits/rejected": -1.9494025707244873, + "logps/chosen": -446.71923828125, + "logps/rejected": -542.5546264648438, + "loss": 0.511, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.4790523052215576, + "rewards/margins": 0.7871916890144348, + "rewards/rejected": -2.2662441730499268, + "step": 2690 + }, + { + "epoch": 0.35, + "learning_rate": 4.084989674716679e-06, + "logits/chosen": -1.9378328323364258, + "logits/rejected": -1.7882620096206665, + "logps/chosen": -463.34368896484375, + "logps/rejected": -528.638427734375, + "loss": 0.5041, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.6602880954742432, + "rewards/margins": 0.7532935738563538, + "rewards/rejected": -2.413581371307373, + "step": 2700 + }, + { + "epoch": 0.35, + "eval_logits/chosen": 0.5924010872840881, + "eval_logits/rejected": 0.5911453366279602, + "eval_logps/chosen": -459.9809875488281, + "eval_logps/rejected": -503.08282470703125, + "eval_loss": 0.5736746788024902, + "eval_rewards/accuracies": 0.6865000128746033, + "eval_rewards/chosen": -1.746882677078247, + "eval_rewards/margins": 0.6452487707138062, + "eval_rewards/rejected": -2.3921313285827637, + "eval_runtime": 1173.2673, + "eval_samples_per_second": 1.705, + "eval_steps_per_second": 0.852, + "step": 2700 + }, + { + "epoch": 0.35, + "learning_rate": 4.076139872192949e-06, + "logits/chosen": -2.04536771774292, + "logits/rejected": -1.9077255725860596, + "logps/chosen": -490.4608459472656, + "logps/rejected": -500.9752502441406, + "loss": 0.5828, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.7122223377227783, + "rewards/margins": 0.5663967132568359, + "rewards/rejected": -2.278618812561035, + "step": 2710 + }, + { + "epoch": 0.36, + "learning_rate": 4.067257167691074e-06, + "logits/chosen": -1.8697471618652344, + "logits/rejected": -1.8803532123565674, + "logps/chosen": -453.03570556640625, + "logps/rejected": -496.3179626464844, + "loss": 0.5782, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.366571307182312, + "rewards/margins": 0.6388453245162964, + "rewards/rejected": -2.0054163932800293, + "step": 2720 + }, + { + "epoch": 0.36, + "learning_rate": 4.05834174663784e-06, + "logits/chosen": -2.091736078262329, + "logits/rejected": -2.0846893787384033, + "logps/chosen": -400.5804138183594, + "logps/rejected": -394.76104736328125, + "loss": 0.6245, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.1139838695526123, + "rewards/margins": 0.3597862124443054, + "rewards/rejected": -1.4737701416015625, + "step": 2730 + }, + { + "epoch": 0.36, + "learning_rate": 4.0493937951429895e-06, + "logits/chosen": -2.146397829055786, + "logits/rejected": -2.0857930183410645, + "logps/chosen": -382.061279296875, + "logps/rejected": -389.96307373046875, + "loss": 0.5693, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.002920150756836, + "rewards/margins": 0.4762960374355316, + "rewards/rejected": -1.4792163372039795, + "step": 2740 + }, + { + "epoch": 0.36, + "learning_rate": 4.040413499995343e-06, + "logits/chosen": -2.082854747772217, + "logits/rejected": -1.9924688339233398, + "logps/chosen": -414.23095703125, + "logps/rejected": -456.17071533203125, + "loss": 0.5656, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.032799243927002, + "rewards/margins": 0.5492347478866577, + "rewards/rejected": -1.5820338726043701, + "step": 2750 + }, + { + "epoch": 0.36, + "learning_rate": 4.031401048658892e-06, + "logits/chosen": -2.005931854248047, + "logits/rejected": -1.919759750366211, + "logps/chosen": -386.944580078125, + "logps/rejected": -435.21051025390625, + "loss": 0.5126, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9739694595336914, + "rewards/margins": 0.7307363748550415, + "rewards/rejected": -1.704705834388733, + "step": 2760 + }, + { + "epoch": 0.36, + "learning_rate": 4.022356629268894e-06, + "logits/chosen": -2.0577034950256348, + "logits/rejected": -1.8897212743759155, + "logps/chosen": -429.05938720703125, + "logps/rejected": -432.6758728027344, + "loss": 0.6329, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.333314299583435, + "rewards/margins": 0.4279370903968811, + "rewards/rejected": -1.761251449584961, + "step": 2770 + }, + { + "epoch": 0.36, + "learning_rate": 4.013280430627936e-06, + "logits/chosen": -1.9496095180511475, + "logits/rejected": -1.8696807622909546, + "logps/chosen": -382.43658447265625, + "logps/rejected": -411.63836669921875, + "loss": 0.5828, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.233246922492981, + "rewards/margins": 0.4972565174102783, + "rewards/rejected": -1.7305036783218384, + "step": 2780 + }, + { + "epoch": 0.37, + "learning_rate": 4.004172642202002e-06, + "logits/chosen": -1.9676673412322998, + "logits/rejected": -1.7953529357910156, + "logps/chosen": -404.1937561035156, + "logps/rejected": -451.77349853515625, + "loss": 0.5252, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.4179786443710327, + "rewards/margins": 0.7819381952285767, + "rewards/rejected": -2.1999170780181885, + "step": 2790 + }, + { + "epoch": 0.37, + "learning_rate": 3.995033454116512e-06, + "logits/chosen": -2.0970406532287598, + "logits/rejected": -1.9235862493515015, + "logps/chosen": -460.91064453125, + "logps/rejected": -476.86199951171875, + "loss": 0.5754, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.4746737480163574, + "rewards/margins": 0.4974168837070465, + "rewards/rejected": -1.972090721130371, + "step": 2800 + }, + { + "epoch": 0.37, + "eval_logits/chosen": 0.6612156629562378, + "eval_logits/rejected": 0.6424288153648376, + "eval_logps/chosen": -449.1170654296875, + "eval_logps/rejected": -486.8488464355469, + "eval_loss": 0.5716487169265747, + "eval_rewards/accuracies": 0.6884999871253967, + "eval_rewards/chosen": -1.638243317604065, + "eval_rewards/margins": 0.5915481448173523, + "eval_rewards/rejected": -2.2297914028167725, + "eval_runtime": 1173.0043, + "eval_samples_per_second": 1.705, + "eval_steps_per_second": 0.853, + "step": 2800 + }, + { + "epoch": 0.37, + "learning_rate": 3.985863057152355e-06, + "logits/chosen": -1.9077107906341553, + "logits/rejected": -1.9370073080062866, + "logps/chosen": -474.05859375, + "logps/rejected": -511.4286193847656, + "loss": 0.5503, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5730762481689453, + "rewards/margins": 0.6731056571006775, + "rewards/rejected": -2.2461819648742676, + "step": 2810 + }, + { + "epoch": 0.37, + "learning_rate": 3.976661642741908e-06, + "logits/chosen": -1.743070363998413, + "logits/rejected": -1.790679931640625, + "logps/chosen": -475.90606689453125, + "logps/rejected": -556.7145385742188, + "loss": 0.5288, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.9476301670074463, + "rewards/margins": 0.7543555498123169, + "rewards/rejected": -2.7019858360290527, + "step": 2820 + }, + { + "epoch": 0.37, + "learning_rate": 3.967429402965035e-06, + "logits/chosen": -1.6348196268081665, + "logits/rejected": -1.5773961544036865, + "logps/chosen": -544.2933349609375, + "logps/rejected": -595.9285888671875, + "loss": 0.5756, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.4446425437927246, + "rewards/margins": 0.5722433924674988, + "rewards/rejected": -3.016885995864868, + "step": 2830 + }, + { + "epoch": 0.37, + "learning_rate": 3.958166530545085e-06, + "logits/chosen": -1.7844722270965576, + "logits/rejected": -1.717272162437439, + "logps/chosen": -540.5704956054688, + "logps/rejected": -611.501953125, + "loss": 0.5377, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.6575241088867188, + "rewards/margins": 0.7096670866012573, + "rewards/rejected": -3.3671913146972656, + "step": 2840 + }, + { + "epoch": 0.37, + "learning_rate": 3.948873218844863e-06, + "logits/chosen": -1.512880563735962, + "logits/rejected": -1.4856570959091187, + "logps/chosen": -487.31597900390625, + "logps/rejected": -574.8258666992188, + "loss": 0.6078, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.586638927459717, + "rewards/margins": 0.6180590391159058, + "rewards/rejected": -3.204697370529175, + "step": 2850 + }, + { + "epoch": 0.37, + "learning_rate": 3.939549661862592e-06, + "logits/chosen": -1.6444950103759766, + "logits/rejected": -1.5157592296600342, + "logps/chosen": -531.1907958984375, + "logps/rejected": -595.8784790039062, + "loss": 0.5354, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.4481587409973145, + "rewards/margins": 0.8699649572372437, + "rewards/rejected": -3.3181240558624268, + "step": 2860 + }, + { + "epoch": 0.38, + "learning_rate": 3.930196054227871e-06, + "logits/chosen": -1.6284847259521484, + "logits/rejected": -1.427422285079956, + "logps/chosen": -496.8154296875, + "logps/rejected": -556.5086059570312, + "loss": 0.5569, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.3700180053710938, + "rewards/margins": 0.7457307577133179, + "rewards/rejected": -3.115748643875122, + "step": 2870 + }, + { + "epoch": 0.38, + "learning_rate": 3.920812591197604e-06, + "logits/chosen": -1.7529376745224, + "logits/rejected": -1.6493419408798218, + "logps/chosen": -475.435791015625, + "logps/rejected": -499.7490234375, + "loss": 0.5515, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.9660146236419678, + "rewards/margins": 0.6012964248657227, + "rewards/rejected": -2.5673108100891113, + "step": 2880 + }, + { + "epoch": 0.38, + "learning_rate": 3.9113994686519305e-06, + "logits/chosen": -1.8608381748199463, + "logits/rejected": -1.6812989711761475, + "logps/chosen": -464.67657470703125, + "logps/rejected": -529.2399291992188, + "loss": 0.5276, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.7906898260116577, + "rewards/margins": 0.6778753995895386, + "rewards/rejected": -2.4685654640197754, + "step": 2890 + }, + { + "epoch": 0.38, + "learning_rate": 3.90195688309013e-06, + "logits/chosen": -1.8268420696258545, + "logits/rejected": -1.641605019569397, + "logps/chosen": -436.00921630859375, + "logps/rejected": -460.01434326171875, + "loss": 0.6073, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6543318033218384, + "rewards/margins": 0.5443650484085083, + "rewards/rejected": -2.1986968517303467, + "step": 2900 + }, + { + "epoch": 0.38, + "eval_logits/chosen": 0.6979252099990845, + "eval_logits/rejected": 0.7017303705215454, + "eval_logps/chosen": -440.4114685058594, + "eval_logps/rejected": -485.1723937988281, + "eval_loss": 0.5730963349342346, + "eval_rewards/accuracies": 0.6815000176429749, + "eval_rewards/chosen": -1.5511873960494995, + "eval_rewards/margins": 0.6618397831916809, + "eval_rewards/rejected": -2.213027000427246, + "eval_runtime": 1173.0012, + "eval_samples_per_second": 1.705, + "eval_steps_per_second": 0.853, + "step": 2900 + }, + { + "epoch": 0.38, + "learning_rate": 3.892485031626527e-06, + "logits/chosen": -1.8934433460235596, + "logits/rejected": -1.7927643060684204, + "logps/chosen": -420.6212463378906, + "logps/rejected": -468.4596252441406, + "loss": 0.5783, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.44437575340271, + "rewards/margins": 0.6150509119033813, + "rewards/rejected": -2.059426784515381, + "step": 2910 + }, + { + "epoch": 0.38, + "learning_rate": 3.882984111986371e-06, + "logits/chosen": -1.9982595443725586, + "logits/rejected": -1.9308195114135742, + "logps/chosen": -424.13134765625, + "logps/rejected": -436.39080810546875, + "loss": 0.5856, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.303920030593872, + "rewards/margins": 0.5523670315742493, + "rewards/rejected": -1.8562867641448975, + "step": 2920 + }, + { + "epoch": 0.38, + "learning_rate": 3.873454322501711e-06, + "logits/chosen": -2.102571964263916, + "logits/rejected": -1.9521716833114624, + "logps/chosen": -388.68817138671875, + "logps/rejected": -427.4130859375, + "loss": 0.56, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.9647110104560852, + "rewards/margins": 0.6733750104904175, + "rewards/rejected": -1.6380860805511475, + "step": 2930 + }, + { + "epoch": 0.38, + "learning_rate": 3.863895862107255e-06, + "logits/chosen": -2.160048484802246, + "logits/rejected": -1.9776769876480103, + "logps/chosen": -365.2091369628906, + "logps/rejected": -466.8072814941406, + "loss": 0.4624, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.8954636454582214, + "rewards/margins": 0.8491466641426086, + "rewards/rejected": -1.7446101903915405, + "step": 2940 + }, + { + "epoch": 0.39, + "learning_rate": 3.854308930336216e-06, + "logits/chosen": -2.07346510887146, + "logits/rejected": -1.9108175039291382, + "logps/chosen": -432.8681640625, + "logps/rejected": -447.23291015625, + "loss": 0.5574, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.072650671005249, + "rewards/margins": 0.6242086291313171, + "rewards/rejected": -1.6968591213226318, + "step": 2950 + }, + { + "epoch": 0.39, + "learning_rate": 3.844693727316151e-06, + "logits/chosen": -2.0675299167633057, + "logits/rejected": -1.8454557657241821, + "logps/chosen": -419.49755859375, + "logps/rejected": -450.098876953125, + "loss": 0.5041, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.2237988710403442, + "rewards/margins": 0.754100501537323, + "rewards/rejected": -1.9778993129730225, + "step": 2960 + }, + { + "epoch": 0.39, + "learning_rate": 3.835050453764779e-06, + "logits/chosen": -1.8740476369857788, + "logits/rejected": -1.7978649139404297, + "logps/chosen": -379.86651611328125, + "logps/rejected": -450.74761962890625, + "loss": 0.505, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1377508640289307, + "rewards/margins": 0.9418436288833618, + "rewards/rejected": -2.079594373703003, + "step": 2970 + }, + { + "epoch": 0.39, + "learning_rate": 3.825379310985792e-06, + "logits/chosen": -1.916658639907837, + "logits/rejected": -1.807265281677246, + "logps/chosen": -402.7890625, + "logps/rejected": -454.7411193847656, + "loss": 0.5985, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.3055946826934814, + "rewards/margins": 0.5781377553939819, + "rewards/rejected": -1.8837321996688843, + "step": 2980 + }, + { + "epoch": 0.39, + "learning_rate": 3.815680500864651e-06, + "logits/chosen": -2.0287046432495117, + "logits/rejected": -1.9784412384033203, + "logps/chosen": -419.54022216796875, + "logps/rejected": -433.28021240234375, + "loss": 0.5507, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.0421308279037476, + "rewards/margins": 0.5592368245124817, + "rewards/rejected": -1.6013675928115845, + "step": 2990 + }, + { + "epoch": 0.39, + "learning_rate": 3.80595422586438e-06, + "logits/chosen": -1.9907076358795166, + "logits/rejected": -1.922621488571167, + "logps/chosen": -466.78118896484375, + "logps/rejected": -450.73126220703125, + "loss": 0.6283, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2883647680282593, + "rewards/margins": 0.5715576410293579, + "rewards/rejected": -1.8599224090576172, + "step": 3000 + }, + { + "epoch": 0.39, + "eval_logits/chosen": 0.5950943827629089, + "eval_logits/rejected": 0.6221369504928589, + "eval_logps/chosen": -416.3377685546875, + "eval_logps/rejected": -463.23724365234375, + "eval_loss": 0.5645180940628052, + "eval_rewards/accuracies": 0.6859999895095825, + "eval_rewards/chosen": -1.3104503154754639, + "eval_rewards/margins": 0.6832253932952881, + "eval_rewards/rejected": -1.9936758279800415, + "eval_runtime": 1173.5431, + "eval_samples_per_second": 1.704, + "eval_steps_per_second": 0.852, + "step": 3000 + }, + { + "epoch": 0.39, + "learning_rate": 3.7962006890213266e-06, + "logits/chosen": -1.7931913137435913, + "logits/rejected": -1.710323691368103, + "logps/chosen": -388.6459045410156, + "logps/rejected": -430.0355529785156, + "loss": 0.6373, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.4123424291610718, + "rewards/margins": 0.43220600485801697, + "rewards/rejected": -1.8445484638214111, + "step": 3010 + }, + { + "epoch": 0.4, + "learning_rate": 3.7864200939409336e-06, + "logits/chosen": -2.0017385482788086, + "logits/rejected": -1.7524493932724, + "logps/chosen": -409.5406799316406, + "logps/rejected": -428.1637268066406, + "loss": 0.6217, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.2341853380203247, + "rewards/margins": 0.4431460499763489, + "rewards/rejected": -1.6773313283920288, + "step": 3020 + }, + { + "epoch": 0.4, + "learning_rate": 3.7766126447934857e-06, + "logits/chosen": -2.037930965423584, + "logits/rejected": -1.996689796447754, + "logps/chosen": -369.6080627441406, + "logps/rejected": -401.9655456542969, + "loss": 0.5999, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.0221846103668213, + "rewards/margins": 0.5410764813423157, + "rewards/rejected": -1.5632610321044922, + "step": 3030 + }, + { + "epoch": 0.4, + "learning_rate": 3.766778546309847e-06, + "logits/chosen": -2.0481367111206055, + "logits/rejected": -1.9539272785186768, + "logps/chosen": -416.8006896972656, + "logps/rejected": -385.1816711425781, + "loss": 0.5683, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.9949377179145813, + "rewards/margins": 0.5998483896255493, + "rewards/rejected": -1.5947860479354858, + "step": 3040 + }, + { + "epoch": 0.4, + "learning_rate": 3.7569180037771868e-06, + "logits/chosen": -2.083566188812256, + "logits/rejected": -2.0596466064453125, + "logps/chosen": -388.3544006347656, + "logps/rejected": -429.990478515625, + "loss": 0.6271, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.1148223876953125, + "rewards/margins": 0.4753757417201996, + "rewards/rejected": -1.5901981592178345, + "step": 3050 + }, + { + "epoch": 0.4, + "learning_rate": 3.7470312230346955e-06, + "logits/chosen": -1.9745086431503296, + "logits/rejected": -1.7619606256484985, + "logps/chosen": -418.34307861328125, + "logps/rejected": -419.50396728515625, + "loss": 0.5168, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8757292628288269, + "rewards/margins": 0.6808592081069946, + "rewards/rejected": -1.5565884113311768, + "step": 3060 + }, + { + "epoch": 0.4, + "learning_rate": 3.7371184104692857e-06, + "logits/chosen": -2.1536831855773926, + "logits/rejected": -2.0850067138671875, + "logps/chosen": -453.22320556640625, + "logps/rejected": -438.8262634277344, + "loss": 0.5683, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.0526719093322754, + "rewards/margins": 0.5703314542770386, + "rewards/rejected": -1.623003363609314, + "step": 3070 + }, + { + "epoch": 0.4, + "learning_rate": 3.727179773011289e-06, + "logits/chosen": -1.9461936950683594, + "logits/rejected": -1.921303391456604, + "logps/chosen": -414.42388916015625, + "logps/rejected": -433.32769775390625, + "loss": 0.6566, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.2511370182037354, + "rewards/margins": 0.3483005166053772, + "rewards/rejected": -1.5994374752044678, + "step": 3080 + }, + { + "epoch": 0.4, + "learning_rate": 3.717215518130127e-06, + "logits/chosen": -1.8317668437957764, + "logits/rejected": -1.7200689315795898, + "logps/chosen": -411.60223388671875, + "logps/rejected": -431.7054138183594, + "loss": 0.6871, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4211969375610352, + "rewards/margins": 0.3373275399208069, + "rewards/rejected": -1.7585245370864868, + "step": 3090 + }, + { + "epoch": 0.41, + "learning_rate": 3.7072258538299923e-06, + "logits/chosen": -2.1221868991851807, + "logits/rejected": -1.958105444908142, + "logps/chosen": -490.62860107421875, + "logps/rejected": -451.4012756347656, + "loss": 0.5199, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.2549514770507812, + "rewards/margins": 0.6226548552513123, + "rewards/rejected": -1.8776063919067383, + "step": 3100 + }, + { + "epoch": 0.41, + "eval_logits/chosen": 0.4091779887676239, + "eval_logits/rejected": 0.4404120445251465, + "eval_logps/chosen": -401.47412109375, + "eval_logps/rejected": -437.728271484375, + "eval_loss": 0.5584598183631897, + "eval_rewards/accuracies": 0.6940000057220459, + "eval_rewards/chosen": -1.1618139743804932, + "eval_rewards/margins": 0.5767720341682434, + "eval_rewards/rejected": -1.7385860681533813, + "eval_runtime": 1173.482, + "eval_samples_per_second": 1.704, + "eval_steps_per_second": 0.852, + "step": 3100 + }, + { + "epoch": 0.41, + "learning_rate": 3.6972109886454933e-06, + "logits/chosen": -1.927145004272461, + "logits/rejected": -1.981048345565796, + "logps/chosen": -397.15765380859375, + "logps/rejected": -427.95953369140625, + "loss": 0.5795, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.2384923696517944, + "rewards/margins": 0.5860485434532166, + "rewards/rejected": -1.8245410919189453, + "step": 3110 + }, + { + "epoch": 0.41, + "learning_rate": 3.687171131637314e-06, + "logits/chosen": -1.850130319595337, + "logits/rejected": -1.8024015426635742, + "logps/chosen": -407.8834533691406, + "logps/rejected": -428.4339294433594, + "loss": 0.5739, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1514432430267334, + "rewards/margins": 0.4715694785118103, + "rewards/rejected": -1.6230125427246094, + "step": 3120 + }, + { + "epoch": 0.41, + "learning_rate": 3.677106492387839e-06, + "logits/chosen": -2.0786144733428955, + "logits/rejected": -1.9395787715911865, + "logps/chosen": -412.42626953125, + "logps/rejected": -394.81634521484375, + "loss": 0.6361, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.1597317457199097, + "rewards/margins": 0.430908739566803, + "rewards/rejected": -1.5906405448913574, + "step": 3130 + }, + { + "epoch": 0.41, + "learning_rate": 3.6670172809967865e-06, + "logits/chosen": -1.8933837413787842, + "logits/rejected": -1.7607101202011108, + "logps/chosen": -350.9492492675781, + "logps/rejected": -383.0877380371094, + "loss": 0.5527, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.2105662822723389, + "rewards/margins": 0.5227794051170349, + "rewards/rejected": -1.733345627784729, + "step": 3140 + }, + { + "epoch": 0.41, + "learning_rate": 3.6569037080768153e-06, + "logits/chosen": -2.1721205711364746, + "logits/rejected": -2.0236735343933105, + "logps/chosen": -381.0616760253906, + "logps/rejected": -455.51458740234375, + "loss": 0.5521, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2160230875015259, + "rewards/margins": 0.643632709980011, + "rewards/rejected": -1.859655737876892, + "step": 3150 + }, + { + "epoch": 0.41, + "learning_rate": 3.646765984749137e-06, + "logits/chosen": -2.021472692489624, + "logits/rejected": -2.0420773029327393, + "logps/chosen": -410.3907165527344, + "logps/rejected": -466.248779296875, + "loss": 0.5928, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.247714638710022, + "rewards/margins": 0.5578851699829102, + "rewards/rejected": -1.8055998086929321, + "step": 3160 + }, + { + "epoch": 0.41, + "learning_rate": 3.6366043226391e-06, + "logits/chosen": -1.936244010925293, + "logits/rejected": -1.8434072732925415, + "logps/chosen": -406.29278564453125, + "logps/rejected": -426.49432373046875, + "loss": 0.5065, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1206307411193848, + "rewards/margins": 0.6835092306137085, + "rewards/rejected": -1.8041400909423828, + "step": 3170 + }, + { + "epoch": 0.42, + "learning_rate": 3.6264189338717766e-06, + "logits/chosen": -2.2497780323028564, + "logits/rejected": -2.100496292114258, + "logps/chosen": -429.8587341308594, + "logps/rejected": -444.8746032714844, + "loss": 0.57, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.3635942935943604, + "rewards/margins": 0.5040538907051086, + "rewards/rejected": -1.8676483631134033, + "step": 3180 + }, + { + "epoch": 0.42, + "learning_rate": 3.6162100310675334e-06, + "logits/chosen": -2.049527406692505, + "logits/rejected": -2.0114481449127197, + "logps/chosen": -408.66357421875, + "logps/rejected": -434.10565185546875, + "loss": 0.6632, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2964597940444946, + "rewards/margins": 0.35519662499427795, + "rewards/rejected": -1.6516563892364502, + "step": 3190 + }, + { + "epoch": 0.42, + "learning_rate": 3.605977827337596e-06, + "logits/chosen": -1.9362224340438843, + "logits/rejected": -1.8631280660629272, + "logps/chosen": -393.43902587890625, + "logps/rejected": -437.18450927734375, + "loss": 0.5658, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.1848984956741333, + "rewards/margins": 0.6274099349975586, + "rewards/rejected": -1.8123083114624023, + "step": 3200 + }, + { + "epoch": 0.42, + "eval_logits/chosen": 0.25350603461265564, + "eval_logits/rejected": 0.30752307176589966, + "eval_logps/chosen": -404.454833984375, + "eval_logps/rejected": -440.9099426269531, + "eval_loss": 0.5603488087654114, + "eval_rewards/accuracies": 0.6959999799728394, + "eval_rewards/chosen": -1.19162118434906, + "eval_rewards/margins": 0.5787816047668457, + "eval_rewards/rejected": -1.7704027891159058, + "eval_runtime": 1173.3181, + "eval_samples_per_second": 1.705, + "eval_steps_per_second": 0.852, + "step": 3200 + }, + { + "epoch": 0.42, + "learning_rate": 3.595722536279595e-06, + "logits/chosen": -2.178765296936035, + "logits/rejected": -1.8713560104370117, + "logps/chosen": -452.17724609375, + "logps/rejected": -454.56707763671875, + "loss": 0.4681, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.030299425125122, + "rewards/margins": 0.7594797611236572, + "rewards/rejected": -1.7897790670394897, + "step": 3210 + }, + { + "epoch": 0.42, + "learning_rate": 3.58544437197311e-06, + "logits/chosen": -1.9322439432144165, + "logits/rejected": -1.8333985805511475, + "logps/chosen": -403.7508544921875, + "logps/rejected": -441.99371337890625, + "loss": 0.5456, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.078952670097351, + "rewards/margins": 0.6580331921577454, + "rewards/rejected": -1.7369858026504517, + "step": 3220 + }, + { + "epoch": 0.42, + "learning_rate": 3.5751435489752025e-06, + "logits/chosen": -1.907280683517456, + "logits/rejected": -1.8821004629135132, + "logps/chosen": -380.5797119140625, + "logps/rejected": -421.6756286621094, + "loss": 0.5067, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.110288381576538, + "rewards/margins": 0.7093003988265991, + "rewards/rejected": -1.8195888996124268, + "step": 3230 + }, + { + "epoch": 0.42, + "learning_rate": 3.5648202823159317e-06, + "logits/chosen": -1.8815323114395142, + "logits/rejected": -1.8551346063613892, + "logps/chosen": -372.4736022949219, + "logps/rejected": -472.7781677246094, + "loss": 0.5164, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1829547882080078, + "rewards/margins": 0.7592514753341675, + "rewards/rejected": -1.9422063827514648, + "step": 3240 + }, + { + "epoch": 0.43, + "learning_rate": 3.554474787493873e-06, + "logits/chosen": -1.8966144323349, + "logits/rejected": -1.6929298639297485, + "logps/chosen": -440.6084899902344, + "logps/rejected": -500.8756408691406, + "loss": 0.5411, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.260031819343567, + "rewards/margins": 0.7993737459182739, + "rewards/rejected": -2.059405565261841, + "step": 3250 + }, + { + "epoch": 0.43, + "learning_rate": 3.5441072804716125e-06, + "logits/chosen": -1.9628206491470337, + "logits/rejected": -1.8756574392318726, + "logps/chosen": -446.837158203125, + "logps/rejected": -532.57568359375, + "loss": 0.5969, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.32576584815979, + "rewards/margins": 0.7321029901504517, + "rewards/rejected": -2.0578689575195312, + "step": 3260 + }, + { + "epoch": 0.43, + "learning_rate": 3.5337179776712427e-06, + "logits/chosen": -1.8174540996551514, + "logits/rejected": -1.7179148197174072, + "logps/chosen": -422.1940002441406, + "logps/rejected": -487.786865234375, + "loss": 0.575, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.4892836809158325, + "rewards/margins": 0.698641300201416, + "rewards/rejected": -2.187924861907959, + "step": 3270 + }, + { + "epoch": 0.43, + "learning_rate": 3.5233070959698445e-06, + "logits/chosen": -1.973497748374939, + "logits/rejected": -1.8461055755615234, + "logps/chosen": -470.6715393066406, + "logps/rejected": -477.07208251953125, + "loss": 0.6291, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.5432910919189453, + "rewards/margins": 0.46076974272727966, + "rewards/rejected": -2.004060745239258, + "step": 3280 + }, + { + "epoch": 0.43, + "learning_rate": 3.512874852694959e-06, + "logits/chosen": -1.8944342136383057, + "logits/rejected": -1.7159817218780518, + "logps/chosen": -417.69781494140625, + "logps/rejected": -458.3694763183594, + "loss": 0.5366, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.293985366821289, + "rewards/margins": 0.6382049322128296, + "rewards/rejected": -1.9321902990341187, + "step": 3290 + }, + { + "epoch": 0.43, + "learning_rate": 3.5024214656200497e-06, + "logits/chosen": -1.996705412864685, + "logits/rejected": -1.7104390859603882, + "logps/chosen": -444.9064025878906, + "logps/rejected": -443.6212463378906, + "loss": 0.6214, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.434211015701294, + "rewards/margins": 0.5524640679359436, + "rewards/rejected": -1.9866750240325928, + "step": 3300 + }, + { + "epoch": 0.43, + "eval_logits/chosen": 0.656358540058136, + "eval_logits/rejected": 0.6742247343063354, + "eval_logps/chosen": -418.947998046875, + "eval_logps/rejected": -460.59857177734375, + "eval_loss": 0.5605348944664001, + "eval_rewards/accuracies": 0.6924999952316284, + "eval_rewards/chosen": -1.336552619934082, + "eval_rewards/margins": 0.6307366490364075, + "eval_rewards/rejected": -1.9672893285751343, + "eval_runtime": 1173.3928, + "eval_samples_per_second": 1.704, + "eval_steps_per_second": 0.852, + "step": 3300 + }, + { + "epoch": 0.43, + "learning_rate": 3.491947152959958e-06, + "logits/chosen": -2.102064847946167, + "logits/rejected": -1.9289686679840088, + "logps/chosen": -437.4242248535156, + "logps/rejected": -465.6644592285156, + "loss": 0.5751, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2784984111785889, + "rewards/margins": 0.5344318747520447, + "rewards/rejected": -1.8129304647445679, + "step": 3310 + }, + { + "epoch": 0.43, + "learning_rate": 3.4814521333663497e-06, + "logits/chosen": -2.1381969451904297, + "logits/rejected": -2.014378547668457, + "logps/chosen": -461.36907958984375, + "logps/rejected": -440.06787109375, + "loss": 0.5694, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.1574351787567139, + "rewards/margins": 0.610199511051178, + "rewards/rejected": -1.767634630203247, + "step": 3320 + }, + { + "epoch": 0.44, + "learning_rate": 3.4709366259231468e-06, + "logits/chosen": -1.9088739156723022, + "logits/rejected": -1.7347705364227295, + "logps/chosen": -422.6807556152344, + "logps/rejected": -433.4283142089844, + "loss": 0.5888, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.172013282775879, + "rewards/margins": 0.5078199505805969, + "rewards/rejected": -1.6798330545425415, + "step": 3330 + }, + { + "epoch": 0.44, + "learning_rate": 3.460400850141956e-06, + "logits/chosen": -1.9768264293670654, + "logits/rejected": -1.7686551809310913, + "logps/chosen": -385.18499755859375, + "logps/rejected": -421.01708984375, + "loss": 0.5462, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.3630168437957764, + "rewards/margins": 0.5688962936401367, + "rewards/rejected": -1.931913137435913, + "step": 3340 + }, + { + "epoch": 0.44, + "learning_rate": 3.4498450259574858e-06, + "logits/chosen": -1.8806072473526, + "logits/rejected": -1.845616340637207, + "logps/chosen": -414.7618713378906, + "logps/rejected": -438.54022216796875, + "loss": 0.6045, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.3012163639068604, + "rewards/margins": 0.42003631591796875, + "rewards/rejected": -1.7212527990341187, + "step": 3350 + }, + { + "epoch": 0.44, + "learning_rate": 3.439269373722957e-06, + "logits/chosen": -1.8833872079849243, + "logits/rejected": -1.8121143579483032, + "logps/chosen": -402.79156494140625, + "logps/rejected": -436.91632080078125, + "loss": 0.5561, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.2642663717269897, + "rewards/margins": 0.6149861812591553, + "rewards/rejected": -1.8792526721954346, + "step": 3360 + }, + { + "epoch": 0.44, + "learning_rate": 3.4286741142055014e-06, + "logits/chosen": -1.9846508502960205, + "logits/rejected": -1.9692661762237549, + "logps/chosen": -426.28167724609375, + "logps/rejected": -463.14971923828125, + "loss": 0.5499, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2158000469207764, + "rewards/margins": 0.5520261526107788, + "rewards/rejected": -1.7678263187408447, + "step": 3370 + }, + { + "epoch": 0.44, + "learning_rate": 3.4180594685815536e-06, + "logits/chosen": -1.9412851333618164, + "logits/rejected": -1.8633683919906616, + "logps/chosen": -368.2373046875, + "logps/rejected": -419.0936584472656, + "loss": 0.5812, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.277923583984375, + "rewards/margins": 0.536681056022644, + "rewards/rejected": -1.8146045207977295, + "step": 3380 + }, + { + "epoch": 0.44, + "learning_rate": 3.4074256584322336e-06, + "logits/chosen": -1.9246950149536133, + "logits/rejected": -1.8012104034423828, + "logps/chosen": -366.427978515625, + "logps/rejected": -421.9742736816406, + "loss": 0.486, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.0420225858688354, + "rewards/margins": 0.7852731347084045, + "rewards/rejected": -1.8272956609725952, + "step": 3390 + }, + { + "epoch": 0.44, + "learning_rate": 3.3967729057387213e-06, + "logits/chosen": -1.937070608139038, + "logits/rejected": -1.8224836587905884, + "logps/chosen": -417.2428283691406, + "logps/rejected": -430.810791015625, + "loss": 0.581, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.0852601528167725, + "rewards/margins": 0.539286732673645, + "rewards/rejected": -1.624547004699707, + "step": 3400 + }, + { + "epoch": 0.44, + "eval_logits/chosen": 0.5448690056800842, + "eval_logits/rejected": 0.583937406539917, + "eval_logps/chosen": -398.8811950683594, + "eval_logps/rejected": -440.7017517089844, + "eval_loss": 0.5563305616378784, + "eval_rewards/accuracies": 0.6984999775886536, + "eval_rewards/chosen": -1.1358840465545654, + "eval_rewards/margins": 0.6324369311332703, + "eval_rewards/rejected": -1.7683210372924805, + "eval_runtime": 1173.8192, + "eval_samples_per_second": 1.704, + "eval_steps_per_second": 0.852, + "step": 3400 + }, + { + "epoch": 0.45, + "learning_rate": 3.386101432877624e-06, + "logits/chosen": -2.051443099975586, + "logits/rejected": -1.9577932357788086, + "logps/chosen": -393.5393981933594, + "logps/rejected": -406.2938537597656, + "loss": 0.5609, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.0965825319290161, + "rewards/margins": 0.5381280779838562, + "rewards/rejected": -1.6347105503082275, + "step": 3410 + }, + { + "epoch": 0.45, + "learning_rate": 3.375411462616332e-06, + "logits/chosen": -2.104078769683838, + "logits/rejected": -1.994580864906311, + "logps/chosen": -422.4803771972656, + "logps/rejected": -492.143310546875, + "loss": 0.5181, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.1625874042510986, + "rewards/margins": 0.6946650743484497, + "rewards/rejected": -1.8572524785995483, + "step": 3420 + }, + { + "epoch": 0.45, + "learning_rate": 3.3647032181083696e-06, + "logits/chosen": -2.0899133682250977, + "logits/rejected": -1.9766260385513306, + "logps/chosen": -455.85955810546875, + "logps/rejected": -512.9901123046875, + "loss": 0.4945, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.3426158428192139, + "rewards/margins": 0.7741657495498657, + "rewards/rejected": -2.116781711578369, + "step": 3430 + }, + { + "epoch": 0.45, + "learning_rate": 3.3539769228887382e-06, + "logits/chosen": -2.0404748916625977, + "logits/rejected": -1.9174991846084595, + "logps/chosen": -448.553955078125, + "logps/rejected": -503.87762451171875, + "loss": 0.5073, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.2138594388961792, + "rewards/margins": 0.6975424885749817, + "rewards/rejected": -1.9114019870758057, + "step": 3440 + }, + { + "epoch": 0.45, + "learning_rate": 3.343232800869247e-06, + "logits/chosen": -1.865696907043457, + "logits/rejected": -1.7030518054962158, + "logps/chosen": -385.63775634765625, + "logps/rejected": -397.9042663574219, + "loss": 0.5341, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.3254315853118896, + "rewards/margins": 0.6739403009414673, + "rewards/rejected": -1.999372124671936, + "step": 3450 + }, + { + "epoch": 0.45, + "learning_rate": 3.33247107633384e-06, + "logits/chosen": -1.8744150400161743, + "logits/rejected": -1.8472354412078857, + "logps/chosen": -425.00030517578125, + "logps/rejected": -494.12347412109375, + "loss": 0.5114, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3889572620391846, + "rewards/margins": 0.8310378789901733, + "rewards/rejected": -2.2199950218200684, + "step": 3460 + }, + { + "epoch": 0.45, + "learning_rate": 3.3216919739339155e-06, + "logits/chosen": -1.9275439977645874, + "logits/rejected": -1.855700135231018, + "logps/chosen": -459.2312927246094, + "logps/rejected": -486.5613708496094, + "loss": 0.4922, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.5085200071334839, + "rewards/margins": 0.8183439373970032, + "rewards/rejected": -2.3268637657165527, + "step": 3470 + }, + { + "epoch": 0.46, + "learning_rate": 3.310895718683635e-06, + "logits/chosen": -1.9474788904190063, + "logits/rejected": -1.896143913269043, + "logps/chosen": -452.40032958984375, + "logps/rejected": -480.8934020996094, + "loss": 0.616, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.4318238496780396, + "rewards/margins": 0.533476710319519, + "rewards/rejected": -1.9653003215789795, + "step": 3480 + }, + { + "epoch": 0.46, + "learning_rate": 3.3000825359552256e-06, + "logits/chosen": -1.9331462383270264, + "logits/rejected": -1.8510589599609375, + "logps/chosen": -394.94024658203125, + "logps/rejected": -465.19512939453125, + "loss": 0.5357, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.0899837017059326, + "rewards/margins": 0.7150499224662781, + "rewards/rejected": -1.8050334453582764, + "step": 3490 + }, + { + "epoch": 0.46, + "learning_rate": 3.2892526514742778e-06, + "logits/chosen": -1.9553263187408447, + "logits/rejected": -1.8354969024658203, + "logps/chosen": -401.61383056640625, + "logps/rejected": -421.52099609375, + "loss": 0.5422, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.112273931503296, + "rewards/margins": 0.6440818905830383, + "rewards/rejected": -1.7563560009002686, + "step": 3500 + }, + { + "epoch": 0.46, + "eval_logits/chosen": 0.5330484509468079, + "eval_logits/rejected": 0.5734737515449524, + "eval_logps/chosen": -388.93182373046875, + "eval_logps/rejected": -425.3734130859375, + "eval_loss": 0.5589743852615356, + "eval_rewards/accuracies": 0.6915000081062317, + "eval_rewards/chosen": -1.0363909006118774, + "eval_rewards/margins": 0.5786464810371399, + "eval_rewards/rejected": -1.615037441253662, + "eval_runtime": 1173.0533, + "eval_samples_per_second": 1.705, + "eval_steps_per_second": 0.852, + "step": 3500 + }, + { + "epoch": 0.46, + "learning_rate": 3.27840629131503e-06, + "logits/chosen": -2.0102248191833496, + "logits/rejected": -1.8568379878997803, + "logps/chosen": -396.32257080078125, + "logps/rejected": -432.460693359375, + "loss": 0.5452, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.035860300064087, + "rewards/margins": 0.662264883518219, + "rewards/rejected": -1.6981252431869507, + "step": 3510 + }, + { + "epoch": 0.46, + "learning_rate": 3.2675436818956522e-06, + "logits/chosen": -2.016779661178589, + "logits/rejected": -1.9230639934539795, + "logps/chosen": -359.00775146484375, + "logps/rejected": -420.87335205078125, + "loss": 0.5526, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.9780494570732117, + "rewards/margins": 0.529609203338623, + "rewards/rejected": -1.5076587200164795, + "step": 3520 + }, + { + "epoch": 0.46, + "learning_rate": 3.2566650499735185e-06, + "logits/chosen": -1.8304370641708374, + "logits/rejected": -1.7253713607788086, + "logps/chosen": -419.9305725097656, + "logps/rejected": -476.528076171875, + "loss": 0.4987, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.106669545173645, + "rewards/margins": 0.8515464067459106, + "rewards/rejected": -1.9582157135009766, + "step": 3530 + }, + { + "epoch": 0.46, + "learning_rate": 3.2457706226404715e-06, + "logits/chosen": -1.9432268142700195, + "logits/rejected": -1.873006820678711, + "logps/chosen": -408.003173828125, + "logps/rejected": -421.4966735839844, + "loss": 0.6014, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2254436016082764, + "rewards/margins": 0.5503524541854858, + "rewards/rejected": -1.7757961750030518, + "step": 3540 + }, + { + "epoch": 0.46, + "learning_rate": 3.2348606273180847e-06, + "logits/chosen": -2.1072030067443848, + "logits/rejected": -1.8886438608169556, + "logps/chosen": -429.07977294921875, + "logps/rejected": -406.6357421875, + "loss": 0.5499, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0243723392486572, + "rewards/margins": 0.6358232498168945, + "rewards/rejected": -1.6601955890655518, + "step": 3550 + }, + { + "epoch": 0.47, + "learning_rate": 3.2239352917529165e-06, + "logits/chosen": -2.1119418144226074, + "logits/rejected": -1.9380228519439697, + "logps/chosen": -445.6163024902344, + "logps/rejected": -476.1900939941406, + "loss": 0.5612, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2269493341445923, + "rewards/margins": 0.5179942846298218, + "rewards/rejected": -1.744943380355835, + "step": 3560 + }, + { + "epoch": 0.47, + "learning_rate": 3.2129948440117487e-06, + "logits/chosen": -1.987892508506775, + "logits/rejected": -1.941389799118042, + "logps/chosen": -425.90399169921875, + "logps/rejected": -453.47479248046875, + "loss": 0.6117, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.4512683153152466, + "rewards/margins": 0.43263331055641174, + "rewards/rejected": -1.883901596069336, + "step": 3570 + }, + { + "epoch": 0.47, + "learning_rate": 3.202039512476833e-06, + "logits/chosen": -1.837915062904358, + "logits/rejected": -1.7621958255767822, + "logps/chosen": -383.6486511230469, + "logps/rejected": -465.4740295410156, + "loss": 0.4819, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.3156425952911377, + "rewards/margins": 0.8208361864089966, + "rewards/rejected": -2.136478900909424, + "step": 3580 + }, + { + "epoch": 0.47, + "learning_rate": 3.1910695258411216e-06, + "logits/chosen": -2.027493715286255, + "logits/rejected": -1.7317771911621094, + "logps/chosen": -396.72076416015625, + "logps/rejected": -412.47100830078125, + "loss": 0.5382, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.1293015480041504, + "rewards/margins": 0.7101532816886902, + "rewards/rejected": -1.8394548892974854, + "step": 3590 + }, + { + "epoch": 0.47, + "learning_rate": 3.1800851131034904e-06, + "logits/chosen": -1.9563058614730835, + "logits/rejected": -1.87738835811615, + "logps/chosen": -409.775146484375, + "logps/rejected": -439.15972900390625, + "loss": 0.5626, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2867438793182373, + "rewards/margins": 0.6279332041740417, + "rewards/rejected": -1.9146772623062134, + "step": 3600 + }, + { + "epoch": 0.47, + "eval_logits/chosen": 0.7520022988319397, + "eval_logits/rejected": 0.786247968673706, + "eval_logps/chosen": -396.4902038574219, + "eval_logps/rejected": -438.8792419433594, + "eval_loss": 0.5602012276649475, + "eval_rewards/accuracies": 0.6909999847412109, + "eval_rewards/chosen": -1.1119749546051025, + "eval_rewards/margins": 0.6381211876869202, + "eval_rewards/rejected": -1.750096082687378, + "eval_runtime": 1173.4035, + "eval_samples_per_second": 1.704, + "eval_steps_per_second": 0.852, + "step": 3600 + }, + { + "epoch": 0.47, + "learning_rate": 3.169086503563962e-06, + "logits/chosen": -2.0438876152038574, + "logits/rejected": -1.9742358922958374, + "logps/chosen": -373.54742431640625, + "logps/rejected": -449.31475830078125, + "loss": 0.562, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9906983375549316, + "rewards/margins": 0.6011732220649719, + "rewards/rejected": -1.5918715000152588, + "step": 3610 + }, + { + "epoch": 0.47, + "learning_rate": 3.1580739268189165e-06, + "logits/chosen": -1.9275391101837158, + "logits/rejected": -1.7515623569488525, + "logps/chosen": -396.3177490234375, + "logps/rejected": -434.313232421875, + "loss": 0.518, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.0211318731307983, + "rewards/margins": 0.7688616514205933, + "rewards/rejected": -1.7899936437606812, + "step": 3620 + }, + { + "epoch": 0.48, + "learning_rate": 3.147047612756302e-06, + "logits/chosen": -1.8347206115722656, + "logits/rejected": -1.8424797058105469, + "logps/chosen": -425.95281982421875, + "logps/rejected": -481.7438049316406, + "loss": 0.5059, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.0812221765518188, + "rewards/margins": 0.7794182896614075, + "rewards/rejected": -1.860640287399292, + "step": 3630 + }, + { + "epoch": 0.48, + "learning_rate": 3.136007791550833e-06, + "logits/chosen": -1.7254505157470703, + "logits/rejected": -1.5392547845840454, + "logps/chosen": -370.35992431640625, + "logps/rejected": -395.29742431640625, + "loss": 0.5626, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.103926658630371, + "rewards/margins": 0.6251890063285828, + "rewards/rejected": -1.7291157245635986, + "step": 3640 + }, + { + "epoch": 0.48, + "learning_rate": 3.1249546936591848e-06, + "logits/chosen": -1.8757476806640625, + "logits/rejected": -1.7597439289093018, + "logps/chosen": -354.79046630859375, + "logps/rejected": -415.020751953125, + "loss": 0.5309, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.9962058067321777, + "rewards/margins": 0.6632649898529053, + "rewards/rejected": -1.659470796585083, + "step": 3650 + }, + { + "epoch": 0.48, + "learning_rate": 3.1138885498151843e-06, + "logits/chosen": -1.6727317571640015, + "logits/rejected": -1.6530840396881104, + "logps/chosen": -423.81573486328125, + "logps/rejected": -481.62615966796875, + "loss": 0.443, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3244787454605103, + "rewards/margins": 1.0427327156066895, + "rewards/rejected": -2.3672115802764893, + "step": 3660 + }, + { + "epoch": 0.48, + "learning_rate": 3.1028095910249937e-06, + "logits/chosen": -1.9963428974151611, + "logits/rejected": -1.7119239568710327, + "logps/chosen": -432.7828063964844, + "logps/rejected": -448.8419494628906, + "loss": 0.5304, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.3926994800567627, + "rewards/margins": 0.7119554281234741, + "rewards/rejected": -2.1046550273895264, + "step": 3670 + }, + { + "epoch": 0.48, + "learning_rate": 3.0917180485622895e-06, + "logits/chosen": -1.8475887775421143, + "logits/rejected": -1.564581274986267, + "logps/chosen": -446.0194396972656, + "logps/rejected": -470.3948669433594, + "loss": 0.576, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.546870231628418, + "rewards/margins": 0.7647116184234619, + "rewards/rejected": -2.311582088470459, + "step": 3680 + }, + { + "epoch": 0.48, + "learning_rate": 3.0806141539634294e-06, + "logits/chosen": -1.904123306274414, + "logits/rejected": -1.6115459203720093, + "logps/chosen": -418.052001953125, + "logps/rejected": -437.1920471191406, + "loss": 0.557, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.486206293106079, + "rewards/margins": 0.6802877187728882, + "rewards/rejected": -2.1664938926696777, + "step": 3690 + }, + { + "epoch": 0.48, + "learning_rate": 3.069498139022624e-06, + "logits/chosen": -2.0392978191375732, + "logits/rejected": -1.8265810012817383, + "logps/chosen": -451.705078125, + "logps/rejected": -444.59954833984375, + "loss": 0.627, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.4540126323699951, + "rewards/margins": 0.47257882356643677, + "rewards/rejected": -1.9265915155410767, + "step": 3700 + }, + { + "epoch": 0.48, + "eval_logits/chosen": 0.8575670123100281, + "eval_logits/rejected": 0.8809006810188293, + "eval_logps/chosen": -413.73907470703125, + "eval_logps/rejected": -458.7536926269531, + "eval_loss": 0.5579034686088562, + "eval_rewards/accuracies": 0.6934999823570251, + "eval_rewards/chosen": -1.2844632863998413, + "eval_rewards/margins": 0.6643770337104797, + "eval_rewards/rejected": -1.9488401412963867, + "eval_runtime": 1173.4041, + "eval_samples_per_second": 1.704, + "eval_steps_per_second": 0.852, + "step": 3700 + }, + { + "epoch": 0.49, + "learning_rate": 3.0583702357870964e-06, + "logits/chosen": -1.8973909616470337, + "logits/rejected": -1.836904525756836, + "logps/chosen": -453.96282958984375, + "logps/rejected": -506.8397521972656, + "loss": 0.5881, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2685407400131226, + "rewards/margins": 0.5457057952880859, + "rewards/rejected": -1.8142467737197876, + "step": 3710 + }, + { + "epoch": 0.49, + "learning_rate": 3.0472306765522393e-06, + "logits/chosen": -1.9780431985855103, + "logits/rejected": -1.767011284828186, + "logps/chosen": -373.0862731933594, + "logps/rejected": -420.5169982910156, + "loss": 0.5546, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.0875823497772217, + "rewards/margins": 0.7041456699371338, + "rewards/rejected": -1.7917280197143555, + "step": 3720 + }, + { + "epoch": 0.49, + "learning_rate": 3.0360796938567628e-06, + "logits/chosen": -2.005666732788086, + "logits/rejected": -1.856957197189331, + "logps/chosen": -408.0935974121094, + "logps/rejected": -428.3479919433594, + "loss": 0.5623, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1865909099578857, + "rewards/margins": 0.6112765073776245, + "rewards/rejected": -1.7978674173355103, + "step": 3730 + }, + { + "epoch": 0.49, + "learning_rate": 3.0249175204778435e-06, + "logits/chosen": -1.895155668258667, + "logits/rejected": -1.882965326309204, + "logps/chosen": -400.49566650390625, + "logps/rejected": -446.05010986328125, + "loss": 0.5208, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.2255418300628662, + "rewards/margins": 0.6657425165176392, + "rewards/rejected": -1.8912845849990845, + "step": 3740 + }, + { + "epoch": 0.49, + "learning_rate": 3.0137443894262634e-06, + "logits/chosen": -1.6782268285751343, + "logits/rejected": -1.5924017429351807, + "logps/chosen": -419.470703125, + "logps/rejected": -440.67437744140625, + "loss": 0.4925, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.1807644367218018, + "rewards/margins": 0.8425852060317993, + "rewards/rejected": -2.0233497619628906, + "step": 3750 + }, + { + "epoch": 0.49, + "learning_rate": 3.0025605339415476e-06, + "logits/chosen": -1.8740978240966797, + "logits/rejected": -1.718133568763733, + "logps/chosen": -398.0804748535156, + "logps/rejected": -441.2355041503906, + "loss": 0.519, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1523767709732056, + "rewards/margins": 0.7449162602424622, + "rewards/rejected": -1.8972930908203125, + "step": 3760 + }, + { + "epoch": 0.49, + "learning_rate": 2.9913661874870923e-06, + "logits/chosen": -1.869728446006775, + "logits/rejected": -1.7910888195037842, + "logps/chosen": -395.14031982421875, + "logps/rejected": -419.30487060546875, + "loss": 0.5382, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.1323497295379639, + "rewards/margins": 0.6088643074035645, + "rewards/rejected": -1.7412141561508179, + "step": 3770 + }, + { + "epoch": 0.49, + "learning_rate": 2.980161583745294e-06, + "logits/chosen": -1.9254146814346313, + "logits/rejected": -1.8167539834976196, + "logps/chosen": -436.7218322753906, + "logps/rejected": -462.28961181640625, + "loss": 0.4902, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1210156679153442, + "rewards/margins": 0.7795640230178833, + "rewards/rejected": -1.9005796909332275, + "step": 3780 + }, + { + "epoch": 0.5, + "learning_rate": 2.96894695661267e-06, + "logits/chosen": -1.9709405899047852, + "logits/rejected": -1.8259330987930298, + "logps/chosen": -451.60150146484375, + "logps/rejected": -450.0809631347656, + "loss": 0.6165, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.3595227003097534, + "rewards/margins": 0.4553799033164978, + "rewards/rejected": -1.814902901649475, + "step": 3790 + }, + { + "epoch": 0.5, + "learning_rate": 2.9577225401949773e-06, + "logits/chosen": -1.6736812591552734, + "logits/rejected": -1.669345498085022, + "logps/chosen": -382.5348815917969, + "logps/rejected": -435.5172424316406, + "loss": 0.5522, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.350743293762207, + "rewards/margins": 0.6797314882278442, + "rewards/rejected": -2.0304746627807617, + "step": 3800 + }, + { + "epoch": 0.5, + "eval_logits/chosen": 0.8745436072349548, + "eval_logits/rejected": 0.911791980266571, + "eval_logps/chosen": -423.3916320800781, + "eval_logps/rejected": -470.93115234375, + "eval_loss": 0.5561516284942627, + "eval_rewards/accuracies": 0.6974999904632568, + "eval_rewards/chosen": -1.380988597869873, + "eval_rewards/margins": 0.6896264553070068, + "eval_rewards/rejected": -2.07061505317688, + "eval_runtime": 1173.8815, + "eval_samples_per_second": 1.704, + "eval_steps_per_second": 0.852, + "step": 3800 + }, + { + "epoch": 0.5, + "learning_rate": 2.946488568802324e-06, + "logits/chosen": -1.7115625143051147, + "logits/rejected": -1.5814566612243652, + "logps/chosen": -417.40313720703125, + "logps/rejected": -453.150634765625, + "loss": 0.5803, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3829896450042725, + "rewards/margins": 0.5222972631454468, + "rewards/rejected": -1.9052867889404297, + "step": 3810 + }, + { + "epoch": 0.5, + "learning_rate": 2.935245276944278e-06, + "logits/chosen": -1.7636444568634033, + "logits/rejected": -1.6741498708724976, + "logps/chosen": -433.516357421875, + "logps/rejected": -457.4190368652344, + "loss": 0.5762, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.2248344421386719, + "rewards/margins": 0.5882797837257385, + "rewards/rejected": -1.8131141662597656, + "step": 3820 + }, + { + "epoch": 0.5, + "learning_rate": 2.9239928993249723e-06, + "logits/chosen": -1.8020870685577393, + "logits/rejected": -1.7303444147109985, + "logps/chosen": -424.1175842285156, + "logps/rejected": -479.3556213378906, + "loss": 0.5002, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2616357803344727, + "rewards/margins": 0.9329161643981934, + "rewards/rejected": -2.194552183151245, + "step": 3830 + }, + { + "epoch": 0.5, + "learning_rate": 2.912731670838207e-06, + "logits/chosen": -1.7195581197738647, + "logits/rejected": -1.5701277256011963, + "logps/chosen": -412.962646484375, + "logps/rejected": -474.04156494140625, + "loss": 0.6148, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.4015324115753174, + "rewards/margins": 0.593268096446991, + "rewards/rejected": -1.994800329208374, + "step": 3840 + }, + { + "epoch": 0.5, + "learning_rate": 2.901461826562543e-06, + "logits/chosen": -1.814399003982544, + "logits/rejected": -1.61077082157135, + "logps/chosen": -366.83282470703125, + "logps/rejected": -415.51483154296875, + "loss": 0.5527, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.2060191631317139, + "rewards/margins": 0.7003362774848938, + "rewards/rejected": -1.9063555002212524, + "step": 3850 + }, + { + "epoch": 0.51, + "learning_rate": 2.8901836017563966e-06, + "logits/chosen": -1.8173844814300537, + "logits/rejected": -1.6594011783599854, + "logps/chosen": -419.95294189453125, + "logps/rejected": -427.8134765625, + "loss": 0.6101, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2266980409622192, + "rewards/margins": 0.46940097212791443, + "rewards/rejected": -1.696099042892456, + "step": 3860 + }, + { + "epoch": 0.51, + "learning_rate": 2.8788972318531272e-06, + "logits/chosen": -1.8642652034759521, + "logits/rejected": -1.7005784511566162, + "logps/chosen": -385.55279541015625, + "logps/rejected": -431.86590576171875, + "loss": 0.574, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1929553747177124, + "rewards/margins": 0.5182313919067383, + "rewards/rejected": -1.7111867666244507, + "step": 3870 + }, + { + "epoch": 0.51, + "learning_rate": 2.8676029524561255e-06, + "logits/chosen": -1.8119266033172607, + "logits/rejected": -1.7860915660858154, + "logps/chosen": -419.46258544921875, + "logps/rejected": -460.0663146972656, + "loss": 0.5882, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.134201169013977, + "rewards/margins": 0.5569092631340027, + "rewards/rejected": -1.691110372543335, + "step": 3880 + }, + { + "epoch": 0.51, + "learning_rate": 2.8563009993338906e-06, + "logits/chosen": -1.818861722946167, + "logits/rejected": -1.6572418212890625, + "logps/chosen": -381.38958740234375, + "logps/rejected": -442.7064514160156, + "loss": 0.5265, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1867233514785767, + "rewards/margins": 0.7000702619552612, + "rewards/rejected": -1.8867934942245483, + "step": 3890 + }, + { + "epoch": 0.51, + "learning_rate": 2.844991608415113e-06, + "logits/chosen": -1.8968263864517212, + "logits/rejected": -1.835386872291565, + "logps/chosen": -421.7579040527344, + "logps/rejected": -479.73699951171875, + "loss": 0.5734, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.3715274333953857, + "rewards/margins": 0.689576268196106, + "rewards/rejected": -2.061103582382202, + "step": 3900 + }, + { + "epoch": 0.51, + "eval_logits/chosen": 0.7416585683822632, + "eval_logits/rejected": 0.7968641519546509, + "eval_logps/chosen": -424.93609619140625, + "eval_logps/rejected": -472.9461669921875, + "eval_loss": 0.5556566715240479, + "eval_rewards/accuracies": 0.6970000267028809, + "eval_rewards/chosen": -1.39643394947052, + "eval_rewards/margins": 0.6943311095237732, + "eval_rewards/rejected": -2.0907649993896484, + "eval_runtime": 1173.1781, + "eval_samples_per_second": 1.705, + "eval_steps_per_second": 0.852, + "step": 3900 + }, + { + "epoch": 0.51, + "learning_rate": 2.833675015783746e-06, + "logits/chosen": -1.728763222694397, + "logits/rejected": -1.7468255758285522, + "logps/chosen": -406.5009460449219, + "logps/rejected": -470.13671875, + "loss": 0.5588, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.5327627658843994, + "rewards/margins": 0.6243911385536194, + "rewards/rejected": -2.157153606414795, + "step": 3910 + }, + { + "epoch": 0.51, + "learning_rate": 2.8223514576740784e-06, + "logits/chosen": -1.710656762123108, + "logits/rejected": -1.6441787481307983, + "logps/chosen": -368.9499816894531, + "logps/rejected": -460.90606689453125, + "loss": 0.5291, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.2226186990737915, + "rewards/margins": 0.6628161072731018, + "rewards/rejected": -1.8854347467422485, + "step": 3920 + }, + { + "epoch": 0.51, + "learning_rate": 2.8110211704658073e-06, + "logits/chosen": -1.9349536895751953, + "logits/rejected": -1.797790288925171, + "logps/chosen": -454.807373046875, + "logps/rejected": -483.8372497558594, + "loss": 0.5261, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.3279203176498413, + "rewards/margins": 0.6820384860038757, + "rewards/rejected": -2.0099589824676514, + "step": 3930 + }, + { + "epoch": 0.52, + "learning_rate": 2.7996843906790955e-06, + "logits/chosen": -1.7957370281219482, + "logits/rejected": -1.6038751602172852, + "logps/chosen": -396.48980712890625, + "logps/rejected": -440.9287109375, + "loss": 0.6408, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.3963592052459717, + "rewards/margins": 0.4027465879917145, + "rewards/rejected": -1.7991058826446533, + "step": 3940 + }, + { + "epoch": 0.52, + "learning_rate": 2.7883413549696396e-06, + "logits/chosen": -1.888304352760315, + "logits/rejected": -1.7081407308578491, + "logps/chosen": -428.5066833496094, + "logps/rejected": -495.19647216796875, + "loss": 0.445, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.211716651916504, + "rewards/margins": 0.9643009305000305, + "rewards/rejected": -2.1760175228118896, + "step": 3950 + }, + { + "epoch": 0.52, + "learning_rate": 2.776992300123732e-06, + "logits/chosen": -1.6737916469573975, + "logits/rejected": -1.5721557140350342, + "logps/chosen": -391.4358825683594, + "logps/rejected": -468.9794006347656, + "loss": 0.6174, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.308905839920044, + "rewards/margins": 0.7785030603408813, + "rewards/rejected": -2.0874087810516357, + "step": 3960 + }, + { + "epoch": 0.52, + "learning_rate": 2.7656374630533113e-06, + "logits/chosen": -1.8585602045059204, + "logits/rejected": -1.8566067218780518, + "logps/chosen": -390.7518005371094, + "logps/rejected": -458.5517578125, + "loss": 0.5299, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.4467813968658447, + "rewards/margins": 0.6529810428619385, + "rewards/rejected": -2.099762439727783, + "step": 3970 + }, + { + "epoch": 0.52, + "learning_rate": 2.754277080791021e-06, + "logits/chosen": -1.8200175762176514, + "logits/rejected": -1.7867431640625, + "logps/chosen": -438.35693359375, + "logps/rejected": -460.3201599121094, + "loss": 0.7543, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.56465744972229, + "rewards/margins": 0.29193997383117676, + "rewards/rejected": -1.856597661972046, + "step": 3980 + }, + { + "epoch": 0.52, + "learning_rate": 2.742911390485262e-06, + "logits/chosen": -1.6067097187042236, + "logits/rejected": -1.6185413599014282, + "logps/chosen": -369.546630859375, + "logps/rejected": -407.5931091308594, + "loss": 0.595, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.4288946390151978, + "rewards/margins": 0.47618919610977173, + "rewards/rejected": -1.9050838947296143, + "step": 3990 + }, + { + "epoch": 0.52, + "learning_rate": 2.731540629395239e-06, + "logits/chosen": -1.738416075706482, + "logits/rejected": -1.681363821029663, + "logps/chosen": -443.32257080078125, + "logps/rejected": -460.7607421875, + "loss": 0.612, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.5387576818466187, + "rewards/margins": 0.45962828397750854, + "rewards/rejected": -1.998386025428772, + "step": 4000 + }, + { + "epoch": 0.52, + "eval_logits/chosen": 0.8717538714408875, + "eval_logits/rejected": 0.8941403031349182, + "eval_logps/chosen": -447.7853698730469, + "eval_logps/rejected": -496.18499755859375, + "eval_loss": 0.5548127889633179, + "eval_rewards/accuracies": 0.7074999809265137, + "eval_rewards/chosen": -1.6249265670776367, + "eval_rewards/margins": 0.6982267498970032, + "eval_rewards/rejected": -2.323153257369995, + "eval_runtime": 1172.9673, + "eval_samples_per_second": 1.705, + "eval_steps_per_second": 0.853, + "step": 4000 + }, + { + "epoch": 0.52, + "learning_rate": 2.7201650348860115e-06, + "logits/chosen": -1.8293654918670654, + "logits/rejected": -1.6924068927764893, + "logps/chosen": -425.6841735839844, + "logps/rejected": -457.46051025390625, + "loss": 0.5073, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6868245601654053, + "rewards/margins": 0.7679763436317444, + "rewards/rejected": -2.454801082611084, + "step": 4010 + }, + { + "epoch": 0.53, + "learning_rate": 2.7087848444235354e-06, + "logits/chosen": -1.9378684759140015, + "logits/rejected": -1.771817922592163, + "logps/chosen": -456.98504638671875, + "logps/rejected": -536.107421875, + "loss": 0.4734, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.6349451541900635, + "rewards/margins": 0.9969595670700073, + "rewards/rejected": -2.6319048404693604, + "step": 4020 + }, + { + "epoch": 0.53, + "learning_rate": 2.697400295569707e-06, + "logits/chosen": -1.890873908996582, + "logits/rejected": -1.9099485874176025, + "logps/chosen": -416.513671875, + "logps/rejected": -483.06781005859375, + "loss": 0.6058, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.5668065547943115, + "rewards/margins": 0.696664035320282, + "rewards/rejected": -2.263470411300659, + "step": 4030 + }, + { + "epoch": 0.53, + "learning_rate": 2.6860116259774065e-06, + "logits/chosen": -1.7504587173461914, + "logits/rejected": -1.6228545904159546, + "logps/chosen": -450.67535400390625, + "logps/rejected": -522.6287841796875, + "loss": 0.4873, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.4655582904815674, + "rewards/margins": 0.89894038438797, + "rewards/rejected": -2.3644988536834717, + "step": 4040 + }, + { + "epoch": 0.53, + "learning_rate": 2.674619073385531e-06, + "logits/chosen": -1.7882649898529053, + "logits/rejected": -1.7534267902374268, + "logps/chosen": -383.7250061035156, + "logps/rejected": -463.1366271972656, + "loss": 0.579, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.245359182357788, + "rewards/margins": 0.7908748388290405, + "rewards/rejected": -2.036233901977539, + "step": 4050 + }, + { + "epoch": 0.53, + "learning_rate": 2.663222875614038e-06, + "logits/chosen": -1.9012863636016846, + "logits/rejected": -1.7262403964996338, + "logps/chosen": -394.57708740234375, + "logps/rejected": -458.18414306640625, + "loss": 0.6095, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.335689902305603, + "rewards/margins": 0.5589646100997925, + "rewards/rejected": -1.8946545124053955, + "step": 4060 + }, + { + "epoch": 0.53, + "learning_rate": 2.6518232705589775e-06, + "logits/chosen": -1.9649426937103271, + "logits/rejected": -1.8220106363296509, + "logps/chosen": -378.5050354003906, + "logps/rejected": -460.794677734375, + "loss": 0.4866, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.0446014404296875, + "rewards/margins": 0.8659340739250183, + "rewards/rejected": -1.9105355739593506, + "step": 4070 + }, + { + "epoch": 0.53, + "learning_rate": 2.640420496187528e-06, + "logits/chosen": -1.921217679977417, + "logits/rejected": -1.7409846782684326, + "logps/chosen": -418.1365661621094, + "logps/rejected": -441.38494873046875, + "loss": 0.5027, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.063091516494751, + "rewards/margins": 0.8426526784896851, + "rewards/rejected": -1.9057443141937256, + "step": 4080 + }, + { + "epoch": 0.54, + "learning_rate": 2.629014790533025e-06, + "logits/chosen": -1.8884108066558838, + "logits/rejected": -1.6967947483062744, + "logps/chosen": -426.567626953125, + "logps/rejected": -445.81134033203125, + "loss": 0.4957, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1580777168273926, + "rewards/margins": 0.8561995625495911, + "rewards/rejected": -2.014277458190918, + "step": 4090 + }, + { + "epoch": 0.54, + "learning_rate": 2.617606391689996e-06, + "logits/chosen": -1.9376146793365479, + "logits/rejected": -1.746341347694397, + "logps/chosen": -382.73590087890625, + "logps/rejected": -441.44134521484375, + "loss": 0.5357, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.0421863794326782, + "rewards/margins": 0.8114742040634155, + "rewards/rejected": -1.8536605834960938, + "step": 4100 + }, + { + "epoch": 0.54, + "eval_logits/chosen": 0.5101784467697144, + "eval_logits/rejected": 0.5836014747619629, + "eval_logps/chosen": -404.91351318359375, + "eval_logps/rejected": -452.53375244140625, + "eval_loss": 0.5587130188941956, + "eval_rewards/accuracies": 0.6995000243186951, + "eval_rewards/chosen": -1.196208119392395, + "eval_rewards/margins": 0.6904324889183044, + "eval_rewards/rejected": -1.8866406679153442, + "eval_runtime": 1172.8993, + "eval_samples_per_second": 1.705, + "eval_steps_per_second": 0.853, + "step": 4100 + }, + { + "epoch": 0.54, + "learning_rate": 2.6061955378091896e-06, + "logits/chosen": -1.8436400890350342, + "logits/rejected": -1.7029361724853516, + "logps/chosen": -379.65777587890625, + "logps/rejected": -479.08734130859375, + "loss": 0.5021, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1766784191131592, + "rewards/margins": 0.9267901182174683, + "rewards/rejected": -2.103468656539917, + "step": 4110 + }, + { + "epoch": 0.54, + "learning_rate": 2.5947824670926025e-06, + "logits/chosen": -1.9190120697021484, + "logits/rejected": -1.902486801147461, + "logps/chosen": -372.6246337890625, + "logps/rejected": -463.3282775878906, + "loss": 0.4966, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.1196424961090088, + "rewards/margins": 0.8446052670478821, + "rewards/rejected": -1.964247465133667, + "step": 4120 + }, + { + "epoch": 0.54, + "learning_rate": 2.583367417788508e-06, + "logits/chosen": -1.7358171939849854, + "logits/rejected": -1.6001354455947876, + "logps/chosen": -416.04217529296875, + "logps/rejected": -481.5262756347656, + "loss": 0.5564, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.5063390731811523, + "rewards/margins": 0.796363353729248, + "rewards/rejected": -2.3027024269104004, + "step": 4130 + }, + { + "epoch": 0.54, + "learning_rate": 2.5719506281864838e-06, + "logits/chosen": -1.901760458946228, + "logits/rejected": -1.8143211603164673, + "logps/chosen": -435.39495849609375, + "logps/rejected": -438.5569763183594, + "loss": 0.6003, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.3779494762420654, + "rewards/margins": 0.6639505624771118, + "rewards/rejected": -2.041900157928467, + "step": 4140 + }, + { + "epoch": 0.54, + "learning_rate": 2.5605323366124335e-06, + "logits/chosen": -1.8352622985839844, + "logits/rejected": -1.6869032382965088, + "logps/chosen": -411.93011474609375, + "logps/rejected": -476.12548828125, + "loss": 0.5492, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.41312575340271, + "rewards/margins": 0.7376724481582642, + "rewards/rejected": -2.1507978439331055, + "step": 4150 + }, + { + "epoch": 0.54, + "learning_rate": 2.5491127814236172e-06, + "logits/chosen": -1.7956464290618896, + "logits/rejected": -1.8745015859603882, + "logps/chosen": -333.6759033203125, + "logps/rejected": -437.91644287109375, + "loss": 0.5616, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.0492000579833984, + "rewards/margins": 0.6359735727310181, + "rewards/rejected": -1.6851736307144165, + "step": 4160 + }, + { + "epoch": 0.55, + "learning_rate": 2.537692201003671e-06, + "logits/chosen": -1.8365240097045898, + "logits/rejected": -1.851731538772583, + "logps/chosen": -418.044677734375, + "logps/rejected": -476.5796813964844, + "loss": 0.5671, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.3667340278625488, + "rewards/margins": 0.718292236328125, + "rewards/rejected": -2.085026264190674, + "step": 4170 + }, + { + "epoch": 0.55, + "learning_rate": 2.526270833757635e-06, + "logits/chosen": -1.9323654174804688, + "logits/rejected": -1.7108045816421509, + "logps/chosen": -397.91290283203125, + "logps/rejected": -448.17999267578125, + "loss": 0.5709, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.2763726711273193, + "rewards/margins": 0.6827574968338013, + "rewards/rejected": -1.9591302871704102, + "step": 4180 + }, + { + "epoch": 0.55, + "learning_rate": 2.514848918106971e-06, + "logits/chosen": -1.7983453273773193, + "logits/rejected": -1.5887773036956787, + "logps/chosen": -421.1139221191406, + "logps/rejected": -466.9378356933594, + "loss": 0.5187, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.395037293434143, + "rewards/margins": 0.8693952560424805, + "rewards/rejected": -2.264432430267334, + "step": 4190 + }, + { + "epoch": 0.55, + "learning_rate": 2.503426692484594e-06, + "logits/chosen": -1.888593316078186, + "logits/rejected": -1.8359572887420654, + "logps/chosen": -388.5292053222656, + "logps/rejected": -456.5140075683594, + "loss": 0.5648, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.2017731666564941, + "rewards/margins": 0.5971595048904419, + "rewards/rejected": -1.798932671546936, + "step": 4200 + }, + { + "epoch": 0.55, + "eval_logits/chosen": 0.6439515352249146, + "eval_logits/rejected": 0.7062841653823853, + "eval_logps/chosen": -416.76263427734375, + "eval_logps/rejected": -468.4803771972656, + "eval_loss": 0.5569632649421692, + "eval_rewards/accuracies": 0.6940000057220459, + "eval_rewards/chosen": -1.314698576927185, + "eval_rewards/margins": 0.7314084768295288, + "eval_rewards/rejected": -2.046107053756714, + "eval_runtime": 1173.6503, + "eval_samples_per_second": 1.704, + "eval_steps_per_second": 0.852, + "step": 4200 + }, + { + "epoch": 0.55, + "learning_rate": 2.492004395329883e-06, + "logits/chosen": -1.7668720483779907, + "logits/rejected": -1.7909843921661377, + "logps/chosen": -388.5613708496094, + "logps/rejected": -459.2867736816406, + "loss": 0.4986, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.2471996545791626, + "rewards/margins": 0.8756244778633118, + "rewards/rejected": -2.122824192047119, + "step": 4210 + }, + { + "epoch": 0.55, + "learning_rate": 2.4805822650837165e-06, + "logits/chosen": -1.641169786453247, + "logits/rejected": -1.5679924488067627, + "logps/chosen": -380.5749206542969, + "logps/rejected": -509.94647216796875, + "loss": 0.4449, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.2884600162506104, + "rewards/margins": 1.1968393325805664, + "rewards/rejected": -2.485299587249756, + "step": 4220 + }, + { + "epoch": 0.55, + "learning_rate": 2.4691605401834843e-06, + "logits/chosen": -1.9538590908050537, + "logits/rejected": -1.8063548803329468, + "logps/chosen": -446.77532958984375, + "logps/rejected": -504.0521545410156, + "loss": 0.5588, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.4774322509765625, + "rewards/margins": 0.6580866575241089, + "rewards/rejected": -2.135518789291382, + "step": 4230 + }, + { + "epoch": 0.55, + "learning_rate": 2.457739459058117e-06, + "logits/chosen": -1.9371347427368164, + "logits/rejected": -1.8718658685684204, + "logps/chosen": -493.62213134765625, + "logps/rejected": -517.4634399414062, + "loss": 0.5261, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.528563141822815, + "rewards/margins": 0.7139015793800354, + "rewards/rejected": -2.242464780807495, + "step": 4240 + }, + { + "epoch": 0.56, + "learning_rate": 2.4463192601231054e-06, + "logits/chosen": -1.7504394054412842, + "logits/rejected": -1.6634776592254639, + "logps/chosen": -490.6211853027344, + "logps/rejected": -508.7572326660156, + "loss": 0.5518, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.7602230310440063, + "rewards/margins": 0.8414942026138306, + "rewards/rejected": -2.601717233657837, + "step": 4250 + }, + { + "epoch": 0.56, + "learning_rate": 2.434900181775524e-06, + "logits/chosen": -1.7957899570465088, + "logits/rejected": -1.7329041957855225, + "logps/chosen": -449.1871032714844, + "logps/rejected": -517.341552734375, + "loss": 0.5279, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.644603967666626, + "rewards/margins": 0.8088521957397461, + "rewards/rejected": -2.453456163406372, + "step": 4260 + }, + { + "epoch": 0.56, + "learning_rate": 2.4234824623890578e-06, + "logits/chosen": -1.8632961511611938, + "logits/rejected": -1.733759880065918, + "logps/chosen": -439.6903381347656, + "logps/rejected": -491.3687438964844, + "loss": 0.5422, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.599808692932129, + "rewards/margins": 0.7229506969451904, + "rewards/rejected": -2.3227593898773193, + "step": 4270 + }, + { + "epoch": 0.56, + "learning_rate": 2.4120663403090193e-06, + "logits/chosen": -1.8758010864257812, + "logits/rejected": -1.781036138534546, + "logps/chosen": -454.6363220214844, + "logps/rejected": -528.6107177734375, + "loss": 0.618, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.645216941833496, + "rewards/margins": 0.6545249223709106, + "rewards/rejected": -2.299741744995117, + "step": 4280 + }, + { + "epoch": 0.56, + "learning_rate": 2.40065205384738e-06, + "logits/chosen": -1.780618667602539, + "logits/rejected": -1.578452467918396, + "logps/chosen": -435.1331481933594, + "logps/rejected": -444.5065002441406, + "loss": 0.641, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7645103931427002, + "rewards/margins": 0.4341967701911926, + "rewards/rejected": -2.198707103729248, + "step": 4290 + }, + { + "epoch": 0.56, + "learning_rate": 2.389239841277793e-06, + "logits/chosen": -1.6420913934707642, + "logits/rejected": -1.584995985031128, + "logps/chosen": -413.6902770996094, + "logps/rejected": -455.03314208984375, + "loss": 0.5237, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5267785787582397, + "rewards/margins": 0.7052222490310669, + "rewards/rejected": -2.2320008277893066, + "step": 4300 + }, + { + "epoch": 0.56, + "eval_logits/chosen": 0.828182578086853, + "eval_logits/rejected": 0.8568943738937378, + "eval_logps/chosen": -435.56292724609375, + "eval_logps/rejected": -484.738525390625, + "eval_loss": 0.5515031814575195, + "eval_rewards/accuracies": 0.703000009059906, + "eval_rewards/chosen": -1.502702236175537, + "eval_rewards/margins": 0.7059863805770874, + "eval_rewards/rejected": -2.208688735961914, + "eval_runtime": 1172.7955, + "eval_samples_per_second": 1.705, + "eval_steps_per_second": 0.853, + "step": 4300 + }, + { + "epoch": 0.56, + "learning_rate": 2.3778299408306167e-06, + "logits/chosen": -1.7654708623886108, + "logits/rejected": -1.5771191120147705, + "logps/chosen": -421.884033203125, + "logps/rejected": -471.3894958496094, + "loss": 0.5476, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5702762603759766, + "rewards/margins": 0.6879106163978577, + "rewards/rejected": -2.2581868171691895, + "step": 4310 + }, + { + "epoch": 0.57, + "learning_rate": 2.3664225906879452e-06, + "logits/chosen": -1.813707709312439, + "logits/rejected": -1.6927099227905273, + "logps/chosen": -418.1673889160156, + "logps/rejected": -447.35015869140625, + "loss": 0.6011, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.5919063091278076, + "rewards/margins": 0.5547482967376709, + "rewards/rejected": -2.1466546058654785, + "step": 4320 + }, + { + "epoch": 0.57, + "learning_rate": 2.3550180289786357e-06, + "logits/chosen": -1.7970657348632812, + "logits/rejected": -1.6429212093353271, + "logps/chosen": -430.11041259765625, + "logps/rejected": -457.9832458496094, + "loss": 0.5378, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.5009050369262695, + "rewards/margins": 0.713435173034668, + "rewards/rejected": -2.2143399715423584, + "step": 4330 + }, + { + "epoch": 0.57, + "learning_rate": 2.343616493773335e-06, + "logits/chosen": -1.906446099281311, + "logits/rejected": -1.7467994689941406, + "logps/chosen": -435.53692626953125, + "logps/rejected": -507.5245056152344, + "loss": 0.5075, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.5532820224761963, + "rewards/margins": 0.7375933527946472, + "rewards/rejected": -2.2908754348754883, + "step": 4340 + }, + { + "epoch": 0.57, + "learning_rate": 2.3322182230795127e-06, + "logits/chosen": -1.7325433492660522, + "logits/rejected": -1.7731993198394775, + "logps/chosen": -391.47540283203125, + "logps/rejected": -505.1651916503906, + "loss": 0.5042, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.4061033725738525, + "rewards/margins": 0.892315685749054, + "rewards/rejected": -2.29841947555542, + "step": 4350 + }, + { + "epoch": 0.57, + "learning_rate": 2.320823454836491e-06, + "logits/chosen": -2.009273052215576, + "logits/rejected": -1.7471644878387451, + "logps/chosen": -407.6497497558594, + "logps/rejected": -466.71051025390625, + "loss": 0.459, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.3678957223892212, + "rewards/margins": 0.7732141613960266, + "rewards/rejected": -2.1411099433898926, + "step": 4360 + }, + { + "epoch": 0.57, + "learning_rate": 2.309432426910478e-06, + "logits/chosen": -1.7197542190551758, + "logits/rejected": -1.5388801097869873, + "logps/chosen": -463.50555419921875, + "logps/rejected": -478.2345275878906, + "loss": 0.5387, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.5038155317306519, + "rewards/margins": 0.7517666220664978, + "rewards/rejected": -2.255582094192505, + "step": 4370 + }, + { + "epoch": 0.57, + "learning_rate": 2.298045377089604e-06, + "logits/chosen": -1.7552915811538696, + "logits/rejected": -1.6244428157806396, + "logps/chosen": -422.2392578125, + "logps/rejected": -498.9564514160156, + "loss": 0.4528, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.5110352039337158, + "rewards/margins": 0.991446852684021, + "rewards/rejected": -2.5024819374084473, + "step": 4380 + }, + { + "epoch": 0.57, + "learning_rate": 2.286662543078955e-06, + "logits/chosen": -1.5256160497665405, + "logits/rejected": -1.3968003988265991, + "logps/chosen": -461.56817626953125, + "logps/rejected": -494.91461181640625, + "loss": 0.5027, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.660752534866333, + "rewards/margins": 0.7574098110198975, + "rewards/rejected": -2.4181621074676514, + "step": 4390 + }, + { + "epoch": 0.58, + "learning_rate": 2.2752841624956125e-06, + "logits/chosen": -1.7991011142730713, + "logits/rejected": -1.7218538522720337, + "logps/chosen": -502.18878173828125, + "logps/rejected": -554.5040283203125, + "loss": 0.5979, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.9555965662002563, + "rewards/margins": 0.7310833930969238, + "rewards/rejected": -2.6866796016693115, + "step": 4400 + }, + { + "epoch": 0.58, + "eval_logits/chosen": 0.9060326218605042, + "eval_logits/rejected": 0.9415406584739685, + "eval_logps/chosen": -455.10614013671875, + "eval_logps/rejected": -511.879638671875, + "eval_loss": 0.5594107508659363, + "eval_rewards/accuracies": 0.7039999961853027, + "eval_rewards/chosen": -1.698134183883667, + "eval_rewards/margins": 0.7819651365280151, + "eval_rewards/rejected": -2.4800994396209717, + "eval_runtime": 1172.9465, + "eval_samples_per_second": 1.705, + "eval_steps_per_second": 0.853, + "step": 4400 + }, + { + "epoch": 0.58, + "learning_rate": 2.2639104728636915e-06, + "logits/chosen": -1.6456069946289062, + "logits/rejected": -1.6536614894866943, + "logps/chosen": -434.7286682128906, + "logps/rejected": -494.59918212890625, + "loss": 0.6201, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.5495412349700928, + "rewards/margins": 0.6422635912895203, + "rewards/rejected": -2.191804885864258, + "step": 4410 + }, + { + "epoch": 0.58, + "learning_rate": 2.252541711609384e-06, + "logits/chosen": -1.7286525964736938, + "logits/rejected": -1.5054218769073486, + "logps/chosen": -430.8987731933594, + "logps/rejected": -480.05218505859375, + "loss": 0.5374, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5618770122528076, + "rewards/margins": 0.811420738697052, + "rewards/rejected": -2.373297929763794, + "step": 4420 + }, + { + "epoch": 0.58, + "learning_rate": 2.241178116056002e-06, + "logits/chosen": -1.7933905124664307, + "logits/rejected": -1.6752099990844727, + "logps/chosen": -420.69537353515625, + "logps/rejected": -474.57073974609375, + "loss": 0.5055, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.4889934062957764, + "rewards/margins": 0.815077006816864, + "rewards/rejected": -2.304070234298706, + "step": 4430 + }, + { + "epoch": 0.58, + "learning_rate": 2.2298199234190236e-06, + "logits/chosen": -1.6978040933609009, + "logits/rejected": -1.7049938440322876, + "logps/chosen": -447.2154235839844, + "logps/rejected": -499.2445373535156, + "loss": 0.4904, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4197678565979004, + "rewards/margins": 0.8962188959121704, + "rewards/rejected": -2.3159868717193604, + "step": 4440 + }, + { + "epoch": 0.58, + "learning_rate": 2.218467370801138e-06, + "logits/chosen": -1.8251584768295288, + "logits/rejected": -1.7593870162963867, + "logps/chosen": -444.00738525390625, + "logps/rejected": -476.70355224609375, + "loss": 0.6049, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.6047332286834717, + "rewards/margins": 0.632063090801239, + "rewards/rejected": -2.2367963790893555, + "step": 4450 + }, + { + "epoch": 0.58, + "learning_rate": 2.207120695187304e-06, + "logits/chosen": -1.642364501953125, + "logits/rejected": -1.4046622514724731, + "logps/chosen": -451.1121520996094, + "logps/rejected": -507.24920654296875, + "loss": 0.4812, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.538770079612732, + "rewards/margins": 0.9383605122566223, + "rewards/rejected": -2.47713041305542, + "step": 4460 + }, + { + "epoch": 0.58, + "learning_rate": 2.195780133439794e-06, + "logits/chosen": -1.7654697895050049, + "logits/rejected": -1.7827869653701782, + "logps/chosen": -442.58209228515625, + "logps/rejected": -508.88427734375, + "loss": 0.6248, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.4976444244384766, + "rewards/margins": 0.6089947819709778, + "rewards/rejected": -2.1066391468048096, + "step": 4470 + }, + { + "epoch": 0.59, + "learning_rate": 2.1844459222932535e-06, + "logits/chosen": -1.780927300453186, + "logits/rejected": -1.665780782699585, + "logps/chosen": -436.9989318847656, + "logps/rejected": -478.2315979003906, + "loss": 0.5156, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4333505630493164, + "rewards/margins": 0.753585934638977, + "rewards/rejected": -2.186936855316162, + "step": 4480 + }, + { + "epoch": 0.59, + "learning_rate": 2.17311829834976e-06, + "logits/chosen": -1.9392468929290771, + "logits/rejected": -1.8275117874145508, + "logps/chosen": -409.9462890625, + "logps/rejected": -481.535888671875, + "loss": 0.5013, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.3325854539871216, + "rewards/margins": 0.7732506394386292, + "rewards/rejected": -2.1058361530303955, + "step": 4490 + }, + { + "epoch": 0.59, + "learning_rate": 2.1617974980738814e-06, + "logits/chosen": -1.8007287979125977, + "logits/rejected": -1.6861215829849243, + "logps/chosen": -410.5309143066406, + "logps/rejected": -460.1180114746094, + "loss": 0.4859, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.4100977182388306, + "rewards/margins": 0.8361636400222778, + "rewards/rejected": -2.2462613582611084, + "step": 4500 + }, + { + "epoch": 0.59, + "eval_logits/chosen": 0.9057154655456543, + "eval_logits/rejected": 0.9398696422576904, + "eval_logps/chosen": -444.394775390625, + "eval_logps/rejected": -499.04150390625, + "eval_loss": 0.5529686808586121, + "eval_rewards/accuracies": 0.7080000042915344, + "eval_rewards/chosen": -1.5910205841064453, + "eval_rewards/margins": 0.7606974840164185, + "eval_rewards/rejected": -2.3517181873321533, + "eval_runtime": 1173.0292, + "eval_samples_per_second": 1.705, + "eval_steps_per_second": 0.852, + "step": 4500 + }, + { + "epoch": 0.59, + "learning_rate": 2.150483757787744e-06, + "logits/chosen": -1.8965717554092407, + "logits/rejected": -1.667515754699707, + "logps/chosen": -432.5536193847656, + "logps/rejected": -446.9703063964844, + "loss": 0.5839, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.6405025720596313, + "rewards/margins": 0.6483727693557739, + "rewards/rejected": -2.2888753414154053, + "step": 4510 + }, + { + "epoch": 0.59, + "learning_rate": 2.139177313666093e-06, + "logits/chosen": -1.6948086023330688, + "logits/rejected": -1.6933705806732178, + "logps/chosen": -480.1251525878906, + "logps/rejected": -491.8417053222656, + "loss": 0.599, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.5966379642486572, + "rewards/margins": 0.6552848815917969, + "rewards/rejected": -2.251922845840454, + "step": 4520 + }, + { + "epoch": 0.59, + "learning_rate": 2.1278784017313688e-06, + "logits/chosen": -1.7972462177276611, + "logits/rejected": -1.895336389541626, + "logps/chosen": -454.59442138671875, + "logps/rejected": -512.6532592773438, + "loss": 0.5886, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.5067318677902222, + "rewards/margins": 0.5520287752151489, + "rewards/rejected": -2.05876088142395, + "step": 4530 + }, + { + "epoch": 0.59, + "learning_rate": 2.116587257848776e-06, + "logits/chosen": -1.7740548849105835, + "logits/rejected": -1.7964674234390259, + "logps/chosen": -412.32763671875, + "logps/rejected": -487.65216064453125, + "loss": 0.6279, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.5109002590179443, + "rewards/margins": 0.5173921585083008, + "rewards/rejected": -2.028292179107666, + "step": 4540 + }, + { + "epoch": 0.6, + "learning_rate": 2.105304117721361e-06, + "logits/chosen": -1.6072734594345093, + "logits/rejected": -1.5378026962280273, + "logps/chosen": -397.68829345703125, + "logps/rejected": -420.8843688964844, + "loss": 0.6461, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.6038001775741577, + "rewards/margins": 0.5361725091934204, + "rewards/rejected": -2.1399729251861572, + "step": 4550 + }, + { + "epoch": 0.6, + "learning_rate": 2.0940292168850913e-06, + "logits/chosen": -1.6167068481445312, + "logits/rejected": -1.6595230102539062, + "logps/chosen": -426.5391540527344, + "logps/rejected": -442.6863708496094, + "loss": 0.6535, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.5523406267166138, + "rewards/margins": 0.45763474702835083, + "rewards/rejected": -2.0099751949310303, + "step": 4560 + }, + { + "epoch": 0.6, + "learning_rate": 2.082762790703939e-06, + "logits/chosen": -1.6738531589508057, + "logits/rejected": -1.5881974697113037, + "logps/chosen": -430.85076904296875, + "logps/rejected": -484.5582580566406, + "loss": 0.5929, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4763389825820923, + "rewards/margins": 0.6432314515113831, + "rewards/rejected": -2.11957049369812, + "step": 4570 + }, + { + "epoch": 0.6, + "learning_rate": 2.0715050743649674e-06, + "logits/chosen": -1.8061870336532593, + "logits/rejected": -1.7034177780151367, + "logps/chosen": -397.50482177734375, + "logps/rejected": -524.7658081054688, + "loss": 0.4877, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4619042873382568, + "rewards/margins": 0.9087886810302734, + "rewards/rejected": -2.370692729949951, + "step": 4580 + }, + { + "epoch": 0.6, + "learning_rate": 2.060256302873421e-06, + "logits/chosen": -1.80402410030365, + "logits/rejected": -1.782848596572876, + "logps/chosen": -414.2543029785156, + "logps/rejected": -496.57159423828125, + "loss": 0.5442, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.4690145254135132, + "rewards/margins": 0.7510448694229126, + "rewards/rejected": -2.220059394836426, + "step": 4590 + }, + { + "epoch": 0.6, + "learning_rate": 2.049016711047822e-06, + "logits/chosen": -1.8822914361953735, + "logits/rejected": -1.7078053951263428, + "logps/chosen": -440.61187744140625, + "logps/rejected": -489.31005859375, + "loss": 0.5484, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.6159025430679321, + "rewards/margins": 0.7375297546386719, + "rewards/rejected": -2.3534321784973145, + "step": 4600 + }, + { + "epoch": 0.6, + "eval_logits/chosen": 0.8267521858215332, + "eval_logits/rejected": 0.8710527420043945, + "eval_logps/chosen": -436.8822326660156, + "eval_logps/rejected": -488.2594909667969, + "eval_loss": 0.5524637699127197, + "eval_rewards/accuracies": 0.7055000066757202, + "eval_rewards/chosen": -1.515894889831543, + "eval_rewards/margins": 0.7280031442642212, + "eval_rewards/rejected": -2.2438981533050537, + "eval_runtime": 1173.1276, + "eval_samples_per_second": 1.705, + "eval_steps_per_second": 0.852, + "step": 4600 + }, + { + "epoch": 0.6, + "learning_rate": 2.037786533515064e-06, + "logits/chosen": -1.8766225576400757, + "logits/rejected": -1.850306749343872, + "logps/chosen": -479.96905517578125, + "logps/rejected": -508.6878967285156, + "loss": 0.5693, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.6044374704360962, + "rewards/margins": 0.5724669694900513, + "rewards/rejected": -2.1769044399261475, + "step": 4610 + }, + { + "epoch": 0.6, + "learning_rate": 2.02656600470552e-06, + "logits/chosen": -1.845931053161621, + "logits/rejected": -1.7757551670074463, + "logps/chosen": -415.3828125, + "logps/rejected": -472.7496643066406, + "loss": 0.5342, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.394207239151001, + "rewards/margins": 0.7796455025672913, + "rewards/rejected": -2.1738526821136475, + "step": 4620 + }, + { + "epoch": 0.61, + "learning_rate": 2.015355358848144e-06, + "logits/chosen": -1.6293227672576904, + "logits/rejected": -1.7113008499145508, + "logps/chosen": -379.7857971191406, + "logps/rejected": -464.6165466308594, + "loss": 0.5687, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4572124481201172, + "rewards/margins": 0.6002478003501892, + "rewards/rejected": -2.057460308074951, + "step": 4630 + }, + { + "epoch": 0.61, + "learning_rate": 2.004154829965582e-06, + "logits/chosen": -1.8674468994140625, + "logits/rejected": -1.8090667724609375, + "logps/chosen": -425.97772216796875, + "logps/rejected": -482.00677490234375, + "loss": 0.5024, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.3103511333465576, + "rewards/margins": 0.7306745648384094, + "rewards/rejected": -2.0410256385803223, + "step": 4640 + }, + { + "epoch": 0.61, + "learning_rate": 1.99296465186929e-06, + "logits/chosen": -1.892324686050415, + "logits/rejected": -1.7118568420410156, + "logps/chosen": -428.433837890625, + "logps/rejected": -429.88726806640625, + "loss": 0.5307, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.3096859455108643, + "rewards/margins": 0.6370649337768555, + "rewards/rejected": -1.9467506408691406, + "step": 4650 + }, + { + "epoch": 0.61, + "learning_rate": 1.9817850581546488e-06, + "logits/chosen": -1.7857517004013062, + "logits/rejected": -1.7456003427505493, + "logps/chosen": -450.404296875, + "logps/rejected": -521.0674438476562, + "loss": 0.612, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.5571680068969727, + "rewards/margins": 0.6643776893615723, + "rewards/rejected": -2.221545696258545, + "step": 4660 + }, + { + "epoch": 0.61, + "learning_rate": 1.970616282196091e-06, + "logits/chosen": -1.8562015295028687, + "logits/rejected": -1.7407516241073608, + "logps/chosen": -412.98291015625, + "logps/rejected": -477.928466796875, + "loss": 0.5602, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4430997371673584, + "rewards/margins": 0.7090359926223755, + "rewards/rejected": -2.1521358489990234, + "step": 4670 + }, + { + "epoch": 0.61, + "learning_rate": 1.959458557142228e-06, + "logits/chosen": -1.817831039428711, + "logits/rejected": -1.7628173828125, + "logps/chosen": -423.5113220214844, + "logps/rejected": -473.5160217285156, + "loss": 0.6716, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.5431184768676758, + "rewards/margins": 0.3869319558143616, + "rewards/rejected": -1.9300504922866821, + "step": 4680 + }, + { + "epoch": 0.61, + "learning_rate": 1.948312115910982e-06, + "logits/chosen": -1.7673568725585938, + "logits/rejected": -1.725992202758789, + "logps/chosen": -449.0369567871094, + "logps/rejected": -479.7022399902344, + "loss": 0.5836, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.4680416584014893, + "rewards/margins": 0.7295928001403809, + "rewards/rejected": -2.197634696960449, + "step": 4690 + }, + { + "epoch": 0.62, + "learning_rate": 1.937177191184729e-06, + "logits/chosen": -1.7792892456054688, + "logits/rejected": -1.7978578805923462, + "logps/chosen": -383.3244934082031, + "logps/rejected": -434.3316345214844, + "loss": 0.6135, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.2648676633834839, + "rewards/margins": 0.47942668199539185, + "rewards/rejected": -1.74429452419281, + "step": 4700 + }, + { + "epoch": 0.62, + "eval_logits/chosen": 0.722199022769928, + "eval_logits/rejected": 0.773621141910553, + "eval_logps/chosen": -417.8462219238281, + "eval_logps/rejected": -466.32476806640625, + "eval_loss": 0.5504409670829773, + "eval_rewards/accuracies": 0.7064999938011169, + "eval_rewards/chosen": -1.3255350589752197, + "eval_rewards/margins": 0.6990163922309875, + "eval_rewards/rejected": -2.0245513916015625, + "eval_runtime": 1173.0581, + "eval_samples_per_second": 1.705, + "eval_steps_per_second": 0.852, + "step": 4700 + }, + { + "epoch": 0.62, + "learning_rate": 1.9260540154054317e-06, + "logits/chosen": -1.8403739929199219, + "logits/rejected": -1.6461031436920166, + "logps/chosen": -374.638427734375, + "logps/rejected": -467.19354248046875, + "loss": 0.4626, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.187178611755371, + "rewards/margins": 0.9907558560371399, + "rewards/rejected": -2.177934408187866, + "step": 4710 + }, + { + "epoch": 0.62, + "learning_rate": 1.9149428207697983e-06, + "logits/chosen": -1.8509531021118164, + "logits/rejected": -1.8378045558929443, + "logps/chosen": -416.5984802246094, + "logps/rejected": -460.538818359375, + "loss": 0.6526, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.379197597503662, + "rewards/margins": 0.4792349338531494, + "rewards/rejected": -1.858432412147522, + "step": 4720 + }, + { + "epoch": 0.62, + "learning_rate": 1.9038438392244262e-06, + "logits/chosen": -1.901710867881775, + "logits/rejected": -1.9277139902114868, + "logps/chosen": -416.91595458984375, + "logps/rejected": -461.445556640625, + "loss": 0.4871, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.1508758068084717, + "rewards/margins": 0.7535255551338196, + "rewards/rejected": -1.904401183128357, + "step": 4730 + }, + { + "epoch": 0.62, + "learning_rate": 1.8927573024609666e-06, + "logits/chosen": -1.69242262840271, + "logits/rejected": -1.5772535800933838, + "logps/chosen": -376.327392578125, + "logps/rejected": -435.93878173828125, + "loss": 0.5541, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.3499438762664795, + "rewards/margins": 0.7300174832344055, + "rewards/rejected": -2.0799612998962402, + "step": 4740 + }, + { + "epoch": 0.62, + "learning_rate": 1.8816834419112845e-06, + "logits/chosen": -1.782232642173767, + "logits/rejected": -1.657865285873413, + "logps/chosen": -399.690673828125, + "logps/rejected": -431.1106872558594, + "loss": 0.5689, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.357898473739624, + "rewards/margins": 0.7007596492767334, + "rewards/rejected": -2.0586581230163574, + "step": 4750 + }, + { + "epoch": 0.62, + "learning_rate": 1.8706224887426283e-06, + "logits/chosen": -1.742174744606018, + "logits/rejected": -1.7735731601715088, + "logps/chosen": -439.009765625, + "logps/rejected": -501.45965576171875, + "loss": 0.5835, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.5864620208740234, + "rewards/margins": 0.5894501805305481, + "rewards/rejected": -2.175912380218506, + "step": 4760 + }, + { + "epoch": 0.62, + "learning_rate": 1.8595746738528045e-06, + "logits/chosen": -1.7558538913726807, + "logits/rejected": -1.8131908178329468, + "logps/chosen": -402.8466491699219, + "logps/rejected": -495.69940185546875, + "loss": 0.561, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.370308518409729, + "rewards/margins": 0.7301324605941772, + "rewards/rejected": -2.1004412174224854, + "step": 4770 + }, + { + "epoch": 0.63, + "learning_rate": 1.8485402278653584e-06, + "logits/chosen": -1.844264030456543, + "logits/rejected": -1.8011757135391235, + "logps/chosen": -424.69268798828125, + "logps/rejected": -461.2425231933594, + "loss": 0.5332, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.6498619318008423, + "rewards/margins": 0.6133615970611572, + "rewards/rejected": -2.263223648071289, + "step": 4780 + }, + { + "epoch": 0.63, + "learning_rate": 1.8375193811247577e-06, + "logits/chosen": -1.711334228515625, + "logits/rejected": -1.6143522262573242, + "logps/chosen": -424.0199279785156, + "logps/rejected": -475.4574279785156, + "loss": 0.5249, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.5820074081420898, + "rewards/margins": 0.7127015590667725, + "rewards/rejected": -2.2947089672088623, + "step": 4790 + }, + { + "epoch": 0.63, + "learning_rate": 1.826512363691586e-06, + "logits/chosen": -1.871834397315979, + "logits/rejected": -1.849853754043579, + "logps/chosen": -444.37725830078125, + "logps/rejected": -480.35107421875, + "loss": 0.5714, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.5137640237808228, + "rewards/margins": 0.6592321991920471, + "rewards/rejected": -2.1729962825775146, + "step": 4800 + }, + { + "epoch": 0.63, + "eval_logits/chosen": 0.8370016813278198, + "eval_logits/rejected": 0.8648673295974731, + "eval_logps/chosen": -432.65576171875, + "eval_logps/rejected": -480.5717468261719, + "eval_loss": 0.5500975251197815, + "eval_rewards/accuracies": 0.7070000171661377, + "eval_rewards/chosen": -1.4736299514770508, + "eval_rewards/margins": 0.6933912634849548, + "eval_rewards/rejected": -2.1670212745666504, + "eval_runtime": 1172.8752, + "eval_samples_per_second": 1.705, + "eval_steps_per_second": 0.853, + "step": 4800 + }, + { + "epoch": 0.63, + "learning_rate": 1.8155194053377391e-06, + "logits/chosen": -1.8831536769866943, + "logits/rejected": -1.7245756387710571, + "logps/chosen": -405.37158203125, + "logps/rejected": -458.77398681640625, + "loss": 0.5054, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.287423849105835, + "rewards/margins": 0.9036749005317688, + "rewards/rejected": -2.191098690032959, + "step": 4810 + }, + { + "epoch": 0.63, + "learning_rate": 1.80454073554163e-06, + "logits/chosen": -1.6334726810455322, + "logits/rejected": -1.5998659133911133, + "logps/chosen": -380.0008239746094, + "logps/rejected": -431.1194763183594, + "loss": 0.6071, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.3910070657730103, + "rewards/margins": 0.6737453937530518, + "rewards/rejected": -2.0647525787353516, + "step": 4820 + }, + { + "epoch": 0.63, + "learning_rate": 1.7935765834833966e-06, + "logits/chosen": -1.799440622329712, + "logits/rejected": -1.7077531814575195, + "logps/chosen": -425.4602966308594, + "logps/rejected": -520.7520141601562, + "loss": 0.4739, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4049465656280518, + "rewards/margins": 0.9730203747749329, + "rewards/rejected": -2.37796688079834, + "step": 4830 + }, + { + "epoch": 0.63, + "learning_rate": 1.7826271780401182e-06, + "logits/chosen": -1.6157394647598267, + "logits/rejected": -1.4483401775360107, + "logps/chosen": -411.940185546875, + "logps/rejected": -470.05889892578125, + "loss": 0.4797, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.516154170036316, + "rewards/margins": 0.7820614576339722, + "rewards/rejected": -2.298215866088867, + "step": 4840 + }, + { + "epoch": 0.63, + "learning_rate": 1.7716927477810389e-06, + "logits/chosen": -1.850337266921997, + "logits/rejected": -1.8355680704116821, + "logps/chosen": -418.2672424316406, + "logps/rejected": -512.0823974609375, + "loss": 0.5196, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.4566137790679932, + "rewards/margins": 0.8951760530471802, + "rewards/rejected": -2.351789712905884, + "step": 4850 + }, + { + "epoch": 0.64, + "learning_rate": 1.7607735209627953e-06, + "logits/chosen": -1.7742077112197876, + "logits/rejected": -1.6215053796768188, + "logps/chosen": -431.87786865234375, + "logps/rejected": -473.6273498535156, + "loss": 0.5181, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.518726110458374, + "rewards/margins": 0.7906683683395386, + "rewards/rejected": -2.309394359588623, + "step": 4860 + }, + { + "epoch": 0.64, + "learning_rate": 1.749869725524651e-06, + "logits/chosen": -1.8782122135162354, + "logits/rejected": -1.7312371730804443, + "logps/chosen": -437.98809814453125, + "logps/rejected": -499.5009765625, + "loss": 0.4952, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.4420686960220337, + "rewards/margins": 0.960234522819519, + "rewards/rejected": -2.4023032188415527, + "step": 4870 + }, + { + "epoch": 0.64, + "learning_rate": 1.7389815890837392e-06, + "logits/chosen": -1.685935378074646, + "logits/rejected": -1.7608509063720703, + "logps/chosen": -444.6941833496094, + "logps/rejected": -559.3511352539062, + "loss": 0.4362, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.518878698348999, + "rewards/margins": 0.9780572056770325, + "rewards/rejected": -2.496936082839966, + "step": 4880 + }, + { + "epoch": 0.64, + "learning_rate": 1.7281093389303105e-06, + "logits/chosen": -1.766892433166504, + "logits/rejected": -1.663165807723999, + "logps/chosen": -409.701416015625, + "logps/rejected": -469.8858337402344, + "loss": 0.5157, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.4582762718200684, + "rewards/margins": 0.8043732643127441, + "rewards/rejected": -2.2626495361328125, + "step": 4890 + }, + { + "epoch": 0.64, + "learning_rate": 1.7172532020229899e-06, + "logits/chosen": -1.9181607961654663, + "logits/rejected": -1.8041608333587646, + "logps/chosen": -453.35223388671875, + "logps/rejected": -496.159912109375, + "loss": 0.517, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.5475901365280151, + "rewards/margins": 0.7943762540817261, + "rewards/rejected": -2.341966390609741, + "step": 4900 + }, + { + "epoch": 0.64, + "eval_logits/chosen": 0.9524018168449402, + "eval_logits/rejected": 0.9735285639762878, + "eval_logps/chosen": -450.3797302246094, + "eval_logps/rejected": -504.55609130859375, + "eval_loss": 0.5530635714530945, + "eval_rewards/accuracies": 0.7089999914169312, + "eval_rewards/chosen": -1.6508703231811523, + "eval_rewards/margins": 0.7559937834739685, + "eval_rewards/rejected": -2.4068639278411865, + "eval_runtime": 1173.3372, + "eval_samples_per_second": 1.705, + "eval_steps_per_second": 0.852, + "step": 4900 + }, + { + "epoch": 0.64, + "learning_rate": 1.7064134049840359e-06, + "logits/chosen": -1.8048479557037354, + "logits/rejected": -1.780055046081543, + "logps/chosen": -418.9851989746094, + "logps/rejected": -504.3468322753906, + "loss": 0.4785, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.4821782112121582, + "rewards/margins": 0.8749674558639526, + "rewards/rejected": -2.3571457862854004, + "step": 4910 + }, + { + "epoch": 0.64, + "learning_rate": 1.6955901740946136e-06, + "logits/chosen": -1.8494422435760498, + "logits/rejected": -1.7496616840362549, + "logps/chosen": -496.0858459472656, + "logps/rejected": -550.9440307617188, + "loss": 0.6453, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.9027026891708374, + "rewards/margins": 0.5574442148208618, + "rewards/rejected": -2.460146903991699, + "step": 4920 + }, + { + "epoch": 0.65, + "learning_rate": 1.684783735290067e-06, + "logits/chosen": -1.8014285564422607, + "logits/rejected": -1.6779054403305054, + "logps/chosen": -424.20208740234375, + "logps/rejected": -518.1012573242188, + "loss": 0.4467, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.5048842430114746, + "rewards/margins": 1.0522735118865967, + "rewards/rejected": -2.5571579933166504, + "step": 4930 + }, + { + "epoch": 0.65, + "learning_rate": 1.6739943141552079e-06, + "logits/chosen": -1.820129632949829, + "logits/rejected": -1.690853476524353, + "logps/chosen": -472.244873046875, + "logps/rejected": -495.262451171875, + "loss": 0.5876, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.6202694177627563, + "rewards/margins": 0.7392504811286926, + "rewards/rejected": -2.3595199584960938, + "step": 4940 + }, + { + "epoch": 0.65, + "learning_rate": 1.663222135919601e-06, + "logits/chosen": -1.8577134609222412, + "logits/rejected": -1.719242811203003, + "logps/chosen": -462.2020568847656, + "logps/rejected": -504.34454345703125, + "loss": 0.54, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.4801579713821411, + "rewards/margins": 0.667270839214325, + "rewards/rejected": -2.1474289894104004, + "step": 4950 + }, + { + "epoch": 0.65, + "learning_rate": 1.652467425452865e-06, + "logits/chosen": -1.7774289846420288, + "logits/rejected": -1.7675281763076782, + "logps/chosen": -415.58038330078125, + "logps/rejected": -468.835205078125, + "loss": 0.5367, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4868974685668945, + "rewards/margins": 0.7201194167137146, + "rewards/rejected": -2.207017183303833, + "step": 4960 + }, + { + "epoch": 0.65, + "learning_rate": 1.6417304072599787e-06, + "logits/chosen": -1.9124637842178345, + "logits/rejected": -1.720577597618103, + "logps/chosen": -459.5377502441406, + "logps/rejected": -508.71832275390625, + "loss": 0.599, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.7493877410888672, + "rewards/margins": 0.5594874024391174, + "rewards/rejected": -2.30887508392334, + "step": 4970 + }, + { + "epoch": 0.65, + "learning_rate": 1.6310113054765947e-06, + "logits/chosen": -1.8772382736206055, + "logits/rejected": -1.7379204034805298, + "logps/chosen": -455.2511291503906, + "logps/rejected": -504.4996643066406, + "loss": 0.5382, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.625997543334961, + "rewards/margins": 0.8635755777359009, + "rewards/rejected": -2.4895730018615723, + "step": 4980 + }, + { + "epoch": 0.65, + "learning_rate": 1.6203103438643591e-06, + "logits/chosen": -1.847890853881836, + "logits/rejected": -1.774196982383728, + "logps/chosen": -434.84954833984375, + "logps/rejected": -511.7041015625, + "loss": 0.5613, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.6426740884780884, + "rewards/margins": 0.7089305520057678, + "rewards/rejected": -2.351604700088501, + "step": 4990 + }, + { + "epoch": 0.65, + "learning_rate": 1.6096277458062417e-06, + "logits/chosen": -1.681212067604065, + "logits/rejected": -1.6562614440917969, + "logps/chosen": -375.3092041015625, + "logps/rejected": -472.8038635253906, + "loss": 0.4862, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.5255978107452393, + "rewards/margins": 0.8586394190788269, + "rewards/rejected": -2.38423752784729, + "step": 5000 + }, + { + "epoch": 0.65, + "eval_logits/chosen": 0.8849155306816101, + "eval_logits/rejected": 0.9138307571411133, + "eval_logps/chosen": -439.3872985839844, + "eval_logps/rejected": -493.1929626464844, + "eval_loss": 0.5523954033851624, + "eval_rewards/accuracies": 0.7080000042915344, + "eval_rewards/chosen": -1.5409460067749023, + "eval_rewards/margins": 0.7522870302200317, + "eval_rewards/rejected": -2.2932329177856445, + "eval_runtime": 1173.3026, + "eval_samples_per_second": 1.705, + "eval_steps_per_second": 0.852, + "step": 5000 + }, + { + "epoch": 0.66, + "learning_rate": 1.5989637343018705e-06, + "logits/chosen": -1.7913602590560913, + "logits/rejected": -1.6334741115570068, + "logps/chosen": -427.86187744140625, + "logps/rejected": -514.5504150390625, + "loss": 0.4979, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.422251582145691, + "rewards/margins": 0.8504781723022461, + "rewards/rejected": -2.2727296352386475, + "step": 5010 + }, + { + "epoch": 0.66, + "learning_rate": 1.5883185319628824e-06, + "logits/chosen": -1.7208000421524048, + "logits/rejected": -1.4415217638015747, + "logps/chosen": -453.88446044921875, + "logps/rejected": -490.77569580078125, + "loss": 0.5046, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.5737110376358032, + "rewards/margins": 0.7625322341918945, + "rewards/rejected": -2.3362433910369873, + "step": 5020 + }, + { + "epoch": 0.66, + "learning_rate": 1.5776923610082695e-06, + "logits/chosen": -1.8103011846542358, + "logits/rejected": -1.690213918685913, + "logps/chosen": -441.17919921875, + "logps/rejected": -509.001708984375, + "loss": 0.5442, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6508638858795166, + "rewards/margins": 0.9134138822555542, + "rewards/rejected": -2.5642776489257812, + "step": 5030 + }, + { + "epoch": 0.66, + "learning_rate": 1.5670854432597433e-06, + "logits/chosen": -1.824211835861206, + "logits/rejected": -1.8008737564086914, + "logps/chosen": -478.08709716796875, + "logps/rejected": -482.66168212890625, + "loss": 0.5657, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.6046216487884521, + "rewards/margins": 0.6122376322746277, + "rewards/rejected": -2.2168593406677246, + "step": 5040 + }, + { + "epoch": 0.66, + "learning_rate": 1.556498000137104e-06, + "logits/chosen": -1.6245505809783936, + "logits/rejected": -1.5970425605773926, + "logps/chosen": -395.28411865234375, + "logps/rejected": -462.77069091796875, + "loss": 0.4882, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.4695818424224854, + "rewards/margins": 0.8584505915641785, + "rewards/rejected": -2.3280324935913086, + "step": 5050 + }, + { + "epoch": 0.66, + "learning_rate": 1.5459302526536188e-06, + "logits/chosen": -1.8356765508651733, + "logits/rejected": -1.7552534341812134, + "logps/chosen": -429.896728515625, + "logps/rejected": -458.87860107421875, + "loss": 0.6114, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.4339876174926758, + "rewards/margins": 0.5756856799125671, + "rewards/rejected": -2.0096733570098877, + "step": 5060 + }, + { + "epoch": 0.66, + "learning_rate": 1.5353824214114075e-06, + "logits/chosen": -1.9251823425292969, + "logits/rejected": -1.8641544580459595, + "logps/chosen": -418.57318115234375, + "logps/rejected": -481.7355041503906, + "loss": 0.5363, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.475541353225708, + "rewards/margins": 0.7100075483322144, + "rewards/rejected": -2.185549020767212, + "step": 5070 + }, + { + "epoch": 0.66, + "learning_rate": 1.5248547265968373e-06, + "logits/chosen": -1.9587377309799194, + "logits/rejected": -1.9029967784881592, + "logps/chosen": -388.7999572753906, + "logps/rejected": -448.08221435546875, + "loss": 0.5406, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.3019168376922607, + "rewards/margins": 0.6987492442131042, + "rewards/rejected": -2.0006661415100098, + "step": 5080 + }, + { + "epoch": 0.67, + "learning_rate": 1.5143473879759265e-06, + "logits/chosen": -1.9306983947753906, + "logits/rejected": -1.7760818004608154, + "logps/chosen": -393.8701171875, + "logps/rejected": -435.2232971191406, + "loss": 0.5177, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.366986632347107, + "rewards/margins": 0.8715737462043762, + "rewards/rejected": -2.238560199737549, + "step": 5090 + }, + { + "epoch": 0.67, + "learning_rate": 1.5038606248897586e-06, + "logits/chosen": -1.8374249935150146, + "logits/rejected": -1.8686059713363647, + "logps/chosen": -469.5269470214844, + "logps/rejected": -503.837890625, + "loss": 0.6176, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.7035410404205322, + "rewards/margins": 0.5446879863739014, + "rewards/rejected": -2.2482292652130127, + "step": 5100 + }, + { + "epoch": 0.67, + "eval_logits/chosen": 0.8442540764808655, + "eval_logits/rejected": 0.8784592747688293, + "eval_logps/chosen": -432.8858947753906, + "eval_logps/rejected": -486.6266174316406, + "eval_loss": 0.5519185662269592, + "eval_rewards/accuracies": 0.7020000219345093, + "eval_rewards/chosen": -1.4759317636489868, + "eval_rewards/margins": 0.7516381144523621, + "eval_rewards/rejected": -2.227569818496704, + "eval_runtime": 1172.979, + "eval_samples_per_second": 1.705, + "eval_steps_per_second": 0.853, + "step": 5100 + }, + { + "epoch": 0.67, + "learning_rate": 1.4933946562499008e-06, + "logits/chosen": -1.7316030263900757, + "logits/rejected": -1.6031084060668945, + "logps/chosen": -421.27056884765625, + "logps/rejected": -470.91259765625, + "loss": 0.5496, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.4366884231567383, + "rewards/margins": 0.8011615872383118, + "rewards/rejected": -2.2378499507904053, + "step": 5110 + }, + { + "epoch": 0.67, + "learning_rate": 1.482949700533835e-06, + "logits/chosen": -1.6878440380096436, + "logits/rejected": -1.6608251333236694, + "logps/chosen": -375.3850402832031, + "logps/rejected": -426.45477294921875, + "loss": 0.5485, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4140172004699707, + "rewards/margins": 0.6440494656562805, + "rewards/rejected": -2.0580666065216064, + "step": 5120 + }, + { + "epoch": 0.67, + "learning_rate": 1.4725259757803983e-06, + "logits/chosen": -1.8848049640655518, + "logits/rejected": -1.8635708093643188, + "logps/chosen": -474.72637939453125, + "logps/rejected": -510.2239685058594, + "loss": 0.567, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.468737006187439, + "rewards/margins": 0.7705806493759155, + "rewards/rejected": -2.2393178939819336, + "step": 5130 + }, + { + "epoch": 0.67, + "learning_rate": 1.4621236995852314e-06, + "logits/chosen": -2.061633586883545, + "logits/rejected": -1.883298635482788, + "logps/chosen": -415.67401123046875, + "logps/rejected": -478.1891174316406, + "loss": 0.5078, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.371880054473877, + "rewards/margins": 0.8724004030227661, + "rewards/rejected": -2.2442805767059326, + "step": 5140 + }, + { + "epoch": 0.67, + "learning_rate": 1.4517430890962337e-06, + "logits/chosen": -1.9740339517593384, + "logits/rejected": -1.664778709411621, + "logps/chosen": -439.51983642578125, + "logps/rejected": -422.0973205566406, + "loss": 0.4926, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.3358771800994873, + "rewards/margins": 0.821506679058075, + "rewards/rejected": -2.157384157180786, + "step": 5150 + }, + { + "epoch": 0.68, + "learning_rate": 1.4413843610090342e-06, + "logits/chosen": -1.973301649093628, + "logits/rejected": -1.7762012481689453, + "logps/chosen": -462.387939453125, + "logps/rejected": -499.8759765625, + "loss": 0.5439, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.535919427871704, + "rewards/margins": 0.7471837997436523, + "rewards/rejected": -2.2831029891967773, + "step": 5160 + }, + { + "epoch": 0.68, + "learning_rate": 1.4310477315624637e-06, + "logits/chosen": -1.90434992313385, + "logits/rejected": -1.79486083984375, + "logps/chosen": -407.06341552734375, + "logps/rejected": -462.5298767089844, + "loss": 0.659, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5114107131958008, + "rewards/margins": 0.5543745756149292, + "rewards/rejected": -2.0657854080200195, + "step": 5170 + }, + { + "epoch": 0.68, + "learning_rate": 1.420733416534045e-06, + "logits/chosen": -1.6625850200653076, + "logits/rejected": -1.5175690650939941, + "logps/chosen": -415.73321533203125, + "logps/rejected": -489.0868225097656, + "loss": 0.6199, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.5352885723114014, + "rewards/margins": 0.6777051687240601, + "rewards/rejected": -2.212993860244751, + "step": 5180 + }, + { + "epoch": 0.68, + "learning_rate": 1.410441631235487e-06, + "logits/chosen": -1.9261877536773682, + "logits/rejected": -1.7891525030136108, + "logps/chosen": -431.77911376953125, + "logps/rejected": -491.50311279296875, + "loss": 0.5214, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.3564847707748413, + "rewards/margins": 0.7622567415237427, + "rewards/rejected": -2.118741512298584, + "step": 5190 + }, + { + "epoch": 0.68, + "learning_rate": 1.4001725905081868e-06, + "logits/chosen": -1.853712797164917, + "logits/rejected": -1.645713210105896, + "logps/chosen": -394.0192565917969, + "logps/rejected": -418.23992919921875, + "loss": 0.5514, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.45644211769104, + "rewards/margins": 0.6980880498886108, + "rewards/rejected": -2.1545300483703613, + "step": 5200 + }, + { + "epoch": 0.68, + "eval_logits/chosen": 0.7893780469894409, + "eval_logits/rejected": 0.8299470543861389, + "eval_logps/chosen": -426.1199645996094, + "eval_logps/rejected": -477.4418029785156, + "eval_loss": 0.5500012040138245, + "eval_rewards/accuracies": 0.7024999856948853, + "eval_rewards/chosen": -1.4082725048065186, + "eval_rewards/margins": 0.7274492979049683, + "eval_rewards/rejected": -2.1357219219207764, + "eval_runtime": 1172.9082, + "eval_samples_per_second": 1.705, + "eval_steps_per_second": 0.853, + "step": 5200 + }, + { + "epoch": 0.68, + "learning_rate": 1.3899265087187507e-06, + "logits/chosen": -1.8301036357879639, + "logits/rejected": -1.7930876016616821, + "logps/chosen": -383.07904052734375, + "logps/rejected": -428.0769958496094, + "loss": 0.5495, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.3355047702789307, + "rewards/margins": 0.6709381937980652, + "rewards/rejected": -2.0064430236816406, + "step": 5210 + }, + { + "epoch": 0.68, + "learning_rate": 1.3797035997545144e-06, + "logits/chosen": -1.9526773691177368, + "logits/rejected": -1.8379061222076416, + "logps/chosen": -439.8001403808594, + "logps/rejected": -469.07537841796875, + "loss": 0.5418, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.3511309623718262, + "rewards/margins": 0.6671268939971924, + "rewards/rejected": -2.0182576179504395, + "step": 5220 + }, + { + "epoch": 0.68, + "learning_rate": 1.3695040770190816e-06, + "logits/chosen": -1.8893228769302368, + "logits/rejected": -1.8120670318603516, + "logps/chosen": -392.29559326171875, + "logps/rejected": -449.6136169433594, + "loss": 0.5796, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3194262981414795, + "rewards/margins": 0.6628329753875732, + "rewards/rejected": -1.9822590351104736, + "step": 5230 + }, + { + "epoch": 0.69, + "learning_rate": 1.3593281534278651e-06, + "logits/chosen": -1.8769222497940063, + "logits/rejected": -1.8663833141326904, + "logps/chosen": -374.20556640625, + "logps/rejected": -466.6195373535156, + "loss": 0.4697, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.2972444295883179, + "rewards/margins": 0.8416715860366821, + "rewards/rejected": -2.138916015625, + "step": 5240 + }, + { + "epoch": 0.69, + "learning_rate": 1.3491760414036478e-06, + "logits/chosen": -1.8334850072860718, + "logits/rejected": -1.6556533575057983, + "logps/chosen": -448.02227783203125, + "logps/rejected": -452.9075622558594, + "loss": 0.6063, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4142099618911743, + "rewards/margins": 0.57100909948349, + "rewards/rejected": -1.9852192401885986, + "step": 5250 + }, + { + "epoch": 0.69, + "learning_rate": 1.3390479528721444e-06, + "logits/chosen": -1.7845255136489868, + "logits/rejected": -1.7285311222076416, + "logps/chosen": -428.2275390625, + "logps/rejected": -496.64190673828125, + "loss": 0.5979, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.5454214811325073, + "rewards/margins": 0.6397279500961304, + "rewards/rejected": -2.1851494312286377, + "step": 5260 + }, + { + "epoch": 0.69, + "learning_rate": 1.3289440992575756e-06, + "logits/chosen": -1.957970380783081, + "logits/rejected": -1.8876816034317017, + "logps/chosen": -428.3063049316406, + "logps/rejected": -476.18719482421875, + "loss": 0.4958, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.2013753652572632, + "rewards/margins": 0.7330563068389893, + "rewards/rejected": -1.9344314336776733, + "step": 5270 + }, + { + "epoch": 0.69, + "learning_rate": 1.3188646914782616e-06, + "logits/chosen": -2.035910129547119, + "logits/rejected": -1.855177640914917, + "logps/chosen": -481.1543884277344, + "logps/rejected": -454.63922119140625, + "loss": 0.5401, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.420890212059021, + "rewards/margins": 0.6797800064086914, + "rewards/rejected": -2.100670337677002, + "step": 5280 + }, + { + "epoch": 0.69, + "learning_rate": 1.3088099399422109e-06, + "logits/chosen": -1.9418426752090454, + "logits/rejected": -1.8563458919525146, + "logps/chosen": -439.60595703125, + "logps/rejected": -491.9336853027344, + "loss": 0.5342, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.406085729598999, + "rewards/margins": 0.7221137285232544, + "rewards/rejected": -2.128199338912964, + "step": 5290 + }, + { + "epoch": 0.69, + "learning_rate": 1.2987800545427353e-06, + "logits/chosen": -1.963463544845581, + "logits/rejected": -1.832273244857788, + "logps/chosen": -422.38409423828125, + "logps/rejected": -463.7245178222656, + "loss": 0.5166, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.283937692642212, + "rewards/margins": 0.7820326089859009, + "rewards/rejected": -2.0659701824188232, + "step": 5300 + }, + { + "epoch": 0.69, + "eval_logits/chosen": 0.8065236806869507, + "eval_logits/rejected": 0.844149649143219, + "eval_logps/chosen": -426.8324279785156, + "eval_logps/rejected": -478.9722595214844, + "eval_loss": 0.5508423447608948, + "eval_rewards/accuracies": 0.7039999961853027, + "eval_rewards/chosen": -1.4153975248336792, + "eval_rewards/margins": 0.7356284260749817, + "eval_rewards/rejected": -2.1510257720947266, + "eval_runtime": 1173.231, + "eval_samples_per_second": 1.705, + "eval_steps_per_second": 0.852, + "step": 5300 + }, + { + "epoch": 0.69, + "learning_rate": 1.288775244654062e-06, + "logits/chosen": -1.929620385169983, + "logits/rejected": -1.8457822799682617, + "logps/chosen": -474.7225646972656, + "logps/rejected": -492.4024963378906, + "loss": 0.5901, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.4866751432418823, + "rewards/margins": 0.6385191679000854, + "rewards/rejected": -2.1251943111419678, + "step": 5310 + }, + { + "epoch": 0.7, + "learning_rate": 1.2787957191269696e-06, + "logits/chosen": -1.7664387226104736, + "logits/rejected": -1.6823327541351318, + "logps/chosen": -432.7451171875, + "logps/rejected": -496.489013671875, + "loss": 0.6383, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.5256898403167725, + "rewards/margins": 0.481570303440094, + "rewards/rejected": -2.0072600841522217, + "step": 5320 + }, + { + "epoch": 0.7, + "learning_rate": 1.2688416862844193e-06, + "logits/chosen": -1.6892430782318115, + "logits/rejected": -1.6865297555923462, + "logps/chosen": -378.4471130371094, + "logps/rejected": -495.246826171875, + "loss": 0.4466, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.293825387954712, + "rewards/margins": 1.037535309791565, + "rewards/rejected": -2.3313608169555664, + "step": 5330 + }, + { + "epoch": 0.7, + "learning_rate": 1.2589133539172193e-06, + "logits/chosen": -2.0279300212860107, + "logits/rejected": -1.9393310546875, + "logps/chosen": -454.46893310546875, + "logps/rejected": -485.9325256347656, + "loss": 0.4776, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.2705265283584595, + "rewards/margins": 0.8261939883232117, + "rewards/rejected": -2.0967202186584473, + "step": 5340 + }, + { + "epoch": 0.7, + "learning_rate": 1.249010929279672e-06, + "logits/chosen": -2.0560059547424316, + "logits/rejected": -1.9522721767425537, + "logps/chosen": -452.753662109375, + "logps/rejected": -501.004638671875, + "loss": 0.5789, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.441618800163269, + "rewards/margins": 0.6563819646835327, + "rewards/rejected": -2.098001003265381, + "step": 5350 + }, + { + "epoch": 0.7, + "learning_rate": 1.2391346190852603e-06, + "logits/chosen": -2.059138536453247, + "logits/rejected": -1.8753607273101807, + "logps/chosen": -434.37322998046875, + "logps/rejected": -465.62017822265625, + "loss": 0.6552, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.5591915845870972, + "rewards/margins": 0.5033558011054993, + "rewards/rejected": -2.062547206878662, + "step": 5360 + }, + { + "epoch": 0.7, + "learning_rate": 1.2292846295023222e-06, + "logits/chosen": -1.9328066110610962, + "logits/rejected": -1.88934326171875, + "logps/chosen": -456.8828125, + "logps/rejected": -491.4498596191406, + "loss": 0.6035, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.49178147315979, + "rewards/margins": 0.5151655077934265, + "rewards/rejected": -2.0069470405578613, + "step": 5370 + }, + { + "epoch": 0.7, + "learning_rate": 1.2194611661497576e-06, + "logits/chosen": -1.8160051107406616, + "logits/rejected": -1.7239576578140259, + "logps/chosen": -415.52423095703125, + "logps/rejected": -475.9527893066406, + "loss": 0.5137, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.3479664325714111, + "rewards/margins": 0.7646690011024475, + "rewards/rejected": -2.112635374069214, + "step": 5380 + }, + { + "epoch": 0.71, + "learning_rate": 1.2096644340927247e-06, + "logits/chosen": -1.9325023889541626, + "logits/rejected": -1.795840859413147, + "logps/chosen": -447.41448974609375, + "logps/rejected": -493.99786376953125, + "loss": 0.5656, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.4784553050994873, + "rewards/margins": 0.6328384280204773, + "rewards/rejected": -2.1112937927246094, + "step": 5390 + }, + { + "epoch": 0.71, + "learning_rate": 1.19989463783837e-06, + "logits/chosen": -2.0235092639923096, + "logits/rejected": -1.8829200267791748, + "logps/chosen": -442.604736328125, + "logps/rejected": -510.56304931640625, + "loss": 0.4918, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.288726568222046, + "rewards/margins": 0.842505931854248, + "rewards/rejected": -2.131232500076294, + "step": 5400 + }, + { + "epoch": 0.71, + "eval_logits/chosen": 0.7905195355415344, + "eval_logits/rejected": 0.8312855958938599, + "eval_logps/chosen": -426.2182922363281, + "eval_logps/rejected": -476.76666259765625, + "eval_loss": 0.54958176612854, + "eval_rewards/accuracies": 0.7089999914169312, + "eval_rewards/chosen": -1.4092553853988647, + "eval_rewards/margins": 0.719714343547821, + "eval_rewards/rejected": -2.128969669342041, + "eval_runtime": 1172.6248, + "eval_samples_per_second": 1.706, + "eval_steps_per_second": 0.853, + "step": 5400 + }, + { + "epoch": 0.71, + "learning_rate": 1.1901519813315495e-06, + "logits/chosen": -1.752189040184021, + "logits/rejected": -1.7029765844345093, + "logps/chosen": -406.2033996582031, + "logps/rejected": -462.9178161621094, + "loss": 0.493, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.382287859916687, + "rewards/margins": 0.7878392338752747, + "rewards/rejected": -2.1701271533966064, + "step": 5410 + }, + { + "epoch": 0.71, + "learning_rate": 1.1804366679505798e-06, + "logits/chosen": -1.8241612911224365, + "logits/rejected": -1.653684377670288, + "logps/chosen": -465.00390625, + "logps/rejected": -486.6300354003906, + "loss": 0.5537, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.5488064289093018, + "rewards/margins": 0.7547654509544373, + "rewards/rejected": -2.3035717010498047, + "step": 5420 + }, + { + "epoch": 0.71, + "learning_rate": 1.1707489005029877e-06, + "logits/chosen": -1.8216431140899658, + "logits/rejected": -1.764282464981079, + "logps/chosen": -425.51739501953125, + "logps/rejected": -489.0801696777344, + "loss": 0.5062, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.4804413318634033, + "rewards/margins": 0.888811469078064, + "rewards/rejected": -2.3692526817321777, + "step": 5430 + }, + { + "epoch": 0.71, + "learning_rate": 1.1610888812212749e-06, + "logits/chosen": -1.7654953002929688, + "logits/rejected": -1.655609130859375, + "logps/chosen": -429.49456787109375, + "logps/rejected": -489.21337890625, + "loss": 0.51, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4289910793304443, + "rewards/margins": 0.8090342283248901, + "rewards/rejected": -2.238025188446045, + "step": 5440 + }, + { + "epoch": 0.71, + "learning_rate": 1.1514568117587035e-06, + "logits/chosen": -1.829450011253357, + "logits/rejected": -1.9704999923706055, + "logps/chosen": -452.014892578125, + "logps/rejected": -474.6261291503906, + "loss": 0.6124, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.6344467401504517, + "rewards/margins": 0.4384728968143463, + "rewards/rejected": -2.0729196071624756, + "step": 5450 + }, + { + "epoch": 0.71, + "learning_rate": 1.1418528931850781e-06, + "logits/chosen": -1.9476792812347412, + "logits/rejected": -1.781757116317749, + "logps/chosen": -425.89154052734375, + "logps/rejected": -459.1473693847656, + "loss": 0.5068, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3089628219604492, + "rewards/margins": 0.851481556892395, + "rewards/rejected": -2.1604442596435547, + "step": 5460 + }, + { + "epoch": 0.72, + "learning_rate": 1.1322773259825563e-06, + "logits/chosen": -1.9182113409042358, + "logits/rejected": -1.6885206699371338, + "logps/chosen": -441.8060607910156, + "logps/rejected": -442.9778747558594, + "loss": 0.5254, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.4591407775878906, + "rewards/margins": 0.6979411840438843, + "rewards/rejected": -2.1570820808410645, + "step": 5470 + }, + { + "epoch": 0.72, + "learning_rate": 1.1227303100414552e-06, + "logits/chosen": -1.796644926071167, + "logits/rejected": -1.7756593227386475, + "logps/chosen": -386.8682556152344, + "logps/rejected": -492.8446350097656, + "loss": 0.4757, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.478819489479065, + "rewards/margins": 0.8811575174331665, + "rewards/rejected": -2.3599770069122314, + "step": 5480 + }, + { + "epoch": 0.72, + "learning_rate": 1.113212044656087e-06, + "logits/chosen": -1.7989683151245117, + "logits/rejected": -1.7970603704452515, + "logps/chosen": -397.41998291015625, + "logps/rejected": -476.31024169921875, + "loss": 0.5683, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4304078817367554, + "rewards/margins": 0.6791922450065613, + "rewards/rejected": -2.109600067138672, + "step": 5490 + }, + { + "epoch": 0.72, + "learning_rate": 1.1037227285205951e-06, + "logits/chosen": -1.678154706954956, + "logits/rejected": -1.7668046951293945, + "logps/chosen": -427.35101318359375, + "logps/rejected": -486.7571716308594, + "loss": 0.596, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.5133135318756104, + "rewards/margins": 0.5993794202804565, + "rewards/rejected": -2.1126928329467773, + "step": 5500 + }, + { + "epoch": 0.72, + "eval_logits/chosen": 0.8238762617111206, + "eval_logits/rejected": 0.8632076978683472, + "eval_logps/chosen": -434.1884765625, + "eval_logps/rejected": -486.0820617675781, + "eval_loss": 0.548935055732727, + "eval_rewards/accuracies": 0.7074999809265137, + "eval_rewards/chosen": -1.4889572858810425, + "eval_rewards/margins": 0.7331663370132446, + "eval_rewards/rejected": -2.222123622894287, + "eval_runtime": 1173.5009, + "eval_samples_per_second": 1.704, + "eval_steps_per_second": 0.852, + "step": 5500 + }, + { + "epoch": 0.72, + "learning_rate": 1.0942625597248028e-06, + "logits/chosen": -1.771903395652771, + "logits/rejected": -1.5377171039581299, + "logps/chosen": -414.85784912109375, + "logps/rejected": -461.81634521484375, + "loss": 0.4904, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.4489622116088867, + "rewards/margins": 0.8897550702095032, + "rewards/rejected": -2.338717222213745, + "step": 5510 + }, + { + "epoch": 0.72, + "learning_rate": 1.0848317357500854e-06, + "logits/chosen": -1.8069961071014404, + "logits/rejected": -1.700169324874878, + "logps/chosen": -461.0796813964844, + "logps/rejected": -446.8509826660156, + "loss": 0.5895, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.5638563632965088, + "rewards/margins": 0.5624241828918457, + "rewards/rejected": -2.1262803077697754, + "step": 5520 + }, + { + "epoch": 0.72, + "learning_rate": 1.0754304534652404e-06, + "logits/chosen": -1.7843172550201416, + "logits/rejected": -1.8855609893798828, + "logps/chosen": -430.64056396484375, + "logps/rejected": -515.0929565429688, + "loss": 0.5999, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5967886447906494, + "rewards/margins": 0.5139278173446655, + "rewards/rejected": -2.1107165813446045, + "step": 5530 + }, + { + "epoch": 0.72, + "learning_rate": 1.0660589091223854e-06, + "logits/chosen": -1.6621599197387695, + "logits/rejected": -1.7448629140853882, + "logps/chosen": -377.0594482421875, + "logps/rejected": -441.2286071777344, + "loss": 0.5291, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.4500750303268433, + "rewards/margins": 0.7363721132278442, + "rewards/rejected": -2.1864471435546875, + "step": 5540 + }, + { + "epoch": 0.73, + "learning_rate": 1.0567172983528534e-06, + "logits/chosen": -1.7251408100128174, + "logits/rejected": -1.5952707529067993, + "logps/chosen": -363.9090576171875, + "logps/rejected": -439.10638427734375, + "loss": 0.474, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.3747875690460205, + "rewards/margins": 0.7879632711410522, + "rewards/rejected": -2.162750720977783, + "step": 5550 + }, + { + "epoch": 0.73, + "learning_rate": 1.0474058161631168e-06, + "logits/chosen": -1.884746789932251, + "logits/rejected": -1.881100058555603, + "logps/chosen": -481.223876953125, + "logps/rejected": -511.8279724121094, + "loss": 0.6107, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.501129388809204, + "rewards/margins": 0.5625635981559753, + "rewards/rejected": -2.063692808151245, + "step": 5560 + }, + { + "epoch": 0.73, + "learning_rate": 1.0381246569307077e-06, + "logits/chosen": -1.9864051342010498, + "logits/rejected": -1.8679383993148804, + "logps/chosen": -462.429443359375, + "logps/rejected": -480.8426818847656, + "loss": 0.5692, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4590893983840942, + "rewards/margins": 0.5718271732330322, + "rewards/rejected": -2.030916690826416, + "step": 5570 + }, + { + "epoch": 0.73, + "learning_rate": 1.0288740144001722e-06, + "logits/chosen": -2.014369249343872, + "logits/rejected": -1.766344428062439, + "logps/chosen": -424.77685546875, + "logps/rejected": -442.02764892578125, + "loss": 0.5989, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.4355696439743042, + "rewards/margins": 0.6431950926780701, + "rewards/rejected": -2.0787649154663086, + "step": 5580 + }, + { + "epoch": 0.73, + "learning_rate": 1.0196540816790127e-06, + "logits/chosen": -1.748151183128357, + "logits/rejected": -1.6582529544830322, + "logps/chosen": -391.3197021484375, + "logps/rejected": -418.3585510253906, + "loss": 0.5349, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.3962318897247314, + "rewards/margins": 0.6935681700706482, + "rewards/rejected": -2.0897998809814453, + "step": 5590 + }, + { + "epoch": 0.73, + "learning_rate": 1.0104650512336679e-06, + "logits/chosen": -2.043593168258667, + "logits/rejected": -1.8819385766983032, + "logps/chosen": -424.79779052734375, + "logps/rejected": -437.5965881347656, + "loss": 0.6034, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.3628209829330444, + "rewards/margins": 0.634640634059906, + "rewards/rejected": -1.9974616765975952, + "step": 5600 + }, + { + "epoch": 0.73, + "eval_logits/chosen": 0.7560797333717346, + "eval_logits/rejected": 0.8040981292724609, + "eval_logps/chosen": -425.7730407714844, + "eval_logps/rejected": -477.2521667480469, + "eval_loss": 0.5488966703414917, + "eval_rewards/accuracies": 0.7064999938011169, + "eval_rewards/chosen": -1.4048031568527222, + "eval_rewards/margins": 0.7290219664573669, + "eval_rewards/rejected": -2.1338253021240234, + "eval_runtime": 1173.6172, + "eval_samples_per_second": 1.704, + "eval_steps_per_second": 0.852, + "step": 5600 + }, + { + "epoch": 0.73, + "learning_rate": 1.0013071148854861e-06, + "logits/chosen": -1.779545545578003, + "logits/rejected": -1.77289617061615, + "logps/chosen": -388.35260009765625, + "logps/rejected": -488.19500732421875, + "loss": 0.4682, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3467668294906616, + "rewards/margins": 0.9838630557060242, + "rewards/rejected": -2.330629825592041, + "step": 5610 + }, + { + "epoch": 0.74, + "learning_rate": 9.921804638067292e-07, + "logits/chosen": -1.8865807056427002, + "logits/rejected": -1.6680485010147095, + "logps/chosen": -441.1761169433594, + "logps/rejected": -476.1258850097656, + "loss": 0.5362, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.551322340965271, + "rewards/margins": 0.755243182182312, + "rewards/rejected": -2.306565761566162, + "step": 5620 + }, + { + "epoch": 0.74, + "learning_rate": 9.830852885165749e-07, + "logits/chosen": -1.747839331626892, + "logits/rejected": -1.9316389560699463, + "logps/chosen": -381.62451171875, + "logps/rejected": -481.34161376953125, + "loss": 0.5668, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4812041521072388, + "rewards/margins": 0.6032929420471191, + "rewards/rejected": -2.0844969749450684, + "step": 5630 + }, + { + "epoch": 0.74, + "learning_rate": 9.740217788771453e-07, + "logits/chosen": -1.8652489185333252, + "logits/rejected": -1.7647807598114014, + "logps/chosen": -425.3164978027344, + "logps/rejected": -458.489501953125, + "loss": 0.535, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.3008569478988647, + "rewards/margins": 0.7232750654220581, + "rewards/rejected": -2.024132013320923, + "step": 5640 + }, + { + "epoch": 0.74, + "learning_rate": 9.649901240895374e-07, + "logits/chosen": -1.6762669086456299, + "logits/rejected": -1.7178618907928467, + "logps/chosen": -401.3484802246094, + "logps/rejected": -476.75885009765625, + "loss": 0.5098, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.4109026193618774, + "rewards/margins": 0.8481072187423706, + "rewards/rejected": -2.259009599685669, + "step": 5650 + }, + { + "epoch": 0.74, + "learning_rate": 9.559905126898803e-07, + "logits/chosen": -1.8930237293243408, + "logits/rejected": -1.6374887228012085, + "logps/chosen": -427.38232421875, + "logps/rejected": -483.18170166015625, + "loss": 0.4217, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3818823099136353, + "rewards/margins": 0.9239101409912109, + "rewards/rejected": -2.3057923316955566, + "step": 5660 + }, + { + "epoch": 0.74, + "learning_rate": 9.470231325453958e-07, + "logits/chosen": -1.7371124029159546, + "logits/rejected": -1.5872479677200317, + "logps/chosen": -423.66546630859375, + "logps/rejected": -459.2461853027344, + "loss": 0.5601, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.4602539539337158, + "rewards/margins": 0.718927800655365, + "rewards/rejected": -2.1791815757751465, + "step": 5670 + }, + { + "epoch": 0.74, + "learning_rate": 9.380881708504741e-07, + "logits/chosen": -1.709235429763794, + "logits/rejected": -1.5888019800186157, + "logps/chosen": -376.9496154785156, + "logps/rejected": -422.35467529296875, + "loss": 0.531, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.370224952697754, + "rewards/margins": 0.7989784479141235, + "rewards/rejected": -2.169203281402588, + "step": 5680 + }, + { + "epoch": 0.74, + "learning_rate": 9.291858141227733e-07, + "logits/chosen": -1.8022798299789429, + "logits/rejected": -1.8127330541610718, + "logps/chosen": -404.7644958496094, + "logps/rejected": -518.0156860351562, + "loss": 0.4482, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.3747928142547607, + "rewards/margins": 0.9949323534965515, + "rewards/rejected": -2.369725227355957, + "step": 5690 + }, + { + "epoch": 0.75, + "learning_rate": 9.203162481993175e-07, + "logits/chosen": -1.9120906591415405, + "logits/rejected": -1.8126256465911865, + "logps/chosen": -458.3221130371094, + "logps/rejected": -549.8818969726562, + "loss": 0.4793, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.4057399034500122, + "rewards/margins": 1.0536389350891113, + "rewards/rejected": -2.459378719329834, + "step": 5700 + }, + { + "epoch": 0.75, + "eval_logits/chosen": 0.8544980883598328, + "eval_logits/rejected": 0.8918463587760925, + "eval_logps/chosen": -435.4676208496094, + "eval_logps/rejected": -489.2809143066406, + "eval_loss": 0.5494768023490906, + "eval_rewards/accuracies": 0.7080000042915344, + "eval_rewards/chosen": -1.5017492771148682, + "eval_rewards/margins": 0.7523629069328308, + "eval_rewards/rejected": -2.2541120052337646, + "eval_runtime": 1173.0208, + "eval_samples_per_second": 1.705, + "eval_steps_per_second": 0.852, + "step": 5700 + }, + { + "epoch": 0.75, + "learning_rate": 9.114796582326255e-07, + "logits/chosen": -2.0369105339050293, + "logits/rejected": -1.7664175033569336, + "logps/chosen": -435.7451171875, + "logps/rejected": -467.1946716308594, + "loss": 0.5717, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.6470935344696045, + "rewards/margins": 0.6342312693595886, + "rewards/rejected": -2.281324863433838, + "step": 5710 + }, + { + "epoch": 0.75, + "learning_rate": 9.026762286868373e-07, + "logits/chosen": -1.9730746746063232, + "logits/rejected": -1.9558706283569336, + "logps/chosen": -419.619384765625, + "logps/rejected": -524.6027221679688, + "loss": 0.5154, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.4232149124145508, + "rewards/margins": 0.9718655347824097, + "rewards/rejected": -2.39508056640625, + "step": 5720 + }, + { + "epoch": 0.75, + "learning_rate": 8.939061433338722e-07, + "logits/chosen": -1.7903960943222046, + "logits/rejected": -1.7915462255477905, + "logps/chosen": -440.1333923339844, + "logps/rejected": -487.67620849609375, + "loss": 0.6097, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.5487083196640015, + "rewards/margins": 0.5043095350265503, + "rewards/rejected": -2.0530178546905518, + "step": 5730 + }, + { + "epoch": 0.75, + "learning_rate": 8.851695852495867e-07, + "logits/chosen": -1.7867329120635986, + "logits/rejected": -1.8247419595718384, + "logps/chosen": -390.4758605957031, + "logps/rejected": -478.8408203125, + "loss": 0.554, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4541441202163696, + "rewards/margins": 0.8417972326278687, + "rewards/rejected": -2.2959413528442383, + "step": 5740 + }, + { + "epoch": 0.75, + "learning_rate": 8.764667368099525e-07, + "logits/chosen": -1.7342529296875, + "logits/rejected": -1.6524286270141602, + "logps/chosen": -415.9396057128906, + "logps/rejected": -469.9048767089844, + "loss": 0.5183, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.5002275705337524, + "rewards/margins": 0.7854744791984558, + "rewards/rejected": -2.2857022285461426, + "step": 5750 + }, + { + "epoch": 0.75, + "learning_rate": 8.677977796872541e-07, + "logits/chosen": -1.7585633993148804, + "logits/rejected": -1.5713212490081787, + "logps/chosen": -468.2236328125, + "logps/rejected": -477.36895751953125, + "loss": 0.5981, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.6817764043807983, + "rewards/margins": 0.7200533151626587, + "rewards/rejected": -2.401829719543457, + "step": 5760 + }, + { + "epoch": 0.76, + "learning_rate": 8.591628948462913e-07, + "logits/chosen": -1.5852441787719727, + "logits/rejected": -1.4527372121810913, + "logps/chosen": -446.5345764160156, + "logps/rejected": -523.7599487304688, + "loss": 0.5773, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.5635210275650024, + "rewards/margins": 0.7766464948654175, + "rewards/rejected": -2.34016752243042, + "step": 5770 + }, + { + "epoch": 0.76, + "learning_rate": 8.505622625406054e-07, + "logits/chosen": -1.7319908142089844, + "logits/rejected": -1.716835379600525, + "logps/chosen": -430.82904052734375, + "logps/rejected": -505.0128479003906, + "loss": 0.5574, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.5698482990264893, + "rewards/margins": 0.7734547257423401, + "rewards/rejected": -2.3433032035827637, + "step": 5780 + }, + { + "epoch": 0.76, + "learning_rate": 8.419960623087129e-07, + "logits/chosen": -1.5350911617279053, + "logits/rejected": -1.5504592657089233, + "logps/chosen": -367.9596862792969, + "logps/rejected": -462.3741149902344, + "loss": 0.5646, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.4081454277038574, + "rewards/margins": 0.6708158254623413, + "rewards/rejected": -2.0789613723754883, + "step": 5790 + }, + { + "epoch": 0.76, + "learning_rate": 8.334644729703617e-07, + "logits/chosen": -1.7765905857086182, + "logits/rejected": -1.7603641748428345, + "logps/chosen": -389.497314453125, + "logps/rejected": -474.2781677246094, + "loss": 0.5164, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4255878925323486, + "rewards/margins": 0.8293389081954956, + "rewards/rejected": -2.2549266815185547, + "step": 5800 + }, + { + "epoch": 0.76, + "eval_logits/chosen": 0.8885383009910583, + "eval_logits/rejected": 0.9220978617668152, + "eval_logps/chosen": -440.76849365234375, + "eval_logps/rejected": -496.01495361328125, + "eval_loss": 0.5497148633003235, + "eval_rewards/accuracies": 0.7085000276565552, + "eval_rewards/chosen": -1.5547575950622559, + "eval_rewards/margins": 0.7666952013969421, + "eval_rewards/rejected": -2.321453094482422, + "eval_runtime": 1172.8418, + "eval_samples_per_second": 1.705, + "eval_steps_per_second": 0.853, + "step": 5800 + }, + { + "epoch": 0.76, + "learning_rate": 8.249676726227931e-07, + "logits/chosen": -1.6978380680084229, + "logits/rejected": -1.7133516073226929, + "logps/chosen": -496.34356689453125, + "logps/rejected": -496.5018615722656, + "loss": 0.6626, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.8182287216186523, + "rewards/margins": 0.40940380096435547, + "rewards/rejected": -2.2276322841644287, + "step": 5810 + }, + { + "epoch": 0.76, + "learning_rate": 8.165058386370314e-07, + "logits/chosen": -1.6891844272613525, + "logits/rejected": -1.6935393810272217, + "logps/chosen": -435.996337890625, + "logps/rejected": -526.1800537109375, + "loss": 0.5456, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.5769296884536743, + "rewards/margins": 0.7456473112106323, + "rewards/rejected": -2.3225769996643066, + "step": 5820 + }, + { + "epoch": 0.76, + "learning_rate": 8.080791476541721e-07, + "logits/chosen": -1.7240186929702759, + "logits/rejected": -1.7112128734588623, + "logps/chosen": -397.134521484375, + "logps/rejected": -465.83990478515625, + "loss": 0.4912, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4226911067962646, + "rewards/margins": 0.9041608572006226, + "rewards/rejected": -2.3268518447875977, + "step": 5830 + }, + { + "epoch": 0.76, + "learning_rate": 7.996877755817026e-07, + "logits/chosen": -1.8242250680923462, + "logits/rejected": -1.7658584117889404, + "logps/chosen": -432.068115234375, + "logps/rejected": -438.3821716308594, + "loss": 0.6201, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.6332203149795532, + "rewards/margins": 0.39267483353614807, + "rewards/rejected": -2.025895118713379, + "step": 5840 + }, + { + "epoch": 0.77, + "learning_rate": 7.913318975898238e-07, + "logits/chosen": -1.9215571880340576, + "logits/rejected": -1.667769193649292, + "logps/chosen": -510.7637634277344, + "logps/rejected": -530.4691772460938, + "loss": 0.5864, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.6410611867904663, + "rewards/margins": 0.7761032581329346, + "rewards/rejected": -2.4171645641326904, + "step": 5850 + }, + { + "epoch": 0.77, + "learning_rate": 7.830116881077992e-07, + "logits/chosen": -1.7714402675628662, + "logits/rejected": -1.5833399295806885, + "logps/chosen": -444.66864013671875, + "logps/rejected": -507.297607421875, + "loss": 0.4912, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.451790452003479, + "rewards/margins": 0.8779904246330261, + "rewards/rejected": -2.3297810554504395, + "step": 5860 + }, + { + "epoch": 0.77, + "learning_rate": 7.747273208203096e-07, + "logits/chosen": -1.7608497142791748, + "logits/rejected": -1.6446444988250732, + "logps/chosen": -463.907958984375, + "logps/rejected": -553.3729248046875, + "loss": 0.5506, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.7586644887924194, + "rewards/margins": 0.8479653596878052, + "rewards/rejected": -2.6066298484802246, + "step": 5870 + }, + { + "epoch": 0.77, + "learning_rate": 7.664789686638272e-07, + "logits/chosen": -1.7554763555526733, + "logits/rejected": -1.5306179523468018, + "logps/chosen": -419.94183349609375, + "logps/rejected": -517.2022094726562, + "loss": 0.5401, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.49278724193573, + "rewards/margins": 0.9025591015815735, + "rewards/rejected": -2.395346164703369, + "step": 5880 + }, + { + "epoch": 0.77, + "learning_rate": 7.582668038230089e-07, + "logits/chosen": -2.0004661083221436, + "logits/rejected": -1.8577114343643188, + "logps/chosen": -443.9126892089844, + "logps/rejected": -507.6675720214844, + "loss": 0.5518, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.4959663152694702, + "rewards/margins": 0.8871985673904419, + "rewards/rejected": -2.383164882659912, + "step": 5890 + }, + { + "epoch": 0.77, + "learning_rate": 7.500909977271007e-07, + "logits/chosen": -1.8355262279510498, + "logits/rejected": -1.774918556213379, + "logps/chosen": -470.26348876953125, + "logps/rejected": -520.8164672851562, + "loss": 0.6164, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.6989784240722656, + "rewards/margins": 0.7304049730300903, + "rewards/rejected": -2.4293832778930664, + "step": 5900 + }, + { + "epoch": 0.77, + "eval_logits/chosen": 0.8645352721214294, + "eval_logits/rejected": 0.8986992835998535, + "eval_logps/chosen": -438.6431884765625, + "eval_logps/rejected": -492.7100830078125, + "eval_loss": 0.549113392829895, + "eval_rewards/accuracies": 0.7080000042915344, + "eval_rewards/chosen": -1.5335044860839844, + "eval_rewards/margins": 0.754899799823761, + "eval_rewards/rejected": -2.2884044647216797, + "eval_runtime": 1172.6923, + "eval_samples_per_second": 1.705, + "eval_steps_per_second": 0.853, + "step": 5900 + }, + { + "epoch": 0.77, + "learning_rate": 7.41951721046357e-07, + "logits/chosen": -1.773479700088501, + "logits/rejected": -1.5404237508773804, + "logps/chosen": -413.2481384277344, + "logps/rejected": -473.30511474609375, + "loss": 0.5612, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.4232587814331055, + "rewards/margins": 0.6826823353767395, + "rewards/rejected": -2.1059410572052, + "step": 5910 + }, + { + "epoch": 0.77, + "learning_rate": 7.338491436884787e-07, + "logits/chosen": -1.734042763710022, + "logits/rejected": -1.6976509094238281, + "logps/chosen": -414.5589294433594, + "logps/rejected": -492.0824279785156, + "loss": 0.5474, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.6448618173599243, + "rewards/margins": 0.7069370150566101, + "rewards/rejected": -2.3517985343933105, + "step": 5920 + }, + { + "epoch": 0.78, + "learning_rate": 7.257834347950693e-07, + "logits/chosen": -1.7987234592437744, + "logits/rejected": -1.587146520614624, + "logps/chosen": -432.47119140625, + "logps/rejected": -452.37841796875, + "loss": 0.62, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.552919626235962, + "rewards/margins": 0.5302165746688843, + "rewards/rejected": -2.0831360816955566, + "step": 5930 + }, + { + "epoch": 0.78, + "learning_rate": 7.177547627380987e-07, + "logits/chosen": -1.802987813949585, + "logits/rejected": -1.7997875213623047, + "logps/chosen": -448.490234375, + "logps/rejected": -509.35107421875, + "loss": 0.4917, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.473778486251831, + "rewards/margins": 0.7411264181137085, + "rewards/rejected": -2.21490478515625, + "step": 5940 + }, + { + "epoch": 0.78, + "learning_rate": 7.097632951163949e-07, + "logits/chosen": -1.8216888904571533, + "logits/rejected": -1.6682456731796265, + "logps/chosen": -466.199462890625, + "logps/rejected": -504.44537353515625, + "loss": 0.5822, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.5226764678955078, + "rewards/margins": 0.7720099687576294, + "rewards/rejected": -2.2946863174438477, + "step": 5950 + }, + { + "epoch": 0.78, + "learning_rate": 7.018091987521386e-07, + "logits/chosen": -1.9580612182617188, + "logits/rejected": -1.79523503780365, + "logps/chosen": -450.51348876953125, + "logps/rejected": -491.3302307128906, + "loss": 0.5887, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.6182911396026611, + "rewards/margins": 0.6931548714637756, + "rewards/rejected": -2.311445713043213, + "step": 5960 + }, + { + "epoch": 0.78, + "learning_rate": 6.93892639687386e-07, + "logits/chosen": -1.9790513515472412, + "logits/rejected": -1.8413312435150146, + "logps/chosen": -460.31689453125, + "logps/rejected": -458.7654724121094, + "loss": 0.6095, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.4465421438217163, + "rewards/margins": 0.5680855512619019, + "rewards/rejected": -2.014627695083618, + "step": 5970 + }, + { + "epoch": 0.78, + "learning_rate": 6.860137831806018e-07, + "logits/chosen": -1.7386270761489868, + "logits/rejected": -1.6823354959487915, + "logps/chosen": -449.7940368652344, + "logps/rejected": -483.63800048828125, + "loss": 0.5787, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4916565418243408, + "rewards/margins": 0.7418898344039917, + "rewards/rejected": -2.233546257019043, + "step": 5980 + }, + { + "epoch": 0.78, + "learning_rate": 6.781727937032054e-07, + "logits/chosen": -1.7121458053588867, + "logits/rejected": -1.5884407758712769, + "logps/chosen": -407.84271240234375, + "logps/rejected": -500.04986572265625, + "loss": 0.4405, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.3468246459960938, + "rewards/margins": 0.9939699172973633, + "rewards/rejected": -2.340794563293457, + "step": 5990 + }, + { + "epoch": 0.79, + "learning_rate": 6.703698349361437e-07, + "logits/chosen": -1.857116460800171, + "logits/rejected": -1.6669254302978516, + "logps/chosen": -414.12078857421875, + "logps/rejected": -444.47564697265625, + "loss": 0.5347, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.4424694776535034, + "rewards/margins": 0.7128639817237854, + "rewards/rejected": -2.1553330421447754, + "step": 6000 + }, + { + "epoch": 0.79, + "eval_logits/chosen": 0.8396689295768738, + "eval_logits/rejected": 0.8765884637832642, + "eval_logps/chosen": -435.57208251953125, + "eval_logps/rejected": -488.74273681640625, + "eval_loss": 0.5487044453620911, + "eval_rewards/accuracies": 0.7105000019073486, + "eval_rewards/chosen": -1.5027936697006226, + "eval_rewards/margins": 0.7459368109703064, + "eval_rewards/rejected": -2.2487306594848633, + "eval_runtime": 1172.9793, + "eval_samples_per_second": 1.705, + "eval_steps_per_second": 0.853, + "step": 6000 + }, + { + "epoch": 0.79, + "learning_rate": 6.626050697664682e-07, + "logits/chosen": -1.7867813110351562, + "logits/rejected": -1.6846868991851807, + "logps/chosen": -441.70672607421875, + "logps/rejected": -468.6539001464844, + "loss": 0.526, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5382730960845947, + "rewards/margins": 0.7291477918624878, + "rewards/rejected": -2.267420530319214, + "step": 6010 + }, + { + "epoch": 0.79, + "learning_rate": 6.548786602839404e-07, + "logits/chosen": -1.7776895761489868, + "logits/rejected": -1.759871244430542, + "logps/chosen": -385.42010498046875, + "logps/rejected": -454.4854431152344, + "loss": 0.458, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.3485528230667114, + "rewards/margins": 0.991489589214325, + "rewards/rejected": -2.3400423526763916, + "step": 6020 + }, + { + "epoch": 0.79, + "learning_rate": 6.471907677776426e-07, + "logits/chosen": -1.9682811498641968, + "logits/rejected": -1.8500810861587524, + "logps/chosen": -446.57818603515625, + "logps/rejected": -465.53302001953125, + "loss": 0.5939, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4418151378631592, + "rewards/margins": 0.6426098942756653, + "rewards/rejected": -2.0844249725341797, + "step": 6030 + }, + { + "epoch": 0.79, + "learning_rate": 6.39541552732617e-07, + "logits/chosen": -1.8172651529312134, + "logits/rejected": -1.7830369472503662, + "logps/chosen": -446.30657958984375, + "logps/rejected": -528.9716186523438, + "loss": 0.5815, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.6189403533935547, + "rewards/margins": 0.6232365965843201, + "rewards/rejected": -2.2421767711639404, + "step": 6040 + }, + { + "epoch": 0.79, + "learning_rate": 6.319311748265086e-07, + "logits/chosen": -1.8408966064453125, + "logits/rejected": -1.5872822999954224, + "logps/chosen": -526.5015869140625, + "logps/rejected": -542.4791259765625, + "loss": 0.5483, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5981743335723877, + "rewards/margins": 0.7914873361587524, + "rewards/rejected": -2.3896615505218506, + "step": 6050 + }, + { + "epoch": 0.79, + "learning_rate": 6.243597929262404e-07, + "logits/chosen": -1.792720079421997, + "logits/rejected": -1.552986979484558, + "logps/chosen": -375.17608642578125, + "logps/rejected": -522.9835205078125, + "loss": 0.5579, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.538088321685791, + "rewards/margins": 1.001598596572876, + "rewards/rejected": -2.539686918258667, + "step": 6060 + }, + { + "epoch": 0.79, + "learning_rate": 6.168275650846875e-07, + "logits/chosen": -1.867114782333374, + "logits/rejected": -1.8291406631469727, + "logps/chosen": -454.90289306640625, + "logps/rejected": -469.98162841796875, + "loss": 0.5895, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4467754364013672, + "rewards/margins": 0.6458438634872437, + "rewards/rejected": -2.0926194190979004, + "step": 6070 + }, + { + "epoch": 0.8, + "learning_rate": 6.093346485373863e-07, + "logits/chosen": -1.8101370334625244, + "logits/rejected": -1.6821973323822021, + "logps/chosen": -459.3401794433594, + "logps/rejected": -492.3536071777344, + "loss": 0.538, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.5773446559906006, + "rewards/margins": 0.6525165438652039, + "rewards/rejected": -2.229861259460449, + "step": 6080 + }, + { + "epoch": 0.8, + "learning_rate": 6.018811996992455e-07, + "logits/chosen": -1.8199021816253662, + "logits/rejected": -1.6964166164398193, + "logps/chosen": -426.5926208496094, + "logps/rejected": -489.61199951171875, + "loss": 0.4093, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.2519571781158447, + "rewards/margins": 1.1711586713790894, + "rewards/rejected": -2.4231159687042236, + "step": 6090 + }, + { + "epoch": 0.8, + "learning_rate": 5.944673741612866e-07, + "logits/chosen": -1.8592021465301514, + "logits/rejected": -1.814186453819275, + "logps/chosen": -465.5372009277344, + "logps/rejected": -520.5704345703125, + "loss": 0.56, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.6353585720062256, + "rewards/margins": 0.6319504380226135, + "rewards/rejected": -2.2673091888427734, + "step": 6100 + }, + { + "epoch": 0.8, + "eval_logits/chosen": 0.8248075842857361, + "eval_logits/rejected": 0.8642656803131104, + "eval_logps/chosen": -433.8428955078125, + "eval_logps/rejected": -487.2425842285156, + "eval_loss": 0.5491208434104919, + "eval_rewards/accuracies": 0.7105000019073486, + "eval_rewards/chosen": -1.4855022430419922, + "eval_rewards/margins": 0.7482272386550903, + "eval_rewards/rejected": -2.233729600906372, + "eval_runtime": 1173.0295, + "eval_samples_per_second": 1.705, + "eval_steps_per_second": 0.852, + "step": 6100 + }, + { + "epoch": 0.8, + "learning_rate": 5.870933266873916e-07, + "logits/chosen": -1.8650023937225342, + "logits/rejected": -1.7973130941390991, + "logps/chosen": -394.88909912109375, + "logps/rejected": -456.81024169921875, + "loss": 0.625, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4461156129837036, + "rewards/margins": 0.5903973579406738, + "rewards/rejected": -2.036512851715088, + "step": 6110 + }, + { + "epoch": 0.8, + "learning_rate": 5.797592112110734e-07, + "logits/chosen": -1.6772758960723877, + "logits/rejected": -1.6647506952285767, + "logps/chosen": -378.04296875, + "logps/rejected": -415.96343994140625, + "loss": 0.653, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.527093529701233, + "rewards/margins": 0.5279368162155151, + "rewards/rejected": -2.055030345916748, + "step": 6120 + }, + { + "epoch": 0.8, + "learning_rate": 5.724651808322645e-07, + "logits/chosen": -1.7247283458709717, + "logits/rejected": -1.7005923986434937, + "logps/chosen": -404.6063232421875, + "logps/rejected": -508.9358825683594, + "loss": 0.4811, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3675364255905151, + "rewards/margins": 0.8878251910209656, + "rewards/rejected": -2.255361557006836, + "step": 6130 + }, + { + "epoch": 0.8, + "learning_rate": 5.652113878141194e-07, + "logits/chosen": -1.7346999645233154, + "logits/rejected": -1.5381842851638794, + "logps/chosen": -362.3267517089844, + "logps/rejected": -425.4991760253906, + "loss": 0.5472, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.4318227767944336, + "rewards/margins": 0.6716090440750122, + "rewards/rejected": -2.103431463241577, + "step": 6140 + }, + { + "epoch": 0.8, + "learning_rate": 5.579979835798361e-07, + "logits/chosen": -1.8034861087799072, + "logits/rejected": -1.7127468585968018, + "logps/chosen": -408.5749816894531, + "logps/rejected": -490.4581604003906, + "loss": 0.4916, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.4602086544036865, + "rewards/margins": 0.8907852172851562, + "rewards/rejected": -2.3509938716888428, + "step": 6150 + }, + { + "epoch": 0.81, + "learning_rate": 5.508251187094932e-07, + "logits/chosen": -1.9209445714950562, + "logits/rejected": -1.8475583791732788, + "logps/chosen": -458.3445739746094, + "logps/rejected": -470.88262939453125, + "loss": 0.6092, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.5180823802947998, + "rewards/margins": 0.6414216756820679, + "rewards/rejected": -2.1595041751861572, + "step": 6160 + }, + { + "epoch": 0.81, + "learning_rate": 5.436929429369122e-07, + "logits/chosen": -1.7689403295516968, + "logits/rejected": -1.6730201244354248, + "logps/chosen": -408.4197692871094, + "logps/rejected": -456.3441467285156, + "loss": 0.5689, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.4721378087997437, + "rewards/margins": 0.6763318777084351, + "rewards/rejected": -2.1484696865081787, + "step": 6170 + }, + { + "epoch": 0.81, + "learning_rate": 5.366016051465245e-07, + "logits/chosen": -1.8204491138458252, + "logits/rejected": -1.7095301151275635, + "logps/chosen": -408.523193359375, + "logps/rejected": -497.50860595703125, + "loss": 0.4561, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.390578269958496, + "rewards/margins": 0.9562222361564636, + "rewards/rejected": -2.3468003273010254, + "step": 6180 + }, + { + "epoch": 0.81, + "learning_rate": 5.295512533702701e-07, + "logits/chosen": -1.8240352869033813, + "logits/rejected": -1.6943086385726929, + "logps/chosen": -389.2616271972656, + "logps/rejected": -451.5743103027344, + "loss": 0.5642, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.4481045007705688, + "rewards/margins": 0.6837536096572876, + "rewards/rejected": -2.1318581104278564, + "step": 6190 + }, + { + "epoch": 0.81, + "learning_rate": 5.225420347845023e-07, + "logits/chosen": -1.8606281280517578, + "logits/rejected": -1.7882606983184814, + "logps/chosen": -444.04937744140625, + "logps/rejected": -502.1737365722656, + "loss": 0.587, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.4267102479934692, + "rewards/margins": 0.74085932970047, + "rewards/rejected": -2.167569637298584, + "step": 6200 + }, + { + "epoch": 0.81, + "eval_logits/chosen": 0.8071790933609009, + "eval_logits/rejected": 0.8489038944244385, + "eval_logps/chosen": -431.671142578125, + "eval_logps/rejected": -484.97882080078125, + "eval_loss": 0.549113392829895, + "eval_rewards/accuracies": 0.7095000147819519, + "eval_rewards/chosen": -1.4637844562530518, + "eval_rewards/margins": 0.7473068237304688, + "eval_rewards/rejected": -2.2110912799835205, + "eval_runtime": 1172.7765, + "eval_samples_per_second": 1.705, + "eval_steps_per_second": 0.853, + "step": 6200 + }, + { + "epoch": 0.81, + "learning_rate": 5.155740957069186e-07, + "logits/chosen": -1.9389774799346924, + "logits/rejected": -1.954559564590454, + "logps/chosen": -434.64599609375, + "logps/rejected": -488.9850158691406, + "loss": 0.5066, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4530550241470337, + "rewards/margins": 0.8636603355407715, + "rewards/rejected": -2.3167154788970947, + "step": 6210 + }, + { + "epoch": 0.81, + "learning_rate": 5.08647581593506e-07, + "logits/chosen": -1.7193613052368164, + "logits/rejected": -1.6669191122055054, + "logps/chosen": -404.49835205078125, + "logps/rejected": -478.39080810546875, + "loss": 0.4748, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.2635455131530762, + "rewards/margins": 0.8986026048660278, + "rewards/rejected": -2.1621482372283936, + "step": 6220 + }, + { + "epoch": 0.82, + "learning_rate": 5.017626370355014e-07, + "logits/chosen": -1.7605371475219727, + "logits/rejected": -1.5838454961776733, + "logps/chosen": -415.41375732421875, + "logps/rejected": -465.14178466796875, + "loss": 0.4511, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.3500653505325317, + "rewards/margins": 0.9630001783370972, + "rewards/rejected": -2.31306529045105, + "step": 6230 + }, + { + "epoch": 0.82, + "learning_rate": 4.949194057563783e-07, + "logits/chosen": -1.8982305526733398, + "logits/rejected": -1.763641357421875, + "logps/chosen": -435.21533203125, + "logps/rejected": -451.747314453125, + "loss": 0.614, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5065052509307861, + "rewards/margins": 0.5958704352378845, + "rewards/rejected": -2.1023757457733154, + "step": 6240 + }, + { + "epoch": 0.82, + "learning_rate": 4.881180306088418e-07, + "logits/chosen": -1.8761934041976929, + "logits/rejected": -1.6820194721221924, + "logps/chosen": -433.70941162109375, + "logps/rejected": -476.25091552734375, + "loss": 0.4914, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.3821966648101807, + "rewards/margins": 0.9251546859741211, + "rewards/rejected": -2.3073513507843018, + "step": 6250 + }, + { + "epoch": 0.82, + "learning_rate": 4.813586535718512e-07, + "logits/chosen": -1.9150760173797607, + "logits/rejected": -1.6301143169403076, + "logps/chosen": -467.3033142089844, + "logps/rejected": -483.08721923828125, + "loss": 0.5139, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.407266616821289, + "rewards/margins": 0.9681908488273621, + "rewards/rejected": -2.375457286834717, + "step": 6260 + }, + { + "epoch": 0.82, + "learning_rate": 4.746414157476506e-07, + "logits/chosen": -1.9365136623382568, + "logits/rejected": -1.7622143030166626, + "logps/chosen": -382.59930419921875, + "logps/rejected": -452.78363037109375, + "loss": 0.4941, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.3162574768066406, + "rewards/margins": 0.9539827108383179, + "rewards/rejected": -2.270240306854248, + "step": 6270 + }, + { + "epoch": 0.82, + "learning_rate": 4.679664573588294e-07, + "logits/chosen": -1.7709249258041382, + "logits/rejected": -1.65009343624115, + "logps/chosen": -375.88079833984375, + "logps/rejected": -438.1414489746094, + "loss": 0.508, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.3196238279342651, + "rewards/margins": 0.7960411906242371, + "rewards/rejected": -2.1156649589538574, + "step": 6280 + }, + { + "epoch": 0.82, + "learning_rate": 4.6133391774538903e-07, + "logits/chosen": -1.9863331317901611, + "logits/rejected": -1.8483604192733765, + "logps/chosen": -450.23858642578125, + "logps/rejected": -484.78631591796875, + "loss": 0.5767, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.3949511051177979, + "rewards/margins": 0.8456497192382812, + "rewards/rejected": -2.2406005859375, + "step": 6290 + }, + { + "epoch": 0.82, + "learning_rate": 4.5474393536184214e-07, + "logits/chosen": -1.9218631982803345, + "logits/rejected": -1.7662346363067627, + "logps/chosen": -417.332763671875, + "logps/rejected": -461.92791748046875, + "loss": 0.4927, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.3764417171478271, + "rewards/margins": 0.7701510190963745, + "rewards/rejected": -2.146592617034912, + "step": 6300 + }, + { + "epoch": 0.82, + "eval_logits/chosen": 0.8118359446525574, + "eval_logits/rejected": 0.8531017899513245, + "eval_logps/chosen": -431.203857421875, + "eval_logps/rejected": -484.6880798339844, + "eval_loss": 0.5489959716796875, + "eval_rewards/accuracies": 0.7089999914169312, + "eval_rewards/chosen": -1.4591114521026611, + "eval_rewards/margins": 0.7490728497505188, + "eval_rewards/rejected": -2.208184242248535, + "eval_runtime": 1172.584, + "eval_samples_per_second": 1.706, + "eval_steps_per_second": 0.853, + "step": 6300 + }, + { + "epoch": 0.83, + "learning_rate": 4.4819664777431243e-07, + "logits/chosen": -1.7982280254364014, + "logits/rejected": -1.6588172912597656, + "logps/chosen": -379.1011657714844, + "logps/rejected": -411.88262939453125, + "loss": 0.6031, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.398041009902954, + "rewards/margins": 0.5266937017440796, + "rewards/rejected": -1.9247347116470337, + "step": 6310 + }, + { + "epoch": 0.83, + "learning_rate": 4.416921916576722e-07, + "logits/chosen": -1.7167733907699585, + "logits/rejected": -1.534623622894287, + "logps/chosen": -463.98712158203125, + "logps/rejected": -516.0758056640625, + "loss": 0.5965, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.516566514968872, + "rewards/margins": 0.6256294250488281, + "rewards/rejected": -2.142195701599121, + "step": 6320 + }, + { + "epoch": 0.83, + "learning_rate": 4.352307027926828e-07, + "logits/chosen": -1.8432748317718506, + "logits/rejected": -1.7553224563598633, + "logps/chosen": -410.40191650390625, + "logps/rejected": -486.0369567871094, + "loss": 0.4201, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2879002094268799, + "rewards/margins": 1.0923058986663818, + "rewards/rejected": -2.3802061080932617, + "step": 6330 + }, + { + "epoch": 0.83, + "learning_rate": 4.288123160631624e-07, + "logits/chosen": -1.5096865892410278, + "logits/rejected": -1.5543712377548218, + "logps/chosen": -422.20526123046875, + "logps/rejected": -469.3055725097656, + "loss": 0.6692, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.6089191436767578, + "rewards/margins": 0.574799656867981, + "rewards/rejected": -2.1837189197540283, + "step": 6340 + }, + { + "epoch": 0.83, + "learning_rate": 4.224371654531731e-07, + "logits/chosen": -1.836951494216919, + "logits/rejected": -1.7463347911834717, + "logps/chosen": -422.5487365722656, + "logps/rejected": -446.9239196777344, + "loss": 0.591, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.5224004983901978, + "rewards/margins": 0.5998014211654663, + "rewards/rejected": -2.122201919555664, + "step": 6350 + }, + { + "epoch": 0.83, + "learning_rate": 4.1610538404421837e-07, + "logits/chosen": -1.851381540298462, + "logits/rejected": -1.8891586065292358, + "logps/chosen": -388.76495361328125, + "logps/rejected": -489.7081604003906, + "loss": 0.4899, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.263028860092163, + "rewards/margins": 0.7727184891700745, + "rewards/rejected": -2.0357470512390137, + "step": 6360 + }, + { + "epoch": 0.83, + "learning_rate": 4.098171040124699e-07, + "logits/chosen": -1.901296615600586, + "logits/rejected": -1.7560195922851562, + "logps/chosen": -491.11767578125, + "logps/rejected": -490.3089904785156, + "loss": 0.621, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.6198790073394775, + "rewards/margins": 0.612278938293457, + "rewards/rejected": -2.2321579456329346, + "step": 6370 + }, + { + "epoch": 0.83, + "learning_rate": 4.03572456626006e-07, + "logits/chosen": -1.7471396923065186, + "logits/rejected": -1.8034473657608032, + "logps/chosen": -431.95391845703125, + "logps/rejected": -472.6758728027344, + "loss": 0.5683, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.4471083879470825, + "rewards/margins": 0.592138409614563, + "rewards/rejected": -2.0392465591430664, + "step": 6380 + }, + { + "epoch": 0.84, + "learning_rate": 3.9737157224207265e-07, + "logits/chosen": -1.8360217809677124, + "logits/rejected": -1.772001028060913, + "logps/chosen": -389.5067443847656, + "logps/rejected": -454.16253662109375, + "loss": 0.5769, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3939635753631592, + "rewards/margins": 0.6599873304367065, + "rewards/rejected": -2.0539510250091553, + "step": 6390 + }, + { + "epoch": 0.84, + "learning_rate": 3.912145803043596e-07, + "logits/chosen": -1.8079197406768799, + "logits/rejected": -1.750848412513733, + "logps/chosen": -432.05364990234375, + "logps/rejected": -449.82830810546875, + "loss": 0.6102, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.4526618719100952, + "rewards/margins": 0.4696109890937805, + "rewards/rejected": -1.9222729206085205, + "step": 6400 + }, + { + "epoch": 0.84, + "eval_logits/chosen": 0.8054977655410767, + "eval_logits/rejected": 0.8473966717720032, + "eval_logps/chosen": -429.9117126464844, + "eval_logps/rejected": -483.1517639160156, + "eval_loss": 0.5485800504684448, + "eval_rewards/accuracies": 0.7105000019073486, + "eval_rewards/chosen": -1.4461897611618042, + "eval_rewards/margins": 0.7466309666633606, + "eval_rewards/rejected": -2.1928207874298096, + "eval_runtime": 1172.5265, + "eval_samples_per_second": 1.706, + "eval_steps_per_second": 0.853, + "step": 6400 + }, + { + "epoch": 0.84, + "learning_rate": 3.851016093403023e-07, + "logits/chosen": -1.6788511276245117, + "logits/rejected": -1.6036514043807983, + "logps/chosen": -402.7706604003906, + "logps/rejected": -466.3506774902344, + "loss": 0.5659, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.5908563137054443, + "rewards/margins": 0.6856015920639038, + "rewards/rejected": -2.2764577865600586, + "step": 6410 + }, + { + "epoch": 0.84, + "learning_rate": 3.7903278695839456e-07, + "logits/chosen": -1.7197093963623047, + "logits/rejected": -1.774407982826233, + "logps/chosen": -426.36505126953125, + "logps/rejected": -464.59356689453125, + "loss": 0.5771, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.430627465248108, + "rewards/margins": 0.6341504454612732, + "rewards/rejected": -2.0647778511047363, + "step": 6420 + }, + { + "epoch": 0.84, + "learning_rate": 3.7300823984552983e-07, + "logits/chosen": -1.7846879959106445, + "logits/rejected": -1.7678248882293701, + "logps/chosen": -379.194580078125, + "logps/rejected": -457.87493896484375, + "loss": 0.5666, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.3751968145370483, + "rewards/margins": 0.6400051712989807, + "rewards/rejected": -2.015202045440674, + "step": 6430 + }, + { + "epoch": 0.84, + "learning_rate": 3.670280937643503e-07, + "logits/chosen": -1.780124306678772, + "logits/rejected": -1.6688493490219116, + "logps/chosen": -417.65606689453125, + "logps/rejected": -460.10565185546875, + "loss": 0.5392, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.4370096921920776, + "rewards/margins": 0.7721136212348938, + "rewards/rejected": -2.209123134613037, + "step": 6440 + }, + { + "epoch": 0.84, + "learning_rate": 3.610924735506274e-07, + "logits/chosen": -1.850022315979004, + "logits/rejected": -1.601930856704712, + "logps/chosen": -468.7164001464844, + "logps/rejected": -456.21746826171875, + "loss": 0.6132, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.5035723447799683, + "rewards/margins": 0.572163462638855, + "rewards/rejected": -2.0757358074188232, + "step": 6450 + }, + { + "epoch": 0.85, + "learning_rate": 3.5520150311065316e-07, + "logits/chosen": -1.6352055072784424, + "logits/rejected": -1.6226692199707031, + "logps/chosen": -429.9473571777344, + "logps/rejected": -486.21343994140625, + "loss": 0.4952, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.3845300674438477, + "rewards/margins": 0.7877673506736755, + "rewards/rejected": -2.1722970008850098, + "step": 6460 + }, + { + "epoch": 0.85, + "learning_rate": 3.493553054186527e-07, + "logits/chosen": -1.8799070119857788, + "logits/rejected": -1.7547706365585327, + "logps/chosen": -437.9310607910156, + "logps/rejected": -493.5552673339844, + "loss": 0.577, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.586670160293579, + "rewards/margins": 0.640863299369812, + "rewards/rejected": -2.2275335788726807, + "step": 6470 + }, + { + "epoch": 0.85, + "learning_rate": 3.4355400251421977e-07, + "logits/chosen": -1.700897216796875, + "logits/rejected": -1.6764428615570068, + "logps/chosen": -419.20635986328125, + "logps/rejected": -443.47802734375, + "loss": 0.6838, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5222584009170532, + "rewards/margins": 0.4975927472114563, + "rewards/rejected": -2.0198514461517334, + "step": 6480 + }, + { + "epoch": 0.85, + "learning_rate": 3.3779771549976637e-07, + "logits/chosen": -1.8302929401397705, + "logits/rejected": -1.645246148109436, + "logps/chosen": -411.7997131347656, + "logps/rejected": -459.4706115722656, + "loss": 0.547, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.4969736337661743, + "rewards/margins": 0.7363954186439514, + "rewards/rejected": -2.2333691120147705, + "step": 6490 + }, + { + "epoch": 0.85, + "learning_rate": 3.3208656453799783e-07, + "logits/chosen": -1.8124793767929077, + "logits/rejected": -1.742392897605896, + "logps/chosen": -402.3931884765625, + "logps/rejected": -453.42083740234375, + "loss": 0.4988, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3512194156646729, + "rewards/margins": 0.8209896087646484, + "rewards/rejected": -2.172208786010742, + "step": 6500 + }, + { + "epoch": 0.85, + "eval_logits/chosen": 0.8046004772186279, + "eval_logits/rejected": 0.8463611006736755, + "eval_logps/chosen": -430.1142272949219, + "eval_logps/rejected": -483.24664306640625, + "eval_loss": 0.5485355257987976, + "eval_rewards/accuracies": 0.7095000147819519, + "eval_rewards/chosen": -1.448215126991272, + "eval_rewards/margins": 0.74555504322052, + "eval_rewards/rejected": -2.193770170211792, + "eval_runtime": 1172.8617, + "eval_samples_per_second": 1.705, + "eval_steps_per_second": 0.853, + "step": 6500 + }, + { + "epoch": 0.85, + "learning_rate": 3.2642066884940064e-07, + "logits/chosen": -1.7718359231948853, + "logits/rejected": -1.6276681423187256, + "logps/chosen": -421.28515625, + "logps/rejected": -518.3148193359375, + "loss": 0.534, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.3754109144210815, + "rewards/margins": 0.9865762591362, + "rewards/rejected": -2.361987352371216, + "step": 6510 + }, + { + "epoch": 0.85, + "learning_rate": 3.2080014670975825e-07, + "logits/chosen": -1.913540244102478, + "logits/rejected": -1.8367464542388916, + "logps/chosen": -403.6368103027344, + "logps/rejected": -440.57232666015625, + "loss": 0.5577, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3662607669830322, + "rewards/margins": 0.6077486872673035, + "rewards/rejected": -1.9740097522735596, + "step": 6520 + }, + { + "epoch": 0.85, + "learning_rate": 3.152251154476765e-07, + "logits/chosen": -1.7818803787231445, + "logits/rejected": -1.7090286016464233, + "logps/chosen": -394.8916320800781, + "logps/rejected": -472.5263671875, + "loss": 0.5199, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4043118953704834, + "rewards/margins": 0.8041407465934753, + "rewards/rejected": -2.2084529399871826, + "step": 6530 + }, + { + "epoch": 0.86, + "learning_rate": 3.0969569144214147e-07, + "logits/chosen": -1.9432601928710938, + "logits/rejected": -1.8312276601791382, + "logps/chosen": -423.01953125, + "logps/rejected": -476.34075927734375, + "loss": 0.4858, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3503367900848389, + "rewards/margins": 0.8274005055427551, + "rewards/rejected": -2.177737236022949, + "step": 6540 + }, + { + "epoch": 0.86, + "learning_rate": 3.042119901200824e-07, + "logits/chosen": -1.7422155141830444, + "logits/rejected": -1.7008956670761108, + "logps/chosen": -401.38153076171875, + "logps/rejected": -487.8135681152344, + "loss": 0.6273, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.5218549966812134, + "rewards/margins": 0.5686341524124146, + "rewards/rejected": -2.090488910675049, + "step": 6550 + }, + { + "epoch": 0.86, + "learning_rate": 2.9877412595396726e-07, + "logits/chosen": -1.9248844385147095, + "logits/rejected": -1.8276618719100952, + "logps/chosen": -468.0619201660156, + "logps/rejected": -514.0059204101562, + "loss": 0.5368, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.4151637554168701, + "rewards/margins": 0.8898487091064453, + "rewards/rejected": -2.3050124645233154, + "step": 6560 + }, + { + "epoch": 0.86, + "learning_rate": 2.933822124594124e-07, + "logits/chosen": -1.873353362083435, + "logits/rejected": -1.7209460735321045, + "logps/chosen": -425.2776794433594, + "logps/rejected": -449.8960876464844, + "loss": 0.5923, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.4606962203979492, + "rewards/margins": 0.6035781502723694, + "rewards/rejected": -2.064274311065674, + "step": 6570 + }, + { + "epoch": 0.86, + "learning_rate": 2.880363621928106e-07, + "logits/chosen": -1.8270835876464844, + "logits/rejected": -1.6930897235870361, + "logps/chosen": -447.11419677734375, + "logps/rejected": -467.6141662597656, + "loss": 0.5549, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.502572774887085, + "rewards/margins": 0.617160975933075, + "rewards/rejected": -2.1197338104248047, + "step": 6580 + }, + { + "epoch": 0.86, + "learning_rate": 2.82736686748985e-07, + "logits/chosen": -1.8135850429534912, + "logits/rejected": -1.7034486532211304, + "logps/chosen": -442.55743408203125, + "logps/rejected": -456.33111572265625, + "loss": 0.5417, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.453835368156433, + "rewards/margins": 0.7465580701828003, + "rewards/rejected": -2.2003934383392334, + "step": 6590 + }, + { + "epoch": 0.86, + "learning_rate": 2.774832967588556e-07, + "logits/chosen": -1.9186111688613892, + "logits/rejected": -1.779409646987915, + "logps/chosen": -448.62322998046875, + "logps/rejected": -503.616943359375, + "loss": 0.5544, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.4722912311553955, + "rewards/margins": 0.8435390591621399, + "rewards/rejected": -2.3158302307128906, + "step": 6600 + }, + { + "epoch": 0.86, + "eval_logits/chosen": 0.8067517876625061, + "eval_logits/rejected": 0.8487147688865662, + "eval_logps/chosen": -430.1987609863281, + "eval_logps/rejected": -483.3599853515625, + "eval_loss": 0.5486475229263306, + "eval_rewards/accuracies": 0.7114999890327454, + "eval_rewards/chosen": -1.449060320854187, + "eval_rewards/margins": 0.7458434700965881, + "eval_rewards/rejected": -2.19490385055542, + "eval_runtime": 1172.9906, + "eval_samples_per_second": 1.705, + "eval_steps_per_second": 0.853, + "step": 6600 + }, + { + "epoch": 0.86, + "learning_rate": 2.7227630188713326e-07, + "logits/chosen": -1.9206463098526, + "logits/rejected": -1.6951115131378174, + "logps/chosen": -467.5791015625, + "logps/rejected": -486.3162536621094, + "loss": 0.5427, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.501364827156067, + "rewards/margins": 0.8067043423652649, + "rewards/rejected": -2.3080692291259766, + "step": 6610 + }, + { + "epoch": 0.87, + "learning_rate": 2.671158108300284e-07, + "logits/chosen": -1.9582704305648804, + "logits/rejected": -1.8491710424423218, + "logps/chosen": -426.76953125, + "logps/rejected": -487.92584228515625, + "loss": 0.554, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.4947972297668457, + "rewards/margins": 0.6211631298065186, + "rewards/rejected": -2.1159605979919434, + "step": 6620 + }, + { + "epoch": 0.87, + "learning_rate": 2.6200193131298376e-07, + "logits/chosen": -1.9450123310089111, + "logits/rejected": -1.8916527032852173, + "logps/chosen": -439.2418518066406, + "logps/rejected": -511.76898193359375, + "loss": 0.4518, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.3632786273956299, + "rewards/margins": 1.0412644147872925, + "rewards/rejected": -2.404543161392212, + "step": 6630 + }, + { + "epoch": 0.87, + "learning_rate": 2.569347700884217e-07, + "logits/chosen": -1.978216528892517, + "logits/rejected": -1.759415626525879, + "logps/chosen": -418.84686279296875, + "logps/rejected": -473.9796447753906, + "loss": 0.4749, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.3123157024383545, + "rewards/margins": 1.0165531635284424, + "rewards/rejected": -2.328868865966797, + "step": 6640 + }, + { + "epoch": 0.87, + "learning_rate": 2.5191443293352186e-07, + "logits/chosen": -1.8891208171844482, + "logits/rejected": -1.8109643459320068, + "logps/chosen": -438.7274475097656, + "logps/rejected": -514.1735229492188, + "loss": 0.5407, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4189406633377075, + "rewards/margins": 0.7500912547111511, + "rewards/rejected": -2.169032335281372, + "step": 6650 + }, + { + "epoch": 0.87, + "learning_rate": 2.469410246480067e-07, + "logits/chosen": -1.6742660999298096, + "logits/rejected": -1.5309489965438843, + "logps/chosen": -404.78057861328125, + "logps/rejected": -482.4720153808594, + "loss": 0.4872, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.563522458076477, + "rewards/margins": 0.971258819103241, + "rewards/rejected": -2.5347812175750732, + "step": 6660 + }, + { + "epoch": 0.87, + "learning_rate": 2.4201464905195955e-07, + "logits/chosen": -1.8839927911758423, + "logits/rejected": -1.8668091297149658, + "logps/chosen": -414.96142578125, + "logps/rejected": -461.70416259765625, + "loss": 0.6223, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.4317705631256104, + "rewards/margins": 0.45908093452453613, + "rewards/rejected": -1.890851616859436, + "step": 6670 + }, + { + "epoch": 0.87, + "learning_rate": 2.3713540898365196e-07, + "logits/chosen": -1.726810097694397, + "logits/rejected": -1.7777540683746338, + "logps/chosen": -420.916748046875, + "logps/rejected": -478.23419189453125, + "loss": 0.5019, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.336426854133606, + "rewards/margins": 0.8878466486930847, + "rewards/rejected": -2.224273443222046, + "step": 6680 + }, + { + "epoch": 0.88, + "learning_rate": 2.3230340629740166e-07, + "logits/chosen": -1.9040178060531616, + "logits/rejected": -1.8105716705322266, + "logps/chosen": -430.43817138671875, + "logps/rejected": -438.405029296875, + "loss": 0.6784, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.4686359167099, + "rewards/margins": 0.3403673768043518, + "rewards/rejected": -1.8090031147003174, + "step": 6690 + }, + { + "epoch": 0.88, + "learning_rate": 2.2751874186144357e-07, + "logits/chosen": -1.8653980493545532, + "logits/rejected": -1.7417211532592773, + "logps/chosen": -430.21527099609375, + "logps/rejected": -449.42987060546875, + "loss": 0.5828, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.2628891468048096, + "rewards/margins": 0.6491962671279907, + "rewards/rejected": -1.9120851755142212, + "step": 6700 + }, + { + "epoch": 0.88, + "eval_logits/chosen": 0.8096733689308167, + "eval_logits/rejected": 0.8511734008789062, + "eval_logps/chosen": -430.47711181640625, + "eval_logps/rejected": -483.68023681640625, + "eval_loss": 0.5486401319503784, + "eval_rewards/accuracies": 0.7099999785423279, + "eval_rewards/chosen": -1.4518442153930664, + "eval_rewards/margins": 0.7462618947029114, + "eval_rewards/rejected": -2.198106050491333, + "eval_runtime": 1172.8106, + "eval_samples_per_second": 1.705, + "eval_steps_per_second": 0.853, + "step": 6700 + }, + { + "epoch": 0.88, + "learning_rate": 2.227815155558241e-07, + "logits/chosen": -1.9332526922225952, + "logits/rejected": -1.8840205669403076, + "logps/chosen": -431.55413818359375, + "logps/rejected": -513.9845581054688, + "loss": 0.4791, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.412088394165039, + "rewards/margins": 0.9422322511672974, + "rewards/rejected": -2.354320526123047, + "step": 6710 + }, + { + "epoch": 0.88, + "learning_rate": 2.1809182627031883e-07, + "logits/chosen": -2.016242265701294, + "logits/rejected": -1.8350414037704468, + "logps/chosen": -445.91046142578125, + "logps/rejected": -487.2635803222656, + "loss": 0.5423, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.4208695888519287, + "rewards/margins": 0.719671905040741, + "rewards/rejected": -2.1405415534973145, + "step": 6720 + }, + { + "epoch": 0.88, + "learning_rate": 2.1344977190236372e-07, + "logits/chosen": -1.6209218502044678, + "logits/rejected": -1.6282440423965454, + "logps/chosen": -411.72576904296875, + "logps/rejected": -483.763916015625, + "loss": 0.579, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.5281398296356201, + "rewards/margins": 0.7512315511703491, + "rewards/rejected": -2.279371500015259, + "step": 6730 + }, + { + "epoch": 0.88, + "learning_rate": 2.0885544935501656e-07, + "logits/chosen": -1.7832863330841064, + "logits/rejected": -1.773505449295044, + "logps/chosen": -409.8037109375, + "logps/rejected": -504.25994873046875, + "loss": 0.4929, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.349562168121338, + "rewards/margins": 0.9078701138496399, + "rewards/rejected": -2.257432222366333, + "step": 6740 + }, + { + "epoch": 0.88, + "learning_rate": 2.0430895453492944e-07, + "logits/chosen": -1.8785454034805298, + "logits/rejected": -1.8223320245742798, + "logps/chosen": -455.398193359375, + "logps/rejected": -466.92901611328125, + "loss": 0.613, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4360158443450928, + "rewards/margins": 0.49601641297340393, + "rewards/rejected": -1.9320322275161743, + "step": 6750 + }, + { + "epoch": 0.88, + "learning_rate": 1.9981038235035111e-07, + "logits/chosen": -1.7741162776947021, + "logits/rejected": -1.7254183292388916, + "logps/chosen": -409.7948303222656, + "logps/rejected": -478.5233459472656, + "loss": 0.4285, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.2761456966400146, + "rewards/margins": 0.9531618356704712, + "rewards/rejected": -2.2293076515197754, + "step": 6760 + }, + { + "epoch": 0.89, + "learning_rate": 1.9535982670914112e-07, + "logits/chosen": -1.7457072734832764, + "logits/rejected": -1.5970163345336914, + "logps/chosen": -461.16253662109375, + "logps/rejected": -513.734130859375, + "loss": 0.532, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.446793794631958, + "rewards/margins": 0.8001499176025391, + "rewards/rejected": -2.246943712234497, + "step": 6770 + }, + { + "epoch": 0.89, + "learning_rate": 1.9095738051681412e-07, + "logits/chosen": -1.7886247634887695, + "logits/rejected": -1.7298681735992432, + "logps/chosen": -417.9925231933594, + "logps/rejected": -471.6416015625, + "loss": 0.59, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.6166937351226807, + "rewards/margins": 0.5833561420440674, + "rewards/rejected": -2.200049877166748, + "step": 6780 + }, + { + "epoch": 0.89, + "learning_rate": 1.8660313567459703e-07, + "logits/chosen": -1.7199945449829102, + "logits/rejected": -1.8157663345336914, + "logps/chosen": -388.76190185546875, + "logps/rejected": -467.54638671875, + "loss": 0.5576, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4115843772888184, + "rewards/margins": 0.8241313695907593, + "rewards/rejected": -2.235715866088867, + "step": 6790 + }, + { + "epoch": 0.89, + "learning_rate": 1.8229718307751165e-07, + "logits/chosen": -1.9006595611572266, + "logits/rejected": -1.7318729162216187, + "logps/chosen": -449.3634338378906, + "logps/rejected": -472.896728515625, + "loss": 0.5711, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.5036065578460693, + "rewards/margins": 0.819083034992218, + "rewards/rejected": -2.3226895332336426, + "step": 6800 + }, + { + "epoch": 0.89, + "eval_logits/chosen": 0.8124059438705444, + "eval_logits/rejected": 0.8538053631782532, + "eval_logps/chosen": -430.8609924316406, + "eval_logps/rejected": -484.1659851074219, + "eval_loss": 0.5484992861747742, + "eval_rewards/accuracies": 0.7095000147819519, + "eval_rewards/chosen": -1.4556825160980225, + "eval_rewards/margins": 0.747280478477478, + "eval_rewards/rejected": -2.20296311378479, + "eval_runtime": 1173.0318, + "eval_samples_per_second": 1.705, + "eval_steps_per_second": 0.852, + "step": 6800 + }, + { + "epoch": 0.89, + "learning_rate": 1.7803961261247864e-07, + "logits/chosen": -1.7197239398956299, + "logits/rejected": -1.6941922903060913, + "logps/chosen": -430.0616149902344, + "logps/rejected": -510.97027587890625, + "loss": 0.4903, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3526915311813354, + "rewards/margins": 0.8942115902900696, + "rewards/rejected": -2.24690318107605, + "step": 6810 + }, + { + "epoch": 0.89, + "learning_rate": 1.7383051315643772e-07, + "logits/chosen": -1.8691461086273193, + "logits/rejected": -1.7417608499526978, + "logps/chosen": -444.39208984375, + "logps/rejected": -472.8169860839844, + "loss": 0.5579, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.4681408405303955, + "rewards/margins": 0.6582044363021851, + "rewards/rejected": -2.126345157623291, + "step": 6820 + }, + { + "epoch": 0.89, + "learning_rate": 1.6966997257449685e-07, + "logits/chosen": -1.8133577108383179, + "logits/rejected": -1.7999480962753296, + "logps/chosen": -434.6233825683594, + "logps/rejected": -474.06884765625, + "loss": 0.562, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4581729173660278, + "rewards/margins": 0.6189436912536621, + "rewards/rejected": -2.0771164894104004, + "step": 6830 + }, + { + "epoch": 0.9, + "learning_rate": 1.6555807771809375e-07, + "logits/chosen": -1.7792173624038696, + "logits/rejected": -1.694387674331665, + "logps/chosen": -416.30206298828125, + "logps/rejected": -446.8271484375, + "loss": 0.5098, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.4068571329116821, + "rewards/margins": 0.8837915658950806, + "rewards/rejected": -2.2906486988067627, + "step": 6840 + }, + { + "epoch": 0.9, + "learning_rate": 1.6149491442318617e-07, + "logits/chosen": -1.906616449356079, + "logits/rejected": -1.8216674327850342, + "logps/chosen": -415.72918701171875, + "logps/rejected": -471.26568603515625, + "loss": 0.5208, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.342486023902893, + "rewards/margins": 0.7155783176422119, + "rewards/rejected": -2.0580644607543945, + "step": 6850 + }, + { + "epoch": 0.9, + "learning_rate": 1.5748056750845786e-07, + "logits/chosen": -1.8992087841033936, + "logits/rejected": -1.8292995691299438, + "logps/chosen": -452.1145935058594, + "logps/rejected": -446.95452880859375, + "loss": 0.5498, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.5781564712524414, + "rewards/margins": 0.6276174783706665, + "rewards/rejected": -2.2057740688323975, + "step": 6860 + }, + { + "epoch": 0.9, + "learning_rate": 1.5351512077355024e-07, + "logits/chosen": -1.8367058038711548, + "logits/rejected": -1.720963478088379, + "logps/chosen": -443.9596252441406, + "logps/rejected": -561.5131225585938, + "loss": 0.4586, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.2995741367340088, + "rewards/margins": 0.9421840906143188, + "rewards/rejected": -2.241758108139038, + "step": 6870 + }, + { + "epoch": 0.9, + "learning_rate": 1.4959865699730902e-07, + "logits/chosen": -1.7452888488769531, + "logits/rejected": -1.623615026473999, + "logps/chosen": -401.3358459472656, + "logps/rejected": -451.8515625, + "loss": 0.5025, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.5068118572235107, + "rewards/margins": 0.8342909812927246, + "rewards/rejected": -2.3411028385162354, + "step": 6880 + }, + { + "epoch": 0.9, + "learning_rate": 1.4573125793606202e-07, + "logits/chosen": -1.7507559061050415, + "logits/rejected": -1.617040991783142, + "logps/chosen": -389.5786437988281, + "logps/rejected": -453.15240478515625, + "loss": 0.5231, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.5261350870132446, + "rewards/margins": 0.7890563607215881, + "rewards/rejected": -2.3151915073394775, + "step": 6890 + }, + { + "epoch": 0.9, + "learning_rate": 1.4191300432190634e-07, + "logits/chosen": -1.7426488399505615, + "logits/rejected": -1.4778661727905273, + "logps/chosen": -446.41387939453125, + "logps/rejected": -484.15740966796875, + "loss": 0.5621, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.5824512243270874, + "rewards/margins": 0.6358043551445007, + "rewards/rejected": -2.2182555198669434, + "step": 6900 + }, + { + "epoch": 0.9, + "eval_logits/chosen": 0.8118740916252136, + "eval_logits/rejected": 0.8534757494926453, + "eval_logps/chosen": -430.86248779296875, + "eval_logps/rejected": -484.2228698730469, + "eval_loss": 0.5483530759811401, + "eval_rewards/accuracies": 0.7124999761581421, + "eval_rewards/chosen": -1.4556974172592163, + "eval_rewards/margins": 0.7478345632553101, + "eval_rewards/rejected": -2.2035317420959473, + "eval_runtime": 1173.3393, + "eval_samples_per_second": 1.705, + "eval_steps_per_second": 0.852, + "step": 6900 + }, + { + "epoch": 0.9, + "learning_rate": 1.381439758610284e-07, + "logits/chosen": -1.8737863302230835, + "logits/rejected": -1.677040696144104, + "logps/chosen": -414.41937255859375, + "logps/rejected": -454.90057373046875, + "loss": 0.5616, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.3846615552902222, + "rewards/margins": 0.5714194774627686, + "rewards/rejected": -1.9560811519622803, + "step": 6910 + }, + { + "epoch": 0.91, + "learning_rate": 1.3442425123203596e-07, + "logits/chosen": -1.9634335041046143, + "logits/rejected": -1.827630639076233, + "logps/chosen": -404.2421875, + "logps/rejected": -489.4110412597656, + "loss": 0.4855, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3388338088989258, + "rewards/margins": 0.8733822107315063, + "rewards/rejected": -2.2122159004211426, + "step": 6920 + }, + { + "epoch": 0.91, + "learning_rate": 1.3075390808431897e-07, + "logits/chosen": -1.6054050922393799, + "logits/rejected": -1.5290915966033936, + "logps/chosen": -398.23614501953125, + "logps/rejected": -439.45953369140625, + "loss": 0.5452, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.4326770305633545, + "rewards/margins": 0.7014239430427551, + "rewards/rejected": -2.134101152420044, + "step": 6930 + }, + { + "epoch": 0.91, + "learning_rate": 1.271330230364262e-07, + "logits/chosen": -1.8218040466308594, + "logits/rejected": -1.748741865158081, + "logps/chosen": -415.97271728515625, + "logps/rejected": -539.3084716796875, + "loss": 0.5837, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.513819932937622, + "rewards/margins": 0.7359345555305481, + "rewards/rejected": -2.2497544288635254, + "step": 6940 + }, + { + "epoch": 0.91, + "learning_rate": 1.2356167167446698e-07, + "logits/chosen": -1.772926926612854, + "logits/rejected": -1.7543052434921265, + "logps/chosen": -412.350341796875, + "logps/rejected": -494.44891357421875, + "loss": 0.5541, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.5925939083099365, + "rewards/margins": 0.7502508759498596, + "rewards/rejected": -2.3428447246551514, + "step": 6950 + }, + { + "epoch": 0.91, + "learning_rate": 1.2003992855053326e-07, + "logits/chosen": -1.7478806972503662, + "logits/rejected": -1.59023916721344, + "logps/chosen": -392.5653991699219, + "logps/rejected": -542.6173706054688, + "loss": 0.4976, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3845903873443604, + "rewards/margins": 1.542232871055603, + "rewards/rejected": -2.926823616027832, + "step": 6960 + }, + { + "epoch": 0.91, + "learning_rate": 1.1656786718114239e-07, + "logits/chosen": -1.7298316955566406, + "logits/rejected": -1.7008460760116577, + "logps/chosen": -412.833984375, + "logps/rejected": -471.994384765625, + "loss": 0.5261, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.4343531131744385, + "rewards/margins": 0.7191973328590393, + "rewards/rejected": -2.153550386428833, + "step": 6970 + }, + { + "epoch": 0.91, + "learning_rate": 1.1314556004570487e-07, + "logits/chosen": -1.722328782081604, + "logits/rejected": -1.6999261379241943, + "logps/chosen": -368.1650085449219, + "logps/rejected": -455.1087951660156, + "loss": 0.593, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.3800055980682373, + "rewards/margins": 0.6139020323753357, + "rewards/rejected": -1.9939076900482178, + "step": 6980 + }, + { + "epoch": 0.91, + "learning_rate": 1.0977307858500818e-07, + "logits/chosen": -1.714484453201294, + "logits/rejected": -1.553884506225586, + "logps/chosen": -396.49676513671875, + "logps/rejected": -444.13494873046875, + "loss": 0.4963, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.282323956489563, + "rewards/margins": 0.7836967706680298, + "rewards/rejected": -2.0660204887390137, + "step": 6990 + }, + { + "epoch": 0.92, + "learning_rate": 1.0645049319972789e-07, + "logits/chosen": -1.7314668893814087, + "logits/rejected": -1.6142488718032837, + "logps/chosen": -424.98333740234375, + "logps/rejected": -467.7752990722656, + "loss": 0.5093, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.4310033321380615, + "rewards/margins": 0.9052546620368958, + "rewards/rejected": -2.3362579345703125, + "step": 7000 + }, + { + "epoch": 0.92, + "eval_logits/chosen": 0.812759518623352, + "eval_logits/rejected": 0.8539248704910278, + "eval_logps/chosen": -430.8410949707031, + "eval_logps/rejected": -484.165771484375, + "eval_loss": 0.5484933257102966, + "eval_rewards/accuracies": 0.7095000147819519, + "eval_rewards/chosen": -1.4554840326309204, + "eval_rewards/margins": 0.7474771738052368, + "eval_rewards/rejected": -2.2029612064361572, + "eval_runtime": 1172.8269, + "eval_samples_per_second": 1.705, + "eval_steps_per_second": 0.853, + "step": 7000 + }, + { + "epoch": 0.92, + "learning_rate": 1.0317787324895634e-07, + "logits/chosen": -1.8907572031021118, + "logits/rejected": -1.7053619623184204, + "logps/chosen": -454.3196716308594, + "logps/rejected": -511.54364013671875, + "loss": 0.4711, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.471142053604126, + "rewards/margins": 0.9644231796264648, + "rewards/rejected": -2.4355649948120117, + "step": 7010 + }, + { + "epoch": 0.92, + "learning_rate": 9.995528704875635e-08, + "logits/chosen": -1.7494027614593506, + "logits/rejected": -1.843583345413208, + "logps/chosen": -399.296142578125, + "logps/rejected": -481.3427734375, + "loss": 0.5744, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.471015214920044, + "rewards/margins": 0.6564980745315552, + "rewards/rejected": -2.1275134086608887, + "step": 7020 + }, + { + "epoch": 0.92, + "learning_rate": 9.678280187073452e-08, + "logits/chosen": -1.6823720932006836, + "logits/rejected": -1.6337709426879883, + "logps/chosen": -429.0879821777344, + "logps/rejected": -483.7962951660156, + "loss": 0.4381, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.2382405996322632, + "rewards/margins": 1.0622050762176514, + "rewards/rejected": -2.300445795059204, + "step": 7030 + }, + { + "epoch": 0.92, + "learning_rate": 9.366048394063549e-08, + "logits/chosen": -1.9416471719741821, + "logits/rejected": -1.8561681509017944, + "logps/chosen": -417.84527587890625, + "logps/rejected": -503.8096618652344, + "loss": 0.5097, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.3203117847442627, + "rewards/margins": 0.8082769513130188, + "rewards/rejected": -2.1285886764526367, + "step": 7040 + }, + { + "epoch": 0.92, + "learning_rate": 9.058839843696237e-08, + "logits/chosen": -1.8911237716674805, + "logits/rejected": -1.7695062160491943, + "logps/chosen": -437.6932067871094, + "logps/rejected": -481.0242614746094, + "loss": 0.5439, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.4225181341171265, + "rewards/margins": 0.7539501190185547, + "rewards/rejected": -2.1764683723449707, + "step": 7050 + }, + { + "epoch": 0.92, + "learning_rate": 8.756660948961299e-08, + "logits/chosen": -1.851973295211792, + "logits/rejected": -1.8292022943496704, + "logps/chosen": -406.18634033203125, + "logps/rejected": -482.53515625, + "loss": 0.5743, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.4888880252838135, + "rewards/margins": 0.6048755049705505, + "rewards/rejected": -2.093763589859009, + "step": 7060 + }, + { + "epoch": 0.93, + "learning_rate": 8.459518017854412e-08, + "logits/chosen": -1.8769149780273438, + "logits/rejected": -1.762813925743103, + "logps/chosen": -420.82550048828125, + "logps/rejected": -439.24658203125, + "loss": 0.6129, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.437957525253296, + "rewards/margins": 0.41024312376976013, + "rewards/rejected": -1.8482005596160889, + "step": 7070 + }, + { + "epoch": 0.93, + "learning_rate": 8.167417253245213e-08, + "logits/chosen": -1.799330711364746, + "logits/rejected": -1.6416656970977783, + "logps/chosen": -401.96075439453125, + "logps/rejected": -454.5691833496094, + "loss": 0.494, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.3122749328613281, + "rewards/margins": 0.7804873585700989, + "rewards/rejected": -2.0927624702453613, + "step": 7080 + }, + { + "epoch": 0.93, + "learning_rate": 7.880364752747948e-08, + "logits/chosen": -1.7926000356674194, + "logits/rejected": -1.7163807153701782, + "logps/chosen": -416.36773681640625, + "logps/rejected": -473.047607421875, + "loss": 0.5961, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.6541506052017212, + "rewards/margins": 0.5921451449394226, + "rewards/rejected": -2.246295690536499, + "step": 7090 + }, + { + "epoch": 0.93, + "learning_rate": 7.598366508594245e-08, + "logits/chosen": -1.8010107278823853, + "logits/rejected": -1.767491102218628, + "logps/chosen": -462.71319580078125, + "logps/rejected": -527.4778442382812, + "loss": 0.4665, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.4491039514541626, + "rewards/margins": 0.9631915092468262, + "rewards/rejected": -2.4122955799102783, + "step": 7100 + }, + { + "epoch": 0.93, + "eval_logits/chosen": 0.8128459453582764, + "eval_logits/rejected": 0.8539407253265381, + "eval_logps/chosen": -430.9034729003906, + "eval_logps/rejected": -484.2508850097656, + "eval_loss": 0.5485362410545349, + "eval_rewards/accuracies": 0.7099999785423279, + "eval_rewards/chosen": -1.4561071395874023, + "eval_rewards/margins": 0.7477050423622131, + "eval_rewards/rejected": -2.203812599182129, + "eval_runtime": 1172.6821, + "eval_samples_per_second": 1.705, + "eval_steps_per_second": 0.853, + "step": 7100 + }, + { + "epoch": 0.93, + "learning_rate": 7.32142840750788e-08, + "logits/chosen": -1.8629350662231445, + "logits/rejected": -1.6973631381988525, + "logps/chosen": -448.58514404296875, + "logps/rejected": -496.39508056640625, + "loss": 0.4596, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2828123569488525, + "rewards/margins": 0.9491091966629028, + "rewards/rejected": -2.231921672821045, + "step": 7110 + }, + { + "epoch": 0.93, + "learning_rate": 7.049556230581872e-08, + "logits/chosen": -1.7679073810577393, + "logits/rejected": -1.6131082773208618, + "logps/chosen": -414.78521728515625, + "logps/rejected": -452.10125732421875, + "loss": 0.6113, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.5814316272735596, + "rewards/margins": 0.5837645530700684, + "rewards/rejected": -2.165196180343628, + "step": 7120 + }, + { + "epoch": 0.93, + "learning_rate": 6.782755653158085e-08, + "logits/chosen": -1.8873008489608765, + "logits/rejected": -1.805215835571289, + "logps/chosen": -429.1568298339844, + "logps/rejected": -474.21484375, + "loss": 0.5254, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.3898502588272095, + "rewards/margins": 0.7227694988250732, + "rewards/rejected": -2.1126198768615723, + "step": 7130 + }, + { + "epoch": 0.93, + "learning_rate": 6.521032244708375e-08, + "logits/chosen": -1.6917879581451416, + "logits/rejected": -1.6695703268051147, + "logps/chosen": -420.2552795410156, + "logps/rejected": -478.6356506347656, + "loss": 0.6021, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.4334557056427002, + "rewards/margins": 0.6401674151420593, + "rewards/rejected": -2.0736231803894043, + "step": 7140 + }, + { + "epoch": 0.94, + "learning_rate": 6.264391468718628e-08, + "logits/chosen": -1.921420693397522, + "logits/rejected": -1.8295361995697021, + "logps/chosen": -416.1781311035156, + "logps/rejected": -477.92254638671875, + "loss": 0.5117, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.2795865535736084, + "rewards/margins": 0.805153489112854, + "rewards/rejected": -2.084740161895752, + "step": 7150 + }, + { + "epoch": 0.94, + "learning_rate": 6.012838682574462e-08, + "logits/chosen": -1.92929208278656, + "logits/rejected": -1.8011394739151, + "logps/chosen": -431.6207580566406, + "logps/rejected": -437.07916259765625, + "loss": 0.5315, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.4483994245529175, + "rewards/margins": 0.6417319178581238, + "rewards/rejected": -2.0901312828063965, + "step": 7160 + }, + { + "epoch": 0.94, + "learning_rate": 5.766379137449624e-08, + "logits/chosen": -1.821616768836975, + "logits/rejected": -1.80642831325531, + "logps/chosen": -387.9848327636719, + "logps/rejected": -483.4175720214844, + "loss": 0.5058, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.400189757347107, + "rewards/margins": 0.7653945684432983, + "rewards/rejected": -2.1655843257904053, + "step": 7170 + }, + { + "epoch": 0.94, + "learning_rate": 5.525017978196295e-08, + "logits/chosen": -2.0324506759643555, + "logits/rejected": -1.8615728616714478, + "logps/chosen": -441.63037109375, + "logps/rejected": -474.91741943359375, + "loss": 0.5405, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.4084820747375488, + "rewards/margins": 0.7742080688476562, + "rewards/rejected": -2.182690143585205, + "step": 7180 + }, + { + "epoch": 0.94, + "learning_rate": 5.288760243237545e-08, + "logits/chosen": -1.8969142436981201, + "logits/rejected": -1.8226633071899414, + "logps/chosen": -476.3313903808594, + "logps/rejected": -500.29693603515625, + "loss": 0.5595, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.5552139282226562, + "rewards/margins": 0.7276593446731567, + "rewards/rejected": -2.2828731536865234, + "step": 7190 + }, + { + "epoch": 0.94, + "learning_rate": 5.0576108644623536e-08, + "logits/chosen": -1.7604423761367798, + "logits/rejected": -1.576672911643982, + "logps/chosen": -485.7357482910156, + "logps/rejected": -487.3819274902344, + "loss": 0.6276, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.6866333484649658, + "rewards/margins": 0.6542980074882507, + "rewards/rejected": -2.3409314155578613, + "step": 7200 + }, + { + "epoch": 0.94, + "eval_logits/chosen": 0.8129886388778687, + "eval_logits/rejected": 0.8539232611656189, + "eval_logps/chosen": -430.8554382324219, + "eval_logps/rejected": -484.1955261230469, + "eval_loss": 0.548583447933197, + "eval_rewards/accuracies": 0.7110000252723694, + "eval_rewards/chosen": -1.4556269645690918, + "eval_rewards/margins": 0.7476316690444946, + "eval_rewards/rejected": -2.203258752822876, + "eval_runtime": 1173.429, + "eval_samples_per_second": 1.704, + "eval_steps_per_second": 0.852, + "step": 7200 + }, + { + "epoch": 0.94, + "learning_rate": 4.8315746671225296e-08, + "logits/chosen": -1.8429571390151978, + "logits/rejected": -1.6994577646255493, + "logps/chosen": -457.65594482421875, + "logps/rejected": -516.1876220703125, + "loss": 0.4917, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.343750238418579, + "rewards/margins": 0.8745664358139038, + "rewards/rejected": -2.2183165550231934, + "step": 7210 + }, + { + "epoch": 0.94, + "learning_rate": 4.6106563697320695e-08, + "logits/chosen": -1.7050548791885376, + "logits/rejected": -1.5491039752960205, + "logps/chosen": -397.58441162109375, + "logps/rejected": -462.5196228027344, + "loss": 0.5121, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.4449985027313232, + "rewards/margins": 0.9326278567314148, + "rewards/rejected": -2.377626419067383, + "step": 7220 + }, + { + "epoch": 0.95, + "learning_rate": 4.394860583968624e-08, + "logits/chosen": -1.7307459115982056, + "logits/rejected": -1.8069360256195068, + "logps/chosen": -362.82421875, + "logps/rejected": -458.86895751953125, + "loss": 0.5635, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4238734245300293, + "rewards/margins": 0.7079671621322632, + "rewards/rejected": -2.131840467453003, + "step": 7230 + }, + { + "epoch": 0.95, + "learning_rate": 4.1841918145771874e-08, + "logits/chosen": -1.8039257526397705, + "logits/rejected": -1.7260109186172485, + "logps/chosen": -419.90777587890625, + "logps/rejected": -495.89288330078125, + "loss": 0.4669, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.2997856140136719, + "rewards/margins": 0.9247959852218628, + "rewards/rejected": -2.2245819568634033, + "step": 7240 + }, + { + "epoch": 0.95, + "learning_rate": 3.978654459276088e-08, + "logits/chosen": -1.9709882736206055, + "logits/rejected": -1.841698408126831, + "logps/chosen": -469.30804443359375, + "logps/rejected": -484.8894958496094, + "loss": 0.5384, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.3955186605453491, + "rewards/margins": 0.803984522819519, + "rewards/rejected": -2.1995034217834473, + "step": 7250 + }, + { + "epoch": 0.95, + "learning_rate": 3.778252808665284e-08, + "logits/chosen": -2.043179512023926, + "logits/rejected": -1.958134651184082, + "logps/chosen": -479.24005126953125, + "logps/rejected": -458.7781677246094, + "loss": 0.5527, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.469952940940857, + "rewards/margins": 0.5939545631408691, + "rewards/rejected": -2.0639073848724365, + "step": 7260 + }, + { + "epoch": 0.95, + "learning_rate": 3.5829910461366023e-08, + "logits/chosen": -1.721975326538086, + "logits/rejected": -1.708946943283081, + "logps/chosen": -403.50286865234375, + "logps/rejected": -470.90777587890625, + "loss": 0.6299, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.4790034294128418, + "rewards/margins": 0.7231262922286987, + "rewards/rejected": -2.20212984085083, + "step": 7270 + }, + { + "epoch": 0.95, + "learning_rate": 3.39287324778656e-08, + "logits/chosen": -1.9802652597427368, + "logits/rejected": -1.9110679626464844, + "logps/chosen": -495.3759765625, + "logps/rejected": -515.5206909179688, + "loss": 0.6205, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.5836551189422607, + "rewards/margins": 0.6330682039260864, + "rewards/rejected": -2.216723680496216, + "step": 7280 + }, + { + "epoch": 0.95, + "learning_rate": 3.207903382331262e-08, + "logits/chosen": -1.7459537982940674, + "logits/rejected": -1.6916313171386719, + "logps/chosen": -444.4571228027344, + "logps/rejected": -485.93829345703125, + "loss": 0.5289, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.3292511701583862, + "rewards/margins": 0.8171280026435852, + "rewards/rejected": -2.146378993988037, + "step": 7290 + }, + { + "epoch": 0.96, + "learning_rate": 3.028085311023443e-08, + "logits/chosen": -1.7082124948501587, + "logits/rejected": -1.6282784938812256, + "logps/chosen": -425.0762634277344, + "logps/rejected": -481.3499450683594, + "loss": 0.457, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.2966469526290894, + "rewards/margins": 0.932367205619812, + "rewards/rejected": -2.2290141582489014, + "step": 7300 + }, + { + "epoch": 0.96, + "eval_logits/chosen": 0.8128509521484375, + "eval_logits/rejected": 0.8539592027664185, + "eval_logps/chosen": -430.7640075683594, + "eval_logps/rejected": -484.09423828125, + "eval_loss": 0.5485822558403015, + "eval_rewards/accuracies": 0.7110000252723694, + "eval_rewards/chosen": -1.4547128677368164, + "eval_rewards/margins": 0.7475329637527466, + "eval_rewards/rejected": -2.2022459506988525, + "eval_runtime": 1172.8041, + "eval_samples_per_second": 1.705, + "eval_steps_per_second": 0.853, + "step": 7300 + }, + { + "epoch": 0.96, + "learning_rate": 2.8534227875720576e-08, + "logits/chosen": -1.932652473449707, + "logits/rejected": -1.9709556102752686, + "logps/chosen": -431.27130126953125, + "logps/rejected": -491.1851501464844, + "loss": 0.5513, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.4966964721679688, + "rewards/margins": 0.7334243655204773, + "rewards/rejected": -2.2301206588745117, + "step": 7310 + }, + { + "epoch": 0.96, + "learning_rate": 2.683919458063705e-08, + "logits/chosen": -1.788451910018921, + "logits/rejected": -1.520784854888916, + "logps/chosen": -363.04632568359375, + "logps/rejected": -387.00390625, + "loss": 0.559, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.3995718955993652, + "rewards/margins": 0.6893042325973511, + "rewards/rejected": -2.088876247406006, + "step": 7320 + }, + { + "epoch": 0.96, + "learning_rate": 2.5195788608866345e-08, + "logits/chosen": -1.8044917583465576, + "logits/rejected": -1.6800788640975952, + "logps/chosen": -483.80987548828125, + "logps/rejected": -498.57293701171875, + "loss": 0.5478, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.3945772647857666, + "rewards/margins": 0.8723125457763672, + "rewards/rejected": -2.2668895721435547, + "step": 7330 + }, + { + "epoch": 0.96, + "learning_rate": 2.3604044266569426e-08, + "logits/chosen": -1.8498961925506592, + "logits/rejected": -1.608460783958435, + "logps/chosen": -443.71759033203125, + "logps/rejected": -481.8394470214844, + "loss": 0.5601, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.5067075490951538, + "rewards/margins": 0.72029709815979, + "rewards/rejected": -2.2270047664642334, + "step": 7340 + }, + { + "epoch": 0.96, + "learning_rate": 2.2063994781468256e-08, + "logits/chosen": -1.7632825374603271, + "logits/rejected": -1.7024528980255127, + "logps/chosen": -417.3409118652344, + "logps/rejected": -460.17144775390625, + "loss": 0.5216, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.375964641571045, + "rewards/margins": 0.7951169610023499, + "rewards/rejected": -2.171082019805908, + "step": 7350 + }, + { + "epoch": 0.96, + "learning_rate": 2.057567230215246e-08, + "logits/chosen": -1.9352567195892334, + "logits/rejected": -1.86801016330719, + "logps/chosen": -435.8123474121094, + "logps/rejected": -501.49761962890625, + "loss": 0.5404, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.5306956768035889, + "rewards/margins": 0.6156077980995178, + "rewards/rejected": -2.146303415298462, + "step": 7360 + }, + { + "epoch": 0.96, + "learning_rate": 1.9139107897409303e-08, + "logits/chosen": -1.764081597328186, + "logits/rejected": -1.732365369796753, + "logps/chosen": -438.1732482910156, + "logps/rejected": -472.97088623046875, + "loss": 0.4651, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.3602381944656372, + "rewards/margins": 0.9602410197257996, + "rewards/rejected": -2.320479154586792, + "step": 7370 + }, + { + "epoch": 0.97, + "learning_rate": 1.7754331555573656e-08, + "logits/chosen": -1.9736744165420532, + "logits/rejected": -1.8503955602645874, + "logps/chosen": -452.91314697265625, + "logps/rejected": -552.435791015625, + "loss": 0.5093, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4927455186843872, + "rewards/margins": 0.7449467182159424, + "rewards/rejected": -2.237692356109619, + "step": 7380 + }, + { + "epoch": 0.97, + "learning_rate": 1.642137218390294e-08, + "logits/chosen": -1.9133977890014648, + "logits/rejected": -1.7161099910736084, + "logps/chosen": -445.2350158691406, + "logps/rejected": -474.0205078125, + "loss": 0.5617, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.501450538635254, + "rewards/margins": 0.7882502675056458, + "rewards/rejected": -2.289700746536255, + "step": 7390 + }, + { + "epoch": 0.97, + "learning_rate": 1.514025760797344e-08, + "logits/chosen": -2.056525707244873, + "logits/rejected": -1.8239011764526367, + "logps/chosen": -483.4375, + "logps/rejected": -487.8236389160156, + "loss": 0.5436, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.394179105758667, + "rewards/margins": 0.7199534773826599, + "rewards/rejected": -2.1141324043273926, + "step": 7400 + }, + { + "epoch": 0.97, + "eval_logits/chosen": 0.8129631280899048, + "eval_logits/rejected": 0.8540742993354797, + "eval_logps/chosen": -430.8633728027344, + "eval_logps/rejected": -484.2209167480469, + "eval_loss": 0.5485607981681824, + "eval_rewards/accuracies": 0.7129999995231628, + "eval_rewards/chosen": -1.4557064771652222, + "eval_rewards/margins": 0.7478062510490417, + "eval_rewards/rejected": -2.203512668609619, + "eval_runtime": 1173.1991, + "eval_samples_per_second": 1.705, + "eval_steps_per_second": 0.852, + "step": 7400 + }, + { + "epoch": 0.97, + "learning_rate": 1.3911014571098835e-08, + "logits/chosen": -1.8888267278671265, + "logits/rejected": -1.8209940195083618, + "logps/chosen": -410.78131103515625, + "logps/rejected": -490.9281311035156, + "loss": 0.5512, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.4821470975875854, + "rewards/margins": 0.6978796124458313, + "rewards/rejected": -2.1800267696380615, + "step": 7410 + }, + { + "epoch": 0.97, + "learning_rate": 1.2733668733773685e-08, + "logits/chosen": -1.8667023181915283, + "logits/rejected": -1.763035774230957, + "logps/chosen": -422.2196350097656, + "logps/rejected": -472.96563720703125, + "loss": 0.4785, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.3575246334075928, + "rewards/margins": 0.8752638697624207, + "rewards/rejected": -2.232788562774658, + "step": 7420 + }, + { + "epoch": 0.97, + "learning_rate": 1.160824467313526e-08, + "logits/chosen": -1.9163471460342407, + "logits/rejected": -1.8418922424316406, + "logps/chosen": -469.48895263671875, + "logps/rejected": -527.531005859375, + "loss": 0.5388, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.4518873691558838, + "rewards/margins": 0.7550562024116516, + "rewards/rejected": -2.2069435119628906, + "step": 7430 + }, + { + "epoch": 0.97, + "learning_rate": 1.0534765882453113e-08, + "logits/chosen": -1.9798256158828735, + "logits/rejected": -1.8704360723495483, + "logps/chosen": -416.0478515625, + "logps/rejected": -476.11407470703125, + "loss": 0.5321, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.3923460245132446, + "rewards/margins": 0.7569142580032349, + "rewards/rejected": -2.1492602825164795, + "step": 7440 + }, + { + "epoch": 0.97, + "learning_rate": 9.513254770636138e-09, + "logits/chosen": -1.9356752634048462, + "logits/rejected": -1.822356939315796, + "logps/chosen": -485.7112731933594, + "logps/rejected": -521.7303466796875, + "loss": 0.6574, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7006072998046875, + "rewards/margins": 0.49051332473754883, + "rewards/rejected": -2.1911206245422363, + "step": 7450 + }, + { + "epoch": 0.98, + "learning_rate": 8.543732661767113e-09, + "logits/chosen": -1.7195825576782227, + "logits/rejected": -1.857712984085083, + "logps/chosen": -450.140380859375, + "logps/rejected": -496.9141540527344, + "loss": 0.6259, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.4755804538726807, + "rewards/margins": 0.47876566648483276, + "rewards/rejected": -1.9543460607528687, + "step": 7460 + }, + { + "epoch": 0.98, + "learning_rate": 7.626219794655553e-09, + "logits/chosen": -1.7965953350067139, + "logits/rejected": -1.7672102451324463, + "logps/chosen": -396.3360290527344, + "logps/rejected": -501.77081298828125, + "loss": 0.4944, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.3167970180511475, + "rewards/margins": 0.9213516116142273, + "rewards/rejected": -2.2381484508514404, + "step": 7470 + }, + { + "epoch": 0.98, + "learning_rate": 6.7607353224163896e-09, + "logits/chosen": -1.8362220525741577, + "logits/rejected": -1.7684568166732788, + "logps/chosen": -438.6917419433594, + "logps/rejected": -450.4482421875, + "loss": 0.5987, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.4603219032287598, + "rewards/margins": 0.5805224776268005, + "rewards/rejected": -2.040844440460205, + "step": 7480 + }, + { + "epoch": 0.98, + "learning_rate": 5.947297312070554e-09, + "logits/chosen": -1.8304193019866943, + "logits/rejected": -1.589911699295044, + "logps/chosen": -460.3739318847656, + "logps/rejected": -479.26934814453125, + "loss": 0.5077, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.4384151697158813, + "rewards/margins": 0.8745136260986328, + "rewards/rejected": -2.3129289150238037, + "step": 7490 + }, + { + "epoch": 0.98, + "learning_rate": 5.185922744166128e-09, + "logits/chosen": -1.8723042011260986, + "logits/rejected": -1.8588206768035889, + "logps/chosen": -432.6783142089844, + "logps/rejected": -502.30999755859375, + "loss": 0.4801, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.3651726245880127, + "rewards/margins": 0.8920621871948242, + "rewards/rejected": -2.257235050201416, + "step": 7500 + }, + { + "epoch": 0.98, + "eval_logits/chosen": 0.8125240206718445, + "eval_logits/rejected": 0.8537938594818115, + "eval_logps/chosen": -430.8403625488281, + "eval_logps/rejected": -484.1994323730469, + "eval_loss": 0.5486313700675964, + "eval_rewards/accuracies": 0.7124999761581421, + "eval_rewards/chosen": -1.4554764032363892, + "eval_rewards/margins": 0.747821569442749, + "eval_rewards/rejected": -2.2032980918884277, + "eval_runtime": 1172.9523, + "eval_samples_per_second": 1.705, + "eval_steps_per_second": 0.853, + "step": 7500 + }, + { + "epoch": 0.98, + "learning_rate": 4.476627512425558e-09, + "logits/chosen": -1.7662780284881592, + "logits/rejected": -1.7995166778564453, + "logps/chosen": -437.27130126953125, + "logps/rejected": -486.76220703125, + "loss": 0.6005, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.5092612504959106, + "rewards/margins": 0.5584593415260315, + "rewards/rejected": -2.067720651626587, + "step": 7510 + }, + { + "epoch": 0.98, + "learning_rate": 3.819426423412875e-09, + "logits/chosen": -1.9472389221191406, + "logits/rejected": -1.870298147201538, + "logps/chosen": -446.26641845703125, + "logps/rejected": -475.03253173828125, + "loss": 0.6559, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.4711787700653076, + "rewards/margins": 0.6127547025680542, + "rewards/rejected": -2.0839333534240723, + "step": 7520 + }, + { + "epoch": 0.99, + "learning_rate": 3.2143331962256053e-09, + "logits/chosen": -1.8179538249969482, + "logits/rejected": -1.8011947870254517, + "logps/chosen": -432.440185546875, + "logps/rejected": -500.1532287597656, + "loss": 0.5916, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.3934599161148071, + "rewards/margins": 0.6785745024681091, + "rewards/rejected": -2.0720343589782715, + "step": 7530 + }, + { + "epoch": 0.99, + "learning_rate": 2.6613604622066635e-09, + "logits/chosen": -1.9298431873321533, + "logits/rejected": -1.9188578128814697, + "logps/chosen": -392.70599365234375, + "logps/rejected": -483.16143798828125, + "loss": 0.5142, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.1843836307525635, + "rewards/margins": 0.7742894887924194, + "rewards/rejected": -1.9586732387542725, + "step": 7540 + }, + { + "epoch": 0.99, + "learning_rate": 2.1605197646826228e-09, + "logits/chosen": -1.72048020362854, + "logits/rejected": -1.58535635471344, + "logps/chosen": -393.91241455078125, + "logps/rejected": -453.22698974609375, + "loss": 0.5001, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.3305981159210205, + "rewards/margins": 0.8685491681098938, + "rewards/rejected": -2.1991469860076904, + "step": 7550 + }, + { + "epoch": 0.99, + "learning_rate": 1.711821558721405e-09, + "logits/chosen": -1.9033464193344116, + "logits/rejected": -1.7436285018920898, + "logps/chosen": -467.17437744140625, + "logps/rejected": -472.579833984375, + "loss": 0.5372, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.4799602031707764, + "rewards/margins": 0.6517667770385742, + "rewards/rejected": -2.1317269802093506, + "step": 7560 + }, + { + "epoch": 0.99, + "learning_rate": 1.3152752109149569e-09, + "logits/chosen": -1.8958499431610107, + "logits/rejected": -1.8162498474121094, + "logps/chosen": -447.59515380859375, + "logps/rejected": -489.57025146484375, + "loss": 0.5856, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.484851360321045, + "rewards/margins": 0.6364052891731262, + "rewards/rejected": -2.1212563514709473, + "step": 7570 + }, + { + "epoch": 0.99, + "learning_rate": 9.708889991830173e-10, + "logits/chosen": -1.8766504526138306, + "logits/rejected": -1.6922956705093384, + "logps/chosen": -444.44268798828125, + "logps/rejected": -451.81640625, + "loss": 0.5367, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.5096843242645264, + "rewards/margins": 0.752662181854248, + "rewards/rejected": -2.2623465061187744, + "step": 7580 + }, + { + "epoch": 0.99, + "learning_rate": 6.786701125999218e-10, + "logits/chosen": -1.6351743936538696, + "logits/rejected": -1.620650053024292, + "logps/chosen": -442.44403076171875, + "logps/rejected": -487.6390686035156, + "loss": 0.6604, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.7403841018676758, + "rewards/margins": 0.5352242588996887, + "rewards/rejected": -2.275608539581299, + "step": 7590 + }, + { + "epoch": 0.99, + "learning_rate": 4.3862465124638873e-10, + "logits/chosen": -1.736732840538025, + "logits/rejected": -1.741463303565979, + "logps/chosen": -422.96258544921875, + "logps/rejected": -466.79266357421875, + "loss": 0.5922, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4744383096694946, + "rewards/margins": 0.6053553819656372, + "rewards/rejected": -2.0797934532165527, + "step": 7600 + }, + { + "epoch": 0.99, + "eval_logits/chosen": 0.8123713135719299, + "eval_logits/rejected": 0.853736162185669, + "eval_logps/chosen": -430.84136962890625, + "eval_logps/rejected": -484.1860046386719, + "eval_loss": 0.5485937595367432, + "eval_rewards/accuracies": 0.7099999785423279, + "eval_rewards/chosen": -1.4554866552352905, + "eval_rewards/margins": 0.7476763129234314, + "eval_rewards/rejected": -2.2031631469726562, + "eval_runtime": 1173.62, + "eval_samples_per_second": 1.704, + "eval_steps_per_second": 0.852, + "step": 7600 + }, + { + "epoch": 1.0, + "learning_rate": 2.507576260799005e-10, + "logits/chosen": -2.041153907775879, + "logits/rejected": -1.9447238445281982, + "logps/chosen": -438.7113342285156, + "logps/rejected": -516.343505859375, + "loss": 0.4997, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3306516408920288, + "rewards/margins": 0.8777866363525391, + "rewards/rejected": -2.2084383964538574, + "step": 7610 + }, + { + "epoch": 1.0, + "learning_rate": 1.1507295883145253e-10, + "logits/chosen": -1.8992458581924438, + "logits/rejected": -1.8089666366577148, + "logps/chosen": -424.86865234375, + "logps/rejected": -513.1679077148438, + "loss": 0.4914, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.2947027683258057, + "rewards/margins": 0.8707477450370789, + "rewards/rejected": -2.16545033454895, + "step": 7620 + }, + { + "epoch": 1.0, + "learning_rate": 3.1573481923952156e-11, + "logits/chosen": -1.8514108657836914, + "logits/rejected": -1.7576173543930054, + "logps/chosen": -459.4273986816406, + "logps/rejected": -517.269287109375, + "loss": 0.5021, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.298668622970581, + "rewards/margins": 0.9450668096542358, + "rewards/rejected": -2.2437355518341064, + "step": 7630 + }, + { + "epoch": 1.0, + "learning_rate": 2.609384119889313e-13, + "logits/chosen": -1.6424258947372437, + "logits/rejected": -1.6611589193344116, + "logps/chosen": -414.85186767578125, + "logps/rejected": -503.81170654296875, + "loss": 0.5105, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.3835017681121826, + "rewards/margins": 0.8242697715759277, + "rewards/rejected": -2.2077715396881104, + "step": 7640 + }, + { + "epoch": 1.0, + "step": 7641, + "total_flos": 0.0, + "train_loss": 0.5756704107174588, + "train_runtime": 163576.2451, + "train_samples_per_second": 0.374, + "train_steps_per_second": 0.047 + } + ], + "logging_steps": 10, + "max_steps": 7641, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}