diff --git "a/harvard_lora-qrefVqa/trainer_state.json" "b/harvard_lora-qrefVqa/trainer_state.json" new file mode 100644--- /dev/null +++ "b/harvard_lora-qrefVqa/trainer_state.json" @@ -0,0 +1,11914 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 849, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 3.846153846153846e-09, + "logits/chosen": -2.097510576248169, + "logits/rejected": -2.119924306869507, + "logps/chosen": -11.372314453125, + "logps/rejected": -33.306610107421875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.01, + "learning_rate": 7.692307692307693e-09, + "logits/chosen": -2.1055893898010254, + "logits/rejected": -2.1095895767211914, + "logps/chosen": -10.376533508300781, + "logps/rejected": -19.702884674072266, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2 + }, + { + "epoch": 0.01, + "learning_rate": 1.1538461538461538e-08, + "logits/chosen": -2.0200953483581543, + "logits/rejected": -2.0211269855499268, + "logps/chosen": -9.433221817016602, + "logps/rejected": -9.342824935913086, + "loss": 0.6918, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.0019603255204856396, + "rewards/margins": -0.004571294877678156, + "rewards/rejected": 0.00261096959002316, + "step": 3 + }, + { + "epoch": 0.01, + "learning_rate": 1.5384615384615385e-08, + "logits/chosen": -2.040437698364258, + "logits/rejected": -2.046257734298706, + "logps/chosen": -11.848332405090332, + "logps/rejected": -14.236579895019531, + "loss": 0.6921, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0038475512992590666, + "rewards/margins": 0.008034134283661842, + "rewards/rejected": -0.011881684884428978, + "step": 4 + }, + { + "epoch": 0.02, + "learning_rate": 1.923076923076923e-08, + "logits/chosen": -2.099327802658081, + "logits/rejected": -2.1057448387145996, + "logps/chosen": -10.55780029296875, + "logps/rejected": -20.279972076416016, + "loss": 0.6932, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003068304155021906, + "rewards/margins": 0.0006995678413659334, + "rewards/rejected": -0.0037678717635571957, + "step": 5 + }, + { + "epoch": 0.02, + "learning_rate": 2.3076923076923076e-08, + "logits/chosen": -2.0648746490478516, + "logits/rejected": -2.0698909759521484, + "logps/chosen": -10.364034652709961, + "logps/rejected": -14.481033325195312, + "loss": 0.6968, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0005358697380870581, + "rewards/margins": 0.0007348540239036083, + "rewards/rejected": -0.00019898428581655025, + "step": 6 + }, + { + "epoch": 0.02, + "learning_rate": 2.692307692307692e-08, + "logits/chosen": -2.116549253463745, + "logits/rejected": -2.120208978652954, + "logps/chosen": -12.404097557067871, + "logps/rejected": -15.303730010986328, + "loss": 0.6944, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.002380562014877796, + "rewards/margins": -0.0020289896056056023, + "rewards/rejected": -0.00035157217644155025, + "step": 7 + }, + { + "epoch": 0.03, + "learning_rate": 3.076923076923077e-08, + "logits/chosen": -2.129392623901367, + "logits/rejected": -2.13861083984375, + "logps/chosen": -12.869179725646973, + "logps/rejected": -20.393476486206055, + "loss": 0.6907, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.015720844268798828, + "rewards/margins": 0.019379710778594017, + "rewards/rejected": -0.003658866975456476, + "step": 8 + }, + { + "epoch": 0.03, + "learning_rate": 3.4615384615384616e-08, + "logits/chosen": -2.082509994506836, + "logits/rejected": -2.092583656311035, + "logps/chosen": -11.582996368408203, + "logps/rejected": -25.843406677246094, + "loss": 0.6943, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.002738523529842496, + "rewards/margins": 0.0006840226706117392, + "rewards/rejected": 0.0020545008592307568, + "step": 9 + }, + { + "epoch": 0.04, + "learning_rate": 3.846153846153846e-08, + "logits/chosen": -2.0870673656463623, + "logits/rejected": -2.0959627628326416, + "logps/chosen": -8.461482048034668, + "logps/rejected": -12.77489948272705, + "loss": 0.6978, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0003260135417804122, + "rewards/margins": -0.0015049456851556897, + "rewards/rejected": 0.0018309593433514237, + "step": 10 + }, + { + "epoch": 0.04, + "learning_rate": 4.230769230769231e-08, + "logits/chosen": -2.036034107208252, + "logits/rejected": -2.041839838027954, + "logps/chosen": -9.941839218139648, + "logps/rejected": -13.342981338500977, + "loss": 0.6952, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0012675285106524825, + "rewards/margins": -0.005756044760346413, + "rewards/rejected": 0.00702357292175293, + "step": 11 + }, + { + "epoch": 0.04, + "learning_rate": 4.615384615384615e-08, + "logits/chosen": -2.09169602394104, + "logits/rejected": -2.0904858112335205, + "logps/chosen": -9.701820373535156, + "logps/rejected": -9.197052001953125, + "loss": 0.6927, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.010931158438324928, + "rewards/margins": -0.01592230796813965, + "rewards/rejected": 0.0049911499954760075, + "step": 12 + }, + { + "epoch": 0.05, + "learning_rate": 5e-08, + "logits/chosen": -2.04725980758667, + "logits/rejected": -2.05619740486145, + "logps/chosen": -14.081567764282227, + "logps/rejected": -13.698946952819824, + "loss": 0.6924, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.004795551300048828, + "rewards/margins": 0.005252575967460871, + "rewards/rejected": -0.0004570246674120426, + "step": 13 + }, + { + "epoch": 0.05, + "learning_rate": 5.384615384615384e-08, + "logits/chosen": -2.109088182449341, + "logits/rejected": -2.1060073375701904, + "logps/chosen": -9.33390998840332, + "logps/rejected": -20.0125675201416, + "loss": 0.6938, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0005416393396444619, + "rewards/margins": 0.004609442315995693, + "rewards/rejected": -0.004067802336066961, + "step": 14 + }, + { + "epoch": 0.05, + "learning_rate": 5.769230769230768e-08, + "logits/chosen": -2.0645275115966797, + "logits/rejected": -2.064751625061035, + "logps/chosen": -23.257171630859375, + "logps/rejected": -17.14521026611328, + "loss": 0.693, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0009992599952965975, + "rewards/margins": -0.011056138202548027, + "rewards/rejected": 0.012055397033691406, + "step": 15 + }, + { + "epoch": 0.06, + "learning_rate": 6.153846153846154e-08, + "logits/chosen": -2.0690627098083496, + "logits/rejected": -2.0702261924743652, + "logps/chosen": -9.12561321258545, + "logps/rejected": -9.311273574829102, + "loss": 0.6936, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0021490096114575863, + "rewards/margins": 0.0010279176058247685, + "rewards/rejected": 0.0011210921220481396, + "step": 16 + }, + { + "epoch": 0.06, + "learning_rate": 6.538461538461538e-08, + "logits/chosen": -2.100080966949463, + "logits/rejected": -2.103487253189087, + "logps/chosen": -9.528514862060547, + "logps/rejected": -23.89231300354004, + "loss": 0.693, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.004167318344116211, + "rewards/margins": 0.006782484240829945, + "rewards/rejected": -0.0026151658967137337, + "step": 17 + }, + { + "epoch": 0.06, + "learning_rate": 6.923076923076923e-08, + "logits/chosen": -2.0232815742492676, + "logits/rejected": -2.0248076915740967, + "logps/chosen": -7.857854843139648, + "logps/rejected": -8.982934951782227, + "loss": 0.6969, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0023724795319139957, + "rewards/margins": -0.002675557043403387, + "rewards/rejected": 0.005048036575317383, + "step": 18 + }, + { + "epoch": 0.07, + "learning_rate": 7.307692307692307e-08, + "logits/chosen": -2.0792500972747803, + "logits/rejected": -2.0829238891601562, + "logps/chosen": -10.368392944335938, + "logps/rejected": -13.925779342651367, + "loss": 0.696, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0008166789775714278, + "rewards/margins": -3.139977343380451e-05, + "rewards/rejected": 0.0008480787510052323, + "step": 19 + }, + { + "epoch": 0.07, + "learning_rate": 7.692307692307692e-08, + "logits/chosen": -2.0435268878936768, + "logits/rejected": -2.039405345916748, + "logps/chosen": -14.17733383178711, + "logps/rejected": -18.850263595581055, + "loss": 0.6933, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0006655216566286981, + "rewards/margins": 0.0034564496017992496, + "rewards/rejected": -0.004121971316635609, + "step": 20 + }, + { + "epoch": 0.07, + "learning_rate": 8.076923076923076e-08, + "logits/chosen": -2.1756489276885986, + "logits/rejected": -2.151092052459717, + "logps/chosen": -9.293466567993164, + "logps/rejected": -12.510316848754883, + "loss": 0.6918, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.008032942190766335, + "rewards/margins": 0.0070709227584302425, + "rewards/rejected": 0.0009620189666748047, + "step": 21 + }, + { + "epoch": 0.08, + "learning_rate": 8.461538461538461e-08, + "logits/chosen": -2.060006856918335, + "logits/rejected": -2.0592617988586426, + "logps/chosen": -10.06259822845459, + "logps/rejected": -19.066829681396484, + "loss": 0.6925, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.002218628069385886, + "rewards/margins": 0.0008467198349535465, + "rewards/rejected": -0.003065347671508789, + "step": 22 + }, + { + "epoch": 0.08, + "learning_rate": 8.846153846153845e-08, + "logits/chosen": -2.1863443851470947, + "logits/rejected": -2.186579704284668, + "logps/chosen": -10.898603439331055, + "logps/rejected": -9.618656158447266, + "loss": 0.6919, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.013610411435365677, + "rewards/margins": 0.008812570944428444, + "rewards/rejected": 0.004797840025275946, + "step": 23 + }, + { + "epoch": 0.08, + "learning_rate": 9.23076923076923e-08, + "logits/chosen": -2.1300365924835205, + "logits/rejected": -2.141152858734131, + "logps/chosen": -11.082178115844727, + "logps/rejected": -27.433414459228516, + "loss": 0.6934, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.005371618550270796, + "rewards/margins": -0.00020413403399288654, + "rewards/rejected": 0.005575752351433039, + "step": 24 + }, + { + "epoch": 0.09, + "learning_rate": 9.615384615384616e-08, + "logits/chosen": -2.1097612380981445, + "logits/rejected": -2.112030029296875, + "logps/chosen": -9.428064346313477, + "logps/rejected": -20.1447696685791, + "loss": 0.6916, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.014590883627533913, + "rewards/margins": 0.01878209225833416, + "rewards/rejected": -0.00419120816513896, + "step": 25 + }, + { + "epoch": 0.09, + "learning_rate": 1e-07, + "logits/chosen": -2.0811123847961426, + "logits/rejected": -2.092834711074829, + "logps/chosen": -13.305941581726074, + "logps/rejected": -18.802751541137695, + "loss": 0.6914, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.01761798933148384, + "rewards/margins": 0.024689197540283203, + "rewards/rejected": -0.007071209140121937, + "step": 26 + }, + { + "epoch": 0.1, + "learning_rate": 9.999963571645328e-08, + "logits/chosen": -2.107715129852295, + "logits/rejected": -2.1136362552642822, + "logps/chosen": -21.053009033203125, + "logps/rejected": -15.247662544250488, + "loss": 0.6914, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.007356214802712202, + "rewards/margins": 0.008664179593324661, + "rewards/rejected": -0.0013079643249511719, + "step": 27 + }, + { + "epoch": 0.1, + "learning_rate": 9.999854287112121e-08, + "logits/chosen": -2.106203317642212, + "logits/rejected": -2.101030111312866, + "logps/chosen": -9.724105834960938, + "logps/rejected": -12.390012741088867, + "loss": 0.691, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.016744565218687057, + "rewards/margins": 0.0039815898053348064, + "rewards/rejected": 0.012762976810336113, + "step": 28 + }, + { + "epoch": 0.1, + "learning_rate": 9.999672147992805e-08, + "logits/chosen": -2.147495746612549, + "logits/rejected": -2.1486103534698486, + "logps/chosen": -10.287918090820312, + "logps/rejected": -11.839522361755371, + "loss": 0.6913, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.017780780792236328, + "rewards/margins": 0.0028064725920557976, + "rewards/rejected": 0.014974309131503105, + "step": 29 + }, + { + "epoch": 0.11, + "learning_rate": 9.999417156941388e-08, + "logits/chosen": -2.092768430709839, + "logits/rejected": -2.0952470302581787, + "logps/chosen": -8.360401153564453, + "logps/rejected": -14.62700080871582, + "loss": 0.6921, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.014501571655273438, + "rewards/margins": 0.009264994412660599, + "rewards/rejected": 0.005236578173935413, + "step": 30 + }, + { + "epoch": 0.11, + "learning_rate": 9.999089317673432e-08, + "logits/chosen": -2.1355152130126953, + "logits/rejected": -2.1367437839508057, + "logps/chosen": -9.422966003417969, + "logps/rejected": -7.761822700500488, + "loss": 0.6926, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.018777895718812943, + "rewards/margins": 0.008191894739866257, + "rewards/rejected": 0.010585999116301537, + "step": 31 + }, + { + "epoch": 0.11, + "learning_rate": 9.998688634965994e-08, + "logits/chosen": -2.117422103881836, + "logits/rejected": -2.1283791065216064, + "logps/chosen": -9.015121459960938, + "logps/rejected": -17.3316707611084, + "loss": 0.6909, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.02167201042175293, + "rewards/margins": 0.004514789208769798, + "rewards/rejected": 0.01715722121298313, + "step": 32 + }, + { + "epoch": 0.12, + "learning_rate": 9.998215114657563e-08, + "logits/chosen": -2.0608222484588623, + "logits/rejected": -2.072838068008423, + "logps/chosen": -10.97737979888916, + "logps/rejected": -20.068649291992188, + "loss": 0.6905, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0284452922642231, + "rewards/margins": 0.023051120340824127, + "rewards/rejected": 0.005394172854721546, + "step": 33 + }, + { + "epoch": 0.12, + "learning_rate": 9.997668763647961e-08, + "logits/chosen": -2.0181727409362793, + "logits/rejected": -2.022264003753662, + "logps/chosen": -8.996482849121094, + "logps/rejected": -20.64644432067871, + "loss": 0.6902, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.023577261716127396, + "rewards/margins": -0.008658742532134056, + "rewards/rejected": 0.0322360023856163, + "step": 34 + }, + { + "epoch": 0.12, + "learning_rate": 9.997049589898259e-08, + "logits/chosen": -2.041630506515503, + "logits/rejected": -2.070546865463257, + "logps/chosen": -7.790151596069336, + "logps/rejected": -36.7705192565918, + "loss": 0.6888, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.015954280272126198, + "rewards/margins": -0.010411953553557396, + "rewards/rejected": 0.026366233825683594, + "step": 35 + }, + { + "epoch": 0.13, + "learning_rate": 9.996357602430646e-08, + "logits/chosen": -2.020752429962158, + "logits/rejected": -2.0208966732025146, + "logps/chosen": -8.187994956970215, + "logps/rejected": -9.604312896728516, + "loss": 0.6973, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.025165606290102005, + "rewards/margins": 3.0185095965862274e-05, + "rewards/rejected": 0.025135422125458717, + "step": 36 + }, + { + "epoch": 0.13, + "learning_rate": 9.995592811328309e-08, + "logits/chosen": -2.110893487930298, + "logits/rejected": -2.1664650440216064, + "logps/chosen": -20.323095321655273, + "logps/rejected": -21.13479232788086, + "loss": 0.6877, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03332533687353134, + "rewards/margins": 0.03722415119409561, + "rewards/rejected": -0.003898811526596546, + "step": 37 + }, + { + "epoch": 0.13, + "learning_rate": 9.994755227735282e-08, + "logits/chosen": -2.087979316711426, + "logits/rejected": -2.0860090255737305, + "logps/chosen": -21.18497657775879, + "logps/rejected": -18.570045471191406, + "loss": 0.6873, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04009499400854111, + "rewards/margins": 0.02089395560324192, + "rewards/rejected": 0.019201040267944336, + "step": 38 + }, + { + "epoch": 0.14, + "learning_rate": 9.99384486385628e-08, + "logits/chosen": -2.0037944316864014, + "logits/rejected": -2.0017518997192383, + "logps/chosen": -12.245137214660645, + "logps/rejected": -10.076192855834961, + "loss": 0.6952, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.020441342145204544, + "rewards/margins": -0.013841009698808193, + "rewards/rejected": 0.03428234905004501, + "step": 39 + }, + { + "epoch": 0.14, + "learning_rate": 9.992861732956528e-08, + "logits/chosen": -2.1184048652648926, + "logits/rejected": -2.1239118576049805, + "logps/chosen": -9.899801254272461, + "logps/rejected": -9.981226921081543, + "loss": 0.6915, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.031125212088227272, + "rewards/margins": 0.004374503158032894, + "rewards/rejected": 0.026750709861516953, + "step": 40 + }, + { + "epoch": 0.14, + "learning_rate": 9.99180584936156e-08, + "logits/chosen": -2.1456151008605957, + "logits/rejected": -2.1501035690307617, + "logps/chosen": -12.040979385375977, + "logps/rejected": -9.064493179321289, + "loss": 0.6887, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03932995721697807, + "rewards/margins": 0.01530761644244194, + "rewards/rejected": 0.024022340774536133, + "step": 41 + }, + { + "epoch": 0.15, + "learning_rate": 9.990677228457021e-08, + "logits/chosen": -2.073322057723999, + "logits/rejected": -2.080109119415283, + "logps/chosen": -10.221904754638672, + "logps/rejected": -13.421606063842773, + "loss": 0.6901, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.058562517166137695, + "rewards/margins": 0.021431325003504753, + "rewards/rejected": 0.03713119029998779, + "step": 42 + }, + { + "epoch": 0.15, + "learning_rate": 9.989475886688428e-08, + "logits/chosen": -2.0823450088500977, + "logits/rejected": -2.080927848815918, + "logps/chosen": -11.3592529296875, + "logps/rejected": -7.6746416091918945, + "loss": 0.687, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.04347651079297066, + "rewards/margins": 0.015666866675019264, + "rewards/rejected": 0.027809644117951393, + "step": 43 + }, + { + "epoch": 0.16, + "learning_rate": 9.988201841560944e-08, + "logits/chosen": -2.0874502658843994, + "logits/rejected": -2.125110626220703, + "logps/chosen": -10.047807693481445, + "logps/rejected": -13.724536895751953, + "loss": 0.6871, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.046552374958992004, + "rewards/margins": 0.024516511708498, + "rewards/rejected": 0.022035861387848854, + "step": 44 + }, + { + "epoch": 0.16, + "learning_rate": 9.986855111639116e-08, + "logits/chosen": -2.0733938217163086, + "logits/rejected": -2.071836471557617, + "logps/chosen": -12.546595573425293, + "logps/rejected": -8.60620403289795, + "loss": 0.6901, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.045719340443611145, + "rewards/margins": 0.010957004502415657, + "rewards/rejected": 0.03476233407855034, + "step": 45 + }, + { + "epoch": 0.16, + "learning_rate": 9.985435716546606e-08, + "logits/chosen": -2.102259397506714, + "logits/rejected": -2.1540231704711914, + "logps/chosen": -9.814385414123535, + "logps/rejected": -21.571456909179688, + "loss": 0.6877, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.04839963838458061, + "rewards/margins": 0.006337450817227364, + "rewards/rejected": 0.0420621857047081, + "step": 46 + }, + { + "epoch": 0.17, + "learning_rate": 9.983943676965907e-08, + "logits/chosen": -2.0776894092559814, + "logits/rejected": -2.0837106704711914, + "logps/chosen": -11.596502304077148, + "logps/rejected": -6.630319595336914, + "loss": 0.6883, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05289759486913681, + "rewards/margins": 0.01256708987057209, + "rewards/rejected": 0.04033050686120987, + "step": 47 + }, + { + "epoch": 0.17, + "learning_rate": 9.982379014638034e-08, + "logits/chosen": -2.0488228797912598, + "logits/rejected": -2.058349132537842, + "logps/chosen": -11.765326499938965, + "logps/rejected": -7.866977691650391, + "loss": 0.6851, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05442643165588379, + "rewards/margins": 0.01524507999420166, + "rewards/rejected": 0.03918135166168213, + "step": 48 + }, + { + "epoch": 0.17, + "learning_rate": 9.980741752362221e-08, + "logits/chosen": -2.060767412185669, + "logits/rejected": -2.0779690742492676, + "logps/chosen": -12.060062408447266, + "logps/rejected": -9.311071395874023, + "loss": 0.6961, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0519011989235878, + "rewards/margins": 0.021114613860845566, + "rewards/rejected": 0.030786585062742233, + "step": 49 + }, + { + "epoch": 0.18, + "learning_rate": 9.979031913995573e-08, + "logits/chosen": -2.035358428955078, + "logits/rejected": -2.053785562515259, + "logps/chosen": -11.271171569824219, + "logps/rejected": -18.149864196777344, + "loss": 0.6924, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06728596985340118, + "rewards/margins": 0.01150042936205864, + "rewards/rejected": 0.055785536766052246, + "step": 50 + }, + { + "epoch": 0.18, + "learning_rate": 9.97724952445273e-08, + "logits/chosen": -2.169144630432129, + "logits/rejected": -2.180375337600708, + "logps/chosen": -10.308534622192383, + "logps/rejected": -18.4023494720459, + "loss": 0.692, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06545813381671906, + "rewards/margins": 0.009809613227844238, + "rewards/rejected": 0.05564852058887482, + "step": 51 + }, + { + "epoch": 0.18, + "learning_rate": 9.975394609705503e-08, + "logits/chosen": -2.128641366958618, + "logits/rejected": -2.1256725788116455, + "logps/chosen": -10.16600227355957, + "logps/rejected": -10.622486114501953, + "loss": 0.6881, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06465206295251846, + "rewards/margins": 0.01646161451935768, + "rewards/rejected": 0.04819045215845108, + "step": 52 + }, + { + "epoch": 0.19, + "learning_rate": 9.973467196782483e-08, + "logits/chosen": -2.1218433380126953, + "logits/rejected": -2.1238653659820557, + "logps/chosen": -8.758686065673828, + "logps/rejected": -17.917177200317383, + "loss": 0.6898, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.06241035461425781, + "rewards/margins": -0.0009089931845664978, + "rewards/rejected": 0.06331934779882431, + "step": 53 + }, + { + "epoch": 0.19, + "learning_rate": 9.971467313768667e-08, + "logits/chosen": -2.1037755012512207, + "logits/rejected": -2.103863477706909, + "logps/chosen": -9.44898796081543, + "logps/rejected": -14.776172637939453, + "loss": 0.6861, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.07580962777137756, + "rewards/margins": -0.0004600510001182556, + "rewards/rejected": 0.07626967132091522, + "step": 54 + }, + { + "epoch": 0.19, + "learning_rate": 9.969394989805033e-08, + "logits/chosen": -2.0558416843414307, + "logits/rejected": -2.0587964057922363, + "logps/chosen": -6.988409042358398, + "logps/rejected": -7.644698143005371, + "loss": 0.6891, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0686829537153244, + "rewards/margins": 0.018300294876098633, + "rewards/rejected": 0.05038266256451607, + "step": 55 + }, + { + "epoch": 0.2, + "learning_rate": 9.96725025508812e-08, + "logits/chosen": -2.0616981983184814, + "logits/rejected": -2.0679736137390137, + "logps/chosen": -10.079291343688965, + "logps/rejected": -8.145094871520996, + "loss": 0.6885, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.08156780898571014, + "rewards/margins": 0.031128644943237305, + "rewards/rejected": 0.05043916776776314, + "step": 56 + }, + { + "epoch": 0.2, + "learning_rate": 9.965033140869594e-08, + "logits/chosen": -2.050377368927002, + "logits/rejected": -2.0517618656158447, + "logps/chosen": -9.657028198242188, + "logps/rejected": -15.797605514526367, + "loss": 0.6841, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08444122970104218, + "rewards/margins": 0.03363938629627228, + "rewards/rejected": 0.050801850855350494, + "step": 57 + }, + { + "epoch": 0.2, + "learning_rate": 9.962743679455782e-08, + "logits/chosen": -2.102853775024414, + "logits/rejected": -2.108802080154419, + "logps/chosen": -9.64322280883789, + "logps/rejected": -13.328766822814941, + "loss": 0.6943, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09340968728065491, + "rewards/margins": 0.03464534506201744, + "rewards/rejected": 0.05876433849334717, + "step": 58 + }, + { + "epoch": 0.21, + "learning_rate": 9.960381904207209e-08, + "logits/chosen": -2.1222074031829834, + "logits/rejected": -2.122331380844116, + "logps/chosen": -18.60080909729004, + "logps/rejected": -10.812004089355469, + "loss": 0.6976, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.08437414467334747, + "rewards/margins": -0.010586503893136978, + "rewards/rejected": 0.09496064484119415, + "step": 59 + }, + { + "epoch": 0.21, + "learning_rate": 9.957947849538111e-08, + "logits/chosen": -2.0448548793792725, + "logits/rejected": -2.0557830333709717, + "logps/chosen": -10.124374389648438, + "logps/rejected": -13.674853324890137, + "loss": 0.6814, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.08518097549676895, + "rewards/margins": 0.012413430958986282, + "rewards/rejected": 0.07276754081249237, + "step": 60 + }, + { + "epoch": 0.22, + "learning_rate": 9.955441550915929e-08, + "logits/chosen": -2.0689234733581543, + "logits/rejected": -2.073192834854126, + "logps/chosen": -8.8046236038208, + "logps/rejected": -27.179088592529297, + "loss": 0.682, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09696832299232483, + "rewards/margins": 0.018537092953920364, + "rewards/rejected": 0.07843122631311417, + "step": 61 + }, + { + "epoch": 0.22, + "learning_rate": 9.952863044860797e-08, + "logits/chosen": -2.049260139465332, + "logits/rejected": -2.059544563293457, + "logps/chosen": -11.074729919433594, + "logps/rejected": -17.73105239868164, + "loss": 0.6892, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0740656852722168, + "rewards/margins": -0.01779666170477867, + "rewards/rejected": 0.09186235070228577, + "step": 62 + }, + { + "epoch": 0.22, + "learning_rate": 9.950212368945013e-08, + "logits/chosen": -1.9984493255615234, + "logits/rejected": -1.997687578201294, + "logps/chosen": -7.344542503356934, + "logps/rejected": -10.283039093017578, + "loss": 0.683, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08235251903533936, + "rewards/margins": 0.012254027649760246, + "rewards/rejected": 0.07009849697351456, + "step": 63 + }, + { + "epoch": 0.23, + "learning_rate": 9.947489561792475e-08, + "logits/chosen": -2.1543097496032715, + "logits/rejected": -2.1560754776000977, + "logps/chosen": -9.04195499420166, + "logps/rejected": -17.853330612182617, + "loss": 0.6854, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10793473571538925, + "rewards/margins": 0.03549639880657196, + "rewards/rejected": 0.07243833690881729, + "step": 64 + }, + { + "epoch": 0.23, + "learning_rate": 9.944694663078139e-08, + "logits/chosen": -2.1258819103240967, + "logits/rejected": -2.124260663986206, + "logps/chosen": -17.276477813720703, + "logps/rejected": -11.566054344177246, + "loss": 0.6781, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10849504917860031, + "rewards/margins": 0.05518022179603577, + "rewards/rejected": 0.05331483110785484, + "step": 65 + }, + { + "epoch": 0.23, + "learning_rate": 9.941827713527433e-08, + "logits/chosen": -2.013655662536621, + "logits/rejected": -2.0460946559906006, + "logps/chosen": -17.086772918701172, + "logps/rejected": -20.625690460205078, + "loss": 0.6906, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10770726203918457, + "rewards/margins": 0.029364541172981262, + "rewards/rejected": 0.07834272086620331, + "step": 66 + }, + { + "epoch": 0.24, + "learning_rate": 9.938888754915656e-08, + "logits/chosen": -2.1242778301239014, + "logits/rejected": -2.1253678798675537, + "logps/chosen": -12.099685668945312, + "logps/rejected": -10.097085952758789, + "loss": 0.6762, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.07341685891151428, + "rewards/margins": -0.016077373176813126, + "rewards/rejected": 0.08949422836303711, + "step": 67 + }, + { + "epoch": 0.24, + "learning_rate": 9.935877830067379e-08, + "logits/chosen": -2.0230395793914795, + "logits/rejected": -2.0240256786346436, + "logps/chosen": -12.051183700561523, + "logps/rejected": -8.571578979492188, + "loss": 0.6725, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.12944316864013672, + "rewards/margins": 0.021613217890262604, + "rewards/rejected": 0.10782995820045471, + "step": 68 + }, + { + "epoch": 0.24, + "learning_rate": 9.932794982855817e-08, + "logits/chosen": -2.050057888031006, + "logits/rejected": -2.08294415473938, + "logps/chosen": -8.920730590820312, + "logps/rejected": -15.853729248046875, + "loss": 0.6917, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10334315896034241, + "rewards/margins": 0.027882816269993782, + "rewards/rejected": 0.07546033710241318, + "step": 69 + }, + { + "epoch": 0.25, + "learning_rate": 9.929640258202191e-08, + "logits/chosen": -2.034510850906372, + "logits/rejected": -2.0355947017669678, + "logps/chosen": -9.406510353088379, + "logps/rejected": -8.679980278015137, + "loss": 0.676, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1371043175458908, + "rewards/margins": 0.030440162867307663, + "rewards/rejected": 0.10666415840387344, + "step": 70 + }, + { + "epoch": 0.25, + "learning_rate": 9.926413702075073e-08, + "logits/chosen": -2.050097703933716, + "logits/rejected": -2.052020311355591, + "logps/chosen": -7.396944046020508, + "logps/rejected": -15.598074913024902, + "loss": 0.6925, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.08399474620819092, + "rewards/margins": -0.02992434799671173, + "rewards/rejected": 0.11391909420490265, + "step": 71 + }, + { + "epoch": 0.25, + "learning_rate": 9.923115361489718e-08, + "logits/chosen": -2.041449785232544, + "logits/rejected": -2.052797317504883, + "logps/chosen": -9.681797981262207, + "logps/rejected": -12.710771560668945, + "loss": 0.6899, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.13242563605308533, + "rewards/margins": 0.0304136723279953, + "rewards/rejected": 0.10201197117567062, + "step": 72 + }, + { + "epoch": 0.26, + "learning_rate": 9.919745284507368e-08, + "logits/chosen": -2.138528347015381, + "logits/rejected": -2.1719486713409424, + "logps/chosen": -11.22022819519043, + "logps/rejected": -15.938741683959961, + "loss": 0.683, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.10667119175195694, + "rewards/margins": 0.008025597780942917, + "rewards/rejected": 0.09864559769630432, + "step": 73 + }, + { + "epoch": 0.26, + "learning_rate": 9.916303520234571e-08, + "logits/chosen": -2.068274974822998, + "logits/rejected": -2.074456214904785, + "logps/chosen": -7.726677894592285, + "logps/rejected": -17.548513412475586, + "loss": 0.6773, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1414797008037567, + "rewards/margins": 0.018967553973197937, + "rewards/rejected": 0.12251215428113937, + "step": 74 + }, + { + "epoch": 0.27, + "learning_rate": 9.912790118822451e-08, + "logits/chosen": -2.1544888019561768, + "logits/rejected": -2.233659029006958, + "logps/chosen": -8.123453140258789, + "logps/rejected": -33.27145004272461, + "loss": 0.7006, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14580073952674866, + "rewards/margins": 0.05218074098229408, + "rewards/rejected": 0.09361998736858368, + "step": 75 + }, + { + "epoch": 0.27, + "learning_rate": 9.909205131465978e-08, + "logits/chosen": -2.1424713134765625, + "logits/rejected": -2.1581528186798096, + "logps/chosen": -11.182374000549316, + "logps/rejected": -8.008686065673828, + "loss": 0.673, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20076484978199005, + "rewards/margins": 0.0919303447008133, + "rewards/rejected": 0.10883450508117676, + "step": 76 + }, + { + "epoch": 0.27, + "learning_rate": 9.905548610403232e-08, + "logits/chosen": -2.1171116828918457, + "logits/rejected": -2.1237363815307617, + "logps/chosen": -8.29359245300293, + "logps/rejected": -16.20252799987793, + "loss": 0.6811, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20946750044822693, + "rewards/margins": 0.08721654117107391, + "rewards/rejected": 0.12225095927715302, + "step": 77 + }, + { + "epoch": 0.28, + "learning_rate": 9.90182060891463e-08, + "logits/chosen": -2.0021650791168213, + "logits/rejected": -2.005268096923828, + "logps/chosen": -10.561080932617188, + "logps/rejected": -9.047115325927734, + "loss": 0.6783, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.13553543388843536, + "rewards/margins": 0.026949431747198105, + "rewards/rejected": 0.10858599841594696, + "step": 78 + }, + { + "epoch": 0.28, + "learning_rate": 9.898021181322156e-08, + "logits/chosen": -2.0893473625183105, + "logits/rejected": -2.095689296722412, + "logps/chosen": -6.52653694152832, + "logps/rejected": -13.82789421081543, + "loss": 0.6872, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14994114637374878, + "rewards/margins": 0.020168043673038483, + "rewards/rejected": 0.1297730952501297, + "step": 79 + }, + { + "epoch": 0.28, + "learning_rate": 9.894150382988569e-08, + "logits/chosen": -2.1102893352508545, + "logits/rejected": -2.170872211456299, + "logps/chosen": -13.700004577636719, + "logps/rejected": -11.067142486572266, + "loss": 0.6533, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20735913515090942, + "rewards/margins": 0.13651591539382935, + "rewards/rejected": 0.07084321975708008, + "step": 80 + }, + { + "epoch": 0.29, + "learning_rate": 9.890208270316594e-08, + "logits/chosen": -2.0818259716033936, + "logits/rejected": -2.0917532444000244, + "logps/chosen": -8.942182540893555, + "logps/rejected": -18.319435119628906, + "loss": 0.6975, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.19034011662006378, + "rewards/margins": 0.009798318147659302, + "rewards/rejected": 0.18054179847240448, + "step": 81 + }, + { + "epoch": 0.29, + "learning_rate": 9.886194900748101e-08, + "logits/chosen": -2.105217456817627, + "logits/rejected": -2.1030564308166504, + "logps/chosen": -16.090484619140625, + "logps/rejected": -9.614008903503418, + "loss": 0.6781, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.1324107050895691, + "rewards/margins": -0.008041314780712128, + "rewards/rejected": 0.14045201241970062, + "step": 82 + }, + { + "epoch": 0.29, + "learning_rate": 9.882110332763274e-08, + "logits/chosen": -2.1609046459198, + "logits/rejected": -2.166107177734375, + "logps/chosen": -11.106157302856445, + "logps/rejected": -6.214755535125732, + "loss": 0.6647, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.23929864168167114, + "rewards/margins": 0.13347239792346954, + "rewards/rejected": 0.105826236307621, + "step": 83 + }, + { + "epoch": 0.3, + "learning_rate": 9.877954625879745e-08, + "logits/chosen": -2.1204068660736084, + "logits/rejected": -2.1244850158691406, + "logps/chosen": -11.31182861328125, + "logps/rejected": -7.365429878234863, + "loss": 0.6848, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.16703271865844727, + "rewards/margins": 0.022074386477470398, + "rewards/rejected": 0.14495833218097687, + "step": 84 + }, + { + "epoch": 0.3, + "learning_rate": 9.873727840651744e-08, + "logits/chosen": -2.07621693611145, + "logits/rejected": -2.070263147354126, + "logps/chosen": -10.696220397949219, + "logps/rejected": -7.109923362731934, + "loss": 0.6748, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.16924360394477844, + "rewards/margins": 0.050597697496414185, + "rewards/rejected": 0.11864590644836426, + "step": 85 + }, + { + "epoch": 0.3, + "learning_rate": 9.869430038669201e-08, + "logits/chosen": -2.0692458152770996, + "logits/rejected": -2.0642073154449463, + "logps/chosen": -9.274160385131836, + "logps/rejected": -18.43902587890625, + "loss": 0.6826, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.14094658195972443, + "rewards/margins": -0.05492105334997177, + "rewards/rejected": 0.1958676278591156, + "step": 86 + }, + { + "epoch": 0.31, + "learning_rate": 9.865061282556859e-08, + "logits/chosen": -2.080880880355835, + "logits/rejected": -2.0824923515319824, + "logps/chosen": -9.72069263458252, + "logps/rejected": -6.209484100341797, + "loss": 0.6826, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2217741310596466, + "rewards/margins": 0.09113580733537674, + "rewards/rejected": 0.13063831627368927, + "step": 87 + }, + { + "epoch": 0.31, + "learning_rate": 9.860621635973354e-08, + "logits/chosen": -2.131855010986328, + "logits/rejected": -2.1392621994018555, + "logps/chosen": -8.715415000915527, + "logps/rejected": -14.866212844848633, + "loss": 0.6676, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20188544690608978, + "rewards/margins": 0.06384273618459702, + "rewards/rejected": 0.13804271817207336, + "step": 88 + }, + { + "epoch": 0.31, + "learning_rate": 9.856111163610299e-08, + "logits/chosen": -2.081540584564209, + "logits/rejected": -2.072953224182129, + "logps/chosen": -10.758820533752441, + "logps/rejected": -16.539304733276367, + "loss": 0.6974, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.15370512008666992, + "rewards/margins": -0.04202170670032501, + "rewards/rejected": 0.19572682678699493, + "step": 89 + }, + { + "epoch": 0.32, + "learning_rate": 9.851529931191324e-08, + "logits/chosen": -2.155614137649536, + "logits/rejected": -2.166372299194336, + "logps/chosen": -7.4568257331848145, + "logps/rejected": -26.76165771484375, + "loss": 0.6716, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.25703856348991394, + "rewards/margins": 0.0407138392329216, + "rewards/rejected": 0.21632471680641174, + "step": 90 + }, + { + "epoch": 0.32, + "learning_rate": 9.846878005471137e-08, + "logits/chosen": -2.0905909538269043, + "logits/rejected": -2.0886566638946533, + "logps/chosen": -10.362454414367676, + "logps/rejected": -8.164715766906738, + "loss": 0.6756, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.12827114760875702, + "rewards/margins": -0.039729926735162735, + "rewards/rejected": 0.16800108551979065, + "step": 91 + }, + { + "epoch": 0.33, + "learning_rate": 9.842155454234537e-08, + "logits/chosen": -2.1349334716796875, + "logits/rejected": -2.136669397354126, + "logps/chosen": -9.82733154296875, + "logps/rejected": -9.594762802124023, + "loss": 0.6804, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21031531691551208, + "rewards/margins": 0.07720570266246796, + "rewards/rejected": 0.13310962915420532, + "step": 92 + }, + { + "epoch": 0.33, + "learning_rate": 9.837362346295429e-08, + "logits/chosen": -2.1058225631713867, + "logits/rejected": -2.1150882244110107, + "logps/chosen": -9.686410903930664, + "logps/rejected": -6.37019157409668, + "loss": 0.6583, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2418338656425476, + "rewards/margins": 0.12519530951976776, + "rewards/rejected": 0.11663857102394104, + "step": 93 + }, + { + "epoch": 0.33, + "learning_rate": 9.832498751495831e-08, + "logits/chosen": -2.0473146438598633, + "logits/rejected": -2.0501928329467773, + "logps/chosen": -9.824817657470703, + "logps/rejected": -18.43427276611328, + "loss": 0.6663, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.21768590807914734, + "rewards/margins": 0.07504191249608994, + "rewards/rejected": 0.142644003033638, + "step": 94 + }, + { + "epoch": 0.34, + "learning_rate": 9.827564740704846e-08, + "logits/chosen": -2.0649924278259277, + "logits/rejected": -2.0654473304748535, + "logps/chosen": -6.286008834838867, + "logps/rejected": -8.296812057495117, + "loss": 0.659, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2209237515926361, + "rewards/margins": 0.05174841359257698, + "rewards/rejected": 0.16917534172534943, + "step": 95 + }, + { + "epoch": 0.34, + "learning_rate": 9.822560385817629e-08, + "logits/chosen": -2.072606325149536, + "logits/rejected": -2.070835828781128, + "logps/chosen": -7.147233009338379, + "logps/rejected": -15.896069526672363, + "loss": 0.6791, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2533239424228668, + "rewards/margins": 0.00022558867931365967, + "rewards/rejected": 0.25309833884239197, + "step": 96 + }, + { + "epoch": 0.34, + "learning_rate": 9.817485759754347e-08, + "logits/chosen": -2.0586330890655518, + "logits/rejected": -2.059922218322754, + "logps/chosen": -8.090757369995117, + "logps/rejected": -15.444175720214844, + "loss": 0.6769, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2008999139070511, + "rewards/margins": 0.016382336616516113, + "rewards/rejected": 0.18451757729053497, + "step": 97 + }, + { + "epoch": 0.35, + "learning_rate": 9.812340936459113e-08, + "logits/chosen": -2.0044758319854736, + "logits/rejected": -2.006527900695801, + "logps/chosen": -4.96382999420166, + "logps/rejected": -9.902922630310059, + "loss": 0.6891, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.17210343480110168, + "rewards/margins": -0.04763215780258179, + "rewards/rejected": 0.21973559260368347, + "step": 98 + }, + { + "epoch": 0.35, + "learning_rate": 9.807125990898903e-08, + "logits/chosen": -1.9936691522598267, + "logits/rejected": -2.0013482570648193, + "logps/chosen": -6.440214157104492, + "logps/rejected": -7.474986553192139, + "loss": 0.6491, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2744210660457611, + "rewards/margins": 0.12064732611179352, + "rewards/rejected": 0.1537737399339676, + "step": 99 + }, + { + "epoch": 0.35, + "learning_rate": 9.801840999062475e-08, + "logits/chosen": -2.039762496948242, + "logits/rejected": -2.0372893810272217, + "logps/chosen": -6.915624141693115, + "logps/rejected": -9.040582656860352, + "loss": 0.6642, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.223637193441391, + "rewards/margins": 0.00830569863319397, + "rewards/rejected": 0.21533149480819702, + "step": 100 + }, + { + "epoch": 0.36, + "learning_rate": 9.796486037959251e-08, + "logits/chosen": -2.0876975059509277, + "logits/rejected": -2.0925588607788086, + "logps/chosen": -8.059728622436523, + "logps/rejected": -7.5098981857299805, + "loss": 0.6469, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2728281617164612, + "rewards/margins": 0.10026481002569199, + "rewards/rejected": 0.1725633442401886, + "step": 101 + }, + { + "epoch": 0.36, + "learning_rate": 9.791061185618196e-08, + "logits/chosen": -2.0858378410339355, + "logits/rejected": -2.0836315155029297, + "logps/chosen": -5.805757522583008, + "logps/rejected": -11.781646728515625, + "loss": 0.6738, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.23338955640792847, + "rewards/margins": -0.02429869771003723, + "rewards/rejected": 0.2576882541179657, + "step": 102 + }, + { + "epoch": 0.36, + "learning_rate": 9.785566521086695e-08, + "logits/chosen": -2.02596378326416, + "logits/rejected": -2.0266895294189453, + "logps/chosen": -13.891460418701172, + "logps/rejected": -9.332324981689453, + "loss": 0.7046, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2910730838775635, + "rewards/margins": 0.09379493445158005, + "rewards/rejected": 0.19727817177772522, + "step": 103 + }, + { + "epoch": 0.37, + "learning_rate": 9.780002124429376e-08, + "logits/chosen": -1.9952216148376465, + "logits/rejected": -2.0248308181762695, + "logps/chosen": -7.935598373413086, + "logps/rejected": -16.02955436706543, + "loss": 0.6497, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.33796823024749756, + "rewards/margins": 0.21762225031852722, + "rewards/rejected": 0.12034597992897034, + "step": 104 + }, + { + "epoch": 0.37, + "learning_rate": 9.77436807672697e-08, + "logits/chosen": -2.017425298690796, + "logits/rejected": -2.017908811569214, + "logps/chosen": -6.93657922744751, + "logps/rejected": -7.069596290588379, + "loss": 0.6809, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.19175004959106445, + "rewards/margins": -0.018778249621391296, + "rewards/rejected": 0.21052829921245575, + "step": 105 + }, + { + "epoch": 0.37, + "learning_rate": 9.768664460075112e-08, + "logits/chosen": -2.041365385055542, + "logits/rejected": -2.047417640686035, + "logps/chosen": -7.031099319458008, + "logps/rejected": -11.466200828552246, + "loss": 0.6476, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3030357360839844, + "rewards/margins": 0.06942635774612427, + "rewards/rejected": 0.2336093634366989, + "step": 106 + }, + { + "epoch": 0.38, + "learning_rate": 9.762891357583147e-08, + "logits/chosen": -2.0753724575042725, + "logits/rejected": -2.0770294666290283, + "logps/chosen": -6.268417835235596, + "logps/rejected": -13.116997718811035, + "loss": 0.6915, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2917788028717041, + "rewards/margins": 0.03700032830238342, + "rewards/rejected": 0.25477850437164307, + "step": 107 + }, + { + "epoch": 0.38, + "learning_rate": 9.757048853372927e-08, + "logits/chosen": -2.1022837162017822, + "logits/rejected": -2.107455015182495, + "logps/chosen": -10.66141128540039, + "logps/rejected": -5.718726634979248, + "loss": 0.6477, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3974106013774872, + "rewards/margins": 0.19205310940742493, + "rewards/rejected": 0.20535749197006226, + "step": 108 + }, + { + "epoch": 0.39, + "learning_rate": 9.751137032577579e-08, + "logits/chosen": -2.038050651550293, + "logits/rejected": -2.045031785964966, + "logps/chosen": -6.796262264251709, + "logps/rejected": -8.053932189941406, + "loss": 0.6361, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.32452598214149475, + "rewards/margins": 0.11952994018793106, + "rewards/rejected": 0.2049960494041443, + "step": 109 + }, + { + "epoch": 0.39, + "learning_rate": 9.745155981340262e-08, + "logits/chosen": -2.102818012237549, + "logits/rejected": -2.105161666870117, + "logps/chosen": -5.016627311706543, + "logps/rejected": -14.66352367401123, + "loss": 0.685, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2693566083908081, + "rewards/margins": 0.024307221174240112, + "rewards/rejected": 0.245049387216568, + "step": 110 + }, + { + "epoch": 0.39, + "learning_rate": 9.739105786812923e-08, + "logits/chosen": -2.1339924335479736, + "logits/rejected": -2.148538827896118, + "logps/chosen": -7.39410400390625, + "logps/rejected": -23.348529815673828, + "loss": 0.6876, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2899773418903351, + "rewards/margins": 0.03281037509441376, + "rewards/rejected": 0.2571669816970825, + "step": 111 + }, + { + "epoch": 0.4, + "learning_rate": 9.73298653715501e-08, + "logits/chosen": -2.1342453956604004, + "logits/rejected": -2.1448278427124023, + "logps/chosen": -6.072168350219727, + "logps/rejected": -8.947652816772461, + "loss": 0.6499, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.31404024362564087, + "rewards/margins": 0.0964769795536995, + "rewards/rejected": 0.21756324172019958, + "step": 112 + }, + { + "epoch": 0.4, + "learning_rate": 9.726798321532203e-08, + "logits/chosen": -2.134019374847412, + "logits/rejected": -2.1309566497802734, + "logps/chosen": -7.6180291175842285, + "logps/rejected": -15.05532455444336, + "loss": 0.6714, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.23129568994045258, + "rewards/margins": 0.0009798109531402588, + "rewards/rejected": 0.23031587898731232, + "step": 113 + }, + { + "epoch": 0.4, + "learning_rate": 9.720541230115112e-08, + "logits/chosen": -2.101468324661255, + "logits/rejected": -2.100236177444458, + "logps/chosen": -5.7438740730285645, + "logps/rejected": -10.041770935058594, + "loss": 0.6951, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2661612927913666, + "rewards/margins": -0.016392461955547333, + "rewards/rejected": 0.2825537621974945, + "step": 114 + }, + { + "epoch": 0.41, + "learning_rate": 9.714215354077949e-08, + "logits/chosen": -2.0260679721832275, + "logits/rejected": -2.0820138454437256, + "logps/chosen": -5.1623640060424805, + "logps/rejected": -25.682668685913086, + "loss": 0.668, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3329288959503174, + "rewards/margins": 0.08901158720254898, + "rewards/rejected": 0.243917316198349, + "step": 115 + }, + { + "epoch": 0.41, + "learning_rate": 9.707820785597218e-08, + "logits/chosen": -2.0410354137420654, + "logits/rejected": -2.053378105163574, + "logps/chosen": -10.27570629119873, + "logps/rejected": -6.435758113861084, + "loss": 0.6546, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.32476818561553955, + "rewards/margins": 0.07557767629623413, + "rewards/rejected": 0.2491905242204666, + "step": 116 + }, + { + "epoch": 0.41, + "learning_rate": 9.701357617850363e-08, + "logits/chosen": -2.0508875846862793, + "logits/rejected": -2.0604872703552246, + "logps/chosen": -9.831592559814453, + "logps/rejected": -11.271739959716797, + "loss": 0.6517, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.38692569732666016, + "rewards/margins": 0.1330094337463379, + "rewards/rejected": 0.25391626358032227, + "step": 117 + }, + { + "epoch": 0.42, + "learning_rate": 9.694825945014413e-08, + "logits/chosen": -1.9894123077392578, + "logits/rejected": -1.9946706295013428, + "logps/chosen": -16.549367904663086, + "logps/rejected": -6.677373886108398, + "loss": 0.6497, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3764038681983948, + "rewards/margins": 0.17335736751556396, + "rewards/rejected": 0.2030465006828308, + "step": 118 + }, + { + "epoch": 0.42, + "learning_rate": 9.688225862264603e-08, + "logits/chosen": -2.061980962753296, + "logits/rejected": -2.0704870223999023, + "logps/chosen": -16.514728546142578, + "logps/rejected": -16.752994537353516, + "loss": 0.6592, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3266030550003052, + "rewards/margins": 0.07418781518936157, + "rewards/rejected": 0.2524152398109436, + "step": 119 + }, + { + "epoch": 0.42, + "learning_rate": 9.681557465772995e-08, + "logits/chosen": -2.117605209350586, + "logits/rejected": -2.127908229827881, + "logps/chosen": -10.8696870803833, + "logps/rejected": -5.878535270690918, + "loss": 0.6683, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.31065842509269714, + "rewards/margins": 0.12004665285348892, + "rewards/rejected": 0.19061177968978882, + "step": 120 + }, + { + "epoch": 0.43, + "learning_rate": 9.674820852707075e-08, + "logits/chosen": -2.0668861865997314, + "logits/rejected": -2.0581531524658203, + "logps/chosen": -7.816673755645752, + "logps/rejected": -17.748355865478516, + "loss": 0.66, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.35392099618911743, + "rewards/margins": 0.038230374455451965, + "rewards/rejected": 0.31569063663482666, + "step": 121 + }, + { + "epoch": 0.43, + "learning_rate": 9.668016121228336e-08, + "logits/chosen": -1.9980697631835938, + "logits/rejected": -1.9957993030548096, + "logps/chosen": -6.275465965270996, + "logps/rejected": -8.977691650390625, + "loss": 0.7177, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.32538849115371704, + "rewards/margins": 0.017597824335098267, + "rewards/rejected": 0.3077906370162964, + "step": 122 + }, + { + "epoch": 0.43, + "learning_rate": 9.661143370490845e-08, + "logits/chosen": -2.0757241249084473, + "logits/rejected": -2.0759809017181396, + "logps/chosen": -4.807711601257324, + "logps/rejected": -16.099239349365234, + "loss": 0.7113, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3960738480091095, + "rewards/margins": 0.05741699039936066, + "rewards/rejected": 0.33865684270858765, + "step": 123 + }, + { + "epoch": 0.44, + "learning_rate": 9.654202700639805e-08, + "logits/chosen": -2.072523355484009, + "logits/rejected": -2.0882208347320557, + "logps/chosen": -5.547380447387695, + "logps/rejected": -16.632116317749023, + "loss": 0.642, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.33295485377311707, + "rewards/margins": 0.07279514521360397, + "rewards/rejected": 0.2601597011089325, + "step": 124 + }, + { + "epoch": 0.44, + "learning_rate": 9.647194212810085e-08, + "logits/chosen": -2.046043634414673, + "logits/rejected": -2.047804355621338, + "logps/chosen": -6.768145561218262, + "logps/rejected": -6.009703636169434, + "loss": 0.6621, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3930690288543701, + "rewards/margins": 0.043035656213760376, + "rewards/rejected": 0.35003334283828735, + "step": 125 + }, + { + "epoch": 0.45, + "learning_rate": 9.64011800912476e-08, + "logits/chosen": -2.101191759109497, + "logits/rejected": -2.1078927516937256, + "logps/chosen": -6.333587646484375, + "logps/rejected": -5.583258628845215, + "loss": 0.6682, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.36752966046333313, + "rewards/margins": 0.05936174839735031, + "rewards/rejected": 0.3081679046154022, + "step": 126 + }, + { + "epoch": 0.45, + "learning_rate": 9.632974192693612e-08, + "logits/chosen": -2.0088415145874023, + "logits/rejected": -2.0087015628814697, + "logps/chosen": -15.140894889831543, + "logps/rejected": -6.892490863800049, + "loss": 0.6788, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.38981926441192627, + "rewards/margins": -0.0042158812284469604, + "rewards/rejected": 0.3940351605415344, + "step": 127 + }, + { + "epoch": 0.45, + "learning_rate": 9.625762867611635e-08, + "logits/chosen": -2.064101219177246, + "logits/rejected": -2.069530963897705, + "logps/chosen": -6.735413551330566, + "logps/rejected": -13.289546966552734, + "loss": 0.6756, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.38232937455177307, + "rewards/margins": -0.0031363070011138916, + "rewards/rejected": 0.38546568155288696, + "step": 128 + }, + { + "epoch": 0.46, + "learning_rate": 9.61848413895751e-08, + "logits/chosen": -2.0021445751190186, + "logits/rejected": -2.0113677978515625, + "logps/chosen": -6.49336576461792, + "logps/rejected": -13.440930366516113, + "loss": 0.6674, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4244805872440338, + "rewards/margins": 0.100160613656044, + "rewards/rejected": 0.3243199586868286, + "step": 129 + }, + { + "epoch": 0.46, + "learning_rate": 9.61113811279208e-08, + "logits/chosen": -2.060149908065796, + "logits/rejected": -2.061184883117676, + "logps/chosen": -4.910290241241455, + "logps/rejected": -7.441489219665527, + "loss": 0.6856, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3278893828392029, + "rewards/margins": -0.12167346477508545, + "rewards/rejected": 0.44956284761428833, + "step": 130 + }, + { + "epoch": 0.46, + "learning_rate": 9.603724896156804e-08, + "logits/chosen": -2.011277675628662, + "logits/rejected": -2.016819953918457, + "logps/chosen": -5.506241798400879, + "logps/rejected": -17.78220558166504, + "loss": 0.6382, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4682408571243286, + "rewards/margins": 0.08237965404987335, + "rewards/rejected": 0.38586121797561646, + "step": 131 + }, + { + "epoch": 0.47, + "learning_rate": 9.596244597072196e-08, + "logits/chosen": -2.0329580307006836, + "logits/rejected": -2.0430383682250977, + "logps/chosen": -5.942636013031006, + "logps/rejected": -25.331932067871094, + "loss": 0.6456, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4781746566295624, + "rewards/margins": 0.049496233463287354, + "rewards/rejected": 0.428678423166275, + "step": 132 + }, + { + "epoch": 0.47, + "learning_rate": 9.588697324536252e-08, + "logits/chosen": -2.091088056564331, + "logits/rejected": -2.093322277069092, + "logps/chosen": -9.067215919494629, + "logps/rejected": -6.298609733581543, + "loss": 0.6267, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4100271463394165, + "rewards/margins": 0.06899859011173248, + "rewards/rejected": 0.3410285413265228, + "step": 133 + }, + { + "epoch": 0.47, + "learning_rate": 9.581083188522861e-08, + "logits/chosen": -2.1164286136627197, + "logits/rejected": -2.1220200061798096, + "logps/chosen": -5.468056678771973, + "logps/rejected": -4.47393274307251, + "loss": 0.6587, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4522343873977661, + "rewards/margins": 0.18559186160564423, + "rewards/rejected": 0.2666425108909607, + "step": 134 + }, + { + "epoch": 0.48, + "learning_rate": 9.5734022999802e-08, + "logits/chosen": -1.991593837738037, + "logits/rejected": -2.0018534660339355, + "logps/chosen": -8.244100570678711, + "logps/rejected": -10.307732582092285, + "loss": 0.6088, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4423764944076538, + "rewards/margins": 0.08367283642292023, + "rewards/rejected": 0.3587036728858948, + "step": 135 + }, + { + "epoch": 0.48, + "learning_rate": 9.565654770829122e-08, + "logits/chosen": -1.9938260316848755, + "logits/rejected": -2.0008606910705566, + "logps/chosen": -4.6375837326049805, + "logps/rejected": -18.531368255615234, + "loss": 0.6909, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.39690840244293213, + "rewards/margins": -0.10321375727653503, + "rewards/rejected": 0.5001221895217896, + "step": 136 + }, + { + "epoch": 0.48, + "learning_rate": 9.557840713961524e-08, + "logits/chosen": -2.0475218296051025, + "logits/rejected": -2.0484542846679688, + "logps/chosen": -3.86293888092041, + "logps/rejected": -8.57626724243164, + "loss": 0.661, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.36302345991134644, + "rewards/margins": 0.07512373477220535, + "rewards/rejected": 0.2878997027873993, + "step": 137 + }, + { + "epoch": 0.49, + "learning_rate": 9.5499602432387e-08, + "logits/chosen": -1.9594106674194336, + "logits/rejected": -1.9610954523086548, + "logps/chosen": -4.423036098480225, + "logps/rejected": -10.982858657836914, + "loss": 0.6859, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.49251383543014526, + "rewards/margins": 0.0602782666683197, + "rewards/rejected": 0.43223559856414795, + "step": 138 + }, + { + "epoch": 0.49, + "learning_rate": 9.542013473489682e-08, + "logits/chosen": -2.1190593242645264, + "logits/rejected": -2.1184122562408447, + "logps/chosen": -3.897106409072876, + "logps/rejected": -6.540000915527344, + "loss": 0.6953, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.44496697187423706, + "rewards/margins": 0.006479114294052124, + "rewards/rejected": 0.43848782777786255, + "step": 139 + }, + { + "epoch": 0.49, + "learning_rate": 9.534000520509568e-08, + "logits/chosen": -1.9902158975601196, + "logits/rejected": -1.9847174882888794, + "logps/chosen": -12.275193214416504, + "logps/rejected": -7.072300910949707, + "loss": 0.6773, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4759800434112549, + "rewards/margins": 0.013349711894989014, + "rewards/rejected": 0.46263033151626587, + "step": 140 + }, + { + "epoch": 0.5, + "learning_rate": 9.525921501057839e-08, + "logits/chosen": -2.0582785606384277, + "logits/rejected": -2.064080238342285, + "logps/chosen": -4.842461585998535, + "logps/rejected": -7.109132766723633, + "loss": 0.6488, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5059958696365356, + "rewards/margins": 0.19790901243686676, + "rewards/rejected": 0.3080868124961853, + "step": 141 + }, + { + "epoch": 0.5, + "learning_rate": 9.517776532856645e-08, + "logits/chosen": -2.0587408542633057, + "logits/rejected": -2.055891275405884, + "logps/chosen": -4.141119003295898, + "logps/rejected": -5.117815971374512, + "loss": 0.6832, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.39498692750930786, + "rewards/margins": -0.18272733688354492, + "rewards/rejected": 0.5777142643928528, + "step": 142 + }, + { + "epoch": 0.51, + "learning_rate": 9.509565734589105e-08, + "logits/chosen": -2.135819435119629, + "logits/rejected": -2.133920431137085, + "logps/chosen": -5.776573181152344, + "logps/rejected": -6.607836723327637, + "loss": 0.6283, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.537788987159729, + "rewards/margins": 0.1487315595149994, + "rewards/rejected": 0.3890573978424072, + "step": 143 + }, + { + "epoch": 0.51, + "learning_rate": 9.501289225897565e-08, + "logits/chosen": -2.093254804611206, + "logits/rejected": -2.0936217308044434, + "logps/chosen": -4.246147155761719, + "logps/rejected": -6.032503128051758, + "loss": 0.6467, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.43081557750701904, + "rewards/margins": -0.024669155478477478, + "rewards/rejected": 0.4554847478866577, + "step": 144 + }, + { + "epoch": 0.51, + "learning_rate": 9.492947127381865e-08, + "logits/chosen": -2.130110025405884, + "logits/rejected": -2.1359012126922607, + "logps/chosen": -4.633965969085693, + "logps/rejected": -14.135638236999512, + "loss": 0.6858, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4832161068916321, + "rewards/margins": 0.005404025316238403, + "rewards/rejected": 0.4778120517730713, + "step": 145 + }, + { + "epoch": 0.52, + "learning_rate": 9.484539560597575e-08, + "logits/chosen": -2.1190459728240967, + "logits/rejected": -2.1017165184020996, + "logps/chosen": -7.074264049530029, + "logps/rejected": -24.996551513671875, + "loss": 0.6558, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4535711407661438, + "rewards/margins": -0.06576403975486755, + "rewards/rejected": 0.519335150718689, + "step": 146 + }, + { + "epoch": 0.52, + "learning_rate": 9.476066648054222e-08, + "logits/chosen": -2.0275509357452393, + "logits/rejected": -2.033064603805542, + "logps/chosen": -4.5024027824401855, + "logps/rejected": -14.351448059082031, + "loss": 0.6222, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.46561646461486816, + "rewards/margins": 0.01169826090335846, + "rewards/rejected": 0.4539182186126709, + "step": 147 + }, + { + "epoch": 0.52, + "learning_rate": 9.467528513213514e-08, + "logits/chosen": -2.0159475803375244, + "logits/rejected": -2.017779588699341, + "logps/chosen": -14.601338386535645, + "logps/rejected": -12.364233016967773, + "loss": 0.6393, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.48921793699264526, + "rewards/margins": 0.06680598855018616, + "rewards/rejected": 0.4224119186401367, + "step": 148 + }, + { + "epoch": 0.53, + "learning_rate": 9.458925280487531e-08, + "logits/chosen": -2.095726728439331, + "logits/rejected": -2.0944690704345703, + "logps/chosen": -5.0145344734191895, + "logps/rejected": -5.931546211242676, + "loss": 0.7295, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.45960116386413574, + "rewards/margins": -0.1628980189561844, + "rewards/rejected": 0.6224991679191589, + "step": 149 + }, + { + "epoch": 0.53, + "learning_rate": 9.450257075236918e-08, + "logits/chosen": -2.0839316844940186, + "logits/rejected": -2.1145522594451904, + "logps/chosen": -5.021351337432861, + "logps/rejected": -11.703817367553711, + "loss": 0.6491, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6066639423370361, + "rewards/margins": 0.3898150324821472, + "rewards/rejected": 0.2168489396572113, + "step": 150 + }, + { + "epoch": 0.53, + "learning_rate": 9.441524023769057e-08, + "logits/chosen": -2.046121120452881, + "logits/rejected": -2.068605661392212, + "logps/chosen": -6.367807865142822, + "logps/rejected": -18.37773895263672, + "loss": 0.6513, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5056875944137573, + "rewards/margins": -0.010173097252845764, + "rewards/rejected": 0.5158607363700867, + "step": 151 + }, + { + "epoch": 0.54, + "learning_rate": 9.432726253336229e-08, + "logits/chosen": -2.060546636581421, + "logits/rejected": -2.052715539932251, + "logps/chosen": -15.888566970825195, + "logps/rejected": -7.130773544311523, + "loss": 0.6694, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5894063115119934, + "rewards/margins": 0.08662134408950806, + "rewards/rejected": 0.5027849674224854, + "step": 152 + }, + { + "epoch": 0.54, + "learning_rate": 9.423863892133752e-08, + "logits/chosen": -2.063019037246704, + "logits/rejected": -2.065579414367676, + "logps/chosen": -4.350585460662842, + "logps/rejected": -5.576711177825928, + "loss": 0.5761, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.568697452545166, + "rewards/margins": 0.20862814784049988, + "rewards/rejected": 0.3600693345069885, + "step": 153 + }, + { + "epoch": 0.54, + "learning_rate": 9.414937069298124e-08, + "logits/chosen": -2.0648794174194336, + "logits/rejected": -2.0923047065734863, + "logps/chosen": -9.422317504882812, + "logps/rejected": -16.456809997558594, + "loss": 0.6821, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5301134586334229, + "rewards/margins": 0.12394113838672638, + "rewards/rejected": 0.4061722755432129, + "step": 154 + }, + { + "epoch": 0.55, + "learning_rate": 9.405945914905128e-08, + "logits/chosen": -2.062502384185791, + "logits/rejected": -2.06160044670105, + "logps/chosen": -3.9789462089538574, + "logps/rejected": -6.0644941329956055, + "loss": 0.6258, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5587772130966187, + "rewards/margins": 0.20727208256721497, + "rewards/rejected": 0.3515051007270813, + "step": 155 + }, + { + "epoch": 0.55, + "learning_rate": 9.39689055996795e-08, + "logits/chosen": -2.0654869079589844, + "logits/rejected": -2.063753128051758, + "logps/chosen": -4.9499311447143555, + "logps/rejected": -12.195274353027344, + "loss": 0.6599, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.481566458940506, + "rewards/margins": -0.06760197877883911, + "rewards/rejected": 0.5491684675216675, + "step": 156 + }, + { + "epoch": 0.55, + "learning_rate": 9.387771136435265e-08, + "logits/chosen": -2.0787086486816406, + "logits/rejected": -2.077901840209961, + "logps/chosen": -14.5159912109375, + "logps/rejected": -4.115872383117676, + "loss": 0.6856, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6690038442611694, + "rewards/margins": 0.21736958622932434, + "rewards/rejected": 0.4516342580318451, + "step": 157 + }, + { + "epoch": 0.56, + "learning_rate": 9.378587777189309e-08, + "logits/chosen": -2.0814199447631836, + "logits/rejected": -2.0884950160980225, + "logps/chosen": -7.0681915283203125, + "logps/rejected": -16.57811164855957, + "loss": 0.6354, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5327993631362915, + "rewards/margins": 0.07332910597324371, + "rewards/rejected": 0.4594702124595642, + "step": 158 + }, + { + "epoch": 0.56, + "learning_rate": 9.369340616043948e-08, + "logits/chosen": -2.1129956245422363, + "logits/rejected": -2.117924690246582, + "logps/chosen": -3.5016050338745117, + "logps/rejected": -5.242643356323242, + "loss": 0.6467, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5125755071640015, + "rewards/margins": 0.08737437427043915, + "rewards/rejected": 0.4252011179924011, + "step": 159 + }, + { + "epoch": 0.57, + "learning_rate": 9.360029787742729e-08, + "logits/chosen": -2.0746278762817383, + "logits/rejected": -2.0780797004699707, + "logps/chosen": -5.707599639892578, + "logps/rejected": -17.19940757751465, + "loss": 0.6623, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5587804317474365, + "rewards/margins": 0.052925318479537964, + "rewards/rejected": 0.5058550834655762, + "step": 160 + }, + { + "epoch": 0.57, + "learning_rate": 9.350655427956917e-08, + "logits/chosen": -2.094630241394043, + "logits/rejected": -2.0955843925476074, + "logps/chosen": -4.127342224121094, + "logps/rejected": -5.702712535858154, + "loss": 0.6811, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4667899012565613, + "rewards/margins": -0.16266174614429474, + "rewards/rejected": 0.6294516324996948, + "step": 161 + }, + { + "epoch": 0.57, + "learning_rate": 9.341217673283508e-08, + "logits/chosen": -2.0470192432403564, + "logits/rejected": -2.077691078186035, + "logps/chosen": -14.73582935333252, + "logps/rejected": -8.44935417175293, + "loss": 0.6284, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.6201330423355103, + "rewards/margins": 0.14134365320205688, + "rewards/rejected": 0.47878938913345337, + "step": 162 + }, + { + "epoch": 0.58, + "learning_rate": 9.331716661243258e-08, + "logits/chosen": -2.111847162246704, + "logits/rejected": -2.1120057106018066, + "logps/chosen": -8.240798950195312, + "logps/rejected": -12.539896011352539, + "loss": 0.7394, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.475114643573761, + "rewards/margins": -0.0062034279108047485, + "rewards/rejected": 0.48131808638572693, + "step": 163 + }, + { + "epoch": 0.58, + "learning_rate": 9.322152530278657e-08, + "logits/chosen": -2.105618953704834, + "logits/rejected": -2.108598470687866, + "logps/chosen": -2.7514755725860596, + "logps/rejected": -9.900801658630371, + "loss": 0.7028, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5883119106292725, + "rewards/margins": 0.16890989243984222, + "rewards/rejected": 0.41940200328826904, + "step": 164 + }, + { + "epoch": 0.58, + "learning_rate": 9.312525419751929e-08, + "logits/chosen": -2.0988266468048096, + "logits/rejected": -2.100895404815674, + "logps/chosen": -5.939178466796875, + "logps/rejected": -13.63568115234375, + "loss": 0.6134, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.4905054569244385, + "rewards/margins": -0.06554649770259857, + "rewards/rejected": 0.5560519695281982, + "step": 165 + }, + { + "epoch": 0.59, + "learning_rate": 9.302835469942992e-08, + "logits/chosen": -2.1125619411468506, + "logits/rejected": -2.1296169757843018, + "logps/chosen": -3.1618587970733643, + "logps/rejected": -14.233179092407227, + "loss": 0.7123, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5630980134010315, + "rewards/margins": 0.05800086259841919, + "rewards/rejected": 0.5050971508026123, + "step": 166 + }, + { + "epoch": 0.59, + "learning_rate": 9.293082822047415e-08, + "logits/chosen": -2.0887343883514404, + "logits/rejected": -2.094165802001953, + "logps/chosen": -2.863309383392334, + "logps/rejected": -10.392391204833984, + "loss": 0.6371, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6048001050949097, + "rewards/margins": 0.05825185775756836, + "rewards/rejected": 0.5465482473373413, + "step": 167 + }, + { + "epoch": 0.59, + "learning_rate": 9.283267618174369e-08, + "logits/chosen": -2.0154199600219727, + "logits/rejected": -2.0148346424102783, + "logps/chosen": -3.6156227588653564, + "logps/rejected": -6.762144088745117, + "loss": 0.677, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5965454578399658, + "rewards/margins": -0.003221571445465088, + "rewards/rejected": 0.5997669696807861, + "step": 168 + }, + { + "epoch": 0.6, + "learning_rate": 9.273390001344543e-08, + "logits/chosen": -2.0068471431732178, + "logits/rejected": -2.0067780017852783, + "logps/chosen": -4.414371490478516, + "logps/rejected": -5.784242630004883, + "loss": 0.6956, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.734883189201355, + "rewards/margins": 0.32199397683143616, + "rewards/rejected": 0.4128892421722412, + "step": 169 + }, + { + "epoch": 0.6, + "learning_rate": 9.263450115488069e-08, + "logits/chosen": -2.0692784786224365, + "logits/rejected": -2.0673110485076904, + "logps/chosen": -5.603399276733398, + "logps/rejected": -3.578502893447876, + "loss": 0.5565, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6898326277732849, + "rewards/margins": 0.29047924280166626, + "rewards/rejected": 0.39935338497161865, + "step": 170 + }, + { + "epoch": 0.6, + "learning_rate": 9.253448105442421e-08, + "logits/chosen": -2.097252368927002, + "logits/rejected": -2.1022706031799316, + "logps/chosen": -5.009082794189453, + "logps/rejected": -4.5946149826049805, + "loss": 0.6635, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4651423692703247, + "rewards/margins": -0.006123840808868408, + "rewards/rejected": 0.4712662100791931, + "step": 171 + }, + { + "epoch": 0.61, + "learning_rate": 9.243384116950308e-08, + "logits/chosen": -2.0312366485595703, + "logits/rejected": -2.0314536094665527, + "logps/chosen": -2.4519896507263184, + "logps/rejected": -4.407073497772217, + "loss": 0.632, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5699340105056763, + "rewards/margins": 0.14262722432613373, + "rewards/rejected": 0.42730677127838135, + "step": 172 + }, + { + "epoch": 0.61, + "learning_rate": 9.233258296657546e-08, + "logits/chosen": -2.0920002460479736, + "logits/rejected": -2.0924630165100098, + "logps/chosen": -4.890830993652344, + "logps/rejected": -4.775411128997803, + "loss": 0.6319, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.617672324180603, + "rewards/margins": 0.01834636926651001, + "rewards/rejected": 0.5993258953094482, + "step": 173 + }, + { + "epoch": 0.61, + "learning_rate": 9.223070792110926e-08, + "logits/chosen": -2.0651326179504395, + "logits/rejected": -2.0709660053253174, + "logps/chosen": -3.9293549060821533, + "logps/rejected": -12.448321342468262, + "loss": 0.6625, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6937732696533203, + "rewards/margins": 0.1754501610994339, + "rewards/rejected": 0.5183231234550476, + "step": 174 + }, + { + "epoch": 0.62, + "learning_rate": 9.212821751756057e-08, + "logits/chosen": -2.077624797821045, + "logits/rejected": -2.0813300609588623, + "logps/chosen": -14.808090209960938, + "logps/rejected": -3.7651543617248535, + "loss": 0.6373, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6706396341323853, + "rewards/margins": 0.22953644394874573, + "rewards/rejected": 0.4411032199859619, + "step": 175 + }, + { + "epoch": 0.62, + "learning_rate": 9.202511324935212e-08, + "logits/chosen": -2.0753955841064453, + "logits/rejected": -2.0748088359832764, + "logps/chosen": -4.110065937042236, + "logps/rejected": -6.414317607879639, + "loss": 0.6548, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6402162909507751, + "rewards/margins": 0.13253115117549896, + "rewards/rejected": 0.507685124874115, + "step": 176 + }, + { + "epoch": 0.63, + "learning_rate": 9.192139661885142e-08, + "logits/chosen": -2.115790367126465, + "logits/rejected": -2.1244959831237793, + "logps/chosen": -2.52486252784729, + "logps/rejected": -12.555469512939453, + "loss": 0.723, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5536451935768127, + "rewards/margins": -0.02674223482608795, + "rewards/rejected": 0.5803874135017395, + "step": 177 + }, + { + "epoch": 0.63, + "learning_rate": 9.181706913734899e-08, + "logits/chosen": -2.1360909938812256, + "logits/rejected": -2.143453359603882, + "logps/chosen": -5.429347991943359, + "logps/rejected": -12.278985977172852, + "loss": 0.6171, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7489770650863647, + "rewards/margins": 0.28541600704193115, + "rewards/rejected": 0.4635610580444336, + "step": 178 + }, + { + "epoch": 0.63, + "learning_rate": 9.17121323250362e-08, + "logits/chosen": -2.0240859985351562, + "logits/rejected": -2.023867130279541, + "logps/chosen": -2.747467517852783, + "logps/rejected": -8.849620819091797, + "loss": 0.6334, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.690142810344696, + "rewards/margins": 0.09131383895874023, + "rewards/rejected": 0.5988289713859558, + "step": 179 + }, + { + "epoch": 0.64, + "learning_rate": 9.160658771098322e-08, + "logits/chosen": -2.119912624359131, + "logits/rejected": -2.116853952407837, + "logps/chosen": -4.533078193664551, + "logps/rejected": -5.935761451721191, + "loss": 0.5943, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6330578923225403, + "rewards/margins": 0.19861215353012085, + "rewards/rejected": 0.43444573879241943, + "step": 180 + }, + { + "epoch": 0.64, + "learning_rate": 9.150043683311672e-08, + "logits/chosen": -2.055363416671753, + "logits/rejected": -2.0570054054260254, + "logps/chosen": -4.008270263671875, + "logps/rejected": -2.8472585678100586, + "loss": 0.6546, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.6444904804229736, + "rewards/margins": -0.016354292631149292, + "rewards/rejected": 0.6608448028564453, + "step": 181 + }, + { + "epoch": 0.64, + "learning_rate": 9.139368123819742e-08, + "logits/chosen": -2.067139148712158, + "logits/rejected": -2.06680965423584, + "logps/chosen": -13.248064041137695, + "logps/rejected": -3.7882323265075684, + "loss": 0.6927, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6786010265350342, + "rewards/margins": 0.16107425093650818, + "rewards/rejected": 0.5175267457962036, + "step": 182 + }, + { + "epoch": 0.65, + "learning_rate": 9.12863224817976e-08, + "logits/chosen": -2.0549652576446533, + "logits/rejected": -2.0545883178710938, + "logps/chosen": -2.9973526000976562, + "logps/rejected": -12.229803085327148, + "loss": 0.6766, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5882805585861206, + "rewards/margins": -0.11080878973007202, + "rewards/rejected": 0.6990894079208374, + "step": 183 + }, + { + "epoch": 0.65, + "learning_rate": 9.117836212827838e-08, + "logits/chosen": -2.1314480304718018, + "logits/rejected": -2.150956869125366, + "logps/chosen": -7.311335563659668, + "logps/rejected": -9.395841598510742, + "loss": 0.5831, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6973772048950195, + "rewards/margins": 0.3633418083190918, + "rewards/rejected": 0.33403539657592773, + "step": 184 + }, + { + "epoch": 0.65, + "learning_rate": 9.106980175076699e-08, + "logits/chosen": -1.9907197952270508, + "logits/rejected": -1.991147518157959, + "logps/chosen": -2.696820020675659, + "logps/rejected": -9.949247360229492, + "loss": 0.6298, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6357452273368835, + "rewards/margins": 0.04279276728630066, + "rewards/rejected": 0.5929524898529053, + "step": 185 + }, + { + "epoch": 0.66, + "learning_rate": 9.096064293113382e-08, + "logits/chosen": -2.086991548538208, + "logits/rejected": -2.0838191509246826, + "logps/chosen": -4.728148460388184, + "logps/rejected": -4.056379795074463, + "loss": 0.6516, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7372314929962158, + "rewards/margins": 0.19815018773078918, + "rewards/rejected": 0.539081335067749, + "step": 186 + }, + { + "epoch": 0.66, + "learning_rate": 9.085088725996933e-08, + "logits/chosen": -2.041287422180176, + "logits/rejected": -2.048053741455078, + "logps/chosen": -2.6218621730804443, + "logps/rejected": -10.249082565307617, + "loss": 0.6193, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6218602061271667, + "rewards/margins": 0.049615710973739624, + "rewards/rejected": 0.5722445249557495, + "step": 187 + }, + { + "epoch": 0.66, + "learning_rate": 9.074053633656093e-08, + "logits/chosen": -2.1563210487365723, + "logits/rejected": -2.1545510292053223, + "logps/chosen": -4.057003021240234, + "logps/rejected": -14.974495887756348, + "loss": 0.5888, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7303050756454468, + "rewards/margins": 0.11132848262786865, + "rewards/rejected": 0.6189765930175781, + "step": 188 + }, + { + "epoch": 0.67, + "learning_rate": 9.062959176886966e-08, + "logits/chosen": -2.0393247604370117, + "logits/rejected": -2.0389161109924316, + "logps/chosen": -3.1201744079589844, + "logps/rejected": -4.352142333984375, + "loss": 0.6038, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.6428436040878296, + "rewards/margins": -0.05892184376716614, + "rewards/rejected": 0.7017654180526733, + "step": 189 + }, + { + "epoch": 0.67, + "learning_rate": 9.051805517350672e-08, + "logits/chosen": -1.9690868854522705, + "logits/rejected": -1.9691470861434937, + "logps/chosen": -2.584702730178833, + "logps/rejected": -2.542117118835449, + "loss": 0.5717, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6137998104095459, + "rewards/margins": 0.03487074375152588, + "rewards/rejected": 0.57892906665802, + "step": 190 + }, + { + "epoch": 0.67, + "learning_rate": 9.040592817571e-08, + "logits/chosen": -2.057260036468506, + "logits/rejected": -2.0584399700164795, + "logps/chosen": -2.3116233348846436, + "logps/rejected": -12.652660369873047, + "loss": 0.6145, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.670344352722168, + "rewards/margins": -0.002231806516647339, + "rewards/rejected": 0.6725761890411377, + "step": 191 + }, + { + "epoch": 0.68, + "learning_rate": 9.029321240932032e-08, + "logits/chosen": -2.067406415939331, + "logits/rejected": -2.106401205062866, + "logps/chosen": -8.282800674438477, + "logps/rejected": -13.618911743164062, + "loss": 0.584, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8032389879226685, + "rewards/margins": 0.1789337396621704, + "rewards/rejected": 0.624305248260498, + "step": 192 + }, + { + "epoch": 0.68, + "learning_rate": 9.017990951675763e-08, + "logits/chosen": -2.0273633003234863, + "logits/rejected": -2.0197174549102783, + "logps/chosen": -6.44704008102417, + "logps/rejected": -11.527935981750488, + "loss": 0.6736, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.6325668096542358, + "rewards/margins": 0.02114233374595642, + "rewards/rejected": 0.611424446105957, + "step": 193 + }, + { + "epoch": 0.69, + "learning_rate": 9.00660211489971e-08, + "logits/chosen": -2.0192556381225586, + "logits/rejected": -2.017244338989258, + "logps/chosen": -4.468182563781738, + "logps/rejected": -3.9755568504333496, + "loss": 0.5969, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7222710847854614, + "rewards/margins": 0.16795912384986877, + "rewards/rejected": 0.554311990737915, + "step": 194 + }, + { + "epoch": 0.69, + "learning_rate": 8.995154896554508e-08, + "logits/chosen": -2.0944886207580566, + "logits/rejected": -2.1011250019073486, + "logps/chosen": -5.823890686035156, + "logps/rejected": -11.668116569519043, + "loss": 0.6909, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7537532448768616, + "rewards/margins": 0.12172892689704895, + "rewards/rejected": 0.6320242881774902, + "step": 195 + }, + { + "epoch": 0.69, + "learning_rate": 8.983649463441492e-08, + "logits/chosen": -2.0381689071655273, + "logits/rejected": -2.0378618240356445, + "logps/chosen": -1.666111707687378, + "logps/rejected": -3.7744810581207275, + "loss": 0.6503, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6427743434906006, + "rewards/margins": 0.220810204744339, + "rewards/rejected": 0.4219641089439392, + "step": 196 + }, + { + "epoch": 0.7, + "learning_rate": 8.972085983210258e-08, + "logits/chosen": -2.1860809326171875, + "logits/rejected": -2.19046950340271, + "logps/chosen": -5.079977035522461, + "logps/rejected": -12.119379043579102, + "loss": 0.6544, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8241627216339111, + "rewards/margins": 0.18175917863845825, + "rewards/rejected": 0.6424034833908081, + "step": 197 + }, + { + "epoch": 0.7, + "learning_rate": 8.96046462435623e-08, + "logits/chosen": -1.9737952947616577, + "logits/rejected": -1.9762822389602661, + "logps/chosen": -3.3300976753234863, + "logps/rejected": -4.579537391662598, + "loss": 0.6483, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6828453540802002, + "rewards/margins": 0.26630938053131104, + "rewards/rejected": 0.41653597354888916, + "step": 198 + }, + { + "epoch": 0.7, + "learning_rate": 8.948785556218202e-08, + "logits/chosen": -2.0832927227020264, + "logits/rejected": -2.085637331008911, + "logps/chosen": -3.1877994537353516, + "logps/rejected": -4.580703258514404, + "loss": 0.6539, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.6220249533653259, + "rewards/margins": -0.014381974935531616, + "rewards/rejected": 0.6364068984985352, + "step": 199 + }, + { + "epoch": 0.71, + "learning_rate": 8.937048948975867e-08, + "logits/chosen": -2.039717435836792, + "logits/rejected": -2.0528578758239746, + "logps/chosen": -6.570793151855469, + "logps/rejected": -21.593809127807617, + "loss": 0.6393, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.6131695508956909, + "rewards/margins": 0.06834425032138824, + "rewards/rejected": 0.5448253154754639, + "step": 200 + }, + { + "epoch": 0.71, + "learning_rate": 8.925254973647342e-08, + "logits/chosen": -2.0471529960632324, + "logits/rejected": -2.0464978218078613, + "logps/chosen": -3.0693516731262207, + "logps/rejected": -5.4899420738220215, + "loss": 0.5759, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.6330344676971436, + "rewards/margins": 0.03111615777015686, + "rewards/rejected": 0.6019182801246643, + "step": 201 + }, + { + "epoch": 0.71, + "learning_rate": 8.913403802086675e-08, + "logits/chosen": -2.0923702716827393, + "logits/rejected": -2.0932884216308594, + "logps/chosen": -3.418344020843506, + "logps/rejected": -3.2735166549682617, + "loss": 0.621, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.788568377494812, + "rewards/margins": 0.31601065397262573, + "rewards/rejected": 0.4725576639175415, + "step": 202 + }, + { + "epoch": 0.72, + "learning_rate": 8.901495606981337e-08, + "logits/chosen": -2.0796663761138916, + "logits/rejected": -2.0885398387908936, + "logps/chosen": -1.6042132377624512, + "logps/rejected": -11.54625415802002, + "loss": 0.6451, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7348337769508362, + "rewards/margins": 0.16441766917705536, + "rewards/rejected": 0.5704160928726196, + "step": 203 + }, + { + "epoch": 0.72, + "learning_rate": 8.889530561849709e-08, + "logits/chosen": -1.9317024946212769, + "logits/rejected": -1.9373377561569214, + "logps/chosen": -1.6279592514038086, + "logps/rejected": -15.435929298400879, + "loss": 0.6, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7393727898597717, + "rewards/margins": 0.13780497014522552, + "rewards/rejected": 0.601567804813385, + "step": 204 + }, + { + "epoch": 0.72, + "learning_rate": 8.877508841038558e-08, + "logits/chosen": -2.0370469093322754, + "logits/rejected": -2.0402650833129883, + "logps/chosen": -13.44301986694336, + "logps/rejected": -10.306529998779297, + "loss": 0.5855, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9444580078125, + "rewards/margins": 0.19026458263397217, + "rewards/rejected": 0.7541934251785278, + "step": 205 + }, + { + "epoch": 0.73, + "learning_rate": 8.865430619720483e-08, + "logits/chosen": -2.081827163696289, + "logits/rejected": -2.0823941230773926, + "logps/chosen": -1.766142725944519, + "logps/rejected": -3.137019157409668, + "loss": 0.6291, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5954875946044922, + "rewards/margins": -0.04022699594497681, + "rewards/rejected": 0.6357145309448242, + "step": 206 + }, + { + "epoch": 0.73, + "learning_rate": 8.853296073891379e-08, + "logits/chosen": -2.0832951068878174, + "logits/rejected": -2.085935115814209, + "logps/chosen": -2.642087697982788, + "logps/rejected": -3.6862545013427734, + "loss": 0.6608, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.6902578473091125, + "rewards/margins": 0.12843824923038483, + "rewards/rejected": 0.5618196129798889, + "step": 207 + }, + { + "epoch": 0.73, + "learning_rate": 8.841105380367859e-08, + "logits/chosen": -1.9725271463394165, + "logits/rejected": -1.9726706743240356, + "logps/chosen": -1.7386714220046997, + "logps/rejected": -3.1689469814300537, + "loss": 0.5746, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.672714114189148, + "rewards/margins": 0.030841439962387085, + "rewards/rejected": 0.6418727040290833, + "step": 208 + }, + { + "epoch": 0.74, + "learning_rate": 8.828858716784691e-08, + "logits/chosen": -2.109929084777832, + "logits/rejected": -2.125743865966797, + "logps/chosen": -3.1284098625183105, + "logps/rejected": -14.20623779296875, + "loss": 0.6148, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8200628161430359, + "rewards/margins": 0.22363552451133728, + "rewards/rejected": 0.596427321434021, + "step": 209 + }, + { + "epoch": 0.74, + "learning_rate": 8.81655626159219e-08, + "logits/chosen": -2.051114082336426, + "logits/rejected": -2.0486671924591064, + "logps/chosen": -1.5414700508117676, + "logps/rejected": -7.371219158172607, + "loss": 0.6677, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.6169458627700806, + "rewards/margins": 0.019547730684280396, + "rewards/rejected": 0.5973981022834778, + "step": 210 + }, + { + "epoch": 0.75, + "learning_rate": 8.804198194053641e-08, + "logits/chosen": -1.9764589071273804, + "logits/rejected": -1.9839946031570435, + "logps/chosen": -3.6948184967041016, + "logps/rejected": -3.4167933464050293, + "loss": 0.6437, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9235994815826416, + "rewards/margins": 0.5047410726547241, + "rewards/rejected": 0.4188583791255951, + "step": 211 + }, + { + "epoch": 0.75, + "learning_rate": 8.791784694242672e-08, + "logits/chosen": -2.0637776851654053, + "logits/rejected": -2.070524215698242, + "logps/chosen": -2.5157644748687744, + "logps/rejected": -8.260083198547363, + "loss": 0.598, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7720234394073486, + "rewards/margins": 0.23209916055202484, + "rewards/rejected": 0.5399242639541626, + "step": 212 + }, + { + "epoch": 0.75, + "learning_rate": 8.779315943040628e-08, + "logits/chosen": -2.0280044078826904, + "logits/rejected": -2.0180797576904297, + "logps/chosen": -9.310430526733398, + "logps/rejected": -10.841652870178223, + "loss": 0.5458, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8640991449356079, + "rewards/margins": 0.23064352571964264, + "rewards/rejected": 0.6334555745124817, + "step": 213 + }, + { + "epoch": 0.76, + "learning_rate": 8.766792122133948e-08, + "logits/chosen": -2.04017972946167, + "logits/rejected": -2.061889410018921, + "logps/chosen": -3.5606770515441895, + "logps/rejected": -24.927764892578125, + "loss": 0.6183, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8345874547958374, + "rewards/margins": 0.12567400932312012, + "rewards/rejected": 0.7089134454727173, + "step": 214 + }, + { + "epoch": 0.76, + "learning_rate": 8.754213414011509e-08, + "logits/chosen": -2.1725656986236572, + "logits/rejected": -2.174450397491455, + "logps/chosen": -1.7367199659347534, + "logps/rejected": -4.247137069702148, + "loss": 0.6565, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7439385056495667, + "rewards/margins": 0.3828103542327881, + "rewards/rejected": 0.36112815141677856, + "step": 215 + }, + { + "epoch": 0.76, + "learning_rate": 8.741580001961966e-08, + "logits/chosen": -2.1346116065979004, + "logits/rejected": -2.1331264972686768, + "logps/chosen": -0.9554059505462646, + "logps/rejected": -4.624788761138916, + "loss": 0.7052, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.6176758408546448, + "rewards/margins": -0.21874213218688965, + "rewards/rejected": 0.8364179730415344, + "step": 216 + }, + { + "epoch": 0.77, + "learning_rate": 8.728892070071083e-08, + "logits/chosen": -2.0177576541900635, + "logits/rejected": -2.0307674407958984, + "logps/chosen": -4.274886131286621, + "logps/rejected": -13.033632278442383, + "loss": 0.7001, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7792242765426636, + "rewards/margins": 0.025530636310577393, + "rewards/rejected": 0.753693699836731, + "step": 217 + }, + { + "epoch": 0.77, + "learning_rate": 8.716149803219058e-08, + "logits/chosen": -1.993787407875061, + "logits/rejected": -2.022050142288208, + "logps/chosen": -2.8853518962860107, + "logps/rejected": -21.197372436523438, + "loss": 0.7415, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.6690850257873535, + "rewards/margins": -0.05528172850608826, + "rewards/rejected": 0.7243667840957642, + "step": 218 + }, + { + "epoch": 0.77, + "learning_rate": 8.703353387077812e-08, + "logits/chosen": -2.157074213027954, + "logits/rejected": -2.161729335784912, + "logps/chosen": -3.135571002960205, + "logps/rejected": -10.40681266784668, + "loss": 0.659, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7775501012802124, + "rewards/margins": 0.12950173020362854, + "rewards/rejected": 0.6480484008789062, + "step": 219 + }, + { + "epoch": 0.78, + "learning_rate": 8.690503008108304e-08, + "logits/chosen": -2.0811712741851807, + "logits/rejected": -2.0765295028686523, + "logps/chosen": -3.3981523513793945, + "logps/rejected": -12.939985275268555, + "loss": 0.6511, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7917190790176392, + "rewards/margins": 0.04310595989227295, + "rewards/rejected": 0.7486131191253662, + "step": 220 + }, + { + "epoch": 0.78, + "learning_rate": 8.677598853557797e-08, + "logits/chosen": -2.0711352825164795, + "logits/rejected": -2.073270320892334, + "logps/chosen": -2.0387537479400635, + "logps/rejected": -11.267715454101562, + "loss": 0.6812, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7775349617004395, + "rewards/margins": -0.048075854778289795, + "rewards/rejected": 0.825610876083374, + "step": 221 + }, + { + "epoch": 0.78, + "learning_rate": 8.664641111457139e-08, + "logits/chosen": -2.013550281524658, + "logits/rejected": -2.0204403400421143, + "logps/chosen": -1.8753901720046997, + "logps/rejected": -9.507131576538086, + "loss": 0.7427, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.6771697998046875, + "rewards/margins": -0.12623068690299988, + "rewards/rejected": 0.8034005165100098, + "step": 222 + }, + { + "epoch": 0.79, + "learning_rate": 8.651629970618019e-08, + "logits/chosen": -2.011268138885498, + "logits/rejected": -2.0077648162841797, + "logps/chosen": -5.356961250305176, + "logps/rejected": -4.024666786193848, + "loss": 0.6187, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6697742938995361, + "rewards/margins": 0.17373602092266083, + "rewards/rejected": 0.4960383176803589, + "step": 223 + }, + { + "epoch": 0.79, + "learning_rate": 8.638565620630218e-08, + "logits/chosen": -2.025496006011963, + "logits/rejected": -2.0333962440490723, + "logps/chosen": -1.7342565059661865, + "logps/rejected": -13.24448299407959, + "loss": 0.6113, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7557774782180786, + "rewards/margins": 0.049749940633773804, + "rewards/rejected": 0.7060275077819824, + "step": 224 + }, + { + "epoch": 0.8, + "learning_rate": 8.625448251858847e-08, + "logits/chosen": -2.0371882915496826, + "logits/rejected": -2.042539358139038, + "logps/chosen": -2.499297857284546, + "logps/rejected": -3.075155735015869, + "loss": 0.6373, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.9230514168739319, + "rewards/margins": 0.2615776062011719, + "rewards/rejected": 0.66147381067276, + "step": 225 + }, + { + "epoch": 0.8, + "learning_rate": 8.612278055441572e-08, + "logits/chosen": -1.9454425573349, + "logits/rejected": -1.9522525072097778, + "logps/chosen": -3.3749144077301025, + "logps/rejected": -14.046710014343262, + "loss": 0.5837, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.767713725566864, + "rewards/margins": 0.09144982695579529, + "rewards/rejected": 0.6762639284133911, + "step": 226 + }, + { + "epoch": 0.8, + "learning_rate": 8.599055223285825e-08, + "logits/chosen": -2.035226345062256, + "logits/rejected": -2.0347163677215576, + "logps/chosen": -2.0107157230377197, + "logps/rejected": -10.683090209960938, + "loss": 0.6894, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8197892308235168, + "rewards/margins": 0.0231112539768219, + "rewards/rejected": 0.7966779470443726, + "step": 227 + }, + { + "epoch": 0.81, + "learning_rate": 8.585779948066015e-08, + "logits/chosen": -2.028615713119507, + "logits/rejected": -2.032618284225464, + "logps/chosen": -1.680711269378662, + "logps/rejected": -19.68160057067871, + "loss": 0.686, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7408775687217712, + "rewards/margins": -0.18792429566383362, + "rewards/rejected": 0.9288018941879272, + "step": 228 + }, + { + "epoch": 0.81, + "learning_rate": 8.572452423220716e-08, + "logits/chosen": -2.0169694423675537, + "logits/rejected": -2.018419027328491, + "logps/chosen": -1.4606056213378906, + "logps/rejected": -3.1519687175750732, + "loss": 0.6936, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7215170860290527, + "rewards/margins": 0.14802680909633636, + "rewards/rejected": 0.5734902620315552, + "step": 229 + }, + { + "epoch": 0.81, + "learning_rate": 8.559072842949848e-08, + "logits/chosen": -2.026834726333618, + "logits/rejected": -2.127309799194336, + "logps/chosen": -1.635563850402832, + "logps/rejected": -30.02583885192871, + "loss": 0.7057, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.6516119837760925, + "rewards/margins": -0.049664318561553955, + "rewards/rejected": 0.7012763023376465, + "step": 230 + }, + { + "epoch": 0.82, + "learning_rate": 8.545641402211849e-08, + "logits/chosen": -2.0188446044921875, + "logits/rejected": -2.0271875858306885, + "logps/chosen": -1.6317921876907349, + "logps/rejected": -19.68891716003418, + "loss": 0.7422, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8000383973121643, + "rewards/margins": -0.08347535133361816, + "rewards/rejected": 0.8835137486457825, + "step": 231 + }, + { + "epoch": 0.82, + "learning_rate": 8.532158296720835e-08, + "logits/chosen": -2.004681348800659, + "logits/rejected": -2.0353147983551025, + "logps/chosen": -3.4402105808258057, + "logps/rejected": -16.195838928222656, + "loss": 0.5567, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9600374698638916, + "rewards/margins": 0.5027660131454468, + "rewards/rejected": 0.4572714567184448, + "step": 232 + }, + { + "epoch": 0.82, + "learning_rate": 8.518623722943745e-08, + "logits/chosen": -2.0358710289001465, + "logits/rejected": -2.03664493560791, + "logps/chosen": -3.167174816131592, + "logps/rejected": -2.1868083477020264, + "loss": 0.7353, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7859362363815308, + "rewards/margins": 0.03467944264411926, + "rewards/rejected": 0.7512567639350891, + "step": 233 + }, + { + "epoch": 0.83, + "learning_rate": 8.505037878097481e-08, + "logits/chosen": -2.087397813796997, + "logits/rejected": -2.08896541595459, + "logps/chosen": -1.5274966955184937, + "logps/rejected": -10.974538803100586, + "loss": 0.5894, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8069373369216919, + "rewards/margins": 0.15573567152023315, + "rewards/rejected": 0.6512017250061035, + "step": 234 + }, + { + "epoch": 0.83, + "learning_rate": 8.491400960146032e-08, + "logits/chosen": -2.0725460052490234, + "logits/rejected": -2.074294090270996, + "logps/chosen": -1.447859525680542, + "logps/rejected": -2.7293529510498047, + "loss": 0.6632, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7505640983581543, + "rewards/margins": 0.14788126945495605, + "rewards/rejected": 0.6026828289031982, + "step": 235 + }, + { + "epoch": 0.83, + "learning_rate": 8.477713167797591e-08, + "logits/chosen": -1.9355508089065552, + "logits/rejected": -1.9335052967071533, + "logps/chosen": -1.7083971500396729, + "logps/rejected": -4.3652143478393555, + "loss": 0.7177, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7956393957138062, + "rewards/margins": 0.09667810797691345, + "rewards/rejected": 0.6989613175392151, + "step": 236 + }, + { + "epoch": 0.84, + "learning_rate": 8.46397470050166e-08, + "logits/chosen": -2.051347255706787, + "logits/rejected": -2.0523290634155273, + "logps/chosen": -1.623436450958252, + "logps/rejected": -10.840812683105469, + "loss": 0.5879, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7999486923217773, + "rewards/margins": -0.017120718955993652, + "rewards/rejected": 0.817069411277771, + "step": 237 + }, + { + "epoch": 0.84, + "learning_rate": 8.450185758446145e-08, + "logits/chosen": -2.0779285430908203, + "logits/rejected": -2.077561616897583, + "logps/chosen": -3.2457361221313477, + "logps/rejected": -2.3329057693481445, + "loss": 0.6858, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7842065095901489, + "rewards/margins": 0.022778451442718506, + "rewards/rejected": 0.7614281177520752, + "step": 238 + }, + { + "epoch": 0.84, + "learning_rate": 8.436346542554432e-08, + "logits/chosen": -2.072805881500244, + "logits/rejected": -2.072026252746582, + "logps/chosen": -1.2256221771240234, + "logps/rejected": -3.223036289215088, + "loss": 0.6386, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.732630729675293, + "rewards/margins": -0.014105260372161865, + "rewards/rejected": 0.7467359900474548, + "step": 239 + }, + { + "epoch": 0.85, + "learning_rate": 8.422457254482467e-08, + "logits/chosen": -1.9786070585250854, + "logits/rejected": -1.982964038848877, + "logps/chosen": -3.402554750442505, + "logps/rejected": -4.056947231292725, + "loss": 0.6543, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6315486431121826, + "rewards/margins": -0.12552985548973083, + "rewards/rejected": 0.7570784687995911, + "step": 240 + }, + { + "epoch": 0.85, + "learning_rate": 8.408518096615816e-08, + "logits/chosen": -2.101975917816162, + "logits/rejected": -2.105990171432495, + "logps/chosen": -2.414355993270874, + "logps/rejected": -2.500633955001831, + "loss": 0.6635, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7711955308914185, + "rewards/margins": 0.15856435894966125, + "rewards/rejected": 0.6126311421394348, + "step": 241 + }, + { + "epoch": 0.86, + "learning_rate": 8.394529272066713e-08, + "logits/chosen": -2.0268030166625977, + "logits/rejected": -2.021085023880005, + "logps/chosen": -1.5044916868209839, + "logps/rejected": -3.3226661682128906, + "loss": 0.6562, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5640605092048645, + "rewards/margins": -0.3547849953174591, + "rewards/rejected": 0.918845534324646, + "step": 242 + }, + { + "epoch": 0.86, + "learning_rate": 8.380490984671105e-08, + "logits/chosen": -2.0771985054016113, + "logits/rejected": -2.081435441970825, + "logps/chosen": -2.0622353553771973, + "logps/rejected": -13.775341987609863, + "loss": 0.5857, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7388877868652344, + "rewards/margins": -0.006993889808654785, + "rewards/rejected": 0.7458816766738892, + "step": 243 + }, + { + "epoch": 0.86, + "learning_rate": 8.366403438985674e-08, + "logits/chosen": -2.025540590286255, + "logits/rejected": -2.0453453063964844, + "logps/chosen": -1.3454320430755615, + "logps/rejected": -5.043482303619385, + "loss": 0.6912, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7872332334518433, + "rewards/margins": 0.11605805158615112, + "rewards/rejected": 0.6711751818656921, + "step": 244 + }, + { + "epoch": 0.87, + "learning_rate": 8.352266840284864e-08, + "logits/chosen": -2.040412425994873, + "logits/rejected": -2.0458834171295166, + "logps/chosen": -2.321018934249878, + "logps/rejected": -7.628360271453857, + "loss": 0.6527, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8915587663650513, + "rewards/margins": 0.2677105665206909, + "rewards/rejected": 0.6238482594490051, + "step": 245 + }, + { + "epoch": 0.87, + "learning_rate": 8.338081394557891e-08, + "logits/chosen": -2.052135705947876, + "logits/rejected": -2.125215530395508, + "logps/chosen": -2.4266481399536133, + "logps/rejected": -23.800628662109375, + "loss": 0.637, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7930386066436768, + "rewards/margins": 0.29625195264816284, + "rewards/rejected": 0.4967867136001587, + "step": 246 + }, + { + "epoch": 0.87, + "learning_rate": 8.323847308505732e-08, + "logits/chosen": -1.9952982664108276, + "logits/rejected": -2.014610528945923, + "logps/chosen": -2.199622392654419, + "logps/rejected": -14.449346542358398, + "loss": 0.5776, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8040448427200317, + "rewards/margins": -0.006867557764053345, + "rewards/rejected": 0.8109123706817627, + "step": 247 + }, + { + "epoch": 0.88, + "learning_rate": 8.30956478953812e-08, + "logits/chosen": -2.0036799907684326, + "logits/rejected": -2.004887580871582, + "logps/chosen": -0.9592111110687256, + "logps/rejected": -2.692075490951538, + "loss": 0.71, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7733142971992493, + "rewards/margins": 0.16241902112960815, + "rewards/rejected": 0.6108952760696411, + "step": 248 + }, + { + "epoch": 0.88, + "learning_rate": 8.295234045770523e-08, + "logits/chosen": -1.9926023483276367, + "logits/rejected": -2.0012764930725098, + "logps/chosen": -2.1513078212738037, + "logps/rejected": -8.409695625305176, + "loss": 0.5962, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7657176852226257, + "rewards/margins": 0.03309363126754761, + "rewards/rejected": 0.7326240539550781, + "step": 249 + }, + { + "epoch": 0.88, + "learning_rate": 8.280855286021109e-08, + "logits/chosen": -2.0206947326660156, + "logits/rejected": -2.0321922302246094, + "logps/chosen": -0.8595100045204163, + "logps/rejected": -8.573554039001465, + "loss": 0.6634, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7347872257232666, + "rewards/margins": 0.18226905167102814, + "rewards/rejected": 0.5525181889533997, + "step": 250 + }, + { + "epoch": 0.89, + "learning_rate": 8.266428719807699e-08, + "logits/chosen": -2.066925525665283, + "logits/rejected": -2.0671117305755615, + "logps/chosen": -1.7105846405029297, + "logps/rejected": -2.192788600921631, + "loss": 0.7116, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.6887451410293579, + "rewards/margins": 0.025359541177749634, + "rewards/rejected": 0.6633856296539307, + "step": 251 + }, + { + "epoch": 0.89, + "learning_rate": 8.251954557344723e-08, + "logits/chosen": -1.9701435565948486, + "logits/rejected": -1.9664626121520996, + "logps/chosen": -6.696351528167725, + "logps/rejected": -1.6353956460952759, + "loss": 0.7331, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7137150764465332, + "rewards/margins": -0.26114216446876526, + "rewards/rejected": 0.9748572111129761, + "step": 252 + }, + { + "epoch": 0.89, + "learning_rate": 8.237433009540149e-08, + "logits/chosen": -1.9939554929733276, + "logits/rejected": -2.00881290435791, + "logps/chosen": -4.237756252288818, + "logps/rejected": -11.694608688354492, + "loss": 0.6814, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8004196882247925, + "rewards/margins": 0.07762017846107483, + "rewards/rejected": 0.72279953956604, + "step": 253 + }, + { + "epoch": 0.9, + "learning_rate": 8.222864287992418e-08, + "logits/chosen": -2.1223180294036865, + "logits/rejected": -2.131164789199829, + "logps/chosen": -1.254389762878418, + "logps/rejected": -11.415414810180664, + "loss": 0.6265, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7115257978439331, + "rewards/margins": -0.1219845712184906, + "rewards/rejected": 0.8335103988647461, + "step": 254 + }, + { + "epoch": 0.9, + "learning_rate": 8.208248604987348e-08, + "logits/chosen": -2.0219316482543945, + "logits/rejected": -2.0165982246398926, + "logps/chosen": -2.914048910140991, + "logps/rejected": -12.565868377685547, + "loss": 0.6615, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.6997090578079224, + "rewards/margins": 0.009253442287445068, + "rewards/rejected": 0.6904555559158325, + "step": 255 + }, + { + "epoch": 0.9, + "learning_rate": 8.193586173495056e-08, + "logits/chosen": -2.0333807468414307, + "logits/rejected": -2.0341591835021973, + "logps/chosen": -3.2079033851623535, + "logps/rejected": -10.881457328796387, + "loss": 0.7607, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.9320360422134399, + "rewards/margins": 0.1821029782295227, + "rewards/rejected": 0.7499330639839172, + "step": 256 + }, + { + "epoch": 0.91, + "learning_rate": 8.178877207166841e-08, + "logits/chosen": -1.9662516117095947, + "logits/rejected": -1.9634521007537842, + "logps/chosen": -6.616245269775391, + "logps/rejected": -2.920659303665161, + "loss": 0.6291, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9544394016265869, + "rewards/margins": 0.34666335582733154, + "rewards/rejected": 0.6077760457992554, + "step": 257 + }, + { + "epoch": 0.91, + "learning_rate": 8.164121920332083e-08, + "logits/chosen": -1.976347804069519, + "logits/rejected": -1.9804260730743408, + "logps/chosen": -0.9897197484970093, + "logps/rejected": -3.9219164848327637, + "loss": 0.6999, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.6954189538955688, + "rewards/margins": 0.09803502261638641, + "rewards/rejected": 0.5973839163780212, + "step": 258 + }, + { + "epoch": 0.92, + "learning_rate": 8.149320527995109e-08, + "logits/chosen": -1.9843168258666992, + "logits/rejected": -1.9992660284042358, + "logps/chosen": -3.1392717361450195, + "logps/rejected": -9.588376998901367, + "loss": 0.6064, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.9176877737045288, + "rewards/margins": 0.17481625080108643, + "rewards/rejected": 0.7428715229034424, + "step": 259 + }, + { + "epoch": 0.92, + "learning_rate": 8.134473245832069e-08, + "logits/chosen": -2.0085484981536865, + "logits/rejected": -2.0139050483703613, + "logps/chosen": -1.0605796575546265, + "logps/rejected": -15.942989349365234, + "loss": 0.6049, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7912999391555786, + "rewards/margins": -0.14433300495147705, + "rewards/rejected": 0.9356329441070557, + "step": 260 + }, + { + "epoch": 0.92, + "learning_rate": 8.119580290187783e-08, + "logits/chosen": -2.0000667572021484, + "logits/rejected": -1.9996501207351685, + "logps/chosen": -2.2994437217712402, + "logps/rejected": -3.149878978729248, + "loss": 0.6774, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.701819658279419, + "rewards/margins": -0.04600280523300171, + "rewards/rejected": 0.7478225231170654, + "step": 261 + }, + { + "epoch": 0.93, + "learning_rate": 8.104641878072602e-08, + "logits/chosen": -1.9547855854034424, + "logits/rejected": -1.962069034576416, + "logps/chosen": -2.0996174812316895, + "logps/rejected": -9.01628589630127, + "loss": 0.707, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7135105133056641, + "rewards/margins": -0.0973767340183258, + "rewards/rejected": 0.8108872175216675, + "step": 262 + }, + { + "epoch": 0.93, + "learning_rate": 8.089658227159237e-08, + "logits/chosen": -2.0127103328704834, + "logits/rejected": -2.064171552658081, + "logps/chosen": -0.8541543483734131, + "logps/rejected": -27.97882843017578, + "loss": 0.6958, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8438607454299927, + "rewards/margins": 0.23042979836463928, + "rewards/rejected": 0.6134309768676758, + "step": 263 + }, + { + "epoch": 0.93, + "learning_rate": 8.074629555779584e-08, + "logits/chosen": -2.0469419956207275, + "logits/rejected": -2.0592868328094482, + "logps/chosen": -0.9459546804428101, + "logps/rejected": -15.687143325805664, + "loss": 0.6871, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.743391752243042, + "rewards/margins": -0.1177804172039032, + "rewards/rejected": 0.8611721992492676, + "step": 264 + }, + { + "epoch": 0.94, + "learning_rate": 8.059556082921556e-08, + "logits/chosen": -2.023913860321045, + "logits/rejected": -2.033966541290283, + "logps/chosen": -3.157127857208252, + "logps/rejected": -11.410080909729004, + "loss": 0.6555, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7509087920188904, + "rewards/margins": 0.003617703914642334, + "rewards/rejected": 0.747291088104248, + "step": 265 + }, + { + "epoch": 0.94, + "learning_rate": 8.044438028225878e-08, + "logits/chosen": -2.0005507469177246, + "logits/rejected": -2.011519432067871, + "logps/chosen": -5.190197944641113, + "logps/rejected": -4.1701741218566895, + "loss": 0.5981, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.996796190738678, + "rewards/margins": 0.3431049585342407, + "rewards/rejected": 0.6536912322044373, + "step": 266 + }, + { + "epoch": 0.94, + "learning_rate": 8.029275611982888e-08, + "logits/chosen": -2.0648748874664307, + "logits/rejected": -2.060323715209961, + "logps/chosen": -2.114431142807007, + "logps/rejected": -9.077127456665039, + "loss": 0.7168, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.9330071210861206, + "rewards/margins": 0.18787460029125214, + "rewards/rejected": 0.745132565498352, + "step": 267 + }, + { + "epoch": 0.95, + "learning_rate": 8.014069055129345e-08, + "logits/chosen": -2.0536751747131348, + "logits/rejected": -2.0601651668548584, + "logps/chosen": -2.770223617553711, + "logps/rejected": -10.173649787902832, + "loss": 0.6261, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8561105132102966, + "rewards/margins": 0.126519113779068, + "rewards/rejected": 0.7295913696289062, + "step": 268 + }, + { + "epoch": 0.95, + "learning_rate": 7.998818579245182e-08, + "logits/chosen": -2.012517213821411, + "logits/rejected": -2.0222887992858887, + "logps/chosen": -2.683431625366211, + "logps/rejected": -5.2774505615234375, + "loss": 0.6518, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9051965475082397, + "rewards/margins": 0.17432913184165955, + "rewards/rejected": 0.7308673858642578, + "step": 269 + }, + { + "epoch": 0.95, + "learning_rate": 7.983524406550299e-08, + "logits/chosen": -2.067622184753418, + "logits/rejected": -2.0707478523254395, + "logps/chosen": -1.7032065391540527, + "logps/rejected": -2.633174419403076, + "loss": 0.6512, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7628061771392822, + "rewards/margins": 0.2551374137401581, + "rewards/rejected": 0.5076687335968018, + "step": 270 + }, + { + "epoch": 0.96, + "learning_rate": 7.968186759901314e-08, + "logits/chosen": -2.1290574073791504, + "logits/rejected": -2.130589485168457, + "logps/chosen": -1.9920811653137207, + "logps/rejected": -10.725828170776367, + "loss": 0.5281, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7926328182220459, + "rewards/margins": 0.09855842590332031, + "rewards/rejected": 0.6940743923187256, + "step": 271 + }, + { + "epoch": 0.96, + "learning_rate": 7.95280586278832e-08, + "logits/chosen": -1.9788604974746704, + "logits/rejected": -1.9758750200271606, + "logps/chosen": -10.761131286621094, + "logps/rejected": -6.1180195808410645, + "loss": 0.6884, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8441171646118164, + "rewards/margins": 0.2913113832473755, + "rewards/rejected": 0.5528057217597961, + "step": 272 + }, + { + "epoch": 0.96, + "learning_rate": 7.937381939331628e-08, + "logits/chosen": -2.01127290725708, + "logits/rejected": -2.03305983543396, + "logps/chosen": -2.347101926803589, + "logps/rejected": -11.549613952636719, + "loss": 0.6458, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7844542264938354, + "rewards/margins": 0.49644985795021057, + "rewards/rejected": 0.2880043685436249, + "step": 273 + }, + { + "epoch": 0.97, + "learning_rate": 7.921915214278498e-08, + "logits/chosen": -2.0273547172546387, + "logits/rejected": -2.0568928718566895, + "logps/chosen": -2.918302059173584, + "logps/rejected": -10.682862281799316, + "loss": 0.5683, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8606523275375366, + "rewards/margins": 0.1806512475013733, + "rewards/rejected": 0.6800010800361633, + "step": 274 + }, + { + "epoch": 0.97, + "learning_rate": 7.90640591299987e-08, + "logits/chosen": -2.0346760749816895, + "logits/rejected": -2.033754348754883, + "logps/chosen": -1.4718921184539795, + "logps/rejected": -3.0727133750915527, + "loss": 0.692, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7706084251403809, + "rewards/margins": 0.17165224254131317, + "rewards/rejected": 0.5989562273025513, + "step": 275 + }, + { + "epoch": 0.98, + "learning_rate": 7.890854261487073e-08, + "logits/chosen": -2.0572245121002197, + "logits/rejected": -2.062751531600952, + "logps/chosen": -4.085022926330566, + "logps/rejected": -2.854490280151367, + "loss": 0.7191, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8339813947677612, + "rewards/margins": 0.3691835403442383, + "rewards/rejected": 0.46479785442352295, + "step": 276 + }, + { + "epoch": 0.98, + "learning_rate": 7.875260486348541e-08, + "logits/chosen": -2.0174288749694824, + "logits/rejected": -2.0146255493164062, + "logps/chosen": -1.1995835304260254, + "logps/rejected": -8.963883399963379, + "loss": 0.6725, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.6726590394973755, + "rewards/margins": -0.17038249969482422, + "rewards/rejected": 0.8430415391921997, + "step": 277 + }, + { + "epoch": 0.98, + "learning_rate": 7.8596248148065e-08, + "logits/chosen": -2.018212080001831, + "logits/rejected": -2.0166549682617188, + "logps/chosen": -0.7451326251029968, + "logps/rejected": -7.534337043762207, + "loss": 0.7196, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7188433408737183, + "rewards/margins": -0.1244419515132904, + "rewards/rejected": 0.8432852625846863, + "step": 278 + }, + { + "epoch": 0.99, + "learning_rate": 7.843947474693665e-08, + "logits/chosen": -1.9857819080352783, + "logits/rejected": -1.98786199092865, + "logps/chosen": -1.7179996967315674, + "logps/rejected": -9.278009414672852, + "loss": 0.7478, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6938050985336304, + "rewards/margins": -0.21930626034736633, + "rewards/rejected": 0.9131113290786743, + "step": 279 + }, + { + "epoch": 0.99, + "learning_rate": 7.828228694449919e-08, + "logits/chosen": -1.9977566003799438, + "logits/rejected": -2.000732183456421, + "logps/chosen": -1.713930368423462, + "logps/rejected": -3.284440517425537, + "loss": 0.5314, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8067037463188171, + "rewards/margins": 0.38464850187301636, + "rewards/rejected": 0.4220552444458008, + "step": 280 + }, + { + "epoch": 0.99, + "learning_rate": 7.812468703118984e-08, + "logits/chosen": -2.068929433822632, + "logits/rejected": -2.073986530303955, + "logps/chosen": -2.070213556289673, + "logps/rejected": -3.0810914039611816, + "loss": 0.7429, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8320354223251343, + "rewards/margins": 0.18829843401908875, + "rewards/rejected": 0.6437369585037231, + "step": 281 + }, + { + "epoch": 1.0, + "learning_rate": 7.796667730345082e-08, + "logits/chosen": -1.984861969947815, + "logits/rejected": -2.000248432159424, + "logps/chosen": -1.4982824325561523, + "logps/rejected": -12.7310791015625, + "loss": 0.5995, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8827951550483704, + "rewards/margins": 0.07727503776550293, + "rewards/rejected": 0.8055201172828674, + "step": 282 + }, + { + "epoch": 1.0, + "learning_rate": 7.780826006369585e-08, + "logits/chosen": -1.9896522760391235, + "logits/rejected": -1.9915542602539062, + "logps/chosen": -3.7270267009735107, + "logps/rejected": -5.191914081573486, + "loss": 0.6166, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8569517135620117, + "rewards/margins": 0.30243414640426636, + "rewards/rejected": 0.5545175671577454, + "step": 283 + }, + { + "epoch": 1.0, + "learning_rate": 7.764943762027674e-08, + "logits/chosen": -2.054262638092041, + "logits/rejected": -2.055432081222534, + "logps/chosen": -0.6730564832687378, + "logps/rejected": -8.582691192626953, + "loss": 0.7044, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.870444655418396, + "rewards/margins": 0.23393258452415466, + "rewards/rejected": 0.636512041091919, + "step": 284 + }, + { + "epoch": 1.01, + "learning_rate": 7.749021228744958e-08, + "logits/chosen": -1.998663067817688, + "logits/rejected": -1.9903998374938965, + "logps/chosen": -9.386507034301758, + "logps/rejected": -2.5542211532592773, + "loss": 0.6934, + "rewards/accuracies": 0.5, + "rewards/chosen": 1.0357084274291992, + "rewards/margins": 0.40424999594688416, + "rewards/rejected": 0.6314584612846375, + "step": 285 + }, + { + "epoch": 1.01, + "learning_rate": 7.733058638534112e-08, + "logits/chosen": -2.084587574005127, + "logits/rejected": -2.0872364044189453, + "logps/chosen": -7.870680809020996, + "logps/rejected": -3.500558853149414, + "loss": 0.5876, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.077088713645935, + "rewards/margins": 0.5478217601776123, + "rewards/rejected": 0.5292670130729675, + "step": 286 + }, + { + "epoch": 1.01, + "learning_rate": 7.717056223991498e-08, + "logits/chosen": -1.9853637218475342, + "logits/rejected": -1.9889427423477173, + "logps/chosen": -3.088313579559326, + "logps/rejected": -2.9851245880126953, + "loss": 0.6628, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.903223991394043, + "rewards/margins": 0.2362876832485199, + "rewards/rejected": 0.6669362783432007, + "step": 287 + }, + { + "epoch": 1.02, + "learning_rate": 7.701014218293767e-08, + "logits/chosen": -1.942586064338684, + "logits/rejected": -1.933422565460205, + "logps/chosen": -2.249345064163208, + "logps/rejected": -2.4885051250457764, + "loss": 0.7481, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6005504131317139, + "rewards/margins": -0.37595248222351074, + "rewards/rejected": 0.9765028953552246, + "step": 288 + }, + { + "epoch": 1.02, + "learning_rate": 7.68493285519447e-08, + "logits/chosen": -2.0965819358825684, + "logits/rejected": -2.1080195903778076, + "logps/chosen": -2.6352999210357666, + "logps/rejected": -5.330697059631348, + "loss": 0.5838, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9015946388244629, + "rewards/margins": 0.4865310788154602, + "rewards/rejected": 0.41506361961364746, + "step": 289 + }, + { + "epoch": 1.02, + "learning_rate": 7.668812369020644e-08, + "logits/chosen": -2.0958337783813477, + "logits/rejected": -2.0768799781799316, + "logps/chosen": -2.8491268157958984, + "logps/rejected": -10.017556190490723, + "loss": 0.6958, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7864257097244263, + "rewards/margins": -0.09470182657241821, + "rewards/rejected": 0.8811274766921997, + "step": 290 + }, + { + "epoch": 1.03, + "learning_rate": 7.652652994669407e-08, + "logits/chosen": -2.0162949562072754, + "logits/rejected": -2.0210070610046387, + "logps/chosen": -4.678500652313232, + "logps/rejected": -12.152202606201172, + "loss": 0.692, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7361276149749756, + "rewards/margins": -0.08063369989395142, + "rewards/rejected": 0.816761314868927, + "step": 291 + }, + { + "epoch": 1.03, + "learning_rate": 7.636454967604523e-08, + "logits/chosen": -1.9910411834716797, + "logits/rejected": -1.9923514127731323, + "logps/chosen": -1.4496328830718994, + "logps/rejected": -8.933928489685059, + "loss": 0.5734, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7971479296684265, + "rewards/margins": -0.058545202016830444, + "rewards/rejected": 0.8556931018829346, + "step": 292 + }, + { + "epoch": 1.04, + "learning_rate": 7.620218523852986e-08, + "logits/chosen": -1.9570904970169067, + "logits/rejected": -1.9704766273498535, + "logps/chosen": -3.1132149696350098, + "logps/rejected": -15.074378967285156, + "loss": 0.6463, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8294650316238403, + "rewards/margins": -0.15822634100914001, + "rewards/rejected": 0.987691342830658, + "step": 293 + }, + { + "epoch": 1.04, + "learning_rate": 7.603943900001566e-08, + "logits/chosen": -2.0177698135375977, + "logits/rejected": -2.016324758529663, + "logps/chosen": -1.5496196746826172, + "logps/rejected": -8.673208236694336, + "loss": 0.6696, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7908488512039185, + "rewards/margins": 0.1824064552783966, + "rewards/rejected": 0.6084424257278442, + "step": 294 + }, + { + "epoch": 1.04, + "learning_rate": 7.587631333193372e-08, + "logits/chosen": -2.0039305686950684, + "logits/rejected": -1.9959478378295898, + "logps/chosen": -9.702101707458496, + "logps/rejected": -2.0601956844329834, + "loss": 0.6737, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8690930604934692, + "rewards/margins": -0.07714074850082397, + "rewards/rejected": 0.946233868598938, + "step": 295 + }, + { + "epoch": 1.05, + "learning_rate": 7.571281061124393e-08, + "logits/chosen": -1.9353293180465698, + "logits/rejected": -1.9348620176315308, + "logps/chosen": -1.9172816276550293, + "logps/rejected": -3.3404147624969482, + "loss": 0.7157, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8089134693145752, + "rewards/margins": -0.04740190505981445, + "rewards/rejected": 0.8563153743743896, + "step": 296 + }, + { + "epoch": 1.05, + "learning_rate": 7.554893322040031e-08, + "logits/chosen": -1.991294026374817, + "logits/rejected": -1.9886553287506104, + "logps/chosen": -3.2436957359313965, + "logps/rejected": -1.4870834350585938, + "loss": 0.7519, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.599423885345459, + "rewards/margins": -0.22191965579986572, + "rewards/rejected": 0.8213435411453247, + "step": 297 + }, + { + "epoch": 1.05, + "learning_rate": 7.538468354731636e-08, + "logits/chosen": -1.962701439857483, + "logits/rejected": -1.9637813568115234, + "logps/chosen": -0.7486076354980469, + "logps/rejected": -2.967113494873047, + "loss": 0.7196, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7177558541297913, + "rewards/margins": 0.03695139288902283, + "rewards/rejected": 0.6808044910430908, + "step": 298 + }, + { + "epoch": 1.06, + "learning_rate": 7.522006398533021e-08, + "logits/chosen": -1.9786303043365479, + "logits/rejected": -1.9869661331176758, + "logps/chosen": -2.6877264976501465, + "logps/rejected": -9.894564628601074, + "loss": 0.679, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7217094898223877, + "rewards/margins": 0.005373239517211914, + "rewards/rejected": 0.7163362503051758, + "step": 299 + }, + { + "epoch": 1.06, + "learning_rate": 7.505507693316976e-08, + "logits/chosen": -1.8982717990875244, + "logits/rejected": -1.9168156385421753, + "logps/chosen": -0.9648281335830688, + "logps/rejected": -13.72926139831543, + "loss": 0.6445, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8843680620193481, + "rewards/margins": 0.12152692675590515, + "rewards/rejected": 0.7628411650657654, + "step": 300 + }, + { + "epoch": 1.06, + "learning_rate": 7.488972479491777e-08, + "logits/chosen": -2.0338046550750732, + "logits/rejected": -2.037698984146118, + "logps/chosen": -3.084056854248047, + "logps/rejected": -2.588416576385498, + "loss": 0.5597, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8710681200027466, + "rewards/margins": 0.2407524138689041, + "rewards/rejected": 0.6303157210350037, + "step": 301 + }, + { + "epoch": 1.07, + "learning_rate": 7.472400997997679e-08, + "logits/chosen": -1.9906104803085327, + "logits/rejected": -1.9838097095489502, + "logps/chosen": -1.721041202545166, + "logps/rejected": -10.674446105957031, + "loss": 0.6558, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8119497299194336, + "rewards/margins": -0.04057621955871582, + "rewards/rejected": 0.8525259494781494, + "step": 302 + }, + { + "epoch": 1.07, + "learning_rate": 7.455793490303402e-08, + "logits/chosen": -1.9903233051300049, + "logits/rejected": -1.9888968467712402, + "logps/chosen": -2.0726757049560547, + "logps/rejected": -3.281386137008667, + "loss": 0.7442, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5871396064758301, + "rewards/margins": -0.15587067604064941, + "rewards/rejected": 0.7430102825164795, + "step": 303 + }, + { + "epoch": 1.07, + "learning_rate": 7.439150198402618e-08, + "logits/chosen": -1.9777837991714478, + "logits/rejected": -1.9916486740112305, + "logps/chosen": -2.847322940826416, + "logps/rejected": -6.55313777923584, + "loss": 0.594, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8119808435440063, + "rewards/margins": 0.03175652027130127, + "rewards/rejected": 0.7802243232727051, + "step": 304 + }, + { + "epoch": 1.08, + "learning_rate": 7.422471364810425e-08, + "logits/chosen": -1.990604043006897, + "logits/rejected": -1.986659049987793, + "logps/chosen": -2.203933000564575, + "logps/rejected": -3.870115041732788, + "loss": 0.6725, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7289800643920898, + "rewards/margins": -0.043648868799209595, + "rewards/rejected": 0.7726289629936218, + "step": 305 + }, + { + "epoch": 1.08, + "learning_rate": 7.405757232559807e-08, + "logits/chosen": -2.043447732925415, + "logits/rejected": -2.046013832092285, + "logps/chosen": -2.3495230674743652, + "logps/rejected": -2.637486219406128, + "loss": 0.6138, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7096340656280518, + "rewards/margins": -0.04712647199630737, + "rewards/rejected": 0.7567605376243591, + "step": 306 + }, + { + "epoch": 1.08, + "learning_rate": 7.389008045198102e-08, + "logits/chosen": -2.0475051403045654, + "logits/rejected": -2.052319288253784, + "logps/chosen": -1.213356614112854, + "logps/rejected": -11.598196029663086, + "loss": 0.7479, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8639934062957764, + "rewards/margins": 0.08156710863113403, + "rewards/rejected": 0.7824262976646423, + "step": 307 + }, + { + "epoch": 1.09, + "learning_rate": 7.37222404678344e-08, + "logits/chosen": -2.014669895172119, + "logits/rejected": -2.0189287662506104, + "logps/chosen": -11.201423645019531, + "logps/rejected": -3.7446539402008057, + "loss": 0.6133, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0889314413070679, + "rewards/margins": 0.4146066904067993, + "rewards/rejected": 0.6743247509002686, + "step": 308 + }, + { + "epoch": 1.09, + "learning_rate": 7.355405481881205e-08, + "logits/chosen": -1.9965211153030396, + "logits/rejected": -1.996758222579956, + "logps/chosen": -1.13792085647583, + "logps/rejected": -3.0558927059173584, + "loss": 0.6725, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7382439970970154, + "rewards/margins": 0.0466332733631134, + "rewards/rejected": 0.6916106939315796, + "step": 309 + }, + { + "epoch": 1.1, + "learning_rate": 7.338552595560455e-08, + "logits/chosen": -2.07747745513916, + "logits/rejected": -2.0792622566223145, + "logps/chosen": -1.2983989715576172, + "logps/rejected": -3.046955108642578, + "loss": 0.7437, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7616469860076904, + "rewards/margins": 0.29341256618499756, + "rewards/rejected": 0.4682343900203705, + "step": 310 + }, + { + "epoch": 1.1, + "learning_rate": 7.321665633390355e-08, + "logits/chosen": -2.0619046688079834, + "logits/rejected": -2.0734477043151855, + "logps/chosen": -2.2710649967193604, + "logps/rejected": -14.841316223144531, + "loss": 0.7032, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8230923414230347, + "rewards/margins": 0.08683857321739197, + "rewards/rejected": 0.7362537980079651, + "step": 311 + }, + { + "epoch": 1.1, + "learning_rate": 7.304744841436606e-08, + "logits/chosen": -2.0181169509887695, + "logits/rejected": -2.022831678390503, + "logps/chosen": -1.9184941053390503, + "logps/rejected": -9.570780754089355, + "loss": 0.6214, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8060866594314575, + "rewards/margins": 0.2242881953716278, + "rewards/rejected": 0.5817984342575073, + "step": 312 + }, + { + "epoch": 1.11, + "learning_rate": 7.287790466257852e-08, + "logits/chosen": -1.993025779724121, + "logits/rejected": -1.9937193393707275, + "logps/chosen": -1.3965160846710205, + "logps/rejected": -2.339536666870117, + "loss": 0.6416, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7527121305465698, + "rewards/margins": 0.18835368752479553, + "rewards/rejected": 0.5643584728240967, + "step": 313 + }, + { + "epoch": 1.11, + "learning_rate": 7.27080275490209e-08, + "logits/chosen": -2.0876731872558594, + "logits/rejected": -2.0893948078155518, + "logps/chosen": -1.0443552732467651, + "logps/rejected": -2.0564448833465576, + "loss": 0.58, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6473686099052429, + "rewards/margins": -0.02189585566520691, + "rewards/rejected": 0.6692644357681274, + "step": 314 + }, + { + "epoch": 1.11, + "learning_rate": 7.253781954903072e-08, + "logits/chosen": -2.0055127143859863, + "logits/rejected": -2.015591621398926, + "logps/chosen": -0.8843281269073486, + "logps/rejected": -10.02095890045166, + "loss": 0.788, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7771568298339844, + "rewards/margins": -0.055948227643966675, + "rewards/rejected": 0.8331050276756287, + "step": 315 + }, + { + "epoch": 1.12, + "learning_rate": 7.236728314276691e-08, + "logits/chosen": -1.9985418319702148, + "logits/rejected": -1.9976329803466797, + "logps/chosen": -4.562763214111328, + "logps/rejected": -20.644779205322266, + "loss": 0.7198, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7205911874771118, + "rewards/margins": -0.23204004764556885, + "rewards/rejected": 0.9526312351226807, + "step": 316 + }, + { + "epoch": 1.12, + "learning_rate": 7.219642081517373e-08, + "logits/chosen": -2.0287413597106934, + "logits/rejected": -2.036536455154419, + "logps/chosen": -2.6645054817199707, + "logps/rejected": -3.0267491340637207, + "loss": 0.6091, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0952333211898804, + "rewards/margins": 0.614410400390625, + "rewards/rejected": 0.48082292079925537, + "step": 317 + }, + { + "epoch": 1.12, + "learning_rate": 7.20252350559446e-08, + "logits/chosen": -2.0793330669403076, + "logits/rejected": -2.130396842956543, + "logps/chosen": -6.362734317779541, + "logps/rejected": -12.263890266418457, + "loss": 0.6155, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.9979767799377441, + "rewards/margins": 0.07183092832565308, + "rewards/rejected": 0.9261458516120911, + "step": 318 + }, + { + "epoch": 1.13, + "learning_rate": 7.185372835948573e-08, + "logits/chosen": -2.0291504859924316, + "logits/rejected": -2.024451494216919, + "logps/chosen": -3.1881606578826904, + "logps/rejected": -10.596033096313477, + "loss": 0.6645, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7987775802612305, + "rewards/margins": 0.0586593896150589, + "rewards/rejected": 0.740118145942688, + "step": 319 + }, + { + "epoch": 1.13, + "learning_rate": 7.168190322487981e-08, + "logits/chosen": -2.0316481590270996, + "logits/rejected": -2.0340769290924072, + "logps/chosen": -2.8218655586242676, + "logps/rejected": -2.17043399810791, + "loss": 0.6446, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0784660577774048, + "rewards/margins": 0.517170786857605, + "rewards/rejected": 0.5612952709197998, + "step": 320 + }, + { + "epoch": 1.13, + "learning_rate": 7.150976215584966e-08, + "logits/chosen": -2.003220796585083, + "logits/rejected": -2.0043435096740723, + "logps/chosen": -2.87321400642395, + "logps/rejected": -2.5274009704589844, + "loss": 0.6375, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8495907187461853, + "rewards/margins": 0.29277509450912476, + "rewards/rejected": 0.5568156242370605, + "step": 321 + }, + { + "epoch": 1.14, + "learning_rate": 7.133730766072162e-08, + "logits/chosen": -2.0633156299591064, + "logits/rejected": -2.0692267417907715, + "logps/chosen": -9.09179973602295, + "logps/rejected": -9.490880012512207, + "loss": 0.6555, + "rewards/accuracies": 0.5, + "rewards/chosen": 1.124406337738037, + "rewards/margins": 0.2503308653831482, + "rewards/rejected": 0.8740755319595337, + "step": 322 + }, + { + "epoch": 1.14, + "learning_rate": 7.116454225238908e-08, + "logits/chosen": -2.106783390045166, + "logits/rejected": -2.1122419834136963, + "logps/chosen": -0.8738927841186523, + "logps/rejected": -3.512641191482544, + "loss": 0.6758, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8305176496505737, + "rewards/margins": 0.33088815212249756, + "rewards/rejected": 0.49962949752807617, + "step": 323 + }, + { + "epoch": 1.14, + "learning_rate": 7.09914684482759e-08, + "logits/chosen": -2.119508981704712, + "logits/rejected": -2.1236839294433594, + "logps/chosen": -1.8333892822265625, + "logps/rejected": -2.865098476409912, + "loss": 0.5803, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8094558119773865, + "rewards/margins": 0.20902101695537567, + "rewards/rejected": 0.6004347801208496, + "step": 324 + }, + { + "epoch": 1.15, + "learning_rate": 7.081808877029962e-08, + "logits/chosen": -2.0140633583068848, + "logits/rejected": -2.0084357261657715, + "logps/chosen": -1.2889963388442993, + "logps/rejected": -9.432709693908691, + "loss": 0.6615, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8367033004760742, + "rewards/margins": -0.0639161467552185, + "rewards/rejected": 0.900619387626648, + "step": 325 + }, + { + "epoch": 1.15, + "learning_rate": 7.064440574483482e-08, + "logits/chosen": -2.0796027183532715, + "logits/rejected": -2.0861761569976807, + "logps/chosen": -1.0849804878234863, + "logps/rejected": -12.56739330291748, + "loss": 0.6917, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8204231858253479, + "rewards/margins": -0.10428822040557861, + "rewards/rejected": 0.9247114062309265, + "step": 326 + }, + { + "epoch": 1.16, + "learning_rate": 7.047042190267624e-08, + "logits/chosen": -2.047593355178833, + "logits/rejected": -2.050189971923828, + "logps/chosen": -3.5930817127227783, + "logps/rejected": -10.398880004882812, + "loss": 0.8824, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8630772829055786, + "rewards/margins": 0.20445239543914795, + "rewards/rejected": 0.6586248874664307, + "step": 327 + }, + { + "epoch": 1.16, + "learning_rate": 7.02961397790019e-08, + "logits/chosen": -2.0194778442382812, + "logits/rejected": -2.0251212120056152, + "logps/chosen": -0.9596530199050903, + "logps/rejected": -13.002801895141602, + "loss": 0.5491, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7678300738334656, + "rewards/margins": -0.21036159992218018, + "rewards/rejected": 0.9781916737556458, + "step": 328 + }, + { + "epoch": 1.16, + "learning_rate": 7.012156191333624e-08, + "logits/chosen": -2.038062572479248, + "logits/rejected": -2.0394537448883057, + "logps/chosen": -8.818267822265625, + "logps/rejected": -7.720042705535889, + "loss": 0.5583, + "rewards/accuracies": 0.5, + "rewards/chosen": 1.1968612670898438, + "rewards/margins": 0.3386574685573578, + "rewards/rejected": 0.8582038283348083, + "step": 329 + }, + { + "epoch": 1.17, + "learning_rate": 6.994669084951302e-08, + "logits/chosen": -2.0249743461608887, + "logits/rejected": -2.027527093887329, + "logps/chosen": -5.559605598449707, + "logps/rejected": -13.58544921875, + "loss": 0.7514, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.6256029605865479, + "rewards/margins": -0.07106775045394897, + "rewards/rejected": 0.696670651435852, + "step": 330 + }, + { + "epoch": 1.17, + "learning_rate": 6.977152913563824e-08, + "logits/chosen": -2.021636724472046, + "logits/rejected": -2.080385208129883, + "logps/chosen": -3.5473177433013916, + "logps/rejected": -7.292891025543213, + "loss": 0.6547, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9151884317398071, + "rewards/margins": 0.2896645665168762, + "rewards/rejected": 0.6255238652229309, + "step": 331 + }, + { + "epoch": 1.17, + "learning_rate": 6.95960793240532e-08, + "logits/chosen": -2.0089399814605713, + "logits/rejected": -2.087005615234375, + "logps/chosen": -0.9165514707565308, + "logps/rejected": -25.28814697265625, + "loss": 0.6437, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.752465009689331, + "rewards/margins": -0.1517137885093689, + "rewards/rejected": 0.9041788578033447, + "step": 332 + }, + { + "epoch": 1.18, + "learning_rate": 6.942034397129702e-08, + "logits/chosen": -1.9609380960464478, + "logits/rejected": -1.9617955684661865, + "logps/chosen": -1.6671147346496582, + "logps/rejected": -3.519850492477417, + "loss": 0.5759, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7335266470909119, + "rewards/margins": -0.03835830092430115, + "rewards/rejected": 0.7718849182128906, + "step": 333 + }, + { + "epoch": 1.18, + "learning_rate": 6.924432563806961e-08, + "logits/chosen": -2.0079450607299805, + "logits/rejected": -2.018519401550293, + "logps/chosen": -0.8546421527862549, + "logps/rejected": -9.57946491241455, + "loss": 0.6843, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7535923719406128, + "rewards/margins": -0.182002991437912, + "rewards/rejected": 0.9355953931808472, + "step": 334 + }, + { + "epoch": 1.18, + "learning_rate": 6.906802688919433e-08, + "logits/chosen": -1.9951661825180054, + "logits/rejected": -2.003023147583008, + "logps/chosen": -1.3854504823684692, + "logps/rejected": -7.042419910430908, + "loss": 0.4681, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8765751123428345, + "rewards/margins": 0.21594572067260742, + "rewards/rejected": 0.660629391670227, + "step": 335 + }, + { + "epoch": 1.19, + "learning_rate": 6.889145029358045e-08, + "logits/chosen": -2.0105044841766357, + "logits/rejected": -2.0123848915100098, + "logps/chosen": -1.3919436931610107, + "logps/rejected": -2.762155055999756, + "loss": 0.7479, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8933443427085876, + "rewards/margins": 0.24052664637565613, + "rewards/rejected": 0.6528177261352539, + "step": 336 + }, + { + "epoch": 1.19, + "learning_rate": 6.871459842418595e-08, + "logits/chosen": -2.0307400226593018, + "logits/rejected": -2.028308868408203, + "logps/chosen": -1.9297596216201782, + "logps/rejected": -3.9943933486938477, + "loss": 0.7873, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8276687860488892, + "rewards/margins": 0.20637744665145874, + "rewards/rejected": 0.6212913990020752, + "step": 337 + }, + { + "epoch": 1.19, + "learning_rate": 6.85374738579799e-08, + "logits/chosen": -2.0564050674438477, + "logits/rejected": -2.058974027633667, + "logps/chosen": -3.481093645095825, + "logps/rejected": -4.131351470947266, + "loss": 0.5572, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8785395622253418, + "rewards/margins": 0.3046349585056305, + "rewards/rejected": 0.5739045739173889, + "step": 338 + }, + { + "epoch": 1.2, + "learning_rate": 6.836007917590486e-08, + "logits/chosen": -2.0048842430114746, + "logits/rejected": -2.023575782775879, + "logps/chosen": -3.144120931625366, + "logps/rejected": -19.613452911376953, + "loss": 0.6436, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8191253542900085, + "rewards/margins": -0.07839921116828918, + "rewards/rejected": 0.8975245952606201, + "step": 339 + }, + { + "epoch": 1.2, + "learning_rate": 6.818241696283942e-08, + "logits/chosen": -2.0500543117523193, + "logits/rejected": -2.046323537826538, + "logps/chosen": -2.983139991760254, + "logps/rejected": -7.968719482421875, + "loss": 0.6975, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.656994104385376, + "rewards/margins": -0.29255202412605286, + "rewards/rejected": 0.9495460987091064, + "step": 340 + }, + { + "epoch": 1.2, + "learning_rate": 6.800448980756042e-08, + "logits/chosen": -2.0920300483703613, + "logits/rejected": -2.0917718410491943, + "logps/chosen": -4.075112342834473, + "logps/rejected": -2.7630691528320312, + "loss": 0.6884, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7794432640075684, + "rewards/margins": -0.06675291061401367, + "rewards/rejected": 0.846196174621582, + "step": 341 + }, + { + "epoch": 1.21, + "learning_rate": 6.782630030270524e-08, + "logits/chosen": -2.014333486557007, + "logits/rejected": -2.0250113010406494, + "logps/chosen": -1.4475176334381104, + "logps/rejected": -7.472857475280762, + "loss": 0.5526, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8901727199554443, + "rewards/margins": 0.18523412942886353, + "rewards/rejected": 0.704938530921936, + "step": 342 + }, + { + "epoch": 1.21, + "learning_rate": 6.76478510447341e-08, + "logits/chosen": -2.0116617679595947, + "logits/rejected": -2.0141403675079346, + "logps/chosen": -9.081931114196777, + "logps/rejected": -12.02839469909668, + "loss": 0.658, + "rewards/accuracies": 0.5, + "rewards/chosen": 1.0915391445159912, + "rewards/margins": 0.2879742980003357, + "rewards/rejected": 0.8035649061203003, + "step": 343 + }, + { + "epoch": 1.22, + "learning_rate": 6.746914463389215e-08, + "logits/chosen": -2.0077171325683594, + "logits/rejected": -2.0130529403686523, + "logps/chosen": -4.215682029724121, + "logps/rejected": -3.2171761989593506, + "loss": 0.7523, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8552757501602173, + "rewards/margins": 0.21854069828987122, + "rewards/rejected": 0.6367350220680237, + "step": 344 + }, + { + "epoch": 1.22, + "learning_rate": 6.729018367417158e-08, + "logits/chosen": -1.9797139167785645, + "logits/rejected": -1.9750332832336426, + "logps/chosen": -1.0432771444320679, + "logps/rejected": -6.228484153747559, + "loss": 0.7212, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7189686298370361, + "rewards/margins": -0.09437081217765808, + "rewards/rejected": 0.8133394122123718, + "step": 345 + }, + { + "epoch": 1.22, + "learning_rate": 6.711097077327372e-08, + "logits/chosen": -1.9801101684570312, + "logits/rejected": -1.9910528659820557, + "logps/chosen": -1.9608089923858643, + "logps/rejected": -6.921878337860107, + "loss": 0.6808, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7725261449813843, + "rewards/margins": 0.11844681203365326, + "rewards/rejected": 0.6540793180465698, + "step": 346 + }, + { + "epoch": 1.23, + "learning_rate": 6.693150854257101e-08, + "logits/chosen": -2.113471746444702, + "logits/rejected": -2.1154465675354004, + "logps/chosen": -1.6178151369094849, + "logps/rejected": -8.522905349731445, + "loss": 0.6627, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8333930969238281, + "rewards/margins": 0.24293042719364166, + "rewards/rejected": 0.5904626250267029, + "step": 347 + }, + { + "epoch": 1.23, + "learning_rate": 6.675179959706898e-08, + "logits/chosen": -2.01448392868042, + "logits/rejected": -2.013353109359741, + "logps/chosen": -1.6884121894836426, + "logps/rejected": -12.11439323425293, + "loss": 0.7319, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7572202086448669, + "rewards/margins": -0.09969761967658997, + "rewards/rejected": 0.8569178581237793, + "step": 348 + }, + { + "epoch": 1.23, + "learning_rate": 6.657184655536809e-08, + "logits/chosen": -2.0546939373016357, + "logits/rejected": -2.0614895820617676, + "logps/chosen": -3.0008373260498047, + "logps/rejected": -6.879371643066406, + "loss": 0.6492, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8178433775901794, + "rewards/margins": 0.3057797849178314, + "rewards/rejected": 0.5120636224746704, + "step": 349 + }, + { + "epoch": 1.24, + "learning_rate": 6.639165203962567e-08, + "logits/chosen": -2.0457959175109863, + "logits/rejected": -2.0470223426818848, + "logps/chosen": -1.9715783596038818, + "logps/rejected": -3.0387394428253174, + "loss": 0.6709, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8274080753326416, + "rewards/margins": 0.29440683126449585, + "rewards/rejected": 0.5330012440681458, + "step": 350 + }, + { + "epoch": 1.24, + "learning_rate": 6.621121867551758e-08, + "logits/chosen": -2.088317394256592, + "logits/rejected": -2.081570863723755, + "logps/chosen": -12.680303573608398, + "logps/rejected": -8.495530128479004, + "loss": 0.7074, + "rewards/accuracies": 0.5, + "rewards/chosen": 1.0892140865325928, + "rewards/margins": 0.22910630702972412, + "rewards/rejected": 0.8601077198982239, + "step": 351 + }, + { + "epoch": 1.24, + "learning_rate": 6.603054909220004e-08, + "logits/chosen": -2.002457618713379, + "logits/rejected": -1.9931049346923828, + "logps/chosen": -3.5758438110351562, + "logps/rejected": -7.837817192077637, + "loss": 0.5831, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8774646520614624, + "rewards/margins": 0.16983163356781006, + "rewards/rejected": 0.7076330184936523, + "step": 352 + }, + { + "epoch": 1.25, + "learning_rate": 6.584964592227134e-08, + "logits/chosen": -2.016345977783203, + "logits/rejected": -2.015364170074463, + "logps/chosen": -9.993803024291992, + "logps/rejected": -4.708014011383057, + "loss": 0.6916, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8112318515777588, + "rewards/margins": 0.21024617552757263, + "rewards/rejected": 0.6009856462478638, + "step": 353 + }, + { + "epoch": 1.25, + "learning_rate": 6.566851180173343e-08, + "logits/chosen": -2.0911872386932373, + "logits/rejected": -2.0965731143951416, + "logps/chosen": -3.4223368167877197, + "logps/rejected": -8.75579833984375, + "loss": 0.7093, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.6984277963638306, + "rewards/margins": -0.06073221564292908, + "rewards/rejected": 0.759160041809082, + "step": 354 + }, + { + "epoch": 1.25, + "learning_rate": 6.548714936995345e-08, + "logits/chosen": -2.0265090465545654, + "logits/rejected": -2.0358331203460693, + "logps/chosen": -10.39234447479248, + "logps/rejected": -4.398682594299316, + "loss": 0.6531, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.102057933807373, + "rewards/margins": 0.48081153631210327, + "rewards/rejected": 0.621246337890625, + "step": 355 + }, + { + "epoch": 1.26, + "learning_rate": 6.530556126962545e-08, + "logits/chosen": -1.9719972610473633, + "logits/rejected": -1.9776127338409424, + "logps/chosen": -1.4761799573898315, + "logps/rejected": -20.213436126708984, + "loss": 0.7115, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8310404419898987, + "rewards/margins": -0.27317073941230774, + "rewards/rejected": 1.1042112112045288, + "step": 356 + }, + { + "epoch": 1.26, + "learning_rate": 6.512375014673169e-08, + "logits/chosen": -2.017918825149536, + "logits/rejected": -2.017789602279663, + "logps/chosen": -0.5777454972267151, + "logps/rejected": -3.480546236038208, + "loss": 0.6393, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6879342794418335, + "rewards/margins": -0.15035218000411987, + "rewards/rejected": 0.8382864594459534, + "step": 357 + }, + { + "epoch": 1.27, + "learning_rate": 6.49417186505042e-08, + "logits/chosen": -1.9893862009048462, + "logits/rejected": -1.9894499778747559, + "logps/chosen": -5.3689093589782715, + "logps/rejected": -4.438340187072754, + "loss": 0.6843, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7023330330848694, + "rewards/margins": -0.10123127698898315, + "rewards/rejected": 0.8035643100738525, + "step": 358 + }, + { + "epoch": 1.27, + "learning_rate": 6.475946943338615e-08, + "logits/chosen": -2.0354816913604736, + "logits/rejected": -2.048110008239746, + "logps/chosen": -4.6014862060546875, + "logps/rejected": -11.300627708435059, + "loss": 0.6453, + "rewards/accuracies": 0.5, + "rewards/chosen": 1.0715820789337158, + "rewards/margins": 0.15998730063438416, + "rewards/rejected": 0.911594808101654, + "step": 359 + }, + { + "epoch": 1.27, + "learning_rate": 6.457700515099319e-08, + "logits/chosen": -2.004896640777588, + "logits/rejected": -2.0091536045074463, + "logps/chosen": -1.3737800121307373, + "logps/rejected": -3.973348379135132, + "loss": 0.5812, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9505419731140137, + "rewards/margins": 0.5136680603027344, + "rewards/rejected": 0.4368739128112793, + "step": 360 + }, + { + "epoch": 1.28, + "learning_rate": 6.439432846207474e-08, + "logits/chosen": -2.101597309112549, + "logits/rejected": -2.104374885559082, + "logps/chosen": -1.7547948360443115, + "logps/rejected": -9.926070213317871, + "loss": 0.6181, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8136626482009888, + "rewards/margins": -0.12628039717674255, + "rewards/rejected": 0.9399430155754089, + "step": 361 + }, + { + "epoch": 1.28, + "learning_rate": 6.421144202847531e-08, + "logits/chosen": -1.9907695055007935, + "logits/rejected": -2.006349802017212, + "logps/chosen": -2.3501222133636475, + "logps/rejected": -11.349722862243652, + "loss": 0.657, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8593698143959045, + "rewards/margins": 0.4874004125595093, + "rewards/rejected": 0.37196940183639526, + "step": 362 + }, + { + "epoch": 1.28, + "learning_rate": 6.402834851509563e-08, + "logits/chosen": -2.066005229949951, + "logits/rejected": -2.0684781074523926, + "logps/chosen": -0.6392884254455566, + "logps/rejected": -2.7017056941986084, + "loss": 0.5904, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8066162467002869, + "rewards/margins": 0.17724893987178802, + "rewards/rejected": 0.6293672919273376, + "step": 363 + }, + { + "epoch": 1.29, + "learning_rate": 6.384505058985388e-08, + "logits/chosen": -2.063748359680176, + "logits/rejected": -2.0555691719055176, + "logps/chosen": -6.753568649291992, + "logps/rejected": -11.110140800476074, + "loss": 0.8, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5571596622467041, + "rewards/margins": -0.6622412204742432, + "rewards/rejected": 1.2194008827209473, + "step": 364 + }, + { + "epoch": 1.29, + "learning_rate": 6.36615509236468e-08, + "logits/chosen": -2.042121171951294, + "logits/rejected": -2.042572498321533, + "logps/chosen": -1.3448045253753662, + "logps/rejected": -9.92697525024414, + "loss": 0.5848, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7974374294281006, + "rewards/margins": -0.14883971214294434, + "rewards/rejected": 0.9462771415710449, + "step": 365 + }, + { + "epoch": 1.29, + "learning_rate": 6.347785219031075e-08, + "logits/chosen": -2.055434226989746, + "logits/rejected": -2.0558159351348877, + "logps/chosen": -4.493001937866211, + "logps/rejected": -2.8312911987304688, + "loss": 0.6433, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.771396815776825, + "rewards/margins": 0.07200047373771667, + "rewards/rejected": 0.6993963718414307, + "step": 366 + }, + { + "epoch": 1.3, + "learning_rate": 6.329395706658277e-08, + "logits/chosen": -2.0754027366638184, + "logits/rejected": -2.0811405181884766, + "logps/chosen": -1.8197733163833618, + "logps/rejected": -9.930107116699219, + "loss": 0.5494, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8541202545166016, + "rewards/margins": -0.00841069221496582, + "rewards/rejected": 0.8625309467315674, + "step": 367 + }, + { + "epoch": 1.3, + "learning_rate": 6.310986823206159e-08, + "logits/chosen": -1.9914188385009766, + "logits/rejected": -1.996675968170166, + "logps/chosen": -3.127906322479248, + "logps/rejected": -4.787808418273926, + "loss": 0.5847, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.9468159079551697, + "rewards/margins": 0.2728981077671051, + "rewards/rejected": 0.6739177703857422, + "step": 368 + }, + { + "epoch": 1.3, + "learning_rate": 6.292558836916855e-08, + "logits/chosen": -1.985142707824707, + "logits/rejected": -1.9867688417434692, + "logps/chosen": -0.5532424449920654, + "logps/rejected": -3.6320300102233887, + "loss": 0.7013, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8197859525680542, + "rewards/margins": 0.16904133558273315, + "rewards/rejected": 0.650744616985321, + "step": 369 + }, + { + "epoch": 1.31, + "learning_rate": 6.274112016310853e-08, + "logits/chosen": -2.083099365234375, + "logits/rejected": -2.087415933609009, + "logps/chosen": -2.7940316200256348, + "logps/rejected": -17.4298152923584, + "loss": 0.7615, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.9723863005638123, + "rewards/margins": 0.3078272044658661, + "rewards/rejected": 0.6645590662956238, + "step": 370 + }, + { + "epoch": 1.31, + "learning_rate": 6.255646630183082e-08, + "logits/chosen": -2.003527879714966, + "logits/rejected": -2.005058526992798, + "logps/chosen": -6.158339023590088, + "logps/rejected": -3.0064477920532227, + "loss": 0.6517, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8236038684844971, + "rewards/margins": 0.32325416803359985, + "rewards/rejected": 0.5003497004508972, + "step": 371 + }, + { + "epoch": 1.31, + "learning_rate": 6.237162947598997e-08, + "logits/chosen": -2.0916638374328613, + "logits/rejected": -2.1023671627044678, + "logps/chosen": -1.4832754135131836, + "logps/rejected": -11.736163139343262, + "loss": 0.7262, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8739482164382935, + "rewards/margins": 0.012841284275054932, + "rewards/rejected": 0.8611069321632385, + "step": 372 + }, + { + "epoch": 1.32, + "learning_rate": 6.218661237890654e-08, + "logits/chosen": -1.9910025596618652, + "logits/rejected": -1.9933462142944336, + "logps/chosen": -2.092916965484619, + "logps/rejected": -4.334254264831543, + "loss": 0.6588, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8042113780975342, + "rewards/margins": 0.2673882246017456, + "rewards/rejected": 0.5368231534957886, + "step": 373 + }, + { + "epoch": 1.32, + "learning_rate": 6.200141770652791e-08, + "logits/chosen": -2.0522944927215576, + "logits/rejected": -2.101686716079712, + "logps/chosen": -0.5202007293701172, + "logps/rejected": -29.437536239624023, + "loss": 0.8345, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6910809278488159, + "rewards/margins": -0.2407466173171997, + "rewards/rejected": 0.9318275451660156, + "step": 374 + }, + { + "epoch": 1.33, + "learning_rate": 6.181604815738898e-08, + "logits/chosen": -2.0675487518310547, + "logits/rejected": -2.067615032196045, + "logps/chosen": -1.455470323562622, + "logps/rejected": -2.8977608680725098, + "loss": 0.6754, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7692759037017822, + "rewards/margins": 0.03943154215812683, + "rewards/rejected": 0.729844331741333, + "step": 375 + }, + { + "epoch": 1.33, + "learning_rate": 6.163050643257282e-08, + "logits/chosen": -2.074638843536377, + "logits/rejected": -2.0781705379486084, + "logps/chosen": -3.0658960342407227, + "logps/rejected": -2.282510280609131, + "loss": 0.6816, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9563910961151123, + "rewards/margins": 0.31985971331596375, + "rewards/rejected": 0.636531412601471, + "step": 376 + }, + { + "epoch": 1.33, + "learning_rate": 6.14447952356713e-08, + "logits/chosen": -2.0369410514831543, + "logits/rejected": -2.0410871505737305, + "logps/chosen": -0.6507952213287354, + "logps/rejected": -8.52114200592041, + "loss": 0.6362, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8076935410499573, + "rewards/margins": 0.21568019688129425, + "rewards/rejected": 0.5920133590698242, + "step": 377 + }, + { + "epoch": 1.34, + "learning_rate": 6.12589172727458e-08, + "logits/chosen": -1.967811107635498, + "logits/rejected": -1.9923427104949951, + "logps/chosen": -0.8467729091644287, + "logps/rejected": -14.60690975189209, + "loss": 0.6561, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7349449396133423, + "rewards/margins": -0.24160286784172058, + "rewards/rejected": 0.9765478372573853, + "step": 378 + }, + { + "epoch": 1.34, + "learning_rate": 6.107287525228763e-08, + "logits/chosen": -2.008897304534912, + "logits/rejected": -2.015631675720215, + "logps/chosen": -0.5811237096786499, + "logps/rejected": -11.158145904541016, + "loss": 0.688, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8838824033737183, + "rewards/margins": -0.1320817768573761, + "rewards/rejected": 1.015964150428772, + "step": 379 + }, + { + "epoch": 1.34, + "learning_rate": 6.088667188517868e-08, + "logits/chosen": -2.0495123863220215, + "logits/rejected": -2.05372953414917, + "logps/chosen": -2.5792179107666016, + "logps/rejected": -16.892070770263672, + "loss": 0.7325, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.812070369720459, + "rewards/margins": -0.2931274473667145, + "rewards/rejected": 1.1051979064941406, + "step": 380 + }, + { + "epoch": 1.35, + "learning_rate": 6.070030988465191e-08, + "logits/chosen": -1.95384681224823, + "logits/rejected": -1.956581473350525, + "logps/chosen": -1.7936793565750122, + "logps/rejected": -2.537358283996582, + "loss": 0.7913, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7542133927345276, + "rewards/margins": 0.010795831680297852, + "rewards/rejected": 0.7434175610542297, + "step": 381 + }, + { + "epoch": 1.35, + "learning_rate": 6.05137919662517e-08, + "logits/chosen": -2.004143476486206, + "logits/rejected": -2.01336932182312, + "logps/chosen": -1.1597856283187866, + "logps/rejected": -7.527113914489746, + "loss": 0.6885, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8296374082565308, + "rewards/margins": 0.1217653751373291, + "rewards/rejected": 0.7078720331192017, + "step": 382 + }, + { + "epoch": 1.35, + "learning_rate": 6.032712084779441e-08, + "logits/chosen": -1.9560608863830566, + "logits/rejected": -1.9544436931610107, + "logps/chosen": -7.474586009979248, + "logps/rejected": -2.8420162200927734, + "loss": 0.676, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1207635402679443, + "rewards/margins": 0.2833377420902252, + "rewards/rejected": 0.8374258279800415, + "step": 383 + }, + { + "epoch": 1.36, + "learning_rate": 6.014029924932873e-08, + "logits/chosen": -1.974664568901062, + "logits/rejected": -1.9764366149902344, + "logps/chosen": -3.1828017234802246, + "logps/rejected": -2.58215069770813, + "loss": 0.6943, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.9323933124542236, + "rewards/margins": -0.027151137590408325, + "rewards/rejected": 0.9595444202423096, + "step": 384 + }, + { + "epoch": 1.36, + "learning_rate": 5.995332989309602e-08, + "logits/chosen": -2.038248300552368, + "logits/rejected": -2.046967029571533, + "logps/chosen": -0.7940558195114136, + "logps/rejected": -5.7094035148620605, + "loss": 0.5493, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8900354504585266, + "rewards/margins": 0.3481293320655823, + "rewards/rejected": 0.5419061183929443, + "step": 385 + }, + { + "epoch": 1.36, + "learning_rate": 5.976621550349071e-08, + "logits/chosen": -2.0146899223327637, + "logits/rejected": -2.021106243133545, + "logps/chosen": -3.23191237449646, + "logps/rejected": -9.587932586669922, + "loss": 0.7214, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8657009601593018, + "rewards/margins": -0.07130968570709229, + "rewards/rejected": 0.937010645866394, + "step": 386 + }, + { + "epoch": 1.37, + "learning_rate": 5.9578958807020554e-08, + "logits/chosen": -2.0858898162841797, + "logits/rejected": -2.086371421813965, + "logps/chosen": -2.0109081268310547, + "logps/rejected": -2.4800777435302734, + "loss": 0.6483, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8801678419113159, + "rewards/margins": 0.22166505455970764, + "rewards/rejected": 0.6585028171539307, + "step": 387 + }, + { + "epoch": 1.37, + "learning_rate": 5.939156253226687e-08, + "logits/chosen": -2.1177899837493896, + "logits/rejected": -2.1130177974700928, + "logps/chosen": -9.557272911071777, + "logps/rejected": -3.5179929733276367, + "loss": 0.6085, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.168700933456421, + "rewards/margins": 0.6386635899543762, + "rewards/rejected": 0.5300373435020447, + "step": 388 + }, + { + "epoch": 1.37, + "learning_rate": 5.9204029409844824e-08, + "logits/chosen": -2.086236000061035, + "logits/rejected": -2.1068339347839355, + "logps/chosen": -2.133889675140381, + "logps/rejected": -14.516474723815918, + "loss": 0.8134, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8743669986724854, + "rewards/margins": -0.025939881801605225, + "rewards/rejected": 0.9003069400787354, + "step": 389 + }, + { + "epoch": 1.38, + "learning_rate": 5.90163621723637e-08, + "logits/chosen": -2.1442902088165283, + "logits/rejected": -2.151904344558716, + "logps/chosen": -4.771976470947266, + "logps/rejected": -13.556497573852539, + "loss": 0.5316, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8777777552604675, + "rewards/margins": 0.2105550467967987, + "rewards/rejected": 0.6672227382659912, + "step": 390 + }, + { + "epoch": 1.38, + "learning_rate": 5.882856355438695e-08, + "logits/chosen": -2.0417559146881104, + "logits/rejected": -2.051753520965576, + "logps/chosen": -2.2076058387756348, + "logps/rejected": -5.952508926391602, + "loss": 0.5869, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.765190601348877, + "rewards/margins": 0.2552322447299957, + "rewards/rejected": 0.5099583268165588, + "step": 391 + }, + { + "epoch": 1.39, + "learning_rate": 5.8640636292392424e-08, + "logits/chosen": -2.033461332321167, + "logits/rejected": -2.0345048904418945, + "logps/chosen": -2.2156107425689697, + "logps/rejected": -7.678490161895752, + "loss": 0.6796, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8413029909133911, + "rewards/margins": 0.23304510116577148, + "rewards/rejected": 0.6082578897476196, + "step": 392 + }, + { + "epoch": 1.39, + "learning_rate": 5.8452583124732514e-08, + "logits/chosen": -1.990741491317749, + "logits/rejected": -1.9932096004486084, + "logps/chosen": -1.3439600467681885, + "logps/rejected": -2.3025319576263428, + "loss": 0.5776, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7662653923034668, + "rewards/margins": 0.21179889142513275, + "rewards/rejected": 0.5544664859771729, + "step": 393 + }, + { + "epoch": 1.39, + "learning_rate": 5.826440679159423e-08, + "logits/chosen": -2.0498859882354736, + "logits/rejected": -2.0465707778930664, + "logps/chosen": -2.1532142162323, + "logps/rejected": -2.3126626014709473, + "loss": 0.6357, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7842508554458618, + "rewards/margins": 0.0017898976802825928, + "rewards/rejected": 0.7824609875679016, + "step": 394 + }, + { + "epoch": 1.4, + "learning_rate": 5.8076110034959245e-08, + "logits/chosen": -1.9656867980957031, + "logits/rejected": -1.9675114154815674, + "logps/chosen": -1.131929874420166, + "logps/rejected": -8.605990409851074, + "loss": 0.6275, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8213405013084412, + "rewards/margins": -0.11401018500328064, + "rewards/rejected": 0.9353506565093994, + "step": 395 + }, + { + "epoch": 1.4, + "learning_rate": 5.7887695598563966e-08, + "logits/chosen": -2.0948901176452637, + "logits/rejected": -2.095322608947754, + "logps/chosen": -1.9180022478103638, + "logps/rejected": -2.6860511302948, + "loss": 0.6378, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8801929950714111, + "rewards/margins": 0.14728784561157227, + "rewards/rejected": 0.7329051494598389, + "step": 396 + }, + { + "epoch": 1.4, + "learning_rate": 5.7699166227859565e-08, + "logits/chosen": -1.9632657766342163, + "logits/rejected": -1.9631896018981934, + "logps/chosen": -1.9615055322647095, + "logps/rejected": -9.54023551940918, + "loss": 0.6737, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8214801549911499, + "rewards/margins": -0.08991578221321106, + "rewards/rejected": 0.9113959074020386, + "step": 397 + }, + { + "epoch": 1.41, + "learning_rate": 5.751052466997195e-08, + "logits/chosen": -2.0406064987182617, + "logits/rejected": -2.046266555786133, + "logps/chosen": -1.5369763374328613, + "logps/rejected": -26.088794708251953, + "loss": 0.6438, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6353905200958252, + "rewards/margins": -0.41096988320350647, + "rewards/rejected": 1.0463604927062988, + "step": 398 + }, + { + "epoch": 1.41, + "learning_rate": 5.732177367366175e-08, + "logits/chosen": -1.9484293460845947, + "logits/rejected": -1.964268445968628, + "logps/chosen": -0.6533129811286926, + "logps/rejected": -12.061738014221191, + "loss": 0.6427, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7399017214775085, + "rewards/margins": 0.0007763803005218506, + "rewards/rejected": 0.7391253709793091, + "step": 399 + }, + { + "epoch": 1.41, + "learning_rate": 5.713291598928428e-08, + "logits/chosen": -2.001999616622925, + "logits/rejected": -2.0025012493133545, + "logps/chosen": -1.3878692388534546, + "logps/rejected": -5.174630165100098, + "loss": 0.7596, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.9341466426849365, + "rewards/margins": 0.36065077781677246, + "rewards/rejected": 0.5734958648681641, + "step": 400 + }, + { + "epoch": 1.42, + "learning_rate": 5.6943954368749416e-08, + "logits/chosen": -2.018983840942383, + "logits/rejected": -2.017455577850342, + "logps/chosen": -3.8306100368499756, + "logps/rejected": -2.0253970623016357, + "loss": 0.6014, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.63642418384552, + "rewards/margins": -0.17646309733390808, + "rewards/rejected": 0.8128873109817505, + "step": 401 + }, + { + "epoch": 1.42, + "learning_rate": 5.67548915654815e-08, + "logits/chosen": -2.0115089416503906, + "logits/rejected": -2.0150723457336426, + "logps/chosen": -0.6334203481674194, + "logps/rejected": -10.044404983520508, + "loss": 0.9006, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.728382408618927, + "rewards/margins": -0.22243118286132812, + "rewards/rejected": 0.9508135914802551, + "step": 402 + }, + { + "epoch": 1.42, + "learning_rate": 5.656573033437931e-08, + "logits/chosen": -1.9869811534881592, + "logits/rejected": -1.9927432537078857, + "logps/chosen": -0.9617888927459717, + "logps/rejected": -7.496981620788574, + "loss": 0.6584, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8220905065536499, + "rewards/margins": 0.19618263840675354, + "rewards/rejected": 0.6259078979492188, + "step": 403 + }, + { + "epoch": 1.43, + "learning_rate": 5.6376473431775796e-08, + "logits/chosen": -2.019702196121216, + "logits/rejected": -2.0211498737335205, + "logps/chosen": -1.3679780960083008, + "logps/rejected": -3.9680991172790527, + "loss": 0.7491, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9101822972297668, + "rewards/margins": 0.1162932813167572, + "rewards/rejected": 0.793889045715332, + "step": 404 + }, + { + "epoch": 1.43, + "learning_rate": 5.618712361539798e-08, + "logits/chosen": -2.0261619091033936, + "logits/rejected": -2.024610757827759, + "logps/chosen": -2.227651834487915, + "logps/rejected": -3.0998640060424805, + "loss": 0.6137, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7950409650802612, + "rewards/margins": -0.00493234395980835, + "rewards/rejected": 0.7999733090400696, + "step": 405 + }, + { + "epoch": 1.43, + "learning_rate": 5.59976836443268e-08, + "logits/chosen": -2.032444715499878, + "logits/rejected": -2.036069869995117, + "logps/chosen": -1.886655330657959, + "logps/rejected": -15.315829277038574, + "loss": 0.7814, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8196829557418823, + "rewards/margins": -0.270522803068161, + "rewards/rejected": 1.0902057886123657, + "step": 406 + }, + { + "epoch": 1.44, + "learning_rate": 5.580815627895681e-08, + "logits/chosen": -2.0597851276397705, + "logits/rejected": -2.06496000289917, + "logps/chosen": -4.349756240844727, + "logps/rejected": -2.429677963256836, + "loss": 0.7288, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8348046541213989, + "rewards/margins": 0.14793911576271057, + "rewards/rejected": 0.686865508556366, + "step": 407 + }, + { + "epoch": 1.44, + "learning_rate": 5.561854428095605e-08, + "logits/chosen": -1.995650053024292, + "logits/rejected": -1.9943383932113647, + "logps/chosen": -3.088792562484741, + "logps/rejected": -4.047159194946289, + "loss": 0.6566, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.644530177116394, + "rewards/margins": 0.10998211801052094, + "rewards/rejected": 0.5345481038093567, + "step": 408 + }, + { + "epoch": 1.45, + "learning_rate": 5.542885041322577e-08, + "logits/chosen": -1.9727768898010254, + "logits/rejected": -1.9728915691375732, + "logps/chosen": -1.1251168251037598, + "logps/rejected": -7.045303821563721, + "loss": 0.54, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7700644731521606, + "rewards/margins": 0.16687844693660736, + "rewards/rejected": 0.6031860113143921, + "step": 409 + }, + { + "epoch": 1.45, + "learning_rate": 5.523907743986016e-08, + "logits/chosen": -2.013444423675537, + "logits/rejected": -2.0129997730255127, + "logps/chosen": -1.1870884895324707, + "logps/rejected": -4.477358341217041, + "loss": 0.5286, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9231430292129517, + "rewards/margins": 0.39418938755989075, + "rewards/rejected": 0.5289536714553833, + "step": 410 + }, + { + "epoch": 1.45, + "learning_rate": 5.50492281261061e-08, + "logits/chosen": -2.0300726890563965, + "logits/rejected": -2.0390405654907227, + "logps/chosen": -1.173054575920105, + "logps/rejected": -9.70925521850586, + "loss": 0.7114, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8322014212608337, + "rewards/margins": 0.07901990413665771, + "rewards/rejected": 0.753181517124176, + "step": 411 + }, + { + "epoch": 1.46, + "learning_rate": 5.485930523832284e-08, + "logits/chosen": -2.0305588245391846, + "logits/rejected": -2.0303754806518555, + "logps/chosen": -0.5971939563751221, + "logps/rejected": -2.9170925617218018, + "loss": 0.5439, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.730309009552002, + "rewards/margins": 0.028870224952697754, + "rewards/rejected": 0.7014387845993042, + "step": 412 + }, + { + "epoch": 1.46, + "learning_rate": 5.466931154394171e-08, + "logits/chosen": -2.0829203128814697, + "logits/rejected": -2.0947201251983643, + "logps/chosen": -1.3012233972549438, + "logps/rejected": -12.576242446899414, + "loss": 0.6811, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.923191249370575, + "rewards/margins": 0.08502048254013062, + "rewards/rejected": 0.8381707668304443, + "step": 413 + }, + { + "epoch": 1.46, + "learning_rate": 5.447924981142578e-08, + "logits/chosen": -1.9698141813278198, + "logits/rejected": -1.9768402576446533, + "logps/chosen": -3.1931772232055664, + "logps/rejected": -1.6348600387573242, + "loss": 0.7086, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9519075155258179, + "rewards/margins": 0.4217282235622406, + "rewards/rejected": 0.5301792621612549, + "step": 414 + }, + { + "epoch": 1.47, + "learning_rate": 5.428912281022953e-08, + "logits/chosen": -2.0226356983184814, + "logits/rejected": -2.015850305557251, + "logps/chosen": -3.7773327827453613, + "logps/rejected": -3.931743860244751, + "loss": 0.6697, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8940503001213074, + "rewards/margins": 0.29493993520736694, + "rewards/rejected": 0.5991103649139404, + "step": 415 + }, + { + "epoch": 1.47, + "learning_rate": 5.40989333107585e-08, + "logits/chosen": -2.0830464363098145, + "logits/rejected": -2.0883870124816895, + "logps/chosen": -2.9541330337524414, + "logps/rejected": -8.39959716796875, + "loss": 0.6428, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.9627987146377563, + "rewards/margins": 0.046957939863204956, + "rewards/rejected": 0.915840744972229, + "step": 416 + }, + { + "epoch": 1.47, + "learning_rate": 5.3908684084328895e-08, + "logits/chosen": -2.056015729904175, + "logits/rejected": -2.058906316757202, + "logps/chosen": -1.6119663715362549, + "logps/rejected": -7.985307693481445, + "loss": 0.649, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.9137560129165649, + "rewards/margins": -0.0638599693775177, + "rewards/rejected": 0.977616012096405, + "step": 417 + }, + { + "epoch": 1.48, + "learning_rate": 5.3718377903127244e-08, + "logits/chosen": -2.0471248626708984, + "logits/rejected": -2.05595064163208, + "logps/chosen": -1.7350432872772217, + "logps/rejected": -11.624783515930176, + "loss": 0.6877, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8319758772850037, + "rewards/margins": 0.06728851795196533, + "rewards/rejected": 0.7646873593330383, + "step": 418 + }, + { + "epoch": 1.48, + "learning_rate": 5.352801754016997e-08, + "logits/chosen": -2.066455125808716, + "logits/rejected": -2.0641016960144043, + "logps/chosen": -2.4753878116607666, + "logps/rejected": -5.8906569480896, + "loss": 0.7157, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8044043183326721, + "rewards/margins": 0.03508728742599487, + "rewards/rejected": 0.7693170309066772, + "step": 419 + }, + { + "epoch": 1.48, + "learning_rate": 5.333760576926301e-08, + "logits/chosen": -2.0131959915161133, + "logits/rejected": -2.0162909030914307, + "logps/chosen": -1.232853651046753, + "logps/rejected": -8.927679061889648, + "loss": 0.7239, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8069596290588379, + "rewards/margins": 0.0809197723865509, + "rewards/rejected": 0.7260398268699646, + "step": 420 + }, + { + "epoch": 1.49, + "learning_rate": 5.314714536496134e-08, + "logits/chosen": -2.0396623611450195, + "logits/rejected": -2.0451724529266357, + "logps/chosen": -2.897946357727051, + "logps/rejected": -9.764772415161133, + "loss": 0.6138, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8587572574615479, + "rewards/margins": 0.09252989292144775, + "rewards/rejected": 0.7662273645401001, + "step": 421 + }, + { + "epoch": 1.49, + "learning_rate": 5.295663910252867e-08, + "logits/chosen": -1.9946132898330688, + "logits/rejected": -1.99668288230896, + "logps/chosen": -1.0110392570495605, + "logps/rejected": -9.305388450622559, + "loss": 0.7036, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7580358982086182, + "rewards/margins": -0.18912026286125183, + "rewards/rejected": 0.9471561908721924, + "step": 422 + }, + { + "epoch": 1.49, + "learning_rate": 5.276608975789683e-08, + "logits/chosen": -1.9605252742767334, + "logits/rejected": -1.9604377746582031, + "logps/chosen": -6.592268943786621, + "logps/rejected": -2.0108957290649414, + "loss": 0.6136, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8195445537567139, + "rewards/margins": 0.13598209619522095, + "rewards/rejected": 0.6835624575614929, + "step": 423 + }, + { + "epoch": 1.5, + "learning_rate": 5.2575500107625495e-08, + "logits/chosen": -2.0877435207366943, + "logits/rejected": -2.091658115386963, + "logps/chosen": -1.3423237800598145, + "logps/rejected": -1.988660216331482, + "loss": 0.728, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8367176055908203, + "rewards/margins": 0.21574562788009644, + "rewards/rejected": 0.6209719181060791, + "step": 424 + }, + { + "epoch": 1.5, + "learning_rate": 5.238487292886161e-08, + "logits/chosen": -2.090667486190796, + "logits/rejected": -2.092423915863037, + "logps/chosen": -2.3906869888305664, + "logps/rejected": -3.2754669189453125, + "loss": 0.5765, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9268951416015625, + "rewards/margins": 0.4201364517211914, + "rewards/rejected": 0.5067586898803711, + "step": 425 + }, + { + "epoch": 1.51, + "learning_rate": 5.219421099929898e-08, + "logits/chosen": -2.069239377975464, + "logits/rejected": -2.0625178813934326, + "logps/chosen": -6.886218547821045, + "logps/rejected": -2.996114730834961, + "loss": 0.7063, + "rewards/accuracies": 0.5, + "rewards/chosen": 1.0266616344451904, + "rewards/margins": 0.37233126163482666, + "rewards/rejected": 0.6543304324150085, + "step": 426 + }, + { + "epoch": 1.51, + "learning_rate": 5.200351709713773e-08, + "logits/chosen": -2.015993356704712, + "logits/rejected": -2.0164990425109863, + "logps/chosen": -3.882021903991699, + "logps/rejected": -3.3565258979797363, + "loss": 0.6346, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5484482645988464, + "rewards/margins": -0.3378588855266571, + "rewards/rejected": 0.8863071203231812, + "step": 427 + }, + { + "epoch": 1.51, + "learning_rate": 5.1812794001043924e-08, + "logits/chosen": -2.0080666542053223, + "logits/rejected": -2.00470232963562, + "logps/chosen": -0.9250922203063965, + "logps/rejected": -3.40871262550354, + "loss": 0.6825, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.835899829864502, + "rewards/margins": 0.14110761880874634, + "rewards/rejected": 0.6947922706604004, + "step": 428 + }, + { + "epoch": 1.52, + "learning_rate": 5.1622044490108984e-08, + "logits/chosen": -1.9762463569641113, + "logits/rejected": -1.9908268451690674, + "logps/chosen": -0.9461663961410522, + "logps/rejected": -12.095985412597656, + "loss": 0.7067, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8437862396240234, + "rewards/margins": 0.013699173927307129, + "rewards/rejected": 0.8300870656967163, + "step": 429 + }, + { + "epoch": 1.52, + "learning_rate": 5.143127134380926e-08, + "logits/chosen": -1.9730662107467651, + "logits/rejected": -1.9754053354263306, + "logps/chosen": -2.8930611610412598, + "logps/rejected": -9.436213493347168, + "loss": 0.6548, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8201236724853516, + "rewards/margins": 0.00029712915420532227, + "rewards/rejected": 0.8198264837265015, + "step": 430 + }, + { + "epoch": 1.52, + "learning_rate": 5.124047734196548e-08, + "logits/chosen": -2.015064001083374, + "logits/rejected": -2.016348123550415, + "logps/chosen": -2.645974636077881, + "logps/rejected": -2.7916224002838135, + "loss": 0.6045, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9697690010070801, + "rewards/margins": 0.3955572247505188, + "rewards/rejected": 0.5742117762565613, + "step": 431 + }, + { + "epoch": 1.53, + "learning_rate": 5.104966526470227e-08, + "logits/chosen": -1.9818365573883057, + "logits/rejected": -1.9827139377593994, + "logps/chosen": -0.8331803679466248, + "logps/rejected": -3.9609270095825195, + "loss": 0.6191, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7473673820495605, + "rewards/margins": 0.15388965606689453, + "rewards/rejected": 0.593477725982666, + "step": 432 + }, + { + "epoch": 1.53, + "learning_rate": 5.085883789240764e-08, + "logits/chosen": -2.0352296829223633, + "logits/rejected": -2.044806718826294, + "logps/chosen": -2.058864116668701, + "logps/rejected": -17.420848846435547, + "loss": 0.6992, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8977015614509583, + "rewards/margins": -0.30114424228668213, + "rewards/rejected": 1.1988458633422852, + "step": 433 + }, + { + "epoch": 1.53, + "learning_rate": 5.066799800569247e-08, + "logits/chosen": -2.0370700359344482, + "logits/rejected": -2.041255235671997, + "logps/chosen": -3.2147138118743896, + "logps/rejected": -2.2685577869415283, + "loss": 0.6244, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7562023401260376, + "rewards/margins": 0.17035217583179474, + "rewards/rejected": 0.5858501195907593, + "step": 434 + }, + { + "epoch": 1.54, + "learning_rate": 5.047714838534998e-08, + "logits/chosen": -1.9550491571426392, + "logits/rejected": -1.9531859159469604, + "logps/chosen": -6.517072677612305, + "logps/rejected": -2.320681571960449, + "loss": 0.5445, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.9716500043869019, + "rewards/margins": 0.12404924631118774, + "rewards/rejected": 0.8476006984710693, + "step": 435 + }, + { + "epoch": 1.54, + "learning_rate": 5.028629181231525e-08, + "logits/chosen": -1.9821966886520386, + "logits/rejected": -1.988022804260254, + "logps/chosen": -0.8923206329345703, + "logps/rejected": -9.307008743286133, + "loss": 0.704, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.929919421672821, + "rewards/margins": -0.08639904856681824, + "rewards/rejected": 1.016318440437317, + "step": 436 + }, + { + "epoch": 1.54, + "learning_rate": 5.009543106762465e-08, + "logits/chosen": -2.0000181198120117, + "logits/rejected": -2.000490665435791, + "logps/chosen": -0.7049423456192017, + "logps/rejected": -3.4679408073425293, + "loss": 0.6814, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.6806238889694214, + "rewards/margins": -0.12843185663223267, + "rewards/rejected": 0.8090558052062988, + "step": 437 + }, + { + "epoch": 1.55, + "learning_rate": 4.990456893237533e-08, + "logits/chosen": -1.936920166015625, + "logits/rejected": -1.9403204917907715, + "logps/chosen": -0.6340222954750061, + "logps/rejected": -9.688849449157715, + "loss": 0.7773, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6009465456008911, + "rewards/margins": -0.4210093021392822, + "rewards/rejected": 1.0219558477401733, + "step": 438 + }, + { + "epoch": 1.55, + "learning_rate": 4.9713708187684744e-08, + "logits/chosen": -2.023263692855835, + "logits/rejected": -2.0156285762786865, + "logps/chosen": -4.848706245422363, + "logps/rejected": -10.089259147644043, + "loss": 0.6599, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.830024003982544, + "rewards/margins": -0.20596396923065186, + "rewards/rejected": 1.0359879732131958, + "step": 439 + }, + { + "epoch": 1.55, + "learning_rate": 4.952285161465002e-08, + "logits/chosen": -2.0301599502563477, + "logits/rejected": -2.0271475315093994, + "logps/chosen": -1.361358642578125, + "logps/rejected": -2.5926456451416016, + "loss": 0.6737, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7401031255722046, + "rewards/margins": -0.0597740113735199, + "rewards/rejected": 0.7998771667480469, + "step": 440 + }, + { + "epoch": 1.56, + "learning_rate": 4.933200199430754e-08, + "logits/chosen": -2.013002395629883, + "logits/rejected": -2.0146191120147705, + "logps/chosen": -9.408967971801758, + "logps/rejected": -2.471646308898926, + "loss": 0.5181, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1133514642715454, + "rewards/margins": 0.36026570200920105, + "rewards/rejected": 0.753085732460022, + "step": 441 + }, + { + "epoch": 1.56, + "learning_rate": 4.914116210759237e-08, + "logits/chosen": -2.0196428298950195, + "logits/rejected": -2.036376714706421, + "logps/chosen": -0.9020828604698181, + "logps/rejected": -14.798543930053711, + "loss": 0.6855, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7865010499954224, + "rewards/margins": -0.19338449835777283, + "rewards/rejected": 0.9798855781555176, + "step": 442 + }, + { + "epoch": 1.57, + "learning_rate": 4.895033473529774e-08, + "logits/chosen": -1.9841495752334595, + "logits/rejected": -2.0003795623779297, + "logps/chosen": -0.8353258371353149, + "logps/rejected": -11.816282272338867, + "loss": 0.6408, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8273388743400574, + "rewards/margins": -0.14617127180099487, + "rewards/rejected": 0.9735101461410522, + "step": 443 + }, + { + "epoch": 1.57, + "learning_rate": 4.875952265803451e-08, + "logits/chosen": -2.052184581756592, + "logits/rejected": -2.0680553913116455, + "logps/chosen": -2.2458372116088867, + "logps/rejected": -11.521520614624023, + "loss": 0.6401, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.752544641494751, + "rewards/margins": -0.15436068177223206, + "rewards/rejected": 0.9069052934646606, + "step": 444 + }, + { + "epoch": 1.57, + "learning_rate": 4.8568728656190736e-08, + "logits/chosen": -2.072646379470825, + "logits/rejected": -2.075331926345825, + "logps/chosen": -0.7762209177017212, + "logps/rejected": -8.860960006713867, + "loss": 0.832, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7517163753509521, + "rewards/margins": -0.332887202501297, + "rewards/rejected": 1.0846035480499268, + "step": 445 + }, + { + "epoch": 1.58, + "learning_rate": 4.837795550989101e-08, + "logits/chosen": -1.9979747533798218, + "logits/rejected": -1.9973293542861938, + "logps/chosen": -3.5670645236968994, + "logps/rejected": -7.856192588806152, + "loss": 0.7038, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8239714503288269, + "rewards/margins": -0.10715687274932861, + "rewards/rejected": 0.9311283230781555, + "step": 446 + }, + { + "epoch": 1.58, + "learning_rate": 4.818720599895607e-08, + "logits/chosen": -1.9798504114151, + "logits/rejected": -1.992478609085083, + "logps/chosen": -0.9039241671562195, + "logps/rejected": -7.616443634033203, + "loss": 0.6081, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8331699371337891, + "rewards/margins": 0.03766685724258423, + "rewards/rejected": 0.7955030202865601, + "step": 447 + }, + { + "epoch": 1.58, + "learning_rate": 4.7996482902862275e-08, + "logits/chosen": -1.9565924406051636, + "logits/rejected": -1.9673333168029785, + "logps/chosen": -3.6396632194519043, + "logps/rejected": -5.412489414215088, + "loss": 0.5919, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9027977585792542, + "rewards/margins": 0.4046878218650818, + "rewards/rejected": 0.49810993671417236, + "step": 448 + }, + { + "epoch": 1.59, + "learning_rate": 4.780578900070103e-08, + "logits/chosen": -1.9612255096435547, + "logits/rejected": -1.9598028659820557, + "logps/chosen": -2.312610626220703, + "logps/rejected": -2.862576723098755, + "loss": 0.7922, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8833867907524109, + "rewards/margins": 0.05541110038757324, + "rewards/rejected": 0.8279756903648376, + "step": 449 + }, + { + "epoch": 1.59, + "learning_rate": 4.76151270711384e-08, + "logits/chosen": -2.0879406929016113, + "logits/rejected": -2.0922091007232666, + "logps/chosen": -0.5477692484855652, + "logps/rejected": -3.1250803470611572, + "loss": 0.6643, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8233094215393066, + "rewards/margins": 0.16630405187606812, + "rewards/rejected": 0.6570053696632385, + "step": 450 + }, + { + "epoch": 1.59, + "learning_rate": 4.74244998923745e-08, + "logits/chosen": -1.9715300798416138, + "logits/rejected": -1.976974368095398, + "logps/chosen": -4.4775309562683105, + "logps/rejected": -2.253206729888916, + "loss": 0.621, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9071865081787109, + "rewards/margins": 0.29171767830848694, + "rewards/rejected": 0.6154688596725464, + "step": 451 + }, + { + "epoch": 1.6, + "learning_rate": 4.7233910242103175e-08, + "logits/chosen": -2.0148918628692627, + "logits/rejected": -2.0155692100524902, + "logps/chosen": -6.842096328735352, + "logps/rejected": -8.13604736328125, + "loss": 0.5394, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.756463348865509, + "rewards/margins": -0.05228722095489502, + "rewards/rejected": 0.808750569820404, + "step": 452 + }, + { + "epoch": 1.6, + "learning_rate": 4.704336089747134e-08, + "logits/chosen": -2.0609352588653564, + "logits/rejected": -2.061661720275879, + "logps/chosen": -1.3462457656860352, + "logps/rejected": -2.9615039825439453, + "loss": 0.623, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.9031734466552734, + "rewards/margins": 0.15471035242080688, + "rewards/rejected": 0.7484631538391113, + "step": 453 + }, + { + "epoch": 1.6, + "learning_rate": 4.685285463503866e-08, + "logits/chosen": -2.0002598762512207, + "logits/rejected": -1.9979758262634277, + "logps/chosen": -6.9981160163879395, + "logps/rejected": -9.364669799804688, + "loss": 0.6808, + "rewards/accuracies": 0.5, + "rewards/chosen": 1.0693129301071167, + "rewards/margins": 0.23182234168052673, + "rewards/rejected": 0.8374905586242676, + "step": 454 + }, + { + "epoch": 1.61, + "learning_rate": 4.6662394230737014e-08, + "logits/chosen": -2.0437369346618652, + "logits/rejected": -2.1339731216430664, + "logps/chosen": -11.512495994567871, + "logps/rejected": -15.765096664428711, + "loss": 0.5191, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2696609497070312, + "rewards/margins": 1.003662109375, + "rewards/rejected": 0.2659987509250641, + "step": 455 + }, + { + "epoch": 1.61, + "learning_rate": 4.647198245983004e-08, + "logits/chosen": -2.0326521396636963, + "logits/rejected": -2.046682596206665, + "logps/chosen": -4.012043476104736, + "logps/rejected": -9.770425796508789, + "loss": 0.5948, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.9339296221733093, + "rewards/margins": 0.1405608057975769, + "rewards/rejected": 0.7933688163757324, + "step": 456 + }, + { + "epoch": 1.61, + "learning_rate": 4.628162209687275e-08, + "logits/chosen": -2.0227279663085938, + "logits/rejected": -2.020524024963379, + "logps/chosen": -1.4769536256790161, + "logps/rejected": -6.174238204956055, + "loss": 0.7701, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7399169206619263, + "rewards/margins": 0.05815891921520233, + "rewards/rejected": 0.6817580461502075, + "step": 457 + }, + { + "epoch": 1.62, + "learning_rate": 4.60913159156711e-08, + "logits/chosen": -1.9898021221160889, + "logits/rejected": -1.985906720161438, + "logps/chosen": -2.084059238433838, + "logps/rejected": -4.058363914489746, + "loss": 0.6931, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9412271976470947, + "rewards/margins": 0.343174546957016, + "rewards/rejected": 0.5980526208877563, + "step": 458 + }, + { + "epoch": 1.62, + "learning_rate": 4.5901066689241505e-08, + "logits/chosen": -2.053060531616211, + "logits/rejected": -2.0489325523376465, + "logps/chosen": -0.6857567429542542, + "logps/rejected": -3.3915023803710938, + "loss": 0.6466, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8042589426040649, + "rewards/margins": -0.07596099376678467, + "rewards/rejected": 0.8802199363708496, + "step": 459 + }, + { + "epoch": 1.63, + "learning_rate": 4.571087718977047e-08, + "logits/chosen": -2.054792881011963, + "logits/rejected": -2.075812339782715, + "logps/chosen": -1.086000919342041, + "logps/rejected": -16.213397979736328, + "loss": 0.643, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8182079195976257, + "rewards/margins": -0.16431251168251038, + "rewards/rejected": 0.9825204610824585, + "step": 460 + }, + { + "epoch": 1.63, + "learning_rate": 4.5520750188574225e-08, + "logits/chosen": -2.0659587383270264, + "logits/rejected": -2.063582420349121, + "logps/chosen": -8.869221687316895, + "logps/rejected": -3.3585410118103027, + "loss": 0.6815, + "rewards/accuracies": 0.5, + "rewards/chosen": 1.1529308557510376, + "rewards/margins": 0.5440673828125, + "rewards/rejected": 0.6088634729385376, + "step": 461 + }, + { + "epoch": 1.63, + "learning_rate": 4.5330688456058305e-08, + "logits/chosen": -2.048858165740967, + "logits/rejected": -2.047473669052124, + "logps/chosen": -1.7878072261810303, + "logps/rejected": -3.9856016635894775, + "loss": 0.6497, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7302944660186768, + "rewards/margins": -0.05092984437942505, + "rewards/rejected": 0.7812243700027466, + "step": 462 + }, + { + "epoch": 1.64, + "learning_rate": 4.5140694761677155e-08, + "logits/chosen": -2.007678270339966, + "logits/rejected": -2.0115573406219482, + "logps/chosen": -8.21692943572998, + "logps/rejected": -2.6952974796295166, + "loss": 0.6427, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1048425436019897, + "rewards/margins": 0.5785397887229919, + "rewards/rejected": 0.5263027548789978, + "step": 463 + }, + { + "epoch": 1.64, + "learning_rate": 4.49507718738939e-08, + "logits/chosen": -2.0463922023773193, + "logits/rejected": -2.0553202629089355, + "logps/chosen": -1.9085391759872437, + "logps/rejected": -10.208600044250488, + "loss": 0.8005, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7721104621887207, + "rewards/margins": -0.3321719169616699, + "rewards/rejected": 1.1042823791503906, + "step": 464 + }, + { + "epoch": 1.64, + "learning_rate": 4.4760922560139845e-08, + "logits/chosen": -2.0923922061920166, + "logits/rejected": -2.092442750930786, + "logps/chosen": -0.6040335893630981, + "logps/rejected": -5.359764099121094, + "loss": 0.6926, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.6976540088653564, + "rewards/margins": 0.145774245262146, + "rewards/rejected": 0.5518797636032104, + "step": 465 + }, + { + "epoch": 1.65, + "learning_rate": 4.457114958677423e-08, + "logits/chosen": -2.0299232006073, + "logits/rejected": -2.0301270484924316, + "logps/chosen": -5.597898483276367, + "logps/rejected": -3.0043857097625732, + "loss": 0.6011, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9934437274932861, + "rewards/margins": 0.5337262153625488, + "rewards/rejected": 0.45971745252609253, + "step": 466 + }, + { + "epoch": 1.65, + "learning_rate": 4.4381455719043954e-08, + "logits/chosen": -2.0186383724212646, + "logits/rejected": -2.024634838104248, + "logps/chosen": -8.338130950927734, + "logps/rejected": -5.6706767082214355, + "loss": 0.615, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0751557350158691, + "rewards/margins": 0.36132076382637024, + "rewards/rejected": 0.7138350009918213, + "step": 467 + }, + { + "epoch": 1.65, + "learning_rate": 4.41918437210432e-08, + "logits/chosen": -2.051787853240967, + "logits/rejected": -2.060228109359741, + "logps/chosen": -0.8608881235122681, + "logps/rejected": -8.313345909118652, + "loss": 0.6212, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8119526505470276, + "rewards/margins": -0.01610150933265686, + "rewards/rejected": 0.8280541896820068, + "step": 468 + }, + { + "epoch": 1.66, + "learning_rate": 4.400231635567319e-08, + "logits/chosen": -2.0768144130706787, + "logits/rejected": -2.084219455718994, + "logps/chosen": -0.6845318675041199, + "logps/rejected": -8.429481506347656, + "loss": 0.6952, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.754654049873352, + "rewards/margins": -0.23601192235946655, + "rewards/rejected": 0.9906659722328186, + "step": 469 + }, + { + "epoch": 1.66, + "learning_rate": 4.381287638460201e-08, + "logits/chosen": -2.086693525314331, + "logits/rejected": -2.0859792232513428, + "logps/chosen": -8.36959171295166, + "logps/rejected": -2.3661251068115234, + "loss": 0.6029, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2581064701080322, + "rewards/margins": 0.7302519083023071, + "rewards/rejected": 0.5278546214103699, + "step": 470 + }, + { + "epoch": 1.66, + "learning_rate": 4.362352656822421e-08, + "logits/chosen": -2.041407585144043, + "logits/rejected": -2.0576438903808594, + "logps/chosen": -1.0106086730957031, + "logps/rejected": -16.136240005493164, + "loss": 0.6882, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8415718674659729, + "rewards/margins": -0.3802269697189331, + "rewards/rejected": 1.2217988967895508, + "step": 471 + }, + { + "epoch": 1.67, + "learning_rate": 4.343426966562069e-08, + "logits/chosen": -2.0456764698028564, + "logits/rejected": -2.0514209270477295, + "logps/chosen": -9.399102210998535, + "logps/rejected": -9.333192825317383, + "loss": 0.5466, + "rewards/accuracies": 0.5, + "rewards/chosen": 1.2753163576126099, + "rewards/margins": 0.2922922670841217, + "rewards/rejected": 0.9830241203308105, + "step": 472 + }, + { + "epoch": 1.67, + "learning_rate": 4.32451084345185e-08, + "logits/chosen": -2.0198991298675537, + "logits/rejected": -2.0148534774780273, + "logps/chosen": -0.6527957916259766, + "logps/rejected": -6.180942535400391, + "loss": 0.9219, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7595587968826294, + "rewards/margins": -0.10266584157943726, + "rewards/rejected": 0.8622246980667114, + "step": 473 + }, + { + "epoch": 1.67, + "learning_rate": 4.3056045631250605e-08, + "logits/chosen": -2.039966583251953, + "logits/rejected": -2.0444209575653076, + "logps/chosen": -3.001587390899658, + "logps/rejected": -2.849776029586792, + "loss": 0.5996, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9056953191757202, + "rewards/margins": 0.2817304730415344, + "rewards/rejected": 0.6239649057388306, + "step": 474 + }, + { + "epoch": 1.68, + "learning_rate": 4.286708401071573e-08, + "logits/chosen": -2.002443313598633, + "logits/rejected": -2.0561230182647705, + "logps/chosen": -6.7335686683654785, + "logps/rejected": -6.599236488342285, + "loss": 0.7393, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.6979931592941284, + "rewards/margins": -0.0007208883762359619, + "rewards/rejected": 0.6987140774726868, + "step": 475 + }, + { + "epoch": 1.68, + "learning_rate": 4.267822632633824e-08, + "logits/chosen": -2.0006558895111084, + "logits/rejected": -2.003046989440918, + "logps/chosen": -1.304510235786438, + "logps/rejected": -2.0712037086486816, + "loss": 0.6225, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.903831958770752, + "rewards/margins": 0.3627339005470276, + "rewards/rejected": 0.5410981178283691, + "step": 476 + }, + { + "epoch": 1.69, + "learning_rate": 4.248947533002805e-08, + "logits/chosen": -2.054265022277832, + "logits/rejected": -2.063375234603882, + "logps/chosen": -0.5634168386459351, + "logps/rejected": -5.1910834312438965, + "loss": 0.5501, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.749413013458252, + "rewards/margins": 0.1646736115217209, + "rewards/rejected": 0.5847393870353699, + "step": 477 + }, + { + "epoch": 1.69, + "learning_rate": 4.230083377214043e-08, + "logits/chosen": -1.9303072690963745, + "logits/rejected": -1.9356145858764648, + "logps/chosen": -1.6965779066085815, + "logps/rejected": -9.22527027130127, + "loss": 0.7439, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.6864035725593567, + "rewards/margins": -0.26886942982673645, + "rewards/rejected": 0.9552730321884155, + "step": 478 + }, + { + "epoch": 1.69, + "learning_rate": 4.2112304401436036e-08, + "logits/chosen": -2.072603702545166, + "logits/rejected": -2.07981538772583, + "logps/chosen": -0.8221795558929443, + "logps/rejected": -7.838292121887207, + "loss": 0.7187, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8157384395599365, + "rewards/margins": 0.11669686436653137, + "rewards/rejected": 0.6990416049957275, + "step": 479 + }, + { + "epoch": 1.7, + "learning_rate": 4.192388996504076e-08, + "logits/chosen": -1.9767906665802002, + "logits/rejected": -1.969706416130066, + "logps/chosen": -1.6703375577926636, + "logps/rejected": -3.012401580810547, + "loss": 0.7401, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7609949707984924, + "rewards/margins": -0.2188262641429901, + "rewards/rejected": 0.9798212051391602, + "step": 480 + }, + { + "epoch": 1.7, + "learning_rate": 4.173559320840578e-08, + "logits/chosen": -2.0083200931549072, + "logits/rejected": -1.9957138299942017, + "logps/chosen": -10.496040344238281, + "logps/rejected": -6.935914516448975, + "loss": 0.6279, + "rewards/accuracies": 0.5, + "rewards/chosen": 1.0504958629608154, + "rewards/margins": 0.001768559217453003, + "rewards/rejected": 1.04872727394104, + "step": 481 + }, + { + "epoch": 1.7, + "learning_rate": 4.154741687526748e-08, + "logits/chosen": -2.0347139835357666, + "logits/rejected": -2.0559041500091553, + "logps/chosen": -0.6643368005752563, + "logps/rejected": -3.949586868286133, + "loss": 0.769, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7920306921005249, + "rewards/margins": 0.0746644139289856, + "rewards/rejected": 0.7173662781715393, + "step": 482 + }, + { + "epoch": 1.71, + "learning_rate": 4.1359363707607585e-08, + "logits/chosen": -1.986863136291504, + "logits/rejected": -1.9893666505813599, + "logps/chosen": -0.6317017078399658, + "logps/rejected": -7.429633140563965, + "loss": 0.7047, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8096635341644287, + "rewards/margins": -0.1896054446697235, + "rewards/rejected": 0.9992690086364746, + "step": 483 + }, + { + "epoch": 1.71, + "learning_rate": 4.1171436445613054e-08, + "logits/chosen": -2.037929058074951, + "logits/rejected": -2.0921502113342285, + "logps/chosen": -0.4591904580593109, + "logps/rejected": -19.224979400634766, + "loss": 0.662, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6763129830360413, + "rewards/margins": -0.30418071150779724, + "rewards/rejected": 0.9804936647415161, + "step": 484 + }, + { + "epoch": 1.71, + "learning_rate": 4.09836378276363e-08, + "logits/chosen": -2.0375325679779053, + "logits/rejected": -2.0478827953338623, + "logps/chosen": -4.556056976318359, + "logps/rejected": -1.9402828216552734, + "loss": 0.5776, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8362867832183838, + "rewards/margins": 0.12543675303459167, + "rewards/rejected": 0.7108500003814697, + "step": 485 + }, + { + "epoch": 1.72, + "learning_rate": 4.079597059015518e-08, + "logits/chosen": -2.0760178565979004, + "logits/rejected": -2.0782957077026367, + "logps/chosen": -3.543773889541626, + "logps/rejected": -9.34070873260498, + "loss": 0.7833, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7817620635032654, + "rewards/margins": -0.27538028359413147, + "rewards/rejected": 1.0571423768997192, + "step": 486 + }, + { + "epoch": 1.72, + "learning_rate": 4.060843746773315e-08, + "logits/chosen": -2.001426935195923, + "logits/rejected": -2.0067906379699707, + "logps/chosen": -1.794614553451538, + "logps/rejected": -2.379093647003174, + "loss": 0.6555, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7868061065673828, + "rewards/margins": 0.1361326277256012, + "rewards/rejected": 0.650673508644104, + "step": 487 + }, + { + "epoch": 1.72, + "learning_rate": 4.0421041192979435e-08, + "logits/chosen": -2.019761800765991, + "logits/rejected": -2.0251786708831787, + "logps/chosen": -1.6045018434524536, + "logps/rejected": -7.3810625076293945, + "loss": 0.845, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7470712065696716, + "rewards/margins": -0.08484354615211487, + "rewards/rejected": 0.8319147825241089, + "step": 488 + }, + { + "epoch": 1.73, + "learning_rate": 4.023378449650928e-08, + "logits/chosen": -1.9491389989852905, + "logits/rejected": -1.9470124244689941, + "logps/chosen": -4.571204662322998, + "logps/rejected": -3.024712324142456, + "loss": 0.5591, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7426378726959229, + "rewards/margins": 0.22939716279506683, + "rewards/rejected": 0.5132407546043396, + "step": 489 + }, + { + "epoch": 1.73, + "learning_rate": 4.004667010690398e-08, + "logits/chosen": -1.9756423234939575, + "logits/rejected": -1.9793860912322998, + "logps/chosen": -2.1325855255126953, + "logps/rejected": -2.193197727203369, + "loss": 0.6643, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8988503813743591, + "rewards/margins": 0.28943929076194763, + "rewards/rejected": 0.6094111204147339, + "step": 490 + }, + { + "epoch": 1.73, + "learning_rate": 3.9859700750671274e-08, + "logits/chosen": -1.9856154918670654, + "logits/rejected": -2.0080044269561768, + "logps/chosen": -3.8758726119995117, + "logps/rejected": -12.638447761535645, + "loss": 0.7811, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7840211391448975, + "rewards/margins": 0.0021733641624450684, + "rewards/rejected": 0.7818478345870972, + "step": 491 + }, + { + "epoch": 1.74, + "learning_rate": 3.96728791522056e-08, + "logits/chosen": -1.9935662746429443, + "logits/rejected": -2.000058174133301, + "logps/chosen": -2.613300323486328, + "logps/rejected": -2.3204116821289062, + "loss": 0.8063, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7004095315933228, + "rewards/margins": -0.057782307267189026, + "rewards/rejected": 0.7581918239593506, + "step": 492 + }, + { + "epoch": 1.74, + "learning_rate": 3.948620803374831e-08, + "logits/chosen": -2.084075689315796, + "logits/rejected": -2.083505153656006, + "logps/chosen": -1.76994788646698, + "logps/rejected": -5.059174060821533, + "loss": 0.6615, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8095270991325378, + "rewards/margins": 0.07501327991485596, + "rewards/rejected": 0.7345138192176819, + "step": 493 + }, + { + "epoch": 1.75, + "learning_rate": 3.92996901153481e-08, + "logits/chosen": -1.996493935585022, + "logits/rejected": -2.001600503921509, + "logps/chosen": -1.6378049850463867, + "logps/rejected": -2.1092560291290283, + "loss": 0.729, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8837807774543762, + "rewards/margins": 0.21761158108711243, + "rewards/rejected": 0.6661691665649414, + "step": 494 + }, + { + "epoch": 1.75, + "learning_rate": 3.91133281148213e-08, + "logits/chosen": -2.040820837020874, + "logits/rejected": -2.0384421348571777, + "logps/chosen": -7.452361583709717, + "logps/rejected": -2.569246530532837, + "loss": 0.7881, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2293729782104492, + "rewards/margins": 0.5708858966827393, + "rewards/rejected": 0.6584872007369995, + "step": 495 + }, + { + "epoch": 1.75, + "learning_rate": 3.892712474771237e-08, + "logits/chosen": -2.07600736618042, + "logits/rejected": -2.074216604232788, + "logps/chosen": -2.3341474533081055, + "logps/rejected": -4.025932312011719, + "loss": 0.6468, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7433847188949585, + "rewards/margins": -0.03562924265861511, + "rewards/rejected": 0.779013991355896, + "step": 496 + }, + { + "epoch": 1.76, + "learning_rate": 3.874108272725421e-08, + "logits/chosen": -2.0299601554870605, + "logits/rejected": -2.0322697162628174, + "logps/chosen": -2.4806954860687256, + "logps/rejected": -2.610170841217041, + "loss": 0.613, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9429163932800293, + "rewards/margins": 0.39760851860046387, + "rewards/rejected": 0.5453078746795654, + "step": 497 + }, + { + "epoch": 1.76, + "learning_rate": 3.8555204764328706e-08, + "logits/chosen": -1.983758807182312, + "logits/rejected": -1.9873000383377075, + "logps/chosen": -1.6030898094177246, + "logps/rejected": -3.7345166206359863, + "loss": 0.7423, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8593732714653015, + "rewards/margins": 0.25029411911964417, + "rewards/rejected": 0.609079122543335, + "step": 498 + }, + { + "epoch": 1.76, + "learning_rate": 3.8369493567427205e-08, + "logits/chosen": -2.072202444076538, + "logits/rejected": -2.073483943939209, + "logps/chosen": -3.333197832107544, + "logps/rejected": -4.986106872558594, + "loss": 0.633, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8591408729553223, + "rewards/margins": 0.24768255650997162, + "rewards/rejected": 0.6114583015441895, + "step": 499 + }, + { + "epoch": 1.77, + "learning_rate": 3.818395184261103e-08, + "logits/chosen": -2.0057013034820557, + "logits/rejected": -2.0060644149780273, + "logps/chosen": -1.603473424911499, + "logps/rejected": -1.798490285873413, + "loss": 0.5964, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7486532330513, + "rewards/margins": 0.024535715579986572, + "rewards/rejected": 0.7241175174713135, + "step": 500 + }, + { + "epoch": 1.77, + "learning_rate": 3.799858229347208e-08, + "logits/chosen": -1.9203071594238281, + "logits/rejected": -1.9171274900436401, + "logps/chosen": -2.070312023162842, + "logps/rejected": -5.368598937988281, + "loss": 0.6227, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7085436582565308, + "rewards/margins": -0.05357953906059265, + "rewards/rejected": 0.762123167514801, + "step": 501 + }, + { + "epoch": 1.77, + "learning_rate": 3.781338762109347e-08, + "logits/chosen": -1.9681785106658936, + "logits/rejected": -1.967158555984497, + "logps/chosen": -0.6621506214141846, + "logps/rejected": -3.693568706512451, + "loss": 0.6879, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8163309693336487, + "rewards/margins": 0.048964500427246094, + "rewards/rejected": 0.7673664689064026, + "step": 502 + }, + { + "epoch": 1.78, + "learning_rate": 3.7628370524010034e-08, + "logits/chosen": -2.1190922260284424, + "logits/rejected": -2.1212058067321777, + "logps/chosen": -1.9981067180633545, + "logps/rejected": -1.8387198448181152, + "loss": 0.5142, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8978937864303589, + "rewards/margins": 0.3369745910167694, + "rewards/rejected": 0.5609192252159119, + "step": 503 + }, + { + "epoch": 1.78, + "learning_rate": 3.7443533698169184e-08, + "logits/chosen": -2.0141243934631348, + "logits/rejected": -2.0089917182922363, + "logps/chosen": -3.725149154663086, + "logps/rejected": -9.490835189819336, + "loss": 0.5997, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.695847749710083, + "rewards/margins": -0.39483901858329773, + "rewards/rejected": 1.0906867980957031, + "step": 504 + }, + { + "epoch": 1.78, + "learning_rate": 3.7258879836891476e-08, + "logits/chosen": -1.999445915222168, + "logits/rejected": -2.025217056274414, + "logps/chosen": -2.307692766189575, + "logps/rejected": -20.266942977905273, + "loss": 0.7677, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8078789710998535, + "rewards/margins": -0.4754992723464966, + "rewards/rejected": 1.28337824344635, + "step": 505 + }, + { + "epoch": 1.79, + "learning_rate": 3.707441163083146e-08, + "logits/chosen": -1.9775477647781372, + "logits/rejected": -1.9889403581619263, + "logps/chosen": -1.7356458902359009, + "logps/rejected": -14.256538391113281, + "loss": 0.803, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8422894477844238, + "rewards/margins": -0.44874411821365356, + "rewards/rejected": 1.2910335063934326, + "step": 506 + }, + { + "epoch": 1.79, + "learning_rate": 3.68901317679384e-08, + "logits/chosen": -2.009758949279785, + "logits/rejected": -2.0181849002838135, + "logps/chosen": -1.0672467947006226, + "logps/rejected": -11.95969295501709, + "loss": 0.6206, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9135645031929016, + "rewards/margins": -0.2214115560054779, + "rewards/rejected": 1.1349760293960571, + "step": 507 + }, + { + "epoch": 1.8, + "learning_rate": 3.670604293341722e-08, + "logits/chosen": -2.130502939224243, + "logits/rejected": -2.13706636428833, + "logps/chosen": -3.631016254425049, + "logps/rejected": -1.730623722076416, + "loss": 0.4755, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1320672035217285, + "rewards/margins": 0.5625433921813965, + "rewards/rejected": 0.569523811340332, + "step": 508 + }, + { + "epoch": 1.8, + "learning_rate": 3.6522147809689255e-08, + "logits/chosen": -2.0531868934631348, + "logits/rejected": -2.060654640197754, + "logps/chosen": -0.6351094245910645, + "logps/rejected": -10.517657279968262, + "loss": 0.7645, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7765212655067444, + "rewards/margins": -0.0928502082824707, + "rewards/rejected": 0.8693714737892151, + "step": 509 + }, + { + "epoch": 1.8, + "learning_rate": 3.63384490763532e-08, + "logits/chosen": -2.0336825847625732, + "logits/rejected": -2.107422113418579, + "logps/chosen": -1.4717893600463867, + "logps/rejected": -25.92159652709961, + "loss": 0.5529, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8652842044830322, + "rewards/margins": -0.32480451464653015, + "rewards/rejected": 1.1900887489318848, + "step": 510 + }, + { + "epoch": 1.81, + "learning_rate": 3.615494941014613e-08, + "logits/chosen": -2.0883898735046387, + "logits/rejected": -2.085893392562866, + "logps/chosen": -0.7363142967224121, + "logps/rejected": -3.9899415969848633, + "loss": 0.6181, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8501417636871338, + "rewards/margins": 0.2475891411304474, + "rewards/rejected": 0.6025526523590088, + "step": 511 + }, + { + "epoch": 1.81, + "learning_rate": 3.597165148490438e-08, + "logits/chosen": -1.9502906799316406, + "logits/rejected": -1.9647570848464966, + "logps/chosen": -2.5164055824279785, + "logps/rejected": -11.542009353637695, + "loss": 0.7136, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7511206865310669, + "rewards/margins": -0.10036167502403259, + "rewards/rejected": 0.8514823913574219, + "step": 512 + }, + { + "epoch": 1.81, + "learning_rate": 3.578855797152469e-08, + "logits/chosen": -2.03668475151062, + "logits/rejected": -2.138859510421753, + "logps/chosen": -0.6691325902938843, + "logps/rejected": -32.09943389892578, + "loss": 0.6657, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8072614073753357, + "rewards/margins": -0.22747382521629333, + "rewards/rejected": 1.0347352027893066, + "step": 513 + }, + { + "epoch": 1.82, + "learning_rate": 3.560567153792526e-08, + "logits/chosen": -2.031186580657959, + "logits/rejected": -2.032327651977539, + "logps/chosen": -2.460505962371826, + "logps/rejected": -2.8570194244384766, + "loss": 0.7353, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9340801239013672, + "rewards/margins": 0.2317964732646942, + "rewards/rejected": 0.7022836208343506, + "step": 514 + }, + { + "epoch": 1.82, + "learning_rate": 3.54229948490068e-08, + "logits/chosen": -2.061553716659546, + "logits/rejected": -2.0630857944488525, + "logps/chosen": -2.5311193466186523, + "logps/rejected": -7.896140098571777, + "loss": 0.6855, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7442220449447632, + "rewards/margins": -0.2678731381893158, + "rewards/rejected": 1.0120952129364014, + "step": 515 + }, + { + "epoch": 1.82, + "learning_rate": 3.524053056661385e-08, + "logits/chosen": -1.9778205156326294, + "logits/rejected": -1.9794037342071533, + "logps/chosen": -1.3888278007507324, + "logps/rejected": -9.42696762084961, + "loss": 0.9411, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6588500142097473, + "rewards/margins": -0.48537981510162354, + "rewards/rejected": 1.1442298889160156, + "step": 516 + }, + { + "epoch": 1.83, + "learning_rate": 3.50582813494958e-08, + "logits/chosen": -1.9293794631958008, + "logits/rejected": -1.9241493940353394, + "logps/chosen": -8.044556617736816, + "logps/rejected": -2.5994129180908203, + "loss": 0.5245, + "rewards/accuracies": 0.5, + "rewards/chosen": 1.0535550117492676, + "rewards/margins": 0.4892274737358093, + "rewards/rejected": 0.5643274784088135, + "step": 517 + }, + { + "epoch": 1.83, + "learning_rate": 3.4876249853268325e-08, + "logits/chosen": -2.047361135482788, + "logits/rejected": -2.058753252029419, + "logps/chosen": -2.0414772033691406, + "logps/rejected": -16.0045108795166, + "loss": 0.7314, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8607810139656067, + "rewards/margins": -0.44900280237197876, + "rewards/rejected": 1.3097838163375854, + "step": 518 + }, + { + "epoch": 1.83, + "learning_rate": 3.469443873037457e-08, + "logits/chosen": -2.0416767597198486, + "logits/rejected": -2.045185089111328, + "logps/chosen": -1.4271190166473389, + "logps/rejected": -8.3588228225708, + "loss": 0.7809, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.871781051158905, + "rewards/margins": -0.0323236882686615, + "rewards/rejected": 0.9041047096252441, + "step": 519 + }, + { + "epoch": 1.84, + "learning_rate": 3.451285063004654e-08, + "logits/chosen": -2.0094738006591797, + "logits/rejected": -2.006446599960327, + "logps/chosen": -7.68764591217041, + "logps/rejected": -1.2649552822113037, + "loss": 0.6896, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9880450963973999, + "rewards/margins": 0.24014928936958313, + "rewards/rejected": 0.7478958368301392, + "step": 520 + }, + { + "epoch": 1.84, + "learning_rate": 3.433148819826657e-08, + "logits/chosen": -2.048924684524536, + "logits/rejected": -2.051501989364624, + "logps/chosen": -2.228991746902466, + "logps/rejected": -1.4790059328079224, + "loss": 0.6389, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8049038648605347, + "rewards/margins": 0.0028390884399414062, + "rewards/rejected": 0.8020647764205933, + "step": 521 + }, + { + "epoch": 1.84, + "learning_rate": 3.415035407772865e-08, + "logits/chosen": -1.9551259279251099, + "logits/rejected": -1.9479987621307373, + "logps/chosen": -1.5105535984039307, + "logps/rejected": -3.9278318881988525, + "loss": 0.7088, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9129421710968018, + "rewards/margins": 0.41707128286361694, + "rewards/rejected": 0.4958708882331848, + "step": 522 + }, + { + "epoch": 1.85, + "learning_rate": 3.396945090779996e-08, + "logits/chosen": -2.04197359085083, + "logits/rejected": -2.0490293502807617, + "logps/chosen": -1.9794120788574219, + "logps/rejected": -7.9030327796936035, + "loss": 0.7786, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.9635767936706543, + "rewards/margins": 0.06618022918701172, + "rewards/rejected": 0.8973965644836426, + "step": 523 + }, + { + "epoch": 1.85, + "learning_rate": 3.378878132448244e-08, + "logits/chosen": -2.0022506713867188, + "logits/rejected": -2.0265562534332275, + "logps/chosen": -0.534801185131073, + "logps/rejected": -15.17160701751709, + "loss": 0.7232, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7080270051956177, + "rewards/margins": -0.4462982416152954, + "rewards/rejected": 1.154325246810913, + "step": 524 + }, + { + "epoch": 1.86, + "learning_rate": 3.360834796037435e-08, + "logits/chosen": -2.0685338973999023, + "logits/rejected": -2.0698583126068115, + "logps/chosen": -0.8358314037322998, + "logps/rejected": -2.8418397903442383, + "loss": 0.7293, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8789623379707336, + "rewards/margins": 0.1962672770023346, + "rewards/rejected": 0.6826950311660767, + "step": 525 + }, + { + "epoch": 1.86, + "learning_rate": 3.34281534446319e-08, + "logits/chosen": -2.171719789505005, + "logits/rejected": -2.1738741397857666, + "logps/chosen": -0.5414974689483643, + "logps/rejected": -2.6939196586608887, + "loss": 0.6414, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8294838070869446, + "rewards/margins": 0.3874841034412384, + "rewards/rejected": 0.4419997036457062, + "step": 526 + }, + { + "epoch": 1.86, + "learning_rate": 3.324820040293102e-08, + "logits/chosen": -2.095395565032959, + "logits/rejected": -2.1060261726379395, + "logps/chosen": -1.695860505104065, + "logps/rejected": -8.131914138793945, + "loss": 0.6829, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.6777318120002747, + "rewards/margins": -0.021112501621246338, + "rewards/rejected": 0.698844313621521, + "step": 527 + }, + { + "epoch": 1.87, + "learning_rate": 3.306849145742898e-08, + "logits/chosen": -2.0535178184509277, + "logits/rejected": -2.0600454807281494, + "logps/chosen": -0.6197307109832764, + "logps/rejected": -9.561275482177734, + "loss": 0.7717, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7168314456939697, + "rewards/margins": -0.20926064252853394, + "rewards/rejected": 0.9260920286178589, + "step": 528 + }, + { + "epoch": 1.87, + "learning_rate": 3.2889029226726285e-08, + "logits/chosen": -2.043935775756836, + "logits/rejected": -2.053406000137329, + "logps/chosen": -2.099017858505249, + "logps/rejected": -15.225845336914062, + "loss": 0.6898, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6938261985778809, + "rewards/margins": -0.44498589634895325, + "rewards/rejected": 1.1388120651245117, + "step": 529 + }, + { + "epoch": 1.87, + "learning_rate": 3.270981632582843e-08, + "logits/chosen": -2.063016176223755, + "logits/rejected": -2.054276704788208, + "logps/chosen": -3.9153950214385986, + "logps/rejected": -3.0642333030700684, + "loss": 0.6934, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7531058192253113, + "rewards/margins": 0.12608224153518677, + "rewards/rejected": 0.6270235776901245, + "step": 530 + }, + { + "epoch": 1.88, + "learning_rate": 3.2530855366107855e-08, + "logits/chosen": -2.0834102630615234, + "logits/rejected": -2.086700677871704, + "logps/chosen": -1.9141795635223389, + "logps/rejected": -2.1283514499664307, + "loss": 0.7118, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7480906248092651, + "rewards/margins": 0.19461201131343842, + "rewards/rejected": 0.5534785985946655, + "step": 531 + }, + { + "epoch": 1.88, + "learning_rate": 3.235214895526589e-08, + "logits/chosen": -1.9698981046676636, + "logits/rejected": -1.9790433645248413, + "logps/chosen": -2.288414478302002, + "logps/rejected": -11.765528678894043, + "loss": 0.7658, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7722533941268921, + "rewards/margins": -0.11649250984191895, + "rewards/rejected": 0.888745903968811, + "step": 532 + }, + { + "epoch": 1.88, + "learning_rate": 3.2173699697294755e-08, + "logits/chosen": -1.9452455043792725, + "logits/rejected": -1.9457720518112183, + "logps/chosen": -1.7002671957015991, + "logps/rejected": -10.690563201904297, + "loss": 0.6558, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7904394268989563, + "rewards/margins": -0.04682408273220062, + "rewards/rejected": 0.8372635245323181, + "step": 533 + }, + { + "epoch": 1.89, + "learning_rate": 3.1995510192439586e-08, + "logits/chosen": -2.0963692665100098, + "logits/rejected": -2.1068427562713623, + "logps/chosen": -1.1281081438064575, + "logps/rejected": -8.187812805175781, + "loss": 0.8193, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8603235483169556, + "rewards/margins": -0.09277483820915222, + "rewards/rejected": 0.9530984163284302, + "step": 534 + }, + { + "epoch": 1.89, + "learning_rate": 3.1817583037160576e-08, + "logits/chosen": -1.9483853578567505, + "logits/rejected": -1.954572319984436, + "logps/chosen": -3.1968178749084473, + "logps/rejected": -2.871859312057495, + "loss": 0.6006, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.9378165006637573, + "rewards/margins": 0.22294330596923828, + "rewards/rejected": 0.714873194694519, + "step": 535 + }, + { + "epoch": 1.89, + "learning_rate": 3.163992082409515e-08, + "logits/chosen": -2.0271263122558594, + "logits/rejected": -2.0352861881256104, + "logps/chosen": -0.7921642065048218, + "logps/rejected": -3.7982306480407715, + "loss": 0.5927, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8923821449279785, + "rewards/margins": 0.27022892236709595, + "rewards/rejected": 0.6221532225608826, + "step": 536 + }, + { + "epoch": 1.9, + "learning_rate": 3.146252614202011e-08, + "logits/chosen": -1.994824767112732, + "logits/rejected": -2.002593517303467, + "logps/chosen": -0.9114043712615967, + "logps/rejected": -11.961177825927734, + "loss": 0.705, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.824070394039154, + "rewards/margins": -0.2662028968334198, + "rewards/rejected": 1.0902732610702515, + "step": 537 + }, + { + "epoch": 1.9, + "learning_rate": 3.128540157581404e-08, + "logits/chosen": -2.0024964809417725, + "logits/rejected": -2.0001490116119385, + "logps/chosen": -14.6605224609375, + "logps/rejected": -3.2659988403320312, + "loss": 0.6344, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0602426528930664, + "rewards/margins": 0.36254405975341797, + "rewards/rejected": 0.6976985335350037, + "step": 538 + }, + { + "epoch": 1.9, + "learning_rate": 3.110854970641955e-08, + "logits/chosen": -2.041891098022461, + "logits/rejected": -2.042310953140259, + "logps/chosen": -1.8549883365631104, + "logps/rejected": -10.666351318359375, + "loss": 0.6907, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8397300243377686, + "rewards/margins": -0.19547832012176514, + "rewards/rejected": 1.0352083444595337, + "step": 539 + }, + { + "epoch": 1.91, + "learning_rate": 3.093197311080568e-08, + "logits/chosen": -2.0083870887756348, + "logits/rejected": -2.0235538482666016, + "logps/chosen": -0.7126603126525879, + "logps/rejected": -11.128395080566406, + "loss": 0.6346, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8299626111984253, + "rewards/margins": -0.010126352310180664, + "rewards/rejected": 0.840088963508606, + "step": 540 + }, + { + "epoch": 1.91, + "learning_rate": 3.0755674361930385e-08, + "logits/chosen": -1.9648241996765137, + "logits/rejected": -1.9668594598770142, + "logps/chosen": -2.9147002696990967, + "logps/rejected": -7.751448154449463, + "loss": 0.759, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7229217290878296, + "rewards/margins": -0.18490397930145264, + "rewards/rejected": 0.9078257083892822, + "step": 541 + }, + { + "epoch": 1.92, + "learning_rate": 3.057965602870299e-08, + "logits/chosen": -1.9627115726470947, + "logits/rejected": -1.9614741802215576, + "logps/chosen": -6.764293670654297, + "logps/rejected": -1.4561386108398438, + "loss": 0.7599, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2747888565063477, + "rewards/margins": 0.4494270980358124, + "rewards/rejected": 0.8253617882728577, + "step": 542 + }, + { + "epoch": 1.92, + "learning_rate": 3.0403920675946824e-08, + "logits/chosen": -1.992544174194336, + "logits/rejected": -2.0029733180999756, + "logps/chosen": -3.949306011199951, + "logps/rejected": -5.279593467712402, + "loss": 0.7488, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.9997955560684204, + "rewards/margins": 0.21279653906822205, + "rewards/rejected": 0.786998987197876, + "step": 543 + }, + { + "epoch": 1.92, + "learning_rate": 3.0228470864361754e-08, + "logits/chosen": -2.0197525024414062, + "logits/rejected": -2.018883228302002, + "logps/chosen": -3.4301486015319824, + "logps/rejected": -7.684656143188477, + "loss": 0.7119, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6117221713066101, + "rewards/margins": -0.44747018814086914, + "rewards/rejected": 1.059192419052124, + "step": 544 + }, + { + "epoch": 1.93, + "learning_rate": 3.0053309150487e-08, + "logits/chosen": -2.050445079803467, + "logits/rejected": -2.055623769760132, + "logps/chosen": -3.134079933166504, + "logps/rejected": -1.8405287265777588, + "loss": 0.7635, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8497142791748047, + "rewards/margins": 0.11833688616752625, + "rewards/rejected": 0.731377363204956, + "step": 545 + }, + { + "epoch": 1.93, + "learning_rate": 2.987843808666375e-08, + "logits/chosen": -1.9778823852539062, + "logits/rejected": -1.9992653131484985, + "logps/chosen": -2.6414270401000977, + "logps/rejected": -3.5321731567382812, + "loss": 0.5838, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8511974811553955, + "rewards/margins": 0.2952607274055481, + "rewards/rejected": 0.5559366941452026, + "step": 546 + }, + { + "epoch": 1.93, + "learning_rate": 2.970386022099809e-08, + "logits/chosen": -2.0111887454986572, + "logits/rejected": -2.0113322734832764, + "logps/chosen": -0.5956284999847412, + "logps/rejected": -2.076591968536377, + "loss": 0.7003, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.6611626148223877, + "rewards/margins": 0.00129738450050354, + "rewards/rejected": 0.6598652601242065, + "step": 547 + }, + { + "epoch": 1.94, + "learning_rate": 2.9529578097323766e-08, + "logits/chosen": -2.0035054683685303, + "logits/rejected": -2.010072946548462, + "logps/chosen": -0.3407284915447235, + "logps/rejected": -10.682703971862793, + "loss": 0.7171, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7223385572433472, + "rewards/margins": -0.22619375586509705, + "rewards/rejected": 0.9485322833061218, + "step": 548 + }, + { + "epoch": 1.94, + "learning_rate": 2.9355594255165183e-08, + "logits/chosen": -1.9930146932601929, + "logits/rejected": -2.0233993530273438, + "logps/chosen": -8.408332824707031, + "logps/rejected": -6.189807415008545, + "loss": 0.7688, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.154813528060913, + "rewards/margins": 0.39637619256973267, + "rewards/rejected": 0.7584373950958252, + "step": 549 + }, + { + "epoch": 1.94, + "learning_rate": 2.9181911229700377e-08, + "logits/chosen": -2.0868642330169678, + "logits/rejected": -2.0846078395843506, + "logps/chosen": -3.1279478073120117, + "logps/rejected": -4.730249881744385, + "loss": 0.6284, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8471932411193848, + "rewards/margins": 0.38413330912590027, + "rewards/rejected": 0.4630599021911621, + "step": 550 + }, + { + "epoch": 1.95, + "learning_rate": 2.900853155172409e-08, + "logits/chosen": -2.046842575073242, + "logits/rejected": -2.055305242538452, + "logps/chosen": -3.7727034091949463, + "logps/rejected": -6.631382942199707, + "loss": 0.7528, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.9294715523719788, + "rewards/margins": -0.044148385524749756, + "rewards/rejected": 0.9736199378967285, + "step": 551 + }, + { + "epoch": 1.95, + "learning_rate": 2.8835457747610903e-08, + "logits/chosen": -1.9963551759719849, + "logits/rejected": -1.9928714036941528, + "logps/chosen": -4.152552127838135, + "logps/rejected": -9.74232292175293, + "loss": 0.7138, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.715604841709137, + "rewards/margins": -0.062287673354148865, + "rewards/rejected": 0.777892529964447, + "step": 552 + }, + { + "epoch": 1.95, + "learning_rate": 2.8662692339278383e-08, + "logits/chosen": -1.9850748777389526, + "logits/rejected": -2.005174398422241, + "logps/chosen": -2.5669443607330322, + "logps/rejected": -18.587139129638672, + "loss": 0.7012, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8021912574768066, + "rewards/margins": -0.38843637704849243, + "rewards/rejected": 1.1906275749206543, + "step": 553 + }, + { + "epoch": 1.96, + "learning_rate": 2.8490237844150334e-08, + "logits/chosen": -1.9964152574539185, + "logits/rejected": -1.9914779663085938, + "logps/chosen": -8.383565902709961, + "logps/rejected": -5.974554061889648, + "loss": 0.5625, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.11078679561615, + "rewards/margins": 0.4438531994819641, + "rewards/rejected": 0.666933536529541, + "step": 554 + }, + { + "epoch": 1.96, + "learning_rate": 2.831809677512018e-08, + "logits/chosen": -2.0619940757751465, + "logits/rejected": -2.0626964569091797, + "logps/chosen": -1.5812323093414307, + "logps/rejected": -6.55182409286499, + "loss": 0.6233, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8264256715774536, + "rewards/margins": 0.03188374638557434, + "rewards/rejected": 0.7945418953895569, + "step": 555 + }, + { + "epoch": 1.96, + "learning_rate": 2.8146271640514284e-08, + "logits/chosen": -2.052839756011963, + "logits/rejected": -2.060046434402466, + "logps/chosen": -2.2251148223876953, + "logps/rejected": -10.617984771728516, + "loss": 0.5692, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8394283056259155, + "rewards/margins": 0.21796301007270813, + "rewards/rejected": 0.621465265750885, + "step": 556 + }, + { + "epoch": 1.97, + "learning_rate": 2.7974764944055395e-08, + "logits/chosen": -2.0255520343780518, + "logits/rejected": -2.032442569732666, + "logps/chosen": -2.5451784133911133, + "logps/rejected": -3.0225296020507812, + "loss": 0.6757, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9274722337722778, + "rewards/margins": 0.464682012796402, + "rewards/rejected": 0.46279019117355347, + "step": 557 + }, + { + "epoch": 1.97, + "learning_rate": 2.780357918482627e-08, + "logits/chosen": -2.0259454250335693, + "logits/rejected": -2.026226758956909, + "logps/chosen": -1.595900058746338, + "logps/rejected": -2.322478771209717, + "loss": 0.7924, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.87444007396698, + "rewards/margins": 0.051410943269729614, + "rewards/rejected": 0.8230291604995728, + "step": 558 + }, + { + "epoch": 1.98, + "learning_rate": 2.763271685723311e-08, + "logits/chosen": -2.009242296218872, + "logits/rejected": -2.0088510513305664, + "logps/chosen": -4.318241596221924, + "logps/rejected": -2.611233711242676, + "loss": 0.6151, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8908120393753052, + "rewards/margins": 0.24333420395851135, + "rewards/rejected": 0.6474778056144714, + "step": 559 + }, + { + "epoch": 1.98, + "learning_rate": 2.7462180450969287e-08, + "logits/chosen": -2.017662286758423, + "logits/rejected": -2.0262460708618164, + "logps/chosen": -4.147726535797119, + "logps/rejected": -8.049290657043457, + "loss": 0.7394, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.9340109825134277, + "rewards/margins": 0.04466971755027771, + "rewards/rejected": 0.8893412947654724, + "step": 560 + }, + { + "epoch": 1.98, + "learning_rate": 2.729197245097908e-08, + "logits/chosen": -2.00070858001709, + "logits/rejected": -1.9936566352844238, + "logps/chosen": -7.639114856719971, + "logps/rejected": -3.0128185749053955, + "loss": 0.7952, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.239055871963501, + "rewards/margins": 0.6847280263900757, + "rewards/rejected": 0.5543279647827148, + "step": 561 + }, + { + "epoch": 1.99, + "learning_rate": 2.7122095337421467e-08, + "logits/chosen": -1.992923617362976, + "logits/rejected": -1.994127631187439, + "logps/chosen": -1.551569938659668, + "logps/rejected": -3.6994969844818115, + "loss": 0.6546, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8252840042114258, + "rewards/margins": 0.2636215090751648, + "rewards/rejected": 0.561662495136261, + "step": 562 + }, + { + "epoch": 1.99, + "learning_rate": 2.6952551585633943e-08, + "logits/chosen": -1.9605976343154907, + "logits/rejected": -1.95955228805542, + "logps/chosen": -9.526865005493164, + "logps/rejected": -7.722334861755371, + "loss": 0.6926, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.9444942474365234, + "rewards/margins": 0.06946167349815369, + "rewards/rejected": 0.8750325441360474, + "step": 563 + }, + { + "epoch": 1.99, + "learning_rate": 2.6783343666096442e-08, + "logits/chosen": -2.0327348709106445, + "logits/rejected": -2.0363640785217285, + "logps/chosen": -2.7642788887023926, + "logps/rejected": -5.516197681427002, + "loss": 0.7157, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8659851551055908, + "rewards/margins": -0.1381973922252655, + "rewards/rejected": 1.0041825771331787, + "step": 564 + }, + { + "epoch": 2.0, + "learning_rate": 2.6614474044395453e-08, + "logits/chosen": -2.047600269317627, + "logits/rejected": -2.0520899295806885, + "logps/chosen": -3.0265631675720215, + "logps/rejected": -3.1115164756774902, + "loss": 0.5862, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8991777896881104, + "rewards/margins": 0.38286611437797546, + "rewards/rejected": 0.5163117051124573, + "step": 565 + }, + { + "epoch": 2.0, + "learning_rate": 2.6445945181187944e-08, + "logits/chosen": -2.0649373531341553, + "logits/rejected": -2.0811758041381836, + "logps/chosen": -1.3394925594329834, + "logps/rejected": -12.542106628417969, + "loss": 0.7097, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7461095452308655, + "rewards/margins": -0.23723956942558289, + "rewards/rejected": 0.983349084854126, + "step": 566 + }, + { + "epoch": 2.0, + "learning_rate": 2.6277759532165593e-08, + "logits/chosen": -2.0112385749816895, + "logits/rejected": -2.0282440185546875, + "logps/chosen": -1.500545620918274, + "logps/rejected": -5.230602264404297, + "loss": 0.6458, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8401870727539062, + "rewards/margins": 0.20178623497486115, + "rewards/rejected": 0.6384008526802063, + "step": 567 + }, + { + "epoch": 2.01, + "learning_rate": 2.6109919548019e-08, + "logits/chosen": -2.0407965183258057, + "logits/rejected": -2.046032428741455, + "logps/chosen": -3.7747154235839844, + "logps/rejected": -8.817118644714355, + "loss": 0.7395, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8961506485939026, + "rewards/margins": 0.047125816345214844, + "rewards/rejected": 0.8490248322486877, + "step": 568 + }, + { + "epoch": 2.01, + "learning_rate": 2.5942427674401934e-08, + "logits/chosen": -2.0034799575805664, + "logits/rejected": -2.011924982070923, + "logps/chosen": -3.278290271759033, + "logps/rejected": -9.199019432067871, + "loss": 0.7218, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8254525065422058, + "rewards/margins": -0.01707214117050171, + "rewards/rejected": 0.8425246477127075, + "step": 569 + }, + { + "epoch": 2.01, + "learning_rate": 2.577528635189574e-08, + "logits/chosen": -2.057476043701172, + "logits/rejected": -2.057609796524048, + "logps/chosen": -3.2188024520874023, + "logps/rejected": -2.8267674446105957, + "loss": 0.494, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.010270595550537, + "rewards/margins": 0.35321325063705444, + "rewards/rejected": 0.6570574045181274, + "step": 570 + }, + { + "epoch": 2.02, + "learning_rate": 2.560849801597381e-08, + "logits/chosen": -1.9759262800216675, + "logits/rejected": -1.9776380062103271, + "logps/chosen": -1.780182957649231, + "logps/rejected": -8.037775993347168, + "loss": 0.7855, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7115724086761475, + "rewards/margins": -0.1353342980146408, + "rewards/rejected": 0.8469066619873047, + "step": 571 + }, + { + "epoch": 2.02, + "learning_rate": 2.544206509696598e-08, + "logits/chosen": -2.0293140411376953, + "logits/rejected": -2.035022735595703, + "logps/chosen": -3.2516517639160156, + "logps/rejected": -0.988677978515625, + "loss": 0.7313, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.9141806364059448, + "rewards/margins": 0.21743208169937134, + "rewards/rejected": 0.6967486143112183, + "step": 572 + }, + { + "epoch": 2.02, + "learning_rate": 2.5275990020023198e-08, + "logits/chosen": -2.0545294284820557, + "logits/rejected": -2.0617806911468506, + "logps/chosen": -1.8684097528457642, + "logps/rejected": -3.0811102390289307, + "loss": 0.6864, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.92132568359375, + "rewards/margins": 0.30214595794677734, + "rewards/rejected": 0.6191796660423279, + "step": 573 + }, + { + "epoch": 2.03, + "learning_rate": 2.511027520508222e-08, + "logits/chosen": -1.9770764112472534, + "logits/rejected": -1.9793449640274048, + "logps/chosen": -1.2212358713150024, + "logps/rejected": -2.209157705307007, + "loss": 0.551, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9534399509429932, + "rewards/margins": 0.25630471110343933, + "rewards/rejected": 0.6971352696418762, + "step": 574 + }, + { + "epoch": 2.03, + "learning_rate": 2.4944923066830242e-08, + "logits/chosen": -2.057431697845459, + "logits/rejected": -2.0581517219543457, + "logps/chosen": -2.410123348236084, + "logps/rejected": -2.6558408737182617, + "loss": 0.812, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7993772029876709, + "rewards/margins": 0.061206042766571045, + "rewards/rejected": 0.7381712198257446, + "step": 575 + }, + { + "epoch": 2.04, + "learning_rate": 2.477993601466979e-08, + "logits/chosen": -2.0316355228424072, + "logits/rejected": -2.044489622116089, + "logps/chosen": -4.918635368347168, + "logps/rejected": -16.23321533203125, + "loss": 0.6546, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6547302007675171, + "rewards/margins": -0.26509252190589905, + "rewards/rejected": 0.9198226928710938, + "step": 576 + }, + { + "epoch": 2.04, + "learning_rate": 2.4615316452683637e-08, + "logits/chosen": -2.0879533290863037, + "logits/rejected": -2.0897574424743652, + "logps/chosen": -1.7288776636123657, + "logps/rejected": -3.2326817512512207, + "loss": 0.6385, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9714429378509521, + "rewards/margins": 0.4165174663066864, + "rewards/rejected": 0.5549254417419434, + "step": 577 + }, + { + "epoch": 2.04, + "learning_rate": 2.4451066779599688e-08, + "logits/chosen": -2.0044214725494385, + "logits/rejected": -2.0094046592712402, + "logps/chosen": -1.6971266269683838, + "logps/rejected": -7.149723529815674, + "loss": 0.7476, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8292839527130127, + "rewards/margins": 0.14392054080963135, + "rewards/rejected": 0.6853634119033813, + "step": 578 + }, + { + "epoch": 2.05, + "learning_rate": 2.428718938875607e-08, + "logits/chosen": -1.9645317792892456, + "logits/rejected": -1.9624508619308472, + "logps/chosen": -2.6154232025146484, + "logps/rejected": -3.554192066192627, + "loss": 0.5418, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9955108165740967, + "rewards/margins": 0.41652026772499084, + "rewards/rejected": 0.5789905786514282, + "step": 579 + }, + { + "epoch": 2.05, + "learning_rate": 2.4123686668066278e-08, + "logits/chosen": -2.009467124938965, + "logits/rejected": -2.0185532569885254, + "logps/chosen": -4.825134754180908, + "logps/rejected": -8.01343059539795, + "loss": 0.8086, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7624952793121338, + "rewards/margins": -0.37172311544418335, + "rewards/rejected": 1.1342183351516724, + "step": 580 + }, + { + "epoch": 2.05, + "learning_rate": 2.3960560999984347e-08, + "logits/chosen": -2.05543851852417, + "logits/rejected": -2.052042007446289, + "logps/chosen": -1.7976655960083008, + "logps/rejected": -4.230362892150879, + "loss": 0.6222, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7570362091064453, + "rewards/margins": 0.013178825378417969, + "rewards/rejected": 0.7438573837280273, + "step": 581 + }, + { + "epoch": 2.06, + "learning_rate": 2.3797814761470142e-08, + "logits/chosen": -1.961545705795288, + "logits/rejected": -1.9631580114364624, + "logps/chosen": -1.7619738578796387, + "logps/rejected": -1.9743021726608276, + "loss": 0.6076, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7722119092941284, + "rewards/margins": -0.056018561124801636, + "rewards/rejected": 0.8282304406166077, + "step": 582 + }, + { + "epoch": 2.06, + "learning_rate": 2.363545032395477e-08, + "logits/chosen": -1.9618512392044067, + "logits/rejected": -1.9622458219528198, + "logps/chosen": -0.6731535196304321, + "logps/rejected": -3.099217414855957, + "loss": 0.6798, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8325234055519104, + "rewards/margins": 0.2326831817626953, + "rewards/rejected": 0.5998402237892151, + "step": 583 + }, + { + "epoch": 2.06, + "learning_rate": 2.347347005330595e-08, + "logits/chosen": -2.00575590133667, + "logits/rejected": -2.008124828338623, + "logps/chosen": -2.194767475128174, + "logps/rejected": -4.25375509262085, + "loss": 0.6817, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.852127730846405, + "rewards/margins": 0.30934983491897583, + "rewards/rejected": 0.5427778959274292, + "step": 584 + }, + { + "epoch": 2.07, + "learning_rate": 2.331187630979355e-08, + "logits/chosen": -1.9913712739944458, + "logits/rejected": -2.002030611038208, + "logps/chosen": -2.6001105308532715, + "logps/rejected": -8.32321548461914, + "loss": 0.6672, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8142086267471313, + "rewards/margins": 0.011523663997650146, + "rewards/rejected": 0.8026849627494812, + "step": 585 + }, + { + "epoch": 2.07, + "learning_rate": 2.3150671448055297e-08, + "logits/chosen": -1.9764267206192017, + "logits/rejected": -1.9773643016815186, + "logps/chosen": -8.834898948669434, + "logps/rejected": -3.1544270515441895, + "loss": 0.6264, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4302020072937012, + "rewards/margins": 0.9254300594329834, + "rewards/rejected": 0.5047719478607178, + "step": 586 + }, + { + "epoch": 2.07, + "learning_rate": 2.2989857817062324e-08, + "logits/chosen": -2.088477849960327, + "logits/rejected": -2.092190742492676, + "logps/chosen": -1.199931263923645, + "logps/rejected": -14.818371772766113, + "loss": 0.6739, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.9407612681388855, + "rewards/margins": -0.11372298002243042, + "rewards/rejected": 1.054484248161316, + "step": 587 + }, + { + "epoch": 2.08, + "learning_rate": 2.2829437760085018e-08, + "logits/chosen": -2.008159637451172, + "logits/rejected": -2.0010764598846436, + "logps/chosen": -7.829075813293457, + "logps/rejected": -2.2703726291656494, + "loss": 0.7051, + "rewards/accuracies": 0.5, + "rewards/chosen": 1.0938199758529663, + "rewards/margins": 0.03658020496368408, + "rewards/rejected": 1.0572397708892822, + "step": 588 + }, + { + "epoch": 2.08, + "learning_rate": 2.266941361465886e-08, + "logits/chosen": -1.9559390544891357, + "logits/rejected": -1.9615315198898315, + "logps/chosen": -7.809641361236572, + "logps/rejected": -7.242613792419434, + "loss": 0.6745, + "rewards/accuracies": 0.5, + "rewards/chosen": 1.1503636837005615, + "rewards/margins": -0.03218865394592285, + "rewards/rejected": 1.1825523376464844, + "step": 589 + }, + { + "epoch": 2.08, + "learning_rate": 2.2509787712550422e-08, + "logits/chosen": -2.037580966949463, + "logits/rejected": -2.0364181995391846, + "logps/chosen": -1.9517459869384766, + "logps/rejected": -7.129281044006348, + "loss": 0.6869, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.816758394241333, + "rewards/margins": -0.05917319655418396, + "rewards/rejected": 0.8759315609931946, + "step": 590 + }, + { + "epoch": 2.09, + "learning_rate": 2.2350562379723258e-08, + "logits/chosen": -2.020242691040039, + "logits/rejected": -2.019148588180542, + "logps/chosen": -0.6644210815429688, + "logps/rejected": -5.718393325805664, + "loss": 0.5353, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7431098222732544, + "rewards/margins": 0.019783228635787964, + "rewards/rejected": 0.723326563835144, + "step": 591 + }, + { + "epoch": 2.09, + "learning_rate": 2.2191739936304142e-08, + "logits/chosen": -2.0384092330932617, + "logits/rejected": -2.0371034145355225, + "logps/chosen": -2.609067678451538, + "logps/rejected": -11.53480339050293, + "loss": 0.6514, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8335716724395752, + "rewards/margins": -0.15862277150154114, + "rewards/rejected": 0.992194414138794, + "step": 592 + }, + { + "epoch": 2.1, + "learning_rate": 2.2033322696549196e-08, + "logits/chosen": -1.9382199048995972, + "logits/rejected": -1.9403350353240967, + "logps/chosen": -1.2491860389709473, + "logps/rejected": -5.87467098236084, + "loss": 0.6252, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8515692949295044, + "rewards/margins": 0.020187795162200928, + "rewards/rejected": 0.8313815593719482, + "step": 593 + }, + { + "epoch": 2.1, + "learning_rate": 2.1875312968810165e-08, + "logits/chosen": -2.0866539478302, + "logits/rejected": -2.0814132690429688, + "logps/chosen": -8.953042984008789, + "logps/rejected": -7.559203147888184, + "loss": 0.4408, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2457467317581177, + "rewards/margins": 0.8054978251457214, + "rewards/rejected": 0.440248966217041, + "step": 594 + }, + { + "epoch": 2.1, + "learning_rate": 2.1717713055500802e-08, + "logits/chosen": -1.9773072004318237, + "logits/rejected": -1.9725635051727295, + "logps/chosen": -1.044565200805664, + "logps/rejected": -7.804239273071289, + "loss": 0.6695, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6771745085716248, + "rewards/margins": -0.4215153455734253, + "rewards/rejected": 1.0986897945404053, + "step": 595 + }, + { + "epoch": 2.11, + "learning_rate": 2.1560525253063356e-08, + "logits/chosen": -2.0067083835601807, + "logits/rejected": -2.0046184062957764, + "logps/chosen": -0.7185416221618652, + "logps/rejected": -4.102226257324219, + "loss": 0.6627, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7694543600082397, + "rewards/margins": 0.02975103259086609, + "rewards/rejected": 0.7397032976150513, + "step": 596 + }, + { + "epoch": 2.11, + "learning_rate": 2.140375185193502e-08, + "logits/chosen": -1.9915493726730347, + "logits/rejected": -1.9945294857025146, + "logps/chosen": -4.591081142425537, + "logps/rejected": -1.8489367961883545, + "loss": 0.6467, + "rewards/accuracies": 0.5, + "rewards/chosen": 1.0379424095153809, + "rewards/margins": 0.16970404982566833, + "rewards/rejected": 0.8682383894920349, + "step": 597 + }, + { + "epoch": 2.11, + "learning_rate": 2.124739513651459e-08, + "logits/chosen": -2.032160520553589, + "logits/rejected": -2.0365474224090576, + "logps/chosen": -2.226121425628662, + "logps/rejected": -2.910736560821533, + "loss": 0.6493, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8582420945167542, + "rewards/margins": 0.2423931211233139, + "rewards/rejected": 0.615848958492279, + "step": 598 + }, + { + "epoch": 2.12, + "learning_rate": 2.109145738512926e-08, + "logits/chosen": -2.045753002166748, + "logits/rejected": -2.0703043937683105, + "logps/chosen": -2.902224063873291, + "logps/rejected": -10.012685775756836, + "loss": 0.7382, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7705689668655396, + "rewards/margins": -0.010104715824127197, + "rewards/rejected": 0.7806737422943115, + "step": 599 + }, + { + "epoch": 2.12, + "learning_rate": 2.0935940870001305e-08, + "logits/chosen": -2.054701566696167, + "logits/rejected": -2.0570759773254395, + "logps/chosen": -2.9068479537963867, + "logps/rejected": -15.112326622009277, + "loss": 0.8254, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.709728479385376, + "rewards/margins": -0.41981935501098633, + "rewards/rejected": 1.1295478343963623, + "step": 600 + }, + { + "epoch": 2.12, + "learning_rate": 2.0780847857215005e-08, + "logits/chosen": -1.9983631372451782, + "logits/rejected": -1.991119384765625, + "logps/chosen": -2.1741013526916504, + "logps/rejected": -2.8174612522125244, + "loss": 0.7923, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7056914567947388, + "rewards/margins": -0.20511162281036377, + "rewards/rejected": 0.9108030796051025, + "step": 601 + }, + { + "epoch": 2.13, + "learning_rate": 2.0626180606683712e-08, + "logits/chosen": -1.9828431606292725, + "logits/rejected": -1.9831266403198242, + "logps/chosen": -0.855148434638977, + "logps/rejected": -3.772496461868286, + "loss": 0.6323, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8456794023513794, + "rewards/margins": 0.364407479763031, + "rewards/rejected": 0.48127198219299316, + "step": 602 + }, + { + "epoch": 2.13, + "learning_rate": 2.047194137211679e-08, + "logits/chosen": -2.0280098915100098, + "logits/rejected": -2.0297935009002686, + "logps/chosen": -1.2537363767623901, + "logps/rejected": -14.13105583190918, + "loss": 0.9797, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8036458492279053, + "rewards/margins": -0.23997443914413452, + "rewards/rejected": 1.0436203479766846, + "step": 603 + }, + { + "epoch": 2.13, + "learning_rate": 2.031813240098686e-08, + "logits/chosen": -2.0764243602752686, + "logits/rejected": -2.0784060955047607, + "logps/chosen": -0.8096239566802979, + "logps/rejected": -7.835744857788086, + "loss": 0.6648, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.9219307899475098, + "rewards/margins": -0.024880290031433105, + "rewards/rejected": 0.9468110799789429, + "step": 604 + }, + { + "epoch": 2.14, + "learning_rate": 2.0164755934497017e-08, + "logits/chosen": -2.003708839416504, + "logits/rejected": -2.011209011077881, + "logps/chosen": -2.681303024291992, + "logps/rejected": -1.8241283893585205, + "loss": 0.6698, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8556469678878784, + "rewards/margins": 0.17859110236167908, + "rewards/rejected": 0.677055835723877, + "step": 605 + }, + { + "epoch": 2.14, + "learning_rate": 2.001181420754819e-08, + "logits/chosen": -2.0569889545440674, + "logits/rejected": -2.057389736175537, + "logps/chosen": -1.164604663848877, + "logps/rejected": -3.2715351581573486, + "loss": 0.7233, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7473554015159607, + "rewards/margins": 0.032030582427978516, + "rewards/rejected": 0.7153248190879822, + "step": 606 + }, + { + "epoch": 2.14, + "learning_rate": 1.9859309448706567e-08, + "logits/chosen": -2.0413804054260254, + "logits/rejected": -2.045595645904541, + "logps/chosen": -7.072506904602051, + "logps/rejected": -2.856722354888916, + "loss": 0.6312, + "rewards/accuracies": 0.5, + "rewards/chosen": 1.2344695329666138, + "rewards/margins": 0.5054893493652344, + "rewards/rejected": 0.7289801836013794, + "step": 607 + }, + { + "epoch": 2.15, + "learning_rate": 1.9707243880171117e-08, + "logits/chosen": -2.022520065307617, + "logits/rejected": -2.0322115421295166, + "logps/chosen": -2.2906057834625244, + "logps/rejected": -9.180529594421387, + "loss": 0.7429, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7780551910400391, + "rewards/margins": -0.19868671894073486, + "rewards/rejected": 0.9767419099807739, + "step": 608 + }, + { + "epoch": 2.15, + "learning_rate": 1.9555619717741247e-08, + "logits/chosen": -1.9721676111221313, + "logits/rejected": -1.978164792060852, + "logps/chosen": -2.6750876903533936, + "logps/rejected": -2.336664915084839, + "loss": 0.6939, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8626894950866699, + "rewards/margins": 0.2115408480167389, + "rewards/rejected": 0.6511486172676086, + "step": 609 + }, + { + "epoch": 2.16, + "learning_rate": 1.9404439170784438e-08, + "logits/chosen": -2.064141273498535, + "logits/rejected": -2.0655877590179443, + "logps/chosen": -0.5082980394363403, + "logps/rejected": -7.92199182510376, + "loss": 0.8529, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6688159704208374, + "rewards/margins": -0.29575231671333313, + "rewards/rejected": 0.9645683169364929, + "step": 610 + }, + { + "epoch": 2.16, + "learning_rate": 1.925370444220415e-08, + "logits/chosen": -2.122034788131714, + "logits/rejected": -2.1084694862365723, + "logps/chosen": -2.7121593952178955, + "logps/rejected": -7.88394832611084, + "loss": 0.7391, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.9141722917556763, + "rewards/margins": -0.04377911984920502, + "rewards/rejected": 0.9579514265060425, + "step": 611 + }, + { + "epoch": 2.16, + "learning_rate": 1.910341772840764e-08, + "logits/chosen": -2.025146722793579, + "logits/rejected": -2.0106194019317627, + "logps/chosen": -4.326772689819336, + "logps/rejected": -4.085480690002441, + "loss": 0.6733, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8304271697998047, + "rewards/margins": 0.2015409767627716, + "rewards/rejected": 0.6288861632347107, + "step": 612 + }, + { + "epoch": 2.17, + "learning_rate": 1.8953581219273984e-08, + "logits/chosen": -1.9889960289001465, + "logits/rejected": -1.9964889287948608, + "logps/chosen": -0.7468490600585938, + "logps/rejected": -13.927059173583984, + "loss": 0.7756, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7617274522781372, + "rewards/margins": -0.5284518599510193, + "rewards/rejected": 1.2901792526245117, + "step": 613 + }, + { + "epoch": 2.17, + "learning_rate": 1.8804197098122167e-08, + "logits/chosen": -2.0910017490386963, + "logits/rejected": -2.09944748878479, + "logps/chosen": -3.593019962310791, + "logps/rejected": -11.942726135253906, + "loss": 0.6209, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.974206805229187, + "rewards/margins": 0.03980359435081482, + "rewards/rejected": 0.9344031810760498, + "step": 614 + }, + { + "epoch": 2.17, + "learning_rate": 1.8655267541679316e-08, + "logits/chosen": -1.9786750078201294, + "logits/rejected": -1.9838060140609741, + "logps/chosen": -2.01824688911438, + "logps/rejected": -13.196106910705566, + "loss": 0.6677, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8110160231590271, + "rewards/margins": 0.11979460716247559, + "rewards/rejected": 0.6912214159965515, + "step": 615 + }, + { + "epoch": 2.18, + "learning_rate": 1.8506794720048903e-08, + "logits/chosen": -2.034433364868164, + "logits/rejected": -2.035766124725342, + "logps/chosen": -2.2776429653167725, + "logps/rejected": -17.69969940185547, + "loss": 0.8718, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8272294402122498, + "rewards/margins": -0.31186944246292114, + "rewards/rejected": 1.139098882675171, + "step": 616 + }, + { + "epoch": 2.18, + "learning_rate": 1.835878079667917e-08, + "logits/chosen": -1.9270710945129395, + "logits/rejected": -1.9479575157165527, + "logps/chosen": -3.4399912357330322, + "logps/rejected": -12.927302360534668, + "loss": 0.6615, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.9511443376541138, + "rewards/margins": -0.07688146829605103, + "rewards/rejected": 1.02802574634552, + "step": 617 + }, + { + "epoch": 2.18, + "learning_rate": 1.821122792833159e-08, + "logits/chosen": -2.007516860961914, + "logits/rejected": -2.0139498710632324, + "logps/chosen": -1.521719217300415, + "logps/rejected": -2.819697856903076, + "loss": 0.67, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8211601376533508, + "rewards/margins": 0.15834173560142517, + "rewards/rejected": 0.662818431854248, + "step": 618 + }, + { + "epoch": 2.19, + "learning_rate": 1.8064138265049457e-08, + "logits/chosen": -2.0392799377441406, + "logits/rejected": -2.051175832748413, + "logps/chosen": -0.6170483231544495, + "logps/rejected": -11.655950546264648, + "loss": 0.7177, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7457737922668457, + "rewards/margins": -0.06285691261291504, + "rewards/rejected": 0.8086307048797607, + "step": 619 + }, + { + "epoch": 2.19, + "learning_rate": 1.7917513950126517e-08, + "logits/chosen": -1.996695637702942, + "logits/rejected": -1.999213695526123, + "logps/chosen": -3.4249954223632812, + "logps/rejected": -8.233606338500977, + "loss": 0.7159, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8560407757759094, + "rewards/margins": -0.09171494841575623, + "rewards/rejected": 0.9477556943893433, + "step": 620 + }, + { + "epoch": 2.19, + "learning_rate": 1.777135712007583e-08, + "logits/chosen": -2.0693438053131104, + "logits/rejected": -2.105133056640625, + "logps/chosen": -4.319206237792969, + "logps/rejected": -17.05260467529297, + "loss": 0.5698, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8089077472686768, + "rewards/margins": 0.11838430166244507, + "rewards/rejected": 0.6905234456062317, + "step": 621 + }, + { + "epoch": 2.2, + "learning_rate": 1.7625669904598516e-08, + "logits/chosen": -2.0367863178253174, + "logits/rejected": -2.0399792194366455, + "logps/chosen": -0.6974972486495972, + "logps/rejected": -7.474323749542236, + "loss": 0.6898, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8210632801055908, + "rewards/margins": 0.1250191479921341, + "rewards/rejected": 0.6960441470146179, + "step": 622 + }, + { + "epoch": 2.2, + "learning_rate": 1.748045442655277e-08, + "logits/chosen": -1.9355571269989014, + "logits/rejected": -1.935289740562439, + "logps/chosen": -0.9043629169464111, + "logps/rejected": -2.8344295024871826, + "loss": 0.6637, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8327189683914185, + "rewards/margins": 0.07216313481330872, + "rewards/rejected": 0.7605558633804321, + "step": 623 + }, + { + "epoch": 2.2, + "learning_rate": 1.7335712801923015e-08, + "logits/chosen": -1.9766651391983032, + "logits/rejected": -1.9668244123458862, + "logps/chosen": -4.653731346130371, + "logps/rejected": -8.41660213470459, + "loss": 0.6176, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8336288928985596, + "rewards/margins": -0.06122402846813202, + "rewards/rejected": 0.8948529362678528, + "step": 624 + }, + { + "epoch": 2.21, + "learning_rate": 1.7191447139788923e-08, + "logits/chosen": -2.016690254211426, + "logits/rejected": -2.0315163135528564, + "logps/chosen": -0.8172276020050049, + "logps/rejected": -7.386982440948486, + "loss": 0.7574, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8168500661849976, + "rewards/margins": 0.3764428198337555, + "rewards/rejected": 0.44040727615356445, + "step": 625 + }, + { + "epoch": 2.21, + "learning_rate": 1.7047659542294758e-08, + "logits/chosen": -1.9935376644134521, + "logits/rejected": -2.0017035007476807, + "logps/chosen": -0.6560835838317871, + "logps/rejected": -13.202838897705078, + "loss": 0.8275, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7967087030410767, + "rewards/margins": -0.5762240886688232, + "rewards/rejected": 1.3729327917099, + "step": 626 + }, + { + "epoch": 2.22, + "learning_rate": 1.690435210461879e-08, + "logits/chosen": -2.0244901180267334, + "logits/rejected": -2.0363893508911133, + "logps/chosen": -0.7464725375175476, + "logps/rejected": -10.042951583862305, + "loss": 0.7706, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7921556234359741, + "rewards/margins": -0.44403427839279175, + "rewards/rejected": 1.236189842224121, + "step": 627 + }, + { + "epoch": 2.22, + "learning_rate": 1.676152691494268e-08, + "logits/chosen": -2.0517935752868652, + "logits/rejected": -2.0509769916534424, + "logps/chosen": -0.543655276298523, + "logps/rejected": -3.1626129150390625, + "loss": 0.593, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7659159898757935, + "rewards/margins": 0.030998170375823975, + "rewards/rejected": 0.7349178194999695, + "step": 628 + }, + { + "epoch": 2.22, + "learning_rate": 1.6619186054421087e-08, + "logits/chosen": -2.055584192276001, + "logits/rejected": -2.063502073287964, + "logps/chosen": -5.039632797241211, + "logps/rejected": -13.890341758728027, + "loss": 0.6014, + "rewards/accuracies": 0.5, + "rewards/chosen": 1.1311943531036377, + "rewards/margins": 0.1494792103767395, + "rewards/rejected": 0.9817150831222534, + "step": 629 + }, + { + "epoch": 2.23, + "learning_rate": 1.6477331597151354e-08, + "logits/chosen": -2.001138687133789, + "logits/rejected": -2.0043752193450928, + "logps/chosen": -2.774481773376465, + "logps/rejected": -3.235058546066284, + "loss": 0.5714, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0673794746398926, + "rewards/margins": 0.532346248626709, + "rewards/rejected": 0.5350331664085388, + "step": 630 + }, + { + "epoch": 2.23, + "learning_rate": 1.633596561014327e-08, + "logits/chosen": -2.0180366039276123, + "logits/rejected": -2.022498607635498, + "logps/chosen": -1.6715686321258545, + "logps/rejected": -2.6716225147247314, + "loss": 0.691, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8013469576835632, + "rewards/margins": 0.17252743244171143, + "rewards/rejected": 0.6288195252418518, + "step": 631 + }, + { + "epoch": 2.23, + "learning_rate": 1.6195090153288965e-08, + "logits/chosen": -2.0205626487731934, + "logits/rejected": -2.0308783054351807, + "logps/chosen": -0.7282112836837769, + "logps/rejected": -15.152219772338867, + "loss": 0.7817, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7779254913330078, + "rewards/margins": -0.455770879983902, + "rewards/rejected": 1.2336963415145874, + "step": 632 + }, + { + "epoch": 2.24, + "learning_rate": 1.6054707279332864e-08, + "logits/chosen": -2.0393261909484863, + "logits/rejected": -2.0581719875335693, + "logps/chosen": -0.6303465962409973, + "logps/rejected": -19.168148040771484, + "loss": 0.7649, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7804315090179443, + "rewards/margins": -0.5865100622177124, + "rewards/rejected": 1.3669415712356567, + "step": 633 + }, + { + "epoch": 2.24, + "learning_rate": 1.591481903384184e-08, + "logits/chosen": -2.0524163246154785, + "logits/rejected": -2.0535519123077393, + "logps/chosen": -0.7245944738388062, + "logps/rejected": -6.748810768127441, + "loss": 0.6139, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7903498411178589, + "rewards/margins": -0.0012988746166229248, + "rewards/rejected": 0.7916487455368042, + "step": 634 + }, + { + "epoch": 2.24, + "learning_rate": 1.5775427455175327e-08, + "logits/chosen": -1.9905900955200195, + "logits/rejected": -1.9828003644943237, + "logps/chosen": -7.479012489318848, + "logps/rejected": -6.499181270599365, + "loss": 0.6463, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0572867393493652, + "rewards/margins": 0.5009394884109497, + "rewards/rejected": 0.5563472509384155, + "step": 635 + }, + { + "epoch": 2.25, + "learning_rate": 1.5636534574455684e-08, + "logits/chosen": -1.9791390895843506, + "logits/rejected": -1.9944130182266235, + "logps/chosen": -3.2026634216308594, + "logps/rejected": -8.150577545166016, + "loss": 0.592, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8955620527267456, + "rewards/margins": -0.03126728534698486, + "rewards/rejected": 0.9268293380737305, + "step": 636 + }, + { + "epoch": 2.25, + "learning_rate": 1.5498142415538558e-08, + "logits/chosen": -2.10295033454895, + "logits/rejected": -2.1083662509918213, + "logps/chosen": -1.8163459300994873, + "logps/rejected": -4.09120512008667, + "loss": 0.5583, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8979101181030273, + "rewards/margins": 0.3972064256668091, + "rewards/rejected": 0.500703752040863, + "step": 637 + }, + { + "epoch": 2.25, + "learning_rate": 1.5360252994983402e-08, + "logits/chosen": -1.9281195402145386, + "logits/rejected": -1.9234906435012817, + "logps/chosen": -0.8552428483963013, + "logps/rejected": -5.927672386169434, + "loss": 0.6353, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8280496597290039, + "rewards/margins": 0.041133224964141846, + "rewards/rejected": 0.7869163751602173, + "step": 638 + }, + { + "epoch": 2.26, + "learning_rate": 1.522286832202409e-08, + "logits/chosen": -2.03206467628479, + "logits/rejected": -2.035742998123169, + "logps/chosen": -0.8042182922363281, + "logps/rejected": -7.642852783203125, + "loss": 0.5729, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8072888255119324, + "rewards/margins": 0.1098719984292984, + "rewards/rejected": 0.6974168419837952, + "step": 639 + }, + { + "epoch": 2.26, + "learning_rate": 1.5085990398539683e-08, + "logits/chosen": -1.9784064292907715, + "logits/rejected": -1.9899060726165771, + "logps/chosen": -0.8280088901519775, + "logps/rejected": -5.95151424407959, + "loss": 0.7244, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7931944131851196, + "rewards/margins": 0.0324881374835968, + "rewards/rejected": 0.7607063055038452, + "step": 640 + }, + { + "epoch": 2.27, + "learning_rate": 1.4949621219025194e-08, + "logits/chosen": -1.9960541725158691, + "logits/rejected": -1.990307092666626, + "logps/chosen": -3.020296335220337, + "logps/rejected": -14.007085800170898, + "loss": 0.6348, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8403646349906921, + "rewards/margins": -0.3056904673576355, + "rewards/rejected": 1.1460551023483276, + "step": 641 + }, + { + "epoch": 2.27, + "learning_rate": 1.481376277056255e-08, + "logits/chosen": -2.0469722747802734, + "logits/rejected": -2.047741174697876, + "logps/chosen": -2.547891616821289, + "logps/rejected": -3.2712488174438477, + "loss": 0.7348, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8292343616485596, + "rewards/margins": -0.017476260662078857, + "rewards/rejected": 0.8467105627059937, + "step": 642 + }, + { + "epoch": 2.27, + "learning_rate": 1.4678417032791651e-08, + "logits/chosen": -2.0731072425842285, + "logits/rejected": -2.0990054607391357, + "logps/chosen": -6.644369125366211, + "logps/rejected": -14.275355339050293, + "loss": 0.8292, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.944605827331543, + "rewards/margins": -0.09639033675193787, + "rewards/rejected": 1.0409961938858032, + "step": 643 + }, + { + "epoch": 2.28, + "learning_rate": 1.4543585977881511e-08, + "logits/chosen": -2.0028035640716553, + "logits/rejected": -2.005750894546509, + "logps/chosen": -2.4404869079589844, + "logps/rejected": -6.502226829528809, + "loss": 0.6101, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7892129421234131, + "rewards/margins": -0.18795138597488403, + "rewards/rejected": 0.9771643280982971, + "step": 644 + }, + { + "epoch": 2.28, + "learning_rate": 1.4409271570501519e-08, + "logits/chosen": -2.048950433731079, + "logits/rejected": -2.070321798324585, + "logps/chosen": -0.592048168182373, + "logps/rejected": -3.6043734550476074, + "loss": 0.6456, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8274204730987549, + "rewards/margins": 0.07750433683395386, + "rewards/rejected": 0.749916136264801, + "step": 645 + }, + { + "epoch": 2.28, + "learning_rate": 1.4275475767792844e-08, + "logits/chosen": -1.9535112380981445, + "logits/rejected": -1.94771409034729, + "logps/chosen": -3.2636868953704834, + "logps/rejected": -6.86224889755249, + "loss": 0.6665, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7070361375808716, + "rewards/margins": -0.3484286963939667, + "rewards/rejected": 1.055464744567871, + "step": 646 + }, + { + "epoch": 2.29, + "learning_rate": 1.4142200519339841e-08, + "logits/chosen": -2.127493143081665, + "logits/rejected": -2.127772808074951, + "logps/chosen": -1.918084979057312, + "logps/rejected": -2.848398208618164, + "loss": 0.7445, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7153842449188232, + "rewards/margins": 0.022627443075180054, + "rewards/rejected": 0.6927567720413208, + "step": 647 + }, + { + "epoch": 2.29, + "learning_rate": 1.4009447767141746e-08, + "logits/chosen": -2.053114652633667, + "logits/rejected": -2.0569190979003906, + "logps/chosen": -1.351191759109497, + "logps/rejected": -3.118528366088867, + "loss": 0.6464, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7535523176193237, + "rewards/margins": 0.18522386252880096, + "rewards/rejected": 0.5683284401893616, + "step": 648 + }, + { + "epoch": 2.29, + "learning_rate": 1.3877219445584282e-08, + "logits/chosen": -2.00966739654541, + "logits/rejected": -2.062786817550659, + "logps/chosen": -6.912562370300293, + "logps/rejected": -4.22528076171875, + "loss": 0.6439, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6828598976135254, + "rewards/margins": -0.03647667169570923, + "rewards/rejected": 0.7193365693092346, + "step": 649 + }, + { + "epoch": 2.3, + "learning_rate": 1.3745517481411529e-08, + "logits/chosen": -1.9858731031417847, + "logits/rejected": -1.9860048294067383, + "logps/chosen": -1.0839316844940186, + "logps/rejected": -3.7435030937194824, + "loss": 0.554, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9181524515151978, + "rewards/margins": 0.05196338891983032, + "rewards/rejected": 0.8661890625953674, + "step": 650 + }, + { + "epoch": 2.3, + "learning_rate": 1.361434379369783e-08, + "logits/chosen": -2.078198194503784, + "logits/rejected": -2.07645583152771, + "logps/chosen": -1.0227668285369873, + "logps/rejected": -4.009435653686523, + "loss": 0.7192, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6606489419937134, + "rewards/margins": -0.20450091361999512, + "rewards/rejected": 0.8651498556137085, + "step": 651 + }, + { + "epoch": 2.3, + "learning_rate": 1.3483700293819817e-08, + "logits/chosen": -1.9780060052871704, + "logits/rejected": -1.978055477142334, + "logps/chosen": -1.6383609771728516, + "logps/rejected": -4.226902008056641, + "loss": 0.6782, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8775732517242432, + "rewards/margins": 0.0716528594493866, + "rewards/rejected": 0.805920422077179, + "step": 652 + }, + { + "epoch": 2.31, + "learning_rate": 1.3353588885428618e-08, + "logits/chosen": -2.005842924118042, + "logits/rejected": -2.007995128631592, + "logps/chosen": -1.1226916313171387, + "logps/rejected": -6.909612655639648, + "loss": 0.7405, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8384670615196228, + "rewards/margins": 0.07961997389793396, + "rewards/rejected": 0.7588471174240112, + "step": 653 + }, + { + "epoch": 2.31, + "learning_rate": 1.322401146442203e-08, + "logits/chosen": -1.9935429096221924, + "logits/rejected": -2.071401834487915, + "logps/chosen": -1.0162827968597412, + "logps/rejected": -21.062850952148438, + "loss": 0.6797, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8770461678504944, + "rewards/margins": -0.16934272646903992, + "rewards/rejected": 1.046388864517212, + "step": 654 + }, + { + "epoch": 2.31, + "learning_rate": 1.3094969918916965e-08, + "logits/chosen": -2.028362989425659, + "logits/rejected": -2.028378963470459, + "logps/chosen": -0.4788167476654053, + "logps/rejected": -3.5510759353637695, + "loss": 0.7347, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7102053165435791, + "rewards/margins": -0.018107324838638306, + "rewards/rejected": 0.728312611579895, + "step": 655 + }, + { + "epoch": 2.32, + "learning_rate": 1.2966466129221882e-08, + "logits/chosen": -2.0291144847869873, + "logits/rejected": -2.0291945934295654, + "logps/chosen": -0.8099204897880554, + "logps/rejected": -8.265928268432617, + "loss": 0.8562, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8635512590408325, + "rewards/margins": -0.2955421209335327, + "rewards/rejected": 1.1590933799743652, + "step": 656 + }, + { + "epoch": 2.32, + "learning_rate": 1.283850196780944e-08, + "logits/chosen": -2.0713605880737305, + "logits/rejected": -2.0736260414123535, + "logps/chosen": -2.0997540950775146, + "logps/rejected": -2.0410006046295166, + "loss": 0.5656, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8744504451751709, + "rewards/margins": 0.21895119547843933, + "rewards/rejected": 0.655499279499054, + "step": 657 + }, + { + "epoch": 2.33, + "learning_rate": 1.2711079299289168e-08, + "logits/chosen": -2.03971004486084, + "logits/rejected": -2.038717031478882, + "logps/chosen": -4.6626200675964355, + "logps/rejected": -4.49288272857666, + "loss": 0.6191, + "rewards/accuracies": 0.5, + "rewards/chosen": 1.1101969480514526, + "rewards/margins": 0.49502724409103394, + "rewards/rejected": 0.6151697635650635, + "step": 658 + }, + { + "epoch": 2.33, + "learning_rate": 1.2584199980380356e-08, + "logits/chosen": -1.992759346961975, + "logits/rejected": -2.0055599212646484, + "logps/chosen": -4.609453201293945, + "logps/rejected": -15.827011108398438, + "loss": 0.803, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.75101637840271, + "rewards/margins": -0.16747219860553741, + "rewards/rejected": 0.9184885621070862, + "step": 659 + }, + { + "epoch": 2.33, + "learning_rate": 1.2457865859884908e-08, + "logits/chosen": -2.0129311084747314, + "logits/rejected": -2.025228977203369, + "logps/chosen": -1.5773069858551025, + "logps/rejected": -10.732341766357422, + "loss": 0.6987, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7971212863922119, + "rewards/margins": -0.20830205082893372, + "rewards/rejected": 1.0054233074188232, + "step": 660 + }, + { + "epoch": 2.34, + "learning_rate": 1.2332078778660515e-08, + "logits/chosen": -1.9734822511672974, + "logits/rejected": -1.9876651763916016, + "logps/chosen": -2.8912672996520996, + "logps/rejected": -2.581441640853882, + "loss": 0.7379, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8301430940628052, + "rewards/margins": 0.0350680947303772, + "rewards/rejected": 0.7950749397277832, + "step": 661 + }, + { + "epoch": 2.34, + "learning_rate": 1.2206840569593724e-08, + "logits/chosen": -2.0367283821105957, + "logits/rejected": -2.034942388534546, + "logps/chosen": -9.515594482421875, + "logps/rejected": -3.708285093307495, + "loss": 0.7152, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8659341335296631, + "rewards/margins": 0.14173224568367004, + "rewards/rejected": 0.7242018580436707, + "step": 662 + }, + { + "epoch": 2.34, + "learning_rate": 1.2082153057573297e-08, + "logits/chosen": -2.0367538928985596, + "logits/rejected": -2.0411858558654785, + "logps/chosen": -0.7875117063522339, + "logps/rejected": -9.107043266296387, + "loss": 0.7518, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.840976357460022, + "rewards/margins": -0.15781912207603455, + "rewards/rejected": 0.9987955093383789, + "step": 663 + }, + { + "epoch": 2.35, + "learning_rate": 1.1958018059463577e-08, + "logits/chosen": -2.036787271499634, + "logits/rejected": -2.0346148014068604, + "logps/chosen": -2.3823628425598145, + "logps/rejected": -3.96355938911438, + "loss": 0.6747, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.9046579599380493, + "rewards/margins": 0.10841155052185059, + "rewards/rejected": 0.7962464094161987, + "step": 664 + }, + { + "epoch": 2.35, + "learning_rate": 1.1834437384078094e-08, + "logits/chosen": -2.0158591270446777, + "logits/rejected": -2.024993658065796, + "logps/chosen": -2.1164093017578125, + "logps/rejected": -9.773338317871094, + "loss": 0.6338, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8625360131263733, + "rewards/margins": -0.22942012548446655, + "rewards/rejected": 1.0919561386108398, + "step": 665 + }, + { + "epoch": 2.35, + "learning_rate": 1.17114128321531e-08, + "logits/chosen": -2.010103702545166, + "logits/rejected": -2.01177716255188, + "logps/chosen": -1.8204734325408936, + "logps/rejected": -2.582336902618408, + "loss": 0.5192, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7162758111953735, + "rewards/margins": -0.06351344287395477, + "rewards/rejected": 0.7797892689704895, + "step": 666 + }, + { + "epoch": 2.36, + "learning_rate": 1.1588946196321404e-08, + "logits/chosen": -2.102281332015991, + "logits/rejected": -2.1145927906036377, + "logps/chosen": -1.348692536354065, + "logps/rejected": -13.705635070800781, + "loss": 0.6935, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7882479429244995, + "rewards/margins": -0.07439818978309631, + "rewards/rejected": 0.8626461625099182, + "step": 667 + }, + { + "epoch": 2.36, + "learning_rate": 1.146703926108622e-08, + "logits/chosen": -1.9643436670303345, + "logits/rejected": -1.9635496139526367, + "logps/chosen": -0.6717028021812439, + "logps/rejected": -9.186776161193848, + "loss": 0.6671, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8164703845977783, + "rewards/margins": -0.36128801107406616, + "rewards/rejected": 1.1777584552764893, + "step": 668 + }, + { + "epoch": 2.36, + "learning_rate": 1.1345693802795175e-08, + "logits/chosen": -1.9457952976226807, + "logits/rejected": -1.9431381225585938, + "logps/chosen": -0.4219436049461365, + "logps/rejected": -3.354703426361084, + "loss": 0.8026, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.681389570236206, + "rewards/margins": -0.2462242841720581, + "rewards/rejected": 0.9276138544082642, + "step": 669 + }, + { + "epoch": 2.37, + "learning_rate": 1.1224911589614423e-08, + "logits/chosen": -2.011033058166504, + "logits/rejected": -2.0146453380584717, + "logps/chosen": -2.084024429321289, + "logps/rejected": -3.8628058433532715, + "loss": 0.6535, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8592081665992737, + "rewards/margins": 0.24384666979312897, + "rewards/rejected": 0.6153615117073059, + "step": 670 + }, + { + "epoch": 2.37, + "learning_rate": 1.11046943815029e-08, + "logits/chosen": -1.9569740295410156, + "logits/rejected": -1.9679490327835083, + "logps/chosen": -0.6641147136688232, + "logps/rejected": -4.792438983917236, + "loss": 0.6684, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7897384166717529, + "rewards/margins": 0.12114500999450684, + "rewards/rejected": 0.6685934066772461, + "step": 671 + }, + { + "epoch": 2.37, + "learning_rate": 1.0985043930186621e-08, + "logits/chosen": -1.969509243965149, + "logits/rejected": -1.982324481010437, + "logps/chosen": -2.777156114578247, + "logps/rejected": -11.52781867980957, + "loss": 0.6567, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8467739820480347, + "rewards/margins": 0.5949541926383972, + "rewards/rejected": 0.25181975960731506, + "step": 672 + }, + { + "epoch": 2.38, + "learning_rate": 1.0865961979133243e-08, + "logits/chosen": -1.9913294315338135, + "logits/rejected": -1.9848248958587646, + "logps/chosen": -1.2518527507781982, + "logps/rejected": -4.313378810882568, + "loss": 0.7103, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.6894317865371704, + "rewards/margins": -0.18187007308006287, + "rewards/rejected": 0.8713018894195557, + "step": 673 + }, + { + "epoch": 2.38, + "learning_rate": 1.0747450263526576e-08, + "logits/chosen": -2.0553154945373535, + "logits/rejected": -2.058866024017334, + "logps/chosen": -4.124239921569824, + "logps/rejected": -1.608016014099121, + "loss": 0.7403, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.897442638874054, + "rewards/margins": 0.09315210580825806, + "rewards/rejected": 0.8042905330657959, + "step": 674 + }, + { + "epoch": 2.39, + "learning_rate": 1.0629510510241336e-08, + "logits/chosen": -2.0107996463775635, + "logits/rejected": -2.0724916458129883, + "logps/chosen": -4.948009967803955, + "logps/rejected": -9.081350326538086, + "loss": 0.5083, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9047877788543701, + "rewards/margins": 0.36046767234802246, + "rewards/rejected": 0.5443201065063477, + "step": 675 + }, + { + "epoch": 2.39, + "learning_rate": 1.0512144437817994e-08, + "logits/chosen": -1.973608374595642, + "logits/rejected": -1.9786279201507568, + "logps/chosen": -1.4928834438323975, + "logps/rejected": -5.028512001037598, + "loss": 0.6529, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9740607738494873, + "rewards/margins": 0.5583186149597168, + "rewards/rejected": 0.4157421290874481, + "step": 676 + }, + { + "epoch": 2.39, + "learning_rate": 1.0395353756437698e-08, + "logits/chosen": -2.028677463531494, + "logits/rejected": -2.0312142372131348, + "logps/chosen": -1.4216511249542236, + "logps/rejected": -14.287956237792969, + "loss": 0.9812, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6607122421264648, + "rewards/margins": -0.9493438601493835, + "rewards/rejected": 1.6100561618804932, + "step": 677 + }, + { + "epoch": 2.4, + "learning_rate": 1.0279140167897427e-08, + "logits/chosen": -1.9926223754882812, + "logits/rejected": -1.9924815893173218, + "logps/chosen": -1.4255638122558594, + "logps/rejected": -3.264573097229004, + "loss": 0.5384, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.747039258480072, + "rewards/margins": -0.16181150078773499, + "rewards/rejected": 0.9088507890701294, + "step": 678 + }, + { + "epoch": 2.4, + "learning_rate": 1.0163505365585085e-08, + "logits/chosen": -2.0205814838409424, + "logits/rejected": -2.016139507293701, + "logps/chosen": -1.6919025182724, + "logps/rejected": -2.9233767986297607, + "loss": 0.6795, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.6897205114364624, + "rewards/margins": -0.3192529082298279, + "rewards/rejected": 1.0089733600616455, + "step": 679 + }, + { + "epoch": 2.4, + "learning_rate": 1.004845103445492e-08, + "logits/chosen": -2.0244016647338867, + "logits/rejected": -2.0317530632019043, + "logps/chosen": -0.7896166443824768, + "logps/rejected": -8.222039222717285, + "loss": 0.5939, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.857150673866272, + "rewards/margins": -0.04601225256919861, + "rewards/rejected": 0.903162956237793, + "step": 680 + }, + { + "epoch": 2.41, + "learning_rate": 9.93397885100291e-09, + "logits/chosen": -2.0088865756988525, + "logits/rejected": -2.013678550720215, + "logps/chosen": -8.764485359191895, + "logps/rejected": -7.219472885131836, + "loss": 0.7679, + "rewards/accuracies": 0.5, + "rewards/chosen": 1.2648030519485474, + "rewards/margins": 0.2820407748222351, + "rewards/rejected": 0.9827622175216675, + "step": 681 + }, + { + "epoch": 2.41, + "learning_rate": 9.820090483242393e-09, + "logits/chosen": -1.9569449424743652, + "logits/rejected": -1.9690184593200684, + "logps/chosen": -20.165298461914062, + "logps/rejected": -7.784318923950195, + "loss": 0.625, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3611931800842285, + "rewards/margins": 0.7355426549911499, + "rewards/rejected": 0.6256504654884338, + "step": 682 + }, + { + "epoch": 2.41, + "learning_rate": 9.706787590679682e-09, + "logits/chosen": -2.1217827796936035, + "logits/rejected": -2.175931692123413, + "logps/chosen": -8.95374584197998, + "logps/rejected": -22.238840103149414, + "loss": 0.6621, + "rewards/accuracies": 0.5, + "rewards/chosen": 1.112074375152588, + "rewards/margins": 0.5100778937339783, + "rewards/rejected": 0.6019964814186096, + "step": 683 + }, + { + "epoch": 2.42, + "learning_rate": 9.594071824289984e-09, + "logits/chosen": -2.0093488693237305, + "logits/rejected": -2.038503408432007, + "logps/chosen": -1.2226884365081787, + "logps/rejected": -15.618595123291016, + "loss": 0.598, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8199217915534973, + "rewards/margins": 0.5012691617012024, + "rewards/rejected": 0.3186526298522949, + "step": 684 + }, + { + "epoch": 2.42, + "learning_rate": 9.481944826493266e-09, + "logits/chosen": -2.0001909732818604, + "logits/rejected": -2.0153911113739014, + "logps/chosen": -5.912042617797852, + "logps/rejected": -15.552814483642578, + "loss": 0.7433, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.849388599395752, + "rewards/margins": -0.35523369908332825, + "rewards/rejected": 1.2046222686767578, + "step": 685 + }, + { + "epoch": 2.42, + "learning_rate": 9.370408231130345e-09, + "logits/chosen": -2.0491137504577637, + "logits/rejected": -2.049973726272583, + "logps/chosen": -0.3802735209465027, + "logps/rejected": -5.623290538787842, + "loss": 0.5981, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.711156964302063, + "rewards/margins": 0.3093181252479553, + "rewards/rejected": 0.4018387794494629, + "step": 686 + }, + { + "epoch": 2.43, + "learning_rate": 9.259463663439071e-09, + "logits/chosen": -2.0431056022644043, + "logits/rejected": -2.048551321029663, + "logps/chosen": -6.37994909286499, + "logps/rejected": -2.605681896209717, + "loss": 0.5997, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1135951280593872, + "rewards/margins": 0.43530797958374023, + "rewards/rejected": 0.678287148475647, + "step": 687 + }, + { + "epoch": 2.43, + "learning_rate": 9.149112740030674e-09, + "logits/chosen": -2.114964485168457, + "logits/rejected": -2.1161227226257324, + "logps/chosen": -2.511969566345215, + "logps/rejected": -2.1028642654418945, + "loss": 0.7832, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7410478591918945, + "rewards/margins": -0.1109631359577179, + "rewards/rejected": 0.85201096534729, + "step": 688 + }, + { + "epoch": 2.43, + "learning_rate": 9.039357068866176e-09, + "logits/chosen": -2.0399818420410156, + "logits/rejected": -2.040114402770996, + "logps/chosen": -0.5920535326004028, + "logps/rejected": -9.061433792114258, + "loss": 0.6431, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7797198295593262, + "rewards/margins": -0.3146747350692749, + "rewards/rejected": 1.094394564628601, + "step": 689 + }, + { + "epoch": 2.44, + "learning_rate": 8.930198249233e-09, + "logits/chosen": -2.0548789501190186, + "logits/rejected": -2.079993724822998, + "logps/chosen": -2.188602924346924, + "logps/rejected": -18.490949630737305, + "loss": 0.7586, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7996467351913452, + "rewards/margins": -0.3432365357875824, + "rewards/rejected": 1.14288330078125, + "step": 690 + }, + { + "epoch": 2.44, + "learning_rate": 8.821637871721621e-09, + "logits/chosen": -1.9736435413360596, + "logits/rejected": -1.9784008264541626, + "logps/chosen": -1.114111065864563, + "logps/rejected": -4.652390956878662, + "loss": 0.6002, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8163281679153442, + "rewards/margins": 0.4570881724357605, + "rewards/rejected": 0.35923999547958374, + "step": 691 + }, + { + "epoch": 2.45, + "learning_rate": 8.713677518202411e-09, + "logits/chosen": -1.9576033353805542, + "logits/rejected": -1.9507622718811035, + "logps/chosen": -6.973435878753662, + "logps/rejected": -2.240079164505005, + "loss": 0.6642, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1049141883850098, + "rewards/margins": 0.45737341046333313, + "rewards/rejected": 0.647540807723999, + "step": 692 + }, + { + "epoch": 2.45, + "learning_rate": 8.606318761802583e-09, + "logits/chosen": -2.032822847366333, + "logits/rejected": -2.042358636856079, + "logps/chosen": -1.628779649734497, + "logps/rejected": -6.813453197479248, + "loss": 0.7044, + "rewards/accuracies": 0.5, + "rewards/chosen": 1.030196189880371, + "rewards/margins": 0.28658682107925415, + "rewards/rejected": 0.7436093091964722, + "step": 693 + }, + { + "epoch": 2.45, + "learning_rate": 8.49956316688329e-09, + "logits/chosen": -1.9435036182403564, + "logits/rejected": -1.9416804313659668, + "logps/chosen": -10.34801197052002, + "logps/rejected": -3.590965509414673, + "loss": 0.6759, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.9078177213668823, + "rewards/margins": 0.23645779490470886, + "rewards/rejected": 0.6713598966598511, + "step": 694 + }, + { + "epoch": 2.46, + "learning_rate": 8.393412289016777e-09, + "logits/chosen": -1.9883816242218018, + "logits/rejected": -2.052948236465454, + "logps/chosen": -5.044040679931641, + "logps/rejected": -7.229404926300049, + "loss": 0.5807, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9165322780609131, + "rewards/margins": 0.2964531183242798, + "rewards/rejected": 0.6200791597366333, + "step": 695 + }, + { + "epoch": 2.46, + "learning_rate": 8.287867674963806e-09, + "logits/chosen": -1.9843825101852417, + "logits/rejected": -1.9811643362045288, + "logps/chosen": -3.353024959564209, + "logps/rejected": -2.447553873062134, + "loss": 0.5918, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7729475498199463, + "rewards/margins": 0.03333407640457153, + "rewards/rejected": 0.7396135330200195, + "step": 696 + }, + { + "epoch": 2.46, + "learning_rate": 8.182930862651011e-09, + "logits/chosen": -2.013400077819824, + "logits/rejected": -2.021955966949463, + "logps/chosen": -2.4645419120788574, + "logps/rejected": -12.790815353393555, + "loss": 0.689, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8440244197845459, + "rewards/margins": -0.503407895565033, + "rewards/rejected": 1.3474323749542236, + "step": 697 + }, + { + "epoch": 2.47, + "learning_rate": 8.078603381148574e-09, + "logits/chosen": -2.045684337615967, + "logits/rejected": -2.044368028640747, + "logps/chosen": -0.3532433807849884, + "logps/rejected": -6.273452281951904, + "loss": 0.7481, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6861323118209839, + "rewards/margins": -0.2460324764251709, + "rewards/rejected": 0.9321647882461548, + "step": 698 + }, + { + "epoch": 2.47, + "learning_rate": 7.974886750647886e-09, + "logits/chosen": -2.0453832149505615, + "logits/rejected": -2.058607578277588, + "logps/chosen": -0.6405336260795593, + "logps/rejected": -14.173526763916016, + "loss": 0.6036, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.845041036605835, + "rewards/margins": -0.39377471804618835, + "rewards/rejected": 1.2388157844543457, + "step": 699 + }, + { + "epoch": 2.47, + "learning_rate": 7.871782482439431e-09, + "logits/chosen": -2.0756239891052246, + "logits/rejected": -2.0751729011535645, + "logps/chosen": -0.5871163606643677, + "logps/rejected": -3.2171144485473633, + "loss": 0.5981, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.831002950668335, + "rewards/margins": 0.1662926822900772, + "rewards/rejected": 0.664710283279419, + "step": 700 + }, + { + "epoch": 2.48, + "learning_rate": 7.769292078890743e-09, + "logits/chosen": -2.0709874629974365, + "logits/rejected": -2.075075149536133, + "logps/chosen": -0.5985554456710815, + "logps/rejected": -11.222784996032715, + "loss": 0.6328, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7844801545143127, + "rewards/margins": -0.08050906658172607, + "rewards/rejected": 0.8649892210960388, + "step": 701 + }, + { + "epoch": 2.48, + "learning_rate": 7.667417033424528e-09, + "logits/chosen": -2.1304430961608887, + "logits/rejected": -2.1283228397369385, + "logps/chosen": -2.1279473304748535, + "logps/rejected": -3.0292296409606934, + "loss": 0.6078, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7964010834693909, + "rewards/margins": 0.18519660830497742, + "rewards/rejected": 0.6112045049667358, + "step": 702 + }, + { + "epoch": 2.48, + "learning_rate": 7.566158830496916e-09, + "logits/chosen": -2.0094854831695557, + "logits/rejected": -2.0067758560180664, + "logps/chosen": -1.655074954032898, + "logps/rejected": -4.080992221832275, + "loss": 0.5132, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8381193280220032, + "rewards/margins": 0.21584655344486237, + "rewards/rejected": 0.622272789478302, + "step": 703 + }, + { + "epoch": 2.49, + "learning_rate": 7.465518945575788e-09, + "logits/chosen": -2.0027480125427246, + "logits/rejected": -2.007930278778076, + "logps/chosen": -7.780696392059326, + "logps/rejected": -13.138553619384766, + "loss": 0.775, + "rewards/accuracies": 0.5, + "rewards/chosen": 1.2009819746017456, + "rewards/margins": 0.29708611965179443, + "rewards/rejected": 0.9038958549499512, + "step": 704 + }, + { + "epoch": 2.49, + "learning_rate": 7.365498845119317e-09, + "logits/chosen": -1.9862974882125854, + "logits/rejected": -1.9797837734222412, + "logps/chosen": -4.020423412322998, + "logps/rejected": -9.145601272583008, + "loss": 0.7751, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.9521447420120239, + "rewards/margins": -0.10699045658111572, + "rewards/rejected": 1.0591351985931396, + "step": 705 + }, + { + "epoch": 2.49, + "learning_rate": 7.2660999865545745e-09, + "logits/chosen": -2.033543109893799, + "logits/rejected": -2.041865110397339, + "logps/chosen": -2.7549266815185547, + "logps/rejected": -10.939742088317871, + "loss": 0.7571, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.887223482131958, + "rewards/margins": -0.027756303548812866, + "rewards/rejected": 0.9149797558784485, + "step": 706 + }, + { + "epoch": 2.5, + "learning_rate": 7.167323818256304e-09, + "logits/chosen": -2.054689884185791, + "logits/rejected": -2.060514211654663, + "logps/chosen": -1.5451074838638306, + "logps/rejected": -8.368194580078125, + "loss": 0.8694, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8588793873786926, + "rewards/margins": -0.002893984317779541, + "rewards/rejected": 0.8617733716964722, + "step": 707 + }, + { + "epoch": 2.5, + "learning_rate": 7.069171779525845e-09, + "logits/chosen": -2.046635150909424, + "logits/rejected": -2.0520856380462646, + "logps/chosen": -1.385947585105896, + "logps/rejected": -2.8020222187042236, + "loss": 0.7233, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8153356313705444, + "rewards/margins": 0.25544020533561707, + "rewards/rejected": 0.559895396232605, + "step": 708 + }, + { + "epoch": 2.51, + "learning_rate": 6.9716453005700835e-09, + "logits/chosen": -2.0060997009277344, + "logits/rejected": -2.015644073486328, + "logps/chosen": -1.2503055334091187, + "logps/rejected": -7.325188636779785, + "loss": 0.8147, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7833892107009888, + "rewards/margins": -0.33828383684158325, + "rewards/rejected": 1.1216729879379272, + "step": 709 + }, + { + "epoch": 2.51, + "learning_rate": 6.874745802480713e-09, + "logits/chosen": -2.0688982009887695, + "logits/rejected": -2.070708751678467, + "logps/chosen": -1.3456133604049683, + "logps/rejected": -2.4713354110717773, + "loss": 0.7447, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.751529335975647, + "rewards/margins": 0.22912298142910004, + "rewards/rejected": 0.5224063992500305, + "step": 710 + }, + { + "epoch": 2.51, + "learning_rate": 6.7784746972134265e-09, + "logits/chosen": -1.946068286895752, + "logits/rejected": -1.9513283967971802, + "logps/chosen": -0.5922717452049255, + "logps/rejected": -12.319869041442871, + "loss": 0.8218, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7073915004730225, + "rewards/margins": -0.34295597672462463, + "rewards/rejected": 1.0503474473953247, + "step": 711 + }, + { + "epoch": 2.52, + "learning_rate": 6.682833387567422e-09, + "logits/chosen": -1.9523091316223145, + "logits/rejected": -1.9568681716918945, + "logps/chosen": -1.7017784118652344, + "logps/rejected": -3.5185317993164062, + "loss": 0.6033, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8844276070594788, + "rewards/margins": 0.40950894355773926, + "rewards/rejected": 0.4749186635017395, + "step": 712 + }, + { + "epoch": 2.52, + "learning_rate": 6.58782326716491e-09, + "logits/chosen": -1.9346719980239868, + "logits/rejected": -1.9341336488723755, + "logps/chosen": -1.731608271598816, + "logps/rejected": -3.170091152191162, + "loss": 0.5437, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6641017198562622, + "rewards/margins": -0.12634223699569702, + "rewards/rejected": 0.7904439568519592, + "step": 713 + }, + { + "epoch": 2.52, + "learning_rate": 6.493445720430829e-09, + "logits/chosen": -2.015974998474121, + "logits/rejected": -2.0249381065368652, + "logps/chosen": -2.772273540496826, + "logps/rejected": -2.2778732776641846, + "loss": 0.6779, + "rewards/accuracies": 0.5, + "rewards/chosen": 1.166812539100647, + "rewards/margins": 0.4446723759174347, + "rewards/rejected": 0.7221401929855347, + "step": 714 + }, + { + "epoch": 2.53, + "learning_rate": 6.399702122572698e-09, + "logits/chosen": -2.0608670711517334, + "logits/rejected": -2.06138014793396, + "logps/chosen": -0.7656630277633667, + "logps/rejected": -6.961968898773193, + "loss": 0.8118, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.6943384408950806, + "rewards/margins": -0.07606565952301025, + "rewards/rejected": 0.7704041004180908, + "step": 715 + }, + { + "epoch": 2.53, + "learning_rate": 6.30659383956052e-09, + "logits/chosen": -2.0377578735351562, + "logits/rejected": -2.040783166885376, + "logps/chosen": -3.766071319580078, + "logps/rejected": -4.622697353363037, + "loss": 0.6028, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8288868069648743, + "rewards/margins": 0.2914682626724243, + "rewards/rejected": 0.53741854429245, + "step": 716 + }, + { + "epoch": 2.53, + "learning_rate": 6.214122228106916e-09, + "logits/chosen": -2.010364294052124, + "logits/rejected": -2.0203075408935547, + "logps/chosen": -1.599360704421997, + "logps/rejected": -10.465657234191895, + "loss": 0.7662, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6534087061882019, + "rewards/margins": -0.37631088495254517, + "rewards/rejected": 1.029719591140747, + "step": 717 + }, + { + "epoch": 2.54, + "learning_rate": 6.122288635647355e-09, + "logits/chosen": -2.0474345684051514, + "logits/rejected": -2.0500617027282715, + "logps/chosen": -5.208782196044922, + "logps/rejected": -8.322638511657715, + "loss": 0.6954, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7298060655593872, + "rewards/margins": -0.3486040532588959, + "rewards/rejected": 1.0784101486206055, + "step": 718 + }, + { + "epoch": 2.54, + "learning_rate": 6.031094400320497e-09, + "logits/chosen": -2.051680088043213, + "logits/rejected": -2.051635980606079, + "logps/chosen": -1.2401936054229736, + "logps/rejected": -2.2016589641571045, + "loss": 0.5337, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.906674861907959, + "rewards/margins": 0.26968690752983093, + "rewards/rejected": 0.6369879245758057, + "step": 719 + }, + { + "epoch": 2.54, + "learning_rate": 5.940540850948722e-09, + "logits/chosen": -2.0096099376678467, + "logits/rejected": -2.0119853019714355, + "logps/chosen": -0.9940755367279053, + "logps/rejected": -11.466721534729004, + "loss": 0.6298, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8273373246192932, + "rewards/margins": 0.201762393116951, + "rewards/rejected": 0.6255749464035034, + "step": 720 + }, + { + "epoch": 2.55, + "learning_rate": 5.850629307018767e-09, + "logits/chosen": -2.0183424949645996, + "logits/rejected": -2.014615535736084, + "logps/chosen": -2.117452383041382, + "logps/rejected": -3.182790517807007, + "loss": 0.7794, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8364837765693665, + "rewards/margins": 0.21496105194091797, + "rewards/rejected": 0.6215227246284485, + "step": 721 + }, + { + "epoch": 2.55, + "learning_rate": 5.761361078662464e-09, + "logits/chosen": -2.01804256439209, + "logits/rejected": -2.023376941680908, + "logps/chosen": -5.449763298034668, + "logps/rejected": -3.687314510345459, + "loss": 0.7917, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9413806200027466, + "rewards/margins": 0.16868102550506592, + "rewards/rejected": 0.7726995944976807, + "step": 722 + }, + { + "epoch": 2.55, + "learning_rate": 5.6727374666377e-09, + "logits/chosen": -2.0887937545776367, + "logits/rejected": -2.0891273021698, + "logps/chosen": -1.891357183456421, + "logps/rejected": -3.5985522270202637, + "loss": 0.7669, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8601087331771851, + "rewards/margins": 0.23114685714244843, + "rewards/rejected": 0.6289619207382202, + "step": 723 + }, + { + "epoch": 2.56, + "learning_rate": 5.5847597623094215e-09, + "logits/chosen": -2.0422587394714355, + "logits/rejected": -2.043567419052124, + "logps/chosen": -1.7402572631835938, + "logps/rejected": -8.766366004943848, + "loss": 0.7227, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8340229988098145, + "rewards/margins": -0.2231469750404358, + "rewards/rejected": 1.057170033454895, + "step": 724 + }, + { + "epoch": 2.56, + "learning_rate": 5.497429247630825e-09, + "logits/chosen": -1.956146478652954, + "logits/rejected": -1.9773359298706055, + "logps/chosen": -1.7194643020629883, + "logps/rejected": -3.6613211631774902, + "loss": 0.7016, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7568651437759399, + "rewards/margins": 0.006244093179702759, + "rewards/rejected": 0.7506210803985596, + "step": 725 + }, + { + "epoch": 2.57, + "learning_rate": 5.410747195124704e-09, + "logits/chosen": -2.02451229095459, + "logits/rejected": -2.028801918029785, + "logps/chosen": -2.3460590839385986, + "logps/rejected": -2.331206798553467, + "loss": 0.7185, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.846625030040741, + "rewards/margins": 0.36803364753723145, + "rewards/rejected": 0.4785913825035095, + "step": 726 + }, + { + "epoch": 2.57, + "learning_rate": 5.32471486786486e-09, + "logits/chosen": -1.9203360080718994, + "logits/rejected": -1.927173376083374, + "logps/chosen": -1.4536354541778564, + "logps/rejected": -7.5455498695373535, + "loss": 0.7201, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7590805292129517, + "rewards/margins": 0.07344846427440643, + "rewards/rejected": 0.6856321096420288, + "step": 727 + }, + { + "epoch": 2.57, + "learning_rate": 5.239333519457778e-09, + "logits/chosen": -2.143270254135132, + "logits/rejected": -2.149052858352661, + "logps/chosen": -2.012284994125366, + "logps/rejected": -13.529197692871094, + "loss": 0.5714, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7590298056602478, + "rewards/margins": -0.14885294437408447, + "rewards/rejected": 0.9078827500343323, + "step": 728 + }, + { + "epoch": 2.58, + "learning_rate": 5.154604394024253e-09, + "logits/chosen": -2.0077359676361084, + "logits/rejected": -2.0165276527404785, + "logps/chosen": -0.5606021285057068, + "logps/rejected": -12.987120628356934, + "loss": 0.6116, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7567213177680969, + "rewards/margins": -0.38804182410240173, + "rewards/rejected": 1.1447631120681763, + "step": 729 + }, + { + "epoch": 2.58, + "learning_rate": 5.070528726181345e-09, + "logits/chosen": -1.9478814601898193, + "logits/rejected": -1.9514645338058472, + "logps/chosen": -0.8688209056854248, + "logps/rejected": -6.277561664581299, + "loss": 0.6267, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7936465740203857, + "rewards/margins": -0.050898849964141846, + "rewards/rejected": 0.8445454835891724, + "step": 730 + }, + { + "epoch": 2.58, + "learning_rate": 4.987107741024349e-09, + "logits/chosen": -2.0718839168548584, + "logits/rejected": -2.0744705200195312, + "logps/chosen": -1.4043169021606445, + "logps/rejected": -3.0488643646240234, + "loss": 0.6906, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8335069417953491, + "rewards/margins": 0.30042216181755066, + "rewards/rejected": 0.5330848097801208, + "step": 731 + }, + { + "epoch": 2.59, + "learning_rate": 4.9043426541089565e-09, + "logits/chosen": -2.0684525966644287, + "logits/rejected": -2.0737133026123047, + "logps/chosen": -2.6041927337646484, + "logps/rejected": -3.249032735824585, + "loss": 0.6948, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9456807374954224, + "rewards/margins": 0.5285922288894653, + "rewards/rejected": 0.41708850860595703, + "step": 732 + }, + { + "epoch": 2.59, + "learning_rate": 4.8222346714335505e-09, + "logits/chosen": -2.0352349281311035, + "logits/rejected": -2.0806081295013428, + "logps/chosen": -4.066280841827393, + "logps/rejected": -16.366641998291016, + "loss": 0.6702, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7254934310913086, + "rewards/margins": -0.2946155071258545, + "rewards/rejected": 1.020108938217163, + "step": 733 + }, + { + "epoch": 2.59, + "learning_rate": 4.740784989421609e-09, + "logits/chosen": -1.9817390441894531, + "logits/rejected": -1.9925750494003296, + "logps/chosen": -2.3684396743774414, + "logps/rejected": -15.954732894897461, + "loss": 0.6502, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8382972478866577, + "rewards/margins": 0.10008671879768372, + "rewards/rejected": 0.7382104992866516, + "step": 734 + }, + { + "epoch": 2.6, + "learning_rate": 4.659994794904309e-09, + "logits/chosen": -2.139395236968994, + "logits/rejected": -2.1452724933624268, + "logps/chosen": -2.170642852783203, + "logps/rejected": -6.681427955627441, + "loss": 0.6095, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7515257596969604, + "rewards/margins": -0.00413590669631958, + "rewards/rejected": 0.75566166639328, + "step": 735 + }, + { + "epoch": 2.6, + "learning_rate": 4.5798652651031835e-09, + "logits/chosen": -2.0169026851654053, + "logits/rejected": -2.052128553390503, + "logps/chosen": -5.222471237182617, + "logps/rejected": -13.597497940063477, + "loss": 0.6728, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7651776075363159, + "rewards/margins": -0.01836562156677246, + "rewards/rejected": 0.7835432887077332, + "step": 736 + }, + { + "epoch": 2.6, + "learning_rate": 4.500397567613001e-09, + "logits/chosen": -2.063211441040039, + "logits/rejected": -2.0772311687469482, + "logps/chosen": -4.490671157836914, + "logps/rejected": -8.139833450317383, + "loss": 0.6421, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.060834527015686, + "rewards/margins": 0.4990158677101135, + "rewards/rejected": 0.5618186593055725, + "step": 737 + }, + { + "epoch": 2.61, + "learning_rate": 4.4215928603847595e-09, + "logits/chosen": -2.031834125518799, + "logits/rejected": -2.03230357170105, + "logps/chosen": -2.922715425491333, + "logps/rejected": -7.7074995040893555, + "loss": 0.6916, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8008396625518799, + "rewards/margins": -0.17861789464950562, + "rewards/rejected": 0.9794575572013855, + "step": 738 + }, + { + "epoch": 2.61, + "learning_rate": 4.343452291708782e-09, + "logits/chosen": -1.9733037948608398, + "logits/rejected": -1.9737532138824463, + "logps/chosen": -1.7844158411026, + "logps/rejected": -1.9572288990020752, + "loss": 0.6895, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8409486413002014, + "rewards/margins": 0.08161178231239319, + "rewards/rejected": 0.7593368291854858, + "step": 739 + }, + { + "epoch": 2.61, + "learning_rate": 4.265977000197996e-09, + "logits/chosen": -2.1071884632110596, + "logits/rejected": -2.119689702987671, + "logps/chosen": -1.614532470703125, + "logps/rejected": -12.57592487335205, + "loss": 0.8067, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.9285904169082642, + "rewards/margins": 0.003462493419647217, + "rewards/rejected": 0.9251278638839722, + "step": 740 + }, + { + "epoch": 2.62, + "learning_rate": 4.189168114771391e-09, + "logits/chosen": -2.0649564266204834, + "logits/rejected": -2.0640130043029785, + "logps/chosen": -0.8559577465057373, + "logps/rejected": -4.2023444175720215, + "loss": 0.7347, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8501089811325073, + "rewards/margins": 0.17430099844932556, + "rewards/rejected": 0.6758080124855042, + "step": 741 + }, + { + "epoch": 2.62, + "learning_rate": 4.113026754637472e-09, + "logits/chosen": -1.9814873933792114, + "logits/rejected": -1.9830695390701294, + "logps/chosen": -0.792579174041748, + "logps/rejected": -3.6915388107299805, + "loss": 0.7436, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.797130823135376, + "rewards/margins": 0.14447838068008423, + "rewards/rejected": 0.6526524424552917, + "step": 742 + }, + { + "epoch": 2.63, + "learning_rate": 4.03755402927804e-09, + "logits/chosen": -1.9443498849868774, + "logits/rejected": -1.9453861713409424, + "logps/chosen": -0.7968714833259583, + "logps/rejected": -1.9379141330718994, + "loss": 0.5359, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8097206354141235, + "rewards/margins": 0.17554956674575806, + "rewards/rejected": 0.6341711282730103, + "step": 743 + }, + { + "epoch": 2.63, + "learning_rate": 3.962751038431961e-09, + "logits/chosen": -1.9898914098739624, + "logits/rejected": -1.98695969581604, + "logps/chosen": -3.4303650856018066, + "logps/rejected": -1.8467164039611816, + "loss": 0.651, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5921152830123901, + "rewards/margins": -0.21573039889335632, + "rewards/rejected": 0.8078457117080688, + "step": 744 + }, + { + "epoch": 2.63, + "learning_rate": 3.888618872079203e-09, + "logits/chosen": -2.0430080890655518, + "logits/rejected": -2.0409555435180664, + "logps/chosen": -2.84146785736084, + "logps/rejected": -2.682725667953491, + "loss": 0.6103, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.822039008140564, + "rewards/margins": -0.006409883499145508, + "rewards/rejected": 0.8284488916397095, + "step": 745 + }, + { + "epoch": 2.64, + "learning_rate": 3.815158610424896e-09, + "logits/chosen": -1.9567267894744873, + "logits/rejected": -1.9562342166900635, + "logps/chosen": -0.6662712097167969, + "logps/rejected": -2.7253847122192383, + "loss": 0.6367, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7391253709793091, + "rewards/margins": 0.04441303014755249, + "rewards/rejected": 0.6947123408317566, + "step": 746 + }, + { + "epoch": 2.64, + "learning_rate": 3.742371323883642e-09, + "logits/chosen": -2.0063388347625732, + "logits/rejected": -2.014080762863159, + "logps/chosen": -4.45096492767334, + "logps/rejected": -20.3436279296875, + "loss": 0.7412, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6127749085426331, + "rewards/margins": -0.6960119009017944, + "rewards/rejected": 1.3087868690490723, + "step": 747 + }, + { + "epoch": 2.64, + "learning_rate": 3.6702580730638646e-09, + "logits/chosen": -2.013213634490967, + "logits/rejected": -2.0231096744537354, + "logps/chosen": -2.3962035179138184, + "logps/rejected": -13.092933654785156, + "loss": 0.7954, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.9076145887374878, + "rewards/margins": -0.015313327312469482, + "rewards/rejected": 0.9229279160499573, + "step": 748 + }, + { + "epoch": 2.65, + "learning_rate": 3.5988199087523986e-09, + "logits/chosen": -2.0273184776306152, + "logits/rejected": -2.0387158393859863, + "logps/chosen": -1.1719787120819092, + "logps/rejected": -7.484349250793457, + "loss": 0.8078, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7789568901062012, + "rewards/margins": 0.19364385306835175, + "rewards/rejected": 0.5853130221366882, + "step": 749 + }, + { + "epoch": 2.65, + "learning_rate": 3.528057871899154e-09, + "logits/chosen": -2.017503023147583, + "logits/rejected": -2.023488998413086, + "logps/chosen": -3.129150867462158, + "logps/rejected": -5.944158554077148, + "loss": 0.6703, + "rewards/accuracies": 0.5, + "rewards/chosen": 1.0472253561019897, + "rewards/margins": 0.23773249983787537, + "rewards/rejected": 0.8094928860664368, + "step": 750 + }, + { + "epoch": 2.65, + "learning_rate": 3.457972993601965e-09, + "logits/chosen": -2.0342416763305664, + "logits/rejected": -2.03588604927063, + "logps/chosen": -0.6115220785140991, + "logps/rejected": -5.55940055847168, + "loss": 0.7684, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7122302055358887, + "rewards/margins": 0.06058222055435181, + "rewards/rejected": 0.6516479253768921, + "step": 751 + }, + { + "epoch": 2.66, + "learning_rate": 3.388566295091544e-09, + "logits/chosen": -1.954818606376648, + "logits/rejected": -1.9600533246994019, + "logps/chosen": -5.027666091918945, + "logps/rejected": -8.518013000488281, + "loss": 0.8063, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7172009944915771, + "rewards/margins": -0.15685680508613586, + "rewards/rejected": 0.8740577697753906, + "step": 752 + }, + { + "epoch": 2.66, + "learning_rate": 3.3198387877166334e-09, + "logits/chosen": -2.0257413387298584, + "logits/rejected": -2.03024959564209, + "logps/chosen": -3.2157020568847656, + "logps/rejected": -3.056518793106079, + "loss": 0.5397, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9477333426475525, + "rewards/margins": 0.4665800929069519, + "rewards/rejected": 0.4811532497406006, + "step": 753 + }, + { + "epoch": 2.66, + "learning_rate": 3.251791472929244e-09, + "logits/chosen": -2.0413153171539307, + "logits/rejected": -2.0527234077453613, + "logps/chosen": -1.4235084056854248, + "logps/rejected": -15.894926071166992, + "loss": 0.7973, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7515846490859985, + "rewards/margins": -0.4063642621040344, + "rewards/rejected": 1.1579488515853882, + "step": 754 + }, + { + "epoch": 2.67, + "learning_rate": 3.1844253422700527e-09, + "logits/chosen": -2.0334632396698, + "logits/rejected": -2.036705493927002, + "logps/chosen": -6.640949726104736, + "logps/rejected": -2.8570547103881836, + "loss": 0.5822, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1367120742797852, + "rewards/margins": 0.45687225461006165, + "rewards/rejected": 0.6798397302627563, + "step": 755 + }, + { + "epoch": 2.67, + "learning_rate": 3.1177413773539774e-09, + "logits/chosen": -1.9839668273925781, + "logits/rejected": -1.9845491647720337, + "logps/chosen": -8.628040313720703, + "logps/rejected": -3.489814281463623, + "loss": 0.7266, + "rewards/accuracies": 0.5, + "rewards/chosen": 1.2276337146759033, + "rewards/margins": 0.5189299583435059, + "rewards/rejected": 0.7087036967277527, + "step": 756 + }, + { + "epoch": 2.67, + "learning_rate": 3.051740549855869e-09, + "logits/chosen": -2.012662410736084, + "logits/rejected": -2.0112760066986084, + "logps/chosen": -7.302083492279053, + "logps/rejected": -1.2128781080245972, + "loss": 0.7231, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2871180772781372, + "rewards/margins": 0.6210534572601318, + "rewards/rejected": 0.6660646796226501, + "step": 757 + }, + { + "epoch": 2.68, + "learning_rate": 2.9864238214963587e-09, + "logits/chosen": -1.9921298027038574, + "logits/rejected": -2.0059471130371094, + "logps/chosen": -4.112911224365234, + "logps/rejected": -8.312458992004395, + "loss": 0.5281, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.9619156122207642, + "rewards/margins": 0.15044143795967102, + "rewards/rejected": 0.8114742040634155, + "step": 758 + }, + { + "epoch": 2.68, + "learning_rate": 2.9217921440278126e-09, + "logits/chosen": -1.996216058731079, + "logits/rejected": -2.0740745067596436, + "logps/chosen": -0.7638965845108032, + "logps/rejected": -17.2427978515625, + "loss": 0.8378, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7452353835105896, + "rewards/margins": -0.21635746955871582, + "rewards/rejected": 0.9615928530693054, + "step": 759 + }, + { + "epoch": 2.69, + "learning_rate": 2.857846459220514e-09, + "logits/chosen": -2.0559589862823486, + "logits/rejected": -2.061023473739624, + "logps/chosen": -1.744868278503418, + "logps/rejected": -6.855428695678711, + "loss": 0.6199, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8660241365432739, + "rewards/margins": 0.1659650206565857, + "rewards/rejected": 0.700059175491333, + "step": 760 + }, + { + "epoch": 2.69, + "learning_rate": 2.794587698848888e-09, + "logits/chosen": -2.0438947677612305, + "logits/rejected": -2.056877613067627, + "logps/chosen": -3.3294873237609863, + "logps/rejected": -8.489542007446289, + "loss": 0.7221, + "rewards/accuracies": 0.5, + "rewards/chosen": 1.008626103401184, + "rewards/margins": 0.3222644329071045, + "rewards/rejected": 0.6863616704940796, + "step": 761 + }, + { + "epoch": 2.69, + "learning_rate": 2.732016784677954e-09, + "logits/chosen": -2.008540153503418, + "logits/rejected": -2.0090436935424805, + "logps/chosen": -1.985867977142334, + "logps/rejected": -1.7906041145324707, + "loss": 0.6023, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8143060207366943, + "rewards/margins": 0.000736236572265625, + "rewards/rejected": 0.8135697841644287, + "step": 762 + }, + { + "epoch": 2.7, + "learning_rate": 2.6701346284498994e-09, + "logits/chosen": -2.014523983001709, + "logits/rejected": -2.0160675048828125, + "logps/chosen": -1.887511968612671, + "logps/rejected": -8.693577766418457, + "loss": 0.5562, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.856716513633728, + "rewards/margins": 0.06769344210624695, + "rewards/rejected": 0.7890230417251587, + "step": 763 + }, + { + "epoch": 2.7, + "learning_rate": 2.6089421318707782e-09, + "logits/chosen": -1.9914612770080566, + "logits/rejected": -1.998498797416687, + "logps/chosen": -3.3118667602539062, + "logps/rejected": -8.418756484985352, + "loss": 0.6329, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8444534540176392, + "rewards/margins": -0.2916508615016937, + "rewards/rejected": 1.1361043453216553, + "step": 764 + }, + { + "epoch": 2.7, + "learning_rate": 2.5484401865973724e-09, + "logits/chosen": -2.0991854667663574, + "logits/rejected": -2.1006453037261963, + "logps/chosen": -0.6161876916885376, + "logps/rejected": -3.569758892059326, + "loss": 0.6211, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7584793567657471, + "rewards/margins": 0.1386008858680725, + "rewards/rejected": 0.6198784708976746, + "step": 765 + }, + { + "epoch": 2.71, + "learning_rate": 2.488629674224213e-09, + "logits/chosen": -2.0422818660736084, + "logits/rejected": -2.0485076904296875, + "logps/chosen": -7.979806423187256, + "logps/rejected": -7.806881427764893, + "loss": 0.7282, + "rewards/accuracies": 0.5, + "rewards/chosen": 1.1172795295715332, + "rewards/margins": 0.38360291719436646, + "rewards/rejected": 0.733676552772522, + "step": 766 + }, + { + "epoch": 2.71, + "learning_rate": 2.4295114662707285e-09, + "logits/chosen": -2.05346941947937, + "logits/rejected": -2.0602612495422363, + "logps/chosen": -1.1591172218322754, + "logps/rejected": -9.757563591003418, + "loss": 0.6854, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8200973272323608, + "rewards/margins": -0.3500862121582031, + "rewards/rejected": 1.170183539390564, + "step": 767 + }, + { + "epoch": 2.71, + "learning_rate": 2.3710864241685334e-09, + "logits/chosen": -2.0964338779449463, + "logits/rejected": -2.099780321121216, + "logps/chosen": -2.2126569747924805, + "logps/rejected": -7.552159309387207, + "loss": 0.6525, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.9192639589309692, + "rewards/margins": -0.047728002071380615, + "rewards/rejected": 0.9669919013977051, + "step": 768 + }, + { + "epoch": 2.72, + "learning_rate": 2.313355399248884e-09, + "logits/chosen": -2.0570366382598877, + "logits/rejected": -2.0603268146514893, + "logps/chosen": -3.4148075580596924, + "logps/rejected": -8.66996955871582, + "loss": 0.6929, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.9834345579147339, + "rewards/margins": 0.12197688221931458, + "rewards/rejected": 0.8614577054977417, + "step": 769 + }, + { + "epoch": 2.72, + "learning_rate": 2.2563192327302916e-09, + "logits/chosen": -2.093506097793579, + "logits/rejected": -2.0945992469787598, + "logps/chosen": -3.06406569480896, + "logps/rejected": -2.737792730331421, + "loss": 0.7801, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.9010565876960754, + "rewards/margins": 0.07328912615776062, + "rewards/rejected": 0.8277674913406372, + "step": 770 + }, + { + "epoch": 2.72, + "learning_rate": 2.199978755706228e-09, + "logits/chosen": -2.0433261394500732, + "logits/rejected": -2.0473320484161377, + "logps/chosen": -3.0328869819641113, + "logps/rejected": -4.964513778686523, + "loss": 0.6398, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8729187250137329, + "rewards/margins": 0.25031188130378723, + "rewards/rejected": 0.6226068735122681, + "step": 771 + }, + { + "epoch": 2.73, + "learning_rate": 2.1443347891330566e-09, + "logits/chosen": -2.0104923248291016, + "logits/rejected": -2.003589153289795, + "logps/chosen": -3.931213855743408, + "logps/rejected": -9.865578651428223, + "loss": 0.63, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.6697304844856262, + "rewards/margins": -0.13489995896816254, + "rewards/rejected": 0.80463045835495, + "step": 772 + }, + { + "epoch": 2.73, + "learning_rate": 2.089388143818027e-09, + "logits/chosen": -2.0003726482391357, + "logits/rejected": -2.0081114768981934, + "logps/chosen": -1.0278258323669434, + "logps/rejected": -19.691375732421875, + "loss": 0.5937, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.623435914516449, + "rewards/margins": -0.14656974375247955, + "rewards/rejected": 0.7700056433677673, + "step": 773 + }, + { + "epoch": 2.73, + "learning_rate": 2.0351396204074944e-09, + "logits/chosen": -1.9502383470535278, + "logits/rejected": -1.9586279392242432, + "logps/chosen": -7.316928386688232, + "logps/rejected": -6.356278419494629, + "loss": 0.6906, + "rewards/accuracies": 0.5, + "rewards/chosen": 1.0210351943969727, + "rewards/margins": 0.01360863447189331, + "rewards/rejected": 1.0074266195297241, + "step": 774 + }, + { + "epoch": 2.74, + "learning_rate": 1.9815900093752448e-09, + "logits/chosen": -2.0399208068847656, + "logits/rejected": -2.1012139320373535, + "logps/chosen": -6.8880109786987305, + "logps/rejected": -10.463363647460938, + "loss": 0.6458, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9668774604797363, + "rewards/margins": 0.6867296695709229, + "rewards/rejected": 0.2801477909088135, + "step": 775 + }, + { + "epoch": 2.74, + "learning_rate": 1.928740091010961e-09, + "logits/chosen": -2.0370874404907227, + "logits/rejected": -2.1359434127807617, + "logps/chosen": -3.1933746337890625, + "logps/rejected": -14.675901412963867, + "loss": 0.8696, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8155375123023987, + "rewards/margins": -0.014428317546844482, + "rewards/rejected": 0.8299658298492432, + "step": 776 + }, + { + "epoch": 2.75, + "learning_rate": 1.87659063540887e-09, + "logits/chosen": -2.087409019470215, + "logits/rejected": -2.1011252403259277, + "logps/chosen": -3.0835776329040527, + "logps/rejected": -10.081119537353516, + "loss": 0.7492, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.9471675157546997, + "rewards/margins": 0.25406643748283386, + "rewards/rejected": 0.6931010484695435, + "step": 777 + }, + { + "epoch": 2.75, + "learning_rate": 1.8251424024565253e-09, + "logits/chosen": -2.0670647621154785, + "logits/rejected": -2.0685954093933105, + "logps/chosen": -2.4386560916900635, + "logps/rejected": -3.5570716857910156, + "loss": 0.7459, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7534528970718384, + "rewards/margins": 0.020326420664787292, + "rewards/rejected": 0.7331265211105347, + "step": 778 + }, + { + "epoch": 2.75, + "learning_rate": 1.774396141823714e-09, + "logits/chosen": -2.174668788909912, + "logits/rejected": -2.1863300800323486, + "logps/chosen": -4.586387634277344, + "logps/rejected": -6.16770076751709, + "loss": 0.5409, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0960954427719116, + "rewards/margins": 0.6002812385559082, + "rewards/rejected": 0.4958142042160034, + "step": 779 + }, + { + "epoch": 2.76, + "learning_rate": 1.7243525929515501e-09, + "logits/chosen": -2.0625641345977783, + "logits/rejected": -2.0653023719787598, + "logps/chosen": -2.988719940185547, + "logps/rejected": -2.2888405323028564, + "loss": 0.6111, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.919766902923584, + "rewards/margins": 0.16401407122612, + "rewards/rejected": 0.7557528018951416, + "step": 780 + }, + { + "epoch": 2.76, + "learning_rate": 1.6750124850416825e-09, + "logits/chosen": -1.9700002670288086, + "logits/rejected": -1.969808578491211, + "logps/chosen": -2.606736421585083, + "logps/rejected": -7.110620498657227, + "loss": 0.5796, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8853650093078613, + "rewards/margins": -0.04579010605812073, + "rewards/rejected": 0.9311551451683044, + "step": 781 + }, + { + "epoch": 2.76, + "learning_rate": 1.6263765370457038e-09, + "logits/chosen": -1.9915668964385986, + "logits/rejected": -1.9934983253479004, + "logps/chosen": -0.6087383031845093, + "logps/rejected": -5.0742950439453125, + "loss": 0.7397, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7825950384140015, + "rewards/margins": 0.10471361875534058, + "rewards/rejected": 0.6778814196586609, + "step": 782 + }, + { + "epoch": 2.77, + "learning_rate": 1.5784454576546368e-09, + "logits/chosen": -1.9678717851638794, + "logits/rejected": -1.9732229709625244, + "logps/chosen": -2.2425661087036133, + "logps/rejected": -3.6468493938446045, + "loss": 0.6229, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9214881658554077, + "rewards/margins": 0.37283438444137573, + "rewards/rejected": 0.5486537218093872, + "step": 783 + }, + { + "epoch": 2.77, + "learning_rate": 1.5312199452886144e-09, + "logits/chosen": -2.083400249481201, + "logits/rejected": -2.083939552307129, + "logps/chosen": -1.4269421100616455, + "logps/rejected": -7.685921669006348, + "loss": 0.7177, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8291968107223511, + "rewards/margins": 0.16720259189605713, + "rewards/rejected": 0.661994218826294, + "step": 784 + }, + { + "epoch": 2.77, + "learning_rate": 1.484700688086743e-09, + "logits/chosen": -2.001605272293091, + "logits/rejected": -2.0054163932800293, + "logps/chosen": -4.603106498718262, + "logps/rejected": -8.69485092163086, + "loss": 0.6482, + "rewards/accuracies": 0.5, + "rewards/chosen": 1.039998173713684, + "rewards/margins": 0.16053742170333862, + "rewards/rejected": 0.8794606924057007, + "step": 785 + }, + { + "epoch": 2.78, + "learning_rate": 1.4388883638970062e-09, + "logits/chosen": -2.0716469287872314, + "logits/rejected": -2.071239471435547, + "logps/chosen": -1.6661381721496582, + "logps/rejected": -7.950540065765381, + "loss": 0.6811, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8894135355949402, + "rewards/margins": -0.27389106154441833, + "rewards/rejected": 1.1633045673370361, + "step": 786 + }, + { + "epoch": 2.78, + "learning_rate": 1.3937836402664494e-09, + "logits/chosen": -1.9743945598602295, + "logits/rejected": -1.9738662242889404, + "logps/chosen": -3.4002013206481934, + "logps/rejected": -3.1451752185821533, + "loss": 0.6608, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.9394896030426025, + "rewards/margins": 0.23638683557510376, + "rewards/rejected": 0.703102707862854, + "step": 787 + }, + { + "epoch": 2.78, + "learning_rate": 1.3493871744314212e-09, + "logits/chosen": -2.008162260055542, + "logits/rejected": -2.0120034217834473, + "logps/chosen": -9.17656135559082, + "logps/rejected": -3.7101476192474365, + "loss": 0.7353, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2914175987243652, + "rewards/margins": 0.6136422157287598, + "rewards/rejected": 0.6777753829956055, + "step": 788 + }, + { + "epoch": 2.79, + "learning_rate": 1.3056996133079923e-09, + "logits/chosen": -1.9760469198226929, + "logits/rejected": -1.976087212562561, + "logps/chosen": -1.1280770301818848, + "logps/rejected": -3.075025796890259, + "loss": 0.7386, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.90775465965271, + "rewards/margins": 0.19430598616600037, + "rewards/rejected": 0.7134486436843872, + "step": 789 + }, + { + "epoch": 2.79, + "learning_rate": 1.2627215934825576e-09, + "logits/chosen": -2.062473773956299, + "logits/rejected": -2.075106382369995, + "logps/chosen": -0.5334907174110413, + "logps/rejected": -15.75739860534668, + "loss": 0.8385, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7692946195602417, + "rewards/margins": -0.542896032333374, + "rewards/rejected": 1.3121905326843262, + "step": 790 + }, + { + "epoch": 2.8, + "learning_rate": 1.220453741202543e-09, + "logits/chosen": -2.0671546459198, + "logits/rejected": -2.066413402557373, + "logps/chosen": -0.7174315452575684, + "logps/rejected": -8.558623313903809, + "loss": 0.834, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6374040842056274, + "rewards/margins": -0.6022014617919922, + "rewards/rejected": 1.2396055459976196, + "step": 791 + }, + { + "epoch": 2.8, + "learning_rate": 1.1788966723672633e-09, + "logits/chosen": -2.0473928451538086, + "logits/rejected": -2.047057867050171, + "logps/chosen": -0.7936781644821167, + "logps/rejected": -5.245582580566406, + "loss": 0.7815, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6745643615722656, + "rewards/margins": -0.18008694052696228, + "rewards/rejected": 0.8546513319015503, + "step": 792 + }, + { + "epoch": 2.8, + "learning_rate": 1.138050992518985e-09, + "logits/chosen": -1.9827325344085693, + "logits/rejected": -1.9844207763671875, + "logps/chosen": -1.1545151472091675, + "logps/rejected": -6.227300643920898, + "loss": 0.5898, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.9162492752075195, + "rewards/margins": 0.17558270692825317, + "rewards/rejected": 0.7406665086746216, + "step": 793 + }, + { + "epoch": 2.81, + "learning_rate": 1.0979172968340665e-09, + "logits/chosen": -2.0453646183013916, + "logits/rejected": -2.0463709831237793, + "logps/chosen": -2.4884300231933594, + "logps/rejected": -2.9647631645202637, + "loss": 0.6015, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0235261917114258, + "rewards/margins": 0.4339827001094818, + "rewards/rejected": 0.5895435810089111, + "step": 794 + }, + { + "epoch": 2.81, + "learning_rate": 1.0584961701143146e-09, + "logits/chosen": -1.950596809387207, + "logits/rejected": -1.9595222473144531, + "logps/chosen": -0.7372739315032959, + "logps/rejected": -6.126054286956787, + "loss": 0.5084, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7411417365074158, + "rewards/margins": 0.0013988614082336426, + "rewards/rejected": 0.7397428750991821, + "step": 795 + }, + { + "epoch": 2.81, + "learning_rate": 1.0197881867784364e-09, + "logits/chosen": -2.0266098976135254, + "logits/rejected": -2.0315945148468018, + "logps/chosen": -2.1367106437683105, + "logps/rejected": -8.155990600585938, + "loss": 0.6386, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8973848223686218, + "rewards/margins": 0.07021984457969666, + "rewards/rejected": 0.8271650075912476, + "step": 796 + }, + { + "epoch": 2.82, + "learning_rate": 9.817939108536955e-10, + "logits/chosen": -1.9404666423797607, + "logits/rejected": -1.936916708946228, + "logps/chosen": -5.04686164855957, + "logps/rejected": -1.7754974365234375, + "loss": 0.6061, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7935322523117065, + "rewards/margins": 0.00861850380897522, + "rewards/rejected": 0.7849137783050537, + "step": 797 + }, + { + "epoch": 2.82, + "learning_rate": 9.445138959676691e-10, + "logits/chosen": -1.9573057889938354, + "logits/rejected": -1.9550292491912842, + "logps/chosen": -3.1021337509155273, + "logps/rejected": -1.7771711349487305, + "loss": 0.7306, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6871731877326965, + "rewards/margins": -0.05201879143714905, + "rewards/rejected": 0.739192008972168, + "step": 798 + }, + { + "epoch": 2.82, + "learning_rate": 9.079486853402097e-10, + "logits/chosen": -2.024343252182007, + "logits/rejected": -2.026770830154419, + "logps/chosen": -2.591250419616699, + "logps/rejected": -2.700451374053955, + "loss": 0.5076, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9433870315551758, + "rewards/margins": 0.4646506905555725, + "rewards/rejected": 0.4787363111972809, + "step": 799 + }, + { + "epoch": 2.83, + "learning_rate": 8.720988117754957e-10, + "logits/chosen": -2.0463151931762695, + "logits/rejected": -2.094839096069336, + "logps/chosen": -0.8696421980857849, + "logps/rejected": -23.078937530517578, + "loss": 0.725, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6625446081161499, + "rewards/margins": -0.6175566911697388, + "rewards/rejected": 1.2801012992858887, + "step": 800 + }, + { + "epoch": 2.83, + "learning_rate": 8.369647976542882e-10, + "logits/chosen": -1.9969301223754883, + "logits/rejected": -2.004732847213745, + "logps/chosen": -0.603164792060852, + "logps/rejected": -5.676112174987793, + "loss": 0.7885, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8478790521621704, + "rewards/margins": 0.29848527908325195, + "rewards/rejected": 0.5493937730789185, + "step": 801 + }, + { + "epoch": 2.83, + "learning_rate": 8.025471549263141e-10, + "logits/chosen": -1.967464566230774, + "logits/rejected": -1.9661166667938232, + "logps/chosen": -2.019026279449463, + "logps/rejected": -3.84769868850708, + "loss": 0.7231, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7595319151878357, + "rewards/margins": -0.27605050802230835, + "rewards/rejected": 1.035582423210144, + "step": 802 + }, + { + "epoch": 2.84, + "learning_rate": 7.688463851028226e-10, + "logits/chosen": -1.9730894565582275, + "logits/rejected": -1.9711486101150513, + "logps/chosen": -2.4336414337158203, + "logps/rejected": -3.0513782501220703, + "loss": 0.7274, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8742198944091797, + "rewards/margins": 0.26450687646865845, + "rewards/rejected": 0.6097130179405212, + "step": 803 + }, + { + "epoch": 2.84, + "learning_rate": 7.358629792492521e-10, + "logits/chosen": -1.9759057760238647, + "logits/rejected": -2.0191164016723633, + "logps/chosen": -1.608994960784912, + "logps/rejected": -17.518735885620117, + "loss": 0.7213, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5502568483352661, + "rewards/margins": -0.5163503885269165, + "rewards/rejected": 1.0666072368621826, + "step": 804 + }, + { + "epoch": 2.84, + "learning_rate": 7.035974179780802e-10, + "logits/chosen": -2.0171632766723633, + "logits/rejected": -2.017486572265625, + "logps/chosen": -2.8762121200561523, + "logps/rejected": -1.2975009679794312, + "loss": 0.7995, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9342976808547974, + "rewards/margins": 0.04502969980239868, + "rewards/rejected": 0.8892680406570435, + "step": 805 + }, + { + "epoch": 2.85, + "learning_rate": 6.720501714418237e-10, + "logits/chosen": -1.944404125213623, + "logits/rejected": -1.9472570419311523, + "logps/chosen": -2.7875924110412598, + "logps/rejected": -3.815286636352539, + "loss": 0.598, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9704475402832031, + "rewards/margins": 0.4032118320465088, + "rewards/rejected": 0.5672357082366943, + "step": 806 + }, + { + "epoch": 2.85, + "learning_rate": 6.412216993262109e-10, + "logits/chosen": -2.0164501667022705, + "logits/rejected": -2.018580198287964, + "logps/chosen": -0.4500521719455719, + "logps/rejected": -4.2411885261535645, + "loss": 0.7971, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7837737202644348, + "rewards/margins": 0.4565250873565674, + "rewards/rejected": 0.32724860310554504, + "step": 807 + }, + { + "epoch": 2.86, + "learning_rate": 6.111124508434429e-10, + "logits/chosen": -2.0396528244018555, + "logits/rejected": -2.0395667552948, + "logps/chosen": -0.8864428997039795, + "logps/rejected": -3.5686864852905273, + "loss": 0.5591, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8007975816726685, + "rewards/margins": -0.0391480028629303, + "rewards/rejected": 0.8399455547332764, + "step": 808 + }, + { + "epoch": 2.86, + "learning_rate": 5.817228647256645e-10, + "logits/chosen": -2.0641558170318604, + "logits/rejected": -2.0697076320648193, + "logps/chosen": -1.664809226989746, + "logps/rejected": -4.126045227050781, + "loss": 0.4938, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8975797891616821, + "rewards/margins": 0.28764650225639343, + "rewards/rejected": 0.6099333167076111, + "step": 809 + }, + { + "epoch": 2.86, + "learning_rate": 5.530533692185979e-10, + "logits/chosen": -1.9513391256332397, + "logits/rejected": -1.9531267881393433, + "logps/chosen": -0.5320684313774109, + "logps/rejected": -8.918525695800781, + "loss": 0.8797, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7650554180145264, + "rewards/margins": -0.3129754960536957, + "rewards/rejected": 1.0780309438705444, + "step": 810 + }, + { + "epoch": 2.87, + "learning_rate": 5.251043820752532e-10, + "logits/chosen": -2.0092854499816895, + "logits/rejected": -2.0073013305664062, + "logps/chosen": -0.9783685803413391, + "logps/rejected": -6.281907081604004, + "loss": 0.8653, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.6941394805908203, + "rewards/margins": -0.2663889229297638, + "rewards/rejected": 0.9605283737182617, + "step": 811 + }, + { + "epoch": 2.87, + "learning_rate": 4.978763105498774e-10, + "logits/chosen": -2.0435092449188232, + "logits/rejected": -2.05438494682312, + "logps/chosen": -1.2127048969268799, + "logps/rejected": -8.860550880432129, + "loss": 0.5784, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8449338674545288, + "rewards/margins": 0.12326246500015259, + "rewards/rejected": 0.7216714024543762, + "step": 812 + }, + { + "epoch": 2.87, + "learning_rate": 4.713695513920146e-10, + "logits/chosen": -2.09495210647583, + "logits/rejected": -2.099759578704834, + "logps/chosen": -2.1475026607513428, + "logps/rejected": -2.779412031173706, + "loss": 0.6276, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.973555862903595, + "rewards/margins": 0.37973636388778687, + "rewards/rejected": 0.5938194990158081, + "step": 813 + }, + { + "epoch": 2.88, + "learning_rate": 4.455844908407058e-10, + "logits/chosen": -2.120521068572998, + "logits/rejected": -2.120384931564331, + "logps/chosen": -1.567115306854248, + "logps/rejected": -3.3288002014160156, + "loss": 0.779, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8318677544593811, + "rewards/margins": 0.3767734169960022, + "rewards/rejected": 0.4550943374633789, + "step": 814 + }, + { + "epoch": 2.88, + "learning_rate": 4.2052150461889255e-10, + "logits/chosen": -2.002415895462036, + "logits/rejected": -2.0093140602111816, + "logps/chosen": -0.6711132526397705, + "logps/rejected": -6.583756446838379, + "loss": 0.6939, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8185642957687378, + "rewards/margins": -0.038839131593704224, + "rewards/rejected": 0.8574033975601196, + "step": 815 + }, + { + "epoch": 2.88, + "learning_rate": 3.961809579279052e-10, + "logits/chosen": -1.9982293844223022, + "logits/rejected": -1.9951056241989136, + "logps/chosen": -0.8186591863632202, + "logps/rejected": -3.9018354415893555, + "loss": 0.6414, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.749245285987854, + "rewards/margins": -0.3045717775821686, + "rewards/rejected": 1.0538170337677002, + "step": 816 + }, + { + "epoch": 2.89, + "learning_rate": 3.7256320544218347e-10, + "logits/chosen": -2.003527879714966, + "logits/rejected": -2.005072832107544, + "logps/chosen": -3.7793867588043213, + "logps/rejected": -6.211068153381348, + "loss": 0.6617, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9092538356781006, + "rewards/margins": 0.32134103775024414, + "rewards/rejected": 0.5879127979278564, + "step": 817 + }, + { + "epoch": 2.89, + "learning_rate": 3.496685913040587e-10, + "logits/chosen": -1.9922999143600464, + "logits/rejected": -1.9976446628570557, + "logps/chosen": -6.562006950378418, + "logps/rejected": -7.466248512268066, + "loss": 0.6736, + "rewards/accuracies": 0.5, + "rewards/chosen": 1.2433792352676392, + "rewards/margins": 0.22580760717391968, + "rewards/rejected": 1.0175716876983643, + "step": 818 + }, + { + "epoch": 2.89, + "learning_rate": 3.2749744911879097e-10, + "logits/chosen": -2.0009734630584717, + "logits/rejected": -2.0241518020629883, + "logps/chosen": -2.6455605030059814, + "logps/rejected": -9.009742736816406, + "loss": 0.7734, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8321704864501953, + "rewards/margins": 0.05004623532295227, + "rewards/rejected": 0.7821242213249207, + "step": 819 + }, + { + "epoch": 2.9, + "learning_rate": 3.060501019496675e-10, + "logits/chosen": -1.973865032196045, + "logits/rejected": -1.9832249879837036, + "logps/chosen": -2.18282413482666, + "logps/rejected": -2.9646217823028564, + "loss": 0.7864, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8342374563217163, + "rewards/margins": 0.3023204207420349, + "rewards/rejected": 0.5319170355796814, + "step": 820 + }, + { + "epoch": 2.9, + "learning_rate": 2.8532686231332314e-10, + "logits/chosen": -1.9829798936843872, + "logits/rejected": -1.9943536520004272, + "logps/chosen": -8.42570972442627, + "logps/rejected": -6.503291606903076, + "loss": 0.5122, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2558943033218384, + "rewards/margins": 0.6750603914260864, + "rewards/rejected": 0.5808339715003967, + "step": 821 + }, + { + "epoch": 2.9, + "learning_rate": 2.65328032175155e-10, + "logits/chosen": -2.02443265914917, + "logits/rejected": -2.024357795715332, + "logps/chosen": -1.6051645278930664, + "logps/rejected": -2.46146297454834, + "loss": 0.6757, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8293092846870422, + "rewards/margins": 0.2602137625217438, + "rewards/rejected": 0.5690955519676208, + "step": 822 + }, + { + "epoch": 2.91, + "learning_rate": 2.460539029449704e-10, + "logits/chosen": -2.0500056743621826, + "logits/rejected": -2.0501322746276855, + "logps/chosen": -0.8298680186271667, + "logps/rejected": -2.790482997894287, + "loss": 0.6463, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8337892889976501, + "rewards/margins": 0.08092907071113586, + "rewards/rejected": 0.7528601884841919, + "step": 823 + }, + { + "epoch": 2.91, + "learning_rate": 2.2750475547267944e-10, + "logits/chosen": -2.0397164821624756, + "logits/rejected": -2.0350515842437744, + "logps/chosen": -13.666533470153809, + "logps/rejected": -2.1366047859191895, + "loss": 0.5986, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6419697999954224, + "rewards/margins": 0.7952867746353149, + "rewards/rejected": 0.8466830253601074, + "step": 824 + }, + { + "epoch": 2.92, + "learning_rate": 2.09680860044259e-10, + "logits/chosen": -2.0912158489227295, + "logits/rejected": -2.094843864440918, + "logps/chosen": -3.561723470687866, + "logps/rejected": -13.871569633483887, + "loss": 0.8286, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8103519082069397, + "rewards/margins": -0.20625068247318268, + "rewards/rejected": 1.016602635383606, + "step": 825 + }, + { + "epoch": 2.92, + "learning_rate": 1.925824763777839e-10, + "logits/chosen": -2.0132670402526855, + "logits/rejected": -2.0158472061157227, + "logps/chosen": -1.7290022373199463, + "logps/rejected": -8.724039077758789, + "loss": 0.6229, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7975382208824158, + "rewards/margins": 0.025670483708381653, + "rewards/rejected": 0.7718677520751953, + "step": 826 + }, + { + "epoch": 2.92, + "learning_rate": 1.7620985361964657e-10, + "logits/chosen": -2.0480146408081055, + "logits/rejected": -2.051694393157959, + "logps/chosen": -1.7294468879699707, + "logps/rejected": -1.9312429428100586, + "loss": 0.6513, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8340672254562378, + "rewards/margins": 0.23034626245498657, + "rewards/rejected": 0.6037209033966064, + "step": 827 + }, + { + "epoch": 2.93, + "learning_rate": 1.6056323034092655e-10, + "logits/chosen": -2.021005630493164, + "logits/rejected": -2.0277373790740967, + "logps/chosen": -2.056934356689453, + "logps/rejected": -2.9588775634765625, + "loss": 0.6769, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9481070041656494, + "rewards/margins": 0.49134087562561035, + "rewards/rejected": 0.45676612854003906, + "step": 828 + }, + { + "epoch": 2.93, + "learning_rate": 1.4564283453392668e-10, + "logits/chosen": -2.0975935459136963, + "logits/rejected": -2.1086902618408203, + "logps/chosen": -0.5289784073829651, + "logps/rejected": -9.571739196777344, + "loss": 0.7169, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7836428880691528, + "rewards/margins": -0.22626900672912598, + "rewards/rejected": 1.0099118947982788, + "step": 829 + }, + { + "epoch": 2.93, + "learning_rate": 1.3144888360883678e-10, + "logits/chosen": -2.0308711528778076, + "logits/rejected": -2.033994197845459, + "logps/chosen": -1.2955249547958374, + "logps/rejected": -9.49899673461914, + "loss": 0.7145, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.947948694229126, + "rewards/margins": -0.220572829246521, + "rewards/rejected": 1.168521523475647, + "step": 830 + }, + { + "epoch": 2.94, + "learning_rate": 1.179815843905585e-10, + "logits/chosen": -2.061231851577759, + "logits/rejected": -2.0631167888641357, + "logps/chosen": -0.6229865550994873, + "logps/rejected": -9.705669403076172, + "loss": 0.7143, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7747979164123535, + "rewards/margins": -0.1268276870250702, + "rewards/rejected": 0.9016256332397461, + "step": 831 + }, + { + "epoch": 2.94, + "learning_rate": 1.0524113311571881e-10, + "logits/chosen": -1.9860156774520874, + "logits/rejected": -1.9950871467590332, + "logps/chosen": -1.7189728021621704, + "logps/rejected": -8.194829940795898, + "loss": 0.6754, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9610680341720581, + "rewards/margins": 0.3282413184642792, + "rewards/rejected": 0.6328266859054565, + "step": 832 + }, + { + "epoch": 2.94, + "learning_rate": 9.322771542978891e-11, + "logits/chosen": -1.9380372762680054, + "logits/rejected": -2.0478572845458984, + "logps/chosen": -2.215471029281616, + "logps/rejected": -30.663349151611328, + "loss": 0.6994, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7683043479919434, + "rewards/margins": -0.2023550570011139, + "rewards/rejected": 0.9706593751907349, + "step": 833 + }, + { + "epoch": 2.95, + "learning_rate": 8.194150638438091e-11, + "logits/chosen": -2.048755645751953, + "logits/rejected": -2.05592942237854, + "logps/chosen": -9.412174224853516, + "logps/rejected": -9.657772064208984, + "loss": 0.759, + "rewards/accuracies": 0.5, + "rewards/chosen": 1.2216287851333618, + "rewards/margins": 0.14537420868873596, + "rewards/rejected": 1.0762546062469482, + "step": 834 + }, + { + "epoch": 2.95, + "learning_rate": 7.138267043471647e-11, + "logits/chosen": -1.9452147483825684, + "logits/rejected": -1.9459277391433716, + "logps/chosen": -7.112281799316406, + "logps/rejected": -1.7596951723098755, + "loss": 0.7148, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.9242203235626221, + "rewards/margins": 0.029816418886184692, + "rewards/rejected": 0.8944039344787598, + "step": 835 + }, + { + "epoch": 2.95, + "learning_rate": 6.155136143718986e-11, + "logits/chosen": -2.0129811763763428, + "logits/rejected": -2.077153444290161, + "logps/chosen": -0.9829760789871216, + "logps/rejected": -18.338348388671875, + "loss": 0.724, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9022408723831177, + "rewards/margins": -0.442083477973938, + "rewards/rejected": 1.3443243503570557, + "step": 836 + }, + { + "epoch": 2.96, + "learning_rate": 5.244772264717534e-11, + "logits/chosen": -2.1030113697052, + "logits/rejected": -2.1015119552612305, + "logps/chosen": -0.4493102729320526, + "logps/rejected": -6.1840667724609375, + "loss": 0.7172, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7310956716537476, + "rewards/margins": -0.0818982720375061, + "rewards/rejected": 0.8129938840866089, + "step": 837 + }, + { + "epoch": 2.96, + "learning_rate": 4.407188671690098e-11, + "logits/chosen": -2.0206961631774902, + "logits/rejected": -2.035187005996704, + "logps/chosen": -5.446196556091309, + "logps/rejected": -13.069404602050781, + "loss": 0.8257, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.831396222114563, + "rewards/margins": -0.33707985281944275, + "rewards/rejected": 1.1684761047363281, + "step": 838 + }, + { + "epoch": 2.96, + "learning_rate": 3.642397569353361e-11, + "logits/chosen": -1.9943350553512573, + "logits/rejected": -1.9986001253128052, + "logps/chosen": -1.6003903150558472, + "logps/rejected": -8.17935848236084, + "loss": 0.6621, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8270349502563477, + "rewards/margins": -0.3379298448562622, + "rewards/rejected": 1.1649647951126099, + "step": 839 + }, + { + "epoch": 2.97, + "learning_rate": 2.950410101740797e-11, + "logits/chosen": -2.016922950744629, + "logits/rejected": -2.022000789642334, + "logps/chosen": -2.342677354812622, + "logps/rejected": -1.8881261348724365, + "loss": 0.5578, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9865232706069946, + "rewards/margins": 0.3047555088996887, + "rewards/rejected": 0.6817677021026611, + "step": 840 + }, + { + "epoch": 2.97, + "learning_rate": 2.331236352037802e-11, + "logits/chosen": -1.9783161878585815, + "logits/rejected": -1.9863388538360596, + "logps/chosen": -2.727750539779663, + "logps/rejected": -2.090574264526367, + "loss": 0.7271, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9157295823097229, + "rewards/margins": 0.24466660618782043, + "rewards/rejected": 0.6710629463195801, + "step": 841 + }, + { + "epoch": 2.98, + "learning_rate": 1.7848853424362597e-11, + "logits/chosen": -1.982742190361023, + "logits/rejected": -1.989923357963562, + "logps/chosen": -1.5878386497497559, + "logps/rejected": -6.5481672286987305, + "loss": 0.5858, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7466580867767334, + "rewards/margins": -0.024029135704040527, + "rewards/rejected": 0.7706872224807739, + "step": 842 + }, + { + "epoch": 2.98, + "learning_rate": 1.3113650340046412e-11, + "logits/chosen": -2.011815071105957, + "logits/rejected": -2.0130198001861572, + "logps/chosen": -1.9644601345062256, + "logps/rejected": -6.847151756286621, + "loss": 0.7452, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8284903764724731, + "rewards/margins": 0.07334145903587341, + "rewards/rejected": 0.7551488876342773, + "step": 843 + }, + { + "epoch": 2.98, + "learning_rate": 9.10682326568102e-12, + "logits/chosen": -2.061251163482666, + "logits/rejected": -2.0599050521850586, + "logps/chosen": -1.692623496055603, + "logps/rejected": -2.3346753120422363, + "loss": 0.6488, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8229697942733765, + "rewards/margins": 0.1958860456943512, + "rewards/rejected": 0.6270837783813477, + "step": 844 + }, + { + "epoch": 2.99, + "learning_rate": 5.828430586124478e-12, + "logits/chosen": -2.2280962467193604, + "logits/rejected": -2.2326550483703613, + "logps/chosen": -2.4956228733062744, + "logps/rejected": -7.903504371643066, + "loss": 0.7942, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8894762992858887, + "rewards/margins": -0.06208790838718414, + "rewards/rejected": 0.9515641927719116, + "step": 845 + }, + { + "epoch": 2.99, + "learning_rate": 3.2785200719476212e-12, + "logits/chosen": -2.121659755706787, + "logits/rejected": -2.121368169784546, + "logps/chosen": -2.858590602874756, + "logps/rejected": -7.644906520843506, + "loss": 0.6906, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8503637909889221, + "rewards/margins": -0.07914632558822632, + "rewards/rejected": 0.9295101165771484, + "step": 846 + }, + { + "epoch": 2.99, + "learning_rate": 1.4571288787790237e-12, + "logits/chosen": -2.021636962890625, + "logits/rejected": -2.02209210395813, + "logps/chosen": -0.8179419040679932, + "logps/rejected": -2.9099676609039307, + "loss": 0.594, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8382905125617981, + "rewards/margins": 0.3059450089931488, + "rewards/rejected": 0.5323455333709717, + "step": 847 + }, + { + "epoch": 3.0, + "learning_rate": 3.642835467165817e-13, + "logits/chosen": -2.014096975326538, + "logits/rejected": -2.015622615814209, + "logps/chosen": -1.4956687688827515, + "logps/rejected": -3.2233171463012695, + "loss": 0.713, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8237510919570923, + "rewards/margins": 0.1960834115743637, + "rewards/rejected": 0.6276677250862122, + "step": 848 + }, + { + "epoch": 3.0, + "learning_rate": 0.0, + "logits/chosen": -2.0429530143737793, + "logits/rejected": -2.0433237552642822, + "logps/chosen": -2.1422834396362305, + "logps/rejected": -3.175344467163086, + "loss": 0.6515, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7805936336517334, + "rewards/margins": 0.16026803851127625, + "rewards/rejected": 0.6203255653381348, + "step": 849 + }, + { + "epoch": 3.0, + "step": 849, + "total_flos": 0.0, + "train_loss": 0.6737315421320664, + "train_runtime": 6413.6412, + "train_samples_per_second": 1.057, + "train_steps_per_second": 0.132 + } + ], + "logging_steps": 1.0, + "max_steps": 849, + "num_train_epochs": 3, + "save_steps": 200, + "total_flos": 0.0, + "trial_name": null, + "trial_params": null +}