diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,15065 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.6096631611034903, + "eval_steps": 500, + "global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0006096631611034903, + "grad_norm": 0.0, + "learning_rate": 0.0, + "logits/chosen": 0.05287215858697891, + "logits/rejected": 0.009399833157658577, + "logps/chosen": -73.52249145507812, + "logps/rejected": -51.21772003173828, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0012193263222069807, + "grad_norm": 0.0, + "learning_rate": 0.0, + "logits/chosen": -0.016249075531959534, + "logits/rejected": 0.055124565958976746, + "logps/chosen": -168.17079162597656, + "logps/rejected": -134.3463592529297, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2 + }, + { + "epoch": 0.001828989483310471, + "grad_norm": 0.0, + "learning_rate": 0.0, + "logits/chosen": 0.039522528648376465, + "logits/rejected": 0.035245977342128754, + "logps/chosen": -83.67476654052734, + "logps/rejected": -100.48914337158203, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3 + }, + { + "epoch": 0.0024386526444139613, + "grad_norm": 0.0, + "learning_rate": 0.0, + "logits/chosen": 0.1155989021062851, + "logits/rejected": 0.03127114474773407, + "logps/chosen": -313.77618408203125, + "logps/rejected": -281.04266357421875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4 + }, + { + "epoch": 0.0030483158055174516, + "grad_norm": 0.0, + "learning_rate": 0.0, + "logits/chosen": -0.04632345587015152, + "logits/rejected": 0.047024864703416824, + "logps/chosen": -153.97320556640625, + "logps/rejected": -97.37532043457031, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5 + }, + { + "epoch": 0.003657978966620942, + "grad_norm": 68.37611427847516, + "learning_rate": 4.390243902439024e-11, + "logits/chosen": 0.35580092668533325, + "logits/rejected": 0.34043076634407043, + "logps/chosen": -150.33229064941406, + "logps/rejected": -127.56856536865234, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6 + }, + { + "epoch": 0.004267642127724432, + "grad_norm": 68.37611427847516, + "learning_rate": 4.390243902439024e-11, + "logits/chosen": 0.16691184043884277, + "logits/rejected": 0.06875558197498322, + "logps/chosen": -21.509504318237305, + "logps/rejected": -56.21983337402344, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7 + }, + { + "epoch": 0.004877305288827923, + "grad_norm": 74.25793363323966, + "learning_rate": 8.780487804878048e-11, + "logits/chosen": -0.29858696460723877, + "logits/rejected": -0.16083328425884247, + "logps/chosen": -211.10525512695312, + "logps/rejected": -231.42721557617188, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 
0.0, + "step": 8 + }, + { + "epoch": 0.0054869684499314125, + "grad_norm": 70.63404747210473, + "learning_rate": 1.3170731707317074e-10, + "logits/chosen": 0.017641346901655197, + "logits/rejected": 0.12449988722801208, + "logps/chosen": -317.4058837890625, + "logps/rejected": -266.26617431640625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 9 + }, + { + "epoch": 0.006096631611034903, + "grad_norm": 70.47359046304224, + "learning_rate": 1.7560975609756095e-10, + "logits/chosen": -0.17641407251358032, + "logits/rejected": 0.1076519638299942, + "logps/chosen": -291.4444580078125, + "logps/rejected": -167.51348876953125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 10 + }, + { + "epoch": 0.006706294772138394, + "grad_norm": 71.24522499173389, + "learning_rate": 2.1951219512195122e-10, + "logits/chosen": 0.2724137306213379, + "logits/rejected": 0.16814950108528137, + "logps/chosen": -113.14727783203125, + "logps/rejected": -174.35382080078125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 11 + }, + { + "epoch": 0.007315957933241884, + "grad_norm": 70.93965975944207, + "learning_rate": 2.634146341463415e-10, + "logits/chosen": 0.3719051778316498, + "logits/rejected": 0.08294087648391724, + "logps/chosen": -60.49693298339844, + "logps/rejected": -129.46234130859375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 12 + }, + { + "epoch": 0.007925621094345374, + "grad_norm": 65.48114720101339, + "learning_rate": 3.073170731707317e-10, + "logits/chosen": -0.2717147469520569, + "logits/rejected": -0.35204601287841797, + "logps/chosen": -130.41958618164062, + "logps/rejected": -90.1414794921875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 13 + }, + { + "epoch": 0.008535284255448864, + "grad_norm": 85.3419090068418, + "learning_rate": 3.512195121951219e-10, + "logits/chosen": 0.14461614191532135, + "logits/rejected": 0.12076608836650848, + "logps/chosen": -194.26065063476562, + "logps/rejected": -214.17393493652344, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 14 + }, + { + "epoch": 0.009144947416552356, + "grad_norm": 62.78589973694846, + "learning_rate": 3.9512195121951215e-10, + "logits/chosen": 0.06305968761444092, + "logits/rejected": -0.0633743405342102, + "logps/chosen": -106.2256851196289, + "logps/rejected": -165.22598266601562, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 15 + }, + { + "epoch": 0.009754610577655845, + "grad_norm": 69.97682774304194, + "learning_rate": 4.3902439024390244e-10, + "logits/chosen": 0.0744749903678894, + "logits/rejected": 0.043965235352516174, + "logps/chosen": -143.64990234375, + "logps/rejected": -198.17315673828125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 16 + }, + { + "epoch": 0.010364273738759335, + "grad_norm": 75.24344073025307, + "learning_rate": 4.829268292682926e-10, + "logits/chosen": 0.27772170305252075, + "logits/rejected": 
-0.03273066133260727, + "logps/chosen": -175.71385192871094, + "logps/rejected": -165.26828002929688, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 17 + }, + { + "epoch": 0.010973936899862825, + "grad_norm": 77.00834822090226, + "learning_rate": 5.26829268292683e-10, + "logits/chosen": 0.02788539230823517, + "logits/rejected": 0.21624302864074707, + "logps/chosen": -133.7073974609375, + "logps/rejected": -54.169464111328125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 18 + }, + { + "epoch": 0.011583600060966317, + "grad_norm": 70.54084146607406, + "learning_rate": 5.707317073170731e-10, + "logits/chosen": -0.0591580867767334, + "logits/rejected": 0.33370155096054077, + "logps/chosen": -346.8431396484375, + "logps/rejected": -155.22689819335938, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 19 + }, + { + "epoch": 0.012193263222069806, + "grad_norm": 67.3297470565744, + "learning_rate": 6.146341463414634e-10, + "logits/chosen": 0.191809743642807, + "logits/rejected": 0.17111678421497345, + "logps/chosen": -10.86440658569336, + "logps/rejected": -36.462684631347656, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 20 + }, + { + "epoch": 0.012802926383173296, + "grad_norm": 67.3297470565744, + "learning_rate": 6.146341463414634e-10, + "logits/chosen": -0.1280287802219391, + "logits/rejected": 0.1918884813785553, + "logps/chosen": -325.1832275390625, + "logps/rejected": -239.93368530273438, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 21 + }, + { + "epoch": 0.013412589544276788, + "grad_norm": 67.3297470565744, + "learning_rate": 6.146341463414634e-10, + "logits/chosen": 0.13415230810642242, + "logits/rejected": 0.10988141596317291, + "logps/chosen": -18.177507400512695, + "logps/rejected": -82.06600952148438, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 22 + }, + { + "epoch": 0.014022252705380277, + "grad_norm": 93.08074695025091, + "learning_rate": 6.585365853658536e-10, + "logits/chosen": 0.22793379426002502, + "logits/rejected": 0.19679906964302063, + "logps/chosen": -124.44007873535156, + "logps/rejected": -148.74542236328125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 23 + }, + { + "epoch": 0.014631915866483767, + "grad_norm": 93.24059814069281, + "learning_rate": 7.024390243902438e-10, + "logits/chosen": -0.2758808732032776, + "logits/rejected": -0.08728434145450592, + "logps/chosen": -591.4380493164062, + "logps/rejected": -279.4563293457031, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 24 + }, + { + "epoch": 0.015241579027587259, + "grad_norm": 68.63554830592973, + "learning_rate": 7.463414634146342e-10, + "logits/chosen": 0.40160396695137024, + "logits/rejected": 0.3144241273403168, + "logps/chosen": -59.92793273925781, + "logps/rejected": -72.62495422363281, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, 
+ "step": 25 + }, + { + "epoch": 0.01585124218869075, + "grad_norm": 76.72619070695632, + "learning_rate": 7.902439024390243e-10, + "logits/chosen": 0.06995319575071335, + "logits/rejected": 0.0718432143330574, + "logps/chosen": -21.1335506439209, + "logps/rejected": -39.64347839355469, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 26 + }, + { + "epoch": 0.01646090534979424, + "grad_norm": 67.71348466576721, + "learning_rate": 8.341463414634145e-10, + "logits/chosen": 0.19081905484199524, + "logits/rejected": 0.107520692050457, + "logps/chosen": -74.09236145019531, + "logps/rejected": -84.74115753173828, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 27 + }, + { + "epoch": 0.017070568510897728, + "grad_norm": 70.6623181277699, + "learning_rate": 8.780487804878049e-10, + "logits/chosen": 0.06893181055784225, + "logits/rejected": -0.011756572872400284, + "logps/chosen": -141.17626953125, + "logps/rejected": -244.47691345214844, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 28 + }, + { + "epoch": 0.01768023167200122, + "grad_norm": 65.09736234964716, + "learning_rate": 9.21951219512195e-10, + "logits/chosen": -0.11326849460601807, + "logits/rejected": 0.057974159717559814, + "logps/chosen": -265.6029357910156, + "logps/rejected": -153.57164001464844, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 29 + }, + { + "epoch": 0.01828989483310471, + "grad_norm": 62.02073777738137, + "learning_rate": 9.658536585365852e-10, + "logits/chosen": 0.08635324239730835, + "logits/rejected": 0.07806559652090073, + "logps/chosen": -14.0606689453125, + "logps/rejected": -22.32585906982422, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 30 + }, + { + "epoch": 0.0188995579942082, + "grad_norm": 74.42992018793248, + "learning_rate": 1.0097560975609755e-09, + "logits/chosen": -0.038718074560165405, + "logits/rejected": 0.3243560791015625, + "logps/chosen": -75.48886108398438, + "logps/rejected": -26.728788375854492, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 31 + }, + { + "epoch": 0.01950922115531169, + "grad_norm": 64.29375640734017, + "learning_rate": 1.053658536585366e-09, + "logits/chosen": -0.09937749058008194, + "logits/rejected": -0.15032553672790527, + "logps/chosen": -99.7437973022461, + "logps/rejected": -160.94308471679688, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 32 + }, + { + "epoch": 0.020118884316415182, + "grad_norm": 70.93071036097813, + "learning_rate": 1.097560975609756e-09, + "logits/chosen": 0.19127793610095978, + "logits/rejected": 0.15430431067943573, + "logps/chosen": -160.15353393554688, + "logps/rejected": -216.41665649414062, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 33 + }, + { + "epoch": 0.02072854747751867, + "grad_norm": 74.52915654091993, + "learning_rate": 1.1414634146341462e-09, + "logits/chosen": 0.13181781768798828, + "logits/rejected": 0.2162623405456543, + "logps/chosen": 
-119.47815704345703, + "logps/rejected": -74.19168853759766, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 34 + }, + { + "epoch": 0.021338210638622162, + "grad_norm": 80.24022253894402, + "learning_rate": 1.1853658536585366e-09, + "logits/chosen": 0.11026953160762787, + "logits/rejected": 0.05110342800617218, + "logps/chosen": -156.3619384765625, + "logps/rejected": -169.68821716308594, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 35 + }, + { + "epoch": 0.02194787379972565, + "grad_norm": 61.9848191612332, + "learning_rate": 1.2292682926829269e-09, + "logits/chosen": -0.05266339331865311, + "logits/rejected": 0.08744192123413086, + "logps/chosen": -178.451416015625, + "logps/rejected": -166.0316619873047, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 36 + }, + { + "epoch": 0.02255753696082914, + "grad_norm": 78.38786858503308, + "learning_rate": 1.273170731707317e-09, + "logits/chosen": 0.14256571233272552, + "logits/rejected": 0.4338546693325043, + "logps/chosen": -300.1292419433594, + "logps/rejected": -200.44442749023438, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 37 + }, + { + "epoch": 0.023167200121932633, + "grad_norm": 77.88261701115505, + "learning_rate": 1.3170731707317072e-09, + "logits/chosen": 0.22669938206672668, + "logits/rejected": 0.2338542938232422, + "logps/chosen": -47.191864013671875, + "logps/rejected": -50.06071472167969, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 38 + }, + { + "epoch": 0.02377686328303612, + "grad_norm": 85.52421451349792, + "learning_rate": 1.3609756097560974e-09, + "logits/chosen": -0.032261237502098083, + "logits/rejected": 0.23262041807174683, + "logps/chosen": -409.99365234375, + "logps/rejected": -253.15380859375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 39 + }, + { + "epoch": 0.024386526444139613, + "grad_norm": 79.15481772304013, + "learning_rate": 1.4048780487804876e-09, + "logits/chosen": 0.25447630882263184, + "logits/rejected": 0.18663440644741058, + "logps/chosen": -336.67388916015625, + "logps/rejected": -161.332275390625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 40 + }, + { + "epoch": 0.024996189605243104, + "grad_norm": 71.22850423237752, + "learning_rate": 1.448780487804878e-09, + "logits/chosen": 0.2154097706079483, + "logits/rejected": 0.2652926743030548, + "logps/chosen": -132.73236083984375, + "logps/rejected": -77.12788391113281, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 41 + }, + { + "epoch": 0.025605852766346592, + "grad_norm": 77.86620038216259, + "learning_rate": 1.4926829268292683e-09, + "logits/chosen": 0.1286822408437729, + "logits/rejected": 0.3024751842021942, + "logps/chosen": -298.9327392578125, + "logps/rejected": -119.84310913085938, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 42 + }, + { + "epoch": 
0.026215515927450084, + "grad_norm": 73.10788956685286, + "learning_rate": 1.5365853658536586e-09, + "logits/chosen": 0.20254218578338623, + "logits/rejected": 0.5039613842964172, + "logps/chosen": -116.90916442871094, + "logps/rejected": -63.421058654785156, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 43 + }, + { + "epoch": 0.026825179088553575, + "grad_norm": 72.62025945728479, + "learning_rate": 1.5804878048780486e-09, + "logits/chosen": 0.08061984181404114, + "logits/rejected": 0.05667334049940109, + "logps/chosen": -122.43840026855469, + "logps/rejected": -128.56314086914062, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 44 + }, + { + "epoch": 0.027434842249657063, + "grad_norm": 70.52926879204011, + "learning_rate": 1.6243902439024388e-09, + "logits/chosen": -0.02340932935476303, + "logits/rejected": 0.043494515120983124, + "logps/chosen": -127.13800811767578, + "logps/rejected": -94.7784194946289, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 45 + }, + { + "epoch": 0.028044505410760555, + "grad_norm": 64.88774875207254, + "learning_rate": 1.668292682926829e-09, + "logits/chosen": 0.39714503288269043, + "logits/rejected": 0.39462360739707947, + "logps/chosen": -7.40703821182251, + "logps/rejected": -14.890948295593262, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 46 + }, + { + "epoch": 0.028654168571864046, + "grad_norm": 68.79365680982743, + "learning_rate": 1.7121951219512195e-09, + "logits/chosen": 0.08758289366960526, + "logits/rejected": 0.21223704516887665, + "logps/chosen": -224.2176513671875, + "logps/rejected": -224.02700805664062, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 47 + }, + { + "epoch": 0.029263831732967534, + "grad_norm": 71.49047996194585, + "learning_rate": 1.7560975609756097e-09, + "logits/chosen": 0.21929675340652466, + "logits/rejected": 0.2699821889400482, + "logps/chosen": -104.31961059570312, + "logps/rejected": -60.335113525390625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 48 + }, + { + "epoch": 0.029873494894071026, + "grad_norm": 70.73794598984682, + "learning_rate": 1.8e-09, + "logits/chosen": -0.3800536096096039, + "logits/rejected": -0.150841623544693, + "logps/chosen": -220.71783447265625, + "logps/rejected": -113.48838806152344, + "loss": 0.6933, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 49 + }, + { + "epoch": 0.030483158055174518, + "grad_norm": 66.7889498717846, + "learning_rate": 1.84390243902439e-09, + "logits/chosen": -0.06314300000667572, + "logits/rejected": 0.048857904970645905, + "logps/chosen": -239.1514129638672, + "logps/rejected": -155.99298095703125, + "loss": 0.6928, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.001027262187562883, + "rewards/margins": -0.000668776105158031, + "rewards/rejected": -0.00035848619882017374, + "step": 50 + }, + { + "epoch": 0.031092821216278006, + "grad_norm": 70.63181999151213, + "learning_rate": 1.8878048780487805e-09, + "logits/chosen": 0.06969748437404633, + "logits/rejected": 
0.427978515625, + "logps/chosen": -300.5170593261719, + "logps/rejected": -128.99072265625, + "loss": 0.6926, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.0006778716924600303, + "rewards/margins": -0.0008071899646893144, + "rewards/rejected": 0.0014850615989416838, + "step": 51 + }, + { + "epoch": 0.0317024843773815, + "grad_norm": 66.6174987991604, + "learning_rate": 1.9317073170731705e-09, + "logits/chosen": 0.28025972843170166, + "logits/rejected": 0.19407892227172852, + "logps/chosen": -57.48769760131836, + "logps/rejected": -79.77017974853516, + "loss": 0.6941, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.0017806501127779484, + "rewards/margins": -0.0018260792130604386, + "rewards/rejected": 4.542919486993924e-05, + "step": 52 + }, + { + "epoch": 0.032312147538484985, + "grad_norm": 81.19668841318229, + "learning_rate": 1.975609756097561e-09, + "logits/chosen": 0.20146635174751282, + "logits/rejected": 0.13105042278766632, + "logps/chosen": -192.02371215820312, + "logps/rejected": -272.7171325683594, + "loss": 0.6923, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.0005275249131955206, + "rewards/margins": -0.0012145042419433594, + "rewards/rejected": 0.000686979154124856, + "step": 53 + }, + { + "epoch": 0.03292181069958848, + "grad_norm": 66.9314671755827, + "learning_rate": 2.019512195121951e-09, + "logits/chosen": -0.07518087327480316, + "logits/rejected": 0.33032354712486267, + "logps/chosen": -210.52719116210938, + "logps/rejected": -185.80984497070312, + "loss": 0.6923, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0013447999954223633, + "rewards/margins": -0.002561330795288086, + "rewards/rejected": 0.0012165309162810445, + "step": 54 + }, + { + "epoch": 0.03353147386069197, + "grad_norm": 70.91475343182961, + "learning_rate": 2.0634146341463414e-09, + "logits/chosen": 0.23357635736465454, + "logits/rejected": 0.08638399094343185, + "logps/chosen": -166.65553283691406, + "logps/rejected": -243.26834106445312, + "loss": 0.6938, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0004352211835794151, + "rewards/margins": 0.002582198241725564, + "rewards/rejected": -0.0030174197163432837, + "step": 55 + }, + { + "epoch": 0.034141137021795456, + "grad_norm": 60.88045789418313, + "learning_rate": 2.107317073170732e-09, + "logits/chosen": 0.1865972876548767, + "logits/rejected": 0.011848561465740204, + "logps/chosen": -141.57260131835938, + "logps/rejected": -143.6577911376953, + "loss": 0.6945, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.002753830049186945, + "rewards/margins": -0.0051968577317893505, + "rewards/rejected": 0.0024430276826024055, + "step": 56 + }, + { + "epoch": 0.03475080018289895, + "grad_norm": 70.07864758335027, + "learning_rate": 2.151219512195122e-09, + "logits/chosen": 0.07055716216564178, + "logits/rejected": 0.12593163549900055, + "logps/chosen": -262.76190185546875, + "logps/rejected": -185.82838439941406, + "loss": 0.6939, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.0021800040267407894, + "rewards/margins": -0.0016491890419274569, + "rewards/rejected": 0.00382919330149889, + "step": 57 + }, + { + "epoch": 0.03536046334400244, + "grad_norm": 73.56352249882974, + "learning_rate": 2.195121951219512e-09, + "logits/chosen": 0.04468034580349922, + "logits/rejected": 0.19408458471298218, + "logps/chosen": -31.595966339111328, + "logps/rejected": -31.431028366088867, + "loss": 0.6924, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0004551530000753701, + "rewards/margins": -0.0012581408955156803, + "rewards/rejected": 
0.0017132939537987113, + "step": 58 + }, + { + "epoch": 0.03597012650510593, + "grad_norm": 80.33636599620638, + "learning_rate": 2.2390243902439024e-09, + "logits/chosen": 0.3012107312679291, + "logits/rejected": 0.27375027537345886, + "logps/chosen": -148.99156188964844, + "logps/rejected": -32.328948974609375, + "loss": 0.6933, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.0038415552116930485, + "rewards/margins": -0.003134638536721468, + "rewards/rejected": -0.0007069166749715805, + "step": 59 + }, + { + "epoch": 0.03657978966620942, + "grad_norm": 70.2703831832364, + "learning_rate": 2.2829268292682924e-09, + "logits/chosen": -0.024738460779190063, + "logits/rejected": -0.07055769860744476, + "logps/chosen": -167.5659942626953, + "logps/rejected": -138.68312072753906, + "loss": 0.6925, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0011559011181816459, + "rewards/margins": -0.0006197362090460956, + "rewards/rejected": 0.0017756373854354024, + "step": 60 + }, + { + "epoch": 0.03718945282731291, + "grad_norm": 74.5492664117818, + "learning_rate": 2.326829268292683e-09, + "logits/chosen": -0.18852980434894562, + "logits/rejected": 0.006448015570640564, + "logps/chosen": -165.77377319335938, + "logps/rejected": -177.71710205078125, + "loss": 0.6927, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.0011799321509897709, + "rewards/margins": -0.004253004677593708, + "rewards/rejected": 0.003073072526603937, + "step": 61 + }, + { + "epoch": 0.0377991159884164, + "grad_norm": 63.99980911512165, + "learning_rate": 2.3707317073170733e-09, + "logits/chosen": -0.15407471358776093, + "logits/rejected": 0.41999027132987976, + "logps/chosen": -127.58901977539062, + "logps/rejected": -43.56388473510742, + "loss": 0.6917, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.007107532117515802, + "rewards/margins": 0.010301482863724232, + "rewards/rejected": -0.0031939507462084293, + "step": 62 + }, + { + "epoch": 0.038408779149519894, + "grad_norm": 82.13245010916935, + "learning_rate": 2.4146341463414633e-09, + "logits/chosen": -0.10190024226903915, + "logits/rejected": 0.2237187922000885, + "logps/chosen": -229.680908203125, + "logps/rejected": -208.50599670410156, + "loss": 0.6917, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0031118900515139103, + "rewards/margins": 0.000753405736759305, + "rewards/rejected": 0.0023584843147546053, + "step": 63 + }, + { + "epoch": 0.03901844231062338, + "grad_norm": 70.61017230058009, + "learning_rate": 2.4585365853658538e-09, + "logits/chosen": 0.11584638804197311, + "logits/rejected": 0.20095054805278778, + "logps/chosen": -173.85516357421875, + "logps/rejected": -126.08563995361328, + "loss": 0.6931, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.002786439610645175, + "rewards/margins": 0.0041068256832659245, + "rewards/rejected": -0.0013203859562054276, + "step": 64 + }, + { + "epoch": 0.03962810547172687, + "grad_norm": 84.0050894833399, + "learning_rate": 2.502439024390244e-09, + "logits/chosen": 0.11885404586791992, + "logits/rejected": 0.08198799192905426, + "logps/chosen": -170.72412109375, + "logps/rejected": -198.9031219482422, + "loss": 0.6931, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.003328704973682761, + "rewards/margins": 0.009418916888535023, + "rewards/rejected": -0.006090211682021618, + "step": 65 + }, + { + "epoch": 0.040237768632830365, + "grad_norm": 70.46414310404583, + "learning_rate": 2.546341463414634e-09, + "logits/chosen": 0.2348729968070984, + "logits/rejected": 0.05341774597764015, + "logps/chosen": 
-185.4467315673828, + "logps/rejected": -201.00570678710938, + "loss": 0.6933, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.003058910369873047, + "rewards/margins": 0.006163597106933594, + "rewards/rejected": -0.003104686737060547, + "step": 66 + }, + { + "epoch": 0.04084743179393385, + "grad_norm": 84.39671887821069, + "learning_rate": 2.590243902439024e-09, + "logits/chosen": 0.1465904712677002, + "logits/rejected": 0.15641841292381287, + "logps/chosen": -25.62973403930664, + "logps/rejected": -33.12474822998047, + "loss": 0.6928, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0015766852302476764, + "rewards/margins": 9.535928256809711e-06, + "rewards/rejected": 0.0015671491855755448, + "step": 67 + }, + { + "epoch": 0.04145709495503734, + "grad_norm": 80.88343791628121, + "learning_rate": 2.6341463414634143e-09, + "logits/chosen": 0.16637735068798065, + "logits/rejected": 0.14293934404850006, + "logps/chosen": -79.70658874511719, + "logps/rejected": -96.13426208496094, + "loss": 0.6933, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.004015589132905006, + "rewards/margins": 0.0017949821194633842, + "rewards/rejected": 0.0022206068970263004, + "step": 68 + }, + { + "epoch": 0.04206675811614083, + "grad_norm": 95.81105276337522, + "learning_rate": 2.6780487804878048e-09, + "logits/chosen": 0.1661253273487091, + "logits/rejected": 0.3702969253063202, + "logps/chosen": -137.08416748046875, + "logps/rejected": -70.77066802978516, + "loss": 0.6934, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.005868458654731512, + "rewards/margins": -0.004885214846581221, + "rewards/rejected": -0.000983244157396257, + "step": 69 + }, + { + "epoch": 0.042676421277244324, + "grad_norm": 85.50609013077252, + "learning_rate": 2.7219512195121948e-09, + "logits/chosen": 0.04000654071569443, + "logits/rejected": 0.33832550048828125, + "logps/chosen": -253.7117919921875, + "logps/rejected": -155.51600646972656, + "loss": 0.6935, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.00038905144901946187, + "rewards/margins": 0.004518482368439436, + "rewards/rejected": -0.004907533526420593, + "step": 70 + }, + { + "epoch": 0.04328608443834781, + "grad_norm": 80.53614050473402, + "learning_rate": 2.7658536585365852e-09, + "logits/chosen": 0.17947782576084137, + "logits/rejected": 0.17558667063713074, + "logps/chosen": -22.479354858398438, + "logps/rejected": -23.31631088256836, + "loss": 0.6921, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0022156001068651676, + "rewards/margins": 0.0005453706253319979, + "rewards/rejected": 0.001670229365117848, + "step": 71 + }, + { + "epoch": 0.0438957475994513, + "grad_norm": 75.93425185505885, + "learning_rate": 2.8097560975609753e-09, + "logits/chosen": 0.15121379494667053, + "logits/rejected": -0.04035184532403946, + "logps/chosen": -91.0668716430664, + "logps/rejected": -99.52243041992188, + "loss": 0.6937, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0011352181900292635, + "rewards/margins": 0.005007707979530096, + "rewards/rejected": -0.0038724897895008326, + "step": 72 + }, + { + "epoch": 0.044505410760554795, + "grad_norm": 64.74961747089739, + "learning_rate": 2.8536585365853657e-09, + "logits/chosen": 0.19988885521888733, + "logits/rejected": 0.16584698855876923, + "logps/chosen": -107.76717376708984, + "logps/rejected": -147.28042602539062, + "loss": 0.6938, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0007725238101556897, + "rewards/margins": 0.001528930850327015, + "rewards/rejected": -0.0007564069237560034, + "step": 73 + }, + { 
+ "epoch": 0.04511507392165828, + "grad_norm": 84.10029335113224, + "learning_rate": 2.897560975609756e-09, + "logits/chosen": -0.034904882311820984, + "logits/rejected": 0.03536960482597351, + "logps/chosen": -150.70921325683594, + "logps/rejected": -80.46929931640625, + "loss": 0.6914, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.00169027887750417, + "rewards/margins": -0.0003446043701842427, + "rewards/rejected": 0.0020348832476884127, + "step": 74 + }, + { + "epoch": 0.04572473708276177, + "grad_norm": 72.00711273151795, + "learning_rate": 2.941463414634146e-09, + "logits/chosen": 0.0597052201628685, + "logits/rejected": 0.03572994843125343, + "logps/chosen": -350.35992431640625, + "logps/rejected": -322.17724609375, + "loss": 0.6935, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.00667121447622776, + "rewards/margins": -0.00563659705221653, + "rewards/rejected": -0.0010346174240112305, + "step": 75 + }, + { + "epoch": 0.046334400243865266, + "grad_norm": 74.23989965163715, + "learning_rate": 2.9853658536585366e-09, + "logits/chosen": 0.31788504123687744, + "logits/rejected": 0.3128117322921753, + "logps/chosen": -7.529122352600098, + "logps/rejected": -8.063995361328125, + "loss": 0.6936, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.000852310738991946, + "rewards/margins": 0.00029422639636322856, + "rewards/rejected": -0.0011465370189398527, + "step": 76 + }, + { + "epoch": 0.046944063404968754, + "grad_norm": 72.2708304153655, + "learning_rate": 3.0292682926829267e-09, + "logits/chosen": 0.3397638201713562, + "logits/rejected": 0.20202289521694183, + "logps/chosen": -78.62361145019531, + "logps/rejected": -112.6453628540039, + "loss": 0.6936, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0005385488038882613, + "rewards/margins": -0.0005318729672580957, + "rewards/rejected": -6.67572021484375e-06, + "step": 77 + }, + { + "epoch": 0.04755372656607224, + "grad_norm": 75.04661372709796, + "learning_rate": 3.073170731707317e-09, + "logits/chosen": 0.35530340671539307, + "logits/rejected": 0.22375106811523438, + "logps/chosen": -64.1614990234375, + "logps/rejected": -93.27325439453125, + "loss": 0.694, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0016937867039814591, + "rewards/margins": 0.004260053858160973, + "rewards/rejected": -0.002566267503425479, + "step": 78 + }, + { + "epoch": 0.04816338972717574, + "grad_norm": 83.73910637650769, + "learning_rate": 3.1170731707317067e-09, + "logits/chosen": -0.10525587201118469, + "logits/rejected": -0.14019280672073364, + "logps/chosen": -54.87730407714844, + "logps/rejected": -60.87992858886719, + "loss": 0.6927, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.00018210409325547516, + "rewards/margins": 0.0017521620029583573, + "rewards/rejected": -0.0019342661835253239, + "step": 79 + }, + { + "epoch": 0.048773052888279225, + "grad_norm": 65.36609025123921, + "learning_rate": 3.160975609756097e-09, + "logits/chosen": 0.20115497708320618, + "logits/rejected": 0.19292600452899933, + "logps/chosen": -43.20243835449219, + "logps/rejected": -28.736347198486328, + "loss": 0.6924, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0015144406352192163, + "rewards/margins": -0.001647680765017867, + "rewards/rejected": 0.0001332402171101421, + "step": 80 + }, + { + "epoch": 0.04938271604938271, + "grad_norm": 68.35723980447455, + "learning_rate": 3.2048780487804876e-09, + "logits/chosen": 0.06546074151992798, + "logits/rejected": 0.06650267541408539, + "logps/chosen": -45.10093307495117, + "logps/rejected": 
-144.30990600585938, + "loss": 0.6946, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.0006906271446496248, + "rewards/margins": -0.0065479036420583725, + "rewards/rejected": 0.005857276730239391, + "step": 81 + }, + { + "epoch": 0.04999237921048621, + "grad_norm": 76.28615406166055, + "learning_rate": 3.2487804878048777e-09, + "logits/chosen": -0.11237098276615143, + "logits/rejected": 0.06433381885290146, + "logps/chosen": -205.50204467773438, + "logps/rejected": -182.95997619628906, + "loss": 0.6935, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0007084847311489284, + "rewards/margins": 0.003876638598740101, + "rewards/rejected": -0.0045851231552660465, + "step": 82 + }, + { + "epoch": 0.050602042371589696, + "grad_norm": 80.94256607086295, + "learning_rate": 3.292682926829268e-09, + "logits/chosen": 0.3187202513217926, + "logits/rejected": 0.2065531611442566, + "logps/chosen": -255.35821533203125, + "logps/rejected": -217.7652587890625, + "loss": 0.6927, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.006643927656114101, + "rewards/margins": 0.01088404655456543, + "rewards/rejected": -0.004240119829773903, + "step": 83 + }, + { + "epoch": 0.051211705532693184, + "grad_norm": 71.45394858065539, + "learning_rate": 3.336585365853658e-09, + "logits/chosen": -0.25007927417755127, + "logits/rejected": 0.15381662547588348, + "logps/chosen": -180.00770568847656, + "logps/rejected": -132.6005096435547, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.0001525879488326609, + "rewards/margins": -0.0020342827774584293, + "rewards/rejected": 0.0021868706680834293, + "step": 84 + }, + { + "epoch": 0.05182136869379668, + "grad_norm": 72.06045896192381, + "learning_rate": 3.3804878048780486e-09, + "logits/chosen": 0.2230096161365509, + "logits/rejected": 0.502068817615509, + "logps/chosen": -215.84043884277344, + "logps/rejected": -156.7900390625, + "loss": 0.692, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.004702424630522728, + "rewards/margins": 0.007914667949080467, + "rewards/rejected": -0.0032122433185577393, + "step": 85 + }, + { + "epoch": 0.05243103185490017, + "grad_norm": 71.38400999771645, + "learning_rate": 3.424390243902439e-09, + "logits/chosen": 0.024465292692184448, + "logits/rejected": 0.20754316449165344, + "logps/chosen": -322.27496337890625, + "logps/rejected": -150.3963623046875, + "loss": 0.6921, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0038268952630460262, + "rewards/margins": 0.0009583504870533943, + "rewards/rejected": 0.0028685452416539192, + "step": 86 + }, + { + "epoch": 0.053040695016003656, + "grad_norm": 74.51379013327342, + "learning_rate": 3.468292682926829e-09, + "logits/chosen": 0.16065523028373718, + "logits/rejected": -0.08133505284786224, + "logps/chosen": -104.59339904785156, + "logps/rejected": -266.986572265625, + "loss": 0.6925, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.003070497652515769, + "rewards/margins": 0.006459474563598633, + "rewards/rejected": -0.0033889771439135075, + "step": 87 + }, + { + "epoch": 0.05365035817710715, + "grad_norm": 64.65775651723335, + "learning_rate": 3.5121951219512195e-09, + "logits/chosen": 0.19086205959320068, + "logits/rejected": 0.08758498728275299, + "logps/chosen": -69.55133056640625, + "logps/rejected": -49.74491882324219, + "loss": 0.693, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.0007361411117017269, + "rewards/margins": -0.0010319054126739502, + "rewards/rejected": 0.00029576424276456237, + "step": 88 + }, + { + "epoch": 0.05426002133821064, + 
"grad_norm": 67.41490806453626, + "learning_rate": 3.5560975609756095e-09, + "logits/chosen": -0.3904249370098114, + "logits/rejected": 0.40122270584106445, + "logps/chosen": -295.4908142089844, + "logps/rejected": -181.67678833007812, + "loss": 0.6932, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.00012779224198311567, + "rewards/margins": 0.0018295288318768144, + "rewards/rejected": -0.001701736357063055, + "step": 89 + }, + { + "epoch": 0.05486968449931413, + "grad_norm": 69.7352013160028, + "learning_rate": 3.6e-09, + "logits/chosen": 0.02868320234119892, + "logits/rejected": 0.023117877542972565, + "logps/chosen": -123.01502990722656, + "logps/rejected": -327.2979431152344, + "loss": 0.6926, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0014316558372229338, + "rewards/margins": 0.0007941245567053556, + "rewards/rejected": -0.0022257803939282894, + "step": 90 + }, + { + "epoch": 0.05547934766041762, + "grad_norm": 85.9010441341789, + "learning_rate": 3.6439024390243904e-09, + "logits/chosen": -0.07268917560577393, + "logits/rejected": 0.03133467584848404, + "logps/chosen": -108.12759399414062, + "logps/rejected": -118.45396423339844, + "loss": 0.6948, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0018654584418982267, + "rewards/margins": -0.0015044212341308594, + "rewards/rejected": -0.0003610372659750283, + "step": 91 + }, + { + "epoch": 0.05608901082152111, + "grad_norm": 70.92085506184092, + "learning_rate": 3.68780487804878e-09, + "logits/chosen": 0.1769690215587616, + "logits/rejected": 0.2651098072528839, + "logps/chosen": -495.8620300292969, + "logps/rejected": -185.98446655273438, + "loss": 0.6921, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.00516090402379632, + "rewards/margins": 0.0031100749038159847, + "rewards/rejected": 0.0020508291199803352, + "step": 92 + }, + { + "epoch": 0.0566986739826246, + "grad_norm": 71.03428763196679, + "learning_rate": 3.73170731707317e-09, + "logits/chosen": 0.1058330312371254, + "logits/rejected": 0.13351327180862427, + "logps/chosen": -13.810348510742188, + "logps/rejected": -17.113176345825195, + "loss": 0.6944, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0006977797020226717, + "rewards/margins": 0.0007781625026836991, + "rewards/rejected": -8.038282976485789e-05, + "step": 93 + }, + { + "epoch": 0.05730833714372809, + "grad_norm": 73.25497477824051, + "learning_rate": 3.775609756097561e-09, + "logits/chosen": 0.26408499479293823, + "logits/rejected": 0.33262404799461365, + "logps/chosen": -404.1988830566406, + "logps/rejected": -239.35232543945312, + "loss": 0.6933, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.00642166193574667, + "rewards/margins": 0.005939484108239412, + "rewards/rejected": 0.00048217771109193563, + "step": 94 + }, + { + "epoch": 0.05791800030483158, + "grad_norm": 67.09278397767419, + "learning_rate": 3.819512195121951e-09, + "logits/chosen": 0.15695643424987793, + "logits/rejected": 0.18819648027420044, + "logps/chosen": -119.06463623046875, + "logps/rejected": -137.01116943359375, + "loss": 0.6927, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.00042524340096861124, + "rewards/margins": 0.0015760839451104403, + "rewards/rejected": -0.001150840544141829, + "step": 95 + }, + { + "epoch": 0.05852766346593507, + "grad_norm": 75.0674412350952, + "learning_rate": 3.863414634146341e-09, + "logits/chosen": 0.09307468682527542, + "logits/rejected": 0.08323965966701508, + "logps/chosen": -261.7306823730469, + "logps/rejected": -260.0284118652344, + "loss": 0.6939, + 
"rewards/accuracies": 0.75, + "rewards/chosen": -0.0001418590545654297, + "rewards/margins": -0.003254556329920888, + "rewards/rejected": 0.003112697508186102, + "step": 96 + }, + { + "epoch": 0.059137326627038564, + "grad_norm": 80.34872386837137, + "learning_rate": 3.907317073170732e-09, + "logits/chosen": 0.18482346832752228, + "logits/rejected": 0.10585962235927582, + "logps/chosen": -42.423362731933594, + "logps/rejected": -63.49856948852539, + "loss": 0.6924, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.00029054880724288523, + "rewards/margins": -0.001655351952649653, + "rewards/rejected": 0.0013648034073412418, + "step": 97 + }, + { + "epoch": 0.05974698978814205, + "grad_norm": 81.7468245081625, + "learning_rate": 3.951219512195122e-09, + "logits/chosen": -0.2540174722671509, + "logits/rejected": -0.12836764752864838, + "logps/chosen": -199.8457489013672, + "logps/rejected": -181.54751586914062, + "loss": 0.6937, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.0015849112533032894, + "rewards/margins": -0.0025141239166259766, + "rewards/rejected": 0.0009292126633226871, + "step": 98 + }, + { + "epoch": 0.06035665294924554, + "grad_norm": 65.7346035629931, + "learning_rate": 3.995121951219512e-09, + "logits/chosen": 0.2020762711763382, + "logits/rejected": 0.04373517259955406, + "logps/chosen": -17.618925094604492, + "logps/rejected": -26.497638702392578, + "loss": 0.6927, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.00015217059990391135, + "rewards/margins": -4.655128577724099e-05, + "rewards/rejected": 0.00019872188568115234, + "step": 99 + }, + { + "epoch": 0.060966316110349035, + "grad_norm": 72.7762663088043, + "learning_rate": 4.039024390243902e-09, + "logits/chosen": 0.027187101542949677, + "logits/rejected": -0.13702692091464996, + "logps/chosen": -240.42532348632812, + "logps/rejected": -361.12823486328125, + "loss": 0.6932, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.0015394926303997636, + "rewards/margins": -0.00020051910541951656, + "rewards/rejected": -0.0013389736413955688, + "step": 100 + }, + { + "epoch": 0.06157597927145252, + "grad_norm": 87.81366907788147, + "learning_rate": 4.082926829268293e-09, + "logits/chosen": -0.01530487835407257, + "logits/rejected": 0.003859208896756172, + "logps/chosen": -263.99493408203125, + "logps/rejected": -353.2584533691406, + "loss": 0.6919, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.009844470769166946, + "rewards/margins": 0.01946244202554226, + "rewards/rejected": -0.009617972187697887, + "step": 101 + }, + { + "epoch": 0.06218564243255601, + "grad_norm": 70.52878373104612, + "learning_rate": 4.126829268292683e-09, + "logits/chosen": 0.23338094353675842, + "logits/rejected": 0.19701853394508362, + "logps/chosen": -171.7310028076172, + "logps/rejected": -102.3233413696289, + "loss": 0.6942, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0015583753120154142, + "rewards/margins": 0.0036748647689819336, + "rewards/rejected": -0.0021164894569665194, + "step": 102 + }, + { + "epoch": 0.0627953055936595, + "grad_norm": 65.56038043811691, + "learning_rate": 4.170731707317073e-09, + "logits/chosen": 0.48671942949295044, + "logits/rejected": 0.4513268768787384, + "logps/chosen": -30.95024871826172, + "logps/rejected": -81.58662414550781, + "loss": 0.6926, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.0014354228042066097, + "rewards/margins": 0.0016133070457726717, + "rewards/rejected": -0.0030487298499792814, + "step": 103 + }, + { + "epoch": 0.063404968754763, + "grad_norm": 
64.36723338107112, + "learning_rate": 4.214634146341464e-09, + "logits/chosen": 0.19133959710597992, + "logits/rejected": 0.2739216387271881, + "logps/chosen": -125.99061584472656, + "logps/rejected": -232.95196533203125, + "loss": 0.6934, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0020974159706383944, + "rewards/margins": -0.0030968666542321444, + "rewards/rejected": 0.0009994508000090718, + "step": 104 + }, + { + "epoch": 0.06401463191586648, + "grad_norm": 66.7082232964686, + "learning_rate": 4.258536585365853e-09, + "logits/chosen": 0.13941365480422974, + "logits/rejected": 0.03759532794356346, + "logps/chosen": -130.84054565429688, + "logps/rejected": -240.2591094970703, + "loss": 0.6922, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.00239717960357666, + "rewards/margins": 0.0012583851348608732, + "rewards/rejected": -0.003655564971268177, + "step": 105 + }, + { + "epoch": 0.06462429507696997, + "grad_norm": 77.77334705609563, + "learning_rate": 4.302439024390244e-09, + "logits/chosen": 0.3149589002132416, + "logits/rejected": 0.12610098719596863, + "logps/chosen": -70.48065185546875, + "logps/rejected": -156.10791015625, + "loss": 0.6926, + "rewards/accuracies": 0.25, + "rewards/chosen": -3.081554314121604e-05, + "rewards/margins": 0.001992344856262207, + "rewards/rejected": -0.002023160457611084, + "step": 106 + }, + { + "epoch": 0.06523395823807346, + "grad_norm": 67.20989589144399, + "learning_rate": 4.346341463414634e-09, + "logits/chosen": 0.33753344416618347, + "logits/rejected": 0.18962666392326355, + "logps/chosen": -125.78367614746094, + "logps/rejected": -190.9263916015625, + "loss": 0.6929, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.002459835959598422, + "rewards/margins": 0.00020625581964850426, + "rewards/rejected": -0.00266609201207757, + "step": 107 + }, + { + "epoch": 0.06584362139917696, + "grad_norm": 71.6543977566246, + "learning_rate": 4.390243902439024e-09, + "logits/chosen": 0.1545097827911377, + "logits/rejected": 0.16738542914390564, + "logps/chosen": -42.83551788330078, + "logps/rejected": -58.91304016113281, + "loss": 0.6944, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.002740550087764859, + "rewards/margins": -0.007119977846741676, + "rewards/rejected": 0.0043794275261461735, + "step": 108 + }, + { + "epoch": 0.06645328456028045, + "grad_norm": 67.8633172345031, + "learning_rate": 4.434146341463415e-09, + "logits/chosen": 0.14236405491828918, + "logits/rejected": 0.13578234612941742, + "logps/chosen": -37.31237030029297, + "logps/rejected": -42.171234130859375, + "loss": 0.6926, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0012849331833422184, + "rewards/margins": 0.002069628331810236, + "rewards/rejected": -0.0007846951484680176, + "step": 109 + }, + { + "epoch": 0.06706294772138394, + "grad_norm": 97.09783574054993, + "learning_rate": 4.478048780487805e-09, + "logits/chosen": 0.023844445124268532, + "logits/rejected": 0.20791736245155334, + "logps/chosen": -126.77980041503906, + "logps/rejected": -124.97605895996094, + "loss": 0.6938, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0018016814719885588, + "rewards/margins": -0.00039469008333981037, + "rewards/rejected": 0.002196371555328369, + "step": 110 + }, + { + "epoch": 0.06767261088248742, + "grad_norm": 87.92196146928691, + "learning_rate": 4.521951219512195e-09, + "logits/chosen": 0.23804506659507751, + "logits/rejected": 0.1031482145190239, + "logps/chosen": -327.8410949707031, + "logps/rejected": -213.29132080078125, + "loss": 0.6923, + 
"rewards/accuracies": 0.75, + "rewards/chosen": 0.005219769664108753, + "rewards/margins": 0.004817938432097435, + "rewards/rejected": 0.00040183070814237, + "step": 111 + }, + { + "epoch": 0.06828227404359091, + "grad_norm": 66.36502679238257, + "learning_rate": 4.565853658536585e-09, + "logits/chosen": 0.0046591609716415405, + "logits/rejected": 0.016852840781211853, + "logps/chosen": -365.377685546875, + "logps/rejected": -377.4510498046875, + "loss": 0.6908, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0113862045109272, + "rewards/margins": 0.01930229738354683, + "rewards/rejected": -0.007916092872619629, + "step": 112 + }, + { + "epoch": 0.0688919372046944, + "grad_norm": 75.66303479659113, + "learning_rate": 4.609756097560976e-09, + "logits/chosen": 0.12720055878162384, + "logits/rejected": -0.09696392714977264, + "logps/chosen": -106.36075592041016, + "logps/rejected": -150.9593505859375, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.0023461461532860994, + "rewards/margins": -0.008014858700335026, + "rewards/rejected": 0.005668711848556995, + "step": 113 + }, + { + "epoch": 0.0695016003657979, + "grad_norm": 67.06432628491909, + "learning_rate": 4.653658536585366e-09, + "logits/chosen": 0.08921600133180618, + "logits/rejected": 0.09431330859661102, + "logps/chosen": -202.0355987548828, + "logps/rejected": -112.927734375, + "loss": 0.6928, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.001555365277454257, + "rewards/margins": -0.0014966904418542981, + "rewards/rejected": -5.86748355999589e-05, + "step": 114 + }, + { + "epoch": 0.07011126352690139, + "grad_norm": 64.48741151262199, + "learning_rate": 4.697560975609756e-09, + "logits/chosen": -0.1840996891260147, + "logits/rejected": 0.4118211567401886, + "logps/chosen": -155.63963317871094, + "logps/rejected": -120.42257690429688, + "loss": 0.6925, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.002329915761947632, + "rewards/margins": -0.00041795382276177406, + "rewards/rejected": 0.002747869584709406, + "step": 115 + }, + { + "epoch": 0.07072092668800488, + "grad_norm": 80.92793766737928, + "learning_rate": 4.741463414634147e-09, + "logits/chosen": 0.08726423978805542, + "logits/rejected": 0.03326858952641487, + "logps/chosen": -66.10111236572266, + "logps/rejected": -58.67345428466797, + "loss": 0.6932, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0038454767782241106, + "rewards/margins": 0.0025439260061830282, + "rewards/rejected": 0.0013015507720410824, + "step": 116 + }, + { + "epoch": 0.07133058984910837, + "grad_norm": 76.67413964771319, + "learning_rate": 4.785365853658537e-09, + "logits/chosen": 0.10839516669511795, + "logits/rejected": 0.38721898198127747, + "logps/chosen": -270.4200134277344, + "logps/rejected": -179.22666931152344, + "loss": 0.6933, + "rewards/accuracies": 0.5, + "rewards/chosen": 8.792884182184935e-05, + "rewards/margins": 0.005356276407837868, + "rewards/rejected": -0.0052683474496006966, + "step": 117 + }, + { + "epoch": 0.07194025301021185, + "grad_norm": 73.66752467833105, + "learning_rate": 4.829268292682927e-09, + "logits/chosen": 0.2082112729549408, + "logits/rejected": 0.151906818151474, + "logps/chosen": -99.91865539550781, + "logps/rejected": -81.91474914550781, + "loss": 0.6928, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0017058372031897306, + "rewards/margins": 0.0025429727975279093, + "rewards/rejected": -0.000837135361507535, + "step": 118 + }, + { + "epoch": 0.07254991617131534, + "grad_norm": 77.3103028937938, + "learning_rate": 
4.8731707317073175e-09, + "logits/chosen": 0.08812177926301956, + "logits/rejected": 0.5891497731208801, + "logps/chosen": -364.0269470214844, + "logps/rejected": -34.905548095703125, + "loss": 0.6923, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.00534291286021471, + "rewards/margins": -0.004794662352651358, + "rewards/rejected": -0.000548250915016979, + "step": 119 + }, + { + "epoch": 0.07315957933241884, + "grad_norm": 77.47933217400289, + "learning_rate": 4.9170731707317075e-09, + "logits/chosen": 0.050755493342876434, + "logits/rejected": 0.010417714715003967, + "logps/chosen": -141.3895721435547, + "logps/rejected": -132.2732391357422, + "loss": 0.6932, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.000584030116442591, + "rewards/margins": -0.00037059775786474347, + "rewards/rejected": -0.00021343230037018657, + "step": 120 + }, + { + "epoch": 0.07376924249352233, + "grad_norm": 74.72684436968582, + "learning_rate": 4.9609756097560976e-09, + "logits/chosen": -0.2643606960773468, + "logits/rejected": 0.16526812314987183, + "logps/chosen": -260.1291198730469, + "logps/rejected": -121.6513442993164, + "loss": 0.6925, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.004811191465705633, + "rewards/margins": 0.006060314364731312, + "rewards/rejected": -0.001249122666195035, + "step": 121 + }, + { + "epoch": 0.07437890565462582, + "grad_norm": 82.1357479983591, + "learning_rate": 5.004878048780488e-09, + "logits/chosen": 0.14291734993457794, + "logits/rejected": 0.057670608162879944, + "logps/chosen": -197.93487548828125, + "logps/rejected": -234.44915771484375, + "loss": 0.6928, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0030743121169507504, + "rewards/margins": -0.0021574492566287518, + "rewards/rejected": -0.0009168625110760331, + "step": 122 + }, + { + "epoch": 0.07498856881572931, + "grad_norm": 68.83068517187307, + "learning_rate": 5.0487804878048785e-09, + "logits/chosen": 0.2741588056087494, + "logits/rejected": 0.31478196382522583, + "logps/chosen": -116.69654083251953, + "logps/rejected": -53.103763580322266, + "loss": 0.6943, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0025397776626050472, + "rewards/margins": -0.0028411983512341976, + "rewards/rejected": 0.0003014207468368113, + "step": 123 + }, + { + "epoch": 0.0755982319768328, + "grad_norm": 73.28815100450528, + "learning_rate": 5.092682926829268e-09, + "logits/chosen": 0.20874908566474915, + "logits/rejected": 0.17765402793884277, + "logps/chosen": -114.76847839355469, + "logps/rejected": -147.07997131347656, + "loss": 0.6928, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0049163103103637695, + "rewards/margins": -0.0006764174322597682, + "rewards/rejected": 0.005592728033661842, + "step": 124 + }, + { + "epoch": 0.07620789513793629, + "grad_norm": 65.55323005492332, + "learning_rate": 5.136585365853658e-09, + "logits/chosen": 0.33819684386253357, + "logits/rejected": 0.3356032371520996, + "logps/chosen": -100.74069213867188, + "logps/rejected": -189.4708709716797, + "loss": 0.6937, + "rewards/accuracies": 0.5, + "rewards/chosen": 1.3112439773976803e-06, + "rewards/margins": -0.0010233878856524825, + "rewards/rejected": 0.001024699187837541, + "step": 125 + }, + { + "epoch": 0.07681755829903979, + "grad_norm": 70.86724977666721, + "learning_rate": 5.180487804878048e-09, + "logits/chosen": 0.0392058864235878, + "logits/rejected": 0.03747990354895592, + "logps/chosen": -296.35845947265625, + "logps/rejected": -205.3852081298828, + "loss": 0.692, + "rewards/accuracies": 0.5, + 
"rewards/chosen": -0.0004379808669909835, + "rewards/margins": -0.0004094182513654232, + "rewards/rejected": -2.8562499210238457e-05, + "step": 126 + }, + { + "epoch": 0.07742722146014328, + "grad_norm": 65.61722606001547, + "learning_rate": 5.224390243902439e-09, + "logits/chosen": 0.04799790680408478, + "logits/rejected": 0.09802756458520889, + "logps/chosen": -101.85343933105469, + "logps/rejected": -75.31245422363281, + "loss": 0.6929, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.001424169517122209, + "rewards/margins": -0.000865292502567172, + "rewards/rejected": -0.0005588769563473761, + "step": 127 + }, + { + "epoch": 0.07803688462124676, + "grad_norm": 68.60395561720807, + "learning_rate": 5.268292682926829e-09, + "logits/chosen": 0.017364241182804108, + "logits/rejected": 0.03964319825172424, + "logps/chosen": -135.31761169433594, + "logps/rejected": -177.23977661132812, + "loss": 0.6933, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.002281582448631525, + "rewards/margins": -0.0005889536114409566, + "rewards/rejected": -0.0016926288371905684, + "step": 128 + }, + { + "epoch": 0.07864654778235025, + "grad_norm": 65.65690014360091, + "learning_rate": 5.312195121951219e-09, + "logits/chosen": -0.0188586488366127, + "logits/rejected": 0.1370946168899536, + "logps/chosen": -174.27993774414062, + "logps/rejected": -133.8212890625, + "loss": 0.6928, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.002814221428707242, + "rewards/margins": 0.00708196172490716, + "rewards/rejected": -0.004267740063369274, + "step": 129 + }, + { + "epoch": 0.07925621094345374, + "grad_norm": 74.75775309701856, + "learning_rate": 5.3560975609756095e-09, + "logits/chosen": 0.14698414504528046, + "logits/rejected": 0.13047370314598083, + "logps/chosen": -182.12315368652344, + "logps/rejected": -196.8042449951172, + "loss": 0.693, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0017709494568407536, + "rewards/margins": 0.0008408309076912701, + "rewards/rejected": 0.0009301184909418225, + "step": 130 + }, + { + "epoch": 0.07986587410455723, + "grad_norm": 72.45493881357304, + "learning_rate": 5.3999999999999996e-09, + "logits/chosen": 0.010124213993549347, + "logits/rejected": 0.11211246252059937, + "logps/chosen": -154.32940673828125, + "logps/rejected": -111.28329467773438, + "loss": 0.6927, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.00022742748842574656, + "rewards/margins": 0.0015665411483496428, + "rewards/rejected": -0.0017939688405022025, + "step": 131 + }, + { + "epoch": 0.08047553726566073, + "grad_norm": 82.5594881924206, + "learning_rate": 5.4439024390243896e-09, + "logits/chosen": 0.394523561000824, + "logits/rejected": 0.3098101019859314, + "logps/chosen": -58.86858367919922, + "logps/rejected": -73.11603546142578, + "loss": 0.6916, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.0026440678630024195, + "rewards/margins": 0.000991898705251515, + "rewards/rejected": 0.0016521692741662264, + "step": 132 + }, + { + "epoch": 0.08108520042676422, + "grad_norm": 73.74422964095989, + "learning_rate": 5.4878048780487804e-09, + "logits/chosen": -0.2440057098865509, + "logits/rejected": 0.08008784800767899, + "logps/chosen": -359.561279296875, + "logps/rejected": -144.55764770507812, + "loss": 0.6928, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0005802393425256014, + "rewards/margins": -0.0014970778720453382, + "rewards/rejected": 0.0009168386459350586, + "step": 133 + }, + { + "epoch": 0.0816948635878677, + "grad_norm": 77.62821526350095, + "learning_rate": 
5.5317073170731705e-09, + "logits/chosen": 0.06320123374462128, + "logits/rejected": 0.08124424517154694, + "logps/chosen": -185.04531860351562, + "logps/rejected": -171.1767120361328, + "loss": 0.6912, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.004624283406883478, + "rewards/margins": 0.00328406086191535, + "rewards/rejected": 0.0013402223121374846, + "step": 134 + }, + { + "epoch": 0.0823045267489712, + "grad_norm": 63.982325955375515, + "learning_rate": 5.5756097560975605e-09, + "logits/chosen": 0.12405374646186829, + "logits/rejected": 0.23707516491413116, + "logps/chosen": -147.10157775878906, + "logps/rejected": -37.86238098144531, + "loss": 0.6935, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0051132082007825375, + "rewards/margins": 0.0039332155138254166, + "rewards/rejected": 0.0011799931526184082, + "step": 135 + }, + { + "epoch": 0.08291418991007468, + "grad_norm": 67.87599102837838, + "learning_rate": 5.6195121951219505e-09, + "logits/chosen": 0.3629865348339081, + "logits/rejected": 0.41198331117630005, + "logps/chosen": -37.38160705566406, + "logps/rejected": -61.17850875854492, + "loss": 0.6925, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.0010821342002600431, + "rewards/margins": -0.002665102481842041, + "rewards/rejected": 0.001582968165166676, + "step": 136 + }, + { + "epoch": 0.08352385307117817, + "grad_norm": 73.39680498164013, + "learning_rate": 5.663414634146341e-09, + "logits/chosen": -0.25749722123146057, + "logits/rejected": -0.06635760515928268, + "logps/chosen": -158.20542907714844, + "logps/rejected": -147.95916748046875, + "loss": 0.6926, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.0005931377527303994, + "rewards/margins": -0.0009785653091967106, + "rewards/rejected": 0.0003854274982586503, + "step": 137 + }, + { + "epoch": 0.08413351623228166, + "grad_norm": 74.89745405053004, + "learning_rate": 5.7073170731707314e-09, + "logits/chosen": 0.10234642773866653, + "logits/rejected": 0.13057830929756165, + "logps/chosen": -190.80194091796875, + "logps/rejected": -184.14112854003906, + "loss": 0.6931, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.006725311279296875, + "rewards/margins": 0.00970449484884739, + "rewards/rejected": -0.002979183103889227, + "step": 138 + }, + { + "epoch": 0.08474317939338516, + "grad_norm": 71.86528735607519, + "learning_rate": 5.7512195121951215e-09, + "logits/chosen": 0.18370205163955688, + "logits/rejected": 0.4245617389678955, + "logps/chosen": -107.59567260742188, + "logps/rejected": -62.562522888183594, + "loss": 0.6935, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0008191107772290707, + "rewards/margins": -0.002900493098422885, + "rewards/rejected": 0.0037196041084825993, + "step": 139 + }, + { + "epoch": 0.08535284255448865, + "grad_norm": 67.80064759232623, + "learning_rate": 5.795121951219512e-09, + "logits/chosen": 0.1179521232843399, + "logits/rejected": 0.17211908102035522, + "logps/chosen": -76.15290832519531, + "logps/rejected": -69.85437774658203, + "loss": 0.6916, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.00045137398410588503, + "rewards/margins": 0.0019128681160509586, + "rewards/rejected": -0.0014614940155297518, + "step": 140 + }, + { + "epoch": 0.08596250571559214, + "grad_norm": 78.07775502978046, + "learning_rate": 5.839024390243902e-09, + "logits/chosen": 0.20504337549209595, + "logits/rejected": -0.07019396126270294, + "logps/chosen": -110.20990753173828, + "logps/rejected": -154.95419311523438, + "loss": 0.693, + "rewards/accuracies": 0.75, + 
"rewards/chosen": -0.0011904718121513724, + "rewards/margins": 0.004687226377427578, + "rewards/rejected": -0.005877697840332985, + "step": 141 + }, + { + "epoch": 0.08657216887669562, + "grad_norm": 70.51385967841539, + "learning_rate": 5.882926829268292e-09, + "logits/chosen": 0.002869613468647003, + "logits/rejected": -0.0603582039475441, + "logps/chosen": -99.5188980102539, + "logps/rejected": -124.32498931884766, + "loss": 0.6922, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0024773627519607544, + "rewards/margins": 0.003924766089767218, + "rewards/rejected": -0.0014474033378064632, + "step": 142 + }, + { + "epoch": 0.08718183203779911, + "grad_norm": 82.46505649910281, + "learning_rate": 5.926829268292683e-09, + "logits/chosen": 0.196449413895607, + "logits/rejected": 0.10419797152280807, + "logps/chosen": -182.15365600585938, + "logps/rejected": -239.742919921875, + "loss": 0.6928, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0011578560806810856, + "rewards/margins": 0.0012749790912494063, + "rewards/rejected": -0.0024328352883458138, + "step": 143 + }, + { + "epoch": 0.0877914951989026, + "grad_norm": 69.5667664808787, + "learning_rate": 5.970731707317073e-09, + "logits/chosen": -0.04383482411503792, + "logits/rejected": 0.2302844524383545, + "logps/chosen": -173.1061553955078, + "logps/rejected": -50.136810302734375, + "loss": 0.6926, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.0038678408600389957, + "rewards/margins": -0.0033862111158668995, + "rewards/rejected": -0.00048162939492613077, + "step": 144 + }, + { + "epoch": 0.0884011583600061, + "grad_norm": 66.42758158814961, + "learning_rate": 6.014634146341463e-09, + "logits/chosen": 0.016252242028713226, + "logits/rejected": 0.3309721052646637, + "logps/chosen": -119.42570495605469, + "logps/rejected": -68.9472885131836, + "loss": 0.6928, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0029084503185003996, + "rewards/margins": 0.0022307662293314934, + "rewards/rejected": 0.0006776839727535844, + "step": 145 + }, + { + "epoch": 0.08901082152110959, + "grad_norm": 70.44646364581894, + "learning_rate": 6.058536585365853e-09, + "logits/chosen": 0.46475750207901, + "logits/rejected": 0.8950963616371155, + "logps/chosen": -223.4705810546875, + "logps/rejected": -428.9252624511719, + "loss": 0.6933, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.00443327147513628, + "rewards/margins": -0.006467171013355255, + "rewards/rejected": 0.0020338997710496187, + "step": 146 + }, + { + "epoch": 0.08962048468221308, + "grad_norm": 58.04813314381115, + "learning_rate": 6.102439024390244e-09, + "logits/chosen": 0.05219703167676926, + "logits/rejected": 0.3147392272949219, + "logps/chosen": -242.02186584472656, + "logps/rejected": -111.68846130371094, + "loss": 0.6933, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.001748537877574563, + "rewards/margins": -0.0019744159653782845, + "rewards/rejected": 0.003722954075783491, + "step": 147 + }, + { + "epoch": 0.09023014784331657, + "grad_norm": 60.55357991712482, + "learning_rate": 6.146341463414634e-09, + "logits/chosen": 0.22557038068771362, + "logits/rejected": 0.08623065799474716, + "logps/chosen": -142.86192321777344, + "logps/rejected": -235.4132080078125, + "loss": 0.6924, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0005093335639685392, + "rewards/margins": -0.0008229495724663138, + "rewards/rejected": 0.0013322830200195312, + "step": 148 + }, + { + "epoch": 0.09083981100442005, + "grad_norm": 72.04225145996718, + "learning_rate": 6.1902439024390234e-09, 
+ "logits/chosen": 0.1499841958284378, + "logits/rejected": 0.23211584985256195, + "logps/chosen": -178.3646697998047, + "logps/rejected": -113.13501739501953, + "loss": 0.6925, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.003059941343963146, + "rewards/margins": 0.003623262047767639, + "rewards/rejected": -0.0005633204709738493, + "step": 149 + }, + { + "epoch": 0.09144947416552354, + "grad_norm": 78.80313031497876, + "learning_rate": 6.2341463414634135e-09, + "logits/chosen": -0.026065614074468613, + "logits/rejected": -0.22232206165790558, + "logps/chosen": -144.16940307617188, + "logps/rejected": -271.5802917480469, + "loss": 0.6944, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.001899433322250843, + "rewards/margins": -0.0005844119004905224, + "rewards/rejected": -0.0013150214217603207, + "step": 150 + }, + { + "epoch": 0.09205913732662704, + "grad_norm": 69.56302623595346, + "learning_rate": 6.278048780487804e-09, + "logits/chosen": 0.43501123785972595, + "logits/rejected": 0.41309887170791626, + "logps/chosen": -138.13449096679688, + "logps/rejected": -198.89093017578125, + "loss": 0.6915, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.005390692036598921, + "rewards/margins": 0.00937967374920845, + "rewards/rejected": -0.003988981246948242, + "step": 151 + }, + { + "epoch": 0.09266880048773053, + "grad_norm": 72.69477403333501, + "learning_rate": 6.321951219512194e-09, + "logits/chosen": 0.011581763625144958, + "logits/rejected": 0.02895890176296234, + "logps/chosen": -29.926578521728516, + "logps/rejected": -13.040694236755371, + "loss": 0.693, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.004392695613205433, + "rewards/margins": -0.004276537802070379, + "rewards/rejected": -0.00011615746188908815, + "step": 152 + }, + { + "epoch": 0.09327846364883402, + "grad_norm": 69.08178656192919, + "learning_rate": 6.365853658536584e-09, + "logits/chosen": -0.04695054888725281, + "logits/rejected": -0.03760403022170067, + "logps/chosen": -139.91473388671875, + "logps/rejected": -96.32427215576172, + "loss": 0.6914, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.00232369895093143, + "rewards/margins": -0.0003409802448004484, + "rewards/rejected": 0.0026646791957318783, + "step": 153 + }, + { + "epoch": 0.09388812680993751, + "grad_norm": 61.070245240120485, + "learning_rate": 6.409756097560975e-09, + "logits/chosen": 0.004546787589788437, + "logits/rejected": -0.0730147734284401, + "logps/chosen": -210.40216064453125, + "logps/rejected": -171.3146209716797, + "loss": 0.6939, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.002398204756900668, + "rewards/margins": -0.0008605123148299754, + "rewards/rejected": 0.0032587170135229826, + "step": 154 + }, + { + "epoch": 0.094497789971041, + "grad_norm": 99.83659075422874, + "learning_rate": 6.453658536585365e-09, + "logits/chosen": 0.11244277656078339, + "logits/rejected": 0.44888627529144287, + "logps/chosen": -33.60845184326172, + "logps/rejected": -41.175819396972656, + "loss": 0.6934, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0019771575462073088, + "rewards/margins": -5.798344500362873e-05, + "rewards/rejected": 0.0020351409912109375, + "step": 155 + }, + { + "epoch": 0.09510745313214448, + "grad_norm": 62.447112753666794, + "learning_rate": 6.497560975609755e-09, + "logits/chosen": -0.13742254674434662, + "logits/rejected": 0.10493803769350052, + "logps/chosen": -252.29299926757812, + "logps/rejected": -146.0027313232422, + "loss": 0.6929, + "rewards/accuracies": 0.25, + "rewards/chosen": 
0.0014365673996508121, + "rewards/margins": -0.0029184818267822266, + "rewards/rejected": 0.004355049226433039, + "step": 156 + }, + { + "epoch": 0.09571711629324799, + "grad_norm": 70.99603983614955, + "learning_rate": 6.541463414634146e-09, + "logits/chosen": -0.005733788013458252, + "logits/rejected": 0.255418598651886, + "logps/chosen": -129.1565399169922, + "logps/rejected": -92.6162109375, + "loss": 0.6942, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.000543901405762881, + "rewards/margins": -0.0035296266432851553, + "rewards/rejected": 0.0029857249464839697, + "step": 157 + }, + { + "epoch": 0.09632677945435147, + "grad_norm": 74.24357293276846, + "learning_rate": 6.585365853658536e-09, + "logits/chosen": -0.08054069429636002, + "logits/rejected": 0.06550734490156174, + "logps/chosen": -320.4570007324219, + "logps/rejected": -186.04795837402344, + "loss": 0.6934, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.002433204557746649, + "rewards/margins": 0.0002149580977857113, + "rewards/rejected": 0.0022182464599609375, + "step": 158 + }, + { + "epoch": 0.09693644261545496, + "grad_norm": 81.33426491489708, + "learning_rate": 6.629268292682926e-09, + "logits/chosen": 0.1674249768257141, + "logits/rejected": 0.13584084808826447, + "logps/chosen": -269.9678955078125, + "logps/rejected": -225.60061645507812, + "loss": 0.6926, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.007037067785859108, + "rewards/margins": 0.006308174692094326, + "rewards/rejected": 0.000728893093764782, + "step": 159 + }, + { + "epoch": 0.09754610577655845, + "grad_norm": 59.87214960114537, + "learning_rate": 6.673170731707316e-09, + "logits/chosen": 0.1393468827009201, + "logits/rejected": 0.12032058835029602, + "logps/chosen": -60.78736877441406, + "logps/rejected": -65.82872009277344, + "loss": 0.6937, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.002607786562293768, + "rewards/margins": 0.0034523606300354004, + "rewards/rejected": -0.0008445740677416325, + "step": 160 + }, + { + "epoch": 0.09815576893766194, + "grad_norm": 85.93623654558834, + "learning_rate": 6.717073170731707e-09, + "logits/chosen": 0.19709917902946472, + "logits/rejected": 0.25526177883148193, + "logps/chosen": -57.56209182739258, + "logps/rejected": -48.53240966796875, + "loss": 0.6917, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.0026808977127075195, + "rewards/margins": -0.0017124293372035027, + "rewards/rejected": 0.004393327049911022, + "step": 161 + }, + { + "epoch": 0.09876543209876543, + "grad_norm": 67.99786555755102, + "learning_rate": 6.760975609756097e-09, + "logits/chosen": 0.028237100690603256, + "logits/rejected": 0.14847011864185333, + "logps/chosen": -177.81239318847656, + "logps/rejected": -161.88641357421875, + "loss": 0.6927, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.001543379039503634, + "rewards/margins": 0.0066079264506697655, + "rewards/rejected": -0.005064547061920166, + "step": 162 + }, + { + "epoch": 0.09937509525986893, + "grad_norm": 78.68252640793447, + "learning_rate": 6.804878048780487e-09, + "logits/chosen": 0.19014248251914978, + "logits/rejected": 0.034527841955423355, + "logps/chosen": -70.37176513671875, + "logps/rejected": -87.45320892333984, + "loss": 0.6906, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.004232907667756081, + "rewards/margins": 0.0012796358205378056, + "rewards/rejected": 0.0029532716143876314, + "step": 163 + }, + { + "epoch": 0.09998475842097242, + "grad_norm": 69.75392929489868, + "learning_rate": 6.848780487804878e-09, + "logits/chosen": 
0.07758036255836487, + "logits/rejected": 0.15910190343856812, + "logps/chosen": -259.58648681640625, + "logps/rejected": -281.25311279296875, + "loss": 0.6933, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.0008043288835324347, + "rewards/margins": -0.007910936139523983, + "rewards/rejected": 0.008715265430510044, + "step": 164 + }, + { + "epoch": 0.1005944215820759, + "grad_norm": 80.0527512833749, + "learning_rate": 6.892682926829268e-09, + "logits/chosen": -0.204082190990448, + "logits/rejected": 0.2399766445159912, + "logps/chosen": -86.7333984375, + "logps/rejected": -35.39960479736328, + "loss": 0.6913, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.001207971596159041, + "rewards/margins": 0.0016857744194567204, + "rewards/rejected": -0.0004778027650900185, + "step": 165 + }, + { + "epoch": 0.10120408474317939, + "grad_norm": 79.6637012053017, + "learning_rate": 6.936585365853658e-09, + "logits/chosen": 0.17938564717769623, + "logits/rejected": 0.24092160165309906, + "logps/chosen": -99.06732177734375, + "logps/rejected": -89.8432846069336, + "loss": 0.6927, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.001825571060180664, + "rewards/margins": 0.0024854838848114014, + "rewards/rejected": -0.0006599128828383982, + "step": 166 + }, + { + "epoch": 0.10181374790428288, + "grad_norm": 72.41252861114882, + "learning_rate": 6.980487804878049e-09, + "logits/chosen": 0.023349490016698837, + "logits/rejected": 0.1881551891565323, + "logps/chosen": -118.31248474121094, + "logps/rejected": -98.12593078613281, + "loss": 0.6915, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.002156746806576848, + "rewards/margins": -0.0027242780197411776, + "rewards/rejected": 0.004881024360656738, + "step": 167 + }, + { + "epoch": 0.10242341106538637, + "grad_norm": 68.56858420299291, + "learning_rate": 7.024390243902439e-09, + "logits/chosen": 0.016032271087169647, + "logits/rejected": 0.11249391734600067, + "logps/chosen": -323.5757141113281, + "logps/rejected": -123.6161880493164, + "loss": 0.6923, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.003967964556068182, + "rewards/margins": -0.0001054612803272903, + "rewards/rejected": -0.003862503683194518, + "step": 168 + }, + { + "epoch": 0.10303307422648987, + "grad_norm": 81.05255215869212, + "learning_rate": 7.068292682926829e-09, + "logits/chosen": 0.08069111406803131, + "logits/rejected": -0.07778602838516235, + "logps/chosen": -168.92613220214844, + "logps/rejected": -252.23606872558594, + "loss": 0.6924, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0032866718247532845, + "rewards/margins": 0.005971884820610285, + "rewards/rejected": -0.0026852129958570004, + "step": 169 + }, + { + "epoch": 0.10364273738759336, + "grad_norm": 81.7644402967527, + "learning_rate": 7.112195121951219e-09, + "logits/chosen": 0.31781864166259766, + "logits/rejected": 0.14330296218395233, + "logps/chosen": -367.62811279296875, + "logps/rejected": -239.89463806152344, + "loss": 0.6895, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.013696718961000443, + "rewards/margins": 0.01696225255727768, + "rewards/rejected": -0.003265536157414317, + "step": 170 + }, + { + "epoch": 0.10425240054869685, + "grad_norm": 67.91609096551838, + "learning_rate": 7.15609756097561e-09, + "logits/chosen": 0.18671970069408417, + "logits/rejected": 0.16253669559955597, + "logps/chosen": -6.201991081237793, + "logps/rejected": -20.30820083618164, + "loss": 0.6919, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.002145099686458707, + "rewards/margins": 
-0.0015753626357764006, + "rewards/rejected": -0.0005697369924746454, + "step": 171 + }, + { + "epoch": 0.10486206370980033, + "grad_norm": 78.16797384812574, + "learning_rate": 7.2e-09, + "logits/chosen": 0.176150843501091, + "logits/rejected": 0.14651542901992798, + "logps/chosen": -247.4420166015625, + "logps/rejected": -293.2618713378906, + "loss": 0.6919, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.008729219436645508, + "rewards/margins": 0.007733630947768688, + "rewards/rejected": 0.000995588256046176, + "step": 172 + }, + { + "epoch": 0.10547172687090382, + "grad_norm": 57.67765206609665, + "learning_rate": 7.24390243902439e-09, + "logits/chosen": 0.18694573640823364, + "logits/rejected": 0.30927199125289917, + "logps/chosen": -194.29551696777344, + "logps/rejected": -198.8756561279297, + "loss": 0.6924, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.001628529978916049, + "rewards/margins": -0.0044615985825657845, + "rewards/rejected": 0.002833068370819092, + "step": 173 + }, + { + "epoch": 0.10608139003200731, + "grad_norm": 80.57272427404622, + "learning_rate": 7.287804878048781e-09, + "logits/chosen": -0.45078766345977783, + "logits/rejected": 0.4929141104221344, + "logps/chosen": -458.40496826171875, + "logps/rejected": -97.18772888183594, + "loss": 0.6906, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.008907699957489967, + "rewards/margins": 0.009705161675810814, + "rewards/rejected": -0.0007974625332280993, + "step": 174 + }, + { + "epoch": 0.10669105319311081, + "grad_norm": 80.57272427404622, + "learning_rate": 7.287804878048781e-09, + "logits/chosen": 0.12999090552330017, + "logits/rejected": -0.07178585976362228, + "logps/chosen": -126.14864349365234, + "logps/rejected": -138.47508239746094, + "loss": 0.6942, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.010566807352006435, + "rewards/margins": 0.009783018380403519, + "rewards/rejected": 0.0007837892626412213, + "step": 175 + }, + { + "epoch": 0.1073007163542143, + "grad_norm": 82.43150494799481, + "learning_rate": 7.33170731707317e-09, + "logits/chosen": 0.1309414952993393, + "logits/rejected": 0.15505224466323853, + "logps/chosen": -155.169677734375, + "logps/rejected": -143.5417022705078, + "loss": 0.6909, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.013601267710328102, + "rewards/margins": 0.007817518897354603, + "rewards/rejected": 0.005783748812973499, + "step": 176 + }, + { + "epoch": 0.10791037951531779, + "grad_norm": 88.0310517626559, + "learning_rate": 7.37560975609756e-09, + "logits/chosen": 0.1322554349899292, + "logits/rejected": 0.28939059376716614, + "logps/chosen": -421.849609375, + "logps/rejected": -345.86822509765625, + "loss": 0.6923, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.01242215745151043, + "rewards/margins": 0.003427768126130104, + "rewards/rejected": 0.00899438839405775, + "step": 177 + }, + { + "epoch": 0.10852004267642128, + "grad_norm": 92.50555215779028, + "learning_rate": 7.41951219512195e-09, + "logits/chosen": 0.20377972722053528, + "logits/rejected": 0.1981021612882614, + "logps/chosen": -100.24634552001953, + "logps/rejected": -227.90379333496094, + "loss": 0.6928, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0025395273696631193, + "rewards/margins": 0.005182635970413685, + "rewards/rejected": -0.002643108367919922, + "step": 178 + }, + { + "epoch": 0.10912970583752477, + "grad_norm": 70.63263622932259, + "learning_rate": 7.46341463414634e-09, + "logits/chosen": 0.09560803323984146, + "logits/rejected": -0.16471828520298004, + 
"logps/chosen": -184.5498046875, + "logps/rejected": -421.5089111328125, + "loss": 0.6932, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0006469726795330644, + "rewards/margins": 0.002256846521049738, + "rewards/rejected": -0.0016098737251013517, + "step": 179 + }, + { + "epoch": 0.10973936899862825, + "grad_norm": 81.22199051486554, + "learning_rate": 7.50731707317073e-09, + "logits/chosen": 0.05242873355746269, + "logits/rejected": 0.1468435525894165, + "logps/chosen": -474.50384521484375, + "logps/rejected": -392.1907043457031, + "loss": 0.6911, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0184478759765625, + "rewards/margins": 0.021483995020389557, + "rewards/rejected": -0.003036117646843195, + "step": 180 + }, + { + "epoch": 0.11034903215973174, + "grad_norm": 77.14185791783366, + "learning_rate": 7.551219512195122e-09, + "logits/chosen": 0.2575138807296753, + "logits/rejected": 0.28635162115097046, + "logps/chosen": -116.87586975097656, + "logps/rejected": -69.42999267578125, + "loss": 0.6917, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0032803399953991175, + "rewards/margins": 0.0046702162362635136, + "rewards/rejected": -0.0013898760080337524, + "step": 181 + }, + { + "epoch": 0.11095869532083524, + "grad_norm": 61.14794911738017, + "learning_rate": 7.595121951219512e-09, + "logits/chosen": 0.11099124699831009, + "logits/rejected": 0.13927927613258362, + "logps/chosen": -31.17013931274414, + "logps/rejected": -67.29956817626953, + "loss": 0.6942, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.0005793154705315828, + "rewards/margins": -0.0017756938468664885, + "rewards/rejected": 0.0023550093173980713, + "step": 182 + }, + { + "epoch": 0.11156835848193873, + "grad_norm": 68.58252525337726, + "learning_rate": 7.639024390243902e-09, + "logits/chosen": 0.20379912853240967, + "logits/rejected": 0.2304908186197281, + "logps/chosen": -44.79044723510742, + "logps/rejected": -54.05611038208008, + "loss": 0.6914, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.0029021562077105045, + "rewards/margins": -0.004501253366470337, + "rewards/rejected": 0.001599097391590476, + "step": 183 + }, + { + "epoch": 0.11217802164304222, + "grad_norm": 79.38044931815365, + "learning_rate": 7.682926829268292e-09, + "logits/chosen": 0.0941167026758194, + "logits/rejected": 0.15919072926044464, + "logps/chosen": -247.39495849609375, + "logps/rejected": -171.2981414794922, + "loss": 0.692, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.005813932977616787, + "rewards/margins": 0.006898021325469017, + "rewards/rejected": -0.0010840892791748047, + "step": 184 + }, + { + "epoch": 0.11278768480414571, + "grad_norm": 76.2796090328514, + "learning_rate": 7.726829268292682e-09, + "logits/chosen": 0.05305987223982811, + "logits/rejected": -0.08095654845237732, + "logps/chosen": -229.75924682617188, + "logps/rejected": -258.1326599121094, + "loss": 0.6911, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.01245895680040121, + "rewards/margins": 0.009998410940170288, + "rewards/rejected": 0.0024605453945696354, + "step": 185 + }, + { + "epoch": 0.1133973479652492, + "grad_norm": 63.95901673960713, + "learning_rate": 7.770731707317072e-09, + "logits/chosen": 0.19521519541740417, + "logits/rejected": 0.05974021553993225, + "logps/chosen": -266.0050048828125, + "logps/rejected": -517.4422607421875, + "loss": 0.6928, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.012186050415039062, + "rewards/margins": 0.0059299468994140625, + "rewards/rejected": 0.006256103515625, + "step": 186 + }, + 
{ + "epoch": 0.11400701112635268, + "grad_norm": 74.72782131859475, + "learning_rate": 7.814634146341464e-09, + "logits/chosen": -0.06481368094682693, + "logits/rejected": -0.05777443200349808, + "logps/chosen": -145.55892944335938, + "logps/rejected": -40.200660705566406, + "loss": 0.6903, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.003788316622376442, + "rewards/margins": 0.001457572216168046, + "rewards/rejected": 0.002330744406208396, + "step": 187 + }, + { + "epoch": 0.11461667428745619, + "grad_norm": 76.91636228373078, + "learning_rate": 7.858536585365854e-09, + "logits/chosen": 0.11966991424560547, + "logits/rejected": 0.14306171238422394, + "logps/chosen": -14.93128776550293, + "logps/rejected": -19.36440086364746, + "loss": 0.6906, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0010291219223290682, + "rewards/margins": -0.0011190533405169845, + "rewards/rejected": 8.99315346032381e-05, + "step": 188 + }, + { + "epoch": 0.11522633744855967, + "grad_norm": 57.596654535367165, + "learning_rate": 7.902439024390244e-09, + "logits/chosen": -0.074323371052742, + "logits/rejected": 0.05753961205482483, + "logps/chosen": -77.96446228027344, + "logps/rejected": -96.30802154541016, + "loss": 0.6936, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.002775573870167136, + "rewards/margins": -0.004966557025909424, + "rewards/rejected": 0.0021909833885729313, + "step": 189 + }, + { + "epoch": 0.11583600060966316, + "grad_norm": 83.12182513412321, + "learning_rate": 7.946341463414634e-09, + "logits/chosen": 0.26109641790390015, + "logits/rejected": 0.2574569284915924, + "logps/chosen": -282.251953125, + "logps/rejected": -298.4128723144531, + "loss": 0.6912, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.018349360674619675, + "rewards/margins": 0.013233971782028675, + "rewards/rejected": 0.0051153902895748615, + "step": 190 + }, + { + "epoch": 0.11644566377076665, + "grad_norm": 77.9532709110735, + "learning_rate": 7.990243902439024e-09, + "logits/chosen": 0.3206827640533447, + "logits/rejected": 0.2922167181968689, + "logps/chosen": -110.56204223632812, + "logps/rejected": -211.07745361328125, + "loss": 0.6932, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0025666176807135344, + "rewards/margins": 0.0027544558979570866, + "rewards/rejected": -0.00018783810082823038, + "step": 191 + }, + { + "epoch": 0.11705532693187014, + "grad_norm": 74.79594478387207, + "learning_rate": 8.034146341463414e-09, + "logits/chosen": 0.1410955786705017, + "logits/rejected": -0.02253938466310501, + "logps/chosen": -267.8140869140625, + "logps/rejected": -266.12744140625, + "loss": 0.6908, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.012267494574189186, + "rewards/margins": 0.013858987018465996, + "rewards/rejected": -0.0015914919786155224, + "step": 192 + }, + { + "epoch": 0.11766499009297363, + "grad_norm": 78.49673136598805, + "learning_rate": 8.078048780487804e-09, + "logits/chosen": 0.21787187457084656, + "logits/rejected": 0.17934630811214447, + "logps/chosen": -239.90481567382812, + "logps/rejected": -274.919677734375, + "loss": 0.6905, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.01241397950798273, + "rewards/margins": 0.008371520787477493, + "rewards/rejected": 0.004042458720505238, + "step": 193 + }, + { + "epoch": 0.11827465325407713, + "grad_norm": 71.74342228269752, + "learning_rate": 8.121951219512196e-09, + "logits/chosen": 0.30999499559402466, + "logits/rejected": 0.3137747049331665, + "logps/chosen": -115.93299865722656, + "logps/rejected": -111.77952575683594, 
+ "loss": 0.6899, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.011038887314498425, + "rewards/margins": 0.012829184532165527, + "rewards/rejected": -0.0017902969848364592, + "step": 194 + }, + { + "epoch": 0.11888431641518062, + "grad_norm": 84.24460399939697, + "learning_rate": 8.165853658536586e-09, + "logits/chosen": -0.08364257961511612, + "logits/rejected": 0.2403935045003891, + "logps/chosen": -165.97039794921875, + "logps/rejected": -117.39863586425781, + "loss": 0.692, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.0038569211028516293, + "rewards/margins": 0.0012012722436338663, + "rewards/rejected": 0.0026556490920484066, + "step": 195 + }, + { + "epoch": 0.1194939795762841, + "grad_norm": 76.94193710343421, + "learning_rate": 8.209756097560976e-09, + "logits/chosen": -0.3118302822113037, + "logits/rejected": -0.17941252887248993, + "logps/chosen": -448.2669677734375, + "logps/rejected": -407.0946044921875, + "loss": 0.6923, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.00807888526469469, + "rewards/margins": -0.009395670145750046, + "rewards/rejected": 0.01747455634176731, + "step": 196 + }, + { + "epoch": 0.12010364273738759, + "grad_norm": 81.81840784374043, + "learning_rate": 8.253658536585366e-09, + "logits/chosen": -0.10127736628055573, + "logits/rejected": 0.16854047775268555, + "logps/chosen": -135.72747802734375, + "logps/rejected": -125.63229370117188, + "loss": 0.6913, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0014416694175451994, + "rewards/margins": -0.0011126695899292827, + "rewards/rejected": 0.002554339123889804, + "step": 197 + }, + { + "epoch": 0.12071330589849108, + "grad_norm": 72.36687920211662, + "learning_rate": 8.297560975609756e-09, + "logits/chosen": 0.22871333360671997, + "logits/rejected": 0.2734183371067047, + "logps/chosen": -226.57386779785156, + "logps/rejected": -166.44577026367188, + "loss": 0.6891, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.006101250648498535, + "rewards/margins": 0.008862042799592018, + "rewards/rejected": -0.0027607919182628393, + "step": 198 + }, + { + "epoch": 0.12132296905959457, + "grad_norm": 82.22642889906784, + "learning_rate": 8.341463414634146e-09, + "logits/chosen": 0.3314269781112671, + "logits/rejected": 0.36815717816352844, + "logps/chosen": -349.48284912109375, + "logps/rejected": -342.3211975097656, + "loss": 0.6906, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.019129181280732155, + "rewards/margins": 0.01226501539349556, + "rewards/rejected": 0.0068641663528978825, + "step": 199 + }, + { + "epoch": 0.12193263222069807, + "grad_norm": 72.9650328696923, + "learning_rate": 8.385365853658536e-09, + "logits/chosen": 0.03564765304327011, + "logits/rejected": 0.09759993851184845, + "logps/chosen": -160.033935546875, + "logps/rejected": -115.61434173583984, + "loss": 0.6903, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.011529779992997646, + "rewards/margins": 0.011023235507309437, + "rewards/rejected": 0.0005065440782345831, + "step": 200 + }, + { + "epoch": 0.12254229538180156, + "grad_norm": 64.22348688374149, + "learning_rate": 8.429268292682927e-09, + "logits/chosen": 0.14622417092323303, + "logits/rejected": 0.253497838973999, + "logps/chosen": -186.90524291992188, + "logps/rejected": -158.447265625, + "loss": 0.6908, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0100333783775568, + "rewards/margins": 0.013532849960029125, + "rewards/rejected": -0.0034994720481336117, + "step": 201 + }, + { + "epoch": 0.12315195854290505, + "grad_norm": 74.52374042440181, + 
"learning_rate": 8.473170731707316e-09, + "logits/chosen": 0.054381199181079865, + "logits/rejected": -0.04007922112941742, + "logps/chosen": -271.0592041015625, + "logps/rejected": -200.66111755371094, + "loss": 0.689, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.008767509832978249, + "rewards/margins": 0.008909749798476696, + "rewards/rejected": -0.00014224054757505655, + "step": 202 + }, + { + "epoch": 0.12376162170400853, + "grad_norm": 77.76424103790391, + "learning_rate": 8.517073170731706e-09, + "logits/chosen": -0.15442079305648804, + "logits/rejected": -0.15553969144821167, + "logps/chosen": -189.68994140625, + "logps/rejected": -283.78009033203125, + "loss": 0.6925, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0001678466214798391, + "rewards/margins": -0.0070205689407885075, + "rewards/rejected": 0.0068527222611010075, + "step": 203 + }, + { + "epoch": 0.12437128486511202, + "grad_norm": 80.96444954317272, + "learning_rate": 8.560975609756096e-09, + "logits/chosen": 0.201796293258667, + "logits/rejected": 0.5128307342529297, + "logps/chosen": -192.67794799804688, + "logps/rejected": -180.17579650878906, + "loss": 0.6937, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.012137127108871937, + "rewards/margins": 0.012270909734070301, + "rewards/rejected": -0.0001337828580290079, + "step": 204 + }, + { + "epoch": 0.12498094802621551, + "grad_norm": 75.66251081128826, + "learning_rate": 8.604878048780488e-09, + "logits/chosen": -0.023175127804279327, + "logits/rejected": 0.003958750516176224, + "logps/chosen": -184.97940063476562, + "logps/rejected": -67.05559539794922, + "loss": 0.6908, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.00750888604670763, + "rewards/margins": 0.0070923687890172005, + "rewards/rejected": 0.00041651714127510786, + "step": 205 + }, + { + "epoch": 0.125590611187319, + "grad_norm": 74.05343130345244, + "learning_rate": 8.648780487804878e-09, + "logits/chosen": 0.23187266290187836, + "logits/rejected": 0.21487364172935486, + "logps/chosen": -16.77553939819336, + "logps/rejected": -22.398555755615234, + "loss": 0.6917, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.00042227510130032897, + "rewards/margins": -0.0017372013535350561, + "rewards/rejected": 0.0013149260776117444, + "step": 206 + }, + { + "epoch": 0.1262002743484225, + "grad_norm": 75.21374984890747, + "learning_rate": 8.692682926829268e-09, + "logits/chosen": -0.06297703087329865, + "logits/rejected": 0.03414461761713028, + "logps/chosen": -398.39837646484375, + "logps/rejected": -352.8562927246094, + "loss": 0.6888, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.011148834601044655, + "rewards/margins": 0.014732742682099342, + "rewards/rejected": -0.003583908313885331, + "step": 207 + }, + { + "epoch": 0.126809937509526, + "grad_norm": 64.04142645281772, + "learning_rate": 8.736585365853658e-09, + "logits/chosen": 0.2422681748867035, + "logits/rejected": 0.12101413309574127, + "logps/chosen": -161.3976593017578, + "logps/rejected": -283.5793762207031, + "loss": 0.691, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.011684561148285866, + "rewards/margins": 0.00624473113566637, + "rewards/rejected": 0.005439830012619495, + "step": 208 + }, + { + "epoch": 0.1274196006706295, + "grad_norm": 62.59370826680618, + "learning_rate": 8.780487804878048e-09, + "logits/chosen": 0.21639201045036316, + "logits/rejected": 0.23975257575511932, + "logps/chosen": -257.2457275390625, + "logps/rejected": -187.59793090820312, + "loss": 0.6911, + "rewards/accuracies": 0.5, + 
"rewards/chosen": 0.010793114081025124, + "rewards/margins": 0.005155419930815697, + "rewards/rejected": 0.005637693218886852, + "step": 209 + }, + { + "epoch": 0.12802926383173296, + "grad_norm": 81.70767471490443, + "learning_rate": 8.824390243902438e-09, + "logits/chosen": -0.07190696895122528, + "logits/rejected": -0.13351590931415558, + "logps/chosen": -70.58195495605469, + "logps/rejected": -158.46754455566406, + "loss": 0.6899, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0009778023231774569, + "rewards/margins": -0.0018841506680473685, + "rewards/rejected": 0.0009063484612852335, + "step": 210 + }, + { + "epoch": 0.12863892699283647, + "grad_norm": 74.5787702478486, + "learning_rate": 8.86829268292683e-09, + "logits/chosen": 0.25175967812538147, + "logits/rejected": 0.2309994399547577, + "logps/chosen": -26.840103149414062, + "logps/rejected": -21.15106201171875, + "loss": 0.6914, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.005501949694007635, + "rewards/margins": 0.005997526925057173, + "rewards/rejected": -0.0004955768818035722, + "step": 211 + }, + { + "epoch": 0.12924859015393994, + "grad_norm": 73.11146783164249, + "learning_rate": 8.91219512195122e-09, + "logits/chosen": 0.033776264637708664, + "logits/rejected": 0.13960430026054382, + "logps/chosen": -218.73910522460938, + "logps/rejected": -81.34803771972656, + "loss": 0.6916, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.005093121901154518, + "rewards/margins": 0.0035371482372283936, + "rewards/rejected": 0.001555973314680159, + "step": 212 + }, + { + "epoch": 0.12985825331504344, + "grad_norm": 74.17960029850262, + "learning_rate": 8.95609756097561e-09, + "logits/chosen": 0.32470473647117615, + "logits/rejected": 0.3272430896759033, + "logps/chosen": -5.314748287200928, + "logps/rejected": -3.1400012969970703, + "loss": 0.692, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0008516312809661031, + "rewards/margins": 0.004156092181801796, + "rewards/rejected": -0.0033044605515897274, + "step": 213 + }, + { + "epoch": 0.13046791647614692, + "grad_norm": 72.54347247391986, + "learning_rate": 9e-09, + "logits/chosen": 0.2663123607635498, + "logits/rejected": 0.16501428186893463, + "logps/chosen": -150.55316162109375, + "logps/rejected": -95.2711410522461, + "loss": 0.6905, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.005064946599304676, + "rewards/margins": 0.001300519797950983, + "rewards/rejected": 0.0037644270341843367, + "step": 214 + }, + { + "epoch": 0.13107757963725042, + "grad_norm": 70.75458425042433, + "learning_rate": 9.04390243902439e-09, + "logits/chosen": 0.39246898889541626, + "logits/rejected": -0.19269554316997528, + "logps/chosen": -221.33596801757812, + "logps/rejected": -500.8580627441406, + "loss": 0.6915, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.018619585782289505, + "rewards/margins": 0.009124422445893288, + "rewards/rejected": 0.009495163336396217, + "step": 215 + }, + { + "epoch": 0.13168724279835392, + "grad_norm": 68.26030742798383, + "learning_rate": 9.08780487804878e-09, + "logits/chosen": -0.08046863973140717, + "logits/rejected": 0.18038855493068695, + "logps/chosen": -376.5609130859375, + "logps/rejected": -144.64633178710938, + "loss": 0.6928, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.015356696210801601, + "rewards/margins": 0.010357105173170567, + "rewards/rejected": 0.00499959010630846, + "step": 216 + }, + { + "epoch": 0.1322969059594574, + "grad_norm": 81.05019110481086, + "learning_rate": 9.13170731707317e-09, + "logits/chosen": 
0.08664529770612717, + "logits/rejected": 0.09168754518032074, + "logps/chosen": -91.68215942382812, + "logps/rejected": -56.12030029296875, + "loss": 0.6907, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.014924057759344578, + "rewards/margins": 0.012504622340202332, + "rewards/rejected": 0.0024194358848035336, + "step": 217 + }, + { + "epoch": 0.1329065691205609, + "grad_norm": 70.44096712910866, + "learning_rate": 9.175609756097561e-09, + "logits/chosen": 0.26090556383132935, + "logits/rejected": 0.33934906125068665, + "logps/chosen": -170.63369750976562, + "logps/rejected": -149.51715087890625, + "loss": 0.6889, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.010795742273330688, + "rewards/margins": 0.005506640300154686, + "rewards/rejected": 0.0052891019731760025, + "step": 218 + }, + { + "epoch": 0.13351623228166437, + "grad_norm": 70.2700617741892, + "learning_rate": 9.219512195121951e-09, + "logits/chosen": -0.08490590751171112, + "logits/rejected": -0.20087462663650513, + "logps/chosen": -52.203590393066406, + "logps/rejected": -76.68606567382812, + "loss": 0.6888, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0032541872933506966, + "rewards/margins": 0.0005172848468646407, + "rewards/rejected": 0.002736902330070734, + "step": 219 + }, + { + "epoch": 0.13412589544276787, + "grad_norm": 69.55984465687895, + "learning_rate": 9.263414634146341e-09, + "logits/chosen": 0.1535046100616455, + "logits/rejected": 0.02677365019917488, + "logps/chosen": -4.253844261169434, + "logps/rejected": -14.466489791870117, + "loss": 0.6896, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0004813492705579847, + "rewards/margins": 0.005597323644906282, + "rewards/rejected": -0.006078672595322132, + "step": 220 + }, + { + "epoch": 0.13473555860387137, + "grad_norm": 85.92535947691584, + "learning_rate": 9.307317073170731e-09, + "logits/chosen": 0.30106112360954285, + "logits/rejected": 0.1480739563703537, + "logps/chosen": -85.90007019042969, + "logps/rejected": -169.9725341796875, + "loss": 0.688, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.006270409096032381, + "rewards/margins": 0.0010327575728297234, + "rewards/rejected": 0.00523765105754137, + "step": 221 + }, + { + "epoch": 0.13534522176497485, + "grad_norm": 119.21977123994778, + "learning_rate": 9.351219512195121e-09, + "logits/chosen": 0.11533677577972412, + "logits/rejected": 0.13379010558128357, + "logps/chosen": -149.5064697265625, + "logps/rejected": -71.87149810791016, + "loss": 0.6913, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.0037560579366981983, + "rewards/margins": -0.0026161072310060263, + "rewards/rejected": 0.006372165400534868, + "step": 222 + }, + { + "epoch": 0.13595488492607835, + "grad_norm": 70.29667315024419, + "learning_rate": 9.395121951219511e-09, + "logits/chosen": -0.346357524394989, + "logits/rejected": 0.3720865845680237, + "logps/chosen": -320.56134033203125, + "logps/rejected": -285.1332702636719, + "loss": 0.6914, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.014269685372710228, + "rewards/margins": -8.902512490749359e-05, + "rewards/rejected": 0.014358710497617722, + "step": 223 + }, + { + "epoch": 0.13656454808718183, + "grad_norm": 72.74974042510391, + "learning_rate": 9.439024390243903e-09, + "logits/chosen": 0.14322340488433838, + "logits/rejected": 0.18459954857826233, + "logps/chosen": -69.95736694335938, + "logps/rejected": -56.18511199951172, + "loss": 0.691, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.01004111859947443, + "rewards/margins": 
0.007080426439642906, + "rewards/rejected": 0.0029606912285089493, + "step": 224 + }, + { + "epoch": 0.13717421124828533, + "grad_norm": 72.47288590044104, + "learning_rate": 9.482926829268293e-09, + "logits/chosen": 0.30410856008529663, + "logits/rejected": -0.12342655658721924, + "logps/chosen": -218.58901977539062, + "logps/rejected": -450.06744384765625, + "loss": 0.6915, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.00634857127442956, + "rewards/margins": -0.014498129487037659, + "rewards/rejected": 0.02084670029580593, + "step": 225 + }, + { + "epoch": 0.1377838744093888, + "grad_norm": 72.17461776109012, + "learning_rate": 9.526829268292683e-09, + "logits/chosen": 0.40045663714408875, + "logits/rejected": 0.3636299967765808, + "logps/chosen": -69.74336242675781, + "logps/rejected": -81.21648406982422, + "loss": 0.6902, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0014475822681561112, + "rewards/margins": 0.00355433183722198, + "rewards/rejected": -0.0050019146874547005, + "step": 226 + }, + { + "epoch": 0.1383935375704923, + "grad_norm": 63.74354424455339, + "learning_rate": 9.570731707317073e-09, + "logits/chosen": -0.007155582308769226, + "logits/rejected": 0.4101467728614807, + "logps/chosen": -190.18185424804688, + "logps/rejected": -129.58155822753906, + "loss": 0.6899, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.017444992437958717, + "rewards/margins": 0.004689312539994717, + "rewards/rejected": 0.012755680829286575, + "step": 227 + }, + { + "epoch": 0.1390032007315958, + "grad_norm": 64.11977457576249, + "learning_rate": 9.614634146341463e-09, + "logits/chosen": 0.25982144474983215, + "logits/rejected": 0.20755594968795776, + "logps/chosen": -35.146148681640625, + "logps/rejected": -54.77658462524414, + "loss": 0.6896, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.010507357306778431, + "rewards/margins": 0.005585014820098877, + "rewards/rejected": 0.004922342021018267, + "step": 228 + }, + { + "epoch": 0.13961286389269928, + "grad_norm": 73.2959257159521, + "learning_rate": 9.658536585365853e-09, + "logits/chosen": 0.2829352617263794, + "logits/rejected": 0.226519376039505, + "logps/chosen": -189.8313446044922, + "logps/rejected": -381.81146240234375, + "loss": 0.69, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.012041282840073109, + "rewards/margins": 0.016209380701184273, + "rewards/rejected": -0.004168099258095026, + "step": 229 + }, + { + "epoch": 0.14022252705380278, + "grad_norm": 68.94961304303615, + "learning_rate": 9.702439024390243e-09, + "logits/chosen": -0.02386903390288353, + "logits/rejected": 0.22455506026744843, + "logps/chosen": -252.91159057617188, + "logps/rejected": -205.3868408203125, + "loss": 0.6891, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.028557967394590378, + "rewards/margins": 0.018563751131296158, + "rewards/rejected": 0.009994215331971645, + "step": 230 + }, + { + "epoch": 0.14083219021490626, + "grad_norm": 78.6604435432664, + "learning_rate": 9.746341463414635e-09, + "logits/chosen": 0.34106820821762085, + "logits/rejected": 0.07739745080471039, + "logps/chosen": -122.4163818359375, + "logps/rejected": -258.68914794921875, + "loss": 0.6894, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.025600817054510117, + "rewards/margins": 0.005098343826830387, + "rewards/rejected": 0.020502472296357155, + "step": 231 + }, + { + "epoch": 0.14144185337600976, + "grad_norm": 70.32022280924876, + "learning_rate": 9.790243902439025e-09, + "logits/chosen": 0.16174301505088806, + "logits/rejected": 0.22027698159217834, 
+ "logps/chosen": -77.50466918945312, + "logps/rejected": -116.17520141601562, + "loss": 0.6897, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.005060291383415461, + "rewards/margins": -0.001702064648270607, + "rewards/rejected": 0.00676235556602478, + "step": 232 + }, + { + "epoch": 0.14205151653711323, + "grad_norm": 73.99688934574431, + "learning_rate": 9.834146341463415e-09, + "logits/chosen": 0.33195942640304565, + "logits/rejected": -0.011777505278587341, + "logps/chosen": -150.14395141601562, + "logps/rejected": -177.5060272216797, + "loss": 0.6888, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.01235343236476183, + "rewards/margins": 0.015944160521030426, + "rewards/rejected": -0.0035907269921153784, + "step": 233 + }, + { + "epoch": 0.14266117969821673, + "grad_norm": 63.849320564913384, + "learning_rate": 9.878048780487805e-09, + "logits/chosen": 0.10598531365394592, + "logits/rejected": 0.12136658281087875, + "logps/chosen": -181.38348388671875, + "logps/rejected": -69.75995635986328, + "loss": 0.6898, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.020735835656523705, + "rewards/margins": 0.015682529658079147, + "rewards/rejected": 0.005053305998444557, + "step": 234 + }, + { + "epoch": 0.14327084285932024, + "grad_norm": 75.28468551677496, + "learning_rate": 9.921951219512195e-09, + "logits/chosen": 0.35082173347473145, + "logits/rejected": 0.3787878453731537, + "logps/chosen": -117.42396545410156, + "logps/rejected": -79.96060180664062, + "loss": 0.6867, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0063343290239572525, + "rewards/margins": 0.008362340740859509, + "rewards/rejected": -0.002028012415394187, + "step": 235 + }, + { + "epoch": 0.1438805060204237, + "grad_norm": 63.49933420829787, + "learning_rate": 9.965853658536585e-09, + "logits/chosen": -0.029095180332660675, + "logits/rejected": -0.02358836866915226, + "logps/chosen": -269.658935546875, + "logps/rejected": -184.3779754638672, + "loss": 0.6902, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.028628159314393997, + "rewards/margins": 0.009772539138793945, + "rewards/rejected": 0.018855620175600052, + "step": 236 + }, + { + "epoch": 0.1444901691815272, + "grad_norm": 75.31000642223354, + "learning_rate": 1.0009756097560975e-08, + "logits/chosen": -0.06486168503761292, + "logits/rejected": 0.0071268510073423386, + "logps/chosen": -299.6549072265625, + "logps/rejected": -165.22476196289062, + "loss": 0.6885, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.032582368701696396, + "rewards/margins": 0.019990747794508934, + "rewards/rejected": 0.012591619044542313, + "step": 237 + }, + { + "epoch": 0.14509983234263069, + "grad_norm": 68.11793891801905, + "learning_rate": 1.0053658536585367e-08, + "logits/chosen": -0.018292773514986038, + "logits/rejected": 0.06804099678993225, + "logps/chosen": -240.0941619873047, + "logps/rejected": -300.457275390625, + "loss": 0.6896, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.02616003155708313, + "rewards/margins": -0.0042310659773647785, + "rewards/rejected": 0.03039109706878662, + "step": 238 + }, + { + "epoch": 0.1457094955037342, + "grad_norm": 63.56920454825996, + "learning_rate": 1.0097560975609757e-08, + "logits/chosen": 0.20902913808822632, + "logits/rejected": 0.20397305488586426, + "logps/chosen": -198.93582153320312, + "logps/rejected": -207.8833770751953, + "loss": 0.6878, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03262896463274956, + "rewards/margins": 0.042327940464019775, + "rewards/rejected": -0.009698974899947643, + "step": 
239 + }, + { + "epoch": 0.1463191586648377, + "grad_norm": 74.6828087626269, + "learning_rate": 1.0141463414634145e-08, + "logits/chosen": 0.16607873141765594, + "logits/rejected": 0.19343027472496033, + "logps/chosen": -11.94176959991455, + "logps/rejected": -13.455016136169434, + "loss": 0.6869, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.0027127116918563843, + "rewards/margins": -0.006608984433114529, + "rewards/rejected": 0.003896272275596857, + "step": 240 + }, + { + "epoch": 0.14692882182594116, + "grad_norm": 67.57369790544568, + "learning_rate": 1.0185365853658535e-08, + "logits/chosen": -0.14159193634986877, + "logits/rejected": -0.01214917004108429, + "logps/chosen": -162.12225341796875, + "logps/rejected": -126.66780090332031, + "loss": 0.6879, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.013150770217180252, + "rewards/margins": 0.011054106056690216, + "rewards/rejected": 0.0020966639276593924, + "step": 241 + }, + { + "epoch": 0.14753848498704467, + "grad_norm": 69.09487800609341, + "learning_rate": 1.0229268292682925e-08, + "logits/chosen": 0.05865873396396637, + "logits/rejected": -0.021177947521209717, + "logps/chosen": -193.3614501953125, + "logps/rejected": -167.36978149414062, + "loss": 0.6876, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.005057024769484997, + "rewards/margins": -0.004579547327011824, + "rewards/rejected": 0.009636571630835533, + "step": 242 + }, + { + "epoch": 0.14814814814814814, + "grad_norm": 62.3242029486047, + "learning_rate": 1.0273170731707315e-08, + "logits/chosen": 0.09503556787967682, + "logits/rejected": 0.02194279432296753, + "logps/chosen": -92.28962707519531, + "logps/rejected": -85.16290283203125, + "loss": 0.691, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.010578328743577003, + "rewards/margins": 0.01028031762689352, + "rewards/rejected": 0.0002980114077217877, + "step": 243 + }, + { + "epoch": 0.14875781130925164, + "grad_norm": 75.66652761452913, + "learning_rate": 1.0317073170731705e-08, + "logits/chosen": 0.20517027378082275, + "logits/rejected": 0.15875345468521118, + "logps/chosen": -113.72001647949219, + "logps/rejected": -49.26511001586914, + "loss": 0.6871, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.004640038590878248, + "rewards/margins": 0.0027486036997288465, + "rewards/rejected": 0.0018914344254881144, + "step": 244 + }, + { + "epoch": 0.14936747447035512, + "grad_norm": 89.69620229693227, + "learning_rate": 1.0360975609756095e-08, + "logits/chosen": 0.11705029010772705, + "logits/rejected": 0.26808589696884155, + "logps/chosen": -245.8486328125, + "logps/rejected": -245.96533203125, + "loss": 0.6877, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.016329145058989525, + "rewards/margins": 0.019762611016631126, + "rewards/rejected": -0.0034334659576416016, + "step": 245 + }, + { + "epoch": 0.14997713763145862, + "grad_norm": 67.96453824857987, + "learning_rate": 1.0404878048780487e-08, + "logits/chosen": 0.20623037219047546, + "logits/rejected": 0.1848663091659546, + "logps/chosen": -14.415666580200195, + "logps/rejected": -7.248907566070557, + "loss": 0.6898, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.00188540224917233, + "rewards/margins": -0.0007506401743739843, + "rewards/rejected": -0.0011347620747983456, + "step": 246 + }, + { + "epoch": 0.15058680079256212, + "grad_norm": 70.56113761845653, + "learning_rate": 1.0448780487804877e-08, + "logits/chosen": 0.0934453085064888, + "logits/rejected": 0.12496070563793182, + "logps/chosen": -85.32705688476562, + "logps/rejected": 
-78.3795166015625, + "loss": 0.6859, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0045030489563941956, + "rewards/margins": 0.005073298700153828, + "rewards/rejected": -0.0005702495109289885, + "step": 247 + }, + { + "epoch": 0.1511964639536656, + "grad_norm": 76.9638268372275, + "learning_rate": 1.0492682926829267e-08, + "logits/chosen": 0.20455703139305115, + "logits/rejected": 0.18343859910964966, + "logps/chosen": -199.44857788085938, + "logps/rejected": -103.60641479492188, + "loss": 0.6867, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.010823631659150124, + "rewards/margins": 0.013277888298034668, + "rewards/rejected": -0.0024542571045458317, + "step": 248 + }, + { + "epoch": 0.1518061271147691, + "grad_norm": 75.07135689944852, + "learning_rate": 1.0536585365853657e-08, + "logits/chosen": 0.3327924907207489, + "logits/rejected": 0.36023616790771484, + "logps/chosen": -185.33592224121094, + "logps/rejected": -29.10883903503418, + "loss": 0.6854, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.003931510262191296, + "rewards/margins": 0.0014942025300115347, + "rewards/rejected": 0.002437308430671692, + "step": 249 + }, + { + "epoch": 0.15241579027587257, + "grad_norm": 75.58232939333541, + "learning_rate": 1.0580487804878047e-08, + "logits/chosen": 0.11740530282258987, + "logits/rejected": 0.18833479285240173, + "logps/chosen": -424.054931640625, + "logps/rejected": -343.39404296875, + "loss": 0.689, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.05939657613635063, + "rewards/margins": 0.03401017189025879, + "rewards/rejected": 0.025386402383446693, + "step": 250 + }, + { + "epoch": 0.15302545343697607, + "grad_norm": 73.0173068286647, + "learning_rate": 1.0624390243902437e-08, + "logits/chosen": 0.27976128458976746, + "logits/rejected": 0.25798600912094116, + "logps/chosen": -70.59455108642578, + "logps/rejected": -91.9207534790039, + "loss": 0.6894, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.005557769909501076, + "rewards/margins": -0.011985952034592628, + "rewards/rejected": 0.006428182125091553, + "step": 251 + }, + { + "epoch": 0.15363511659807957, + "grad_norm": 74.34648231743009, + "learning_rate": 1.0668292682926829e-08, + "logits/chosen": 0.09387435019016266, + "logits/rejected": -0.007215626537799835, + "logps/chosen": -219.23934936523438, + "logps/rejected": -250.1908416748047, + "loss": 0.6892, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.01466522179543972, + "rewards/margins": 0.009566032327711582, + "rewards/rejected": 0.005099189467728138, + "step": 252 + }, + { + "epoch": 0.15424477975918305, + "grad_norm": 64.34807060967165, + "learning_rate": 1.0712195121951219e-08, + "logits/chosen": 0.02402116358280182, + "logits/rejected": 0.019050151109695435, + "logps/chosen": -90.4888687133789, + "logps/rejected": -148.08981323242188, + "loss": 0.6896, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.011893940158188343, + "rewards/margins": 0.012876272201538086, + "rewards/rejected": -0.0009823321597650647, + "step": 253 + }, + { + "epoch": 0.15485444292028655, + "grad_norm": 64.79662928731439, + "learning_rate": 1.0756097560975609e-08, + "logits/chosen": 0.22467710077762604, + "logits/rejected": 0.0950518324971199, + "logps/chosen": -61.54901885986328, + "logps/rejected": -61.83803176879883, + "loss": 0.6907, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.01443713903427124, + "rewards/margins": 0.014978361316025257, + "rewards/rejected": -0.000541222165338695, + "step": 254 + }, + { + "epoch": 0.15546410608139002, + "grad_norm": 
59.93707097217542, + "learning_rate": 1.0799999999999999e-08, + "logits/chosen": 0.29136666655540466, + "logits/rejected": 0.11373914033174515, + "logps/chosen": -75.9327392578125, + "logps/rejected": -126.5439224243164, + "loss": 0.6905, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.011048637330532074, + "rewards/margins": 0.006899937056005001, + "rewards/rejected": 0.004148700274527073, + "step": 255 + }, + { + "epoch": 0.15607376924249353, + "grad_norm": 71.95254740944107, + "learning_rate": 1.0843902439024389e-08, + "logits/chosen": 0.02697262167930603, + "logits/rejected": 0.18303431570529938, + "logps/chosen": -187.94979858398438, + "logps/rejected": -67.05463409423828, + "loss": 0.6857, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.02066936530172825, + "rewards/margins": 0.011891448870301247, + "rewards/rejected": 0.008777916431427002, + "step": 256 + }, + { + "epoch": 0.156683432403597, + "grad_norm": 66.16923012937521, + "learning_rate": 1.0887804878048779e-08, + "logits/chosen": -0.10520759224891663, + "logits/rejected": 0.1884341984987259, + "logps/chosen": -299.9891357421875, + "logps/rejected": -119.1136703491211, + "loss": 0.6929, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.011963725090026855, + "rewards/margins": 0.002015376463532448, + "rewards/rejected": 0.009948348626494408, + "step": 257 + }, + { + "epoch": 0.1572930955647005, + "grad_norm": 66.51771221207379, + "learning_rate": 1.093170731707317e-08, + "logits/chosen": 0.039739660918712616, + "logits/rejected": -0.05868508294224739, + "logps/chosen": -41.552310943603516, + "logps/rejected": -61.76737594604492, + "loss": 0.6902, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.004486262798309326, + "rewards/margins": -2.5379355065524578e-05, + "rewards/rejected": 0.004511642269790173, + "step": 258 + }, + { + "epoch": 0.157902758725804, + "grad_norm": 73.0958015664315, + "learning_rate": 1.0975609756097561e-08, + "logits/chosen": 0.16631904244422913, + "logits/rejected": 0.15885590016841888, + "logps/chosen": -105.40766143798828, + "logps/rejected": -167.66253662109375, + "loss": 0.6882, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.00017958879470825195, + "rewards/margins": 0.0016276472015306354, + "rewards/rejected": -0.0014480589888989925, + "step": 259 + }, + { + "epoch": 0.15851242188690748, + "grad_norm": 70.25652911635771, + "learning_rate": 1.1019512195121951e-08, + "logits/chosen": 0.2429775893688202, + "logits/rejected": 0.1486252248287201, + "logps/chosen": -106.29670715332031, + "logps/rejected": -162.82992553710938, + "loss": 0.6849, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.028435135260224342, + "rewards/margins": 0.029486199840903282, + "rewards/rejected": -0.0010510622523725033, + "step": 260 + }, + { + "epoch": 0.15912208504801098, + "grad_norm": 72.10049120828978, + "learning_rate": 1.1063414634146341e-08, + "logits/chosen": -0.11289437860250473, + "logits/rejected": 0.297699511051178, + "logps/chosen": -347.2435607910156, + "logps/rejected": -196.04190063476562, + "loss": 0.6873, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.031672000885009766, + "rewards/margins": 0.013463189825415611, + "rewards/rejected": 0.018208812922239304, + "step": 261 + }, + { + "epoch": 0.15973174820911445, + "grad_norm": 80.93364378898619, + "learning_rate": 1.1107317073170731e-08, + "logits/chosen": 0.0010141544044017792, + "logits/rejected": 0.2935892641544342, + "logps/chosen": -498.2570495605469, + "logps/rejected": -312.3238220214844, + "loss": 0.6825, + "rewards/accuracies": 1.0, 
+ "rewards/chosen": 0.08853988349437714, + "rewards/margins": 0.06164970621466637, + "rewards/rejected": 0.026890181005001068, + "step": 262 + }, + { + "epoch": 0.16034141137021796, + "grad_norm": 102.5062733225681, + "learning_rate": 1.1151219512195121e-08, + "logits/chosen": -0.16654887795448303, + "logits/rejected": -0.14394907653331757, + "logps/chosen": -193.4143524169922, + "logps/rejected": -228.5217742919922, + "loss": 0.6834, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.019860554486513138, + "rewards/margins": 0.008599311113357544, + "rewards/rejected": 0.01126124244183302, + "step": 263 + }, + { + "epoch": 0.16095107453132146, + "grad_norm": 81.75798142518528, + "learning_rate": 1.1195121951219511e-08, + "logits/chosen": -0.009945273399353027, + "logits/rejected": 0.0056189000606536865, + "logps/chosen": -587.51708984375, + "logps/rejected": -362.59234619140625, + "loss": 0.6882, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.03963165730237961, + "rewards/margins": 0.030398894101381302, + "rewards/rejected": 0.009232759475708008, + "step": 264 + }, + { + "epoch": 0.16156073769242493, + "grad_norm": 68.03395581723177, + "learning_rate": 1.1239024390243901e-08, + "logits/chosen": 0.18979834020137787, + "logits/rejected": 0.20110829174518585, + "logps/chosen": -38.780731201171875, + "logps/rejected": -47.85240173339844, + "loss": 0.6843, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.008783669210970402, + "rewards/margins": 0.005118245724588633, + "rewards/rejected": 0.0036654232535511255, + "step": 265 + }, + { + "epoch": 0.16217040085352843, + "grad_norm": 67.12938115410965, + "learning_rate": 1.1282926829268293e-08, + "logits/chosen": 0.14108797907829285, + "logits/rejected": 0.0863712877035141, + "logps/chosen": -65.01193237304688, + "logps/rejected": -197.5635986328125, + "loss": 0.6935, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.008278220891952515, + "rewards/margins": -0.0024371203035116196, + "rewards/rejected": 0.010715341195464134, + "step": 266 + }, + { + "epoch": 0.1627800640146319, + "grad_norm": 76.95114992952817, + "learning_rate": 1.1326829268292683e-08, + "logits/chosen": 0.31340616941452026, + "logits/rejected": 0.29575198888778687, + "logps/chosen": -147.78167724609375, + "logps/rejected": -129.01556396484375, + "loss": 0.6884, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.028375498950481415, + "rewards/margins": -0.008279353380203247, + "rewards/rejected": 0.03665485233068466, + "step": 267 + }, + { + "epoch": 0.1633897271757354, + "grad_norm": 61.97636980432929, + "learning_rate": 1.1370731707317073e-08, + "logits/chosen": 0.3050193190574646, + "logits/rejected": 0.0612892247736454, + "logps/chosen": -178.6527557373047, + "logps/rejected": -183.0557861328125, + "loss": 0.6888, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.021247386932373047, + "rewards/margins": 0.011964130215346813, + "rewards/rejected": 0.009283257648348808, + "step": 268 + }, + { + "epoch": 0.16399939033683889, + "grad_norm": 59.77971279845762, + "learning_rate": 1.1414634146341463e-08, + "logits/chosen": -0.049220163375139236, + "logits/rejected": -0.018158867955207825, + "logps/chosen": -127.37361145019531, + "logps/rejected": -90.01386260986328, + "loss": 0.6894, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.02707068994641304, + "rewards/margins": 0.02435753308236599, + "rewards/rejected": 0.002713155932724476, + "step": 269 + }, + { + "epoch": 0.1646090534979424, + "grad_norm": 67.97088968853843, + "learning_rate": 1.1458536585365853e-08, + 
"logits/chosen": -0.22824090719223022, + "logits/rejected": -0.23402655124664307, + "logps/chosen": -62.05147933959961, + "logps/rejected": -91.66260528564453, + "loss": 0.6872, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.012143706902861595, + "rewards/margins": -0.007138777989894152, + "rewards/rejected": 0.01928248628973961, + "step": 270 + }, + { + "epoch": 0.1652187166590459, + "grad_norm": 72.07185018278172, + "learning_rate": 1.1502439024390243e-08, + "logits/chosen": -0.20504449307918549, + "logits/rejected": 0.2059239149093628, + "logps/chosen": -128.02024841308594, + "logps/rejected": -153.56642150878906, + "loss": 0.6847, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.02709498442709446, + "rewards/margins": 0.004441403783857822, + "rewards/rejected": 0.022653579711914062, + "step": 271 + }, + { + "epoch": 0.16582837982014936, + "grad_norm": 70.08052151086115, + "learning_rate": 1.1546341463414635e-08, + "logits/chosen": 0.2460930347442627, + "logits/rejected": 0.6110115051269531, + "logps/chosen": -157.7004852294922, + "logps/rejected": -103.66472625732422, + "loss": 0.6896, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0354963093996048, + "rewards/margins": 0.013490723446011543, + "rewards/rejected": 0.022005582228302956, + "step": 272 + }, + { + "epoch": 0.16643804298125287, + "grad_norm": 75.28336099467903, + "learning_rate": 1.1590243902439025e-08, + "logits/chosen": -0.06624182313680649, + "logits/rejected": -0.19460135698318481, + "logps/chosen": -88.95880889892578, + "logps/rejected": -237.4802703857422, + "loss": 0.6896, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.010710209608078003, + "rewards/margins": -0.024119380861520767, + "rewards/rejected": 0.03482959419488907, + "step": 273 + }, + { + "epoch": 0.16704770614235634, + "grad_norm": 70.70930457013314, + "learning_rate": 1.1634146341463415e-08, + "logits/chosen": 0.004238814115524292, + "logits/rejected": -0.008775483816862106, + "logps/chosen": -47.78416442871094, + "logps/rejected": -53.0060920715332, + "loss": 0.6863, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.011054693721234798, + "rewards/margins": -0.01163354143500328, + "rewards/rejected": 0.0005788472481071949, + "step": 274 + }, + { + "epoch": 0.16765736930345984, + "grad_norm": 61.65440882497993, + "learning_rate": 1.1678048780487805e-08, + "logits/chosen": 0.04496167600154877, + "logits/rejected": 0.2716519832611084, + "logps/chosen": -127.30840301513672, + "logps/rejected": -81.77163696289062, + "loss": 0.6855, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.020088758319616318, + "rewards/margins": 0.005410086363554001, + "rewards/rejected": 0.014678669162094593, + "step": 275 + }, + { + "epoch": 0.16826703246456332, + "grad_norm": 74.55600716443797, + "learning_rate": 1.1721951219512195e-08, + "logits/chosen": 0.059458762407302856, + "logits/rejected": 0.0097731314599514, + "logps/chosen": -144.34033203125, + "logps/rejected": -183.9947967529297, + "loss": 0.6872, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.03361999988555908, + "rewards/margins": 0.01005165558308363, + "rewards/rejected": 0.023568345233798027, + "step": 276 + }, + { + "epoch": 0.16887669562566682, + "grad_norm": 72.79481905339401, + "learning_rate": 1.1765853658536585e-08, + "logits/chosen": 0.24767087399959564, + "logits/rejected": -0.08833365142345428, + "logps/chosen": -212.47438049316406, + "logps/rejected": -296.18572998046875, + "loss": 0.6888, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.026554489508271217, + "rewards/margins": 
-0.012244321405887604, + "rewards/rejected": 0.03879880905151367, + "step": 277 + }, + { + "epoch": 0.16948635878677032, + "grad_norm": 71.71878768094764, + "learning_rate": 1.1809756097560975e-08, + "logits/chosen": -0.09843967109918594, + "logits/rejected": 0.2668130695819855, + "logps/chosen": -254.5621795654297, + "logps/rejected": -192.59725952148438, + "loss": 0.6862, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.02968626096844673, + "rewards/margins": 0.011512184515595436, + "rewards/rejected": 0.018174076452851295, + "step": 278 + }, + { + "epoch": 0.1700960219478738, + "grad_norm": 64.6045703248563, + "learning_rate": 1.1853658536585366e-08, + "logits/chosen": 0.12652164697647095, + "logits/rejected": 0.17770615220069885, + "logps/chosen": -112.51658630371094, + "logps/rejected": -144.6085205078125, + "loss": 0.6942, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.02689737267792225, + "rewards/margins": -0.0017916793003678322, + "rewards/rejected": 0.028689051046967506, + "step": 279 + }, + { + "epoch": 0.1707056851089773, + "grad_norm": 72.4789900569941, + "learning_rate": 1.1897560975609757e-08, + "logits/chosen": -0.054657019674777985, + "logits/rejected": 0.34233659505844116, + "logps/chosen": -216.79249572753906, + "logps/rejected": -68.52703857421875, + "loss": 0.6877, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.03096923977136612, + "rewards/margins": 0.024797670543193817, + "rewards/rejected": 0.0061715710908174515, + "step": 280 + }, + { + "epoch": 0.17131534827008077, + "grad_norm": 72.46230500292192, + "learning_rate": 1.1941463414634147e-08, + "logits/chosen": 0.10445204377174377, + "logits/rejected": 0.11657428741455078, + "logps/chosen": -321.2290954589844, + "logps/rejected": -278.23028564453125, + "loss": 0.6853, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.05811662971973419, + "rewards/margins": 0.0354214683175087, + "rewards/rejected": 0.022695159539580345, + "step": 281 + }, + { + "epoch": 0.17192501143118427, + "grad_norm": 64.7679771023373, + "learning_rate": 1.1985365853658537e-08, + "logits/chosen": -0.13013964891433716, + "logits/rejected": 0.009020913392305374, + "logps/chosen": -222.4367218017578, + "logps/rejected": -148.70559692382812, + "loss": 0.6845, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.024588415399193764, + "rewards/margins": 0.02192620374262333, + "rewards/rejected": 0.0026622116565704346, + "step": 282 + }, + { + "epoch": 0.17253467459228777, + "grad_norm": 76.10500673143999, + "learning_rate": 1.2029268292682927e-08, + "logits/chosen": -0.01958797127008438, + "logits/rejected": 0.08497656136751175, + "logps/chosen": -311.93658447265625, + "logps/rejected": -220.24310302734375, + "loss": 0.6856, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.061127856373786926, + "rewards/margins": 0.051468826830387115, + "rewards/rejected": 0.009659028612077236, + "step": 283 + }, + { + "epoch": 0.17314433775339125, + "grad_norm": 67.20260473782929, + "learning_rate": 1.2073170731707317e-08, + "logits/chosen": -0.1200600266456604, + "logits/rejected": 0.16886617243289948, + "logps/chosen": -113.31182098388672, + "logps/rejected": -54.280303955078125, + "loss": 0.6876, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0186814796179533, + "rewards/margins": 0.017820002511143684, + "rewards/rejected": 0.0008614779217168689, + "step": 284 + }, + { + "epoch": 0.17375400091449475, + "grad_norm": 68.2214791236598, + "learning_rate": 1.2117073170731707e-08, + "logits/chosen": 0.2490437924861908, + "logits/rejected": 
0.19444838166236877, + "logps/chosen": -96.3344497680664, + "logps/rejected": -105.00188446044922, + "loss": 0.6823, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.013741038739681244, + "rewards/margins": 0.02261732891201973, + "rewards/rejected": -0.008876289241015911, + "step": 285 + }, + { + "epoch": 0.17436366407559822, + "grad_norm": 68.96159344811562, + "learning_rate": 1.2160975609756098e-08, + "logits/chosen": 0.15206699073314667, + "logits/rejected": 0.19348430633544922, + "logps/chosen": -23.09684944152832, + "logps/rejected": -57.77210998535156, + "loss": 0.6922, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0002287685638293624, + "rewards/margins": -0.008274602703750134, + "rewards/rejected": 0.008045833557844162, + "step": 286 + }, + { + "epoch": 0.17497332723670173, + "grad_norm": 75.95314882506473, + "learning_rate": 1.2204878048780488e-08, + "logits/chosen": 0.07631039619445801, + "logits/rejected": 0.05917757749557495, + "logps/chosen": -182.9187774658203, + "logps/rejected": -146.29830932617188, + "loss": 0.6892, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.04305421561002731, + "rewards/margins": -0.0070860725827515125, + "rewards/rejected": 0.05014029145240784, + "step": 287 + }, + { + "epoch": 0.1755829903978052, + "grad_norm": 63.03047471572614, + "learning_rate": 1.2248780487804878e-08, + "logits/chosen": 0.06928090751171112, + "logits/rejected": 0.05114259943366051, + "logps/chosen": -9.679801940917969, + "logps/rejected": -46.23088455200195, + "loss": 0.6893, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.0027698874473571777, + "rewards/margins": -0.0036961734294891357, + "rewards/rejected": 0.0009262859239242971, + "step": 288 + }, + { + "epoch": 0.1761926535589087, + "grad_norm": 77.659425328325, + "learning_rate": 1.2292682926829268e-08, + "logits/chosen": 0.1752196103334427, + "logits/rejected": 0.009080037474632263, + "logps/chosen": -176.6894989013672, + "logps/rejected": -155.2613525390625, + "loss": 0.6938, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.02636120282113552, + "rewards/margins": 0.0025505602825433016, + "rewards/rejected": 0.023810643702745438, + "step": 289 + }, + { + "epoch": 0.1768023167200122, + "grad_norm": 62.44123666007119, + "learning_rate": 1.2336585365853658e-08, + "logits/chosen": 0.11948700249195099, + "logits/rejected": 0.13347646594047546, + "logps/chosen": -255.56439208984375, + "logps/rejected": -272.15509033203125, + "loss": 0.6915, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.046774283051490784, + "rewards/margins": 0.003010498359799385, + "rewards/rejected": 0.04376378282904625, + "step": 290 + }, + { + "epoch": 0.17741197988111568, + "grad_norm": 80.2087763882322, + "learning_rate": 1.2380487804878047e-08, + "logits/chosen": -0.143706813454628, + "logits/rejected": 0.03876394033432007, + "logps/chosen": -327.98394775390625, + "logps/rejected": -221.12884521484375, + "loss": 0.6822, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.05700650438666344, + "rewards/margins": 0.05423087626695633, + "rewards/rejected": 0.0027756216004490852, + "step": 291 + }, + { + "epoch": 0.17802164304221918, + "grad_norm": 67.0077788494056, + "learning_rate": 1.2424390243902437e-08, + "logits/chosen": 0.11543978005647659, + "logits/rejected": 0.08386662602424622, + "logps/chosen": -142.90765380859375, + "logps/rejected": -203.89308166503906, + "loss": 0.6917, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.023183250799775124, + "rewards/margins": -0.020256493240594864, + "rewards/rejected": 
0.04343974590301514, + "step": 292 + }, + { + "epoch": 0.17863130620332265, + "grad_norm": 70.08502232129598, + "learning_rate": 1.2468292682926827e-08, + "logits/chosen": 0.3591795861721039, + "logits/rejected": 0.5780224800109863, + "logps/chosen": -200.40618896484375, + "logps/rejected": -150.89456176757812, + "loss": 0.6838, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04847842827439308, + "rewards/margins": 0.064790278673172, + "rewards/rejected": -0.016311855986714363, + "step": 293 + }, + { + "epoch": 0.17924096936442616, + "grad_norm": 80.72254859228765, + "learning_rate": 1.2512195121951219e-08, + "logits/chosen": 0.21753554046154022, + "logits/rejected": 0.3264102041721344, + "logps/chosen": -210.21556091308594, + "logps/rejected": -255.1538848876953, + "loss": 0.6801, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.022591782733798027, + "rewards/margins": 0.005800206214189529, + "rewards/rejected": 0.016791576519608498, + "step": 294 + }, + { + "epoch": 0.17985063252552966, + "grad_norm": 66.42495940270162, + "learning_rate": 1.2556097560975609e-08, + "logits/chosen": -0.036476314067840576, + "logits/rejected": 0.001236744225025177, + "logps/chosen": -38.627227783203125, + "logps/rejected": -24.439422607421875, + "loss": 0.6871, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.002048587892204523, + "rewards/margins": 0.012927819043397903, + "rewards/rejected": -0.010879230685532093, + "step": 295 + }, + { + "epoch": 0.18046029568663313, + "grad_norm": 68.23030718617511, + "learning_rate": 1.2599999999999999e-08, + "logits/chosen": 0.033884622156620026, + "logits/rejected": -0.24851661920547485, + "logps/chosen": -100.02306365966797, + "logps/rejected": -188.92852783203125, + "loss": 0.6831, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.012595081701874733, + "rewards/margins": -0.019318008795380592, + "rewards/rejected": 0.031913090497255325, + "step": 296 + }, + { + "epoch": 0.18106995884773663, + "grad_norm": 60.34466285564362, + "learning_rate": 1.2643902439024389e-08, + "logits/chosen": 0.28695887327194214, + "logits/rejected": 0.2805330753326416, + "logps/chosen": -53.780399322509766, + "logps/rejected": -55.31425476074219, + "loss": 0.691, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.011322993785142899, + "rewards/margins": 0.004818183369934559, + "rewards/rejected": 0.00650481041520834, + "step": 297 + }, + { + "epoch": 0.1816796220088401, + "grad_norm": 65.27088798439672, + "learning_rate": 1.2687804878048779e-08, + "logits/chosen": -0.3651633560657501, + "logits/rejected": -0.26293641328811646, + "logps/chosen": -83.63407135009766, + "logps/rejected": -109.81761169433594, + "loss": 0.6795, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.004503482487052679, + "rewards/margins": 0.004652255214750767, + "rewards/rejected": -0.0001487727276980877, + "step": 298 + }, + { + "epoch": 0.1822892851699436, + "grad_norm": 70.8428190288745, + "learning_rate": 1.2731707317073169e-08, + "logits/chosen": 0.11984477937221527, + "logits/rejected": 0.1091717779636383, + "logps/chosen": -97.25994873046875, + "logps/rejected": -83.18009185791016, + "loss": 0.6838, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.01965179480612278, + "rewards/margins": 0.005422592628747225, + "rewards/rejected": 0.014229202643036842, + "step": 299 + }, + { + "epoch": 0.18289894833104708, + "grad_norm": 78.14090515623363, + "learning_rate": 1.277560975609756e-08, + "logits/chosen": 0.18486709892749786, + "logits/rejected": 0.23836563527584076, + "logps/chosen": -279.04827880859375, 
+ "logps/rejected": -162.093994140625, + "loss": 0.685, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.03765735402703285, + "rewards/margins": -0.012346451170742512, + "rewards/rejected": 0.05000380426645279, + "step": 300 + }, + { + "epoch": 0.1835086114921506, + "grad_norm": 67.7939086879178, + "learning_rate": 1.281951219512195e-08, + "logits/chosen": -0.051534123718738556, + "logits/rejected": 0.009366922080516815, + "logps/chosen": -140.4922637939453, + "logps/rejected": -180.57522583007812, + "loss": 0.6892, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.01847749948501587, + "rewards/margins": -0.007998285815119743, + "rewards/rejected": 0.02647578716278076, + "step": 301 + }, + { + "epoch": 0.1841182746532541, + "grad_norm": 66.52604228882412, + "learning_rate": 1.286341463414634e-08, + "logits/chosen": 0.17128783464431763, + "logits/rejected": 0.08553063124418259, + "logps/chosen": -63.9774055480957, + "logps/rejected": -86.0840072631836, + "loss": 0.6826, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.0008003080729395151, + "rewards/margins": -0.004530087113380432, + "rewards/rejected": 0.005330395884811878, + "step": 302 + }, + { + "epoch": 0.18472793781435756, + "grad_norm": 57.95653169447476, + "learning_rate": 1.290731707317073e-08, + "logits/chosen": 0.10138653963804245, + "logits/rejected": 0.04099968075752258, + "logps/chosen": -57.205345153808594, + "logps/rejected": -158.7127685546875, + "loss": 0.6826, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.012483743950724602, + "rewards/margins": -0.013382421806454659, + "rewards/rejected": 0.02586616389453411, + "step": 303 + }, + { + "epoch": 0.18533760097546106, + "grad_norm": 77.2152688293654, + "learning_rate": 1.295121951219512e-08, + "logits/chosen": -0.09431920945644379, + "logits/rejected": 0.21408668160438538, + "logps/chosen": -187.88401794433594, + "logps/rejected": -175.33287048339844, + "loss": 0.684, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.03187720850110054, + "rewards/margins": 0.010566890239715576, + "rewards/rejected": 0.021310318261384964, + "step": 304 + }, + { + "epoch": 0.18594726413656454, + "grad_norm": 64.46126122037869, + "learning_rate": 1.299512195121951e-08, + "logits/chosen": -0.0165112167596817, + "logits/rejected": 0.04795960336923599, + "logps/chosen": -21.955730438232422, + "logps/rejected": -40.482303619384766, + "loss": 0.6897, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.004565143957734108, + "rewards/margins": -0.0029786108061671257, + "rewards/rejected": 0.007543754298239946, + "step": 305 + }, + { + "epoch": 0.18655692729766804, + "grad_norm": 70.30427168183434, + "learning_rate": 1.30390243902439e-08, + "logits/chosen": 0.11046487092971802, + "logits/rejected": -0.20083540678024292, + "logps/chosen": -312.3200378417969, + "logps/rejected": -483.58050537109375, + "loss": 0.6834, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.03268561512231827, + "rewards/margins": -0.02073187753558159, + "rewards/rejected": 0.05341749265789986, + "step": 306 + }, + { + "epoch": 0.18716659045877154, + "grad_norm": 68.57658949942703, + "learning_rate": 1.3082926829268292e-08, + "logits/chosen": 0.32596519589424133, + "logits/rejected": 0.2826036810874939, + "logps/chosen": -173.72314453125, + "logps/rejected": -270.8544616699219, + "loss": 0.6777, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.049483247101306915, + "rewards/margins": 0.002615373581647873, + "rewards/rejected": 0.04686787351965904, + "step": 307 + }, + { + "epoch": 0.18777625361987502, + 
"grad_norm": 69.75985299125284, + "learning_rate": 1.3126829268292682e-08, + "logits/chosen": -0.0608794242143631, + "logits/rejected": 0.05051223561167717, + "logps/chosen": -182.84506225585938, + "logps/rejected": -154.138916015625, + "loss": 0.6888, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.018271446228027344, + "rewards/margins": -0.0005187001079320908, + "rewards/rejected": 0.018790146335959435, + "step": 308 + }, + { + "epoch": 0.18838591678097852, + "grad_norm": 76.91526853893619, + "learning_rate": 1.3170731707317072e-08, + "logits/chosen": 0.08850069344043732, + "logits/rejected": -0.018443353474140167, + "logps/chosen": -160.97219848632812, + "logps/rejected": -243.20228576660156, + "loss": 0.6774, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.07890434563159943, + "rewards/margins": 0.08282683789730072, + "rewards/rejected": -0.003922486677765846, + "step": 309 + }, + { + "epoch": 0.188995579942082, + "grad_norm": 90.68328105332195, + "learning_rate": 1.3214634146341462e-08, + "logits/chosen": -0.008595839142799377, + "logits/rejected": -0.034652434289455414, + "logps/chosen": -120.25025939941406, + "logps/rejected": -113.72771453857422, + "loss": 0.6698, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.02222449705004692, + "rewards/margins": 0.01967179775238037, + "rewards/rejected": 0.0025526999961584806, + "step": 310 + }, + { + "epoch": 0.1896052431031855, + "grad_norm": 68.07711194958355, + "learning_rate": 1.3258536585365852e-08, + "logits/chosen": 0.19305960834026337, + "logits/rejected": 0.031466081738471985, + "logps/chosen": -190.48495483398438, + "logps/rejected": -322.8329162597656, + "loss": 0.6854, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.06905165314674377, + "rewards/margins": 0.01939660869538784, + "rewards/rejected": 0.04965504631400108, + "step": 311 + }, + { + "epoch": 0.19021490626428897, + "grad_norm": 79.53778607030513, + "learning_rate": 1.3302439024390242e-08, + "logits/chosen": 0.21615885198116302, + "logits/rejected": 0.06739786267280579, + "logps/chosen": -12.077869415283203, + "logps/rejected": -36.77394485473633, + "loss": 0.6821, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.00834993738681078, + "rewards/margins": 0.015117020346224308, + "rewards/rejected": -0.023466957733035088, + "step": 312 + }, + { + "epoch": 0.19082456942539247, + "grad_norm": 64.73371427411682, + "learning_rate": 1.3346341463414633e-08, + "logits/chosen": -0.018608778715133667, + "logits/rejected": 0.34300583600997925, + "logps/chosen": -267.38226318359375, + "logps/rejected": -72.22630310058594, + "loss": 0.68, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.07769179344177246, + "rewards/margins": 0.0658668652176857, + "rewards/rejected": 0.011824929155409336, + "step": 313 + }, + { + "epoch": 0.19143423258649597, + "grad_norm": 75.58955343976855, + "learning_rate": 1.3390243902439024e-08, + "logits/chosen": 0.06820555776357651, + "logits/rejected": 0.38175755739212036, + "logps/chosen": -137.0054931640625, + "logps/rejected": -179.29209899902344, + "loss": 0.6891, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.01276245154440403, + "rewards/margins": -0.00345882261171937, + "rewards/rejected": 0.016221273690462112, + "step": 314 + }, + { + "epoch": 0.19204389574759945, + "grad_norm": 68.68931860822275, + "learning_rate": 1.3434146341463414e-08, + "logits/chosen": -0.2746092677116394, + "logits/rejected": 0.3636752963066101, + "logps/chosen": -537.5502319335938, + "logps/rejected": -327.2066650390625, + "loss": 0.6806, + 
"rewards/accuracies": 0.75, + "rewards/chosen": 0.08374825119972229, + "rewards/margins": 0.04939880222082138, + "rewards/rejected": 0.03434944525361061, + "step": 315 + }, + { + "epoch": 0.19265355890870295, + "grad_norm": 74.7544819583531, + "learning_rate": 1.3478048780487804e-08, + "logits/chosen": 0.12123498320579529, + "logits/rejected": 0.12799996137619019, + "logps/chosen": -269.18017578125, + "logps/rejected": -345.50787353515625, + "loss": 0.6754, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0619237907230854, + "rewards/margins": 0.019133759662508965, + "rewards/rejected": 0.04279003292322159, + "step": 316 + }, + { + "epoch": 0.19326322206980642, + "grad_norm": 75.88764727172456, + "learning_rate": 1.3521951219512194e-08, + "logits/chosen": 0.2825852036476135, + "logits/rejected": 0.195020854473114, + "logps/chosen": -279.4342041015625, + "logps/rejected": -225.8255157470703, + "loss": 0.6774, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.036413855850696564, + "rewards/margins": 0.019771214574575424, + "rewards/rejected": 0.01664264127612114, + "step": 317 + }, + { + "epoch": 0.19387288523090992, + "grad_norm": 80.27254116091218, + "learning_rate": 1.3565853658536584e-08, + "logits/chosen": 0.06519882380962372, + "logits/rejected": 0.16340479254722595, + "logps/chosen": -272.77972412109375, + "logps/rejected": -296.5497741699219, + "loss": 0.6805, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.02005164511501789, + "rewards/margins": 0.011397051624953747, + "rewards/rejected": 0.008654594421386719, + "step": 318 + }, + { + "epoch": 0.1944825483920134, + "grad_norm": 78.19214046578293, + "learning_rate": 1.3609756097560974e-08, + "logits/chosen": 0.22997507452964783, + "logits/rejected": 0.22382646799087524, + "logps/chosen": -15.212729454040527, + "logps/rejected": -20.76936149597168, + "loss": 0.6839, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.007372564170509577, + "rewards/margins": 0.0029362740460783243, + "rewards/rejected": -0.010308838449418545, + "step": 319 + }, + { + "epoch": 0.1950922115531169, + "grad_norm": 67.95675168668035, + "learning_rate": 1.3653658536585366e-08, + "logits/chosen": 0.14689771831035614, + "logits/rejected": 0.2453615814447403, + "logps/chosen": -315.5559387207031, + "logps/rejected": -197.97909545898438, + "loss": 0.6879, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.061474286019802094, + "rewards/margins": 0.02385631576180458, + "rewards/rejected": 0.037617966532707214, + "step": 320 + }, + { + "epoch": 0.1957018747142204, + "grad_norm": 65.1681457046982, + "learning_rate": 1.3697560975609756e-08, + "logits/chosen": -0.10886508971452713, + "logits/rejected": 0.0993279367685318, + "logps/chosen": -172.10069274902344, + "logps/rejected": -123.24795532226562, + "loss": 0.6776, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.04928499832749367, + "rewards/margins": 0.041656363755464554, + "rewards/rejected": 0.00762863177806139, + "step": 321 + }, + { + "epoch": 0.19631153787532388, + "grad_norm": 85.13184974325452, + "learning_rate": 1.3741463414634146e-08, + "logits/chosen": 0.04807595908641815, + "logits/rejected": 0.10156890004873276, + "logps/chosen": -83.60809326171875, + "logps/rejected": -70.86627197265625, + "loss": 0.6901, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.004450934939086437, + "rewards/margins": -0.01871207356452942, + "rewards/rejected": 0.014261138625442982, + "step": 322 + }, + { + "epoch": 0.19692120103642738, + "grad_norm": 71.36768546843327, + "learning_rate": 
1.3785365853658536e-08, + "logits/chosen": 0.39577725529670715, + "logits/rejected": 0.2465072125196457, + "logps/chosen": -86.98997497558594, + "logps/rejected": -94.35633850097656, + "loss": 0.6814, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.023127174004912376, + "rewards/margins": 0.002038024365901947, + "rewards/rejected": 0.02108914777636528, + "step": 323 + }, + { + "epoch": 0.19753086419753085, + "grad_norm": 81.6959801621272, + "learning_rate": 1.3829268292682926e-08, + "logits/chosen": 0.3274226784706116, + "logits/rejected": 0.3019973933696747, + "logps/chosen": -12.285454750061035, + "logps/rejected": -12.830500602722168, + "loss": 0.6868, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.002893734024837613, + "rewards/margins": 0.008286512456834316, + "rewards/rejected": -0.005392777733504772, + "step": 324 + }, + { + "epoch": 0.19814052735863436, + "grad_norm": 71.60153689924304, + "learning_rate": 1.3873170731707316e-08, + "logits/chosen": 0.13020280003547668, + "logits/rejected": 0.11316104233264923, + "logps/chosen": -21.833646774291992, + "logps/rejected": -22.383956909179688, + "loss": 0.6768, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.004084374755620956, + "rewards/margins": -0.003979433327913284, + "rewards/rejected": -0.00010494131129235029, + "step": 325 + }, + { + "epoch": 0.19875019051973786, + "grad_norm": 72.5793081914242, + "learning_rate": 1.3917073170731706e-08, + "logits/chosen": -0.010539315640926361, + "logits/rejected": 0.02804122120141983, + "logps/chosen": -74.72570037841797, + "logps/rejected": -19.162845611572266, + "loss": 0.6776, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.005877655930817127, + "rewards/margins": -0.005930660292506218, + "rewards/rejected": 5.300401244312525e-05, + "step": 326 + }, + { + "epoch": 0.19935985368084133, + "grad_norm": 61.15732466814767, + "learning_rate": 1.3960975609756098e-08, + "logits/chosen": 0.05574619770050049, + "logits/rejected": 0.010447800159454346, + "logps/chosen": -17.385906219482422, + "logps/rejected": -33.139347076416016, + "loss": 0.68, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.005576098337769508, + "rewards/margins": 0.011889881454408169, + "rewards/rejected": -0.00631378311663866, + "step": 327 + }, + { + "epoch": 0.19996951684194483, + "grad_norm": 65.92631336914816, + "learning_rate": 1.4004878048780488e-08, + "logits/chosen": -0.010778829455375671, + "logits/rejected": 0.045100219547748566, + "logps/chosen": -166.8217010498047, + "logps/rejected": -171.8801727294922, + "loss": 0.6884, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.03109455294907093, + "rewards/margins": -0.006510566920042038, + "rewards/rejected": 0.03760511800646782, + "step": 328 + }, + { + "epoch": 0.2005791800030483, + "grad_norm": 73.21614905742888, + "learning_rate": 1.4048780487804878e-08, + "logits/chosen": 0.11447135359048843, + "logits/rejected": 0.30053094029426575, + "logps/chosen": -71.59620666503906, + "logps/rejected": -47.92333984375, + "loss": 0.6885, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.02008110284805298, + "rewards/margins": -0.012820337899029255, + "rewards/rejected": 0.03290143981575966, + "step": 329 + }, + { + "epoch": 0.2011888431641518, + "grad_norm": 69.45047129413042, + "learning_rate": 1.4092682926829268e-08, + "logits/chosen": 0.1477673351764679, + "logits/rejected": -0.25245657563209534, + "logps/chosen": -78.40091705322266, + "logps/rejected": -154.0237274169922, + "loss": 0.6773, + "rewards/accuracies": 0.75, + "rewards/chosen": 
0.018941640853881836, + "rewards/margins": -0.003910040948539972, + "rewards/rejected": 0.02285168133676052, + "step": 330 + }, + { + "epoch": 0.20179850632525528, + "grad_norm": 74.12584253721137, + "learning_rate": 1.4136585365853658e-08, + "logits/chosen": 0.025876976549625397, + "logits/rejected": 0.1491180956363678, + "logps/chosen": -270.4678649902344, + "logps/rejected": -162.21859741210938, + "loss": 0.677, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.07905292510986328, + "rewards/margins": 0.05511059612035751, + "rewards/rejected": 0.02394232712686062, + "step": 331 + }, + { + "epoch": 0.20240816948635879, + "grad_norm": 70.77365253302148, + "learning_rate": 1.4180487804878048e-08, + "logits/chosen": 0.18511934578418732, + "logits/rejected": 0.1744193583726883, + "logps/chosen": -166.81033325195312, + "logps/rejected": -178.5686492919922, + "loss": 0.6844, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.060819078236818314, + "rewards/margins": 0.059952981770038605, + "rewards/rejected": 0.0008660973981022835, + "step": 332 + }, + { + "epoch": 0.2030178326474623, + "grad_norm": 72.78538549799997, + "learning_rate": 1.4224390243902438e-08, + "logits/chosen": 0.23981404304504395, + "logits/rejected": -0.009058898314833641, + "logps/chosen": -42.26893997192383, + "logps/rejected": -107.62962341308594, + "loss": 0.6897, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.0003601315547712147, + "rewards/margins": -0.027908090502023697, + "rewards/rejected": 0.027547962963581085, + "step": 333 + }, + { + "epoch": 0.20362749580856576, + "grad_norm": 81.6015116007124, + "learning_rate": 1.426829268292683e-08, + "logits/chosen": 0.21915912628173828, + "logits/rejected": 0.12471703439950943, + "logps/chosen": -93.21151733398438, + "logps/rejected": -82.82246398925781, + "loss": 0.6794, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.018854821100831032, + "rewards/margins": -0.0010149849113076925, + "rewards/rejected": 0.01986980438232422, + "step": 334 + }, + { + "epoch": 0.20423715896966926, + "grad_norm": 70.07969810368989, + "learning_rate": 1.431219512195122e-08, + "logits/chosen": -0.07455149292945862, + "logits/rejected": 0.16817906498908997, + "logps/chosen": -255.03790283203125, + "logps/rejected": -149.12513732910156, + "loss": 0.6914, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0851854756474495, + "rewards/margins": 0.06319474428892136, + "rewards/rejected": 0.021990716457366943, + "step": 335 + }, + { + "epoch": 0.20484682213077274, + "grad_norm": 67.79996961976595, + "learning_rate": 1.435609756097561e-08, + "logits/chosen": 0.30746638774871826, + "logits/rejected": 0.26823362708091736, + "logps/chosen": -22.284900665283203, + "logps/rejected": -12.613653182983398, + "loss": 0.6869, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.015383643098175526, + "rewards/margins": 0.0022090603597462177, + "rewards/rejected": -0.01759270392358303, + "step": 336 + }, + { + "epoch": 0.20545648529187624, + "grad_norm": 71.5858654272021, + "learning_rate": 1.44e-08, + "logits/chosen": -0.04005648195743561, + "logits/rejected": 0.006720844656229019, + "logps/chosen": -45.05376434326172, + "logps/rejected": -51.12355422973633, + "loss": 0.6811, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.006362485699355602, + "rewards/margins": 0.001839661505073309, + "rewards/rejected": 0.004522824659943581, + "step": 337 + }, + { + "epoch": 0.20606614845297974, + "grad_norm": 60.123978078064084, + "learning_rate": 1.444390243902439e-08, + "logits/chosen": 0.18723182380199432, + 
"logits/rejected": 0.05063885822892189, + "logps/chosen": -242.62884521484375, + "logps/rejected": -243.73561096191406, + "loss": 0.6873, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.04879581928253174, + "rewards/margins": 0.027438664808869362, + "rewards/rejected": 0.021357156336307526, + "step": 338 + }, + { + "epoch": 0.20667581161408322, + "grad_norm": 73.62335435629042, + "learning_rate": 1.448780487804878e-08, + "logits/chosen": 0.24362905323505402, + "logits/rejected": 0.13164113461971283, + "logps/chosen": -291.8893737792969, + "logps/rejected": -349.28289794921875, + "loss": 0.6749, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.06466102600097656, + "rewards/margins": 0.020573971793055534, + "rewards/rejected": 0.04408705234527588, + "step": 339 + }, + { + "epoch": 0.20728547477518672, + "grad_norm": 69.46968495310414, + "learning_rate": 1.4531707317073172e-08, + "logits/chosen": -0.10782186686992645, + "logits/rejected": 0.057648010551929474, + "logps/chosen": -382.67706298828125, + "logps/rejected": -227.41738891601562, + "loss": 0.6807, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.11021074652671814, + "rewards/margins": 0.05586683005094528, + "rewards/rejected": 0.05434391647577286, + "step": 340 + }, + { + "epoch": 0.2078951379362902, + "grad_norm": 105.32468726144603, + "learning_rate": 1.4575609756097562e-08, + "logits/chosen": -0.10040725767612457, + "logits/rejected": 0.22399954497814178, + "logps/chosen": -131.145263671875, + "logps/rejected": -87.96915435791016, + "loss": 0.6862, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.009819794446229935, + "rewards/margins": 0.006483984645456076, + "rewards/rejected": 0.0033358102664351463, + "step": 341 + }, + { + "epoch": 0.2085048010973937, + "grad_norm": 68.16982295009444, + "learning_rate": 1.4619512195121952e-08, + "logits/chosen": -0.03772008419036865, + "logits/rejected": -0.08728692680597305, + "logps/chosen": -88.0912857055664, + "logps/rejected": -116.21687316894531, + "loss": 0.6763, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.0516599640250206, + "rewards/margins": -0.0033350931480526924, + "rewards/rejected": 0.054995059967041016, + "step": 342 + }, + { + "epoch": 0.20911446425849717, + "grad_norm": 71.28049059029516, + "learning_rate": 1.466341463414634e-08, + "logits/chosen": 0.22303928434848785, + "logits/rejected": 0.286670982837677, + "logps/chosen": -142.04025268554688, + "logps/rejected": -63.7384033203125, + "loss": 0.6719, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.04323422163724899, + "rewards/margins": 0.029265914112329483, + "rewards/rejected": 0.013968306593596935, + "step": 343 + }, + { + "epoch": 0.20972412741960067, + "grad_norm": 66.75179875324964, + "learning_rate": 1.470731707317073e-08, + "logits/chosen": 0.08236043155193329, + "logits/rejected": 0.10761090368032455, + "logps/chosen": -262.4383544921875, + "logps/rejected": -132.79490661621094, + "loss": 0.6843, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.019755559042096138, + "rewards/margins": 0.054811589419841766, + "rewards/rejected": -0.03505603224039078, + "step": 344 + }, + { + "epoch": 0.21033379058070417, + "grad_norm": 67.62567899403905, + "learning_rate": 1.475121951219512e-08, + "logits/chosen": 0.3225303590297699, + "logits/rejected": 0.08760485053062439, + "logps/chosen": -76.64276123046875, + "logps/rejected": -275.00244140625, + "loss": 0.6791, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0005946287419646978, + "rewards/margins": 0.036113083362579346, + "rewards/rejected": 
-0.03670771047472954, + "step": 345 + }, + { + "epoch": 0.21094345374180765, + "grad_norm": 71.01903716429102, + "learning_rate": 1.479512195121951e-08, + "logits/chosen": 0.31444257497787476, + "logits/rejected": 0.10715361684560776, + "logps/chosen": -62.70277786254883, + "logps/rejected": -155.2872314453125, + "loss": 0.679, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.039180826395750046, + "rewards/margins": 0.01031193882226944, + "rewards/rejected": 0.028868889436125755, + "step": 346 + }, + { + "epoch": 0.21155311690291115, + "grad_norm": 66.59902373176311, + "learning_rate": 1.48390243902439e-08, + "logits/chosen": 0.06511352956295013, + "logits/rejected": 0.040359482169151306, + "logps/chosen": -101.69937896728516, + "logps/rejected": -85.10595703125, + "loss": 0.6853, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0786714106798172, + "rewards/margins": 0.017423998564481735, + "rewards/rejected": 0.06124741584062576, + "step": 347 + }, + { + "epoch": 0.21216278006401462, + "grad_norm": 77.56987130285738, + "learning_rate": 1.4882926829268292e-08, + "logits/chosen": 0.3418797254562378, + "logits/rejected": 0.35460710525512695, + "logps/chosen": -8.70235824584961, + "logps/rejected": -13.812934875488281, + "loss": 0.6747, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.009386222809553146, + "rewards/margins": -0.0036907976027578115, + "rewards/rejected": -0.005695425905287266, + "step": 348 + }, + { + "epoch": 0.21277244322511812, + "grad_norm": 73.92616252372007, + "learning_rate": 1.492682926829268e-08, + "logits/chosen": 0.14430879056453705, + "logits/rejected": 0.04682805389165878, + "logps/chosen": -126.25191497802734, + "logps/rejected": -186.4823760986328, + "loss": 0.6746, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.015251537784934044, + "rewards/margins": -0.0009545590728521347, + "rewards/rejected": 0.016206098720431328, + "step": 349 + }, + { + "epoch": 0.21338210638622163, + "grad_norm": 74.28393494263427, + "learning_rate": 1.4970731707317072e-08, + "logits/chosen": 0.0976111888885498, + "logits/rejected": 0.3321661353111267, + "logps/chosen": -271.572265625, + "logps/rejected": -187.7233123779297, + "loss": 0.677, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08827683329582214, + "rewards/margins": 0.02552950382232666, + "rewards/rejected": 0.06274733692407608, + "step": 350 + }, + { + "epoch": 0.2139917695473251, + "grad_norm": 69.05233519059429, + "learning_rate": 1.501463414634146e-08, + "logits/chosen": 0.2835656702518463, + "logits/rejected": 0.24562017619609833, + "logps/chosen": -10.603927612304688, + "logps/rejected": -20.90781021118164, + "loss": 0.6745, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0011128070764243603, + "rewards/margins": 0.0070378538221120834, + "rewards/rejected": -0.005925048142671585, + "step": 351 + }, + { + "epoch": 0.2146014327084286, + "grad_norm": 59.37878833199465, + "learning_rate": 1.5058536585365852e-08, + "logits/chosen": 0.16556383669376373, + "logits/rejected": 0.16473199427127838, + "logps/chosen": -15.063851356506348, + "logps/rejected": -8.009671211242676, + "loss": 0.6867, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.003202187828719616, + "rewards/margins": 0.008544718846678734, + "rewards/rejected": -0.01174690667539835, + "step": 352 + }, + { + "epoch": 0.21521109586953208, + "grad_norm": 85.20699487189376, + "learning_rate": 1.5102439024390244e-08, + "logits/chosen": -0.09299083054065704, + "logits/rejected": -0.07927870750427246, + "logps/chosen": -249.5418243408203, + 
"logps/rejected": -218.46571350097656, + "loss": 0.6927, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.06072034686803818, + "rewards/margins": 0.00193779356777668, + "rewards/rejected": 0.05878255516290665, + "step": 353 + }, + { + "epoch": 0.21582075903063558, + "grad_norm": 69.44085781934815, + "learning_rate": 1.5146341463414632e-08, + "logits/chosen": 0.20161965489387512, + "logits/rejected": 0.27443933486938477, + "logps/chosen": -60.126609802246094, + "logps/rejected": -105.94583129882812, + "loss": 0.6863, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.032499201595783234, + "rewards/margins": 0.0038094576448202133, + "rewards/rejected": 0.02868974395096302, + "step": 354 + }, + { + "epoch": 0.21643042219173905, + "grad_norm": 86.77175274631634, + "learning_rate": 1.5190243902439024e-08, + "logits/chosen": 0.3397376239299774, + "logits/rejected": 0.49842071533203125, + "logps/chosen": -145.3336944580078, + "logps/rejected": -133.59646606445312, + "loss": 0.6798, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05712239816784859, + "rewards/margins": 0.03735244274139404, + "rewards/rejected": 0.019769955426454544, + "step": 355 + }, + { + "epoch": 0.21704008535284255, + "grad_norm": 70.30705893545047, + "learning_rate": 1.5234146341463412e-08, + "logits/chosen": 0.3698701858520508, + "logits/rejected": 0.37715011835098267, + "logps/chosen": -201.96597290039062, + "logps/rejected": -180.94497680664062, + "loss": 0.676, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.129371777176857, + "rewards/margins": 0.12676672637462616, + "rewards/rejected": 0.002605058252811432, + "step": 356 + }, + { + "epoch": 0.21764974851394606, + "grad_norm": 72.03748108737564, + "learning_rate": 1.5278048780487804e-08, + "logits/chosen": -0.01042770966887474, + "logits/rejected": -0.040296003222465515, + "logps/chosen": -85.13925170898438, + "logps/rejected": -87.65917205810547, + "loss": 0.6762, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.018222618848085403, + "rewards/margins": 0.006572199985384941, + "rewards/rejected": 0.011650418862700462, + "step": 357 + }, + { + "epoch": 0.21825941167504953, + "grad_norm": 71.02458108213027, + "learning_rate": 1.5321951219512196e-08, + "logits/chosen": 0.37640199065208435, + "logits/rejected": 0.3643328547477722, + "logps/chosen": -110.40522003173828, + "logps/rejected": -125.43128204345703, + "loss": 0.6783, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04483679682016373, + "rewards/margins": 0.037366390228271484, + "rewards/rejected": 0.0074704051949083805, + "step": 358 + }, + { + "epoch": 0.21886907483615303, + "grad_norm": 72.27060883103734, + "learning_rate": 1.5365853658536584e-08, + "logits/chosen": -0.029051396995782852, + "logits/rejected": 0.003680701367557049, + "logps/chosen": -196.9068603515625, + "logps/rejected": -92.90815734863281, + "loss": 0.6823, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.07214992493391037, + "rewards/margins": 0.01639285311102867, + "rewards/rejected": 0.0557570680975914, + "step": 359 + }, + { + "epoch": 0.2194787379972565, + "grad_norm": 75.53555099638058, + "learning_rate": 1.5409756097560976e-08, + "logits/chosen": 0.1217944324016571, + "logits/rejected": 0.14078305661678314, + "logps/chosen": -194.9487762451172, + "logps/rejected": -117.2947769165039, + "loss": 0.6762, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.06772525608539581, + "rewards/margins": 0.05223900079727173, + "rewards/rejected": 0.015486260876059532, + "step": 360 + }, + { + "epoch": 0.22008840115836, + "grad_norm": 
71.75572742756489, + "learning_rate": 1.5453658536585364e-08, + "logits/chosen": 0.26062798500061035, + "logits/rejected": 0.041195519268512726, + "logps/chosen": -79.91947937011719, + "logps/rejected": -109.20831298828125, + "loss": 0.6946, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.061104319989681244, + "rewards/margins": 0.05685047060251236, + "rewards/rejected": 0.004253853112459183, + "step": 361 + }, + { + "epoch": 0.22069806431946348, + "grad_norm": 62.98288199759064, + "learning_rate": 1.5497560975609756e-08, + "logits/chosen": 0.16532793641090393, + "logits/rejected": 0.15597744286060333, + "logps/chosen": -42.087825775146484, + "logps/rejected": -80.29241180419922, + "loss": 0.6775, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.029199182987213135, + "rewards/margins": 0.02532096952199936, + "rewards/rejected": 0.0038782134652137756, + "step": 362 + }, + { + "epoch": 0.22130772748056698, + "grad_norm": 75.18737218514192, + "learning_rate": 1.5541463414634144e-08, + "logits/chosen": 0.09273873269557953, + "logits/rejected": 0.1297970712184906, + "logps/chosen": -81.83040618896484, + "logps/rejected": -37.91920852661133, + "loss": 0.6743, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.03654135391116142, + "rewards/margins": 0.022258350625634193, + "rewards/rejected": 0.014283007010817528, + "step": 363 + }, + { + "epoch": 0.2219173906416705, + "grad_norm": 76.11977534284549, + "learning_rate": 1.5585365853658536e-08, + "logits/chosen": 0.16321827471256256, + "logits/rejected": 0.343872994184494, + "logps/chosen": -384.1506042480469, + "logps/rejected": -154.103515625, + "loss": 0.664, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.16939716041088104, + "rewards/margins": 0.1677771955728531, + "rewards/rejected": 0.001619958784431219, + "step": 364 + }, + { + "epoch": 0.22252705380277396, + "grad_norm": 71.43880381699847, + "learning_rate": 1.5629268292682927e-08, + "logits/chosen": 0.12354715168476105, + "logits/rejected": 0.18932867050170898, + "logps/chosen": -381.6833801269531, + "logps/rejected": -319.3189392089844, + "loss": 0.665, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.15169011056423187, + "rewards/margins": 0.1057649627327919, + "rewards/rejected": 0.045925140380859375, + "step": 365 + }, + { + "epoch": 0.22313671696387746, + "grad_norm": 70.59839888869874, + "learning_rate": 1.5673170731707316e-08, + "logits/chosen": 0.3457193374633789, + "logits/rejected": 0.35894274711608887, + "logps/chosen": -77.30677032470703, + "logps/rejected": -94.93476104736328, + "loss": 0.6809, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.052020955830812454, + "rewards/margins": 0.010834289714694023, + "rewards/rejected": 0.04118666425347328, + "step": 366 + }, + { + "epoch": 0.22374638012498094, + "grad_norm": 66.61954489334738, + "learning_rate": 1.5717073170731707e-08, + "logits/chosen": -0.16386066377162933, + "logits/rejected": 0.1188196912407875, + "logps/chosen": -337.9201354980469, + "logps/rejected": -212.218017578125, + "loss": 0.6802, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0663379654288292, + "rewards/margins": 0.06414184719324112, + "rewards/rejected": 0.002196121495217085, + "step": 367 + }, + { + "epoch": 0.22435604328608444, + "grad_norm": 66.95436799361724, + "learning_rate": 1.5760975609756096e-08, + "logits/chosen": 0.2824529707431793, + "logits/rejected": -0.034459665417671204, + "logps/chosen": -44.80812072753906, + "logps/rejected": -98.14501190185547, + "loss": 0.6842, + "rewards/accuracies": 0.25, + "rewards/chosen": 
-0.013245273381471634, + "rewards/margins": -0.02863171324133873, + "rewards/rejected": 0.015386438928544521, + "step": 368 + }, + { + "epoch": 0.22496570644718794, + "grad_norm": 73.92873087284248, + "learning_rate": 1.5804878048780488e-08, + "logits/chosen": 0.2049051970243454, + "logits/rejected": 0.2036842703819275, + "logps/chosen": -23.559926986694336, + "logps/rejected": -19.191795349121094, + "loss": 0.6805, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.00024777118233032525, + "rewards/margins": 0.0016170772723853588, + "rewards/rejected": -0.001864847494289279, + "step": 369 + }, + { + "epoch": 0.22557536960829142, + "grad_norm": 76.3516607294514, + "learning_rate": 1.5848780487804876e-08, + "logits/chosen": -0.022610221058130264, + "logits/rejected": 0.06078797206282616, + "logps/chosen": -238.5109100341797, + "logps/rejected": -150.67947387695312, + "loss": 0.6752, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06348590552806854, + "rewards/margins": 0.072443388402462, + "rewards/rejected": -0.00895748008042574, + "step": 370 + }, + { + "epoch": 0.22618503276939492, + "grad_norm": 71.07269983204453, + "learning_rate": 1.5892682926829268e-08, + "logits/chosen": -0.17673306167125702, + "logits/rejected": -0.04492383077740669, + "logps/chosen": -212.25999450683594, + "logps/rejected": -166.0389862060547, + "loss": 0.677, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0878465548157692, + "rewards/margins": 0.058984603732824326, + "rewards/rejected": 0.028861945495009422, + "step": 371 + }, + { + "epoch": 0.2267946959304984, + "grad_norm": 62.117189746721714, + "learning_rate": 1.593658536585366e-08, + "logits/chosen": 0.2229243814945221, + "logits/rejected": 0.19021424651145935, + "logps/chosen": -104.9860610961914, + "logps/rejected": -148.7133026123047, + "loss": 0.6891, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.06309252977371216, + "rewards/margins": 0.04245776683092117, + "rewards/rejected": 0.020634770393371582, + "step": 372 + }, + { + "epoch": 0.2274043590916019, + "grad_norm": 71.22364328481646, + "learning_rate": 1.5980487804878048e-08, + "logits/chosen": 0.1851186901330948, + "logits/rejected": 0.1108449250459671, + "logps/chosen": -229.87127685546875, + "logps/rejected": -267.2801818847656, + "loss": 0.6803, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.12824177742004395, + "rewards/margins": 0.07881484925746918, + "rewards/rejected": 0.049426935613155365, + "step": 373 + }, + { + "epoch": 0.22801402225270537, + "grad_norm": 64.10057772002011, + "learning_rate": 1.602439024390244e-08, + "logits/chosen": 0.41635462641716003, + "logits/rejected": 0.4206591546535492, + "logps/chosen": -9.9085054397583, + "logps/rejected": -3.9580512046813965, + "loss": 0.6759, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.008948644623160362, + "rewards/margins": 0.01165156438946724, + "rewards/rejected": -0.02060021087527275, + "step": 374 + }, + { + "epoch": 0.22862368541380887, + "grad_norm": 67.90557059778888, + "learning_rate": 1.6068292682926828e-08, + "logits/chosen": 0.10314898192882538, + "logits/rejected": 0.0605747364461422, + "logps/chosen": -203.1617889404297, + "logps/rejected": -161.07518005371094, + "loss": 0.678, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.1218641996383667, + "rewards/margins": 0.07133705914020538, + "rewards/rejected": 0.050527140498161316, + "step": 375 + }, + { + "epoch": 0.22923334857491237, + "grad_norm": 76.57097598943874, + "learning_rate": 1.611219512195122e-08, + "logits/chosen": -0.10780695080757141, + 
"logits/rejected": 0.17631042003631592, + "logps/chosen": -318.8275451660156, + "logps/rejected": -158.26197814941406, + "loss": 0.6662, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0654771700501442, + "rewards/margins": 0.05782622471451759, + "rewards/rejected": 0.0076509444043040276, + "step": 376 + }, + { + "epoch": 0.22984301173601585, + "grad_norm": 64.66974887707583, + "learning_rate": 1.6156097560975608e-08, + "logits/chosen": 0.2233721911907196, + "logits/rejected": 0.25299790501594543, + "logps/chosen": -89.06605529785156, + "logps/rejected": -87.20704650878906, + "loss": 0.6798, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.043851472437381744, + "rewards/margins": 0.0646313801407814, + "rewards/rejected": -0.020779911428689957, + "step": 377 + }, + { + "epoch": 0.23045267489711935, + "grad_norm": 68.70294004326945, + "learning_rate": 1.62e-08, + "logits/chosen": 0.015518620610237122, + "logits/rejected": 0.04770684242248535, + "logps/chosen": -122.88348388671875, + "logps/rejected": -90.29814910888672, + "loss": 0.6755, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.042027123272418976, + "rewards/margins": -0.0038461170624941587, + "rewards/rejected": 0.045873235911130905, + "step": 378 + }, + { + "epoch": 0.23106233805822282, + "grad_norm": 69.59769440231553, + "learning_rate": 1.624390243902439e-08, + "logits/chosen": 0.13944903016090393, + "logits/rejected": 0.08162405341863632, + "logps/chosen": -22.848159790039062, + "logps/rejected": -26.46254539489746, + "loss": 0.6773, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0193781740963459, + "rewards/margins": 0.031845081597566605, + "rewards/rejected": -0.012466907501220703, + "step": 379 + }, + { + "epoch": 0.23167200121932632, + "grad_norm": 86.15023573147026, + "learning_rate": 1.628780487804878e-08, + "logits/chosen": 0.01964738965034485, + "logits/rejected": 0.08549747616052628, + "logps/chosen": -315.42120361328125, + "logps/rejected": -250.3840789794922, + "loss": 0.6834, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.1753004491329193, + "rewards/margins": 0.042230166494846344, + "rewards/rejected": 0.13307029008865356, + "step": 380 + }, + { + "epoch": 0.23228166438042983, + "grad_norm": 73.90575796165558, + "learning_rate": 1.633170731707317e-08, + "logits/chosen": 0.2518499493598938, + "logits/rejected": 0.20044390857219696, + "logps/chosen": -282.0683898925781, + "logps/rejected": -402.50018310546875, + "loss": 0.6604, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2078959345817566, + "rewards/margins": 0.07857249677181244, + "rewards/rejected": 0.12932342290878296, + "step": 381 + }, + { + "epoch": 0.2328913275415333, + "grad_norm": 65.02101991761026, + "learning_rate": 1.637560975609756e-08, + "logits/chosen": 0.12615638971328735, + "logits/rejected": -0.0017370134592056274, + "logps/chosen": -144.21067810058594, + "logps/rejected": -175.3563690185547, + "loss": 0.6883, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.021349143236875534, + "rewards/margins": -0.027446461841464043, + "rewards/rejected": 0.048795606940984726, + "step": 382 + }, + { + "epoch": 0.2335009907026368, + "grad_norm": 84.86151834266377, + "learning_rate": 1.641951219512195e-08, + "logits/chosen": 0.08340620249509811, + "logits/rejected": 0.09593676775693893, + "logps/chosen": -173.072021484375, + "logps/rejected": -201.46633911132812, + "loss": 0.6543, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.15591296553611755, + "rewards/margins": 0.13049039244651794, + "rewards/rejected": 0.025422584265470505, + 
"step": 383 + }, + { + "epoch": 0.23411065386374028, + "grad_norm": 66.10244136306407, + "learning_rate": 1.646341463414634e-08, + "logits/chosen": -0.07609017938375473, + "logits/rejected": -0.06766972690820694, + "logps/chosen": -85.554931640625, + "logps/rejected": -83.42675018310547, + "loss": 0.6744, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0532434917986393, + "rewards/margins": 0.0021083245519548655, + "rewards/rejected": 0.05113516375422478, + "step": 384 + }, + { + "epoch": 0.23472031702484378, + "grad_norm": 67.2842157885265, + "learning_rate": 1.650731707317073e-08, + "logits/chosen": 0.044176261872053146, + "logits/rejected": 0.2498132288455963, + "logps/chosen": -93.33272552490234, + "logps/rejected": -56.10679244995117, + "loss": 0.6712, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.06030142307281494, + "rewards/margins": 0.03469962999224663, + "rewards/rejected": 0.025601793080568314, + "step": 385 + }, + { + "epoch": 0.23532998018594725, + "grad_norm": 62.52062745782673, + "learning_rate": 1.6551219512195123e-08, + "logits/chosen": 0.17597870528697968, + "logits/rejected": 0.23001301288604736, + "logps/chosen": -23.035144805908203, + "logps/rejected": -38.43086242675781, + "loss": 0.6969, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.021736489608883858, + "rewards/margins": -0.03424207493662834, + "rewards/rejected": 0.01250558439642191, + "step": 386 + }, + { + "epoch": 0.23593964334705075, + "grad_norm": 75.00696126073979, + "learning_rate": 1.659512195121951e-08, + "logits/chosen": 0.054178833961486816, + "logits/rejected": 0.8317170739173889, + "logps/chosen": -294.10821533203125, + "logps/rejected": -207.20391845703125, + "loss": 0.6686, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.02296428754925728, + "rewards/margins": 0.02375636249780655, + "rewards/rejected": -0.0007920744828879833, + "step": 387 + }, + { + "epoch": 0.23654930650815426, + "grad_norm": 69.94571772040307, + "learning_rate": 1.6639024390243903e-08, + "logits/chosen": 0.13065457344055176, + "logits/rejected": 0.1328984498977661, + "logps/chosen": -162.05563354492188, + "logps/rejected": -255.76258850097656, + "loss": 0.6675, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.08216194808483124, + "rewards/margins": 0.043298520147800446, + "rewards/rejected": 0.038863420486450195, + "step": 388 + }, + { + "epoch": 0.23715896966925773, + "grad_norm": 69.62954136859975, + "learning_rate": 1.668292682926829e-08, + "logits/chosen": 0.2529556155204773, + "logits/rejected": 0.1152164563536644, + "logps/chosen": -268.89447021484375, + "logps/rejected": -566.7929077148438, + "loss": 0.6693, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.11476411670446396, + "rewards/margins": 0.05900774151086807, + "rewards/rejected": 0.05575638264417648, + "step": 389 + }, + { + "epoch": 0.23776863283036123, + "grad_norm": 70.59570631057309, + "learning_rate": 1.6726829268292683e-08, + "logits/chosen": 0.21214380860328674, + "logits/rejected": 0.23615749180316925, + "logps/chosen": -27.17583465576172, + "logps/rejected": -41.58104705810547, + "loss": 0.6761, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.024480272084474564, + "rewards/margins": -0.029336892068386078, + "rewards/rejected": 0.05381716042757034, + "step": 390 + }, + { + "epoch": 0.2383782959914647, + "grad_norm": 62.53230766531854, + "learning_rate": 1.677073170731707e-08, + "logits/chosen": 0.04014543443918228, + "logits/rejected": 0.1318899691104889, + "logps/chosen": -296.1917724609375, + "logps/rejected": 
-224.27108764648438, + "loss": 0.6781, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14381027221679688, + "rewards/margins": 0.09685342758893967, + "rewards/rejected": 0.046956852078437805, + "step": 391 + }, + { + "epoch": 0.2389879591525682, + "grad_norm": 66.55683699515184, + "learning_rate": 1.6814634146341463e-08, + "logits/chosen": 0.32010552287101746, + "logits/rejected": 0.2512872517108917, + "logps/chosen": -69.6078872680664, + "logps/rejected": -126.6366195678711, + "loss": 0.663, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.012041234411299229, + "rewards/margins": 0.0030512893572449684, + "rewards/rejected": 0.008989945985376835, + "step": 392 + }, + { + "epoch": 0.2395976223136717, + "grad_norm": 69.45333612941845, + "learning_rate": 1.6858536585365855e-08, + "logits/chosen": -0.005228646099567413, + "logits/rejected": 0.503351092338562, + "logps/chosen": -258.2673645019531, + "logps/rejected": -256.77728271484375, + "loss": 0.6708, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11970634758472443, + "rewards/margins": 0.04186611622571945, + "rewards/rejected": 0.07784023135900497, + "step": 393 + }, + { + "epoch": 0.24020728547477518, + "grad_norm": 56.83502852210459, + "learning_rate": 1.6902439024390243e-08, + "logits/chosen": -0.14765405654907227, + "logits/rejected": -0.25918394327163696, + "logps/chosen": -96.93010711669922, + "logps/rejected": -176.19113159179688, + "loss": 0.6733, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04271545261144638, + "rewards/margins": 0.043752267956733704, + "rewards/rejected": -0.0010368103394284844, + "step": 394 + }, + { + "epoch": 0.24081694863587869, + "grad_norm": 77.91111017133723, + "learning_rate": 1.6946341463414632e-08, + "logits/chosen": 0.30944663286209106, + "logits/rejected": 0.21470560133457184, + "logps/chosen": -419.9107666015625, + "logps/rejected": -390.0615234375, + "loss": 0.6809, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.09270159155130386, + "rewards/margins": 0.015946976840496063, + "rewards/rejected": 0.0767546147108078, + "step": 395 + }, + { + "epoch": 0.24142661179698216, + "grad_norm": 68.24931704160569, + "learning_rate": 1.6990243902439023e-08, + "logits/chosen": 0.1575380265712738, + "logits/rejected": 0.1915324479341507, + "logps/chosen": -26.391529083251953, + "logps/rejected": -25.220199584960938, + "loss": 0.6854, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.016043771058321, + "rewards/margins": 0.008305154740810394, + "rewards/rejected": -0.024348925799131393, + "step": 396 + }, + { + "epoch": 0.24203627495808566, + "grad_norm": 58.7737804792128, + "learning_rate": 1.7034146341463412e-08, + "logits/chosen": 0.28481295704841614, + "logits/rejected": 0.21050994098186493, + "logps/chosen": -72.94969940185547, + "logps/rejected": -151.31069946289062, + "loss": 0.6659, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04740132391452789, + "rewards/margins": 0.06847492605447769, + "rewards/rejected": -0.021073605865240097, + "step": 397 + }, + { + "epoch": 0.24264593811918914, + "grad_norm": 75.02976906364937, + "learning_rate": 1.7078048780487803e-08, + "logits/chosen": 0.16953998804092407, + "logits/rejected": 0.38476526737213135, + "logps/chosen": -152.58050537109375, + "logps/rejected": -50.43113708496094, + "loss": 0.6726, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.05665000155568123, + "rewards/margins": 0.06928949803113937, + "rewards/rejected": -0.012639498338103294, + "step": 398 + }, + { + "epoch": 0.24325560128029264, + "grad_norm": 75.03590741770022, + 
"learning_rate": 1.7121951219512192e-08, + "logits/chosen": 0.15704068541526794, + "logits/rejected": 0.20390085875988007, + "logps/chosen": -290.34130859375, + "logps/rejected": -163.56857299804688, + "loss": 0.6644, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.11936527490615845, + "rewards/margins": 0.08607812225818634, + "rewards/rejected": 0.03328714519739151, + "step": 399 + }, + { + "epoch": 0.24386526444139614, + "grad_norm": 63.37300742176534, + "learning_rate": 1.7165853658536583e-08, + "logits/chosen": 0.04996849596500397, + "logits/rejected": 0.44465917348861694, + "logps/chosen": -398.497802734375, + "logps/rejected": -296.1901550292969, + "loss": 0.6832, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.18099498748779297, + "rewards/margins": 0.09929068386554718, + "rewards/rejected": 0.08170431107282639, + "step": 400 + }, + { + "epoch": 0.24447492760249961, + "grad_norm": 66.90625226276222, + "learning_rate": 1.7209756097560975e-08, + "logits/chosen": 0.09004481881856918, + "logits/rejected": 0.13244619965553284, + "logps/chosen": -120.15802001953125, + "logps/rejected": -104.40956115722656, + "loss": 0.6586, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.13799571990966797, + "rewards/margins": 0.06407812237739563, + "rewards/rejected": 0.07391760498285294, + "step": 401 + }, + { + "epoch": 0.24508459076360312, + "grad_norm": 57.48359214891604, + "learning_rate": 1.7253658536585364e-08, + "logits/chosen": 0.0856451690196991, + "logits/rejected": 0.1392892301082611, + "logps/chosen": -9.377010345458984, + "logps/rejected": -14.292369842529297, + "loss": 0.6821, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0027606389485299587, + "rewards/margins": -0.01972278766334057, + "rewards/rejected": 0.022483427077531815, + "step": 402 + }, + { + "epoch": 0.2456942539247066, + "grad_norm": 66.60343709678001, + "learning_rate": 1.7297560975609755e-08, + "logits/chosen": 0.3190247714519501, + "logits/rejected": 0.0962214469909668, + "logps/chosen": -95.74803161621094, + "logps/rejected": -177.26490783691406, + "loss": 0.6713, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.011794280260801315, + "rewards/margins": -0.022473935037851334, + "rewards/rejected": 0.010679653845727444, + "step": 403 + }, + { + "epoch": 0.2463039170858101, + "grad_norm": 69.51953007957174, + "learning_rate": 1.7341463414634144e-08, + "logits/chosen": 0.2589119076728821, + "logits/rejected": 0.4490640461444855, + "logps/chosen": -448.5097351074219, + "logps/rejected": -340.63507080078125, + "loss": 0.6583, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.17544841766357422, + "rewards/margins": 0.13995066285133362, + "rewards/rejected": 0.0354977622628212, + "step": 404 + }, + { + "epoch": 0.24691358024691357, + "grad_norm": 83.31258597050335, + "learning_rate": 1.7385365853658535e-08, + "logits/chosen": -0.07003955543041229, + "logits/rejected": 0.3138444721698761, + "logps/chosen": -391.0108642578125, + "logps/rejected": -156.393310546875, + "loss": 0.6746, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1309162825345993, + "rewards/margins": 0.12338415533304214, + "rewards/rejected": 0.007532132789492607, + "step": 405 + }, + { + "epoch": 0.24752324340801707, + "grad_norm": 66.03022550267046, + "learning_rate": 1.7429268292682927e-08, + "logits/chosen": 0.07335321605205536, + "logits/rejected": 0.08494667708873749, + "logps/chosen": -99.77315521240234, + "logps/rejected": -66.1137466430664, + "loss": 0.6663, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.046557843685150146, + 
"rewards/margins": 0.09241703897714615, + "rewards/rejected": -0.045859195291996, + "step": 406 + }, + { + "epoch": 0.24813290656912057, + "grad_norm": 74.775088673753, + "learning_rate": 1.7473170731707315e-08, + "logits/chosen": 0.24731683731079102, + "logits/rejected": 0.03129180520772934, + "logps/chosen": -211.49945068359375, + "logps/rejected": -364.1927490234375, + "loss": 0.6771, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1337459683418274, + "rewards/margins": 0.1884436011314392, + "rewards/rejected": -0.05469761788845062, + "step": 407 + }, + { + "epoch": 0.24874256973022404, + "grad_norm": 63.2436985259422, + "learning_rate": 1.7517073170731707e-08, + "logits/chosen": 0.2944685220718384, + "logits/rejected": 0.16938583552837372, + "logps/chosen": -60.469932556152344, + "logps/rejected": -226.4971466064453, + "loss": 0.6697, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.01250808872282505, + "rewards/margins": -0.03638029098510742, + "rewards/rejected": 0.048888374119997025, + "step": 408 + }, + { + "epoch": 0.24935223289132755, + "grad_norm": 64.69668791721564, + "learning_rate": 1.7560975609756095e-08, + "logits/chosen": 0.19941283762454987, + "logits/rejected": 0.2230873554944992, + "logps/chosen": -187.40347290039062, + "logps/rejected": -127.78536224365234, + "loss": 0.6603, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.035814762115478516, + "rewards/margins": 0.028313683345913887, + "rewards/rejected": 0.007501076906919479, + "step": 409 + }, + { + "epoch": 0.24996189605243102, + "grad_norm": 64.72994759177746, + "learning_rate": 1.7604878048780487e-08, + "logits/chosen": 0.09028814733028412, + "logits/rejected": 0.12085546553134918, + "logps/chosen": -79.18751525878906, + "logps/rejected": -212.34779357910156, + "loss": 0.6641, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.02626047283411026, + "rewards/margins": 0.050155192613601685, + "rewards/rejected": -0.023894723504781723, + "step": 410 + }, + { + "epoch": 0.2505715592135345, + "grad_norm": 80.73701959991998, + "learning_rate": 1.7648780487804875e-08, + "logits/chosen": 0.21164226531982422, + "logits/rejected": 0.12578725814819336, + "logps/chosen": -24.69331932067871, + "logps/rejected": -62.57676696777344, + "loss": 0.6619, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.017589306458830833, + "rewards/margins": 0.0014315862208604813, + "rewards/rejected": 0.0161577221006155, + "step": 411 + }, + { + "epoch": 0.251181222374638, + "grad_norm": 66.00444890188646, + "learning_rate": 1.7692682926829267e-08, + "logits/chosen": 0.07404673099517822, + "logits/rejected": -0.03831756114959717, + "logps/chosen": -107.595947265625, + "logps/rejected": -138.1403350830078, + "loss": 0.6817, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.07385444641113281, + "rewards/margins": 0.013393595814704895, + "rewards/rejected": 0.060460854321718216, + "step": 412 + }, + { + "epoch": 0.2517908855357415, + "grad_norm": 64.14888621824818, + "learning_rate": 1.773658536585366e-08, + "logits/chosen": -0.05230114609003067, + "logits/rejected": 0.33488669991493225, + "logps/chosen": -127.23094177246094, + "logps/rejected": -57.00074005126953, + "loss": 0.673, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.038622189313173294, + "rewards/margins": 0.03555377945303917, + "rewards/rejected": 0.0030684114899486303, + "step": 413 + }, + { + "epoch": 0.252400548696845, + "grad_norm": 64.3480813371628, + "learning_rate": 1.7780487804878047e-08, + "logits/chosen": -0.13973070681095123, + "logits/rejected": 
0.19591814279556274, + "logps/chosen": -174.74899291992188, + "logps/rejected": -186.95013427734375, + "loss": 0.6711, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.031192690134048462, + "rewards/margins": -0.034893058240413666, + "rewards/rejected": 0.06608574837446213, + "step": 414 + }, + { + "epoch": 0.2530102118579485, + "grad_norm": 73.86086587703944, + "learning_rate": 1.782439024390244e-08, + "logits/chosen": -0.09877456724643707, + "logits/rejected": 0.1697251796722412, + "logps/chosen": -358.82391357421875, + "logps/rejected": -161.81329345703125, + "loss": 0.6589, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1746138632297516, + "rewards/margins": 0.12531642615795135, + "rewards/rejected": 0.049297429621219635, + "step": 415 + }, + { + "epoch": 0.253619875019052, + "grad_norm": 61.51659876037952, + "learning_rate": 1.7868292682926827e-08, + "logits/chosen": -0.07529552280902863, + "logits/rejected": 0.021304693073034286, + "logps/chosen": -74.10441589355469, + "logps/rejected": -82.88666534423828, + "loss": 0.6735, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.015579151920974255, + "rewards/margins": -0.00527801550924778, + "rewards/rejected": 0.02085716649889946, + "step": 416 + }, + { + "epoch": 0.2542295381801555, + "grad_norm": 75.4546435785866, + "learning_rate": 1.791219512195122e-08, + "logits/chosen": -0.0038693025708198547, + "logits/rejected": 0.17084497213363647, + "logps/chosen": -217.5841064453125, + "logps/rejected": -302.18634033203125, + "loss": 0.6868, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.09169816970825195, + "rewards/margins": -0.04251394420862198, + "rewards/rejected": 0.13421212136745453, + "step": 417 + }, + { + "epoch": 0.254839201341259, + "grad_norm": 66.82138942843116, + "learning_rate": 1.7956097560975607e-08, + "logits/chosen": 0.3456932306289673, + "logits/rejected": 0.2908247113227844, + "logps/chosen": -112.54121398925781, + "logps/rejected": -72.51783752441406, + "loss": 0.6716, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.14502708613872528, + "rewards/margins": 0.122799813747406, + "rewards/rejected": 0.022227276116609573, + "step": 418 + }, + { + "epoch": 0.2554488645023624, + "grad_norm": 72.92019462743787, + "learning_rate": 1.8e-08, + "logits/chosen": 0.12400247156620026, + "logits/rejected": 0.3096410930156708, + "logps/chosen": -290.733642578125, + "logps/rejected": -235.09213256835938, + "loss": 0.6684, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.10874108970165253, + "rewards/margins": 0.025292323902249336, + "rewards/rejected": 0.08344876766204834, + "step": 419 + }, + { + "epoch": 0.25605852766346593, + "grad_norm": 70.71468075252082, + "learning_rate": 1.804390243902439e-08, + "logits/chosen": -0.012874945998191833, + "logits/rejected": -0.034830302000045776, + "logps/chosen": -273.19000244140625, + "logps/rejected": -224.76705932617188, + "loss": 0.6732, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2105557769536972, + "rewards/margins": 0.23022204637527466, + "rewards/rejected": -0.019666265696287155, + "step": 420 + }, + { + "epoch": 0.25666819082456943, + "grad_norm": 74.03103411357598, + "learning_rate": 1.808780487804878e-08, + "logits/chosen": 0.25390613079071045, + "logits/rejected": 0.24167406558990479, + "logps/chosen": -94.19416046142578, + "logps/rejected": -97.90225982666016, + "loss": 0.6549, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.032872818410396576, + "rewards/margins": 0.006419522687792778, + "rewards/rejected": 0.026453299447894096, + "step": 421 + }, + { + 
"epoch": 0.25727785398567293, + "grad_norm": 70.28787189866513, + "learning_rate": 1.813170731707317e-08, + "logits/chosen": -0.1942972093820572, + "logits/rejected": -0.15210787951946259, + "logps/chosen": -132.333984375, + "logps/rejected": -113.71392059326172, + "loss": 0.6773, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.02886814996600151, + "rewards/margins": 0.03950446844100952, + "rewards/rejected": -0.010636317543685436, + "step": 422 + }, + { + "epoch": 0.2578875171467764, + "grad_norm": 79.69632773166165, + "learning_rate": 1.817560975609756e-08, + "logits/chosen": 0.2247251272201538, + "logits/rejected": 0.31595107913017273, + "logps/chosen": -108.94129943847656, + "logps/rejected": -365.61224365234375, + "loss": 0.6546, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.038946785032749176, + "rewards/margins": 0.04372507333755493, + "rewards/rejected": -0.004778290167450905, + "step": 423 + }, + { + "epoch": 0.2584971803078799, + "grad_norm": 63.99624156445586, + "learning_rate": 1.821951219512195e-08, + "logits/chosen": 0.11942100524902344, + "logits/rejected": 0.06516227126121521, + "logps/chosen": -32.335792541503906, + "logps/rejected": -79.2452392578125, + "loss": 0.6578, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.007939141243696213, + "rewards/margins": -0.01192130520939827, + "rewards/rejected": 0.019860446453094482, + "step": 424 + }, + { + "epoch": 0.2591068434689834, + "grad_norm": 69.47093923455672, + "learning_rate": 1.826341463414634e-08, + "logits/chosen": 0.06378969550132751, + "logits/rejected": 0.34526288509368896, + "logps/chosen": -209.16851806640625, + "logps/rejected": -113.60540771484375, + "loss": 0.6595, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.12819691002368927, + "rewards/margins": 0.13993854820728302, + "rewards/rejected": -0.01174163818359375, + "step": 425 + }, + { + "epoch": 0.2597165066300869, + "grad_norm": 68.55476273419893, + "learning_rate": 1.830731707317073e-08, + "logits/chosen": 0.14273546636104584, + "logits/rejected": 0.4567739963531494, + "logps/chosen": -133.89926147460938, + "logps/rejected": -53.5922966003418, + "loss": 0.6645, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.04424731805920601, + "rewards/margins": 0.09210419654846191, + "rewards/rejected": -0.047856878489255905, + "step": 426 + }, + { + "epoch": 0.2603261697911904, + "grad_norm": 69.5881777755876, + "learning_rate": 1.8351219512195123e-08, + "logits/chosen": -0.07335227727890015, + "logits/rejected": -0.1167667955160141, + "logps/chosen": -277.70880126953125, + "logps/rejected": -338.48626708984375, + "loss": 0.67, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.17319995164871216, + "rewards/margins": 0.042127128690481186, + "rewards/rejected": 0.13107281923294067, + "step": 427 + }, + { + "epoch": 0.26093583295229383, + "grad_norm": 67.29761472890893, + "learning_rate": 1.839512195121951e-08, + "logits/chosen": 0.4153785705566406, + "logits/rejected": 0.3026084899902344, + "logps/chosen": -182.0535888671875, + "logps/rejected": -243.2552490234375, + "loss": 0.6388, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.13951362669467926, + "rewards/margins": 0.31202802062034607, + "rewards/rejected": -0.17251437902450562, + "step": 428 + }, + { + "epoch": 0.26154549611339734, + "grad_norm": 64.93182010188912, + "learning_rate": 1.8439024390243903e-08, + "logits/chosen": -0.04005971923470497, + "logits/rejected": -0.051459453999996185, + "logps/chosen": -34.133052825927734, + "logps/rejected": -48.049068450927734, + "loss": 0.6726, + 
"rewards/accuracies": 0.5, + "rewards/chosen": 0.01578432321548462, + "rewards/margins": -0.005022215656936169, + "rewards/rejected": 0.020806537941098213, + "step": 429 + }, + { + "epoch": 0.26215515927450084, + "grad_norm": 66.63677062463813, + "learning_rate": 1.848292682926829e-08, + "logits/chosen": 0.14951828122138977, + "logits/rejected": 0.12362108379602432, + "logps/chosen": -304.57733154296875, + "logps/rejected": -325.12017822265625, + "loss": 0.6634, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.29309120774269104, + "rewards/margins": 0.20215153694152832, + "rewards/rejected": 0.09093966335058212, + "step": 430 + }, + { + "epoch": 0.26276482243560434, + "grad_norm": 68.94318829626472, + "learning_rate": 1.8526829268292683e-08, + "logits/chosen": 0.11069389432668686, + "logits/rejected": 0.1824926733970642, + "logps/chosen": -103.77944946289062, + "logps/rejected": -68.4943618774414, + "loss": 0.6689, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.038526106625795364, + "rewards/margins": -0.055515188723802567, + "rewards/rejected": 0.09404130280017853, + "step": 431 + }, + { + "epoch": 0.26337448559670784, + "grad_norm": 61.669867746373285, + "learning_rate": 1.857073170731707e-08, + "logits/chosen": 0.17164219915866852, + "logits/rejected": 0.1094513013958931, + "logps/chosen": -98.68592071533203, + "logps/rejected": -111.42066955566406, + "loss": 0.6673, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.06978795677423477, + "rewards/margins": 0.08880459517240524, + "rewards/rejected": -0.019016636535525322, + "step": 432 + }, + { + "epoch": 0.2639841487578113, + "grad_norm": 67.73293276351028, + "learning_rate": 1.8614634146341463e-08, + "logits/chosen": 0.06384044885635376, + "logits/rejected": 0.18351881206035614, + "logps/chosen": -55.75654983520508, + "logps/rejected": -14.070140838623047, + "loss": 0.6698, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.031082313507795334, + "rewards/margins": 0.011718548834323883, + "rewards/rejected": -0.04280085861682892, + "step": 433 + }, + { + "epoch": 0.2645938119189148, + "grad_norm": 74.82178146757374, + "learning_rate": 1.8658536585365854e-08, + "logits/chosen": 0.21451379358768463, + "logits/rejected": 0.2656710743904114, + "logps/chosen": -252.65444946289062, + "logps/rejected": -249.56112670898438, + "loss": 0.6592, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.16140216588974, + "rewards/margins": 0.07049685716629028, + "rewards/rejected": 0.09090529382228851, + "step": 434 + }, + { + "epoch": 0.2652034750800183, + "grad_norm": 68.59328455137637, + "learning_rate": 1.8702439024390243e-08, + "logits/chosen": 0.08416593074798584, + "logits/rejected": -0.025433972477912903, + "logps/chosen": -126.26637268066406, + "logps/rejected": -389.1315002441406, + "loss": 0.6566, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.026880884543061256, + "rewards/margins": 0.05392398685216904, + "rewards/rejected": -0.02704310230910778, + "step": 435 + }, + { + "epoch": 0.2658131382411218, + "grad_norm": 67.76205030806439, + "learning_rate": 1.8746341463414635e-08, + "logits/chosen": -0.12446662783622742, + "logits/rejected": 0.14436742663383484, + "logps/chosen": -296.7868957519531, + "logps/rejected": -198.11489868164062, + "loss": 0.6793, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.07260856032371521, + "rewards/margins": 0.05034847557544708, + "rewards/rejected": 0.022260094061493874, + "step": 436 + }, + { + "epoch": 0.2664228014022253, + "grad_norm": 56.44823636494468, + "learning_rate": 
1.8790243902439023e-08, + "logits/chosen": 0.2524694502353668, + "logits/rejected": 0.31131431460380554, + "logps/chosen": -205.24276733398438, + "logps/rejected": -92.745849609375, + "loss": 0.6648, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.15956419706344604, + "rewards/margins": 0.17301282286643982, + "rewards/rejected": -0.013448631390929222, + "step": 437 + }, + { + "epoch": 0.26703246456332874, + "grad_norm": 94.67153857744482, + "learning_rate": 1.8834146341463415e-08, + "logits/chosen": 0.2888627350330353, + "logits/rejected": 0.4149096608161926, + "logps/chosen": -194.0454559326172, + "logps/rejected": -111.11502838134766, + "loss": 0.6869, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.14418041706085205, + "rewards/margins": 0.13503269851207733, + "rewards/rejected": 0.009147727862000465, + "step": 438 + }, + { + "epoch": 0.26764212772443224, + "grad_norm": 71.36579622673247, + "learning_rate": 1.8878048780487806e-08, + "logits/chosen": -0.11654005944728851, + "logits/rejected": 0.27451732754707336, + "logps/chosen": -329.767822265625, + "logps/rejected": -143.21238708496094, + "loss": 0.6615, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.19159859418869019, + "rewards/margins": 0.10668019950389862, + "rewards/rejected": 0.08491840958595276, + "step": 439 + }, + { + "epoch": 0.26825179088553575, + "grad_norm": 66.68799340256919, + "learning_rate": 1.8921951219512195e-08, + "logits/chosen": 0.08359631896018982, + "logits/rejected": 0.05886491760611534, + "logps/chosen": -59.129661560058594, + "logps/rejected": -121.8931884765625, + "loss": 0.6586, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.009189892560243607, + "rewards/margins": -0.0016784649342298508, + "rewards/rejected": 0.010868358425796032, + "step": 440 + }, + { + "epoch": 0.26886145404663925, + "grad_norm": 78.97967669661037, + "learning_rate": 1.8965853658536586e-08, + "logits/chosen": -0.0766112208366394, + "logits/rejected": 0.24237942695617676, + "logps/chosen": -144.33291625976562, + "logps/rejected": -142.45741271972656, + "loss": 0.6706, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.055260419845581055, + "rewards/margins": -0.01594863086938858, + "rewards/rejected": 0.07120904326438904, + "step": 441 + }, + { + "epoch": 0.26947111720774275, + "grad_norm": 88.67666461641667, + "learning_rate": 1.9009756097560975e-08, + "logits/chosen": -0.1746220588684082, + "logits/rejected": 0.1630934476852417, + "logps/chosen": -135.13832092285156, + "logps/rejected": -153.19435119628906, + "loss": 0.6835, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.01004643552005291, + "rewards/margins": -0.08069337159395218, + "rewards/rejected": 0.07064693421125412, + "step": 442 + }, + { + "epoch": 0.2700807803688462, + "grad_norm": 60.63270295307191, + "learning_rate": 1.9053658536585366e-08, + "logits/chosen": 0.14319084584712982, + "logits/rejected": 0.1383337825536728, + "logps/chosen": -39.036170959472656, + "logps/rejected": -64.15113830566406, + "loss": 0.6846, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.004637223668396473, + "rewards/margins": -0.01796860620379448, + "rewards/rejected": 0.022605828940868378, + "step": 443 + }, + { + "epoch": 0.2706904435299497, + "grad_norm": 59.29665474045625, + "learning_rate": 1.9097560975609755e-08, + "logits/chosen": -0.1462271511554718, + "logits/rejected": -0.19049985706806183, + "logps/chosen": -37.981868743896484, + "logps/rejected": -66.49958801269531, + "loss": 0.6768, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.03529193997383118, + 
"rewards/margins": -0.0017494894564151764, + "rewards/rejected": 0.03704142943024635, + "step": 444 + }, + { + "epoch": 0.2713001066910532, + "grad_norm": 75.98124784362513, + "learning_rate": 1.9141463414634146e-08, + "logits/chosen": 0.3471428155899048, + "logits/rejected": 0.19750745594501495, + "logps/chosen": -215.55172729492188, + "logps/rejected": -281.86029052734375, + "loss": 0.6387, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19224663078784943, + "rewards/margins": 0.24797995388507843, + "rewards/rejected": -0.055733323097229004, + "step": 445 + }, + { + "epoch": 0.2719097698521567, + "grad_norm": 73.1487317087868, + "learning_rate": 1.9185365853658538e-08, + "logits/chosen": 0.06400503218173981, + "logits/rejected": -0.08633884787559509, + "logps/chosen": -156.72970581054688, + "logps/rejected": -203.78372192382812, + "loss": 0.6654, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.07347560673952103, + "rewards/margins": 0.017678987234830856, + "rewards/rejected": 0.05579661950469017, + "step": 446 + }, + { + "epoch": 0.27251943301326015, + "grad_norm": 69.84496644271896, + "learning_rate": 1.9229268292682927e-08, + "logits/chosen": 0.032298870384693146, + "logits/rejected": 0.2304471880197525, + "logps/chosen": -155.3098602294922, + "logps/rejected": -86.67618560791016, + "loss": 0.6769, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.02124052867293358, + "rewards/margins": 0.008774537593126297, + "rewards/rejected": 0.01246599294245243, + "step": 447 + }, + { + "epoch": 0.27312909617436365, + "grad_norm": 60.17134465077584, + "learning_rate": 1.9273170731707318e-08, + "logits/chosen": 0.3111189901828766, + "logits/rejected": 0.2346172034740448, + "logps/chosen": -184.82394409179688, + "logps/rejected": -142.02420043945312, + "loss": 0.6522, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.12719501554965973, + "rewards/margins": 0.08638709783554077, + "rewards/rejected": 0.04080791771411896, + "step": 448 + }, + { + "epoch": 0.27373875933546715, + "grad_norm": 68.92332057326264, + "learning_rate": 1.9317073170731707e-08, + "logits/chosen": 0.1487908959388733, + "logits/rejected": 0.4169542193412781, + "logps/chosen": -278.8198547363281, + "logps/rejected": -315.25506591796875, + "loss": 0.6517, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2707786560058594, + "rewards/margins": 0.28594255447387695, + "rewards/rejected": -0.015163922682404518, + "step": 449 + }, + { + "epoch": 0.27434842249657065, + "grad_norm": 64.30204877730138, + "learning_rate": 1.9360975609756098e-08, + "logits/chosen": -0.09532143175601959, + "logits/rejected": 0.07058137655258179, + "logps/chosen": -135.10035705566406, + "logps/rejected": -127.02887725830078, + "loss": 0.6586, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.04978819191455841, + "rewards/margins": 0.04082760959863663, + "rewards/rejected": 0.00896057952195406, + "step": 450 + }, + { + "epoch": 0.27495808565767416, + "grad_norm": 57.44123209699844, + "learning_rate": 1.9404878048780487e-08, + "logits/chosen": 0.28252366185188293, + "logits/rejected": 0.29748132824897766, + "logps/chosen": -127.99627685546875, + "logps/rejected": -151.4498291015625, + "loss": 0.6555, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.13848179578781128, + "rewards/margins": 0.18644532561302185, + "rewards/rejected": -0.04796352982521057, + "step": 451 + }, + { + "epoch": 0.2755677488187776, + "grad_norm": 86.40538860877604, + "learning_rate": 1.944878048780488e-08, + "logits/chosen": 0.10914494842290878, + "logits/rejected": 
0.16794736683368683, + "logps/chosen": -52.78816223144531, + "logps/rejected": -59.555259704589844, + "loss": 0.6253, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.02861565351486206, + "rewards/margins": -0.04731559008359909, + "rewards/rejected": 0.07593125104904175, + "step": 452 + }, + { + "epoch": 0.2761774119798811, + "grad_norm": 63.60873431451276, + "learning_rate": 1.949268292682927e-08, + "logits/chosen": 0.03662291169166565, + "logits/rejected": 0.5540964007377625, + "logps/chosen": -319.99407958984375, + "logps/rejected": -136.2207489013672, + "loss": 0.6653, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.009144261479377747, + "rewards/margins": 0.0010973364114761353, + "rewards/rejected": -0.010241604410111904, + "step": 453 + }, + { + "epoch": 0.2767870751409846, + "grad_norm": 63.343343518333135, + "learning_rate": 1.953658536585366e-08, + "logits/chosen": 0.21227677166461945, + "logits/rejected": 0.1923665702342987, + "logps/chosen": -12.800304412841797, + "logps/rejected": -34.73240661621094, + "loss": 0.6839, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.012887751683592796, + "rewards/margins": 0.014210665598511696, + "rewards/rejected": -0.027098417282104492, + "step": 454 + }, + { + "epoch": 0.2773967383020881, + "grad_norm": 63.29144790106512, + "learning_rate": 1.958048780487805e-08, + "logits/chosen": 0.3340422511100769, + "logits/rejected": 0.20748065412044525, + "logps/chosen": -132.12779235839844, + "logps/rejected": -201.5473175048828, + "loss": 0.6634, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.04269295185804367, + "rewards/margins": -0.011317897588014603, + "rewards/rejected": 0.05401084944605827, + "step": 455 + }, + { + "epoch": 0.2780064014631916, + "grad_norm": 74.57964819213488, + "learning_rate": 1.962439024390244e-08, + "logits/chosen": 0.07027482241392136, + "logits/rejected": -0.019699640572071075, + "logps/chosen": -161.471923828125, + "logps/rejected": -185.7052001953125, + "loss": 0.653, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.059368353337049484, + "rewards/margins": 0.10203978419303894, + "rewards/rejected": -0.042671434581279755, + "step": 456 + }, + { + "epoch": 0.27861606462429506, + "grad_norm": 67.61667659665781, + "learning_rate": 1.966829268292683e-08, + "logits/chosen": 0.20456407964229584, + "logits/rejected": 0.18659041821956635, + "logps/chosen": -123.71971130371094, + "logps/rejected": -143.08709716796875, + "loss": 0.6727, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.044207461178302765, + "rewards/margins": 0.03395986557006836, + "rewards/rejected": 0.010247595608234406, + "step": 457 + }, + { + "epoch": 0.27922572778539856, + "grad_norm": 69.41472807818889, + "learning_rate": 1.971219512195122e-08, + "logits/chosen": 0.48095908761024475, + "logits/rejected": 0.4224778413772583, + "logps/chosen": -266.6629943847656, + "logps/rejected": -149.05494689941406, + "loss": 0.6443, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1881321519613266, + "rewards/margins": 0.13835391402244568, + "rewards/rejected": 0.04977824538946152, + "step": 458 + }, + { + "epoch": 0.27983539094650206, + "grad_norm": 72.22389428627339, + "learning_rate": 1.975609756097561e-08, + "logits/chosen": -0.3132532835006714, + "logits/rejected": -0.18809807300567627, + "logps/chosen": -119.96725463867188, + "logps/rejected": -111.36251068115234, + "loss": 0.6584, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.00742526026442647, + "rewards/margins": -0.05493436008691788, + "rewards/rejected": 0.047509100288152695, + 
"step": 459 + }, + { + "epoch": 0.28044505410760556, + "grad_norm": 68.39678394859612, + "learning_rate": 1.9800000000000002e-08, + "logits/chosen": 0.1794874668121338, + "logits/rejected": 0.10086102783679962, + "logps/chosen": -127.66497802734375, + "logps/rejected": -135.0052490234375, + "loss": 0.6475, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.08427328616380692, + "rewards/margins": 0.06482025980949402, + "rewards/rejected": 0.019453026354312897, + "step": 460 + }, + { + "epoch": 0.28105471726870906, + "grad_norm": 68.2361183005348, + "learning_rate": 1.984390243902439e-08, + "logits/chosen": 0.07122084498405457, + "logits/rejected": 0.031408704817295074, + "logps/chosen": -110.41446685791016, + "logps/rejected": -122.53499603271484, + "loss": 0.6785, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.03592252731323242, + "rewards/margins": 0.07255663722753525, + "rewards/rejected": -0.10847915709018707, + "step": 461 + }, + { + "epoch": 0.2816643804298125, + "grad_norm": 69.020638385843, + "learning_rate": 1.9887804878048782e-08, + "logits/chosen": 0.14568686485290527, + "logits/rejected": 0.26515433192253113, + "logps/chosen": -209.664306640625, + "logps/rejected": -171.8083953857422, + "loss": 0.6472, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.21378999948501587, + "rewards/margins": 0.15649661421775818, + "rewards/rejected": 0.05729339271783829, + "step": 462 + }, + { + "epoch": 0.282274043590916, + "grad_norm": 73.05756135117628, + "learning_rate": 1.993170731707317e-08, + "logits/chosen": 0.09534000605344772, + "logits/rejected": 0.02491987869143486, + "logps/chosen": -219.74197387695312, + "logps/rejected": -208.32476806640625, + "loss": 0.661, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.07846088707447052, + "rewards/margins": 0.06449966132640839, + "rewards/rejected": 0.013961220160126686, + "step": 463 + }, + { + "epoch": 0.2828837067520195, + "grad_norm": 69.79106803917409, + "learning_rate": 1.9975609756097562e-08, + "logits/chosen": -0.06213008612394333, + "logits/rejected": -0.23186719417572021, + "logps/chosen": -153.84791564941406, + "logps/rejected": -151.7867431640625, + "loss": 0.6721, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.10366620123386383, + "rewards/margins": 0.07202887535095215, + "rewards/rejected": 0.03163733333349228, + "step": 464 + }, + { + "epoch": 0.283493369913123, + "grad_norm": 61.283147710449725, + "learning_rate": 2.001951219512195e-08, + "logits/chosen": 0.21901625394821167, + "logits/rejected": 0.20077916979789734, + "logps/chosen": -7.487462997436523, + "logps/rejected": -7.173701286315918, + "loss": 0.678, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.006824742071330547, + "rewards/margins": 0.01945982500910759, + "rewards/rejected": -0.012635082937777042, + "step": 465 + }, + { + "epoch": 0.28410303307422646, + "grad_norm": 62.43884619620277, + "learning_rate": 2.0063414634146342e-08, + "logits/chosen": 0.29454317688941956, + "logits/rejected": 0.2875673770904541, + "logps/chosen": -146.99061584472656, + "logps/rejected": -144.388671875, + "loss": 0.6507, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.21434439718723297, + "rewards/margins": 0.20665687322616577, + "rewards/rejected": 0.0076875220984220505, + "step": 466 + }, + { + "epoch": 0.28471269623532997, + "grad_norm": 79.08259419954352, + "learning_rate": 2.0107317073170734e-08, + "logits/chosen": 0.522144079208374, + "logits/rejected": 0.1844376027584076, + "logps/chosen": -106.72376251220703, + "logps/rejected": -123.9677505493164, + 
"loss": 0.6678, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.04608582705259323, + "rewards/margins": 0.05327960103750229, + "rewards/rejected": -0.007193779572844505, + "step": 467 + }, + { + "epoch": 0.28532235939643347, + "grad_norm": 72.96261350010558, + "learning_rate": 2.0151219512195122e-08, + "logits/chosen": 0.08090576529502869, + "logits/rejected": 0.14955011010169983, + "logps/chosen": -181.6560821533203, + "logps/rejected": -222.9476318359375, + "loss": 0.6544, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.08407467603683472, + "rewards/margins": 0.15113595128059387, + "rewards/rejected": -0.06706128269433975, + "step": 468 + }, + { + "epoch": 0.28593202255753697, + "grad_norm": 77.14306159327754, + "learning_rate": 2.0195121951219514e-08, + "logits/chosen": -0.27836892008781433, + "logits/rejected": 0.03967548906803131, + "logps/chosen": -338.9379577636719, + "logps/rejected": -126.48583984375, + "loss": 0.6544, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4431110620498657, + "rewards/margins": 0.39103490114212036, + "rewards/rejected": 0.052076149731874466, + "step": 469 + }, + { + "epoch": 0.28654168571864047, + "grad_norm": 61.034460109343556, + "learning_rate": 2.0239024390243902e-08, + "logits/chosen": 0.23134541511535645, + "logits/rejected": 0.22853803634643555, + "logps/chosen": -31.802772521972656, + "logps/rejected": -12.457967758178711, + "loss": 0.6617, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.018924130126833916, + "rewards/margins": 0.1008741557598114, + "rewards/rejected": -0.08195002377033234, + "step": 470 + }, + { + "epoch": 0.2871513488797439, + "grad_norm": 68.55916541041393, + "learning_rate": 2.028292682926829e-08, + "logits/chosen": -0.09053162485361099, + "logits/rejected": 0.25271421670913696, + "logps/chosen": -321.14013671875, + "logps/rejected": -105.07038879394531, + "loss": 0.6581, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.18394261598587036, + "rewards/margins": 0.1811237931251526, + "rewards/rejected": 0.002818802371621132, + "step": 471 + }, + { + "epoch": 0.2877610120408474, + "grad_norm": 82.15582208471778, + "learning_rate": 2.032682926829268e-08, + "logits/chosen": 0.2014414221048355, + "logits/rejected": -0.21294736862182617, + "logps/chosen": -173.34231567382812, + "logps/rejected": -726.0479736328125, + "loss": 0.6744, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.008779885247349739, + "rewards/margins": -0.19394630193710327, + "rewards/rejected": 0.20272618532180786, + "step": 472 + }, + { + "epoch": 0.2883706752019509, + "grad_norm": 56.49253641977703, + "learning_rate": 2.037073170731707e-08, + "logits/chosen": 0.3929775059223175, + "logits/rejected": 0.10422532260417938, + "logps/chosen": -12.824280738830566, + "logps/rejected": -29.03037452697754, + "loss": 0.6727, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.013209665194153786, + "rewards/margins": -0.004942881874740124, + "rewards/rejected": -0.008266782388091087, + "step": 473 + }, + { + "epoch": 0.2889803383630544, + "grad_norm": 69.26657882016576, + "learning_rate": 2.041463414634146e-08, + "logits/chosen": 0.1593598872423172, + "logits/rejected": 0.19768331944942474, + "logps/chosen": -210.8773193359375, + "logps/rejected": -146.36282348632812, + "loss": 0.6278, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.23647665977478027, + "rewards/margins": 0.2512618601322174, + "rewards/rejected": -0.014785194769501686, + "step": 474 + }, + { + "epoch": 0.2895900015241579, + "grad_norm": 64.71109478449996, + "learning_rate": 
2.045853658536585e-08, + "logits/chosen": -0.18643224239349365, + "logits/rejected": 0.19490055739879608, + "logps/chosen": -214.46121215820312, + "logps/rejected": -137.38888549804688, + "loss": 0.6483, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.114950992166996, + "rewards/margins": 0.140077143907547, + "rewards/rejected": -0.02512616105377674, + "step": 475 + }, + { + "epoch": 0.29019966468526137, + "grad_norm": 66.42831607463249, + "learning_rate": 2.0502439024390242e-08, + "logits/chosen": 0.032998643815517426, + "logits/rejected": -0.06949509680271149, + "logps/chosen": -20.744892120361328, + "logps/rejected": -38.61202621459961, + "loss": 0.6738, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.005787133239209652, + "rewards/margins": -0.038832686841487885, + "rewards/rejected": 0.033045556396245956, + "step": 476 + }, + { + "epoch": 0.2908093278463649, + "grad_norm": 66.66597610193936, + "learning_rate": 2.054634146341463e-08, + "logits/chosen": 0.2674751877784729, + "logits/rejected": 0.32825157046318054, + "logps/chosen": -374.01495361328125, + "logps/rejected": -170.8480987548828, + "loss": 0.6634, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.13175562024116516, + "rewards/margins": 0.16506260633468628, + "rewards/rejected": -0.03330698236823082, + "step": 477 + }, + { + "epoch": 0.2914189910074684, + "grad_norm": 64.16360845160342, + "learning_rate": 2.0590243902439023e-08, + "logits/chosen": 0.04146502912044525, + "logits/rejected": 0.177695631980896, + "logps/chosen": -215.79151916503906, + "logps/rejected": -265.5438232421875, + "loss": 0.6816, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.07885445654392242, + "rewards/margins": -0.013722196221351624, + "rewards/rejected": 0.09257666021585464, + "step": 478 + }, + { + "epoch": 0.2920286541685719, + "grad_norm": 58.67004279648168, + "learning_rate": 2.063414634146341e-08, + "logits/chosen": 0.11150379478931427, + "logits/rejected": 0.21114593744277954, + "logps/chosen": -55.13713836669922, + "logps/rejected": -42.53518295288086, + "loss": 0.6733, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.07448139041662216, + "rewards/margins": 0.0382765531539917, + "rewards/rejected": 0.03620484098792076, + "step": 479 + }, + { + "epoch": 0.2926383173296754, + "grad_norm": 59.23975018464548, + "learning_rate": 2.0678048780487803e-08, + "logits/chosen": 0.271587610244751, + "logits/rejected": 0.3958515524864197, + "logps/chosen": -154.6184844970703, + "logps/rejected": -168.01158142089844, + "loss": 0.65, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2721033990383148, + "rewards/margins": 0.2399720847606659, + "rewards/rejected": 0.03213133662939072, + "step": 480 + }, + { + "epoch": 0.2932479804907788, + "grad_norm": 76.54887162726922, + "learning_rate": 2.072195121951219e-08, + "logits/chosen": -0.045136116445064545, + "logits/rejected": 0.20272579789161682, + "logps/chosen": -180.22154235839844, + "logps/rejected": -48.23255157470703, + "loss": 0.6418, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.1005738377571106, + "rewards/margins": 0.06824728101491928, + "rewards/rejected": 0.03232654929161072, + "step": 481 + }, + { + "epoch": 0.2938576436518823, + "grad_norm": 66.95281779745834, + "learning_rate": 2.0765853658536583e-08, + "logits/chosen": 0.20636984705924988, + "logits/rejected": 0.22792255878448486, + "logps/chosen": -78.6353988647461, + "logps/rejected": -67.68612670898438, + "loss": 0.6323, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.10261476784944534, + "rewards/margins": 
0.1378861665725708, + "rewards/rejected": -0.03527141734957695, + "step": 482 + }, + { + "epoch": 0.29446730681298583, + "grad_norm": 60.80437736239082, + "learning_rate": 2.0809756097560974e-08, + "logits/chosen": 0.027224496006965637, + "logits/rejected": 0.17680969834327698, + "logps/chosen": -102.2612075805664, + "logps/rejected": -51.69792175292969, + "loss": 0.6685, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.11472471058368683, + "rewards/margins": 0.05192282423377037, + "rewards/rejected": 0.06280189007520676, + "step": 483 + }, + { + "epoch": 0.29507696997408933, + "grad_norm": 62.56218317882643, + "learning_rate": 2.0853658536585363e-08, + "logits/chosen": 0.08629345893859863, + "logits/rejected": 0.07546723634004593, + "logps/chosen": -208.2896270751953, + "logps/rejected": -216.16351318359375, + "loss": 0.6228, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19518309831619263, + "rewards/margins": 0.28668278455734253, + "rewards/rejected": -0.0914997085928917, + "step": 484 + }, + { + "epoch": 0.29568663313519283, + "grad_norm": 65.77568573255537, + "learning_rate": 2.0897560975609754e-08, + "logits/chosen": 0.2876260280609131, + "logits/rejected": 0.16935913264751434, + "logps/chosen": -144.30734252929688, + "logps/rejected": -227.69224548339844, + "loss": 0.6375, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.16153675317764282, + "rewards/margins": 0.0677390769124031, + "rewards/rejected": 0.09379769116640091, + "step": 485 + }, + { + "epoch": 0.2962962962962963, + "grad_norm": 59.660312881733, + "learning_rate": 2.0941463414634143e-08, + "logits/chosen": 0.05226050317287445, + "logits/rejected": 0.29561787843704224, + "logps/chosen": -224.3292694091797, + "logps/rejected": -93.36639404296875, + "loss": 0.6362, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0905165821313858, + "rewards/margins": 0.09735140949487686, + "rewards/rejected": -0.006834829226136208, + "step": 486 + }, + { + "epoch": 0.2969059594573998, + "grad_norm": 71.22731953559592, + "learning_rate": 2.0985365853658534e-08, + "logits/chosen": 0.16766256093978882, + "logits/rejected": 0.23183496296405792, + "logps/chosen": -97.92478942871094, + "logps/rejected": -118.4116439819336, + "loss": 0.6206, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1377440094947815, + "rewards/margins": 0.1952911913394928, + "rewards/rejected": -0.05754717439413071, + "step": 487 + }, + { + "epoch": 0.2975156226185033, + "grad_norm": 64.11624827435091, + "learning_rate": 2.1029268292682923e-08, + "logits/chosen": -0.0726562887430191, + "logits/rejected": 0.1790645569562912, + "logps/chosen": -81.5301742553711, + "logps/rejected": -37.156185150146484, + "loss": 0.6551, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07962194085121155, + "rewards/margins": 0.08470235764980316, + "rewards/rejected": -0.005080413073301315, + "step": 488 + }, + { + "epoch": 0.2981252857796068, + "grad_norm": 73.46932744002982, + "learning_rate": 2.1073170731707315e-08, + "logits/chosen": 0.18244323134422302, + "logits/rejected": 0.29160937666893005, + "logps/chosen": -217.43319702148438, + "logps/rejected": -187.8267364501953, + "loss": 0.6529, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.14290522038936615, + "rewards/margins": 0.05252870172262192, + "rewards/rejected": 0.09037652611732483, + "step": 489 + }, + { + "epoch": 0.29873494894071023, + "grad_norm": 67.30755185649637, + "learning_rate": 2.1117073170731706e-08, + "logits/chosen": 0.10757433623075485, + "logits/rejected": 0.23386675119400024, + "logps/chosen": 
-245.03697204589844, + "logps/rejected": -123.92584228515625, + "loss": 0.6577, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2160666435956955, + "rewards/margins": 0.07086057960987091, + "rewards/rejected": 0.14520607888698578, + "step": 490 + }, + { + "epoch": 0.29934461210181373, + "grad_norm": 68.91968012122946, + "learning_rate": 2.1160975609756095e-08, + "logits/chosen": -0.05915515869855881, + "logits/rejected": 0.0703243836760521, + "logps/chosen": -400.88525390625, + "logps/rejected": -329.91278076171875, + "loss": 0.659, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3326101303100586, + "rewards/margins": 0.3214416801929474, + "rewards/rejected": 0.011168431490659714, + "step": 491 + }, + { + "epoch": 0.29995427526291724, + "grad_norm": 66.67348569851553, + "learning_rate": 2.1204878048780486e-08, + "logits/chosen": 0.21445180475711823, + "logits/rejected": 0.17440001666545868, + "logps/chosen": -259.8293151855469, + "logps/rejected": -273.2416076660156, + "loss": 0.6422, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.17863722145557404, + "rewards/margins": 0.11631821095943451, + "rewards/rejected": 0.06231900304555893, + "step": 492 + }, + { + "epoch": 0.30056393842402074, + "grad_norm": 75.67037991925316, + "learning_rate": 2.1248780487804875e-08, + "logits/chosen": 0.2672348618507385, + "logits/rejected": 0.24317914247512817, + "logps/chosen": -19.860942840576172, + "logps/rejected": -30.22275161743164, + "loss": 0.6498, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.05794403329491615, + "rewards/margins": -0.011715312488377094, + "rewards/rejected": -0.04622872173786163, + "step": 493 + }, + { + "epoch": 0.30117360158512424, + "grad_norm": 68.92658229754335, + "learning_rate": 2.1292682926829266e-08, + "logits/chosen": -0.319587767124176, + "logits/rejected": 0.1461396962404251, + "logps/chosen": -361.044921875, + "logps/rejected": -213.08692932128906, + "loss": 0.6352, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.23673886060714722, + "rewards/margins": 0.19788327813148499, + "rewards/rejected": 0.038855601102113724, + "step": 494 + }, + { + "epoch": 0.3017832647462277, + "grad_norm": 71.12561791125117, + "learning_rate": 2.1336585365853658e-08, + "logits/chosen": 0.24701619148254395, + "logits/rejected": 0.24567127227783203, + "logps/chosen": -120.77401733398438, + "logps/rejected": -37.010032653808594, + "loss": 0.6856, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1529422104358673, + "rewards/margins": 0.18868255615234375, + "rewards/rejected": -0.03574035316705704, + "step": 495 + }, + { + "epoch": 0.3023929279073312, + "grad_norm": 73.09061978854592, + "learning_rate": 2.1380487804878046e-08, + "logits/chosen": 0.030435562133789062, + "logits/rejected": -0.13487783074378967, + "logps/chosen": -59.882545471191406, + "logps/rejected": -77.14221954345703, + "loss": 0.664, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.24862869083881378, + "rewards/margins": -0.03769826889038086, + "rewards/rejected": -0.2109304517507553, + "step": 496 + }, + { + "epoch": 0.3030025910684347, + "grad_norm": 76.35528849301075, + "learning_rate": 2.1424390243902438e-08, + "logits/chosen": 0.03241553157567978, + "logits/rejected": 0.4508446156978607, + "logps/chosen": -304.6023254394531, + "logps/rejected": -99.01864624023438, + "loss": 0.6397, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2151942253112793, + "rewards/margins": 0.18481206893920898, + "rewards/rejected": 0.030382152646780014, + "step": 497 + }, + { + "epoch": 0.3036122542295382, + 
"grad_norm": 69.77074908699126, + "learning_rate": 2.1468292682926826e-08, + "logits/chosen": 0.2624257802963257, + "logits/rejected": 0.2517472505569458, + "logps/chosen": -111.65718078613281, + "logps/rejected": -158.25135803222656, + "loss": 0.6524, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.11233830451965332, + "rewards/margins": 0.2487785518169403, + "rewards/rejected": -0.1364402323961258, + "step": 498 + }, + { + "epoch": 0.3042219173906417, + "grad_norm": 69.85888827015366, + "learning_rate": 2.1512195121951218e-08, + "logits/chosen": 0.5841522216796875, + "logits/rejected": 0.40747517347335815, + "logps/chosen": -68.72338104248047, + "logps/rejected": -371.1597595214844, + "loss": 0.6572, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.03852539137005806, + "rewards/margins": -0.04676118120551109, + "rewards/rejected": 0.08528657257556915, + "step": 499 + }, + { + "epoch": 0.30483158055174514, + "grad_norm": 60.87334931589632, + "learning_rate": 2.1556097560975607e-08, + "logits/chosen": 0.09502071887254715, + "logits/rejected": 0.3352612853050232, + "logps/chosen": -258.5905456542969, + "logps/rejected": -231.02549743652344, + "loss": 0.6487, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.059263478964567184, + "rewards/margins": 0.0008697099983692169, + "rewards/rejected": 0.05839376524090767, + "step": 500 + }, + { + "epoch": 0.30483158055174514, + "eval_logits/chosen": 0.05119064450263977, + "eval_logits/rejected": 0.1229129433631897, + "eval_logps/chosen": -161.55780029296875, + "eval_logps/rejected": -108.90592193603516, + "eval_loss": 0.6521755456924438, + "eval_rewards/accuracies": 0.6060606241226196, + "eval_rewards/chosen": 0.10773464292287827, + "eval_rewards/margins": 0.07899720221757889, + "eval_rewards/rejected": 0.028737450018525124, + "eval_runtime": 39.2311, + "eval_samples_per_second": 6.729, + "eval_steps_per_second": 0.841, + "step": 500 + }, + { + "epoch": 0.30544124371284864, + "grad_norm": 54.512994578320736, + "learning_rate": 2.1599999999999998e-08, + "logits/chosen": -0.19323980808258057, + "logits/rejected": 0.19141802191734314, + "logps/chosen": -122.07552337646484, + "logps/rejected": -51.705169677734375, + "loss": 0.6675, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.00966480653733015, + "rewards/margins": 0.031769122928380966, + "rewards/rejected": -0.04143393039703369, + "step": 501 + }, + { + "epoch": 0.30605090687395214, + "grad_norm": 66.01792835999346, + "learning_rate": 2.164390243902439e-08, + "logits/chosen": 0.2453068345785141, + "logits/rejected": 0.23593895137310028, + "logps/chosen": -256.44146728515625, + "logps/rejected": -243.70411682128906, + "loss": 0.6598, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.18942949175834656, + "rewards/margins": 0.08905204385519028, + "rewards/rejected": 0.10037745535373688, + "step": 502 + }, + { + "epoch": 0.30666057003505565, + "grad_norm": 58.327484640099456, + "learning_rate": 2.1687804878048778e-08, + "logits/chosen": 0.13334240019321442, + "logits/rejected": 0.1443691849708557, + "logps/chosen": -109.47007751464844, + "logps/rejected": -176.93238830566406, + "loss": 0.6313, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.040549665689468384, + "rewards/margins": 0.02135545201599598, + "rewards/rejected": 0.019194208085536957, + "step": 503 + }, + { + "epoch": 0.30727023319615915, + "grad_norm": 64.57842170183204, + "learning_rate": 2.173170731707317e-08, + "logits/chosen": 0.0882779061794281, + "logits/rejected": 0.019421786069869995, + "logps/chosen": 
-98.05096435546875, + "logps/rejected": -113.34808349609375, + "loss": 0.6554, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.11694498360157013, + "rewards/margins": 0.12198923528194427, + "rewards/rejected": -0.005044247955083847, + "step": 504 + }, + { + "epoch": 0.3078798963572626, + "grad_norm": 60.71924137109685, + "learning_rate": 2.1775609756097558e-08, + "logits/chosen": -0.025103464722633362, + "logits/rejected": -0.01396152377128601, + "logps/chosen": -132.69093322753906, + "logps/rejected": -148.00677490234375, + "loss": 0.6409, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.10391964763402939, + "rewards/margins": 0.02545750141143799, + "rewards/rejected": 0.0784621462225914, + "step": 505 + }, + { + "epoch": 0.3084895595183661, + "grad_norm": 64.21484896317976, + "learning_rate": 2.181951219512195e-08, + "logits/chosen": 0.14935000240802765, + "logits/rejected": 0.26964497566223145, + "logps/chosen": -89.99710083007812, + "logps/rejected": -61.493988037109375, + "loss": 0.651, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.172231987118721, + "rewards/margins": 0.16800904273986816, + "rewards/rejected": 0.0042229327373206615, + "step": 506 + }, + { + "epoch": 0.3090992226794696, + "grad_norm": 68.52761020448763, + "learning_rate": 2.186341463414634e-08, + "logits/chosen": 0.1626947522163391, + "logits/rejected": 0.300977498292923, + "logps/chosen": -204.31988525390625, + "logps/rejected": -131.5135040283203, + "loss": 0.6806, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.20247884094715118, + "rewards/margins": 0.03694174811244011, + "rewards/rejected": 0.16553710401058197, + "step": 507 + }, + { + "epoch": 0.3097088858405731, + "grad_norm": 75.73589330780636, + "learning_rate": 2.190731707317073e-08, + "logits/chosen": -0.043320655822753906, + "logits/rejected": -0.17623624205589294, + "logps/chosen": -141.2119598388672, + "logps/rejected": -193.40248107910156, + "loss": 0.6823, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.029924681410193443, + "rewards/margins": -0.10780796408653259, + "rewards/rejected": 0.13773265480995178, + "step": 508 + }, + { + "epoch": 0.31031854900167655, + "grad_norm": 73.67312848325095, + "learning_rate": 2.1951219512195122e-08, + "logits/chosen": 0.09232969582080841, + "logits/rejected": 0.07798745483160019, + "logps/chosen": -214.9461212158203, + "logps/rejected": -155.3734893798828, + "loss": 0.6579, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.10752177238464355, + "rewards/margins": 0.11612154543399811, + "rewards/rejected": -0.008599769324064255, + "step": 509 + }, + { + "epoch": 0.31092821216278005, + "grad_norm": 69.67169645034262, + "learning_rate": 2.199512195121951e-08, + "logits/chosen": 0.17757156491279602, + "logits/rejected": 0.14873629808425903, + "logps/chosen": -140.59393310546875, + "logps/rejected": -138.13308715820312, + "loss": 0.6346, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.07352197170257568, + "rewards/margins": 0.045384474098682404, + "rewards/rejected": 0.02813749387860298, + "step": 510 + }, + { + "epoch": 0.31153787532388355, + "grad_norm": 66.8733357522459, + "learning_rate": 2.2039024390243902e-08, + "logits/chosen": 0.129908949136734, + "logits/rejected": 0.20046426355838776, + "logps/chosen": -265.88330078125, + "logps/rejected": -248.80580139160156, + "loss": 0.6551, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21615388989448547, + "rewards/margins": 0.056390319019556046, + "rewards/rejected": 0.15976357460021973, + "step": 511 + }, + { + "epoch": 0.31214753848498705, + 
"grad_norm": 66.97302911001606, + "learning_rate": 2.208292682926829e-08, + "logits/chosen": 0.19567690789699554, + "logits/rejected": 0.42208293080329895, + "logps/chosen": -201.44737243652344, + "logps/rejected": -131.23837280273438, + "loss": 0.6529, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.14756140112876892, + "rewards/margins": 0.18925122916698456, + "rewards/rejected": -0.041689835488796234, + "step": 512 + }, + { + "epoch": 0.31275720164609055, + "grad_norm": 67.92666175878672, + "learning_rate": 2.2126829268292682e-08, + "logits/chosen": 0.00613509863615036, + "logits/rejected": 0.0940328985452652, + "logps/chosen": -113.25761413574219, + "logps/rejected": -125.87974548339844, + "loss": 0.6497, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.10114751756191254, + "rewards/margins": 0.04731915518641472, + "rewards/rejected": 0.05382836237549782, + "step": 513 + }, + { + "epoch": 0.313366864807194, + "grad_norm": 59.70995852571872, + "learning_rate": 2.217073170731707e-08, + "logits/chosen": -0.24690371751785278, + "logits/rejected": -0.135573610663414, + "logps/chosen": -265.9849548339844, + "logps/rejected": -128.41067504882812, + "loss": 0.6592, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.32507163286209106, + "rewards/margins": 0.31498491764068604, + "rewards/rejected": 0.010086726397275925, + "step": 514 + }, + { + "epoch": 0.3139765279682975, + "grad_norm": 70.7049647088918, + "learning_rate": 2.2214634146341462e-08, + "logits/chosen": 0.19668368995189667, + "logits/rejected": 0.18600206077098846, + "logps/chosen": -104.42039489746094, + "logps/rejected": -74.9419174194336, + "loss": 0.6313, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.17575618624687195, + "rewards/margins": 0.18259717524051666, + "rewards/rejected": -0.006841002032160759, + "step": 515 + }, + { + "epoch": 0.314586191129401, + "grad_norm": 63.080128882078014, + "learning_rate": 2.2258536585365854e-08, + "logits/chosen": 0.08212514966726303, + "logits/rejected": -0.007908239960670471, + "logps/chosen": -23.616920471191406, + "logps/rejected": -70.23422241210938, + "loss": 0.6324, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06444795429706573, + "rewards/margins": 0.04576684534549713, + "rewards/rejected": -0.11021479964256287, + "step": 516 + }, + { + "epoch": 0.3151958542905045, + "grad_norm": 60.7181333066584, + "learning_rate": 2.2302439024390242e-08, + "logits/chosen": -0.053847894072532654, + "logits/rejected": 0.23323538899421692, + "logps/chosen": -251.07766723632812, + "logps/rejected": -144.61624145507812, + "loss": 0.6492, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1936260312795639, + "rewards/margins": 0.14233648777008057, + "rewards/rejected": 0.051289550960063934, + "step": 517 + }, + { + "epoch": 0.315805517451608, + "grad_norm": 61.019751894456455, + "learning_rate": 2.2346341463414634e-08, + "logits/chosen": -0.14775434136390686, + "logits/rejected": 0.1770121157169342, + "logps/chosen": -433.21710205078125, + "logps/rejected": -132.9249267578125, + "loss": 0.6587, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.33931484818458557, + "rewards/margins": 0.3179498314857483, + "rewards/rejected": 0.021365046501159668, + "step": 518 + }, + { + "epoch": 0.31641518061271146, + "grad_norm": 82.2978366121133, + "learning_rate": 2.2390243902439022e-08, + "logits/chosen": 0.1245865672826767, + "logits/rejected": 0.052594467997550964, + "logps/chosen": -242.2100830078125, + "logps/rejected": -366.5093688964844, + "loss": 0.6718, + "rewards/accuracies": 0.5, + 
"rewards/chosen": -0.040619898587465286, + "rewards/margins": -0.1416400671005249, + "rewards/rejected": 0.10102016478776932, + "step": 519 + }, + { + "epoch": 0.31702484377381496, + "grad_norm": 66.87296880096541, + "learning_rate": 2.2434146341463414e-08, + "logits/chosen": 0.37499451637268066, + "logits/rejected": 0.380936861038208, + "logps/chosen": -191.12860107421875, + "logps/rejected": -161.9185791015625, + "loss": 0.6564, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04185749590396881, + "rewards/margins": 0.08646363019943237, + "rewards/rejected": -0.04460614174604416, + "step": 520 + }, + { + "epoch": 0.31763450693491846, + "grad_norm": 72.23456407773578, + "learning_rate": 2.2478048780487802e-08, + "logits/chosen": 0.27072739601135254, + "logits/rejected": 0.36028990149497986, + "logps/chosen": -156.28427124023438, + "logps/rejected": -105.19752502441406, + "loss": 0.6365, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.00997886061668396, + "rewards/margins": -0.12803566455841064, + "rewards/rejected": 0.1380145400762558, + "step": 521 + }, + { + "epoch": 0.31824417009602196, + "grad_norm": 66.88803954158226, + "learning_rate": 2.2521951219512194e-08, + "logits/chosen": 0.30075281858444214, + "logits/rejected": 0.1766529381275177, + "logps/chosen": -33.087581634521484, + "logps/rejected": -53.23389434814453, + "loss": 0.6681, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05227477476000786, + "rewards/margins": 0.07364573329687119, + "rewards/rejected": -0.021370958536863327, + "step": 522 + }, + { + "epoch": 0.31885383325712546, + "grad_norm": 63.14207407142708, + "learning_rate": 2.2565853658536586e-08, + "logits/chosen": 0.1084495335817337, + "logits/rejected": 0.05239188298583031, + "logps/chosen": -147.90431213378906, + "logps/rejected": -254.87730407714844, + "loss": 0.6116, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.018667571246623993, + "rewards/margins": 0.12227742373943329, + "rewards/rejected": -0.1036098524928093, + "step": 523 + }, + { + "epoch": 0.3194634964182289, + "grad_norm": 53.073831628957116, + "learning_rate": 2.2609756097560974e-08, + "logits/chosen": 0.16721415519714355, + "logits/rejected": 0.18646599352359772, + "logps/chosen": -146.1288299560547, + "logps/rejected": -74.22055053710938, + "loss": 0.6414, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2748778462409973, + "rewards/margins": 0.17356166243553162, + "rewards/rejected": 0.10131621360778809, + "step": 524 + }, + { + "epoch": 0.3200731595793324, + "grad_norm": 65.1383885375395, + "learning_rate": 2.2653658536585366e-08, + "logits/chosen": 0.24064935743808746, + "logits/rejected": 0.2412213236093521, + "logps/chosen": -24.251789093017578, + "logps/rejected": -13.947285652160645, + "loss": 0.6341, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.002303923014551401, + "rewards/margins": 0.0070059895515441895, + "rewards/rejected": -0.009309912100434303, + "step": 525 + }, + { + "epoch": 0.3206828227404359, + "grad_norm": 65.56934674682218, + "learning_rate": 2.2697560975609754e-08, + "logits/chosen": 0.3520042300224304, + "logits/rejected": 0.0972924530506134, + "logps/chosen": -71.07540893554688, + "logps/rejected": -154.98980712890625, + "loss": 0.6716, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.018268859013915062, + "rewards/margins": -0.175251767039299, + "rewards/rejected": 0.1569828987121582, + "step": 526 + }, + { + "epoch": 0.3212924859015394, + "grad_norm": 68.21147111224226, + "learning_rate": 2.2741463414634146e-08, + "logits/chosen": 
0.06844624876976013, + "logits/rejected": 0.07510246336460114, + "logps/chosen": -305.83209228515625, + "logps/rejected": -314.4248352050781, + "loss": 0.6482, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3196808099746704, + "rewards/margins": 0.11593285948038101, + "rewards/rejected": 0.2037479281425476, + "step": 527 + }, + { + "epoch": 0.3219021490626429, + "grad_norm": 67.98983586043427, + "learning_rate": 2.2785365853658534e-08, + "logits/chosen": -0.11653149127960205, + "logits/rejected": -0.13821932673454285, + "logps/chosen": -87.24510955810547, + "logps/rejected": -108.01617431640625, + "loss": 0.6435, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.13828030228614807, + "rewards/margins": 0.0777205377817154, + "rewards/rejected": 0.06055976822972298, + "step": 528 + }, + { + "epoch": 0.32251181222374636, + "grad_norm": 78.38926868013422, + "learning_rate": 2.2829268292682926e-08, + "logits/chosen": 0.1056259423494339, + "logits/rejected": 0.052796632051467896, + "logps/chosen": -328.3974609375, + "logps/rejected": -224.63218688964844, + "loss": 0.6294, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1256667822599411, + "rewards/margins": 0.1184113547205925, + "rewards/rejected": 0.007255414500832558, + "step": 529 + }, + { + "epoch": 0.32312147538484987, + "grad_norm": 67.09081275699884, + "learning_rate": 2.2873170731707317e-08, + "logits/chosen": 0.18698683381080627, + "logits/rejected": -0.042787060141563416, + "logps/chosen": -15.280387878417969, + "logps/rejected": -164.86746215820312, + "loss": 0.6887, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.09855636209249496, + "rewards/margins": -0.0753563866019249, + "rewards/rejected": -0.023199977353215218, + "step": 530 + }, + { + "epoch": 0.32373113854595337, + "grad_norm": 68.6934870738876, + "learning_rate": 2.2917073170731706e-08, + "logits/chosen": 0.08309216797351837, + "logits/rejected": 0.07431484758853912, + "logps/chosen": -52.191993713378906, + "logps/rejected": -73.80946350097656, + "loss": 0.6592, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.04072005674242973, + "rewards/margins": -0.010257862508296967, + "rewards/rejected": 0.0509779192507267, + "step": 531 + }, + { + "epoch": 0.32434080170705687, + "grad_norm": 71.91083212544031, + "learning_rate": 2.2960975609756097e-08, + "logits/chosen": 0.20694595575332642, + "logits/rejected": 0.2705208659172058, + "logps/chosen": -229.86709594726562, + "logps/rejected": -244.8079833984375, + "loss": 0.6034, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.36465075612068176, + "rewards/margins": 0.37818261981010437, + "rewards/rejected": -0.013531871140003204, + "step": 532 + }, + { + "epoch": 0.3249504648681603, + "grad_norm": 60.6253480805561, + "learning_rate": 2.3004878048780486e-08, + "logits/chosen": 0.06808324158191681, + "logits/rejected": 0.08612547814846039, + "logps/chosen": -123.4905776977539, + "logps/rejected": -94.32249450683594, + "loss": 0.6406, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.16934238374233246, + "rewards/margins": 0.1628831923007965, + "rewards/rejected": 0.006459187716245651, + "step": 533 + }, + { + "epoch": 0.3255601280292638, + "grad_norm": 61.22671591917024, + "learning_rate": 2.3048780487804878e-08, + "logits/chosen": 0.16294969618320465, + "logits/rejected": 0.35293275117874146, + "logps/chosen": -184.82269287109375, + "logps/rejected": -99.59314727783203, + "loss": 0.6244, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.1992354691028595, + "rewards/margins": 0.0191505029797554, + 
"rewards/rejected": 0.1800849735736847, + "step": 534 + }, + { + "epoch": 0.3261697911903673, + "grad_norm": 62.82742071842365, + "learning_rate": 2.309268292682927e-08, + "logits/chosen": 0.06455646455287933, + "logits/rejected": 0.20298981666564941, + "logps/chosen": -290.6690673828125, + "logps/rejected": -134.99990844726562, + "loss": 0.6624, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.21653535962104797, + "rewards/margins": 0.08200506120920181, + "rewards/rejected": 0.13453030586242676, + "step": 535 + }, + { + "epoch": 0.3267794543514708, + "grad_norm": 65.34483004409513, + "learning_rate": 2.3136585365853658e-08, + "logits/chosen": 0.08539235591888428, + "logits/rejected": 0.01889946684241295, + "logps/chosen": -30.40401840209961, + "logps/rejected": -52.76930236816406, + "loss": 0.6115, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.05695812776684761, + "rewards/margins": 0.10740622878074646, + "rewards/rejected": -0.05044809728860855, + "step": 536 + }, + { + "epoch": 0.3273891175125743, + "grad_norm": 65.11274057229112, + "learning_rate": 2.318048780487805e-08, + "logits/chosen": 0.03304370492696762, + "logits/rejected": 0.12191274762153625, + "logps/chosen": -131.54193115234375, + "logps/rejected": -61.98631286621094, + "loss": 0.6131, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2209971845149994, + "rewards/margins": 0.14555290341377258, + "rewards/rejected": 0.07544426620006561, + "step": 537 + }, + { + "epoch": 0.32799878067367777, + "grad_norm": 59.33708176810461, + "learning_rate": 2.3224390243902438e-08, + "logits/chosen": 0.3022095859050751, + "logits/rejected": 0.24946600198745728, + "logps/chosen": -273.614501953125, + "logps/rejected": -171.8970947265625, + "loss": 0.6172, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2901607155799866, + "rewards/margins": 0.3553035855293274, + "rewards/rejected": -0.06514289230108261, + "step": 538 + }, + { + "epoch": 0.32860844383478127, + "grad_norm": 65.66457853221078, + "learning_rate": 2.326829268292683e-08, + "logits/chosen": -0.1431131213903427, + "logits/rejected": 0.35114675760269165, + "logps/chosen": -268.68603515625, + "logps/rejected": -232.76162719726562, + "loss": 0.6342, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.17569391429424286, + "rewards/margins": 0.11183520406484604, + "rewards/rejected": 0.06385871767997742, + "step": 539 + }, + { + "epoch": 0.3292181069958848, + "grad_norm": 60.60200061745704, + "learning_rate": 2.3312195121951218e-08, + "logits/chosen": -0.08310922980308533, + "logits/rejected": 0.19544613361358643, + "logps/chosen": -364.769287109375, + "logps/rejected": -413.97247314453125, + "loss": 0.6362, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4459804594516754, + "rewards/margins": -0.013006415218114853, + "rewards/rejected": 0.45898687839508057, + "step": 540 + }, + { + "epoch": 0.3298277701569883, + "grad_norm": 59.5024423401539, + "learning_rate": 2.335609756097561e-08, + "logits/chosen": -0.17287477850914001, + "logits/rejected": -0.12527330219745636, + "logps/chosen": -251.46896362304688, + "logps/rejected": -162.64642333984375, + "loss": 0.6182, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.31934016942977905, + "rewards/margins": 0.1540745496749878, + "rewards/rejected": 0.16526560485363007, + "step": 541 + }, + { + "epoch": 0.3304374333180918, + "grad_norm": 66.87614981420792, + "learning_rate": 2.34e-08, + "logits/chosen": 0.156075119972229, + "logits/rejected": 0.4485046863555908, + "logps/chosen": -497.4200439453125, + "logps/rejected": 
-308.34283447265625, + "loss": 0.6309, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3739921748638153, + "rewards/margins": 0.38053667545318604, + "rewards/rejected": -0.006544498726725578, + "step": 542 + }, + { + "epoch": 0.3310470964791952, + "grad_norm": 65.61149390934, + "learning_rate": 2.344390243902439e-08, + "logits/chosen": 0.06245467811822891, + "logits/rejected": 0.21399304270744324, + "logps/chosen": -416.5924987792969, + "logps/rejected": -322.1553649902344, + "loss": 0.6671, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.22536620497703552, + "rewards/margins": 0.2942011058330536, + "rewards/rejected": -0.06883488595485687, + "step": 543 + }, + { + "epoch": 0.3316567596402987, + "grad_norm": 67.07083115943713, + "learning_rate": 2.348780487804878e-08, + "logits/chosen": -0.0005963281728327274, + "logits/rejected": 0.07341817021369934, + "logps/chosen": -107.6330337524414, + "logps/rejected": -145.70623779296875, + "loss": 0.6575, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.001352405408397317, + "rewards/margins": -0.07420498877763748, + "rewards/rejected": 0.07285258173942566, + "step": 544 + }, + { + "epoch": 0.33226642280140223, + "grad_norm": 81.63252931962576, + "learning_rate": 2.353170731707317e-08, + "logits/chosen": 0.16309864819049835, + "logits/rejected": 0.42371779680252075, + "logps/chosen": -288.0001525878906, + "logps/rejected": -364.6177978515625, + "loss": 0.6596, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.04256870597600937, + "rewards/margins": -0.13648854196071625, + "rewards/rejected": 0.17905724048614502, + "step": 545 + }, + { + "epoch": 0.33287608596250573, + "grad_norm": 75.45374995454429, + "learning_rate": 2.357560975609756e-08, + "logits/chosen": 0.16709935665130615, + "logits/rejected": 0.05223012715578079, + "logps/chosen": -65.19556427001953, + "logps/rejected": -85.7304916381836, + "loss": 0.6486, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.07292532920837402, + "rewards/margins": 0.1342860907316208, + "rewards/rejected": -0.061360765248537064, + "step": 546 + }, + { + "epoch": 0.33348574912360923, + "grad_norm": 72.95823694610114, + "learning_rate": 2.361951219512195e-08, + "logits/chosen": -0.14129552245140076, + "logits/rejected": 0.22636649012565613, + "logps/chosen": -236.18309020996094, + "logps/rejected": -157.30853271484375, + "loss": 0.6344, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3008248209953308, + "rewards/margins": 0.20586901903152466, + "rewards/rejected": 0.09495582431554794, + "step": 547 + }, + { + "epoch": 0.3340954122847127, + "grad_norm": 71.19061168092637, + "learning_rate": 2.366341463414634e-08, + "logits/chosen": 0.19440124928951263, + "logits/rejected": 0.21193870902061462, + "logps/chosen": -179.04261779785156, + "logps/rejected": -165.11044311523438, + "loss": 0.622, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.15733900666236877, + "rewards/margins": 0.35409873723983765, + "rewards/rejected": -0.19675976037979126, + "step": 548 + }, + { + "epoch": 0.3347050754458162, + "grad_norm": 63.34199159951003, + "learning_rate": 2.3707317073170733e-08, + "logits/chosen": 0.249933123588562, + "logits/rejected": 0.23116062581539154, + "logps/chosen": -184.37347412109375, + "logps/rejected": -255.3083038330078, + "loss": 0.5696, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2683231234550476, + "rewards/margins": 0.49700772762298584, + "rewards/rejected": -0.22868461906909943, + "step": 549 + }, + { + "epoch": 0.3353147386069197, + "grad_norm": 55.812426895975854, + 
"learning_rate": 2.375121951219512e-08, + "logits/chosen": 0.1399931013584137, + "logits/rejected": 0.1378995031118393, + "logps/chosen": -126.45514678955078, + "logps/rejected": -93.26830291748047, + "loss": 0.6243, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.318451464176178, + "rewards/margins": 0.1935044676065445, + "rewards/rejected": 0.12494699656963348, + "step": 550 + }, + { + "epoch": 0.3359244017680232, + "grad_norm": 63.73615632089044, + "learning_rate": 2.3795121951219513e-08, + "logits/chosen": -0.05238550901412964, + "logits/rejected": -0.06300175189971924, + "logps/chosen": -62.070892333984375, + "logps/rejected": -65.6217041015625, + "loss": 0.6442, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.06361308693885803, + "rewards/margins": -0.04511312395334244, + "rewards/rejected": 0.10872621089220047, + "step": 551 + }, + { + "epoch": 0.33653406492912663, + "grad_norm": 60.98861820658699, + "learning_rate": 2.38390243902439e-08, + "logits/chosen": 0.00033330172300338745, + "logits/rejected": 0.49101191759109497, + "logps/chosen": -167.7310791015625, + "logps/rejected": -69.52841186523438, + "loss": 0.6688, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.1323298215866089, + "rewards/margins": -0.07170376926660538, + "rewards/rejected": 0.20403359830379486, + "step": 552 + }, + { + "epoch": 0.33714372809023013, + "grad_norm": 62.068599127478905, + "learning_rate": 2.3882926829268293e-08, + "logits/chosen": 0.07346828281879425, + "logits/rejected": 0.02286439761519432, + "logps/chosen": -62.5282096862793, + "logps/rejected": -20.32370376586914, + "loss": 0.6152, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07491831481456757, + "rewards/margins": 0.1205805242061615, + "rewards/rejected": -0.045662201941013336, + "step": 553 + }, + { + "epoch": 0.33775339125133363, + "grad_norm": 71.62768706552319, + "learning_rate": 2.392682926829268e-08, + "logits/chosen": 0.12269076704978943, + "logits/rejected": 0.061230845749378204, + "logps/chosen": -83.42967224121094, + "logps/rejected": -191.16082763671875, + "loss": 0.6555, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0845014825463295, + "rewards/margins": -0.11915123462677002, + "rewards/rejected": 0.20365272462368011, + "step": 554 + }, + { + "epoch": 0.33836305441243714, + "grad_norm": 80.26393332917321, + "learning_rate": 2.3970731707317073e-08, + "logits/chosen": 0.30099964141845703, + "logits/rejected": 0.3035103380680084, + "logps/chosen": -101.5709228515625, + "logps/rejected": -15.632425308227539, + "loss": 0.6439, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0711577907204628, + "rewards/margins": 0.25126728415489197, + "rewards/rejected": -0.18010951578617096, + "step": 555 + }, + { + "epoch": 0.33897271757354064, + "grad_norm": 71.05295871031151, + "learning_rate": 2.4014634146341465e-08, + "logits/chosen": -0.012122197076678276, + "logits/rejected": 0.025360623374581337, + "logps/chosen": -188.68231201171875, + "logps/rejected": -147.80508422851562, + "loss": 0.6335, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.04689235985279083, + "rewards/margins": -0.04648181423544884, + "rewards/rejected": 0.09337417036294937, + "step": 556 + }, + { + "epoch": 0.3395823807346441, + "grad_norm": 59.3051777736367, + "learning_rate": 2.4058536585365853e-08, + "logits/chosen": 0.1398051679134369, + "logits/rejected": 0.09347712993621826, + "logps/chosen": -144.3924102783203, + "logps/rejected": -133.54249572753906, + "loss": 0.6568, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.14586153626441956, + 
"rewards/margins": 0.03632477670907974, + "rewards/rejected": 0.10953675210475922, + "step": 557 + }, + { + "epoch": 0.3401920438957476, + "grad_norm": 89.3280318886537, + "learning_rate": 2.4102439024390245e-08, + "logits/chosen": 0.26811841130256653, + "logits/rejected": 0.23905262351036072, + "logps/chosen": -41.126434326171875, + "logps/rejected": -100.11540222167969, + "loss": 0.6867, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.055931802839040756, + "rewards/margins": 0.059824489057064056, + "rewards/rejected": -0.003892684355378151, + "step": 558 + }, + { + "epoch": 0.3408017070568511, + "grad_norm": 71.71546246082835, + "learning_rate": 2.4146341463414633e-08, + "logits/chosen": -0.025946272537112236, + "logits/rejected": -0.0661325678229332, + "logps/chosen": -365.43255615234375, + "logps/rejected": -279.7004089355469, + "loss": 0.6458, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.09921663254499435, + "rewards/margins": 0.05765338987112045, + "rewards/rejected": 0.0415632463991642, + "step": 559 + }, + { + "epoch": 0.3414113702179546, + "grad_norm": 60.206821993863926, + "learning_rate": 2.4190243902439025e-08, + "logits/chosen": -0.03434105962514877, + "logits/rejected": -0.031055085361003876, + "logps/chosen": -103.45355224609375, + "logps/rejected": -94.31544494628906, + "loss": 0.6573, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.12553460896015167, + "rewards/margins": 0.10331953316926956, + "rewards/rejected": 0.022215068340301514, + "step": 560 + }, + { + "epoch": 0.3420210333790581, + "grad_norm": 61.698290186140916, + "learning_rate": 2.4234146341463413e-08, + "logits/chosen": 0.38516178727149963, + "logits/rejected": 0.11057097464799881, + "logps/chosen": -31.34721565246582, + "logps/rejected": -58.38990783691406, + "loss": 0.6082, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.02334180474281311, + "rewards/margins": 0.1805194914340973, + "rewards/rejected": -0.15717768669128418, + "step": 561 + }, + { + "epoch": 0.34263069654016154, + "grad_norm": 68.31080004404484, + "learning_rate": 2.4278048780487805e-08, + "logits/chosen": 0.1655484139919281, + "logits/rejected": 0.1072499006986618, + "logps/chosen": -163.44290161132812, + "logps/rejected": -214.94918823242188, + "loss": 0.6252, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4357205927371979, + "rewards/margins": 0.15322071313858032, + "rewards/rejected": 0.28249990940093994, + "step": 562 + }, + { + "epoch": 0.34324035970126504, + "grad_norm": 63.549523457687265, + "learning_rate": 2.4321951219512197e-08, + "logits/chosen": 0.17718505859375, + "logits/rejected": 0.32033926248550415, + "logps/chosen": -126.55470275878906, + "logps/rejected": -42.37413024902344, + "loss": 0.6359, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2075459212064743, + "rewards/margins": 0.22927585244178772, + "rewards/rejected": -0.02172992192208767, + "step": 563 + }, + { + "epoch": 0.34385002286236854, + "grad_norm": 69.94560030516728, + "learning_rate": 2.4365853658536585e-08, + "logits/chosen": 0.04947260022163391, + "logits/rejected": 0.13389050960540771, + "logps/chosen": -206.68463134765625, + "logps/rejected": -107.59349822998047, + "loss": 0.6131, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.33965569734573364, + "rewards/margins": 0.27065369486808777, + "rewards/rejected": 0.06900197267532349, + "step": 564 + }, + { + "epoch": 0.34445968602347204, + "grad_norm": 63.260715615579315, + "learning_rate": 2.4409756097560977e-08, + "logits/chosen": 0.0563773475587368, + "logits/rejected": 
0.27324387431144714, + "logps/chosen": -266.93096923828125, + "logps/rejected": -228.7593994140625, + "loss": 0.6155, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.17397937178611755, + "rewards/margins": 0.08989344537258148, + "rewards/rejected": 0.08408594131469727, + "step": 565 + }, + { + "epoch": 0.34506934918457555, + "grad_norm": 59.0380784420323, + "learning_rate": 2.4453658536585365e-08, + "logits/chosen": -0.053584471344947815, + "logits/rejected": 0.08549386262893677, + "logps/chosen": -189.98947143554688, + "logps/rejected": -146.95396423339844, + "loss": 0.6506, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2520523965358734, + "rewards/margins": 0.15931786596775055, + "rewards/rejected": 0.09273453056812286, + "step": 566 + }, + { + "epoch": 0.345679012345679, + "grad_norm": 62.21306142356516, + "learning_rate": 2.4497560975609757e-08, + "logits/chosen": 0.22712036967277527, + "logits/rejected": 0.28604844212532043, + "logps/chosen": -157.5207061767578, + "logps/rejected": -165.08287048339844, + "loss": 0.6307, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14640095829963684, + "rewards/margins": 0.2731837034225464, + "rewards/rejected": -0.12678274512290955, + "step": 567 + }, + { + "epoch": 0.3462886755067825, + "grad_norm": 63.807436263494004, + "learning_rate": 2.4541463414634145e-08, + "logits/chosen": -0.3708324432373047, + "logits/rejected": -0.13896696269512177, + "logps/chosen": -153.66146850585938, + "logps/rejected": -145.76602172851562, + "loss": 0.6455, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.08404551446437836, + "rewards/margins": 0.06497307866811752, + "rewards/rejected": 0.019072428345680237, + "step": 568 + }, + { + "epoch": 0.346898338667886, + "grad_norm": 72.64496190421848, + "learning_rate": 2.4585365853658537e-08, + "logits/chosen": 0.250077486038208, + "logits/rejected": 0.24369224905967712, + "logps/chosen": -26.746715545654297, + "logps/rejected": -8.553413391113281, + "loss": 0.6595, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.04704287275671959, + "rewards/margins": 0.07402165234088898, + "rewards/rejected": -0.12106452882289886, + "step": 569 + }, + { + "epoch": 0.3475080018289895, + "grad_norm": 66.27267922513238, + "learning_rate": 2.462926829268293e-08, + "logits/chosen": 0.16420923173427582, + "logits/rejected": 0.26274797320365906, + "logps/chosen": -132.50790405273438, + "logps/rejected": -165.24411010742188, + "loss": 0.6342, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3622353971004486, + "rewards/margins": 0.5996338129043579, + "rewards/rejected": -0.2373984009027481, + "step": 570 + }, + { + "epoch": 0.348117664990093, + "grad_norm": 66.13326481370741, + "learning_rate": 2.4673170731707317e-08, + "logits/chosen": 0.20698092877864838, + "logits/rejected": 0.1994612067937851, + "logps/chosen": -127.6097412109375, + "logps/rejected": -132.74673461914062, + "loss": 0.6664, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.24468494951725006, + "rewards/margins": 0.17810288071632385, + "rewards/rejected": 0.06658206135034561, + "step": 571 + }, + { + "epoch": 0.34872732815119645, + "grad_norm": 57.7065467173085, + "learning_rate": 2.471707317073171e-08, + "logits/chosen": 0.3937755823135376, + "logits/rejected": 0.3192245662212372, + "logps/chosen": -169.89508056640625, + "logps/rejected": -196.94338989257812, + "loss": 0.6427, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.28047704696655273, + "rewards/margins": 0.1190960705280304, + "rewards/rejected": 0.16138097643852234, + "step": 572 + }, + { + 
"epoch": 0.34933699131229995, + "grad_norm": 61.772205677062786, + "learning_rate": 2.4760975609756094e-08, + "logits/chosen": 0.03855567425489426, + "logits/rejected": 0.19162461161613464, + "logps/chosen": -150.66961669921875, + "logps/rejected": -136.82655334472656, + "loss": 0.6092, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3261014521121979, + "rewards/margins": 0.2646014392375946, + "rewards/rejected": 0.061499979346990585, + "step": 573 + }, + { + "epoch": 0.34994665447340345, + "grad_norm": 79.087034481911, + "learning_rate": 2.4804878048780485e-08, + "logits/chosen": -0.046487804502248764, + "logits/rejected": -0.0954296812415123, + "logps/chosen": -118.04635620117188, + "logps/rejected": -134.03256225585938, + "loss": 0.6336, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.09429363906383514, + "rewards/margins": -0.03607354685664177, + "rewards/rejected": 0.1303671896457672, + "step": 574 + }, + { + "epoch": 0.35055631763450695, + "grad_norm": 61.058163521118374, + "learning_rate": 2.4848780487804874e-08, + "logits/chosen": 0.3075591027736664, + "logits/rejected": 0.25359150767326355, + "logps/chosen": -80.47167205810547, + "logps/rejected": -81.86988067626953, + "loss": 0.6882, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.029255548492074013, + "rewards/margins": 0.03396584466099739, + "rewards/rejected": -0.004710295237600803, + "step": 575 + }, + { + "epoch": 0.3511659807956104, + "grad_norm": 68.89227737141198, + "learning_rate": 2.4892682926829265e-08, + "logits/chosen": -0.1473444551229477, + "logits/rejected": 0.18843889236450195, + "logps/chosen": -91.84371185302734, + "logps/rejected": -104.91816711425781, + "loss": 0.6436, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.044121529906988144, + "rewards/margins": -0.1411334127187729, + "rewards/rejected": 0.09701188653707504, + "step": 576 + }, + { + "epoch": 0.3517756439567139, + "grad_norm": 64.52521815440315, + "learning_rate": 2.4936585365853654e-08, + "logits/chosen": 0.2751694619655609, + "logits/rejected": 0.1315845549106598, + "logps/chosen": -6.20741605758667, + "logps/rejected": -28.119037628173828, + "loss": 0.6317, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.004677858203649521, + "rewards/margins": -0.004088805988430977, + "rewards/rejected": 0.008766662329435349, + "step": 577 + }, + { + "epoch": 0.3523853071178174, + "grad_norm": 61.48646710998834, + "learning_rate": 2.4980487804878046e-08, + "logits/chosen": 0.10936979949474335, + "logits/rejected": 0.008752191439270973, + "logps/chosen": -75.4015884399414, + "logps/rejected": -107.59015655517578, + "loss": 0.6482, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.06906801462173462, + "rewards/margins": 0.146926149725914, + "rewards/rejected": -0.07785812765359879, + "step": 578 + }, + { + "epoch": 0.3529949702789209, + "grad_norm": 63.98283651957225, + "learning_rate": 2.5024390243902437e-08, + "logits/chosen": 0.39335334300994873, + "logits/rejected": 0.28164124488830566, + "logps/chosen": -45.73773956298828, + "logps/rejected": -87.71354675292969, + "loss": 0.6305, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.043201033025979996, + "rewards/margins": 0.014671443030238152, + "rewards/rejected": -0.0578724704682827, + "step": 579 + }, + { + "epoch": 0.3536046334400244, + "grad_norm": 72.69712815816003, + "learning_rate": 2.5068292682926826e-08, + "logits/chosen": -0.1359988898038864, + "logits/rejected": -0.0037320032715797424, + "logps/chosen": -393.5069885253906, + "logps/rejected": -309.19940185546875, + "loss": 
0.6311, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.3374473452568054, + "rewards/margins": -0.012655630707740784, + "rewards/rejected": 0.3501029908657074, + "step": 580 + }, + { + "epoch": 0.35421429660112785, + "grad_norm": 70.34952103315145, + "learning_rate": 2.5112195121951217e-08, + "logits/chosen": -0.2626177668571472, + "logits/rejected": -0.10716196149587631, + "logps/chosen": -189.8940887451172, + "logps/rejected": -88.66100311279297, + "loss": 0.6163, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2558242678642273, + "rewards/margins": 0.19720950722694397, + "rewards/rejected": 0.058614760637283325, + "step": 581 + }, + { + "epoch": 0.35482395976223136, + "grad_norm": 60.92662912150131, + "learning_rate": 2.5156097560975606e-08, + "logits/chosen": 0.17372733354568481, + "logits/rejected": 0.09753230959177017, + "logps/chosen": -40.65727996826172, + "logps/rejected": -53.03999710083008, + "loss": 0.6426, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.01947101578116417, + "rewards/margins": 0.16377048194408417, + "rewards/rejected": -0.1442994624376297, + "step": 582 + }, + { + "epoch": 0.35543362292333486, + "grad_norm": 65.82111635184187, + "learning_rate": 2.5199999999999997e-08, + "logits/chosen": 0.15699997544288635, + "logits/rejected": 0.1702466607093811, + "logps/chosen": -108.42682647705078, + "logps/rejected": -67.42084503173828, + "loss": 0.633, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08658311516046524, + "rewards/margins": 0.3346436321735382, + "rewards/rejected": -0.24806050956249237, + "step": 583 + }, + { + "epoch": 0.35604328608443836, + "grad_norm": 61.589723908941714, + "learning_rate": 2.5243902439024386e-08, + "logits/chosen": 0.24087904393672943, + "logits/rejected": 0.4224730134010315, + "logps/chosen": -124.71540832519531, + "logps/rejected": -80.59169006347656, + "loss": 0.6058, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06363040953874588, + "rewards/margins": 0.09655527770519257, + "rewards/rejected": -0.032924868166446686, + "step": 584 + }, + { + "epoch": 0.35665294924554186, + "grad_norm": 69.94307749141346, + "learning_rate": 2.5287804878048777e-08, + "logits/chosen": -0.1330254077911377, + "logits/rejected": 0.2797281742095947, + "logps/chosen": -322.4261169433594, + "logps/rejected": -200.09893798828125, + "loss": 0.6805, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.354610800743103, + "rewards/margins": 0.17412516474723816, + "rewards/rejected": 0.18048560619354248, + "step": 585 + }, + { + "epoch": 0.3572626124066453, + "grad_norm": 60.674530083099555, + "learning_rate": 2.533170731707317e-08, + "logits/chosen": 0.15709838271141052, + "logits/rejected": 0.09432573616504669, + "logps/chosen": -201.58251953125, + "logps/rejected": -287.25811767578125, + "loss": 0.6596, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.22978299856185913, + "rewards/margins": 0.34129127860069275, + "rewards/rejected": -0.11150828003883362, + "step": 586 + }, + { + "epoch": 0.3578722755677488, + "grad_norm": 57.70496811338579, + "learning_rate": 2.5375609756097557e-08, + "logits/chosen": 0.13312861323356628, + "logits/rejected": 0.14644429087638855, + "logps/chosen": -26.847915649414062, + "logps/rejected": -12.667652130126953, + "loss": 0.6401, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.05336686596274376, + "rewards/margins": 0.060440223664045334, + "rewards/rejected": -0.11380708962678909, + "step": 587 + }, + { + "epoch": 0.3584819387288523, + "grad_norm": 66.64224583434539, + "learning_rate": 
2.541951219512195e-08, + "logits/chosen": 0.07735345512628555, + "logits/rejected": -0.014005718752741814, + "logps/chosen": -289.8581237792969, + "logps/rejected": -347.80267333984375, + "loss": 0.6078, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4404403567314148, + "rewards/margins": 0.3935664892196655, + "rewards/rejected": 0.04687386006116867, + "step": 588 + }, + { + "epoch": 0.3590916018899558, + "grad_norm": 70.91864831755105, + "learning_rate": 2.5463414634146338e-08, + "logits/chosen": -0.07984452694654465, + "logits/rejected": -0.11352888494729996, + "logps/chosen": -50.47224044799805, + "logps/rejected": -70.70494842529297, + "loss": 0.6122, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0006060954183340073, + "rewards/margins": 0.10464400053024292, + "rewards/rejected": -0.10403790324926376, + "step": 589 + }, + { + "epoch": 0.3597012650510593, + "grad_norm": 58.66353988250958, + "learning_rate": 2.550731707317073e-08, + "logits/chosen": 0.1725425124168396, + "logits/rejected": 0.13134923577308655, + "logps/chosen": -162.28570556640625, + "logps/rejected": -213.63504028320312, + "loss": 0.6036, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2620026767253876, + "rewards/margins": 0.5470747947692871, + "rewards/rejected": -0.2850721478462219, + "step": 590 + }, + { + "epoch": 0.36031092821216276, + "grad_norm": 67.7212065027267, + "learning_rate": 2.555121951219512e-08, + "logits/chosen": 0.3118616044521332, + "logits/rejected": 0.37517768144607544, + "logps/chosen": -32.91007995605469, + "logps/rejected": -36.30499267578125, + "loss": 0.6835, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0029077772051095963, + "rewards/margins": -0.015037847682833672, + "rewards/rejected": 0.017945624887943268, + "step": 591 + }, + { + "epoch": 0.36092059137326626, + "grad_norm": 61.84826057047049, + "learning_rate": 2.559512195121951e-08, + "logits/chosen": 0.09155456721782684, + "logits/rejected": 0.17857593297958374, + "logps/chosen": -71.72034454345703, + "logps/rejected": -34.7586669921875, + "loss": 0.6194, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.006021538749337196, + "rewards/margins": 0.09854938089847565, + "rewards/rejected": -0.10457092523574829, + "step": 592 + }, + { + "epoch": 0.36153025453436977, + "grad_norm": 59.44271924749285, + "learning_rate": 2.56390243902439e-08, + "logits/chosen": 0.14848221838474274, + "logits/rejected": 0.13061706721782684, + "logps/chosen": -45.13452911376953, + "logps/rejected": -103.86456298828125, + "loss": 0.5884, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.042713914066553116, + "rewards/margins": 0.01090623065829277, + "rewards/rejected": -0.053620144724845886, + "step": 593 + }, + { + "epoch": 0.36213991769547327, + "grad_norm": 55.28440428018266, + "learning_rate": 2.568292682926829e-08, + "logits/chosen": 0.007122687995433807, + "logits/rejected": 0.0011376794427633286, + "logps/chosen": -131.5517120361328, + "logps/rejected": -125.87528991699219, + "loss": 0.6337, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.07693450897932053, + "rewards/margins": 0.0759023129940033, + "rewards/rejected": 0.0010321959853172302, + "step": 594 + }, + { + "epoch": 0.3627495808565767, + "grad_norm": 57.442289571363595, + "learning_rate": 2.572682926829268e-08, + "logits/chosen": 0.26929086446762085, + "logits/rejected": 0.06947465986013412, + "logps/chosen": -281.8733215332031, + "logps/rejected": -255.4256134033203, + "loss": 0.6305, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.21762491762638092, + 
"rewards/margins": 0.6367673277854919, + "rewards/rejected": -0.41914239525794983, + "step": 595 + }, + { + "epoch": 0.3633592440176802, + "grad_norm": 60.725394018101916, + "learning_rate": 2.577073170731707e-08, + "logits/chosen": 0.13580162823200226, + "logits/rejected": 0.1205705925822258, + "logps/chosen": -12.920661926269531, + "logps/rejected": -41.98260498046875, + "loss": 0.5966, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.025257421657443047, + "rewards/margins": 0.2696884274482727, + "rewards/rejected": -0.2949458658695221, + "step": 596 + }, + { + "epoch": 0.3639689071787837, + "grad_norm": 61.11923315504998, + "learning_rate": 2.581463414634146e-08, + "logits/chosen": 0.2698628902435303, + "logits/rejected": 0.24275220930576324, + "logps/chosen": -144.9392547607422, + "logps/rejected": -102.06745910644531, + "loss": 0.5998, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3213733732700348, + "rewards/margins": 0.4839481711387634, + "rewards/rejected": -0.16257481276988983, + "step": 597 + }, + { + "epoch": 0.3645785703398872, + "grad_norm": 78.61711851550798, + "learning_rate": 2.5858536585365853e-08, + "logits/chosen": 0.050785765051841736, + "logits/rejected": 0.19618116319179535, + "logps/chosen": -277.87274169921875, + "logps/rejected": -258.81658935546875, + "loss": 0.6219, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.07663650810718536, + "rewards/margins": 0.04727335274219513, + "rewards/rejected": 0.029363155364990234, + "step": 598 + }, + { + "epoch": 0.3651882335009907, + "grad_norm": 54.04566101972143, + "learning_rate": 2.590243902439024e-08, + "logits/chosen": 0.11316078901290894, + "logits/rejected": 0.1296045333147049, + "logps/chosen": -11.70112419128418, + "logps/rejected": -17.967309951782227, + "loss": 0.6094, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.22652791440486908, + "rewards/margins": 0.0768495574593544, + "rewards/rejected": -0.3033774495124817, + "step": 599 + }, + { + "epoch": 0.36579789666209417, + "grad_norm": 82.02657260746992, + "learning_rate": 2.5946341463414633e-08, + "logits/chosen": 0.0012502595782279968, + "logits/rejected": -0.0025876015424728394, + "logps/chosen": -196.02114868164062, + "logps/rejected": -176.1319580078125, + "loss": 0.6134, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.20056456327438354, + "rewards/margins": 0.05789215862751007, + "rewards/rejected": 0.14267241954803467, + "step": 600 + }, + { + "epoch": 0.36640755982319767, + "grad_norm": 73.01668902389659, + "learning_rate": 2.599024390243902e-08, + "logits/chosen": 0.08000324666500092, + "logits/rejected": 0.10965539515018463, + "logps/chosen": -262.96356201171875, + "logps/rejected": -286.09930419921875, + "loss": 0.6877, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.32981452345848083, + "rewards/margins": 0.4419025182723999, + "rewards/rejected": -0.11208801716566086, + "step": 601 + }, + { + "epoch": 0.3670172229843012, + "grad_norm": 62.326988624410966, + "learning_rate": 2.6034146341463413e-08, + "logits/chosen": 0.21029147505760193, + "logits/rejected": 0.18066349625587463, + "logps/chosen": -10.674514770507812, + "logps/rejected": -10.799851417541504, + "loss": 0.6504, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0390445850789547, + "rewards/margins": 0.20681487023830414, + "rewards/rejected": -0.24585945904254913, + "step": 602 + }, + { + "epoch": 0.3676268861454047, + "grad_norm": 88.21455280476565, + "learning_rate": 2.60780487804878e-08, + "logits/chosen": 0.22360165417194366, + "logits/rejected": 
0.3238444924354553, + "logps/chosen": -282.7586669921875, + "logps/rejected": -458.24884033203125, + "loss": 0.6415, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21803241968154907, + "rewards/margins": 0.10435838997364044, + "rewards/rejected": 0.11367402970790863, + "step": 603 + }, + { + "epoch": 0.3682365493065082, + "grad_norm": 74.28301102381702, + "learning_rate": 2.6121951219512193e-08, + "logits/chosen": 0.09303002059459686, + "logits/rejected": 0.0715787410736084, + "logps/chosen": -138.5469512939453, + "logps/rejected": -226.74530029296875, + "loss": 0.5986, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.2518017590045929, + "rewards/margins": 0.0004120338708162308, + "rewards/rejected": 0.2513897120952606, + "step": 604 + }, + { + "epoch": 0.3688462124676116, + "grad_norm": 64.03420564684374, + "learning_rate": 2.6165853658536585e-08, + "logits/chosen": 0.04821214824914932, + "logits/rejected": 0.02449846640229225, + "logps/chosen": -17.56277847290039, + "logps/rejected": -19.87277603149414, + "loss": 0.6, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.028018198907375336, + "rewards/margins": 0.03261035308241844, + "rewards/rejected": -0.060628555715084076, + "step": 605 + }, + { + "epoch": 0.3694558756287151, + "grad_norm": 85.19802946627, + "learning_rate": 2.6209756097560973e-08, + "logits/chosen": 0.04915054887533188, + "logits/rejected": 0.17558442056179047, + "logps/chosen": -292.1290588378906, + "logps/rejected": -250.21385192871094, + "loss": 0.6789, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2755224406719208, + "rewards/margins": 0.2095935046672821, + "rewards/rejected": 0.06592892855405807, + "step": 606 + }, + { + "epoch": 0.3700655387898186, + "grad_norm": 67.86329763868036, + "learning_rate": 2.6253658536585365e-08, + "logits/chosen": 0.10586974769830704, + "logits/rejected": 0.2383757382631302, + "logps/chosen": -30.485618591308594, + "logps/rejected": -30.937406539916992, + "loss": 0.6889, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.23874562978744507, + "rewards/margins": -0.07639502733945847, + "rewards/rejected": -0.162350594997406, + "step": 607 + }, + { + "epoch": 0.37067520195092213, + "grad_norm": 65.64589668684825, + "learning_rate": 2.6297560975609753e-08, + "logits/chosen": 0.4129244089126587, + "logits/rejected": 0.43831074237823486, + "logps/chosen": -120.45503234863281, + "logps/rejected": -67.74662780761719, + "loss": 0.6077, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21287314593791962, + "rewards/margins": 0.41677525639533997, + "rewards/rejected": -0.20390209555625916, + "step": 608 + }, + { + "epoch": 0.37128486511202563, + "grad_norm": 66.54793285466044, + "learning_rate": 2.6341463414634145e-08, + "logits/chosen": -0.08084888756275177, + "logits/rejected": 0.12200430035591125, + "logps/chosen": -184.31590270996094, + "logps/rejected": -74.2082748413086, + "loss": 0.649, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.030466752126812935, + "rewards/margins": 0.13831934332847595, + "rewards/rejected": -0.16878609359264374, + "step": 609 + }, + { + "epoch": 0.3718945282731291, + "grad_norm": 65.99674430249515, + "learning_rate": 2.6385365853658533e-08, + "logits/chosen": -0.0379466786980629, + "logits/rejected": -0.013483867049217224, + "logps/chosen": -125.67252349853516, + "logps/rejected": -161.0394287109375, + "loss": 0.6256, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09288941323757172, + "rewards/margins": 0.2684478461742401, + "rewards/rejected": -0.1755584180355072, + "step": 610 + }, + { 
+ "epoch": 0.3725041914342326, + "grad_norm": 76.62280528901553, + "learning_rate": 2.6429268292682925e-08, + "logits/chosen": 0.07567193359136581, + "logits/rejected": 0.07395648211240768, + "logps/chosen": -209.67477416992188, + "logps/rejected": -286.802001953125, + "loss": 0.5782, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.17806631326675415, + "rewards/margins": -0.10854325443506241, + "rewards/rejected": 0.28660956025123596, + "step": 611 + }, + { + "epoch": 0.3731138545953361, + "grad_norm": 62.54991344920887, + "learning_rate": 2.6473170731707317e-08, + "logits/chosen": 0.3633056879043579, + "logits/rejected": 0.004547901451587677, + "logps/chosen": -409.0431213378906, + "logps/rejected": -439.3016662597656, + "loss": 0.5697, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4457567632198334, + "rewards/margins": 0.5710694193840027, + "rewards/rejected": -0.12531261146068573, + "step": 612 + }, + { + "epoch": 0.3737235177564396, + "grad_norm": 66.9626481658841, + "learning_rate": 2.6517073170731705e-08, + "logits/chosen": -0.004068553447723389, + "logits/rejected": 0.010545983910560608, + "logps/chosen": -14.627309799194336, + "logps/rejected": -17.238407135009766, + "loss": 0.6377, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.005788658745586872, + "rewards/margins": 0.06385204941034317, + "rewards/rejected": -0.06964070349931717, + "step": 613 + }, + { + "epoch": 0.3743331809175431, + "grad_norm": 74.93035243234006, + "learning_rate": 2.6560975609756097e-08, + "logits/chosen": 0.08081544935703278, + "logits/rejected": 0.40592074394226074, + "logps/chosen": -182.25711059570312, + "logps/rejected": -106.70711517333984, + "loss": 0.6102, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.11156254261732101, + "rewards/margins": 0.03907126933336258, + "rewards/rejected": 0.07249126583337784, + "step": 614 + }, + { + "epoch": 0.37494284407864653, + "grad_norm": 61.75363896667214, + "learning_rate": 2.6604878048780485e-08, + "logits/chosen": 0.2579991817474365, + "logits/rejected": 0.18988223373889923, + "logps/chosen": -255.18344116210938, + "logps/rejected": -276.07342529296875, + "loss": 0.599, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6108578443527222, + "rewards/margins": 1.0364336967468262, + "rewards/rejected": -0.42557579278945923, + "step": 615 + }, + { + "epoch": 0.37555250723975003, + "grad_norm": 63.01762850157472, + "learning_rate": 2.6648780487804877e-08, + "logits/chosen": 0.2430221140384674, + "logits/rejected": 0.2056267410516739, + "logps/chosen": -56.53331756591797, + "logps/rejected": -109.94190216064453, + "loss": 0.632, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.002829909324645996, + "rewards/margins": 0.08583585917949677, + "rewards/rejected": -0.08866576850414276, + "step": 616 + }, + { + "epoch": 0.37616217040085353, + "grad_norm": 61.26783468198263, + "learning_rate": 2.6692682926829265e-08, + "logits/chosen": 0.05079840123653412, + "logits/rejected": 0.24230441451072693, + "logps/chosen": -239.391845703125, + "logps/rejected": -182.6611785888672, + "loss": 0.5432, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3134889006614685, + "rewards/margins": 0.33168116211891174, + "rewards/rejected": -0.01819225214421749, + "step": 617 + }, + { + "epoch": 0.37677183356195704, + "grad_norm": 72.05342352266527, + "learning_rate": 2.6736585365853657e-08, + "logits/chosen": 0.03983582556247711, + "logits/rejected": 0.2674873173236847, + "logps/chosen": -188.82066345214844, + "logps/rejected": -243.94949340820312, + "loss": 0.6583, + 
"rewards/accuracies": 0.75, + "rewards/chosen": 0.36443835496902466, + "rewards/margins": 0.336212694644928, + "rewards/rejected": 0.02822566032409668, + "step": 618 + }, + { + "epoch": 0.3773814967230605, + "grad_norm": 53.29340159227129, + "learning_rate": 2.678048780487805e-08, + "logits/chosen": 0.08060707151889801, + "logits/rejected": 0.2136414647102356, + "logps/chosen": -106.3077392578125, + "logps/rejected": -167.1398468017578, + "loss": 0.6813, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.12663131952285767, + "rewards/margins": -0.055943287909030914, + "rewards/rejected": 0.18257459998130798, + "step": 619 + }, + { + "epoch": 0.377991159884164, + "grad_norm": 66.32848817785538, + "learning_rate": 2.6824390243902437e-08, + "logits/chosen": 0.1777549833059311, + "logits/rejected": 0.20667846500873566, + "logps/chosen": -265.8216857910156, + "logps/rejected": -93.6600341796875, + "loss": 0.6218, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.17918093502521515, + "rewards/margins": 0.2043815553188324, + "rewards/rejected": -0.0252006184309721, + "step": 620 + }, + { + "epoch": 0.3786008230452675, + "grad_norm": 64.159257404677, + "learning_rate": 2.686829268292683e-08, + "logits/chosen": 0.028806494548916817, + "logits/rejected": 0.1387287676334381, + "logps/chosen": -235.42311096191406, + "logps/rejected": -163.48081970214844, + "loss": 0.604, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.31116387248039246, + "rewards/margins": 0.46241557598114014, + "rewards/rejected": -0.15125170350074768, + "step": 621 + }, + { + "epoch": 0.379210486206371, + "grad_norm": 61.34920112663299, + "learning_rate": 2.6912195121951217e-08, + "logits/chosen": 0.2551673650741577, + "logits/rejected": 0.2296556830406189, + "logps/chosen": -73.38729095458984, + "logps/rejected": -97.46251678466797, + "loss": 0.6072, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.06867722421884537, + "rewards/margins": 0.24628911912441254, + "rewards/rejected": -0.17761187255382538, + "step": 622 + }, + { + "epoch": 0.3798201493674745, + "grad_norm": 64.58048001676416, + "learning_rate": 2.695609756097561e-08, + "logits/chosen": 0.16222016513347626, + "logits/rejected": 0.4692246913909912, + "logps/chosen": -123.88907623291016, + "logps/rejected": -281.60882568359375, + "loss": 0.5987, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.24188756942749023, + "rewards/margins": 0.4141005575656891, + "rewards/rejected": -0.17221297323703766, + "step": 623 + }, + { + "epoch": 0.38042981252857794, + "grad_norm": 64.59196605516046, + "learning_rate": 2.6999999999999997e-08, + "logits/chosen": 0.2751791477203369, + "logits/rejected": 0.020809587091207504, + "logps/chosen": -121.01636505126953, + "logps/rejected": -124.5457992553711, + "loss": 0.6112, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2425137460231781, + "rewards/margins": 0.3062474727630615, + "rewards/rejected": -0.06373371928930283, + "step": 624 + }, + { + "epoch": 0.38103947568968144, + "grad_norm": 76.26461460607403, + "learning_rate": 2.704390243902439e-08, + "logits/chosen": 0.3228560984134674, + "logits/rejected": 0.26304954290390015, + "logps/chosen": -377.4947509765625, + "logps/rejected": -228.72023010253906, + "loss": 0.6185, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.9364129900932312, + "rewards/margins": 0.6026533842086792, + "rewards/rejected": 0.333759605884552, + "step": 625 + }, + { + "epoch": 0.38164913885078494, + "grad_norm": 61.474582734685946, + "learning_rate": 2.708780487804878e-08, + "logits/chosen": 
0.04169579595327377, + "logits/rejected": -0.024029143154621124, + "logps/chosen": -77.93717193603516, + "logps/rejected": -43.741764068603516, + "loss": 0.604, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03765379264950752, + "rewards/margins": 0.2079656720161438, + "rewards/rejected": -0.17031188309192657, + "step": 626 + }, + { + "epoch": 0.38225880201188844, + "grad_norm": 78.40463661409733, + "learning_rate": 2.713170731707317e-08, + "logits/chosen": 0.18412914872169495, + "logits/rejected": 0.25874099135398865, + "logps/chosen": -205.51585388183594, + "logps/rejected": -260.1470642089844, + "loss": 0.6656, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.02864876016974449, + "rewards/margins": -0.2759316861629486, + "rewards/rejected": 0.24728290736675262, + "step": 627 + }, + { + "epoch": 0.38286846517299195, + "grad_norm": 62.60707704672063, + "learning_rate": 2.717560975609756e-08, + "logits/chosen": 0.15112635493278503, + "logits/rejected": 0.19681477546691895, + "logps/chosen": -54.30095672607422, + "logps/rejected": -87.17314147949219, + "loss": 0.6241, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.03811212629079819, + "rewards/margins": -0.048415981233119965, + "rewards/rejected": 0.08652810752391815, + "step": 628 + }, + { + "epoch": 0.3834781283340954, + "grad_norm": 58.090678691824294, + "learning_rate": 2.721951219512195e-08, + "logits/chosen": -0.14929108321666718, + "logits/rejected": -0.1105942651629448, + "logps/chosen": -111.02405548095703, + "logps/rejected": -58.99564743041992, + "loss": 0.5989, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.041006699204444885, + "rewards/margins": 0.2893308401107788, + "rewards/rejected": -0.24832415580749512, + "step": 629 + }, + { + "epoch": 0.3840877914951989, + "grad_norm": 72.43955388155143, + "learning_rate": 2.726341463414634e-08, + "logits/chosen": 0.35203802585601807, + "logits/rejected": 0.2940349876880646, + "logps/chosen": -56.03721618652344, + "logps/rejected": -87.06900024414062, + "loss": 0.6378, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.26018720865249634, + "rewards/margins": 0.35242322087287903, + "rewards/rejected": -0.0922359973192215, + "step": 630 + }, + { + "epoch": 0.3846974546563024, + "grad_norm": 77.05555555464018, + "learning_rate": 2.7307317073170732e-08, + "logits/chosen": 0.02849944308400154, + "logits/rejected": 0.06376279145479202, + "logps/chosen": -233.94810485839844, + "logps/rejected": -178.75323486328125, + "loss": 0.6211, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.23587554693222046, + "rewards/margins": 0.2200503647327423, + "rewards/rejected": 0.01582520082592964, + "step": 631 + }, + { + "epoch": 0.3853071178174059, + "grad_norm": 67.95815795868499, + "learning_rate": 2.735121951219512e-08, + "logits/chosen": 0.16180747747421265, + "logits/rejected": 0.1460152268409729, + "logps/chosen": -79.08220672607422, + "logps/rejected": -144.6344757080078, + "loss": 0.7137, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.15169088542461395, + "rewards/margins": 0.2004600316286087, + "rewards/rejected": -0.04876915365457535, + "step": 632 + }, + { + "epoch": 0.3859167809785094, + "grad_norm": 66.57615580150977, + "learning_rate": 2.7395121951219512e-08, + "logits/chosen": 0.05967415124177933, + "logits/rejected": 0.04310440272092819, + "logps/chosen": -130.2741241455078, + "logps/rejected": -153.22393798828125, + "loss": 0.6184, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.07257428020238876, + "rewards/margins": 0.13157668709754944, + "rewards/rejected": 
-0.2041509598493576, + "step": 633 + }, + { + "epoch": 0.38652644413961285, + "grad_norm": 70.27046849210986, + "learning_rate": 2.74390243902439e-08, + "logits/chosen": 0.07667946815490723, + "logits/rejected": 0.1487220972776413, + "logps/chosen": -220.3509979248047, + "logps/rejected": -176.93246459960938, + "loss": 0.6569, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.22415469586849213, + "rewards/margins": 0.02412354201078415, + "rewards/rejected": 0.20003116130828857, + "step": 634 + }, + { + "epoch": 0.38713610730071635, + "grad_norm": 77.25348241348446, + "learning_rate": 2.7482926829268292e-08, + "logits/chosen": 0.197032630443573, + "logits/rejected": 0.2097669243812561, + "logps/chosen": -259.7798156738281, + "logps/rejected": -191.10232543945312, + "loss": 0.6192, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2788991630077362, + "rewards/margins": 0.4779600501060486, + "rewards/rejected": -0.19906088709831238, + "step": 635 + }, + { + "epoch": 0.38774577046181985, + "grad_norm": 88.06831080556853, + "learning_rate": 2.752682926829268e-08, + "logits/chosen": -0.2233126014471054, + "logits/rejected": 0.20519399642944336, + "logps/chosen": -324.0554504394531, + "logps/rejected": -229.15579223632812, + "loss": 0.6627, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.42614394426345825, + "rewards/margins": 0.22994571924209595, + "rewards/rejected": 0.1961982250213623, + "step": 636 + }, + { + "epoch": 0.38835543362292335, + "grad_norm": 59.53481307899062, + "learning_rate": 2.7570731707317072e-08, + "logits/chosen": 0.2308369278907776, + "logits/rejected": 0.1919964998960495, + "logps/chosen": -342.44024658203125, + "logps/rejected": -247.9741668701172, + "loss": 0.6123, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.4173545241355896, + "rewards/margins": 0.3117733597755432, + "rewards/rejected": 0.10558116436004639, + "step": 637 + }, + { + "epoch": 0.3889650967840268, + "grad_norm": 72.4663834849674, + "learning_rate": 2.7614634146341464e-08, + "logits/chosen": -0.15110087394714355, + "logits/rejected": -0.034562867134809494, + "logps/chosen": -423.80523681640625, + "logps/rejected": -349.52862548828125, + "loss": 0.6287, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.10292831063270569, + "rewards/margins": 0.027959585189819336, + "rewards/rejected": 0.07496871054172516, + "step": 638 + }, + { + "epoch": 0.3895747599451303, + "grad_norm": 64.31086506734529, + "learning_rate": 2.7658536585365852e-08, + "logits/chosen": 0.28114038705825806, + "logits/rejected": 0.18702393770217896, + "logps/chosen": -143.953857421875, + "logps/rejected": -242.09315490722656, + "loss": 0.6291, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.38662075996398926, + "rewards/margins": 0.841455340385437, + "rewards/rejected": -0.45483461022377014, + "step": 639 + }, + { + "epoch": 0.3901844231062338, + "grad_norm": 74.62046947229135, + "learning_rate": 2.7702439024390244e-08, + "logits/chosen": 0.09527795016765594, + "logits/rejected": 0.19550162553787231, + "logps/chosen": -236.70750427246094, + "logps/rejected": -175.36721801757812, + "loss": 0.6556, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3368799388408661, + "rewards/margins": 0.3067728579044342, + "rewards/rejected": 0.03010706976056099, + "step": 640 + }, + { + "epoch": 0.3907940862673373, + "grad_norm": 58.84896249669811, + "learning_rate": 2.7746341463414632e-08, + "logits/chosen": 0.21856433153152466, + "logits/rejected": 0.5366150140762329, + "logps/chosen": -138.56478881835938, + "logps/rejected": 
-122.19781494140625, + "loss": 0.6536, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.12763060629367828, + "rewards/margins": -0.14063982665538788, + "rewards/rejected": 0.013009227812290192, + "step": 641 + }, + { + "epoch": 0.3914037494284408, + "grad_norm": 74.98145898545785, + "learning_rate": 2.7790243902439024e-08, + "logits/chosen": 0.009811624884605408, + "logits/rejected": -0.10907647758722305, + "logps/chosen": -280.00177001953125, + "logps/rejected": -388.580078125, + "loss": 0.5937, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.1680523157119751, + "rewards/margins": -0.2368372678756714, + "rewards/rejected": 0.4048895835876465, + "step": 642 + }, + { + "epoch": 0.39201341258954425, + "grad_norm": 63.40548124910095, + "learning_rate": 2.7834146341463412e-08, + "logits/chosen": 0.13475245237350464, + "logits/rejected": 0.07800167798995972, + "logps/chosen": -103.65362548828125, + "logps/rejected": -132.5025634765625, + "loss": 0.6457, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.22668324410915375, + "rewards/margins": 0.20280678570270538, + "rewards/rejected": 0.023876458406448364, + "step": 643 + }, + { + "epoch": 0.39262307575064775, + "grad_norm": 59.45460211674044, + "learning_rate": 2.7878048780487804e-08, + "logits/chosen": 0.05927290767431259, + "logits/rejected": 0.15233726799488068, + "logps/chosen": -156.35134887695312, + "logps/rejected": -108.80286407470703, + "loss": 0.6305, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2818257808685303, + "rewards/margins": 0.09953179210424423, + "rewards/rejected": 0.18229399621486664, + "step": 644 + }, + { + "epoch": 0.39323273891175126, + "grad_norm": 74.82741469493133, + "learning_rate": 2.7921951219512196e-08, + "logits/chosen": -0.12207479774951935, + "logits/rejected": -0.033077970147132874, + "logps/chosen": -167.2114715576172, + "logps/rejected": -316.16888427734375, + "loss": 0.6721, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.09867286682128906, + "rewards/margins": -0.18251752853393555, + "rewards/rejected": 0.2811903953552246, + "step": 645 + }, + { + "epoch": 0.39384240207285476, + "grad_norm": 58.8966000715914, + "learning_rate": 2.7965853658536584e-08, + "logits/chosen": 0.1431388258934021, + "logits/rejected": 0.15884003043174744, + "logps/chosen": -183.662353515625, + "logps/rejected": -136.40989685058594, + "loss": 0.584, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3585079312324524, + "rewards/margins": 0.18456687033176422, + "rewards/rejected": 0.17394104599952698, + "step": 646 + }, + { + "epoch": 0.39445206523395826, + "grad_norm": 66.56144323024465, + "learning_rate": 2.8009756097560976e-08, + "logits/chosen": -0.05217306315898895, + "logits/rejected": 0.31576934456825256, + "logps/chosen": -491.3421630859375, + "logps/rejected": -256.726806640625, + "loss": 0.6575, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.40523847937583923, + "rewards/margins": 0.09070847928524017, + "rewards/rejected": 0.31453001499176025, + "step": 647 + }, + { + "epoch": 0.3950617283950617, + "grad_norm": 65.05690657077261, + "learning_rate": 2.8053658536585364e-08, + "logits/chosen": 0.10608386993408203, + "logits/rejected": 0.3763849437236786, + "logps/chosen": -301.9100036621094, + "logps/rejected": -167.71836853027344, + "loss": 0.5863, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.43449002504348755, + "rewards/margins": 0.6286348700523376, + "rewards/rejected": -0.1941448450088501, + "step": 648 + }, + { + "epoch": 0.3956713915561652, + "grad_norm": 55.662315019410975, + 
"learning_rate": 2.8097560975609756e-08, + "logits/chosen": 0.08954723179340363, + "logits/rejected": 0.25629645586013794, + "logps/chosen": -235.14169311523438, + "logps/rejected": -162.6898956298828, + "loss": 0.5806, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.31468138098716736, + "rewards/margins": 0.3700261116027832, + "rewards/rejected": -0.05534471571445465, + "step": 649 + }, + { + "epoch": 0.3962810547172687, + "grad_norm": 54.99441906497402, + "learning_rate": 2.8141463414634144e-08, + "logits/chosen": 0.379417359828949, + "logits/rejected": 0.3932171165943146, + "logps/chosen": -134.65982055664062, + "logps/rejected": -186.13771057128906, + "loss": 0.5903, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.26799216866493225, + "rewards/margins": 0.41732701659202576, + "rewards/rejected": -0.1493348479270935, + "step": 650 + }, + { + "epoch": 0.3968907178783722, + "grad_norm": 63.65642017121965, + "learning_rate": 2.8185365853658536e-08, + "logits/chosen": 0.18639619648456573, + "logits/rejected": 0.11451603472232819, + "logps/chosen": -37.6251220703125, + "logps/rejected": -46.764347076416016, + "loss": 0.6263, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.009578034281730652, + "rewards/margins": -0.06655454635620117, + "rewards/rejected": 0.05697651207447052, + "step": 651 + }, + { + "epoch": 0.3975003810394757, + "grad_norm": 61.55800631767217, + "learning_rate": 2.8229268292682928e-08, + "logits/chosen": 0.21169736981391907, + "logits/rejected": 0.054191380739212036, + "logps/chosen": -64.55624389648438, + "logps/rejected": -203.20297241210938, + "loss": 0.5654, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.36324232816696167, + "rewards/margins": 0.8434259295463562, + "rewards/rejected": -0.4801836311817169, + "step": 652 + }, + { + "epoch": 0.39811004420057916, + "grad_norm": 67.02171009729318, + "learning_rate": 2.8273170731707316e-08, + "logits/chosen": 0.3522152304649353, + "logits/rejected": 0.22334620356559753, + "logps/chosen": -42.5975456237793, + "logps/rejected": -94.68328094482422, + "loss": 0.5757, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.35991886258125305, + "rewards/margins": 0.660916268825531, + "rewards/rejected": -0.30099746584892273, + "step": 653 + }, + { + "epoch": 0.39871970736168266, + "grad_norm": 60.33990850522962, + "learning_rate": 2.8317073170731708e-08, + "logits/chosen": -0.06947705149650574, + "logits/rejected": -0.008604679256677628, + "logps/chosen": -624.090576171875, + "logps/rejected": -350.2554931640625, + "loss": 0.6184, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5592214465141296, + "rewards/margins": 0.21749423444271088, + "rewards/rejected": 0.3417271375656128, + "step": 654 + }, + { + "epoch": 0.39932937052278616, + "grad_norm": 72.4216369967682, + "learning_rate": 2.8360975609756096e-08, + "logits/chosen": 0.08260777592658997, + "logits/rejected": 0.41911986470222473, + "logps/chosen": -200.58506774902344, + "logps/rejected": -147.6679229736328, + "loss": 0.6381, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.11888381093740463, + "rewards/margins": -0.4000794291496277, + "rewards/rejected": 0.28119561076164246, + "step": 655 + }, + { + "epoch": 0.39993903368388967, + "grad_norm": 59.37134808479706, + "learning_rate": 2.8404878048780488e-08, + "logits/chosen": 0.09101929515600204, + "logits/rejected": -0.015520691871643066, + "logps/chosen": -25.448081970214844, + "logps/rejected": -34.50359344482422, + "loss": 0.6068, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1263624131679535, + 
"rewards/margins": 0.1127481684088707, + "rewards/rejected": -0.23911058902740479, + "step": 656 + }, + { + "epoch": 0.40054869684499317, + "grad_norm": 73.76492939555055, + "learning_rate": 2.8448780487804876e-08, + "logits/chosen": -0.08089803159236908, + "logits/rejected": -0.1267295479774475, + "logps/chosen": -129.6185302734375, + "logps/rejected": -112.44964599609375, + "loss": 0.5982, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.33473771810531616, + "rewards/margins": 0.12635132670402527, + "rewards/rejected": -0.46108904480934143, + "step": 657 + }, + { + "epoch": 0.4011583600060966, + "grad_norm": 68.49005292734218, + "learning_rate": 2.8492682926829268e-08, + "logits/chosen": 0.049766190350055695, + "logits/rejected": 0.5010601282119751, + "logps/chosen": -229.0572967529297, + "logps/rejected": -115.39363861083984, + "loss": 0.6345, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2862285077571869, + "rewards/margins": 0.4368448853492737, + "rewards/rejected": -0.1506163626909256, + "step": 658 + }, + { + "epoch": 0.4017680231672001, + "grad_norm": 57.899060720807014, + "learning_rate": 2.853658536585366e-08, + "logits/chosen": -0.002681232988834381, + "logits/rejected": 0.18251019716262817, + "logps/chosen": -274.3082275390625, + "logps/rejected": -143.87628173828125, + "loss": 0.6054, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4599049389362335, + "rewards/margins": 0.3964844346046448, + "rewards/rejected": 0.06342048943042755, + "step": 659 + }, + { + "epoch": 0.4023776863283036, + "grad_norm": 58.124639074578916, + "learning_rate": 2.8580487804878048e-08, + "logits/chosen": -0.04075242951512337, + "logits/rejected": -0.0851912871003151, + "logps/chosen": -100.02337646484375, + "logps/rejected": -100.20783233642578, + "loss": 0.6088, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2120712846517563, + "rewards/margins": 0.08371191471815109, + "rewards/rejected": 0.1283593773841858, + "step": 660 + }, + { + "epoch": 0.4029873494894071, + "grad_norm": 56.53324777516352, + "learning_rate": 2.862439024390244e-08, + "logits/chosen": 0.26801446080207825, + "logits/rejected": 0.19303756952285767, + "logps/chosen": -99.32714080810547, + "logps/rejected": -147.7371063232422, + "loss": 0.6217, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.012360095977783203, + "rewards/margins": 0.24427659809589386, + "rewards/rejected": -0.23191648721694946, + "step": 661 + }, + { + "epoch": 0.40359701265051057, + "grad_norm": 64.62602867593822, + "learning_rate": 2.8668292682926828e-08, + "logits/chosen": 0.39362233877182007, + "logits/rejected": 0.3923282027244568, + "logps/chosen": -154.65005493164062, + "logps/rejected": -84.58635711669922, + "loss": 0.6036, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.1573656052350998, + "rewards/margins": 0.14125274121761322, + "rewards/rejected": 0.016112878918647766, + "step": 662 + }, + { + "epoch": 0.40420667581161407, + "grad_norm": 67.35757001483378, + "learning_rate": 2.871219512195122e-08, + "logits/chosen": 0.08849135041236877, + "logits/rejected": 0.013642445206642151, + "logps/chosen": -12.219024658203125, + "logps/rejected": -46.77790069580078, + "loss": 0.6035, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.00918569229543209, + "rewards/margins": 0.018711738288402557, + "rewards/rejected": -0.027897430583834648, + "step": 663 + }, + { + "epoch": 0.40481633897271757, + "grad_norm": 66.45965976248175, + "learning_rate": 2.875609756097561e-08, + "logits/chosen": 0.10113909840583801, + "logits/rejected": 
0.3675948977470398, + "logps/chosen": -184.7261199951172, + "logps/rejected": -80.58538055419922, + "loss": 0.6096, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2339383214712143, + "rewards/margins": 0.21842002868652344, + "rewards/rejected": 0.015518282540142536, + "step": 664 + }, + { + "epoch": 0.4054260021338211, + "grad_norm": 67.09031137593702, + "learning_rate": 2.88e-08, + "logits/chosen": 0.06466561555862427, + "logits/rejected": 0.1334514021873474, + "logps/chosen": -119.71806335449219, + "logps/rejected": -60.799110412597656, + "loss": 0.6022, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.189555361866951, + "rewards/margins": 0.3266139030456543, + "rewards/rejected": -0.1370585411787033, + "step": 665 + }, + { + "epoch": 0.4060356652949246, + "grad_norm": 68.8168633458203, + "learning_rate": 2.884390243902439e-08, + "logits/chosen": 0.17734110355377197, + "logits/rejected": 0.3334428071975708, + "logps/chosen": -156.93003845214844, + "logps/rejected": -160.6712188720703, + "loss": 0.5605, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3025633692741394, + "rewards/margins": 0.23625826835632324, + "rewards/rejected": 0.06630511581897736, + "step": 666 + }, + { + "epoch": 0.406645328456028, + "grad_norm": 55.28685825452296, + "learning_rate": 2.888780487804878e-08, + "logits/chosen": 0.2348455935716629, + "logits/rejected": 0.2635599374771118, + "logps/chosen": -31.70298957824707, + "logps/rejected": -49.15739440917969, + "loss": 0.5776, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.13425521552562714, + "rewards/margins": 0.05660948157310486, + "rewards/rejected": -0.1908647119998932, + "step": 667 + }, + { + "epoch": 0.4072549916171315, + "grad_norm": 65.06682601829672, + "learning_rate": 2.893170731707317e-08, + "logits/chosen": 0.08870015293359756, + "logits/rejected": -0.01683107763528824, + "logps/chosen": -234.7543182373047, + "logps/rejected": -288.1865539550781, + "loss": 0.5969, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09488782286643982, + "rewards/margins": 0.6672009229660034, + "rewards/rejected": -0.7620887160301208, + "step": 668 + }, + { + "epoch": 0.407864654778235, + "grad_norm": 69.1607193783598, + "learning_rate": 2.897560975609756e-08, + "logits/chosen": 0.18325266242027283, + "logits/rejected": 0.15346255898475647, + "logps/chosen": -9.335733413696289, + "logps/rejected": -34.26789093017578, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.21492107212543488, + "rewards/margins": -0.04538650065660477, + "rewards/rejected": -0.1695345640182495, + "step": 669 + }, + { + "epoch": 0.4084743179393385, + "grad_norm": 61.33412935798923, + "learning_rate": 2.901951219512195e-08, + "logits/chosen": 0.20572735369205475, + "logits/rejected": 0.2354370504617691, + "logps/chosen": -287.7242736816406, + "logps/rejected": -196.66990661621094, + "loss": 0.6019, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4633364975452423, + "rewards/margins": 0.35491088032722473, + "rewards/rejected": 0.10842561721801758, + "step": 670 + }, + { + "epoch": 0.40908398110044203, + "grad_norm": 63.76018813216311, + "learning_rate": 2.9063414634146343e-08, + "logits/chosen": 0.3203410506248474, + "logits/rejected": 0.2411998212337494, + "logps/chosen": -88.51042938232422, + "logps/rejected": -224.8445587158203, + "loss": 0.6055, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.17546065151691437, + "rewards/margins": 0.21716642379760742, + "rewards/rejected": -0.041705772280693054, + "step": 671 + }, + { + "epoch": 0.4096936442615455, + 
"grad_norm": 59.626468760791596, + "learning_rate": 2.9107317073170732e-08, + "logits/chosen": 0.18035395443439484, + "logits/rejected": 0.0916692316532135, + "logps/chosen": -27.960620880126953, + "logps/rejected": -65.0535888671875, + "loss": 0.6099, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0217488631606102, + "rewards/margins": 0.33223849534988403, + "rewards/rejected": -0.35398733615875244, + "step": 672 + }, + { + "epoch": 0.410303307422649, + "grad_norm": 60.57315382535163, + "learning_rate": 2.9151219512195123e-08, + "logits/chosen": -0.18432225286960602, + "logits/rejected": 0.004740915726870298, + "logps/chosen": -175.02383422851562, + "logps/rejected": -133.32498168945312, + "loss": 0.5955, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.11643826961517334, + "rewards/margins": 0.003228917717933655, + "rewards/rejected": 0.11320935189723969, + "step": 673 + }, + { + "epoch": 0.4109129705837525, + "grad_norm": 62.77459814493296, + "learning_rate": 2.9195121951219512e-08, + "logits/chosen": 0.16996510326862335, + "logits/rejected": 0.2245965600013733, + "logps/chosen": -117.73829650878906, + "logps/rejected": -110.91796112060547, + "loss": 0.5902, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2513304054737091, + "rewards/margins": 0.13217899203300476, + "rewards/rejected": 0.11915141344070435, + "step": 674 + }, + { + "epoch": 0.411522633744856, + "grad_norm": 59.23703862462656, + "learning_rate": 2.9239024390243903e-08, + "logits/chosen": 0.006302252411842346, + "logits/rejected": 0.4283897578716278, + "logps/chosen": -104.32003784179688, + "logps/rejected": -91.69224548339844, + "loss": 0.6458, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0008779680356383324, + "rewards/margins": -0.0031674085184931755, + "rewards/rejected": 0.002289438620209694, + "step": 675 + }, + { + "epoch": 0.4121322969059595, + "grad_norm": 67.56419466850504, + "learning_rate": 2.928292682926829e-08, + "logits/chosen": 0.04366625100374222, + "logits/rejected": 0.001990571618080139, + "logps/chosen": -24.013965606689453, + "logps/rejected": -29.999656677246094, + "loss": 0.6031, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.020657753571867943, + "rewards/margins": 0.2397357076406479, + "rewards/rejected": -0.2603934705257416, + "step": 676 + }, + { + "epoch": 0.41274196006706293, + "grad_norm": 66.70018573927544, + "learning_rate": 2.932682926829268e-08, + "logits/chosen": 0.13844181597232819, + "logits/rejected": 0.11456754058599472, + "logps/chosen": -126.58256530761719, + "logps/rejected": -96.23551940917969, + "loss": 0.6074, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.02874419093132019, + "rewards/margins": 0.3369070887565613, + "rewards/rejected": -0.3081628680229187, + "step": 677 + }, + { + "epoch": 0.41335162322816643, + "grad_norm": 73.67012423016767, + "learning_rate": 2.937073170731707e-08, + "logits/chosen": 0.22111207246780396, + "logits/rejected": 0.0935177430510521, + "logps/chosen": -200.04986572265625, + "logps/rejected": -174.7672882080078, + "loss": 0.595, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3721103072166443, + "rewards/margins": 0.1897158920764923, + "rewards/rejected": 0.18239441514015198, + "step": 678 + }, + { + "epoch": 0.41396128638926993, + "grad_norm": 69.02246934189627, + "learning_rate": 2.941463414634146e-08, + "logits/chosen": 0.16653214395046234, + "logits/rejected": 0.27699148654937744, + "logps/chosen": -318.82958984375, + "logps/rejected": -242.1185302734375, + "loss": 0.5718, + "rewards/accuracies": 0.75, + 
"rewards/chosen": 0.32409214973449707, + "rewards/margins": 0.6411502957344055, + "rewards/rejected": -0.31705817580223083, + "step": 679 + }, + { + "epoch": 0.41457094955037344, + "grad_norm": 54.36427580713745, + "learning_rate": 2.9458536585365852e-08, + "logits/chosen": 0.031883664429187775, + "logits/rejected": 0.08884549885988235, + "logps/chosen": -35.28754425048828, + "logps/rejected": -49.85274124145508, + "loss": 0.5745, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.031313348561525345, + "rewards/margins": 0.12778198719024658, + "rewards/rejected": -0.15909533202648163, + "step": 680 + }, + { + "epoch": 0.4151806127114769, + "grad_norm": 70.23710329187017, + "learning_rate": 2.950243902439024e-08, + "logits/chosen": 0.40195852518081665, + "logits/rejected": 0.1879790872335434, + "logps/chosen": -227.4771728515625, + "logps/rejected": -247.98159790039062, + "loss": 0.6005, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.17011643946170807, + "rewards/margins": 0.34443551301956177, + "rewards/rejected": -0.1743190735578537, + "step": 681 + }, + { + "epoch": 0.4157902758725804, + "grad_norm": 66.99963384126403, + "learning_rate": 2.9546341463414632e-08, + "logits/chosen": 0.11015382409095764, + "logits/rejected": 0.12308009713888168, + "logps/chosen": -291.40081787109375, + "logps/rejected": -224.32606506347656, + "loss": 0.603, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.16739743947982788, + "rewards/margins": 0.07820190489292145, + "rewards/rejected": 0.08919551968574524, + "step": 682 + }, + { + "epoch": 0.4163999390336839, + "grad_norm": 64.95623940593457, + "learning_rate": 2.959024390243902e-08, + "logits/chosen": 0.33314457535743713, + "logits/rejected": 0.2583604156970978, + "logps/chosen": -101.11124420166016, + "logps/rejected": -135.29530334472656, + "loss": 0.5881, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.22422949969768524, + "rewards/margins": 0.09729447960853577, + "rewards/rejected": -0.3215239644050598, + "step": 683 + }, + { + "epoch": 0.4170096021947874, + "grad_norm": 58.67034444294828, + "learning_rate": 2.9634146341463412e-08, + "logits/chosen": 0.06277957558631897, + "logits/rejected": -0.015074517577886581, + "logps/chosen": -152.89991760253906, + "logps/rejected": -185.5033721923828, + "loss": 0.5628, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2418362945318222, + "rewards/margins": 0.06912834942340851, + "rewards/rejected": 0.1727079302072525, + "step": 684 + }, + { + "epoch": 0.4176192653558909, + "grad_norm": 50.70518998130048, + "learning_rate": 2.96780487804878e-08, + "logits/chosen": 0.02843397855758667, + "logits/rejected": -0.058416157960891724, + "logps/chosen": -117.25051879882812, + "logps/rejected": -99.71661376953125, + "loss": 0.5401, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.39249762892723083, + "rewards/margins": 0.43272140622138977, + "rewards/rejected": -0.040223799645900726, + "step": 685 + }, + { + "epoch": 0.41822892851699434, + "grad_norm": 74.10719820300136, + "learning_rate": 2.9721951219512192e-08, + "logits/chosen": 0.22975647449493408, + "logits/rejected": -0.20629757642745972, + "logps/chosen": -252.9994659423828, + "logps/rejected": -247.56211853027344, + "loss": 0.6202, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.09769769012928009, + "rewards/margins": -0.06985299289226532, + "rewards/rejected": 0.1675506830215454, + "step": 686 + }, + { + "epoch": 0.41883859167809784, + "grad_norm": 71.0633430218036, + "learning_rate": 2.9765853658536584e-08, + "logits/chosen": 
0.011537957936525345, + "logits/rejected": -0.014044620096683502, + "logps/chosen": -163.05384826660156, + "logps/rejected": -109.02774047851562, + "loss": 0.5604, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.21511390805244446, + "rewards/margins": 0.24831253290176392, + "rewards/rejected": -0.03319861367344856, + "step": 687 + }, + { + "epoch": 0.41944825483920134, + "grad_norm": 60.030546752215145, + "learning_rate": 2.980975609756097e-08, + "logits/chosen": 0.03469804674386978, + "logits/rejected": 0.09261640161275864, + "logps/chosen": -114.98950958251953, + "logps/rejected": -114.08073425292969, + "loss": 0.5694, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0984640121459961, + "rewards/margins": 0.2803994119167328, + "rewards/rejected": -0.1819353997707367, + "step": 688 + }, + { + "epoch": 0.42005791800030484, + "grad_norm": 67.9177216292696, + "learning_rate": 2.985365853658536e-08, + "logits/chosen": 0.1416039764881134, + "logits/rejected": 0.2830074429512024, + "logps/chosen": -146.13885498046875, + "logps/rejected": -79.72393798828125, + "loss": 0.5778, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.18982058763504028, + "rewards/margins": 0.2407340556383133, + "rewards/rejected": -0.050913479179143906, + "step": 689 + }, + { + "epoch": 0.42066758116140834, + "grad_norm": 58.77329784263347, + "learning_rate": 2.9897560975609756e-08, + "logits/chosen": 0.19391655921936035, + "logits/rejected": 0.008816692978143692, + "logps/chosen": -16.752595901489258, + "logps/rejected": -24.750043869018555, + "loss": 0.5741, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.014958436600863934, + "rewards/margins": 0.28227686882019043, + "rewards/rejected": -0.2972353398799896, + "step": 690 + }, + { + "epoch": 0.4212772443225118, + "grad_norm": 67.38783502416696, + "learning_rate": 2.9941463414634144e-08, + "logits/chosen": 0.06132356822490692, + "logits/rejected": 0.1812308132648468, + "logps/chosen": -61.43677520751953, + "logps/rejected": -55.48193359375, + "loss": 0.6256, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.05021069198846817, + "rewards/margins": 0.13919878005981445, + "rewards/rejected": -0.18940944969654083, + "step": 691 + }, + { + "epoch": 0.4218869074836153, + "grad_norm": 57.59018466854295, + "learning_rate": 2.998536585365853e-08, + "logits/chosen": 0.22436045110225677, + "logits/rejected": 0.09355799108743668, + "logps/chosen": -144.72726440429688, + "logps/rejected": -72.65602111816406, + "loss": 0.5584, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.25349390506744385, + "rewards/margins": 0.3784920275211334, + "rewards/rejected": -0.12499812245368958, + "step": 692 + }, + { + "epoch": 0.4224965706447188, + "grad_norm": 54.43371138409108, + "learning_rate": 3.002926829268292e-08, + "logits/chosen": 0.09764564037322998, + "logits/rejected": 0.07500110566616058, + "logps/chosen": -63.83013153076172, + "logps/rejected": -87.51184844970703, + "loss": 0.5458, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.21895256638526917, + "rewards/margins": 0.5135568976402283, + "rewards/rejected": -0.2946043312549591, + "step": 693 + }, + { + "epoch": 0.4231062338058223, + "grad_norm": 63.66155495833237, + "learning_rate": 3.0073170731707316e-08, + "logits/chosen": 0.11742034554481506, + "logits/rejected": 0.1333976536989212, + "logps/chosen": -63.16908264160156, + "logps/rejected": -106.1064224243164, + "loss": 0.5838, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.26999402046203613, + "rewards/margins": -0.02053636871278286, + 
"rewards/rejected": -0.24945765733718872, + "step": 694 + }, + { + "epoch": 0.4237158969669258, + "grad_norm": 72.38315149284581, + "learning_rate": 3.0117073170731704e-08, + "logits/chosen": 0.26569753885269165, + "logits/rejected": -0.08579610288143158, + "logps/chosen": -88.88272094726562, + "logps/rejected": -156.12924194335938, + "loss": 0.6408, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.042749255895614624, + "rewards/margins": -0.1455920785665512, + "rewards/rejected": 0.1028428003191948, + "step": 695 + }, + { + "epoch": 0.42432556012802924, + "grad_norm": 56.84359588093456, + "learning_rate": 3.016097560975609e-08, + "logits/chosen": -0.04833687096834183, + "logits/rejected": 0.22585663199424744, + "logps/chosen": -249.55300903320312, + "logps/rejected": -227.1507110595703, + "loss": 0.608, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2435014843940735, + "rewards/margins": 0.49496057629585266, + "rewards/rejected": -0.2514590919017792, + "step": 696 + }, + { + "epoch": 0.42493522328913275, + "grad_norm": 69.98617860582188, + "learning_rate": 3.020487804878049e-08, + "logits/chosen": 0.12984676659107208, + "logits/rejected": 0.15866082906723022, + "logps/chosen": -122.64289093017578, + "logps/rejected": -97.21748352050781, + "loss": 0.6185, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0028444305062294006, + "rewards/margins": -0.02080116793513298, + "rewards/rejected": 0.02364560216665268, + "step": 697 + }, + { + "epoch": 0.42554488645023625, + "grad_norm": 70.86364125078781, + "learning_rate": 3.0248780487804876e-08, + "logits/chosen": 0.22259321808815002, + "logits/rejected": 0.601961076259613, + "logps/chosen": -230.63555908203125, + "logps/rejected": -80.44770050048828, + "loss": 0.6267, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2698615491390228, + "rewards/margins": 0.30879026651382446, + "rewards/rejected": -0.03892870992422104, + "step": 698 + }, + { + "epoch": 0.42615454961133975, + "grad_norm": 77.52768160366813, + "learning_rate": 3.0292682926829264e-08, + "logits/chosen": 0.2709192633628845, + "logits/rejected": 0.26073017716407776, + "logps/chosen": -533.0921020507812, + "logps/rejected": -514.8524169921875, + "loss": 0.6034, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5236976742744446, + "rewards/margins": 0.33411556482315063, + "rewards/rejected": 0.18958207964897156, + "step": 699 + }, + { + "epoch": 0.42676421277244325, + "grad_norm": 62.52940453105315, + "learning_rate": 3.033658536585366e-08, + "logits/chosen": -0.030992362648248672, + "logits/rejected": -0.010659024119377136, + "logps/chosen": -117.72844696044922, + "logps/rejected": -120.53231048583984, + "loss": 0.5939, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.11818039417266846, + "rewards/margins": 0.2509658634662628, + "rewards/rejected": -0.13278548419475555, + "step": 700 + }, + { + "epoch": 0.4273738759335467, + "grad_norm": 52.743191936731016, + "learning_rate": 3.038048780487805e-08, + "logits/chosen": 0.3419884443283081, + "logits/rejected": 0.3479151725769043, + "logps/chosen": -28.024581909179688, + "logps/rejected": -8.396501541137695, + "loss": 0.5528, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.19714289903640747, + "rewards/margins": 0.2800540626049042, + "rewards/rejected": -0.47719693183898926, + "step": 701 + }, + { + "epoch": 0.4279835390946502, + "grad_norm": 67.83314550822708, + "learning_rate": 3.0424390243902436e-08, + "logits/chosen": 0.16552375257015228, + "logits/rejected": 0.2708529531955719, + "logps/chosen": 
-193.01100158691406, + "logps/rejected": -141.35247802734375, + "loss": 0.6163, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.1269087791442871, + "rewards/margins": 0.08467017859220505, + "rewards/rejected": 0.04223859682679176, + "step": 702 + }, + { + "epoch": 0.4285932022557537, + "grad_norm": 72.33856499657284, + "learning_rate": 3.0468292682926824e-08, + "logits/chosen": 0.16995982825756073, + "logits/rejected": 0.3033416271209717, + "logps/chosen": -49.045066833496094, + "logps/rejected": -54.02178955078125, + "loss": 0.6279, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08551420271396637, + "rewards/margins": 0.43687137961387634, + "rewards/rejected": -0.5223855972290039, + "step": 703 + }, + { + "epoch": 0.4292028654168572, + "grad_norm": 69.2924140296883, + "learning_rate": 3.051219512195122e-08, + "logits/chosen": 0.09492413699626923, + "logits/rejected": 0.39009159803390503, + "logps/chosen": -131.39935302734375, + "logps/rejected": -21.53441047668457, + "loss": 0.5929, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.35798388719558716, + "rewards/margins": -0.26453471183776855, + "rewards/rejected": -0.0934491753578186, + "step": 704 + }, + { + "epoch": 0.42981252857796065, + "grad_norm": 61.93779746328726, + "learning_rate": 3.055609756097561e-08, + "logits/chosen": 0.252395898103714, + "logits/rejected": 0.07689037919044495, + "logps/chosen": -129.8458709716797, + "logps/rejected": -188.48562622070312, + "loss": 0.578, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.34194260835647583, + "rewards/margins": 0.27869632840156555, + "rewards/rejected": 0.06324627995491028, + "step": 705 + }, + { + "epoch": 0.43042219173906415, + "grad_norm": 71.25144144882567, + "learning_rate": 3.0599999999999996e-08, + "logits/chosen": -0.12025362253189087, + "logits/rejected": -0.15497341752052307, + "logps/chosen": -54.242740631103516, + "logps/rejected": -93.62307739257812, + "loss": 0.6163, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.23059195280075073, + "rewards/margins": -0.023295823484659195, + "rewards/rejected": -0.20729614794254303, + "step": 706 + }, + { + "epoch": 0.43103185490016765, + "grad_norm": 49.678861831397526, + "learning_rate": 3.064390243902439e-08, + "logits/chosen": 0.13708041608333588, + "logits/rejected": 0.11697007715702057, + "logps/chosen": -133.09588623046875, + "logps/rejected": -121.47759246826172, + "loss": 0.544, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4279193580150604, + "rewards/margins": 0.4899780750274658, + "rewards/rejected": -0.06205863878130913, + "step": 707 + }, + { + "epoch": 0.43164151806127116, + "grad_norm": 62.799527935945854, + "learning_rate": 3.068780487804878e-08, + "logits/chosen": 0.08546426892280579, + "logits/rejected": 0.35718727111816406, + "logps/chosen": -164.70767211914062, + "logps/rejected": -302.91119384765625, + "loss": 0.6283, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.013271257281303406, + "rewards/margins": -0.034261077642440796, + "rewards/rejected": 0.020989812910556793, + "step": 708 + }, + { + "epoch": 0.43225118122237466, + "grad_norm": 64.40455509713055, + "learning_rate": 3.073170731707317e-08, + "logits/chosen": 0.1983512043952942, + "logits/rejected": 0.155623197555542, + "logps/chosen": -110.73567199707031, + "logps/rejected": -110.76133728027344, + "loss": 0.5777, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2301923781633377, + "rewards/margins": 0.6113991737365723, + "rewards/rejected": -0.38120681047439575, + "step": 709 + }, + { + "epoch": 
0.4328608443834781, + "grad_norm": 69.52137292022668, + "learning_rate": 3.0775609756097556e-08, + "logits/chosen": 0.3022085428237915, + "logits/rejected": 0.28429025411605835, + "logps/chosen": -240.02157592773438, + "logps/rejected": -189.4278106689453, + "loss": 0.6023, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.41948139667510986, + "rewards/margins": 0.3640276789665222, + "rewards/rejected": 0.05545370653271675, + "step": 710 + }, + { + "epoch": 0.4334705075445816, + "grad_norm": 61.579931805538884, + "learning_rate": 3.081951219512195e-08, + "logits/chosen": 0.04821467399597168, + "logits/rejected": 0.01756604015827179, + "logps/chosen": -68.94324493408203, + "logps/rejected": -69.59725952148438, + "loss": 0.5596, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.04010574519634247, + "rewards/margins": 0.022175416350364685, + "rewards/rejected": -0.06228116154670715, + "step": 711 + }, + { + "epoch": 0.4340801707056851, + "grad_norm": 58.74671687754795, + "learning_rate": 3.086341463414634e-08, + "logits/chosen": -0.02244146168231964, + "logits/rejected": 0.3006266951560974, + "logps/chosen": -388.98675537109375, + "logps/rejected": -187.48379516601562, + "loss": 0.4945, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.9284478425979614, + "rewards/margins": 0.9021939635276794, + "rewards/rejected": 0.026253893971443176, + "step": 712 + }, + { + "epoch": 0.4346898338667886, + "grad_norm": 66.28226054586283, + "learning_rate": 3.090731707317073e-08, + "logits/chosen": 0.3966588079929352, + "logits/rejected": 0.44131675362586975, + "logps/chosen": -29.034849166870117, + "logps/rejected": -33.7619514465332, + "loss": 0.5545, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.01138560101389885, + "rewards/margins": 0.20000213384628296, + "rewards/rejected": -0.18861651420593262, + "step": 713 + }, + { + "epoch": 0.4352994970278921, + "grad_norm": 62.41945677588309, + "learning_rate": 3.095121951219512e-08, + "logits/chosen": 0.10338619351387024, + "logits/rejected": 0.2881770431995392, + "logps/chosen": -201.3583526611328, + "logps/rejected": -242.2822723388672, + "loss": 0.575, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3309534788131714, + "rewards/margins": 0.6583471298217773, + "rewards/rejected": -0.32739365100860596, + "step": 714 + }, + { + "epoch": 0.43590916018899556, + "grad_norm": 61.40612035038867, + "learning_rate": 3.099512195121951e-08, + "logits/chosen": 0.10096565634012222, + "logits/rejected": 0.09104756265878677, + "logps/chosen": -163.83914184570312, + "logps/rejected": -104.75081634521484, + "loss": 0.6446, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.284401535987854, + "rewards/margins": 0.19382619857788086, + "rewards/rejected": 0.09057533740997314, + "step": 715 + }, + { + "epoch": 0.43651882335009906, + "grad_norm": 65.1815971155587, + "learning_rate": 3.10390243902439e-08, + "logits/chosen": 0.13695839047431946, + "logits/rejected": 0.049626439809799194, + "logps/chosen": -129.7420196533203, + "logps/rejected": -202.95199584960938, + "loss": 0.6115, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.12976928055286407, + "rewards/margins": -0.125985786318779, + "rewards/rejected": -0.0037834858521819115, + "step": 716 + }, + { + "epoch": 0.43712848651120256, + "grad_norm": 62.22586883955595, + "learning_rate": 3.108292682926829e-08, + "logits/chosen": -0.07620099186897278, + "logits/rejected": -0.012126855552196503, + "logps/chosen": -160.33538818359375, + "logps/rejected": -143.08425903320312, + "loss": 0.6466, + 
"rewards/accuracies": 1.0, + "rewards/chosen": -0.0032648593187332153, + "rewards/margins": 0.24996376037597656, + "rewards/rejected": -0.25322863459587097, + "step": 717 + }, + { + "epoch": 0.43773814967230606, + "grad_norm": 54.70946843812598, + "learning_rate": 3.112682926829268e-08, + "logits/chosen": 0.11618568748235703, + "logits/rejected": 0.07170344144105911, + "logps/chosen": -225.55813598632812, + "logps/rejected": -188.16357421875, + "loss": 0.4905, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5569877028465271, + "rewards/margins": 0.6170770525932312, + "rewards/rejected": -0.060089416801929474, + "step": 718 + }, + { + "epoch": 0.43834781283340957, + "grad_norm": 73.89704569216683, + "learning_rate": 3.117073170731707e-08, + "logits/chosen": 0.005682835355401039, + "logits/rejected": 0.06556177884340286, + "logps/chosen": -310.00128173828125, + "logps/rejected": -245.61236572265625, + "loss": 0.6426, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6474918723106384, + "rewards/margins": 0.668273389339447, + "rewards/rejected": -0.020781513303518295, + "step": 719 + }, + { + "epoch": 0.438957475994513, + "grad_norm": 69.77210354800309, + "learning_rate": 3.121463414634146e-08, + "logits/chosen": -0.1030096784234047, + "logits/rejected": 0.20871181786060333, + "logps/chosen": -204.392578125, + "logps/rejected": -116.71359252929688, + "loss": 0.5929, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3954564034938812, + "rewards/margins": 0.3550819456577301, + "rewards/rejected": 0.04037447273731232, + "step": 720 + }, + { + "epoch": 0.4395671391556165, + "grad_norm": 56.44675678828737, + "learning_rate": 3.1258536585365855e-08, + "logits/chosen": -0.025892585515975952, + "logits/rejected": 0.22776633501052856, + "logps/chosen": -143.7974395751953, + "logps/rejected": -73.87933349609375, + "loss": 0.6297, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.024740634486079216, + "rewards/margins": 0.2636898159980774, + "rewards/rejected": -0.28843045234680176, + "step": 721 + }, + { + "epoch": 0.44017680231672, + "grad_norm": 71.2787091506665, + "learning_rate": 3.130243902439024e-08, + "logits/chosen": 0.23216727375984192, + "logits/rejected": 0.16670744121074677, + "logps/chosen": -52.42512130737305, + "logps/rejected": -98.56019592285156, + "loss": 0.6078, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2387477457523346, + "rewards/margins": 0.1126733347773552, + "rewards/rejected": 0.1260744035243988, + "step": 722 + }, + { + "epoch": 0.4407864654778235, + "grad_norm": 64.68698901349826, + "learning_rate": 3.134634146341463e-08, + "logits/chosen": 0.39107462763786316, + "logits/rejected": 0.0693548172712326, + "logps/chosen": -133.65731811523438, + "logps/rejected": -388.3984375, + "loss": 0.6925, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.06935802102088928, + "rewards/margins": 0.1404474377632141, + "rewards/rejected": -0.07108942419290543, + "step": 723 + }, + { + "epoch": 0.44139612863892697, + "grad_norm": 64.92084998872986, + "learning_rate": 3.139024390243902e-08, + "logits/chosen": 0.22215789556503296, + "logits/rejected": 0.5344216227531433, + "logps/chosen": -234.95579528808594, + "logps/rejected": -183.60125732421875, + "loss": 0.5487, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.61018306016922, + "rewards/margins": 0.33607998490333557, + "rewards/rejected": 0.2741030156612396, + "step": 724 + }, + { + "epoch": 0.44200579180003047, + "grad_norm": 64.20521057241812, + "learning_rate": 3.1434146341463415e-08, + "logits/chosen": 
0.1445278376340866, + "logits/rejected": 0.24122527241706848, + "logps/chosen": -255.03555297851562, + "logps/rejected": -215.16542053222656, + "loss": 0.5486, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.309909850358963, + "rewards/margins": 0.14310120046138763, + "rewards/rejected": 0.16680864989757538, + "step": 725 + }, + { + "epoch": 0.44261545496113397, + "grad_norm": 60.5035617542339, + "learning_rate": 3.1478048780487803e-08, + "logits/chosen": 0.28575438261032104, + "logits/rejected": 0.19118987023830414, + "logps/chosen": -41.21653747558594, + "logps/rejected": -101.06883239746094, + "loss": 0.5956, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09021880477666855, + "rewards/margins": 0.3644470274448395, + "rewards/rejected": -0.2742282450199127, + "step": 726 + }, + { + "epoch": 0.44322511812223747, + "grad_norm": 68.43650913601698, + "learning_rate": 3.152195121951219e-08, + "logits/chosen": 0.15166331827640533, + "logits/rejected": 0.1310874968767166, + "logps/chosen": -208.40158081054688, + "logps/rejected": -109.47624206542969, + "loss": 0.5784, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.16175362467765808, + "rewards/margins": 0.06366458535194397, + "rewards/rejected": 0.09808903932571411, + "step": 727 + }, + { + "epoch": 0.443834781283341, + "grad_norm": 64.958778101188, + "learning_rate": 3.156585365853659e-08, + "logits/chosen": 0.39212292432785034, + "logits/rejected": 0.32695478200912476, + "logps/chosen": -361.8396301269531, + "logps/rejected": -218.62094116210938, + "loss": 0.6119, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.31615012884140015, + "rewards/margins": 0.035128410905599594, + "rewards/rejected": 0.28102171421051025, + "step": 728 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 57.81165897409726, + "learning_rate": 3.1609756097560975e-08, + "logits/chosen": 0.2175525724887848, + "logits/rejected": 0.17773234844207764, + "logps/chosen": -161.65342712402344, + "logps/rejected": -194.79393005371094, + "loss": 0.5842, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.27169203758239746, + "rewards/margins": 0.369501531124115, + "rewards/rejected": -0.09780949354171753, + "step": 729 + }, + { + "epoch": 0.4450541076055479, + "grad_norm": 68.99268646805703, + "learning_rate": 3.1653658536585363e-08, + "logits/chosen": 0.13952380418777466, + "logits/rejected": 0.15980111062526703, + "logps/chosen": -12.834968566894531, + "logps/rejected": -6.9011125564575195, + "loss": 0.6208, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.17561447620391846, + "rewards/margins": -0.17670869827270508, + "rewards/rejected": 0.0010942351073026657, + "step": 730 + }, + { + "epoch": 0.4456637707666514, + "grad_norm": 81.46389111150688, + "learning_rate": 3.169756097560975e-08, + "logits/chosen": 0.15873503684997559, + "logits/rejected": 0.10875527560710907, + "logps/chosen": -317.6955871582031, + "logps/rejected": -255.65602111816406, + "loss": 0.686, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5661007165908813, + "rewards/margins": 0.5059922337532043, + "rewards/rejected": 0.06010846048593521, + "step": 731 + }, + { + "epoch": 0.4462734339277549, + "grad_norm": 77.1070906210175, + "learning_rate": 3.174146341463415e-08, + "logits/chosen": 0.2867441177368164, + "logits/rejected": 0.1727285385131836, + "logps/chosen": -242.66256713867188, + "logps/rejected": -271.26776123046875, + "loss": 0.6342, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3202134370803833, + "rewards/margins": 0.5896958112716675, + "rewards/rejected": 
-0.26948240399360657, + "step": 732 + }, + { + "epoch": 0.4468830970888584, + "grad_norm": 58.25214993847489, + "learning_rate": 3.1785365853658535e-08, + "logits/chosen": -0.1107330471277237, + "logits/rejected": 0.035387180745601654, + "logps/chosen": -206.23092651367188, + "logps/rejected": -154.17787170410156, + "loss": 0.5676, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.06631675362586975, + "rewards/margins": -0.09842417389154434, + "rewards/rejected": 0.1647409200668335, + "step": 733 + }, + { + "epoch": 0.4474927602499619, + "grad_norm": 59.82856209161144, + "learning_rate": 3.1829268292682924e-08, + "logits/chosen": -0.1534700095653534, + "logits/rejected": 0.06522519141435623, + "logps/chosen": -416.1253662109375, + "logps/rejected": -306.8860168457031, + "loss": 0.5623, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9827251434326172, + "rewards/margins": 0.9360064268112183, + "rewards/rejected": 0.04671870172023773, + "step": 734 + }, + { + "epoch": 0.4481024234110654, + "grad_norm": 61.92935271858862, + "learning_rate": 3.187317073170732e-08, + "logits/chosen": -0.005850538611412048, + "logits/rejected": -0.1219499334692955, + "logps/chosen": -98.82186889648438, + "logps/rejected": -199.60157775878906, + "loss": 0.5855, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.30541688203811646, + "rewards/margins": 0.47873765230178833, + "rewards/rejected": -0.1733207404613495, + "step": 735 + }, + { + "epoch": 0.4487120865721689, + "grad_norm": 58.759158962511094, + "learning_rate": 3.191707317073171e-08, + "logits/chosen": -0.1131751537322998, + "logits/rejected": 0.20346003770828247, + "logps/chosen": -393.7450866699219, + "logps/rejected": -288.71002197265625, + "loss": 0.5451, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.8758038282394409, + "rewards/margins": 1.149061679840088, + "rewards/rejected": -0.273257851600647, + "step": 736 + }, + { + "epoch": 0.4493217497332724, + "grad_norm": 77.27702057249535, + "learning_rate": 3.1960975609756095e-08, + "logits/chosen": 0.20635244250297546, + "logits/rejected": -0.15667811036109924, + "logps/chosen": -94.55105590820312, + "logps/rejected": -290.2279052734375, + "loss": 0.6516, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2631429433822632, + "rewards/margins": 0.18857741355895996, + "rewards/rejected": 0.07456552982330322, + "step": 737 + }, + { + "epoch": 0.4499314128943759, + "grad_norm": 62.02774374273467, + "learning_rate": 3.2004878048780484e-08, + "logits/chosen": 0.18018031120300293, + "logits/rejected": 0.17460641264915466, + "logps/chosen": -21.496278762817383, + "logps/rejected": -11.181031227111816, + "loss": 0.6009, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.506889820098877, + "rewards/margins": -0.26636621356010437, + "rewards/rejected": -0.24052362143993378, + "step": 738 + }, + { + "epoch": 0.45054107605547933, + "grad_norm": 59.67016395528802, + "learning_rate": 3.204878048780488e-08, + "logits/chosen": 0.2724151015281677, + "logits/rejected": 0.16841967403888702, + "logps/chosen": -197.8876953125, + "logps/rejected": -262.4444274902344, + "loss": 0.5878, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4625575542449951, + "rewards/margins": 0.21330539882183075, + "rewards/rejected": 0.24925214052200317, + "step": 739 + }, + { + "epoch": 0.45115073921658283, + "grad_norm": 73.864837429737, + "learning_rate": 3.209268292682927e-08, + "logits/chosen": 0.020081382244825363, + "logits/rejected": 0.18595314025878906, + "logps/chosen": -279.2730712890625, + "logps/rejected": 
-325.4978942871094, + "loss": 0.6138, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.013555884361267, + "rewards/margins": 0.3410736918449402, + "rewards/rejected": 0.6724821925163269, + "step": 740 + }, + { + "epoch": 0.45176040237768633, + "grad_norm": 65.65281963931344, + "learning_rate": 3.2136585365853655e-08, + "logits/chosen": -0.26649999618530273, + "logits/rejected": -0.09467080235481262, + "logps/chosen": -210.10275268554688, + "logps/rejected": -148.3544464111328, + "loss": 0.5708, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.09878063201904297, + "rewards/margins": -0.03505063056945801, + "rewards/rejected": 0.13383126258850098, + "step": 741 + }, + { + "epoch": 0.45237006553878983, + "grad_norm": 75.23993241177425, + "learning_rate": 3.218048780487805e-08, + "logits/chosen": 0.0372052900493145, + "logits/rejected": 0.13422900438308716, + "logps/chosen": -291.0384826660156, + "logps/rejected": -262.3160400390625, + "loss": 0.6137, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5097771883010864, + "rewards/margins": 0.5051177740097046, + "rewards/rejected": 0.004659377038478851, + "step": 742 + }, + { + "epoch": 0.45297972869989334, + "grad_norm": 61.379429711158096, + "learning_rate": 3.222439024390244e-08, + "logits/chosen": 0.32957613468170166, + "logits/rejected": 0.33307385444641113, + "logps/chosen": -110.95623016357422, + "logps/rejected": -84.92561340332031, + "loss": 0.5974, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02196323871612549, + "rewards/margins": 0.2710154354572296, + "rewards/rejected": -0.2929787039756775, + "step": 743 + }, + { + "epoch": 0.4535893918609968, + "grad_norm": 68.18531020727382, + "learning_rate": 3.226829268292683e-08, + "logits/chosen": 0.32582759857177734, + "logits/rejected": 0.25220736861228943, + "logps/chosen": -272.1997375488281, + "logps/rejected": -230.9661102294922, + "loss": 0.6077, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2933657467365265, + "rewards/margins": 0.34890851378440857, + "rewards/rejected": -0.055542752146720886, + "step": 744 + }, + { + "epoch": 0.4541990550221003, + "grad_norm": 58.3393249286719, + "learning_rate": 3.2312195121951216e-08, + "logits/chosen": 0.42729368805885315, + "logits/rejected": 0.3558027148246765, + "logps/chosen": -98.97964477539062, + "logps/rejected": -104.35940551757812, + "loss": 0.5867, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4682806730270386, + "rewards/margins": 0.07165151834487915, + "rewards/rejected": 0.3966291844844818, + "step": 745 + }, + { + "epoch": 0.4548087181832038, + "grad_norm": 66.89483590330624, + "learning_rate": 3.235609756097561e-08, + "logits/chosen": 0.08102698624134064, + "logits/rejected": -0.11477988958358765, + "logps/chosen": -257.57708740234375, + "logps/rejected": -305.8965148925781, + "loss": 0.6073, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.38230252265930176, + "rewards/margins": 0.29078590869903564, + "rewards/rejected": 0.09151659905910492, + "step": 746 + }, + { + "epoch": 0.4554183813443073, + "grad_norm": 70.641732269457, + "learning_rate": 3.24e-08, + "logits/chosen": -0.10593121498823166, + "logits/rejected": -0.029123404994606972, + "logps/chosen": -417.87298583984375, + "logps/rejected": -265.234619140625, + "loss": 0.5788, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7471208572387695, + "rewards/margins": 0.828920841217041, + "rewards/rejected": -0.08180002123117447, + "step": 747 + }, + { + "epoch": 0.45602804450541073, + "grad_norm": 55.11463059725924, + "learning_rate": 
3.244390243902439e-08, + "logits/chosen": 0.4029293656349182, + "logits/rejected": 0.32838955521583557, + "logps/chosen": -159.41629028320312, + "logps/rejected": -205.91880798339844, + "loss": 0.6031, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11138274520635605, + "rewards/margins": 0.7855024933815002, + "rewards/rejected": -0.8968852758407593, + "step": 748 + }, + { + "epoch": 0.45663770766651424, + "grad_norm": 63.761896045577224, + "learning_rate": 3.248780487804878e-08, + "logits/chosen": -0.12749917805194855, + "logits/rejected": 0.07667224854230881, + "logps/chosen": -210.73178100585938, + "logps/rejected": -122.18229675292969, + "loss": 0.5588, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.33733201026916504, + "rewards/margins": 0.7119225263595581, + "rewards/rejected": -0.37459051609039307, + "step": 749 + }, + { + "epoch": 0.45724737082761774, + "grad_norm": 62.32785592849077, + "learning_rate": 3.253170731707317e-08, + "logits/chosen": 0.1593983918428421, + "logits/rejected": -0.07764932513237, + "logps/chosen": -70.96792602539062, + "logps/rejected": -189.618408203125, + "loss": 0.6099, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.015467909164726734, + "rewards/margins": -0.08667121827602386, + "rewards/rejected": 0.10213913023471832, + "step": 750 + }, + { + "epoch": 0.45785703398872124, + "grad_norm": 60.625771850025146, + "learning_rate": 3.257560975609756e-08, + "logits/chosen": 0.17726343870162964, + "logits/rejected": 0.08285678178071976, + "logps/chosen": -150.5395050048828, + "logps/rejected": -177.4819793701172, + "loss": 0.5451, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4170660078525543, + "rewards/margins": 0.6691288352012634, + "rewards/rejected": -0.2520628273487091, + "step": 751 + }, + { + "epoch": 0.45846669714982474, + "grad_norm": 62.306705278437036, + "learning_rate": 3.261951219512195e-08, + "logits/chosen": 0.4701884090900421, + "logits/rejected": 0.14265279471874237, + "logps/chosen": -153.5452423095703, + "logps/rejected": -329.3506164550781, + "loss": 0.5386, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.16269707679748535, + "rewards/margins": 0.24957947432994843, + "rewards/rejected": -0.41227656602859497, + "step": 752 + }, + { + "epoch": 0.4590763603109282, + "grad_norm": 56.55226551304226, + "learning_rate": 3.266341463414634e-08, + "logits/chosen": 0.1673717498779297, + "logits/rejected": 0.3038822114467621, + "logps/chosen": -96.20490264892578, + "logps/rejected": -66.98352813720703, + "loss": 0.5467, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.062169261276721954, + "rewards/margins": 0.11970269680023193, + "rewards/rejected": -0.05753343924880028, + "step": 753 + }, + { + "epoch": 0.4596860234720317, + "grad_norm": 78.33667368470162, + "learning_rate": 3.270731707317073e-08, + "logits/chosen": 0.06571709364652634, + "logits/rejected": 0.0390804223716259, + "logps/chosen": -42.88839340209961, + "logps/rejected": -102.15044403076172, + "loss": 0.6561, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.013066366314888, + "rewards/margins": 0.48513418436050415, + "rewards/rejected": -0.47206783294677734, + "step": 754 + }, + { + "epoch": 0.4602956866331352, + "grad_norm": 64.30184268769287, + "learning_rate": 3.275121951219512e-08, + "logits/chosen": 0.3882122337818146, + "logits/rejected": 0.3242022395133972, + "logps/chosen": -19.027729034423828, + "logps/rejected": -46.444732666015625, + "loss": 0.6074, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08555036038160324, + "rewards/margins": 
0.20770028233528137, + "rewards/rejected": -0.12214992195367813, + "step": 755 + }, + { + "epoch": 0.4609053497942387, + "grad_norm": 65.97320436866737, + "learning_rate": 3.2795121951219514e-08, + "logits/chosen": -0.12974077463150024, + "logits/rejected": -0.003422953188419342, + "logps/chosen": -273.21502685546875, + "logps/rejected": -384.43841552734375, + "loss": 0.6027, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.04085254669189453, + "rewards/margins": -0.15066151320934296, + "rewards/rejected": 0.1915140599012375, + "step": 756 + }, + { + "epoch": 0.4615150129553422, + "grad_norm": 69.53782871578429, + "learning_rate": 3.28390243902439e-08, + "logits/chosen": -0.09778957068920135, + "logits/rejected": 0.13234618306159973, + "logps/chosen": -320.2364501953125, + "logps/rejected": -250.09844970703125, + "loss": 0.6215, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.20290826261043549, + "rewards/margins": -0.07620492577552795, + "rewards/rejected": 0.27911320328712463, + "step": 757 + }, + { + "epoch": 0.46212467611644564, + "grad_norm": 53.91068098227483, + "learning_rate": 3.288292682926829e-08, + "logits/chosen": -0.09435955435037613, + "logits/rejected": 0.3796374201774597, + "logps/chosen": -120.08363342285156, + "logps/rejected": -117.78384399414062, + "loss": 0.5211, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.44105276465415955, + "rewards/margins": 0.37706083059310913, + "rewards/rejected": 0.06399193406105042, + "step": 758 + }, + { + "epoch": 0.46273433927754914, + "grad_norm": 61.732824810998856, + "learning_rate": 3.292682926829268e-08, + "logits/chosen": 0.15379494428634644, + "logits/rejected": 0.1573495864868164, + "logps/chosen": -17.440587997436523, + "logps/rejected": -26.378684997558594, + "loss": 0.5885, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.09881146252155304, + "rewards/margins": 0.1264808624982834, + "rewards/rejected": -0.22529232501983643, + "step": 759 + }, + { + "epoch": 0.46334400243865265, + "grad_norm": 67.92257104269588, + "learning_rate": 3.2970731707317074e-08, + "logits/chosen": 0.20818915963172913, + "logits/rejected": 0.18402251601219177, + "logps/chosen": -188.99795532226562, + "logps/rejected": -129.03713989257812, + "loss": 0.609, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.17216423153877258, + "rewards/margins": 0.8478347063064575, + "rewards/rejected": -0.6756705045700073, + "step": 760 + }, + { + "epoch": 0.46395366559975615, + "grad_norm": 58.53609886318093, + "learning_rate": 3.301463414634146e-08, + "logits/chosen": 0.1354576051235199, + "logits/rejected": 0.09735407680273056, + "logps/chosen": -47.41239547729492, + "logps/rejected": -47.80207443237305, + "loss": 0.5519, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21174819767475128, + "rewards/margins": 0.41935431957244873, + "rewards/rejected": -0.20760615170001984, + "step": 761 + }, + { + "epoch": 0.46456332876085965, + "grad_norm": 68.31152706043322, + "learning_rate": 3.305853658536585e-08, + "logits/chosen": 0.46516382694244385, + "logits/rejected": 0.38613224029541016, + "logps/chosen": -70.07086181640625, + "logps/rejected": -119.79405975341797, + "loss": 0.6268, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.27662283182144165, + "rewards/margins": 0.43011096119880676, + "rewards/rejected": -0.15348811447620392, + "step": 762 + }, + { + "epoch": 0.4651729919219631, + "grad_norm": 56.83479023691454, + "learning_rate": 3.3102439024390246e-08, + "logits/chosen": 0.08870130777359009, + "logits/rejected": 0.08552171289920807, + 
"logps/chosen": -178.08047485351562, + "logps/rejected": -160.9407501220703, + "loss": 0.5903, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7953975796699524, + "rewards/margins": 0.7640186548233032, + "rewards/rejected": 0.031378939747810364, + "step": 763 + }, + { + "epoch": 0.4657826550830666, + "grad_norm": 55.16864538813876, + "learning_rate": 3.3146341463414634e-08, + "logits/chosen": 0.22163109481334686, + "logits/rejected": 0.15488380193710327, + "logps/chosen": -106.62368774414062, + "logps/rejected": -110.352783203125, + "loss": 0.5992, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.18241918087005615, + "rewards/margins": 0.18216146528720856, + "rewards/rejected": 0.0002577081322669983, + "step": 764 + }, + { + "epoch": 0.4663923182441701, + "grad_norm": 59.708401288405774, + "learning_rate": 3.319024390243902e-08, + "logits/chosen": -0.07213751971721649, + "logits/rejected": 0.07934854179620743, + "logps/chosen": -121.12588500976562, + "logps/rejected": -86.45989227294922, + "loss": 0.5534, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.13942264020442963, + "rewards/margins": 0.250957727432251, + "rewards/rejected": -0.3903804123401642, + "step": 765 + }, + { + "epoch": 0.4670019814052736, + "grad_norm": 71.78748459918226, + "learning_rate": 3.323414634146341e-08, + "logits/chosen": 0.11978058516979218, + "logits/rejected": 0.14904363453388214, + "logps/chosen": -119.83893585205078, + "logps/rejected": -123.9186782836914, + "loss": 0.559, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0574684701859951, + "rewards/margins": 0.0947904884815216, + "rewards/rejected": -0.0373220220208168, + "step": 766 + }, + { + "epoch": 0.46761164456637705, + "grad_norm": 64.72349909503079, + "learning_rate": 3.3278048780487806e-08, + "logits/chosen": 0.1507500261068344, + "logits/rejected": 0.18580779433250427, + "logps/chosen": -165.88743591308594, + "logps/rejected": -148.12942504882812, + "loss": 0.5935, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2788294553756714, + "rewards/margins": 0.4257279634475708, + "rewards/rejected": -0.1468985229730606, + "step": 767 + }, + { + "epoch": 0.46822130772748055, + "grad_norm": 70.78417825316278, + "learning_rate": 3.3321951219512195e-08, + "logits/chosen": 0.1303892731666565, + "logits/rejected": 0.1609259992837906, + "logps/chosen": -254.47705078125, + "logps/rejected": -325.6808776855469, + "loss": 0.6062, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.01449042558670044, + "rewards/margins": -0.027748242020606995, + "rewards/rejected": 0.04223868250846863, + "step": 768 + }, + { + "epoch": 0.46883097088858405, + "grad_norm": 60.23105195383743, + "learning_rate": 3.336585365853658e-08, + "logits/chosen": 0.22307035326957703, + "logits/rejected": 0.25752493739128113, + "logps/chosen": -244.8804931640625, + "logps/rejected": -125.85395050048828, + "loss": 0.5232, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.41586071252822876, + "rewards/margins": 0.7935232520103455, + "rewards/rejected": -0.3776625394821167, + "step": 769 + }, + { + "epoch": 0.46944063404968756, + "grad_norm": 77.77191005675465, + "learning_rate": 3.340975609756098e-08, + "logits/chosen": -0.04936651140451431, + "logits/rejected": -0.1360277533531189, + "logps/chosen": -56.69487762451172, + "logps/rejected": -89.06639099121094, + "loss": 0.6283, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.02953328937292099, + "rewards/margins": 0.020573187619447708, + "rewards/rejected": 0.00896010547876358, + "step": 770 + }, + { + "epoch": 
0.47005029721079106, + "grad_norm": 72.08810281987006, + "learning_rate": 3.3453658536585366e-08, + "logits/chosen": 0.43544837832450867, + "logits/rejected": 0.35976696014404297, + "logps/chosen": -77.24356842041016, + "logps/rejected": -106.05728149414062, + "loss": 0.6727, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.12813350558280945, + "rewards/margins": 0.019798681139945984, + "rewards/rejected": 0.10833483189344406, + "step": 771 + }, + { + "epoch": 0.4706599603718945, + "grad_norm": 65.0640779404551, + "learning_rate": 3.3497560975609755e-08, + "logits/chosen": 0.005504929460585117, + "logits/rejected": 0.35064205527305603, + "logps/chosen": -111.17489624023438, + "logps/rejected": -67.89197540283203, + "loss": 0.6024, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.017039109021425247, + "rewards/margins": 0.2605786621570587, + "rewards/rejected": -0.24353954195976257, + "step": 772 + }, + { + "epoch": 0.471269623532998, + "grad_norm": 71.73403148285288, + "learning_rate": 3.354146341463414e-08, + "logits/chosen": 0.22017183899879456, + "logits/rejected": 0.06910734623670578, + "logps/chosen": -134.75439453125, + "logps/rejected": -162.3470916748047, + "loss": 0.6302, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.06771198660135269, + "rewards/margins": -0.23933552205562592, + "rewards/rejected": 0.17162355780601501, + "step": 773 + }, + { + "epoch": 0.4718792866941015, + "grad_norm": 64.68294632257992, + "learning_rate": 3.358536585365854e-08, + "logits/chosen": 0.04541367292404175, + "logits/rejected": 0.33783969283103943, + "logps/chosen": -463.33154296875, + "logps/rejected": -363.5344543457031, + "loss": 0.4604, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3243526816368103, + "rewards/margins": 0.3368990421295166, + "rewards/rejected": -0.012546353042125702, + "step": 774 + }, + { + "epoch": 0.472488949855205, + "grad_norm": 56.07396666651526, + "learning_rate": 3.3629268292682926e-08, + "logits/chosen": 0.055941544473171234, + "logits/rejected": 0.06496048718690872, + "logps/chosen": -14.561441421508789, + "logps/rejected": -14.488389015197754, + "loss": 0.5139, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.004245104733854532, + "rewards/margins": 0.3938329219818115, + "rewards/rejected": -0.3895878493785858, + "step": 775 + }, + { + "epoch": 0.4730986130163085, + "grad_norm": 75.1658068785713, + "learning_rate": 3.3673170731707315e-08, + "logits/chosen": 0.3035067617893219, + "logits/rejected": 0.17092974483966827, + "logps/chosen": -83.666015625, + "logps/rejected": -142.3246612548828, + "loss": 0.6228, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.009215408936142921, + "rewards/margins": 0.10090556740760803, + "rewards/rejected": -0.09169016033411026, + "step": 776 + }, + { + "epoch": 0.47370827617741196, + "grad_norm": 71.25566605177507, + "learning_rate": 3.371707317073171e-08, + "logits/chosen": -0.01590004190802574, + "logits/rejected": 0.060013532638549805, + "logps/chosen": -80.48770904541016, + "logps/rejected": -15.409080505371094, + "loss": 0.6352, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.018049947917461395, + "rewards/margins": 0.41123759746551514, + "rewards/rejected": -0.4292875826358795, + "step": 777 + }, + { + "epoch": 0.47431793933851546, + "grad_norm": 64.25828490732628, + "learning_rate": 3.376097560975609e-08, + "logits/chosen": -0.13783808052539825, + "logits/rejected": -0.1846710592508316, + "logps/chosen": -90.84169006347656, + "logps/rejected": -165.5963897705078, + "loss": 0.5637, + "rewards/accuracies": 
0.75, + "rewards/chosen": 0.2065526843070984, + "rewards/margins": 0.4452173709869385, + "rewards/rejected": -0.2386646866798401, + "step": 778 + }, + { + "epoch": 0.47492760249961896, + "grad_norm": 58.87728516311014, + "learning_rate": 3.3804878048780487e-08, + "logits/chosen": 0.06521575152873993, + "logits/rejected": 0.16959774494171143, + "logps/chosen": -236.0845947265625, + "logps/rejected": -152.92738342285156, + "loss": 0.6085, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.16076885163784027, + "rewards/margins": 0.25768551230430603, + "rewards/rejected": -0.09691667556762695, + "step": 779 + }, + { + "epoch": 0.47553726566072246, + "grad_norm": 68.22586588198327, + "learning_rate": 3.3848780487804875e-08, + "logits/chosen": 0.14443618059158325, + "logits/rejected": 0.10864615440368652, + "logps/chosen": -75.4122543334961, + "logps/rejected": -193.24659729003906, + "loss": 0.5583, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.258346289396286, + "rewards/margins": 1.209718108177185, + "rewards/rejected": -0.9513717889785767, + "step": 780 + }, + { + "epoch": 0.47614692882182597, + "grad_norm": 58.829637383991205, + "learning_rate": 3.3892682926829263e-08, + "logits/chosen": 0.1799907088279724, + "logits/rejected": 0.1539163887500763, + "logps/chosen": -20.66031265258789, + "logps/rejected": -24.9801082611084, + "loss": 0.555, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4109232723712921, + "rewards/margins": 0.29648423194885254, + "rewards/rejected": -0.7074074745178223, + "step": 781 + }, + { + "epoch": 0.4767565919829294, + "grad_norm": 73.41252613376706, + "learning_rate": 3.393658536585365e-08, + "logits/chosen": 0.049098651856184006, + "logits/rejected": 0.28672829270362854, + "logps/chosen": -61.963924407958984, + "logps/rejected": -24.833805084228516, + "loss": 0.5672, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.2053811103105545, + "rewards/margins": -0.0545276403427124, + "rewards/rejected": -0.1508534699678421, + "step": 782 + }, + { + "epoch": 0.4773662551440329, + "grad_norm": 66.08018083342746, + "learning_rate": 3.398048780487805e-08, + "logits/chosen": 0.45004844665527344, + "logits/rejected": 0.1189308911561966, + "logps/chosen": -116.97759246826172, + "logps/rejected": -171.6212615966797, + "loss": 0.5687, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.443662166595459, + "rewards/margins": 0.3807649314403534, + "rewards/rejected": 0.06289726495742798, + "step": 783 + }, + { + "epoch": 0.4779759183051364, + "grad_norm": 75.15703338972078, + "learning_rate": 3.4024390243902435e-08, + "logits/chosen": 0.23602449893951416, + "logits/rejected": 0.30505746603012085, + "logps/chosen": -216.32568359375, + "logps/rejected": -34.69765853881836, + "loss": 0.6908, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.19814635813236237, + "rewards/margins": 0.23832713067531586, + "rewards/rejected": -0.040180787444114685, + "step": 784 + }, + { + "epoch": 0.4785855814662399, + "grad_norm": 73.88348076281862, + "learning_rate": 3.4068292682926823e-08, + "logits/chosen": 0.32259875535964966, + "logits/rejected": 0.0982908308506012, + "logps/chosen": -185.67340087890625, + "logps/rejected": -297.8865661621094, + "loss": 0.5996, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5188018083572388, + "rewards/margins": 0.3751254975795746, + "rewards/rejected": 0.1436762809753418, + "step": 785 + }, + { + "epoch": 0.4791952446273434, + "grad_norm": 51.28441397571893, + "learning_rate": 3.411219512195122e-08, + "logits/chosen": -0.10330458730459213, + 
"logits/rejected": 0.06638757139444351, + "logps/chosen": -153.25030517578125, + "logps/rejected": -143.166015625, + "loss": 0.525, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.29190418124198914, + "rewards/margins": 0.3344435691833496, + "rewards/rejected": -0.042539406567811966, + "step": 786 + }, + { + "epoch": 0.47980490778844687, + "grad_norm": 65.09062711909723, + "learning_rate": 3.415609756097561e-08, + "logits/chosen": 0.33637189865112305, + "logits/rejected": 0.19565415382385254, + "logps/chosen": -157.9989013671875, + "logps/rejected": -210.083251953125, + "loss": 0.5036, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3962502181529999, + "rewards/margins": 0.6286591291427612, + "rewards/rejected": -0.23240889608860016, + "step": 787 + }, + { + "epoch": 0.48041457094955037, + "grad_norm": 77.862511455403, + "learning_rate": 3.4199999999999995e-08, + "logits/chosen": 0.09237313270568848, + "logits/rejected": 0.25440582633018494, + "logps/chosen": -161.5814666748047, + "logps/rejected": -174.3079071044922, + "loss": 0.7826, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.13777244091033936, + "rewards/margins": 0.24795600771903992, + "rewards/rejected": -0.11018357425928116, + "step": 788 + }, + { + "epoch": 0.48102423411065387, + "grad_norm": 66.8724467149785, + "learning_rate": 3.4243902439024384e-08, + "logits/chosen": 0.10143324732780457, + "logits/rejected": -0.1377866119146347, + "logps/chosen": -98.13770294189453, + "logps/rejected": -253.71200561523438, + "loss": 0.541, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03572654724121094, + "rewards/margins": 0.2794824540615082, + "rewards/rejected": -0.24375592172145844, + "step": 789 + }, + { + "epoch": 0.48163389727175737, + "grad_norm": 69.73316263825721, + "learning_rate": 3.428780487804878e-08, + "logits/chosen": 0.10662485659122467, + "logits/rejected": -0.2586067318916321, + "logps/chosen": -158.9857940673828, + "logps/rejected": -393.30560302734375, + "loss": 0.6193, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.19432978332042694, + "rewards/margins": 0.19431820511817932, + "rewards/rejected": 1.1574476957321167e-05, + "step": 790 + }, + { + "epoch": 0.4822435604328608, + "grad_norm": 61.74529493736086, + "learning_rate": 3.433170731707317e-08, + "logits/chosen": -0.05212089419364929, + "logits/rejected": -0.020766697824001312, + "logps/chosen": -66.56182861328125, + "logps/rejected": -34.273502349853516, + "loss": 0.6464, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.17125204205513, + "rewards/margins": 0.32071420550346375, + "rewards/rejected": -0.14946216344833374, + "step": 791 + }, + { + "epoch": 0.4828532235939643, + "grad_norm": 70.43491750749516, + "learning_rate": 3.4375609756097555e-08, + "logits/chosen": 0.25808200240135193, + "logits/rejected": 0.18567781150341034, + "logps/chosen": -238.76187133789062, + "logps/rejected": -250.26937866210938, + "loss": 0.5948, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.35204005241394043, + "rewards/margins": 0.3273167014122009, + "rewards/rejected": 0.024723336100578308, + "step": 792 + }, + { + "epoch": 0.4834628867550678, + "grad_norm": 66.69868840488981, + "learning_rate": 3.441951219512195e-08, + "logits/chosen": 0.030955903232097626, + "logits/rejected": 0.27113527059555054, + "logps/chosen": -398.656494140625, + "logps/rejected": -278.81207275390625, + "loss": 0.6059, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.07109531760215759, + "rewards/margins": 0.1095837950706482, + "rewards/rejected": -0.0384884774684906, + 
"step": 793 + }, + { + "epoch": 0.4840725499161713, + "grad_norm": 63.418428506006464, + "learning_rate": 3.446341463414634e-08, + "logits/chosen": -0.17916786670684814, + "logits/rejected": 0.534211277961731, + "logps/chosen": -233.7653350830078, + "logps/rejected": -164.34133911132812, + "loss": 0.6669, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.025973699986934662, + "rewards/margins": 0.17903375625610352, + "rewards/rejected": -0.15306004881858826, + "step": 794 + }, + { + "epoch": 0.4846822130772748, + "grad_norm": 76.56433277831285, + "learning_rate": 3.450731707317073e-08, + "logits/chosen": 0.09799869358539581, + "logits/rejected": 0.17340679466724396, + "logps/chosen": -222.72579956054688, + "logps/rejected": -194.01608276367188, + "loss": 0.6114, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.159549742937088, + "rewards/margins": 0.5828905701637268, + "rewards/rejected": -0.4233408570289612, + "step": 795 + }, + { + "epoch": 0.4852918762383783, + "grad_norm": 65.71962843442641, + "learning_rate": 3.455121951219512e-08, + "logits/chosen": 0.2741386592388153, + "logits/rejected": 0.17925170063972473, + "logps/chosen": -96.66497802734375, + "logps/rejected": -116.22662353515625, + "loss": 0.6093, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08882968872785568, + "rewards/margins": 0.23059532046318054, + "rewards/rejected": -0.31942498683929443, + "step": 796 + }, + { + "epoch": 0.4859015393994818, + "grad_norm": 62.00727816555687, + "learning_rate": 3.459512195121951e-08, + "logits/chosen": 0.14391323924064636, + "logits/rejected": 0.09784625470638275, + "logps/chosen": -168.19680786132812, + "logps/rejected": -135.7476348876953, + "loss": 0.5661, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1364220678806305, + "rewards/margins": 0.20289844274520874, + "rewards/rejected": -0.33932051062583923, + "step": 797 + }, + { + "epoch": 0.4865112025605853, + "grad_norm": 56.18647820236656, + "learning_rate": 3.46390243902439e-08, + "logits/chosen": 0.16822977364063263, + "logits/rejected": 0.30192238092422485, + "logps/chosen": -10.904973983764648, + "logps/rejected": -24.79149055480957, + "loss": 0.6166, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.09225364774465561, + "rewards/margins": 0.22615376114845276, + "rewards/rejected": -0.31840741634368896, + "step": 798 + }, + { + "epoch": 0.4871208657216888, + "grad_norm": 95.52585477358602, + "learning_rate": 3.468292682926829e-08, + "logits/chosen": 0.21458560228347778, + "logits/rejected": 0.4480295479297638, + "logps/chosen": -96.76959228515625, + "logps/rejected": -62.71880340576172, + "loss": 0.7004, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.16143067181110382, + "rewards/margins": 0.20936936140060425, + "rewards/rejected": -0.3708000183105469, + "step": 799 + }, + { + "epoch": 0.4877305288827923, + "grad_norm": 76.03065118312753, + "learning_rate": 3.472682926829268e-08, + "logits/chosen": 0.0231266301125288, + "logits/rejected": 0.031965479254722595, + "logps/chosen": -240.28280639648438, + "logps/rejected": -293.67279052734375, + "loss": 0.593, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.24128007888793945, + "rewards/margins": 0.6292192339897156, + "rewards/rejected": -0.38793909549713135, + "step": 800 + }, + { + "epoch": 0.4883401920438957, + "grad_norm": 66.69555162121341, + "learning_rate": 3.477073170731707e-08, + "logits/chosen": 0.1899629682302475, + "logits/rejected": -0.011241592466831207, + "logps/chosen": -166.8588409423828, + "logps/rejected": -318.6372985839844, + "loss": 
0.65, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2232269048690796, + "rewards/margins": 0.338846355676651, + "rewards/rejected": -0.11561945825815201, + "step": 801 + }, + { + "epoch": 0.48894985520499923, + "grad_norm": 63.70093626901587, + "learning_rate": 3.481463414634146e-08, + "logits/chosen": 0.14436128735542297, + "logits/rejected": 0.17100831866264343, + "logps/chosen": -100.22640228271484, + "logps/rejected": -85.49988555908203, + "loss": 0.5956, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2008926421403885, + "rewards/margins": 0.4725527763366699, + "rewards/rejected": -0.27166011929512024, + "step": 802 + }, + { + "epoch": 0.48955951836610273, + "grad_norm": 57.13753896864066, + "learning_rate": 3.4858536585365854e-08, + "logits/chosen": 0.21252959966659546, + "logits/rejected": 0.2079596370458603, + "logps/chosen": -17.76156234741211, + "logps/rejected": -30.843708038330078, + "loss": 0.5982, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.32302653789520264, + "rewards/margins": 0.3257739543914795, + "rewards/rejected": -0.6488004922866821, + "step": 803 + }, + { + "epoch": 0.49016918152720623, + "grad_norm": 83.44429727902448, + "learning_rate": 3.490243902439024e-08, + "logits/chosen": 0.029475249350070953, + "logits/rejected": 0.16109654307365417, + "logps/chosen": -177.29476928710938, + "logps/rejected": -206.28036499023438, + "loss": 0.6798, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.046961214393377304, + "rewards/margins": -0.10674400627613068, + "rewards/rejected": 0.15370520949363708, + "step": 804 + }, + { + "epoch": 0.49077884468830973, + "grad_norm": 55.19283959826443, + "learning_rate": 3.494634146341463e-08, + "logits/chosen": 0.07019703090190887, + "logits/rejected": 0.16316017508506775, + "logps/chosen": -129.90567016601562, + "logps/rejected": -123.85980224609375, + "loss": 0.5812, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.18906404078006744, + "rewards/margins": -0.09188410639762878, + "rewards/rejected": 0.28094813227653503, + "step": 805 + }, + { + "epoch": 0.4913885078494132, + "grad_norm": 71.98262420823814, + "learning_rate": 3.499024390243902e-08, + "logits/chosen": 0.22487740218639374, + "logits/rejected": 0.24013380706310272, + "logps/chosen": -52.62399673461914, + "logps/rejected": -78.3775634765625, + "loss": 0.6248, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.27006229758262634, + "rewards/margins": -0.24889151751995087, + "rewards/rejected": -0.021170781925320625, + "step": 806 + }, + { + "epoch": 0.4919981710105167, + "grad_norm": 63.106620429570484, + "learning_rate": 3.5034146341463414e-08, + "logits/chosen": 0.17405636608600616, + "logits/rejected": 0.15376809239387512, + "logps/chosen": -63.981300354003906, + "logps/rejected": -71.03421020507812, + "loss": 0.5691, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07564292848110199, + "rewards/margins": 0.6431266069412231, + "rewards/rejected": -0.7187695503234863, + "step": 807 + }, + { + "epoch": 0.4926078341716202, + "grad_norm": 77.21823359518474, + "learning_rate": 3.50780487804878e-08, + "logits/chosen": 0.2948898375034332, + "logits/rejected": 0.07608085125684738, + "logps/chosen": -281.1902770996094, + "logps/rejected": -407.0839538574219, + "loss": 0.5953, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.11270199716091156, + "rewards/margins": 0.30205726623535156, + "rewards/rejected": -0.1893552839756012, + "step": 808 + }, + { + "epoch": 0.4932174973327237, + "grad_norm": 65.03744614118033, + "learning_rate": 3.512195121951219e-08, + 
"logits/chosen": -0.0038300901651382446, + "logits/rejected": -0.020449087023735046, + "logps/chosen": -101.77850341796875, + "logps/rejected": -89.84571838378906, + "loss": 0.5875, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.48315876722335815, + "rewards/margins": 0.2094717025756836, + "rewards/rejected": -0.6926305294036865, + "step": 809 + }, + { + "epoch": 0.49382716049382713, + "grad_norm": 60.374855244179564, + "learning_rate": 3.5165853658536586e-08, + "logits/chosen": -0.4043070077896118, + "logits/rejected": 0.2836885452270508, + "logps/chosen": -317.43115234375, + "logps/rejected": -238.12161254882812, + "loss": 0.4963, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3807636499404907, + "rewards/margins": 0.14152394235134125, + "rewards/rejected": 0.23923969268798828, + "step": 810 + }, + { + "epoch": 0.49443682365493064, + "grad_norm": 58.054693130003635, + "learning_rate": 3.5209756097560974e-08, + "logits/chosen": 0.2596984803676605, + "logits/rejected": 0.18857869505882263, + "logps/chosen": -195.6874237060547, + "logps/rejected": -212.06976318359375, + "loss": 0.5753, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.16618141531944275, + "rewards/margins": 1.0704714059829712, + "rewards/rejected": -0.904289960861206, + "step": 811 + }, + { + "epoch": 0.49504648681603414, + "grad_norm": 66.85845810270146, + "learning_rate": 3.525365853658536e-08, + "logits/chosen": 0.01069004088640213, + "logits/rejected": 0.1511375904083252, + "logps/chosen": -119.74539184570312, + "logps/rejected": -60.879364013671875, + "loss": 0.623, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.029255736619234085, + "rewards/margins": 0.06261497735977173, + "rewards/rejected": -0.09187071770429611, + "step": 812 + }, + { + "epoch": 0.49565614997713764, + "grad_norm": 64.44519241903701, + "learning_rate": 3.529756097560975e-08, + "logits/chosen": -0.05384228006005287, + "logits/rejected": -0.09951944649219513, + "logps/chosen": -115.72950744628906, + "logps/rejected": -226.60693359375, + "loss": 0.5969, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09954290091991425, + "rewards/margins": 0.9953127503395081, + "rewards/rejected": -0.8957698345184326, + "step": 813 + }, + { + "epoch": 0.49626581313824114, + "grad_norm": 64.73999018145003, + "learning_rate": 3.5341463414634146e-08, + "logits/chosen": -0.02636069990694523, + "logits/rejected": 0.27930817008018494, + "logps/chosen": -138.90631103515625, + "logps/rejected": -101.73639678955078, + "loss": 0.6013, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.15106813609600067, + "rewards/margins": 0.04017619788646698, + "rewards/rejected": 0.11089195311069489, + "step": 814 + }, + { + "epoch": 0.4968754762993446, + "grad_norm": 80.06874543046527, + "learning_rate": 3.5385365853658534e-08, + "logits/chosen": 0.10787045955657959, + "logits/rejected": 0.16068562865257263, + "logps/chosen": -182.59161376953125, + "logps/rejected": -177.74913024902344, + "loss": 0.6295, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.33856871724128723, + "rewards/margins": 0.3896040916442871, + "rewards/rejected": -0.051035378128290176, + "step": 815 + }, + { + "epoch": 0.4974851394604481, + "grad_norm": 63.31855345344853, + "learning_rate": 3.542926829268292e-08, + "logits/chosen": -0.02349713072180748, + "logits/rejected": 0.2184727042913437, + "logps/chosen": -324.282470703125, + "logps/rejected": -248.28179931640625, + "loss": 0.5179, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9143310785293579, + "rewards/margins": 1.4258538484573364, + 
"rewards/rejected": -0.5115228891372681, + "step": 816 + }, + { + "epoch": 0.4980948026215516, + "grad_norm": 68.31154055717026, + "learning_rate": 3.547317073170732e-08, + "logits/chosen": 0.2828770577907562, + "logits/rejected": 0.13964536786079407, + "logps/chosen": -114.00566864013672, + "logps/rejected": -157.14462280273438, + "loss": 0.5882, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.21594074368476868, + "rewards/margins": -0.019449200481176376, + "rewards/rejected": 0.23538993299007416, + "step": 817 + }, + { + "epoch": 0.4987044657826551, + "grad_norm": 62.69883087366106, + "learning_rate": 3.5517073170731706e-08, + "logits/chosen": 0.2960382401943207, + "logits/rejected": 0.1413854956626892, + "logps/chosen": -118.39256286621094, + "logps/rejected": -212.56390380859375, + "loss": 0.5799, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.15546150505542755, + "rewards/margins": 0.7765097618103027, + "rewards/rejected": -0.6210482120513916, + "step": 818 + }, + { + "epoch": 0.4993141289437586, + "grad_norm": 75.52294565825872, + "learning_rate": 3.5560975609756094e-08, + "logits/chosen": -0.18223996460437775, + "logits/rejected": -0.20841598510742188, + "logps/chosen": -236.8354949951172, + "logps/rejected": -340.8524169921875, + "loss": 0.5831, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.37170901894569397, + "rewards/margins": -0.08836756646633148, + "rewards/rejected": 0.46007657051086426, + "step": 819 + }, + { + "epoch": 0.49992379210486204, + "grad_norm": 53.98723231146931, + "learning_rate": 3.560487804878048e-08, + "logits/chosen": -0.06694330275058746, + "logits/rejected": -0.0628398060798645, + "logps/chosen": -155.86611938476562, + "logps/rejected": -212.4217071533203, + "loss": 0.5504, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.22390814125537872, + "rewards/margins": -0.0001362636685371399, + "rewards/rejected": 0.22404442727565765, + "step": 820 + }, + { + "epoch": 0.5005334552659656, + "grad_norm": 72.27416260264785, + "learning_rate": 3.564878048780488e-08, + "logits/chosen": -0.04299474135041237, + "logits/rejected": -0.007550263777375221, + "logps/chosen": -96.05432891845703, + "logps/rejected": -57.760093688964844, + "loss": 0.5394, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.42605799436569214, + "rewards/margins": 0.6398237347602844, + "rewards/rejected": -0.21376578509807587, + "step": 821 + }, + { + "epoch": 0.501143118427069, + "grad_norm": 55.982740371709816, + "learning_rate": 3.5692682926829266e-08, + "logits/chosen": 0.37543606758117676, + "logits/rejected": 0.1279752552509308, + "logps/chosen": -35.941551208496094, + "logps/rejected": -66.75108337402344, + "loss": 0.525, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.15648934245109558, + "rewards/margins": -0.1225501149892807, + "rewards/rejected": -0.03393923491239548, + "step": 822 + }, + { + "epoch": 0.5017527815881725, + "grad_norm": 64.53581083310333, + "learning_rate": 3.5736585365853655e-08, + "logits/chosen": 0.1085236594080925, + "logits/rejected": 0.1588239073753357, + "logps/chosen": -83.08793640136719, + "logps/rejected": -34.56549072265625, + "loss": 0.6437, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0573994405567646, + "rewards/margins": 0.02727280557155609, + "rewards/rejected": 0.03012663498520851, + "step": 823 + }, + { + "epoch": 0.502362444749276, + "grad_norm": 67.90415545067161, + "learning_rate": 3.578048780487805e-08, + "logits/chosen": 0.019908029586076736, + "logits/rejected": 0.31911247968673706, + "logps/chosen": -297.48858642578125, + 
"logps/rejected": -111.53302764892578, + "loss": 0.6178, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.19981935620307922, + "rewards/margins": -0.2561364769935608, + "rewards/rejected": 0.056317128241062164, + "step": 824 + }, + { + "epoch": 0.5029721079103795, + "grad_norm": 74.7628221054287, + "learning_rate": 3.582439024390244e-08, + "logits/chosen": 0.09637068212032318, + "logits/rejected": 0.14802901446819305, + "logps/chosen": -271.69989013671875, + "logps/rejected": -203.29689025878906, + "loss": 0.5978, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.02346685528755188, + "rewards/margins": 0.22066234052181244, + "rewards/rejected": -0.24412918090820312, + "step": 825 + }, + { + "epoch": 0.503581771071483, + "grad_norm": 53.67901711490742, + "learning_rate": 3.5868292682926826e-08, + "logits/chosen": 0.10175606608390808, + "logits/rejected": 0.3559953570365906, + "logps/chosen": -196.19017028808594, + "logps/rejected": -213.1588897705078, + "loss": 0.6125, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.038393739610910416, + "rewards/margins": 0.41897690296173096, + "rewards/rejected": -0.38058316707611084, + "step": 826 + }, + { + "epoch": 0.5041914342325865, + "grad_norm": 61.24527426927978, + "learning_rate": 3.5912195121951215e-08, + "logits/chosen": 0.2617185711860657, + "logits/rejected": 0.2609824538230896, + "logps/chosen": -9.725150108337402, + "logps/rejected": -24.58201026916504, + "loss": 0.5685, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2893357276916504, + "rewards/margins": -0.005685422569513321, + "rewards/rejected": -0.28365030884742737, + "step": 827 + }, + { + "epoch": 0.50480109739369, + "grad_norm": 60.296376350666556, + "learning_rate": 3.595609756097561e-08, + "logits/chosen": 0.08205138146877289, + "logits/rejected": 0.09890572726726532, + "logps/chosen": -92.11471557617188, + "logps/rejected": -139.8546905517578, + "loss": 0.5405, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.185858815908432, + "rewards/margins": 0.019047502428293228, + "rewards/rejected": 0.16681131720542908, + "step": 828 + }, + { + "epoch": 0.5054107605547935, + "grad_norm": 62.6042647877219, + "learning_rate": 3.6e-08, + "logits/chosen": 0.007369082421064377, + "logits/rejected": -0.05879281833767891, + "logps/chosen": -149.5255126953125, + "logps/rejected": -332.4320373535156, + "loss": 0.5992, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.43801242113113403, + "rewards/margins": 0.6028183698654175, + "rewards/rejected": -0.16480599343776703, + "step": 829 + }, + { + "epoch": 0.506020423715897, + "grad_norm": 68.81408940827944, + "learning_rate": 3.6043902439024386e-08, + "logits/chosen": -0.05554075911641121, + "logits/rejected": 0.07496682554483414, + "logps/chosen": -106.6781234741211, + "logps/rejected": -69.4856185913086, + "loss": 0.5799, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.16205430030822754, + "rewards/margins": 0.10765685886144638, + "rewards/rejected": -0.2697111666202545, + "step": 830 + }, + { + "epoch": 0.5066300868770005, + "grad_norm": 63.766841041160916, + "learning_rate": 3.608780487804878e-08, + "logits/chosen": 0.2696358561515808, + "logits/rejected": 0.11477681994438171, + "logps/chosen": -103.76864624023438, + "logps/rejected": -169.93309020996094, + "loss": 0.6062, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.2814573645591736, + "rewards/margins": -0.2618228495121002, + "rewards/rejected": -0.019634537398815155, + "step": 831 + }, + { + "epoch": 0.507239750038104, + "grad_norm": 66.44566325037502, + 
"learning_rate": 3.613170731707317e-08, + "logits/chosen": 0.006435887888073921, + "logits/rejected": 0.14947624504566193, + "logps/chosen": -276.1130065917969, + "logps/rejected": -197.30471801757812, + "loss": 0.5337, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0059828683733940125, + "rewards/margins": -0.0971691757440567, + "rewards/rejected": 0.10315204411745071, + "step": 832 + }, + { + "epoch": 0.5078494131992074, + "grad_norm": 70.33743299351269, + "learning_rate": 3.617560975609756e-08, + "logits/chosen": 0.12623409926891327, + "logits/rejected": 0.08283305913209915, + "logps/chosen": -187.23097229003906, + "logps/rejected": -140.1327362060547, + "loss": 0.539, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0500035360455513, + "rewards/margins": 0.39642155170440674, + "rewards/rejected": -0.34641802310943604, + "step": 833 + }, + { + "epoch": 0.508459076360311, + "grad_norm": 119.87922314374883, + "learning_rate": 3.6219512195121947e-08, + "logits/chosen": 0.28614529967308044, + "logits/rejected": 0.12593020498752594, + "logps/chosen": -44.036285400390625, + "logps/rejected": -101.87958526611328, + "loss": 0.6763, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.10472209751605988, + "rewards/margins": 0.09965291619300842, + "rewards/rejected": 0.0050691841170191765, + "step": 834 + }, + { + "epoch": 0.5090687395214144, + "grad_norm": 55.442194966055226, + "learning_rate": 3.626341463414634e-08, + "logits/chosen": 0.0738789290189743, + "logits/rejected": 0.09468599408864975, + "logps/chosen": -87.6014633178711, + "logps/rejected": -74.97404479980469, + "loss": 0.4878, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.07335515320301056, + "rewards/margins": 0.3396499752998352, + "rewards/rejected": -0.26629480719566345, + "step": 835 + }, + { + "epoch": 0.509678402682518, + "grad_norm": 69.88154759613471, + "learning_rate": 3.630731707317073e-08, + "logits/chosen": -0.13200712203979492, + "logits/rejected": -0.39851513504981995, + "logps/chosen": -58.32075119018555, + "logps/rejected": -90.2609634399414, + "loss": 0.6329, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1759040355682373, + "rewards/margins": 0.18853089213371277, + "rewards/rejected": -0.012626864947378635, + "step": 836 + }, + { + "epoch": 0.5102880658436214, + "grad_norm": 77.89392463115561, + "learning_rate": 3.635121951219512e-08, + "logits/chosen": 0.0990343987941742, + "logits/rejected": -0.007436167448759079, + "logps/chosen": -202.43553161621094, + "logps/rejected": -415.59759521484375, + "loss": 0.629, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.41394180059432983, + "rewards/margins": 1.3712553977966309, + "rewards/rejected": -0.957313597202301, + "step": 837 + }, + { + "epoch": 0.5108977290047249, + "grad_norm": 78.6798043342552, + "learning_rate": 3.6395121951219513e-08, + "logits/chosen": 0.19441884756088257, + "logits/rejected": 0.02460932731628418, + "logps/chosen": -160.54754638671875, + "logps/rejected": -182.2922821044922, + "loss": 0.6971, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.20073196291923523, + "rewards/margins": 0.34828808903694153, + "rewards/rejected": -0.1475561112165451, + "step": 838 + }, + { + "epoch": 0.5115073921658284, + "grad_norm": 57.78983825593819, + "learning_rate": 3.64390243902439e-08, + "logits/chosen": 0.33323365449905396, + "logits/rejected": 0.31273379921913147, + "logps/chosen": -209.37721252441406, + "logps/rejected": -98.8693618774414, + "loss": 0.5343, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11098715662956238, + 
"rewards/margins": 0.09107886254787445, + "rewards/rejected": -0.20206600427627563, + "step": 839 + }, + { + "epoch": 0.5121170553269319, + "grad_norm": 75.61649299804209, + "learning_rate": 3.648292682926829e-08, + "logits/chosen": 0.11817595362663269, + "logits/rejected": 0.14771093428134918, + "logps/chosen": -24.33318519592285, + "logps/rejected": -10.833059310913086, + "loss": 0.6331, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1404966413974762, + "rewards/margins": 0.28867948055267334, + "rewards/rejected": -0.42917612195014954, + "step": 840 + }, + { + "epoch": 0.5127267184880354, + "grad_norm": 65.68720728079661, + "learning_rate": 3.652682926829268e-08, + "logits/chosen": 0.18452531099319458, + "logits/rejected": 0.18431918323040009, + "logps/chosen": -7.983917236328125, + "logps/rejected": -15.626996994018555, + "loss": 0.5683, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.25485336780548096, + "rewards/margins": -0.12557300925254822, + "rewards/rejected": -0.12928032875061035, + "step": 841 + }, + { + "epoch": 0.5133363816491389, + "grad_norm": 72.0016158402061, + "learning_rate": 3.6570731707317073e-08, + "logits/chosen": 0.04569125175476074, + "logits/rejected": 0.0868416428565979, + "logps/chosen": -43.657135009765625, + "logps/rejected": -40.785888671875, + "loss": 0.5914, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11346039175987244, + "rewards/margins": 0.20358936488628387, + "rewards/rejected": -0.09012897312641144, + "step": 842 + }, + { + "epoch": 0.5139460448102423, + "grad_norm": 72.82023770265735, + "learning_rate": 3.661463414634146e-08, + "logits/chosen": -0.1773550808429718, + "logits/rejected": -0.10552921891212463, + "logps/chosen": -317.5165710449219, + "logps/rejected": -306.7833251953125, + "loss": 0.637, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8384914994239807, + "rewards/margins": 0.588058590888977, + "rewards/rejected": 0.25043290853500366, + "step": 843 + }, + { + "epoch": 0.5145557079713459, + "grad_norm": 62.38563307817259, + "learning_rate": 3.665853658536585e-08, + "logits/chosen": 0.003631487488746643, + "logits/rejected": 0.038138628005981445, + "logps/chosen": -165.54515075683594, + "logps/rejected": -152.398681640625, + "loss": 0.5624, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.12048117816448212, + "rewards/margins": 0.28906017541885376, + "rewards/rejected": -0.16857901215553284, + "step": 844 + }, + { + "epoch": 0.5151653711324493, + "grad_norm": 59.068977108334586, + "learning_rate": 3.6702439024390245e-08, + "logits/chosen": 0.6819825172424316, + "logits/rejected": 0.3414430618286133, + "logps/chosen": -51.40108871459961, + "logps/rejected": -103.67996215820312, + "loss": 0.5724, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.02640927955508232, + "rewards/margins": 0.23062248528003693, + "rewards/rejected": -0.25703176856040955, + "step": 845 + }, + { + "epoch": 0.5157750342935528, + "grad_norm": 52.870770819009095, + "learning_rate": 3.6746341463414634e-08, + "logits/chosen": 0.09804527461528778, + "logits/rejected": 0.22699229419231415, + "logps/chosen": -123.43194580078125, + "logps/rejected": -122.18966674804688, + "loss": 0.5079, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.11057642847299576, + "rewards/margins": 0.23703187704086304, + "rewards/rejected": -0.12645544111728668, + "step": 846 + }, + { + "epoch": 0.5163846974546563, + "grad_norm": 65.58905706663117, + "learning_rate": 3.679024390243902e-08, + "logits/chosen": 0.08693084865808487, + "logits/rejected": 0.333507776260376, + 
"logps/chosen": -99.66699981689453, + "logps/rejected": -41.3652458190918, + "loss": 0.5967, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.32221508026123047, + "rewards/margins": -0.25529253482818604, + "rewards/rejected": -0.06692254543304443, + "step": 847 + }, + { + "epoch": 0.5169943606157598, + "grad_norm": 76.42954531141284, + "learning_rate": 3.683414634146341e-08, + "logits/chosen": -0.02600632980465889, + "logits/rejected": 0.11422056704759598, + "logps/chosen": -102.6796875, + "logps/rejected": -38.83604431152344, + "loss": 0.6332, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.05238047614693642, + "rewards/margins": 0.4671940803527832, + "rewards/rejected": -0.5195745229721069, + "step": 848 + }, + { + "epoch": 0.5176040237768633, + "grad_norm": 70.49971776592591, + "learning_rate": 3.6878048780487805e-08, + "logits/chosen": 0.038019824773073196, + "logits/rejected": 0.04546715319156647, + "logps/chosen": -92.1142807006836, + "logps/rejected": -51.81432342529297, + "loss": 0.6389, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.29714739322662354, + "rewards/margins": -0.02509160526096821, + "rewards/rejected": -0.2720557749271393, + "step": 849 + }, + { + "epoch": 0.5182136869379668, + "grad_norm": 98.01020646077737, + "learning_rate": 3.6921951219512194e-08, + "logits/chosen": -0.25814080238342285, + "logits/rejected": 0.328311562538147, + "logps/chosen": -208.58871459960938, + "logps/rejected": -94.48977661132812, + "loss": 0.5648, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.17906787991523743, + "rewards/margins": 0.3485424816608429, + "rewards/rejected": -0.16947460174560547, + "step": 850 + }, + { + "epoch": 0.5188233500990702, + "grad_norm": 59.49566034028254, + "learning_rate": 3.696585365853658e-08, + "logits/chosen": 0.04231855273246765, + "logits/rejected": 0.1485375612974167, + "logps/chosen": -116.27388000488281, + "logps/rejected": -76.1816177368164, + "loss": 0.5772, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14461486041545868, + "rewards/margins": 0.4042167067527771, + "rewards/rejected": -0.259601891040802, + "step": 851 + }, + { + "epoch": 0.5194330132601738, + "grad_norm": 77.50850284821571, + "learning_rate": 3.700975609756098e-08, + "logits/chosen": 0.15469692647457123, + "logits/rejected": -0.03396916389465332, + "logps/chosen": -231.93939208984375, + "logps/rejected": -350.9715881347656, + "loss": 0.5876, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.37171655893325806, + "rewards/margins": 0.7721027135848999, + "rewards/rejected": -0.40038609504699707, + "step": 852 + }, + { + "epoch": 0.5200426764212772, + "grad_norm": 60.73460630736764, + "learning_rate": 3.7053658536585365e-08, + "logits/chosen": 0.23656564950942993, + "logits/rejected": 0.1381596028804779, + "logps/chosen": -174.81961059570312, + "logps/rejected": -293.6786193847656, + "loss": 0.4706, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0001943148672580719, + "rewards/margins": 0.24363532662391663, + "rewards/rejected": -0.24344101548194885, + "step": 853 + }, + { + "epoch": 0.5206523395823808, + "grad_norm": 59.29308126765653, + "learning_rate": 3.7097560975609754e-08, + "logits/chosen": -0.01903049647808075, + "logits/rejected": 0.09655285626649857, + "logps/chosen": -266.18084716796875, + "logps/rejected": -188.98684692382812, + "loss": 0.5426, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3428747355937958, + "rewards/margins": 0.3217155337333679, + "rewards/rejected": 0.021159224212169647, + "step": 854 + }, + { + "epoch": 
0.5212620027434842, + "grad_norm": 67.15913121769145, + "learning_rate": 3.714146341463414e-08, + "logits/chosen": 0.1330699771642685, + "logits/rejected": 0.23801834881305695, + "logps/chosen": -152.31396484375, + "logps/rejected": -93.39073944091797, + "loss": 0.5694, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2283179610967636, + "rewards/margins": 0.4553031921386719, + "rewards/rejected": -0.22698521614074707, + "step": 855 + }, + { + "epoch": 0.5218716659045877, + "grad_norm": 62.83732358509938, + "learning_rate": 3.718536585365854e-08, + "logits/chosen": 0.05359729379415512, + "logits/rejected": 0.08941934257745743, + "logps/chosen": -55.36945343017578, + "logps/rejected": -112.3162841796875, + "loss": 0.6318, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.4063263237476349, + "rewards/margins": -0.3283662497997284, + "rewards/rejected": -0.0779600739479065, + "step": 856 + }, + { + "epoch": 0.5224813290656912, + "grad_norm": 67.01778152575903, + "learning_rate": 3.7229268292682926e-08, + "logits/chosen": 0.14143003523349762, + "logits/rejected": 0.033660512417554855, + "logps/chosen": -113.7740249633789, + "logps/rejected": -213.09373474121094, + "loss": 0.5523, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.1885804831981659, + "rewards/margins": 0.6856738328933716, + "rewards/rejected": -0.4970932900905609, + "step": 857 + }, + { + "epoch": 0.5230909922267947, + "grad_norm": 79.12985412209427, + "learning_rate": 3.7273170731707314e-08, + "logits/chosen": 0.04618055000901222, + "logits/rejected": 0.01835821568965912, + "logps/chosen": -317.59649658203125, + "logps/rejected": -311.95257568359375, + "loss": 0.5556, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.05542896315455437, + "rewards/margins": -0.06943736970424652, + "rewards/rejected": 0.01400841772556305, + "step": 858 + }, + { + "epoch": 0.5237006553878982, + "grad_norm": 63.238133753750255, + "learning_rate": 3.731707317073171e-08, + "logits/chosen": 0.09808889776468277, + "logits/rejected": 0.20892862975597382, + "logps/chosen": -304.3889465332031, + "logps/rejected": -354.45733642578125, + "loss": 0.529, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2641480565071106, + "rewards/margins": 0.3940035402774811, + "rewards/rejected": -0.1298554539680481, + "step": 859 + }, + { + "epoch": 0.5243103185490017, + "grad_norm": 59.80341584344199, + "learning_rate": 3.73609756097561e-08, + "logits/chosen": 0.18731476366519928, + "logits/rejected": 0.1431311070919037, + "logps/chosen": -176.1766357421875, + "logps/rejected": -171.20559692382812, + "loss": 0.5285, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4174979329109192, + "rewards/margins": 0.9883413910865784, + "rewards/rejected": -0.5708434581756592, + "step": 860 + }, + { + "epoch": 0.5249199817101051, + "grad_norm": 63.5129674699019, + "learning_rate": 3.7404878048780486e-08, + "logits/chosen": 0.22578765451908112, + "logits/rejected": 0.22811663150787354, + "logps/chosen": -21.193035125732422, + "logps/rejected": -17.314197540283203, + "loss": 0.5605, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.32173627614974976, + "rewards/margins": 0.3135220408439636, + "rewards/rejected": -0.6352583169937134, + "step": 861 + }, + { + "epoch": 0.5255296448712087, + "grad_norm": 62.97554727957017, + "learning_rate": 3.7448780487804874e-08, + "logits/chosen": 0.1587522029876709, + "logits/rejected": 0.171729177236557, + "logps/chosen": -9.075733184814453, + "logps/rejected": -16.870502471923828, + "loss": 0.5485, + "rewards/accuracies": 0.5, + 
"rewards/chosen": -0.12192894518375397, + "rewards/margins": 0.08783310651779175, + "rewards/rejected": -0.2097620666027069, + "step": 862 + }, + { + "epoch": 0.5261393080323121, + "grad_norm": 61.41324613191589, + "learning_rate": 3.749268292682927e-08, + "logits/chosen": 0.2167873978614807, + "logits/rejected": 0.5014666318893433, + "logps/chosen": -66.5404052734375, + "logps/rejected": -27.729721069335938, + "loss": 0.5666, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.31119388341903687, + "rewards/margins": 0.0655917152762413, + "rewards/rejected": -0.37678560614585876, + "step": 863 + }, + { + "epoch": 0.5267489711934157, + "grad_norm": 57.71637274764013, + "learning_rate": 3.753658536585366e-08, + "logits/chosen": 0.20771121978759766, + "logits/rejected": 0.20334795117378235, + "logps/chosen": -61.600059509277344, + "logps/rejected": -48.765316009521484, + "loss": 0.5433, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.10763809084892273, + "rewards/margins": 0.006966426968574524, + "rewards/rejected": -0.11460452526807785, + "step": 864 + }, + { + "epoch": 0.5273586343545191, + "grad_norm": 67.18479937320431, + "learning_rate": 3.7580487804878046e-08, + "logits/chosen": -0.12771549820899963, + "logits/rejected": 0.0017118752002716064, + "logps/chosen": -308.0665283203125, + "logps/rejected": -276.1645812988281, + "loss": 0.5406, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2535865902900696, + "rewards/margins": 0.3989185392856598, + "rewards/rejected": -0.1453319638967514, + "step": 865 + }, + { + "epoch": 0.5279682975156226, + "grad_norm": 66.86041384032525, + "learning_rate": 3.762439024390244e-08, + "logits/chosen": 0.008922770619392395, + "logits/rejected": 0.037095263600349426, + "logps/chosen": -95.94760131835938, + "logps/rejected": -49.44132995605469, + "loss": 0.5749, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.08840463310480118, + "rewards/margins": 0.4873013198375702, + "rewards/rejected": -0.3988966941833496, + "step": 866 + }, + { + "epoch": 0.5285779606767261, + "grad_norm": 56.773550955702355, + "learning_rate": 3.766829268292683e-08, + "logits/chosen": -0.13588771224021912, + "logits/rejected": -0.21735835075378418, + "logps/chosen": -71.41838073730469, + "logps/rejected": -94.79507446289062, + "loss": 0.5806, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.02211012877523899, + "rewards/margins": 0.020595155656337738, + "rewards/rejected": -0.04270528629422188, + "step": 867 + }, + { + "epoch": 0.5291876238378296, + "grad_norm": 76.66512714672785, + "learning_rate": 3.771219512195122e-08, + "logits/chosen": 0.17641451954841614, + "logits/rejected": 0.181544229388237, + "logps/chosen": -59.29175567626953, + "logps/rejected": -83.70962524414062, + "loss": 0.5483, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.016960304230451584, + "rewards/margins": 0.236171156167984, + "rewards/rejected": -0.2531314492225647, + "step": 868 + }, + { + "epoch": 0.5297972869989331, + "grad_norm": 75.57283352876863, + "learning_rate": 3.775609756097561e-08, + "logits/chosen": -0.008752023801207542, + "logits/rejected": 0.08318771421909332, + "logps/chosen": -147.72816467285156, + "logps/rejected": -135.38661193847656, + "loss": 0.6189, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.08691678941249847, + "rewards/margins": -0.1018247976899147, + "rewards/rejected": 0.18874159455299377, + "step": 869 + }, + { + "epoch": 0.5304069501600366, + "grad_norm": 64.29654345833464, + "learning_rate": 3.78e-08, + "logits/chosen": 0.11647903174161911, + 
"logits/rejected": 0.17515388131141663, + "logps/chosen": -140.824462890625, + "logps/rejected": -57.53905487060547, + "loss": 0.6441, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.11540143936872482, + "rewards/margins": 0.5147880911827087, + "rewards/rejected": -0.39938661456108093, + "step": 870 + }, + { + "epoch": 0.53101661332114, + "grad_norm": 65.75072184060713, + "learning_rate": 3.784390243902439e-08, + "logits/chosen": 0.1211961880326271, + "logits/rejected": 0.09717921912670135, + "logps/chosen": -455.9720458984375, + "logps/rejected": -421.9548645019531, + "loss": 0.5417, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0023680515587329865, + "rewards/margins": 1.077951431274414, + "rewards/rejected": -1.0755834579467773, + "step": 871 + }, + { + "epoch": 0.5316262764822436, + "grad_norm": 63.004622087496614, + "learning_rate": 3.788780487804878e-08, + "logits/chosen": -0.015010075643658638, + "logits/rejected": 0.10390684753656387, + "logps/chosen": -275.31884765625, + "logps/rejected": -142.97242736816406, + "loss": 0.5365, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.734698474407196, + "rewards/margins": 0.42196959257125854, + "rewards/rejected": 0.3127288818359375, + "step": 872 + }, + { + "epoch": 0.532235939643347, + "grad_norm": 61.66937202736482, + "learning_rate": 3.793170731707317e-08, + "logits/chosen": -0.03673422336578369, + "logits/rejected": 0.0396769754588604, + "logps/chosen": -183.277099609375, + "logps/rejected": -121.23249053955078, + "loss": 0.5385, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.21187247335910797, + "rewards/margins": -0.20256415009498596, + "rewards/rejected": -0.009308312088251114, + "step": 873 + }, + { + "epoch": 0.5328456028044506, + "grad_norm": 63.32267307609421, + "learning_rate": 3.797560975609756e-08, + "logits/chosen": -0.06537565588951111, + "logits/rejected": 0.0261433944106102, + "logps/chosen": -133.59780883789062, + "logps/rejected": -172.2808837890625, + "loss": 0.5802, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.053683459758758545, + "rewards/margins": 0.19040435552597046, + "rewards/rejected": -0.13672089576721191, + "step": 874 + }, + { + "epoch": 0.533455265965554, + "grad_norm": 66.27506589205731, + "learning_rate": 3.801951219512195e-08, + "logits/chosen": 0.07901205122470856, + "logits/rejected": 0.07933249324560165, + "logps/chosen": -52.86782455444336, + "logps/rejected": -62.64750289916992, + "loss": 0.5814, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.1269572675228119, + "rewards/margins": -0.14318525791168213, + "rewards/rejected": 0.016227995976805687, + "step": 875 + }, + { + "epoch": 0.5340649291266575, + "grad_norm": 54.33717921908857, + "learning_rate": 3.8063414634146344e-08, + "logits/chosen": 0.12236712872982025, + "logits/rejected": 0.09126129746437073, + "logps/chosen": -13.379688262939453, + "logps/rejected": -44.06802749633789, + "loss": 0.4841, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10507187992334366, + "rewards/margins": 0.14723534882068634, + "rewards/rejected": -0.2523072361946106, + "step": 876 + }, + { + "epoch": 0.534674592287761, + "grad_norm": 66.47825394805919, + "learning_rate": 3.810731707317073e-08, + "logits/chosen": 0.06389956176280975, + "logits/rejected": 0.28784507513046265, + "logps/chosen": -306.0577087402344, + "logps/rejected": -66.86945343017578, + "loss": 0.6358, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.242964506149292, + "rewards/margins": 0.01772509515285492, + "rewards/rejected": -0.2606896460056305, + "step": 877 
+ }, + { + "epoch": 0.5352842554488645, + "grad_norm": 74.50690460636137, + "learning_rate": 3.815121951219512e-08, + "logits/chosen": 0.3442728519439697, + "logits/rejected": 0.30142903327941895, + "logps/chosen": -226.09994506835938, + "logps/rejected": -223.4921417236328, + "loss": 0.6019, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.7171735167503357, + "rewards/margins": 0.784739077091217, + "rewards/rejected": -0.06756556034088135, + "step": 878 + }, + { + "epoch": 0.535893918609968, + "grad_norm": 67.43150128420167, + "learning_rate": 3.819512195121951e-08, + "logits/chosen": 0.09721434116363525, + "logits/rejected": 0.04542946070432663, + "logps/chosen": -130.51011657714844, + "logps/rejected": -212.75540161132812, + "loss": 0.5525, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.28920498490333557, + "rewards/margins": 0.26738715171813965, + "rewards/rejected": -0.5565921664237976, + "step": 879 + }, + { + "epoch": 0.5365035817710715, + "grad_norm": 64.15874977445931, + "learning_rate": 3.8239024390243905e-08, + "logits/chosen": 0.04385033994913101, + "logits/rejected": 0.07755731046199799, + "logps/chosen": -23.495189666748047, + "logps/rejected": -20.148441314697266, + "loss": 0.5657, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.47862696647644043, + "rewards/margins": 0.13002800941467285, + "rewards/rejected": -0.6086549758911133, + "step": 880 + }, + { + "epoch": 0.5371132449321749, + "grad_norm": 53.00174423455976, + "learning_rate": 3.828292682926829e-08, + "logits/chosen": 0.2399432361125946, + "logits/rejected": 0.07920615375041962, + "logps/chosen": -80.57872009277344, + "logps/rejected": -93.03114318847656, + "loss": 0.5224, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.13346940279006958, + "rewards/margins": 0.15735048055648804, + "rewards/rejected": -0.2908198833465576, + "step": 881 + }, + { + "epoch": 0.5377229080932785, + "grad_norm": 58.27008171272014, + "learning_rate": 3.832682926829268e-08, + "logits/chosen": 0.2204049527645111, + "logits/rejected": 0.2619776725769043, + "logps/chosen": -112.0592269897461, + "logps/rejected": -84.74361419677734, + "loss": 0.5639, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.29275035858154297, + "rewards/margins": 0.8456393480300903, + "rewards/rejected": -0.5528889298439026, + "step": 882 + }, + { + "epoch": 0.5383325712543819, + "grad_norm": 62.09697051583322, + "learning_rate": 3.8370731707317076e-08, + "logits/chosen": 0.21296963095664978, + "logits/rejected": 0.2081933617591858, + "logps/chosen": -141.5391082763672, + "logps/rejected": -146.18832397460938, + "loss": 0.5585, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11810910701751709, + "rewards/margins": 0.5128703117370605, + "rewards/rejected": -0.39476117491722107, + "step": 883 + }, + { + "epoch": 0.5389422344154855, + "grad_norm": 61.80920607763717, + "learning_rate": 3.8414634146341465e-08, + "logits/chosen": 0.2989388704299927, + "logits/rejected": 0.336221843957901, + "logps/chosen": -19.220060348510742, + "logps/rejected": -17.0111083984375, + "loss": 0.6451, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9095132350921631, + "rewards/margins": 0.013400629162788391, + "rewards/rejected": -0.9229139089584351, + "step": 884 + }, + { + "epoch": 0.539551897576589, + "grad_norm": 58.934770284041925, + "learning_rate": 3.845853658536585e-08, + "logits/chosen": 0.2550952136516571, + "logits/rejected": 0.28509005904197693, + "logps/chosen": -191.87539672851562, + "logps/rejected": -186.66815185546875, + "loss": 0.5078, + 
"rewards/accuracies": 1.0, + "rewards/chosen": 0.20852214097976685, + "rewards/margins": 0.72804856300354, + "rewards/rejected": -0.5195264220237732, + "step": 885 + }, + { + "epoch": 0.5401615607376924, + "grad_norm": 96.03745592632615, + "learning_rate": 3.850243902439024e-08, + "logits/chosen": 0.2564608156681061, + "logits/rejected": 0.19282256066799164, + "logps/chosen": -195.31539916992188, + "logps/rejected": -199.86245727539062, + "loss": 0.4757, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.08128748834133148, + "rewards/margins": 0.8918872475624084, + "rewards/rejected": -0.8105998039245605, + "step": 886 + }, + { + "epoch": 0.540771223898796, + "grad_norm": 70.6692264776895, + "learning_rate": 3.8546341463414636e-08, + "logits/chosen": 0.17313377559185028, + "logits/rejected": 0.272353857755661, + "logps/chosen": -82.64848327636719, + "logps/rejected": -50.599117279052734, + "loss": 0.6496, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.050652459263801575, + "rewards/margins": -0.05836388096213341, + "rewards/rejected": 0.10901632905006409, + "step": 887 + }, + { + "epoch": 0.5413808870598994, + "grad_norm": 69.05191052518173, + "learning_rate": 3.8590243902439025e-08, + "logits/chosen": -0.11841931194067001, + "logits/rejected": 0.1710411161184311, + "logps/chosen": -233.16030883789062, + "logps/rejected": -112.22335052490234, + "loss": 0.5506, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4198493957519531, + "rewards/margins": 0.03017844259738922, + "rewards/rejected": -0.45002782344818115, + "step": 888 + }, + { + "epoch": 0.5419905502210028, + "grad_norm": 72.33339946514252, + "learning_rate": 3.863414634146341e-08, + "logits/chosen": 0.164263516664505, + "logits/rejected": 0.2504717707633972, + "logps/chosen": -300.3990783691406, + "logps/rejected": -249.3133544921875, + "loss": 0.617, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.005677081644535065, + "rewards/margins": 0.24519182741641998, + "rewards/rejected": -0.23951473832130432, + "step": 889 + }, + { + "epoch": 0.5426002133821064, + "grad_norm": 61.011371088142006, + "learning_rate": 3.867804878048781e-08, + "logits/chosen": 0.14612877368927002, + "logits/rejected": 0.12802213430404663, + "logps/chosen": -198.5043487548828, + "logps/rejected": -97.168701171875, + "loss": 0.4808, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4118140935897827, + "rewards/margins": 1.3088165521621704, + "rewards/rejected": -0.8970025777816772, + "step": 890 + }, + { + "epoch": 0.5432098765432098, + "grad_norm": 70.19987568244505, + "learning_rate": 3.8721951219512197e-08, + "logits/chosen": 0.07287600636482239, + "logits/rejected": 0.051877789199352264, + "logps/chosen": -293.9187927246094, + "logps/rejected": -132.78085327148438, + "loss": 0.6426, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4249412417411804, + "rewards/margins": 0.5122901797294617, + "rewards/rejected": -0.08734893053770065, + "step": 891 + }, + { + "epoch": 0.5438195397043134, + "grad_norm": 60.54433657791981, + "learning_rate": 3.8765853658536585e-08, + "logits/chosen": 0.021549783647060394, + "logits/rejected": 0.06050729751586914, + "logps/chosen": -61.32946014404297, + "logps/rejected": -33.854331970214844, + "loss": 0.528, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.10759210586547852, + "rewards/margins": 0.653812050819397, + "rewards/rejected": -0.7614041566848755, + "step": 892 + }, + { + "epoch": 0.5444292028654169, + "grad_norm": 65.48766296584395, + "learning_rate": 3.8809756097560973e-08, + "logits/chosen": 
0.05073666572570801, + "logits/rejected": -0.06363290548324585, + "logps/chosen": -230.37246704101562, + "logps/rejected": -273.86712646484375, + "loss": 0.5731, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.08037916570901871, + "rewards/margins": 0.78180992603302, + "rewards/rejected": -0.7014308571815491, + "step": 893 + }, + { + "epoch": 0.5450388660265203, + "grad_norm": 70.97853505550982, + "learning_rate": 3.885365853658537e-08, + "logits/chosen": 0.1399250328540802, + "logits/rejected": 0.22606408596038818, + "logps/chosen": -87.23171997070312, + "logps/rejected": -57.06981658935547, + "loss": 0.5862, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.46670445799827576, + "rewards/margins": 0.7650110721588135, + "rewards/rejected": -0.2983066439628601, + "step": 894 + }, + { + "epoch": 0.5456485291876239, + "grad_norm": 63.7807241221083, + "learning_rate": 3.889756097560976e-08, + "logits/chosen": 0.0728025808930397, + "logits/rejected": 0.027045216411352158, + "logps/chosen": -124.65536499023438, + "logps/rejected": -142.7744140625, + "loss": 0.5279, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08896440267562866, + "rewards/margins": 0.5423890352249146, + "rewards/rejected": -0.6313533782958984, + "step": 895 + }, + { + "epoch": 0.5462581923487273, + "grad_norm": 96.02331057254527, + "learning_rate": 3.8941463414634145e-08, + "logits/chosen": 0.1260644495487213, + "logits/rejected": 0.12321975827217102, + "logps/chosen": -155.9744415283203, + "logps/rejected": -182.54986572265625, + "loss": 0.5877, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.024586722254753113, + "rewards/margins": 0.18018847703933716, + "rewards/rejected": -0.15560175478458405, + "step": 896 + }, + { + "epoch": 0.5468678555098309, + "grad_norm": 66.8006183697143, + "learning_rate": 3.898536585365854e-08, + "logits/chosen": 0.02144142985343933, + "logits/rejected": 0.5131258368492126, + "logps/chosen": -351.69073486328125, + "logps/rejected": -134.90220642089844, + "loss": 0.5518, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5111550688743591, + "rewards/margins": 0.5789368748664856, + "rewards/rejected": -0.06778179109096527, + "step": 897 + }, + { + "epoch": 0.5474775186709343, + "grad_norm": 72.3824971990929, + "learning_rate": 3.902926829268293e-08, + "logits/chosen": 0.08979152888059616, + "logits/rejected": 0.06702014803886414, + "logps/chosen": -48.506126403808594, + "logps/rejected": -31.855939865112305, + "loss": 0.57, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.40456414222717285, + "rewards/margins": 0.7067840695381165, + "rewards/rejected": -1.1113481521606445, + "step": 898 + }, + { + "epoch": 0.5480871818320378, + "grad_norm": 65.49358279038536, + "learning_rate": 3.907317073170732e-08, + "logits/chosen": 0.13155023753643036, + "logits/rejected": 0.16425983607769012, + "logps/chosen": -167.928955078125, + "logps/rejected": -142.37261962890625, + "loss": 0.5711, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.509557843208313, + "rewards/margins": 0.36642348766326904, + "rewards/rejected": 0.14313434064388275, + "step": 899 + }, + { + "epoch": 0.5486968449931413, + "grad_norm": 72.51322204893103, + "learning_rate": 3.9117073170731705e-08, + "logits/chosen": 0.16980314254760742, + "logits/rejected": 0.1725219041109085, + "logps/chosen": -122.37548065185547, + "logps/rejected": -105.88607788085938, + "loss": 0.5305, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.15686455368995667, + "rewards/margins": 0.8508955240249634, + "rewards/rejected": -1.0077600479125977, 
+ "step": 900 + }, + { + "epoch": 0.5493065081542448, + "grad_norm": 66.56509400685799, + "learning_rate": 3.91609756097561e-08, + "logits/chosen": 0.06980474293231964, + "logits/rejected": 0.1476413458585739, + "logps/chosen": -149.52793884277344, + "logps/rejected": -30.278118133544922, + "loss": 0.5101, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.29264557361602783, + "rewards/margins": 0.5442541837692261, + "rewards/rejected": -0.25160861015319824, + "step": 901 + }, + { + "epoch": 0.5499161713153483, + "grad_norm": 83.17406807513954, + "learning_rate": 3.920487804878049e-08, + "logits/chosen": -0.29419541358947754, + "logits/rejected": 0.16222888231277466, + "logps/chosen": -263.3582763671875, + "logps/rejected": -228.7915496826172, + "loss": 0.6433, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.2893264889717102, + "rewards/margins": -0.17224933207035065, + "rewards/rejected": -0.11707716435194016, + "step": 902 + }, + { + "epoch": 0.5505258344764518, + "grad_norm": 68.37619134528198, + "learning_rate": 3.924878048780488e-08, + "logits/chosen": 0.2935391962528229, + "logits/rejected": 0.32337260246276855, + "logps/chosen": -30.69774627685547, + "logps/rejected": -38.217796325683594, + "loss": 0.5518, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.022282440215349197, + "rewards/margins": 0.13705119490623474, + "rewards/rejected": -0.15933363139629364, + "step": 903 + }, + { + "epoch": 0.5511354976375552, + "grad_norm": 70.41511856903495, + "learning_rate": 3.929268292682927e-08, + "logits/chosen": 0.33251404762268066, + "logits/rejected": 0.22450914978981018, + "logps/chosen": -298.6009521484375, + "logps/rejected": -312.72027587890625, + "loss": 0.595, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.24860572814941406, + "rewards/margins": 1.0966771841049194, + "rewards/rejected": -0.8480713963508606, + "step": 904 + }, + { + "epoch": 0.5517451607986588, + "grad_norm": 65.6598566284955, + "learning_rate": 3.933658536585366e-08, + "logits/chosen": -0.023178113624453545, + "logits/rejected": 0.34410110116004944, + "logps/chosen": -171.30035400390625, + "logps/rejected": -213.72647094726562, + "loss": 0.5795, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.48885175585746765, + "rewards/margins": 0.25117263197898865, + "rewards/rejected": 0.23767909407615662, + "step": 905 + }, + { + "epoch": 0.5523548239597622, + "grad_norm": 70.87554003593823, + "learning_rate": 3.938048780487805e-08, + "logits/chosen": 0.1324978768825531, + "logits/rejected": 0.06611113250255585, + "logps/chosen": -20.51483917236328, + "logps/rejected": -63.52241897583008, + "loss": 0.5555, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.13383133709430695, + "rewards/margins": 0.04824209585785866, + "rewards/rejected": 0.08558925241231918, + "step": 906 + }, + { + "epoch": 0.5529644871208658, + "grad_norm": 67.18938327107257, + "learning_rate": 3.942439024390244e-08, + "logits/chosen": -0.09553372114896774, + "logits/rejected": 0.03629935532808304, + "logps/chosen": -319.3539733886719, + "logps/rejected": -232.38819885253906, + "loss": 0.5308, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5030369758605957, + "rewards/margins": 0.9751700162887573, + "rewards/rejected": -0.4721331000328064, + "step": 907 + }, + { + "epoch": 0.5535741502819692, + "grad_norm": 63.656136051102344, + "learning_rate": 3.946829268292683e-08, + "logits/chosen": 0.09303402900695801, + "logits/rejected": 0.11336948722600937, + "logps/chosen": -119.49852752685547, + "logps/rejected": -111.59620666503906, + "loss": 
0.5668, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.33455780148506165, + "rewards/margins": 0.30913645029067993, + "rewards/rejected": 0.025421354919672012, + "step": 908 + }, + { + "epoch": 0.5541838134430727, + "grad_norm": 69.6908758824901, + "learning_rate": 3.951219512195122e-08, + "logits/chosen": -0.09655658900737762, + "logits/rejected": -0.21397748589515686, + "logps/chosen": -96.808349609375, + "logps/rejected": -326.0218505859375, + "loss": 0.5141, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.17469826340675354, + "rewards/margins": 0.20265799760818481, + "rewards/rejected": -0.027959734201431274, + "step": 909 + }, + { + "epoch": 0.5547934766041762, + "grad_norm": 89.759948990915, + "learning_rate": 3.955609756097561e-08, + "logits/chosen": 0.13062049448490143, + "logits/rejected": 0.11106927692890167, + "logps/chosen": -349.80029296875, + "logps/rejected": -142.99050903320312, + "loss": 0.6194, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4324016571044922, + "rewards/margins": 0.0628436878323555, + "rewards/rejected": 0.3695579469203949, + "step": 910 + }, + { + "epoch": 0.5554031397652797, + "grad_norm": 69.4579941607974, + "learning_rate": 3.9600000000000004e-08, + "logits/chosen": 0.07106968015432358, + "logits/rejected": 0.26455602049827576, + "logps/chosen": -244.58111572265625, + "logps/rejected": -200.248046875, + "loss": 0.5304, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.20474356412887573, + "rewards/margins": 0.17258234322071075, + "rewards/rejected": 0.032161224633455276, + "step": 911 + }, + { + "epoch": 0.5560128029263832, + "grad_norm": 76.18955954738001, + "learning_rate": 3.964390243902439e-08, + "logits/chosen": -0.20763921737670898, + "logits/rejected": -0.17405462265014648, + "logps/chosen": -152.68344116210938, + "logps/rejected": -226.15586853027344, + "loss": 0.612, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05544380843639374, + "rewards/margins": 0.907264232635498, + "rewards/rejected": -0.8518204092979431, + "step": 912 + }, + { + "epoch": 0.5566224660874867, + "grad_norm": 67.28490303937213, + "learning_rate": 3.968780487804878e-08, + "logits/chosen": 0.06502640247344971, + "logits/rejected": 0.2051590085029602, + "logps/chosen": -424.8998107910156, + "logps/rejected": -411.7484130859375, + "loss": 0.5342, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9498125314712524, + "rewards/margins": 1.6604827642440796, + "rewards/rejected": -0.7106701731681824, + "step": 913 + }, + { + "epoch": 0.5572321292485901, + "grad_norm": 70.57081369071896, + "learning_rate": 3.973170731707317e-08, + "logits/chosen": 0.2460608035326004, + "logits/rejected": 0.15885694324970245, + "logps/chosen": -14.876495361328125, + "logps/rejected": -28.577953338623047, + "loss": 0.6335, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.39212724566459656, + "rewards/margins": 0.036636412143707275, + "rewards/rejected": -0.42876365780830383, + "step": 914 + }, + { + "epoch": 0.5578417924096937, + "grad_norm": 59.11274273520954, + "learning_rate": 3.9775609756097564e-08, + "logits/chosen": 0.0618540458381176, + "logits/rejected": 0.059687189757823944, + "logps/chosen": -68.97518157958984, + "logps/rejected": -85.0937271118164, + "loss": 0.5981, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.006830386817455292, + "rewards/margins": 0.06073179841041565, + "rewards/rejected": -0.06756218522787094, + "step": 915 + }, + { + "epoch": 0.5584514555707971, + "grad_norm": 51.8397678888205, + "learning_rate": 3.981951219512195e-08, + "logits/chosen": 
0.2564215660095215, + "logits/rejected": 0.2258543074131012, + "logps/chosen": -18.547195434570312, + "logps/rejected": -23.24953842163086, + "loss": 0.4998, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3841609060764313, + "rewards/margins": 0.4379531145095825, + "rewards/rejected": -0.8221140503883362, + "step": 916 + }, + { + "epoch": 0.5590611187319007, + "grad_norm": 65.88073805729441, + "learning_rate": 3.986341463414634e-08, + "logits/chosen": 0.18171361088752747, + "logits/rejected": 0.0895804762840271, + "logps/chosen": -57.35551071166992, + "logps/rejected": -79.67073822021484, + "loss": 0.5368, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.07624735683202744, + "rewards/margins": 0.8581210374832153, + "rewards/rejected": -0.7818737030029297, + "step": 917 + }, + { + "epoch": 0.5596707818930041, + "grad_norm": 62.610922622098954, + "learning_rate": 3.9907317073170736e-08, + "logits/chosen": -0.21924518048763275, + "logits/rejected": -0.3091610074043274, + "logps/chosen": -234.76693725585938, + "logps/rejected": -350.0892028808594, + "loss": 0.6026, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.06699717789888382, + "rewards/margins": 0.21809442341327667, + "rewards/rejected": -0.2850916087627411, + "step": 918 + }, + { + "epoch": 0.5602804450541076, + "grad_norm": 51.50259242433927, + "learning_rate": 3.9951219512195124e-08, + "logits/chosen": 0.0463281124830246, + "logits/rejected": 0.09033738076686859, + "logps/chosen": -159.4967803955078, + "logps/rejected": -67.3938217163086, + "loss": 0.4737, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.40997254848480225, + "rewards/margins": 0.6638243198394775, + "rewards/rejected": -0.2538517713546753, + "step": 919 + }, + { + "epoch": 0.5608901082152111, + "grad_norm": 66.66138931593925, + "learning_rate": 3.999512195121951e-08, + "logits/chosen": -0.0645560771226883, + "logits/rejected": 0.14277854561805725, + "logps/chosen": -244.21054077148438, + "logps/rejected": -223.73312377929688, + "loss": 0.5307, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.47790470719337463, + "rewards/margins": 0.6905884742736816, + "rewards/rejected": -0.21268370747566223, + "step": 920 + }, + { + "epoch": 0.5614997713763146, + "grad_norm": 67.47433297965408, + "learning_rate": 4.00390243902439e-08, + "logits/chosen": 0.3004991412162781, + "logits/rejected": 0.3832394778728485, + "logps/chosen": -248.2864532470703, + "logps/rejected": -215.03794860839844, + "loss": 0.5642, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5374934673309326, + "rewards/margins": 0.7683519721031189, + "rewards/rejected": -0.23085854947566986, + "step": 921 + }, + { + "epoch": 0.5621094345374181, + "grad_norm": 54.94697086770058, + "learning_rate": 4.0082926829268296e-08, + "logits/chosen": -0.0013063345104455948, + "logits/rejected": 0.19500063359737396, + "logps/chosen": -227.93211364746094, + "logps/rejected": -151.1141357421875, + "loss": 0.4935, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5804883241653442, + "rewards/margins": 0.4667474031448364, + "rewards/rejected": 0.1137409657239914, + "step": 922 + }, + { + "epoch": 0.5627190976985216, + "grad_norm": 64.04981408028173, + "learning_rate": 4.0126829268292684e-08, + "logits/chosen": 0.07343435287475586, + "logits/rejected": -0.07359534502029419, + "logps/chosen": -130.68255615234375, + "logps/rejected": -151.26487731933594, + "loss": 0.5517, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20980963110923767, + "rewards/margins": 0.731520414352417, + "rewards/rejected": 
-0.5217108726501465, + "step": 923 + }, + { + "epoch": 0.563328760859625, + "grad_norm": 60.291269856199136, + "learning_rate": 4.017073170731707e-08, + "logits/chosen": -0.09406832605600357, + "logits/rejected": 0.11262158304452896, + "logps/chosen": -251.36114501953125, + "logps/rejected": -160.82583618164062, + "loss": 0.5505, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.004199355840682983, + "rewards/margins": 0.11875885725021362, + "rewards/rejected": -0.11455950140953064, + "step": 924 + }, + { + "epoch": 0.5639384240207286, + "grad_norm": 65.0621827815013, + "learning_rate": 4.021463414634147e-08, + "logits/chosen": 0.11223086714744568, + "logits/rejected": -0.03253212571144104, + "logps/chosen": -41.525733947753906, + "logps/rejected": -84.82911682128906, + "loss": 0.5188, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3485022783279419, + "rewards/margins": 0.012273922562599182, + "rewards/rejected": -0.3607761859893799, + "step": 925 + }, + { + "epoch": 0.564548087181832, + "grad_norm": 56.796599648277606, + "learning_rate": 4.0258536585365856e-08, + "logits/chosen": 0.1416153758764267, + "logits/rejected": 0.15603217482566833, + "logps/chosen": -245.56536865234375, + "logps/rejected": -110.10095977783203, + "loss": 0.4634, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6076117753982544, + "rewards/margins": 1.1378636360168457, + "rewards/rejected": -0.5302518606185913, + "step": 926 + }, + { + "epoch": 0.5651577503429356, + "grad_norm": 67.01272873587294, + "learning_rate": 4.0302439024390244e-08, + "logits/chosen": 0.051580868661403656, + "logits/rejected": 0.20569553971290588, + "logps/chosen": -174.1630859375, + "logps/rejected": -179.910888671875, + "loss": 0.6088, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3665463924407959, + "rewards/margins": 0.7571026086807251, + "rewards/rejected": -0.390556275844574, + "step": 927 + }, + { + "epoch": 0.565767413504039, + "grad_norm": 66.70958922544808, + "learning_rate": 4.034634146341463e-08, + "logits/chosen": 0.1787756085395813, + "logits/rejected": 0.1766122579574585, + "logps/chosen": -135.0958251953125, + "logps/rejected": -103.45654296875, + "loss": 0.5702, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.36315441131591797, + "rewards/margins": 0.9478495121002197, + "rewards/rejected": -0.5846951603889465, + "step": 928 + }, + { + "epoch": 0.5663770766651425, + "grad_norm": 78.72233349624145, + "learning_rate": 4.039024390243903e-08, + "logits/chosen": 0.16744698584079742, + "logits/rejected": -0.0412500724196434, + "logps/chosen": -72.60401916503906, + "logps/rejected": -136.37399291992188, + "loss": 0.5395, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0803537592291832, + "rewards/margins": 0.4975617229938507, + "rewards/rejected": -0.5779154896736145, + "step": 929 + }, + { + "epoch": 0.566986739826246, + "grad_norm": 65.98584845026838, + "learning_rate": 4.0434146341463416e-08, + "logits/chosen": -0.10447554290294647, + "logits/rejected": -0.03891365975141525, + "logps/chosen": -291.3166809082031, + "logps/rejected": -287.75225830078125, + "loss": 0.5893, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.45769044756889343, + "rewards/margins": 0.3405483365058899, + "rewards/rejected": 0.11714211106300354, + "step": 930 + }, + { + "epoch": 0.5675964029873495, + "grad_norm": 58.353142155605795, + "learning_rate": 4.0478048780487805e-08, + "logits/chosen": 0.2372773289680481, + "logits/rejected": 0.10270366817712784, + "logps/chosen": -177.44635009765625, + "logps/rejected": -283.463623046875, + 
"loss": 0.5667, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.37482908368110657, + "rewards/margins": 0.3479050099849701, + "rewards/rejected": 0.026924047619104385, + "step": 931 + }, + { + "epoch": 0.5682060661484529, + "grad_norm": 72.97153861954438, + "learning_rate": 4.0521951219512186e-08, + "logits/chosen": 0.09728141129016876, + "logits/rejected": 0.06666161864995956, + "logps/chosen": -266.215087890625, + "logps/rejected": -192.38037109375, + "loss": 0.5327, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.8658161163330078, + "rewards/margins": 0.7490353584289551, + "rewards/rejected": 0.11678075790405273, + "step": 932 + }, + { + "epoch": 0.5688157293095565, + "grad_norm": 51.366865251220304, + "learning_rate": 4.056585365853658e-08, + "logits/chosen": 0.3116620182991028, + "logits/rejected": 0.25762253999710083, + "logps/chosen": -179.5105743408203, + "logps/rejected": -229.45407104492188, + "loss": 0.4982, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1297607719898224, + "rewards/margins": 1.1740977764129639, + "rewards/rejected": -1.3038586378097534, + "step": 933 + }, + { + "epoch": 0.5694253924706599, + "grad_norm": 70.49128080381824, + "learning_rate": 4.060975609756097e-08, + "logits/chosen": -0.3792869448661804, + "logits/rejected": 0.07428139448165894, + "logps/chosen": -390.8101806640625, + "logps/rejected": -231.0935516357422, + "loss": 0.5915, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7346107959747314, + "rewards/margins": 0.936454176902771, + "rewards/rejected": -0.20184330642223358, + "step": 934 + }, + { + "epoch": 0.5700350556317635, + "grad_norm": 76.78091355138127, + "learning_rate": 4.065365853658536e-08, + "logits/chosen": 0.19513678550720215, + "logits/rejected": 0.292969286441803, + "logps/chosen": -280.4601745605469, + "logps/rejected": -232.74989318847656, + "loss": 0.5518, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.11097672581672668, + "rewards/margins": 0.47675424814224243, + "rewards/rejected": -0.36577752232551575, + "step": 935 + }, + { + "epoch": 0.5706447187928669, + "grad_norm": 83.47007403711588, + "learning_rate": 4.069756097560975e-08, + "logits/chosen": 0.40284186601638794, + "logits/rejected": 0.1840941309928894, + "logps/chosen": -217.3123779296875, + "logps/rejected": -430.5103759765625, + "loss": 0.5134, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9091841578483582, + "rewards/margins": 1.3295221328735352, + "rewards/rejected": -0.4203380346298218, + "step": 936 + }, + { + "epoch": 0.5712543819539704, + "grad_norm": 60.346002831050065, + "learning_rate": 4.074146341463414e-08, + "logits/chosen": 0.17786885797977448, + "logits/rejected": 0.19254018366336823, + "logps/chosen": -221.40101623535156, + "logps/rejected": -169.64413452148438, + "loss": 0.5908, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.26896461844444275, + "rewards/margins": 0.8941740989685059, + "rewards/rejected": -0.6252095699310303, + "step": 937 + }, + { + "epoch": 0.5718640451150739, + "grad_norm": 70.27171933319133, + "learning_rate": 4.078536585365853e-08, + "logits/chosen": 0.22855284810066223, + "logits/rejected": 0.2680366635322571, + "logps/chosen": -60.72048568725586, + "logps/rejected": -69.85613250732422, + "loss": 0.6047, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.02325417473912239, + "rewards/margins": 0.02311794087290764, + "rewards/rejected": 0.0001362226903438568, + "step": 938 + }, + { + "epoch": 0.5724737082761774, + "grad_norm": 75.11512603751487, + "learning_rate": 4.082926829268292e-08, + 
"logits/chosen": -0.007381145842373371, + "logits/rejected": 0.12239038944244385, + "logps/chosen": -203.8936767578125, + "logps/rejected": -116.77022552490234, + "loss": 0.5919, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.07102936506271362, + "rewards/margins": 0.3084529638290405, + "rewards/rejected": -0.2374236136674881, + "step": 939 + }, + { + "epoch": 0.5730833714372809, + "grad_norm": 77.50486609865688, + "learning_rate": 4.087317073170731e-08, + "logits/chosen": -0.046913955360651016, + "logits/rejected": 0.12509378790855408, + "logps/chosen": -213.4773406982422, + "logps/rejected": -218.0093994140625, + "loss": 0.643, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.20496401190757751, + "rewards/margins": -0.17977844178676605, + "rewards/rejected": -0.025185570120811462, + "step": 940 + }, + { + "epoch": 0.5736930345983844, + "grad_norm": 91.34440591980292, + "learning_rate": 4.09170731707317e-08, + "logits/chosen": 0.04759877920150757, + "logits/rejected": 0.032981082797050476, + "logps/chosen": -256.04766845703125, + "logps/rejected": -182.34393310546875, + "loss": 0.5821, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.005273975431919098, + "rewards/margins": 0.3753475844860077, + "rewards/rejected": -0.3700735867023468, + "step": 941 + }, + { + "epoch": 0.5743026977594878, + "grad_norm": 70.78543660509564, + "learning_rate": 4.096097560975609e-08, + "logits/chosen": 0.031253427267074585, + "logits/rejected": 0.09773498773574829, + "logps/chosen": -184.8252410888672, + "logps/rejected": -68.20289611816406, + "loss": 0.6523, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.12901097536087036, + "rewards/margins": 0.34820958971977234, + "rewards/rejected": -0.21919861435890198, + "step": 942 + }, + { + "epoch": 0.5749123609205914, + "grad_norm": 61.64130036814762, + "learning_rate": 4.1004878048780485e-08, + "logits/chosen": 0.08989404141902924, + "logits/rejected": 0.117643803358078, + "logps/chosen": -237.77716064453125, + "logps/rejected": -140.23069763183594, + "loss": 0.5281, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4668736159801483, + "rewards/margins": 0.7956620454788208, + "rewards/rejected": -0.3287883698940277, + "step": 943 + }, + { + "epoch": 0.5755220240816948, + "grad_norm": 68.6945218222372, + "learning_rate": 4.104878048780487e-08, + "logits/chosen": 0.20969705283641815, + "logits/rejected": 0.17862199246883392, + "logps/chosen": -64.42387390136719, + "logps/rejected": -90.00393676757812, + "loss": 0.5093, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11434374749660492, + "rewards/margins": 0.636167049407959, + "rewards/rejected": -0.7505108118057251, + "step": 944 + }, + { + "epoch": 0.5761316872427984, + "grad_norm": 92.56663195856316, + "learning_rate": 4.109268292682926e-08, + "logits/chosen": 0.23893681168556213, + "logits/rejected": 0.30128633975982666, + "logps/chosen": -254.2822265625, + "logps/rejected": -179.03640747070312, + "loss": 0.7023, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.20787009596824646, + "rewards/margins": -0.08017578721046448, + "rewards/rejected": 0.2880459129810333, + "step": 945 + }, + { + "epoch": 0.5767413504039018, + "grad_norm": 70.43087604972249, + "learning_rate": 4.113658536585365e-08, + "logits/chosen": 0.18003559112548828, + "logits/rejected": 0.21685735881328583, + "logps/chosen": -154.2003936767578, + "logps/rejected": -217.83892822265625, + "loss": 0.5489, + "rewards/accuracies": 0.5, + "rewards/chosen": -8.031725883483887e-05, + "rewards/margins": 0.23531605303287506, + 
"rewards/rejected": -0.2353963553905487, + "step": 946 + }, + { + "epoch": 0.5773510135650053, + "grad_norm": 71.21231053272065, + "learning_rate": 4.1180487804878045e-08, + "logits/chosen": -0.06945399940013885, + "logits/rejected": -0.06084947660565376, + "logps/chosen": -96.3125228881836, + "logps/rejected": -100.3994369506836, + "loss": 0.6224, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5278519988059998, + "rewards/margins": 0.04955005645751953, + "rewards/rejected": -0.5774020552635193, + "step": 947 + }, + { + "epoch": 0.5779606767261088, + "grad_norm": 65.26086714554752, + "learning_rate": 4.1224390243902433e-08, + "logits/chosen": 0.22658458352088928, + "logits/rejected": 0.12774381041526794, + "logps/chosen": -270.35333251953125, + "logps/rejected": -205.83335876464844, + "loss": 0.5235, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.33648747205734253, + "rewards/margins": 0.10773984342813492, + "rewards/rejected": 0.2287476658821106, + "step": 948 + }, + { + "epoch": 0.5785703398872123, + "grad_norm": 81.54389536975877, + "learning_rate": 4.126829268292682e-08, + "logits/chosen": 0.04804135859012604, + "logits/rejected": 0.0008035674691200256, + "logps/chosen": -120.53761291503906, + "logps/rejected": -125.85433959960938, + "loss": 0.5928, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.24643567204475403, + "rewards/margins": -0.33742666244506836, + "rewards/rejected": 0.09099098294973373, + "step": 949 + }, + { + "epoch": 0.5791800030483159, + "grad_norm": 82.12086780288085, + "learning_rate": 4.131219512195122e-08, + "logits/chosen": 0.2091103047132492, + "logits/rejected": -0.04949437081813812, + "logps/chosen": -201.59471130371094, + "logps/rejected": -286.5262756347656, + "loss": 0.521, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.27747267484664917, + "rewards/margins": 1.1522293090820312, + "rewards/rejected": -0.8747566342353821, + "step": 950 + }, + { + "epoch": 0.5797896662094193, + "grad_norm": 69.04790105659762, + "learning_rate": 4.1356097560975605e-08, + "logits/chosen": 0.25812891125679016, + "logits/rejected": 0.03673887252807617, + "logps/chosen": -91.65919494628906, + "logps/rejected": -90.58918762207031, + "loss": 0.6029, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.04524761438369751, + "rewards/margins": 0.2664438784122467, + "rewards/rejected": -0.3116915225982666, + "step": 951 + }, + { + "epoch": 0.5803993293705227, + "grad_norm": 87.67035025167849, + "learning_rate": 4.1399999999999994e-08, + "logits/chosen": 0.06877303123474121, + "logits/rejected": 0.15757039189338684, + "logps/chosen": -244.7814178466797, + "logps/rejected": -294.79486083984375, + "loss": 0.687, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5146667957305908, + "rewards/margins": 0.38214778900146484, + "rewards/rejected": -0.8968145847320557, + "step": 952 + }, + { + "epoch": 0.5810089925316263, + "grad_norm": 63.742718134835904, + "learning_rate": 4.144390243902438e-08, + "logits/chosen": -0.04098774120211601, + "logits/rejected": 0.09652681648731232, + "logps/chosen": -222.15682983398438, + "logps/rejected": -176.15093994140625, + "loss": 0.4896, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.22891703248023987, + "rewards/margins": 0.5597389936447144, + "rewards/rejected": -0.3308219909667969, + "step": 953 + }, + { + "epoch": 0.5816186556927297, + "grad_norm": 62.23053401449768, + "learning_rate": 4.148780487804878e-08, + "logits/chosen": -0.25104236602783203, + "logits/rejected": -0.2953557074069977, + "logps/chosen": -156.33468627929688, + 
"logps/rejected": -279.6402587890625, + "loss": 0.5958, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.1959829032421112, + "rewards/margins": 1.0025813579559326, + "rewards/rejected": -0.806598424911499, + "step": 954 + }, + { + "epoch": 0.5822283188538333, + "grad_norm": 67.38110967504043, + "learning_rate": 4.1531707317073165e-08, + "logits/chosen": -0.020015094429254532, + "logits/rejected": 0.13997766375541687, + "logps/chosen": -184.95852661132812, + "logps/rejected": -136.2812957763672, + "loss": 0.5132, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2717243432998657, + "rewards/margins": 0.4409331679344177, + "rewards/rejected": -0.1692088395357132, + "step": 955 + }, + { + "epoch": 0.5828379820149368, + "grad_norm": 61.08016186118609, + "learning_rate": 4.1575609756097554e-08, + "logits/chosen": 0.007236507721245289, + "logits/rejected": 0.09439484775066376, + "logps/chosen": -262.1932373046875, + "logps/rejected": -176.46446228027344, + "loss": 0.5279, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.20543357729911804, + "rewards/margins": 0.30884745717048645, + "rewards/rejected": -0.10341387242078781, + "step": 956 + }, + { + "epoch": 0.5834476451760402, + "grad_norm": 69.39354863186936, + "learning_rate": 4.161951219512195e-08, + "logits/chosen": -0.15755020081996918, + "logits/rejected": -0.3018410801887512, + "logps/chosen": -358.4852294921875, + "logps/rejected": -470.5343017578125, + "loss": 0.5343, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.21769274771213531, + "rewards/margins": 0.16905879974365234, + "rewards/rejected": 0.048633962869644165, + "step": 957 + }, + { + "epoch": 0.5840573083371438, + "grad_norm": 86.61322570045859, + "learning_rate": 4.166341463414634e-08, + "logits/chosen": -0.02643953263759613, + "logits/rejected": -0.18218819797039032, + "logps/chosen": -41.61878967285156, + "logps/rejected": -220.1629638671875, + "loss": 0.6194, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.44664257764816284, + "rewards/margins": 0.0005926638841629028, + "rewards/rejected": -0.44723525643348694, + "step": 958 + }, + { + "epoch": 0.5846669714982472, + "grad_norm": 62.05519185121555, + "learning_rate": 4.1707317073170725e-08, + "logits/chosen": -0.34936612844467163, + "logits/rejected": -0.14146625995635986, + "logps/chosen": -313.269287109375, + "logps/rejected": -150.72903442382812, + "loss": 0.5197, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2663576602935791, + "rewards/margins": 0.5314609408378601, + "rewards/rejected": -0.2651032507419586, + "step": 959 + }, + { + "epoch": 0.5852766346593508, + "grad_norm": 61.8538936273219, + "learning_rate": 4.1751219512195114e-08, + "logits/chosen": 0.08507747203111649, + "logits/rejected": 0.04343874752521515, + "logps/chosen": -97.35144805908203, + "logps/rejected": -92.90192413330078, + "loss": 0.5136, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.20625300705432892, + "rewards/margins": -0.16570445895195007, + "rewards/rejected": -0.04054853320121765, + "step": 960 + }, + { + "epoch": 0.5858862978204542, + "grad_norm": 69.59421865613955, + "learning_rate": 4.179512195121951e-08, + "logits/chosen": 0.022425949573516846, + "logits/rejected": 0.0824100449681282, + "logps/chosen": -365.5918884277344, + "logps/rejected": -397.54443359375, + "loss": 0.5295, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3914957046508789, + "rewards/margins": 1.1910912990570068, + "rewards/rejected": -0.7995957136154175, + "step": 961 + }, + { + "epoch": 0.5864959609815577, + "grad_norm": 76.91340708220399, 
+ "learning_rate": 4.18390243902439e-08, + "logits/chosen": 0.3572169840335846, + "logits/rejected": 0.2854265570640564, + "logps/chosen": -30.446258544921875, + "logps/rejected": -36.61526107788086, + "loss": 0.6124, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4780455231666565, + "rewards/margins": -0.24744245409965515, + "rewards/rejected": -0.23060305416584015, + "step": 962 + }, + { + "epoch": 0.5871056241426612, + "grad_norm": 57.46941019347026, + "learning_rate": 4.1882926829268286e-08, + "logits/chosen": 0.21760831773281097, + "logits/rejected": 0.13755109906196594, + "logps/chosen": -193.99929809570312, + "logps/rejected": -162.6146697998047, + "loss": 0.5556, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.17073407769203186, + "rewards/margins": 0.8833505511283875, + "rewards/rejected": -0.7126164436340332, + "step": 963 + }, + { + "epoch": 0.5877152873037647, + "grad_norm": 62.629537187405504, + "learning_rate": 4.192682926829268e-08, + "logits/chosen": 0.04777471721172333, + "logits/rejected": 0.055221885442733765, + "logps/chosen": -98.55489349365234, + "logps/rejected": -166.0889434814453, + "loss": 0.526, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.33463388681411743, + "rewards/margins": -0.12675432860851288, + "rewards/rejected": -0.20787954330444336, + "step": 964 + }, + { + "epoch": 0.5883249504648682, + "grad_norm": 75.29882126736368, + "learning_rate": 4.197073170731707e-08, + "logits/chosen": 0.08066365867853165, + "logits/rejected": 0.27275511622428894, + "logps/chosen": -282.13165283203125, + "logps/rejected": -201.79586791992188, + "loss": 0.6298, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.40546715259552, + "rewards/margins": 1.2127996683120728, + "rewards/rejected": -0.8073325753211975, + "step": 965 + }, + { + "epoch": 0.5889346136259717, + "grad_norm": 73.94280467512128, + "learning_rate": 4.201463414634146e-08, + "logits/chosen": 0.14924946427345276, + "logits/rejected": 0.12114415317773819, + "logps/chosen": -24.413921356201172, + "logps/rejected": -26.602420806884766, + "loss": 0.559, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.42113298177719116, + "rewards/margins": 0.08444094657897949, + "rewards/rejected": -0.5055739283561707, + "step": 966 + }, + { + "epoch": 0.5895442767870751, + "grad_norm": 72.32887870017322, + "learning_rate": 4.2058536585365846e-08, + "logits/chosen": 0.37911567091941833, + "logits/rejected": 0.14017818868160248, + "logps/chosen": -34.29087448120117, + "logps/rejected": -59.50763702392578, + "loss": 0.6309, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.01295555755496025, + "rewards/margins": 0.1707940399646759, + "rewards/rejected": -0.15783849358558655, + "step": 967 + }, + { + "epoch": 0.5901539399481787, + "grad_norm": 85.10928808814798, + "learning_rate": 4.210243902439024e-08, + "logits/chosen": 0.02898680604994297, + "logits/rejected": 0.17125096917152405, + "logps/chosen": -128.51568603515625, + "logps/rejected": -93.6944580078125, + "loss": 0.5281, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.4702191948890686, + "rewards/margins": -0.3852922022342682, + "rewards/rejected": -0.08492698520421982, + "step": 968 + }, + { + "epoch": 0.5907636031092821, + "grad_norm": 64.60074775054339, + "learning_rate": 4.214634146341463e-08, + "logits/chosen": 0.4834749698638916, + "logits/rejected": 0.6553069353103638, + "logps/chosen": -277.7457275390625, + "logps/rejected": -201.1106414794922, + "loss": 0.5532, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.4859507083892822, + 
"rewards/margins": 0.24577556550502777, + "rewards/rejected": 0.24017517268657684, + "step": 969 + }, + { + "epoch": 0.5913732662703857, + "grad_norm": 71.28492793310289, + "learning_rate": 4.219024390243902e-08, + "logits/chosen": 0.2692300081253052, + "logits/rejected": 0.31959807872772217, + "logps/chosen": -174.9003143310547, + "logps/rejected": -136.06768798828125, + "loss": 0.5951, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0905541181564331, + "rewards/margins": 0.4873233437538147, + "rewards/rejected": -0.5778774619102478, + "step": 970 + }, + { + "epoch": 0.5919829294314891, + "grad_norm": 63.96088036649985, + "learning_rate": 4.223414634146341e-08, + "logits/chosen": -0.14569628238677979, + "logits/rejected": -0.007821448147296906, + "logps/chosen": -106.27445983886719, + "logps/rejected": -71.78903198242188, + "loss": 0.5752, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2875051200389862, + "rewards/margins": -0.14759160578250885, + "rewards/rejected": -0.13991349935531616, + "step": 971 + }, + { + "epoch": 0.5925925925925926, + "grad_norm": 78.3828918011369, + "learning_rate": 4.22780487804878e-08, + "logits/chosen": -0.027136696502566338, + "logits/rejected": 0.2894710898399353, + "logps/chosen": -156.28268432617188, + "logps/rejected": -223.8277587890625, + "loss": 0.6042, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.290881872177124, + "rewards/margins": -0.12722742557525635, + "rewards/rejected": -0.16365444660186768, + "step": 972 + }, + { + "epoch": 0.5932022557536961, + "grad_norm": 60.19492529290873, + "learning_rate": 4.232195121951219e-08, + "logits/chosen": 0.1515863537788391, + "logits/rejected": -0.06478295475244522, + "logps/chosen": -335.7998046875, + "logps/rejected": -445.21929931640625, + "loss": 0.4768, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6023845672607422, + "rewards/margins": 1.3038933277130127, + "rewards/rejected": -0.7015087008476257, + "step": 973 + }, + { + "epoch": 0.5938119189147996, + "grad_norm": 56.192626708773645, + "learning_rate": 4.236585365853658e-08, + "logits/chosen": -0.017517834901809692, + "logits/rejected": -0.14214028418064117, + "logps/chosen": -217.49298095703125, + "logps/rejected": -184.72012329101562, + "loss": 0.4848, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.18382145464420319, + "rewards/margins": 0.42155730724334717, + "rewards/rejected": -0.6053788065910339, + "step": 974 + }, + { + "epoch": 0.594421582075903, + "grad_norm": 82.4116097946905, + "learning_rate": 4.240975609756097e-08, + "logits/chosen": -0.14134134352207184, + "logits/rejected": -0.06352880597114563, + "logps/chosen": -513.8828735351562, + "logps/rejected": -412.7125549316406, + "loss": 0.5264, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.817517101764679, + "rewards/margins": 0.8614957332611084, + "rewards/rejected": -0.04397859424352646, + "step": 975 + }, + { + "epoch": 0.5950312452370066, + "grad_norm": 68.60202196421983, + "learning_rate": 4.245365853658536e-08, + "logits/chosen": -0.06833840161561966, + "logits/rejected": 0.08813981711864471, + "logps/chosen": -86.92523193359375, + "logps/rejected": -91.20805358886719, + "loss": 0.538, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.06409449130296707, + "rewards/margins": -0.0014316122978925705, + "rewards/rejected": 0.06552610546350479, + "step": 976 + }, + { + "epoch": 0.59564090839811, + "grad_norm": 63.80535251075345, + "learning_rate": 4.249756097560975e-08, + "logits/chosen": -0.13767874240875244, + "logits/rejected": -0.09763995558023453, + 
"logps/chosen": -107.87483215332031, + "logps/rejected": -147.85107421875, + "loss": 0.5485, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.4442089796066284, + "rewards/margins": -0.43157342076301575, + "rewards/rejected": -0.012635531835258007, + "step": 977 + }, + { + "epoch": 0.5962505715592136, + "grad_norm": 58.96860958544613, + "learning_rate": 4.2541463414634144e-08, + "logits/chosen": 0.2365347146987915, + "logits/rejected": 0.23233827948570251, + "logps/chosen": -232.26263427734375, + "logps/rejected": -258.4931335449219, + "loss": 0.4819, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.28829145431518555, + "rewards/margins": 0.654391884803772, + "rewards/rejected": -0.3661004602909088, + "step": 978 + }, + { + "epoch": 0.596860234720317, + "grad_norm": 56.34013149049837, + "learning_rate": 4.258536585365853e-08, + "logits/chosen": 0.13740472495555878, + "logits/rejected": 0.12650051712989807, + "logps/chosen": -58.84000015258789, + "logps/rejected": -66.1595458984375, + "loss": 0.5355, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.20688855648040771, + "rewards/margins": 0.028081614524126053, + "rewards/rejected": -0.23497018218040466, + "step": 979 + }, + { + "epoch": 0.5974698978814205, + "grad_norm": 70.09919354884728, + "learning_rate": 4.262926829268292e-08, + "logits/chosen": -0.05875900015234947, + "logits/rejected": -0.19873517751693726, + "logps/chosen": -141.268310546875, + "logps/rejected": -179.65872192382812, + "loss": 0.6015, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.4807414710521698, + "rewards/margins": 0.02805427461862564, + "rewards/rejected": -0.5087957382202148, + "step": 980 + }, + { + "epoch": 0.598079561042524, + "grad_norm": 70.50180958730834, + "learning_rate": 4.2673170731707316e-08, + "logits/chosen": 0.22438283264636993, + "logits/rejected": 0.2016701102256775, + "logps/chosen": -172.8882598876953, + "logps/rejected": -207.31109619140625, + "loss": 0.5625, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1393517404794693, + "rewards/margins": 0.16943779587745667, + "rewards/rejected": -0.30878955125808716, + "step": 981 + }, + { + "epoch": 0.5986892242036275, + "grad_norm": 47.63198791448597, + "learning_rate": 4.2717073170731704e-08, + "logits/chosen": 0.13777931034564972, + "logits/rejected": 0.1058683693408966, + "logps/chosen": -130.87457275390625, + "logps/rejected": -157.2276611328125, + "loss": 0.5121, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2509147822856903, + "rewards/margins": 0.616559624671936, + "rewards/rejected": -0.8674744367599487, + "step": 982 + }, + { + "epoch": 0.599298887364731, + "grad_norm": 72.20509391638281, + "learning_rate": 4.276097560975609e-08, + "logits/chosen": 0.237956240773201, + "logits/rejected": 0.13941863179206848, + "logps/chosen": -145.96958923339844, + "logps/rejected": -284.981201171875, + "loss": 0.5956, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.11076083034276962, + "rewards/margins": 0.25600630044937134, + "rewards/rejected": -0.14524544775485992, + "step": 983 + }, + { + "epoch": 0.5999085505258345, + "grad_norm": 64.97602705586922, + "learning_rate": 4.280487804878048e-08, + "logits/chosen": 0.10785136371850967, + "logits/rejected": -0.051660746335983276, + "logps/chosen": -32.13536834716797, + "logps/rejected": -46.07811737060547, + "loss": 0.5479, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.061608847230672836, + "rewards/margins": 0.35192739963531494, + "rewards/rejected": -0.4135362505912781, + "step": 984 + }, + { + "epoch": 
0.6005182136869379, + "grad_norm": 74.32472065832633, + "learning_rate": 4.2848780487804876e-08, + "logits/chosen": 0.22914312779903412, + "logits/rejected": 0.04533011466264725, + "logps/chosen": -81.81015014648438, + "logps/rejected": -99.21920013427734, + "loss": 0.5848, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3640258312225342, + "rewards/margins": 0.7226523160934448, + "rewards/rejected": -0.35862642526626587, + "step": 985 + }, + { + "epoch": 0.6011278768480415, + "grad_norm": 63.60531552456201, + "learning_rate": 4.2892682926829265e-08, + "logits/chosen": 0.16781377792358398, + "logits/rejected": 0.13715007901191711, + "logps/chosen": -140.97671508789062, + "logps/rejected": -256.8678283691406, + "loss": 0.4367, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07384621351957321, + "rewards/margins": 0.8329897522926331, + "rewards/rejected": -0.9068359136581421, + "step": 986 + }, + { + "epoch": 0.6017375400091449, + "grad_norm": 57.035252503405076, + "learning_rate": 4.293658536585365e-08, + "logits/chosen": -0.14472906291484833, + "logits/rejected": 0.16360566020011902, + "logps/chosen": -85.19625091552734, + "logps/rejected": -50.558929443359375, + "loss": 0.5303, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.08855142444372177, + "rewards/margins": -0.014545775949954987, + "rewards/rejected": -0.07400565594434738, + "step": 987 + }, + { + "epoch": 0.6023472031702485, + "grad_norm": 56.92118331145405, + "learning_rate": 4.298048780487805e-08, + "logits/chosen": 0.12844596803188324, + "logits/rejected": 0.2380760908126831, + "logps/chosen": -123.90941619873047, + "logps/rejected": -85.47669219970703, + "loss": 0.4726, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6185051798820496, + "rewards/margins": 0.6352143883705139, + "rewards/rejected": -0.016709256917238235, + "step": 988 + }, + { + "epoch": 0.6029568663313519, + "grad_norm": 61.22634805888205, + "learning_rate": 4.3024390243902436e-08, + "logits/chosen": 0.137892946600914, + "logits/rejected": 0.2502715289592743, + "logps/chosen": -138.75047302246094, + "logps/rejected": -18.938377380371094, + "loss": 0.5409, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08287467807531357, + "rewards/margins": 0.5290653109550476, + "rewards/rejected": -0.6119399666786194, + "step": 989 + }, + { + "epoch": 0.6035665294924554, + "grad_norm": 63.03320331999485, + "learning_rate": 4.3068292682926825e-08, + "logits/chosen": 0.2903212010860443, + "logits/rejected": 0.25095999240875244, + "logps/chosen": -284.98394775390625, + "logps/rejected": -293.1748046875, + "loss": 0.4376, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4981961250305176, + "rewards/margins": 1.0244200229644775, + "rewards/rejected": -0.5262239575386047, + "step": 990 + }, + { + "epoch": 0.6041761926535589, + "grad_norm": 56.12693618714919, + "learning_rate": 4.311219512195121e-08, + "logits/chosen": 0.3138669729232788, + "logits/rejected": 0.06886722147464752, + "logps/chosen": -221.6787567138672, + "logps/rejected": -371.2520751953125, + "loss": 0.4959, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.40865635871887207, + "rewards/margins": 1.7064982652664185, + "rewards/rejected": -1.2978421449661255, + "step": 991 + }, + { + "epoch": 0.6047858558146624, + "grad_norm": 65.94861499842719, + "learning_rate": 4.315609756097561e-08, + "logits/chosen": 0.18619008362293243, + "logits/rejected": 0.11499892175197601, + "logps/chosen": -280.6131591796875, + "logps/rejected": -166.84622192382812, + "loss": 0.5502, + "rewards/accuracies": 0.5, + 
"rewards/chosen": 0.04074333980679512, + "rewards/margins": 0.4415995478630066, + "rewards/rejected": -0.40085622668266296, + "step": 992 + }, + { + "epoch": 0.6053955189757659, + "grad_norm": 70.49265773787819, + "learning_rate": 4.3199999999999996e-08, + "logits/chosen": -0.009435049258172512, + "logits/rejected": 0.2987651526927948, + "logps/chosen": -302.96124267578125, + "logps/rejected": -192.93569946289062, + "loss": 0.5567, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06466828286647797, + "rewards/margins": 0.46986982226371765, + "rewards/rejected": -0.5345380902290344, + "step": 993 + }, + { + "epoch": 0.6060051821368694, + "grad_norm": 61.530930320466695, + "learning_rate": 4.3243902439024385e-08, + "logits/chosen": 0.1342477798461914, + "logits/rejected": 0.15137697756290436, + "logps/chosen": -184.71934509277344, + "logps/rejected": -140.88229370117188, + "loss": 0.5371, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1671849489212036, + "rewards/margins": 0.7406481504440308, + "rewards/rejected": -0.5734632015228271, + "step": 994 + }, + { + "epoch": 0.6066148452979728, + "grad_norm": 67.61584483534448, + "learning_rate": 4.328780487804878e-08, + "logits/chosen": 0.09232601523399353, + "logits/rejected": 0.3603507876396179, + "logps/chosen": -429.93292236328125, + "logps/rejected": -311.12701416015625, + "loss": 0.5626, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.46371012926101685, + "rewards/margins": 0.33862996101379395, + "rewards/rejected": 0.12508010864257812, + "step": 995 + }, + { + "epoch": 0.6072245084590764, + "grad_norm": 80.14941941324331, + "learning_rate": 4.333170731707317e-08, + "logits/chosen": -0.181582972407341, + "logits/rejected": -0.0784536823630333, + "logps/chosen": -293.38812255859375, + "logps/rejected": -180.1998291015625, + "loss": 0.6736, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.05510110408067703, + "rewards/margins": 0.2038726508617401, + "rewards/rejected": -0.14877153933048248, + "step": 996 + }, + { + "epoch": 0.6078341716201798, + "grad_norm": 64.12534841741636, + "learning_rate": 4.3375609756097557e-08, + "logits/chosen": -0.07904496788978577, + "logits/rejected": 0.06720384955406189, + "logps/chosen": -391.43597412109375, + "logps/rejected": -271.29388427734375, + "loss": 0.466, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.9313393235206604, + "rewards/margins": 1.4372742176055908, + "rewards/rejected": -0.5059348940849304, + "step": 997 + }, + { + "epoch": 0.6084438347812834, + "grad_norm": 62.79329595765985, + "learning_rate": 4.3419512195121945e-08, + "logits/chosen": 0.11170154809951782, + "logits/rejected": 0.08879280090332031, + "logps/chosen": -182.6605224609375, + "logps/rejected": -114.03193664550781, + "loss": 0.6013, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.35989508032798767, + "rewards/margins": 1.1250437498092651, + "rewards/rejected": -0.7651485800743103, + "step": 998 + }, + { + "epoch": 0.6090534979423868, + "grad_norm": 73.49008345337796, + "learning_rate": 4.346341463414634e-08, + "logits/chosen": 0.05520867183804512, + "logits/rejected": 0.09620153158903122, + "logps/chosen": -95.85297393798828, + "logps/rejected": -52.026451110839844, + "loss": 0.5837, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.012775249779224396, + "rewards/margins": 0.11057233810424805, + "rewards/rejected": -0.12334759533405304, + "step": 999 + }, + { + "epoch": 0.6096631611034903, + "grad_norm": 85.98186056287153, + "learning_rate": 4.350731707317073e-08, + "logits/chosen": 0.16709566116333008, + 
"logits/rejected": 0.12938140332698822, + "logps/chosen": -238.64108276367188, + "logps/rejected": -247.4223175048828, + "loss": 0.5785, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.43460771441459656, + "rewards/margins": 0.6481321454048157, + "rewards/rejected": -0.21352441608905792, + "step": 1000 + }, + { + "epoch": 0.6096631611034903, + "eval_logits/chosen": 0.015029330737888813, + "eval_logits/rejected": 0.09182646870613098, + "eval_logps/chosen": -162.3784637451172, + "eval_logps/rejected": -112.31405639648438, + "eval_loss": 0.5489267706871033, + "eval_rewards/accuracies": 0.5454545617103577, + "eval_rewards/chosen": 0.025668036192655563, + "eval_rewards/margins": 0.33774468302726746, + "eval_rewards/rejected": -0.3120766580104828, + "eval_runtime": 36.0159, + "eval_samples_per_second": 7.33, + "eval_steps_per_second": 0.916, + "step": 1000 + } + ], + "logging_steps": 1, + "max_steps": 1640, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 105686681518080.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}