diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,40171 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.6, + "eval_steps": 500, + "global_step": 2868, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 1.0000000000000002e-06, + "logits/chosen": -2.2282397747039795, + "logits/rejected": -1.7104178667068481, + "logps/chosen": -277.4985046386719, + "logps/rejected": -283.1687927246094, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0, + "learning_rate": 2.0000000000000003e-06, + "logits/chosen": -1.9831498861312866, + "logits/rejected": -1.8470757007598877, + "logps/chosen": -266.929931640625, + "logps/rejected": -278.9443359375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2 + }, + { + "epoch": 0.0, + "learning_rate": 3e-06, + "logits/chosen": -1.998997449874878, + "logits/rejected": -1.5633281469345093, + "logps/chosen": -398.8482971191406, + "logps/rejected": -361.72137451171875, + "loss": 0.6779, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.008343720808625221, + "rewards/margins": 0.033162664622068405, + "rewards/rejected": -0.041506387293338776, + "step": 3 + }, + { + "epoch": 0.0, + "learning_rate": 4.000000000000001e-06, + "logits/chosen": -2.0003268718719482, + "logits/rejected": -1.7799348831176758, + "logps/chosen": -282.08514404296875, + "logps/rejected": -252.58938598632812, + "loss": 0.6878, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.008050250820815563, + "rewards/margins": 0.012078572064638138, + "rewards/rejected": -0.020128823816776276, + "step": 4 + }, + { + "epoch": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -2.224046230316162, + "logits/rejected": -2.0547187328338623, + "logps/chosen": -202.93685913085938, + "logps/rejected": -208.28402709960938, + "loss": 0.7109, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.011011920869350433, + "rewards/margins": -0.033290065824985504, + "rewards/rejected": 0.04430198669433594, + "step": 5 + }, + { + "epoch": 0.0, + "learning_rate": 6e-06, + "logits/chosen": -2.2208118438720703, + "logits/rejected": -1.7688552141189575, + "logps/chosen": -426.36590576171875, + "logps/rejected": -263.5623779296875, + "loss": 0.6819, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.14544084668159485, + "rewards/margins": 0.02388599142432213, + "rewards/rejected": 0.12155484408140182, + "step": 6 + }, + { + "epoch": 0.0, + "learning_rate": 7e-06, + "logits/chosen": -2.1621174812316895, + "logits/rejected": -1.9987232685089111, + "logps/chosen": -269.5228271484375, + "logps/rejected": -226.28701782226562, + "loss": 0.687, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.039055682718753815, + "rewards/margins": 0.013098955154418945, + "rewards/rejected": 0.02595672756433487, + "step": 7 + }, + { + "epoch": 0.0, + "learning_rate": 8.000000000000001e-06, + "logits/chosen": -2.036137580871582, + "logits/rejected": -1.858740210533142, + "logps/chosen": -322.3067321777344, + "logps/rejected": -247.76370239257812, + "loss": 0.7017, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.03971423953771591, + "rewards/margins": -0.015833258628845215, + "rewards/rejected": -0.023880982771515846, + "step": 8 + }, + { + "epoch": 0.0, + "learning_rate": 9e-06, + "logits/chosen": -2.3385281562805176, + "logits/rejected": -2.152259349822998, + "logps/chosen": -342.80950927734375, + "logps/rejected": -332.3177490234375, + "loss": 0.6956, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.07614301145076752, + "rewards/margins": -0.0035643568262457848, + "rewards/rejected": 0.07970735430717468, + "step": 9 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "logits/chosen": -2.1200649738311768, + "logits/rejected": -1.795499563217163, + "logps/chosen": -450.69927978515625, + "logps/rejected": -349.3813781738281, + "loss": 0.7038, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.026192951947450638, + "rewards/margins": -0.017020108178257942, + "rewards/rejected": 0.04321306198835373, + "step": 10 + }, + { + "epoch": 0.0, + "learning_rate": 1.1000000000000001e-05, + "logits/chosen": -2.361680507659912, + "logits/rejected": -1.5736029148101807, + "logps/chosen": -385.343505859375, + "logps/rejected": -252.7088165283203, + "loss": 0.6631, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.018131183460354805, + "rewards/margins": 0.06300930678844452, + "rewards/rejected": -0.08114049583673477, + "step": 11 + }, + { + "epoch": 0.0, + "learning_rate": 1.2e-05, + "logits/chosen": -2.1654326915740967, + "logits/rejected": -1.7892608642578125, + "logps/chosen": -267.4467468261719, + "logps/rejected": -194.1240692138672, + "loss": 0.6679, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.046659138053655624, + "rewards/margins": 0.054810453206300735, + "rewards/rejected": -0.00815131701529026, + "step": 12 + }, + { + "epoch": 0.0, + "learning_rate": 1.3000000000000001e-05, + "logits/chosen": -2.1767282485961914, + "logits/rejected": -1.8208816051483154, + "logps/chosen": -352.0940246582031, + "logps/rejected": -267.9727783203125, + "loss": 0.6301, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.21059754490852356, + "rewards/margins": 0.13529108464717865, + "rewards/rejected": 0.0753064677119255, + "step": 13 + }, + { + "epoch": 0.0, + "learning_rate": 1.4e-05, + "logits/chosen": -2.2854440212249756, + "logits/rejected": -2.080085515975952, + "logps/chosen": -376.6247253417969, + "logps/rejected": -278.5927734375, + "loss": 0.6278, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.15183602273464203, + "rewards/margins": 0.1464289128780365, + "rewards/rejected": 0.005407098680734634, + "step": 14 + }, + { + "epoch": 0.0, + "learning_rate": 1.5000000000000002e-05, + "logits/chosen": -2.4406135082244873, + "logits/rejected": -2.322054147720337, + "logps/chosen": -298.485595703125, + "logps/rejected": -260.11932373046875, + "loss": 0.6534, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.004409570246934891, + "rewards/margins": 0.10971016436815262, + "rewards/rejected": -0.10530060529708862, + "step": 15 + }, + { + "epoch": 0.0, + "learning_rate": 1.6000000000000003e-05, + "logits/chosen": -2.136800527572632, + "logits/rejected": -2.085320472717285, + "logps/chosen": -313.06280517578125, + "logps/rejected": -308.3344421386719, + "loss": 0.6359, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.07214680314064026, + "rewards/margins": 0.1383473426103592, + "rewards/rejected": -0.06620054692029953, + "step": 16 + }, + { + "epoch": 0.0, + "learning_rate": 1.7e-05, + "logits/chosen": -2.343367099761963, + "logits/rejected": -1.9561631679534912, + "logps/chosen": -357.57171630859375, + "logps/rejected": -278.8397521972656, + "loss": 0.5886, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.09857945144176483, + "rewards/margins": 0.24340946972370148, + "rewards/rejected": -0.14483001828193665, + "step": 17 + }, + { + "epoch": 0.0, + "learning_rate": 1.8e-05, + "logits/chosen": -2.080395221710205, + "logits/rejected": -1.8958829641342163, + "logps/chosen": -295.2388000488281, + "logps/rejected": -324.56915283203125, + "loss": 0.5596, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.20325438678264618, + "rewards/margins": 0.32939791679382324, + "rewards/rejected": -0.12614354491233826, + "step": 18 + }, + { + "epoch": 0.0, + "learning_rate": 1.9e-05, + "logits/chosen": -2.3108816146850586, + "logits/rejected": -2.1787753105163574, + "logps/chosen": -396.53424072265625, + "logps/rejected": -331.013916015625, + "loss": 0.5661, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.08369699120521545, + "rewards/margins": 0.3793570399284363, + "rewards/rejected": -0.29566001892089844, + "step": 19 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "logits/chosen": -2.091219902038574, + "logits/rejected": -2.02215313911438, + "logps/chosen": -257.3657531738281, + "logps/rejected": -274.3866882324219, + "loss": 0.6875, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.07300397753715515, + "rewards/margins": 0.10496324300765991, + "rewards/rejected": -0.03195926174521446, + "step": 20 + }, + { + "epoch": 0.0, + "learning_rate": 1.9995798319327732e-05, + "logits/chosen": -2.039426326751709, + "logits/rejected": -1.903738260269165, + "logps/chosen": -240.83859252929688, + "logps/rejected": -356.79248046875, + "loss": 0.8172, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.16449320316314697, + "rewards/margins": -0.060936544090509415, + "rewards/rejected": -0.10355665534734726, + "step": 21 + }, + { + "epoch": 0.0, + "learning_rate": 1.9991596638655465e-05, + "logits/chosen": -2.1439273357391357, + "logits/rejected": -1.8001049757003784, + "logps/chosen": -311.072265625, + "logps/rejected": -224.07933044433594, + "loss": 0.6006, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.14014743268489838, + "rewards/margins": 0.37872254848480225, + "rewards/rejected": -0.518869936466217, + "step": 22 + }, + { + "epoch": 0.0, + "learning_rate": 1.9987394957983196e-05, + "logits/chosen": -2.2514448165893555, + "logits/rejected": -1.93721604347229, + "logps/chosen": -260.8023376464844, + "logps/rejected": -245.62612915039062, + "loss": 0.6351, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.11218864470720291, + "rewards/margins": 0.15137380361557007, + "rewards/rejected": -0.2635624408721924, + "step": 23 + }, + { + "epoch": 0.01, + "learning_rate": 1.9983193277310926e-05, + "logits/chosen": -2.155559539794922, + "logits/rejected": -2.109809637069702, + "logps/chosen": -171.87376403808594, + "logps/rejected": -253.1971893310547, + "loss": 0.5501, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.003980059176683426, + "rewards/margins": 0.3934435248374939, + "rewards/rejected": -0.3894634544849396, + "step": 24 + }, + { + "epoch": 0.01, + "learning_rate": 1.9978991596638656e-05, + "logits/chosen": -2.0442819595336914, + "logits/rejected": -1.5270578861236572, + "logps/chosen": -331.35784912109375, + "logps/rejected": -254.6148681640625, + "loss": 0.4698, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.26868611574172974, + "rewards/margins": 0.6276190876960754, + "rewards/rejected": -0.3589329421520233, + "step": 25 + }, + { + "epoch": 0.01, + "learning_rate": 1.997478991596639e-05, + "logits/chosen": -2.1941871643066406, + "logits/rejected": -1.9360427856445312, + "logps/chosen": -286.4983215332031, + "logps/rejected": -254.10491943359375, + "loss": 0.4371, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.09487167000770569, + "rewards/margins": 0.8353039026260376, + "rewards/rejected": -0.7404322624206543, + "step": 26 + }, + { + "epoch": 0.01, + "learning_rate": 1.997058823529412e-05, + "logits/chosen": -2.3349223136901855, + "logits/rejected": -2.3109145164489746, + "logps/chosen": -306.22509765625, + "logps/rejected": -261.090576171875, + "loss": 0.3504, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.43325841426849365, + "rewards/margins": 1.2106494903564453, + "rewards/rejected": -0.7773910760879517, + "step": 27 + }, + { + "epoch": 0.01, + "learning_rate": 1.996638655462185e-05, + "logits/chosen": -2.251776933670044, + "logits/rejected": -2.2390496730804443, + "logps/chosen": -351.43438720703125, + "logps/rejected": -364.4002685546875, + "loss": 0.384, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.4874539375305176, + "rewards/margins": 0.96163010597229, + "rewards/rejected": -0.4741760790348053, + "step": 28 + }, + { + "epoch": 0.01, + "learning_rate": 1.996218487394958e-05, + "logits/chosen": -2.179680109024048, + "logits/rejected": -1.943859577178955, + "logps/chosen": -353.5611572265625, + "logps/rejected": -363.4343566894531, + "loss": 0.554, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.01377946138381958, + "rewards/margins": 0.7094612717628479, + "rewards/rejected": -0.6956818103790283, + "step": 29 + }, + { + "epoch": 0.01, + "learning_rate": 1.9957983193277314e-05, + "logits/chosen": -2.1971545219421387, + "logits/rejected": -1.6987268924713135, + "logps/chosen": -452.05914306640625, + "logps/rejected": -332.03753662109375, + "loss": 0.7594, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.534920334815979, + "rewards/margins": 0.350356787443161, + "rewards/rejected": -0.8852770924568176, + "step": 30 + }, + { + "epoch": 0.01, + "learning_rate": 1.9953781512605044e-05, + "logits/chosen": -2.1026246547698975, + "logits/rejected": -1.983717679977417, + "logps/chosen": -306.4521484375, + "logps/rejected": -327.5496520996094, + "loss": 0.6098, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.03515242040157318, + "rewards/margins": 0.6194229125976562, + "rewards/rejected": -0.6545753479003906, + "step": 31 + }, + { + "epoch": 0.01, + "learning_rate": 1.9949579831932774e-05, + "logits/chosen": -1.9110617637634277, + "logits/rejected": -1.9333263635635376, + "logps/chosen": -259.166259765625, + "logps/rejected": -358.37451171875, + "loss": 0.3246, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.07927685976028442, + "rewards/margins": 1.2982083559036255, + "rewards/rejected": -1.3774852752685547, + "step": 32 + }, + { + "epoch": 0.01, + "learning_rate": 1.9945378151260505e-05, + "logits/chosen": -1.9312708377838135, + "logits/rejected": -1.825966715812683, + "logps/chosen": -383.91357421875, + "logps/rejected": -316.36798095703125, + "loss": 0.5947, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.2903977930545807, + "rewards/margins": 0.9727724194526672, + "rewards/rejected": -1.2631702423095703, + "step": 33 + }, + { + "epoch": 0.01, + "learning_rate": 1.9941176470588238e-05, + "logits/chosen": -2.1588261127471924, + "logits/rejected": -1.9789544343948364, + "logps/chosen": -382.4302978515625, + "logps/rejected": -377.56048583984375, + "loss": 0.4118, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.20616267621517181, + "rewards/margins": 0.9832318425178528, + "rewards/rejected": -0.7770692110061646, + "step": 34 + }, + { + "epoch": 0.01, + "learning_rate": 1.993697478991597e-05, + "logits/chosen": -2.2012393474578857, + "logits/rejected": -2.1398000717163086, + "logps/chosen": -253.782470703125, + "logps/rejected": -289.1595764160156, + "loss": 0.6182, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3398907780647278, + "rewards/margins": 0.6650518178939819, + "rewards/rejected": -1.0049426555633545, + "step": 35 + }, + { + "epoch": 0.01, + "learning_rate": 1.99327731092437e-05, + "logits/chosen": -2.526705265045166, + "logits/rejected": -2.1168999671936035, + "logps/chosen": -298.3293762207031, + "logps/rejected": -284.14892578125, + "loss": 0.5028, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3161202669143677, + "rewards/margins": 1.0427557229995728, + "rewards/rejected": -0.7266354560852051, + "step": 36 + }, + { + "epoch": 0.01, + "learning_rate": 1.992857142857143e-05, + "logits/chosen": -2.27311372756958, + "logits/rejected": -1.748600959777832, + "logps/chosen": -293.44232177734375, + "logps/rejected": -297.465087890625, + "loss": 0.4748, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1936989277601242, + "rewards/margins": 1.0498151779174805, + "rewards/rejected": -0.8561161756515503, + "step": 37 + }, + { + "epoch": 0.01, + "learning_rate": 1.9924369747899163e-05, + "logits/chosen": -2.2410919666290283, + "logits/rejected": -1.9170420169830322, + "logps/chosen": -335.0601806640625, + "logps/rejected": -250.7333221435547, + "loss": 0.6916, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.02380967140197754, + "rewards/margins": 0.4384457468986511, + "rewards/rejected": -0.4146360754966736, + "step": 38 + }, + { + "epoch": 0.01, + "learning_rate": 1.9920168067226893e-05, + "logits/chosen": -2.2150847911834717, + "logits/rejected": -2.0835230350494385, + "logps/chosen": -239.5399627685547, + "logps/rejected": -255.68206787109375, + "loss": 0.4157, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.26076507568359375, + "rewards/margins": 0.8942439556121826, + "rewards/rejected": -0.6334788203239441, + "step": 39 + }, + { + "epoch": 0.01, + "learning_rate": 1.9915966386554623e-05, + "logits/chosen": -2.5332202911376953, + "logits/rejected": -2.0763649940490723, + "logps/chosen": -311.97802734375, + "logps/rejected": -266.4057922363281, + "loss": 0.4129, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.17071732878684998, + "rewards/margins": 1.2794352769851685, + "rewards/rejected": -1.108717918395996, + "step": 40 + }, + { + "epoch": 0.01, + "learning_rate": 1.9911764705882353e-05, + "logits/chosen": -2.471341609954834, + "logits/rejected": -1.6814744472503662, + "logps/chosen": -355.7557067871094, + "logps/rejected": -301.5888671875, + "loss": 0.4109, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.49135756492614746, + "rewards/margins": 1.131295919418335, + "rewards/rejected": -0.6399383544921875, + "step": 41 + }, + { + "epoch": 0.01, + "learning_rate": 1.9907563025210087e-05, + "logits/chosen": -2.2631304264068604, + "logits/rejected": -2.234004259109497, + "logps/chosen": -327.2241516113281, + "logps/rejected": -300.0874938964844, + "loss": 0.4831, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06589288264513016, + "rewards/margins": 0.9846144914627075, + "rewards/rejected": -1.0505073070526123, + "step": 42 + }, + { + "epoch": 0.01, + "learning_rate": 1.9903361344537817e-05, + "logits/chosen": -2.5259833335876465, + "logits/rejected": -2.2826600074768066, + "logps/chosen": -267.2774658203125, + "logps/rejected": -274.1180725097656, + "loss": 0.4359, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.12646692991256714, + "rewards/margins": 0.9644603133201599, + "rewards/rejected": -1.0909273624420166, + "step": 43 + }, + { + "epoch": 0.01, + "learning_rate": 1.9899159663865547e-05, + "logits/chosen": -2.385162353515625, + "logits/rejected": -1.7944622039794922, + "logps/chosen": -453.2611083984375, + "logps/rejected": -320.607421875, + "loss": 0.2998, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.5784784555435181, + "rewards/margins": 1.655571699142456, + "rewards/rejected": -1.0770931243896484, + "step": 44 + }, + { + "epoch": 0.01, + "learning_rate": 1.989495798319328e-05, + "logits/chosen": -1.7950670719146729, + "logits/rejected": -2.163729667663574, + "logps/chosen": -198.92381286621094, + "logps/rejected": -242.31094360351562, + "loss": 0.391, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.20681682229042053, + "rewards/margins": 1.026400089263916, + "rewards/rejected": -0.8195833563804626, + "step": 45 + }, + { + "epoch": 0.01, + "learning_rate": 1.989075630252101e-05, + "logits/chosen": -2.1599843502044678, + "logits/rejected": -2.060497283935547, + "logps/chosen": -295.61444091796875, + "logps/rejected": -259.965576171875, + "loss": 0.5432, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.3103800415992737, + "rewards/margins": 0.9843414425849915, + "rewards/rejected": -0.6739614009857178, + "step": 46 + }, + { + "epoch": 0.01, + "learning_rate": 1.988655462184874e-05, + "logits/chosen": -2.0689070224761963, + "logits/rejected": -2.012744426727295, + "logps/chosen": -232.267822265625, + "logps/rejected": -297.8428649902344, + "loss": 0.4166, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.39347603917121887, + "rewards/margins": 1.4002763032913208, + "rewards/rejected": -1.7937523126602173, + "step": 47 + }, + { + "epoch": 0.01, + "learning_rate": 1.988235294117647e-05, + "logits/chosen": -2.195972442626953, + "logits/rejected": -1.976593255996704, + "logps/chosen": -293.2135314941406, + "logps/rejected": -389.1631164550781, + "loss": 0.4143, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4289471209049225, + "rewards/margins": 1.2965888977050781, + "rewards/rejected": -0.8676416873931885, + "step": 48 + }, + { + "epoch": 0.01, + "learning_rate": 1.9878151260504205e-05, + "logits/chosen": -2.09908127784729, + "logits/rejected": -1.894362211227417, + "logps/chosen": -316.28729248046875, + "logps/rejected": -289.5418395996094, + "loss": 0.4047, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4415051341056824, + "rewards/margins": 1.1790716648101807, + "rewards/rejected": -0.7375665903091431, + "step": 49 + }, + { + "epoch": 0.01, + "learning_rate": 1.9873949579831935e-05, + "logits/chosen": -2.077441930770874, + "logits/rejected": -2.47188138961792, + "logps/chosen": -264.8441162109375, + "logps/rejected": -433.2682800292969, + "loss": 0.4666, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.13278540968894958, + "rewards/margins": 1.287182331085205, + "rewards/rejected": -1.154396891593933, + "step": 50 + }, + { + "epoch": 0.01, + "learning_rate": 1.9869747899159666e-05, + "logits/chosen": -2.270512819290161, + "logits/rejected": -2.3522863388061523, + "logps/chosen": -333.0877685546875, + "logps/rejected": -311.7333679199219, + "loss": 0.2762, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.30799394845962524, + "rewards/margins": 1.776007056236267, + "rewards/rejected": -1.4680131673812866, + "step": 51 + }, + { + "epoch": 0.01, + "learning_rate": 1.9865546218487396e-05, + "logits/chosen": -1.980552077293396, + "logits/rejected": -1.7865663766860962, + "logps/chosen": -296.15966796875, + "logps/rejected": -359.48095703125, + "loss": 0.3227, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.21640604734420776, + "rewards/margins": 1.4678544998168945, + "rewards/rejected": -1.2514485120773315, + "step": 52 + }, + { + "epoch": 0.01, + "learning_rate": 1.986134453781513e-05, + "logits/chosen": -2.0458903312683105, + "logits/rejected": -2.013563871383667, + "logps/chosen": -206.70895385742188, + "logps/rejected": -264.04412841796875, + "loss": 0.3541, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.39075812697410583, + "rewards/margins": 1.5346307754516602, + "rewards/rejected": -1.1438727378845215, + "step": 53 + }, + { + "epoch": 0.01, + "learning_rate": 1.985714285714286e-05, + "logits/chosen": -2.0617918968200684, + "logits/rejected": -2.1093711853027344, + "logps/chosen": -276.7305908203125, + "logps/rejected": -346.8241882324219, + "loss": 0.4043, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.25451546907424927, + "rewards/margins": 1.3389711380004883, + "rewards/rejected": -1.0844557285308838, + "step": 54 + }, + { + "epoch": 0.01, + "learning_rate": 1.985294117647059e-05, + "logits/chosen": -1.857073187828064, + "logits/rejected": -1.600212574005127, + "logps/chosen": -333.90447998046875, + "logps/rejected": -300.74462890625, + "loss": 0.5176, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.08902762830257416, + "rewards/margins": 1.1346933841705322, + "rewards/rejected": -1.0456656217575073, + "step": 55 + }, + { + "epoch": 0.01, + "learning_rate": 1.984873949579832e-05, + "logits/chosen": -2.073125123977661, + "logits/rejected": -1.9293947219848633, + "logps/chosen": -361.3773193359375, + "logps/rejected": -243.97341918945312, + "loss": 0.1839, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.6487305760383606, + "rewards/margins": 2.422443389892578, + "rewards/rejected": -1.7737128734588623, + "step": 56 + }, + { + "epoch": 0.01, + "learning_rate": 1.9844537815126054e-05, + "logits/chosen": -2.073249340057373, + "logits/rejected": -1.8651671409606934, + "logps/chosen": -283.1850891113281, + "logps/rejected": -260.81707763671875, + "loss": 0.263, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.5375748872756958, + "rewards/margins": 2.4909462928771973, + "rewards/rejected": -1.953371286392212, + "step": 57 + }, + { + "epoch": 0.01, + "learning_rate": 1.9840336134453784e-05, + "logits/chosen": -2.2339138984680176, + "logits/rejected": -2.1560802459716797, + "logps/chosen": -360.5123596191406, + "logps/rejected": -459.8302917480469, + "loss": 0.2335, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.18645963072776794, + "rewards/margins": 2.2421610355377197, + "rewards/rejected": -2.05570125579834, + "step": 58 + }, + { + "epoch": 0.01, + "learning_rate": 1.9836134453781514e-05, + "logits/chosen": -2.163757801055908, + "logits/rejected": -2.1205854415893555, + "logps/chosen": -259.7275390625, + "logps/rejected": -281.04296875, + "loss": 0.6058, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.37449726462364197, + "rewards/margins": 0.622344434261322, + "rewards/rejected": -0.24784713983535767, + "step": 59 + }, + { + "epoch": 0.01, + "learning_rate": 1.9831932773109244e-05, + "logits/chosen": -2.19521427154541, + "logits/rejected": -2.30275297164917, + "logps/chosen": -314.6921081542969, + "logps/rejected": -378.01129150390625, + "loss": 0.3264, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.23214736580848694, + "rewards/margins": 1.752747654914856, + "rewards/rejected": -1.5206003189086914, + "step": 60 + }, + { + "epoch": 0.01, + "learning_rate": 1.9827731092436978e-05, + "logits/chosen": -2.1164236068725586, + "logits/rejected": -1.7807343006134033, + "logps/chosen": -293.411865234375, + "logps/rejected": -250.91403198242188, + "loss": 0.4, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.20865458250045776, + "rewards/margins": 1.714974284172058, + "rewards/rejected": -1.923628807067871, + "step": 61 + }, + { + "epoch": 0.01, + "learning_rate": 1.9823529411764708e-05, + "logits/chosen": -2.1823620796203613, + "logits/rejected": -2.0653858184814453, + "logps/chosen": -156.7562255859375, + "logps/rejected": -190.81983947753906, + "loss": 0.3667, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.2006514072418213, + "rewards/margins": 1.9494022130966187, + "rewards/rejected": -1.7487506866455078, + "step": 62 + }, + { + "epoch": 0.01, + "learning_rate": 1.981932773109244e-05, + "logits/chosen": -2.387006998062134, + "logits/rejected": -1.9845913648605347, + "logps/chosen": -400.44439697265625, + "logps/rejected": -324.6969909667969, + "loss": 0.4412, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4312658905982971, + "rewards/margins": 1.395607352256775, + "rewards/rejected": -0.9643415212631226, + "step": 63 + }, + { + "epoch": 0.01, + "learning_rate": 1.981512605042017e-05, + "logits/chosen": -2.2400944232940674, + "logits/rejected": -2.0497281551361084, + "logps/chosen": -303.5828552246094, + "logps/rejected": -257.9069519042969, + "loss": 0.763, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.03154442459344864, + "rewards/margins": 0.9810715913772583, + "rewards/rejected": -1.0126160383224487, + "step": 64 + }, + { + "epoch": 0.01, + "learning_rate": 1.9810924369747902e-05, + "logits/chosen": -2.16957950592041, + "logits/rejected": -1.5903396606445312, + "logps/chosen": -362.58184814453125, + "logps/rejected": -244.8644561767578, + "loss": 0.3795, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.4543456435203552, + "rewards/margins": 1.6872265338897705, + "rewards/rejected": -1.23288094997406, + "step": 65 + }, + { + "epoch": 0.01, + "learning_rate": 1.9806722689075632e-05, + "logits/chosen": -1.9502336978912354, + "logits/rejected": -2.0793938636779785, + "logps/chosen": -299.5069580078125, + "logps/rejected": -331.5346984863281, + "loss": 0.4049, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.17613552510738373, + "rewards/margins": 1.658630609512329, + "rewards/rejected": -1.4824949502944946, + "step": 66 + }, + { + "epoch": 0.01, + "learning_rate": 1.9802521008403363e-05, + "logits/chosen": -2.1173956394195557, + "logits/rejected": -2.0675406455993652, + "logps/chosen": -302.7386779785156, + "logps/rejected": -370.3883361816406, + "loss": 0.2002, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.41592830419540405, + "rewards/margins": 2.346912145614624, + "rewards/rejected": -1.9309837818145752, + "step": 67 + }, + { + "epoch": 0.01, + "learning_rate": 1.9798319327731096e-05, + "logits/chosen": -2.1113007068634033, + "logits/rejected": -1.8868319988250732, + "logps/chosen": -338.0023193359375, + "logps/rejected": -306.9248046875, + "loss": 0.3672, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.3922884166240692, + "rewards/margins": 1.854377031326294, + "rewards/rejected": -1.4620883464813232, + "step": 68 + }, + { + "epoch": 0.01, + "learning_rate": 1.9794117647058827e-05, + "logits/chosen": -1.895932674407959, + "logits/rejected": -1.6613662242889404, + "logps/chosen": -363.0187072753906, + "logps/rejected": -502.70294189453125, + "loss": 0.3624, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.6353020668029785, + "rewards/margins": 2.4968245029449463, + "rewards/rejected": -1.8615224361419678, + "step": 69 + }, + { + "epoch": 0.01, + "learning_rate": 1.9789915966386557e-05, + "logits/chosen": -1.977718710899353, + "logits/rejected": -1.753633975982666, + "logps/chosen": -276.27935791015625, + "logps/rejected": -266.5805969238281, + "loss": 0.5753, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.015602737665176392, + "rewards/margins": 1.6934807300567627, + "rewards/rejected": -1.6778781414031982, + "step": 70 + }, + { + "epoch": 0.01, + "learning_rate": 1.9785714285714287e-05, + "logits/chosen": -2.136392831802368, + "logits/rejected": -2.1143226623535156, + "logps/chosen": -266.6164855957031, + "logps/rejected": -341.439453125, + "loss": 0.4063, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6387972831726074, + "rewards/margins": 1.8382182121276855, + "rewards/rejected": -1.1994209289550781, + "step": 71 + }, + { + "epoch": 0.02, + "learning_rate": 1.978151260504202e-05, + "logits/chosen": -2.1034064292907715, + "logits/rejected": -1.6448578834533691, + "logps/chosen": -320.40631103515625, + "logps/rejected": -245.26319885253906, + "loss": 0.3839, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.16450253129005432, + "rewards/margins": 2.016575336456299, + "rewards/rejected": -1.8520729541778564, + "step": 72 + }, + { + "epoch": 0.02, + "learning_rate": 1.977731092436975e-05, + "logits/chosen": -1.767681360244751, + "logits/rejected": -1.9492697715759277, + "logps/chosen": -329.1862487792969, + "logps/rejected": -435.3006896972656, + "loss": 0.3475, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.07233725488185883, + "rewards/margins": 2.7824957370758057, + "rewards/rejected": -2.710158586502075, + "step": 73 + }, + { + "epoch": 0.02, + "learning_rate": 1.977310924369748e-05, + "logits/chosen": -1.9439988136291504, + "logits/rejected": -1.6310694217681885, + "logps/chosen": -319.0556640625, + "logps/rejected": -282.9508361816406, + "loss": 0.3585, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.03838210180401802, + "rewards/margins": 2.573625087738037, + "rewards/rejected": -2.535243034362793, + "step": 74 + }, + { + "epoch": 0.02, + "learning_rate": 1.976890756302521e-05, + "logits/chosen": -1.7561790943145752, + "logits/rejected": -1.949948787689209, + "logps/chosen": -258.57366943359375, + "logps/rejected": -538.1862182617188, + "loss": 0.3153, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.23872801661491394, + "rewards/margins": 2.344884157180786, + "rewards/rejected": -2.10615611076355, + "step": 75 + }, + { + "epoch": 0.02, + "learning_rate": 1.9764705882352945e-05, + "logits/chosen": -2.3019189834594727, + "logits/rejected": -2.0765202045440674, + "logps/chosen": -270.93817138671875, + "logps/rejected": -274.4999694824219, + "loss": 0.581, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2101031243801117, + "rewards/margins": 1.4574387073516846, + "rewards/rejected": -1.2473355531692505, + "step": 76 + }, + { + "epoch": 0.02, + "learning_rate": 1.9760504201680675e-05, + "logits/chosen": -2.0135693550109863, + "logits/rejected": -2.2255821228027344, + "logps/chosen": -273.78082275390625, + "logps/rejected": -289.54486083984375, + "loss": 0.694, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.009193005971610546, + "rewards/margins": 0.7785494923591614, + "rewards/rejected": -0.7877424955368042, + "step": 77 + }, + { + "epoch": 0.02, + "learning_rate": 1.9756302521008405e-05, + "logits/chosen": -2.212216377258301, + "logits/rejected": -2.2388317584991455, + "logps/chosen": -263.4593505859375, + "logps/rejected": -271.59466552734375, + "loss": 0.3124, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.40438124537467957, + "rewards/margins": 1.8790018558502197, + "rewards/rejected": -1.4746206998825073, + "step": 78 + }, + { + "epoch": 0.02, + "learning_rate": 1.9752100840336136e-05, + "logits/chosen": -1.9206621646881104, + "logits/rejected": -1.7983925342559814, + "logps/chosen": -379.12115478515625, + "logps/rejected": -394.8392028808594, + "loss": 0.5027, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5758500099182129, + "rewards/margins": 1.5463589429855347, + "rewards/rejected": -0.9705088138580322, + "step": 79 + }, + { + "epoch": 0.02, + "learning_rate": 1.974789915966387e-05, + "logits/chosen": -2.0521106719970703, + "logits/rejected": -1.7378277778625488, + "logps/chosen": -256.2105712890625, + "logps/rejected": -302.44635009765625, + "loss": 0.4625, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.2178654670715332, + "rewards/margins": 1.67887544631958, + "rewards/rejected": -1.4610100984573364, + "step": 80 + }, + { + "epoch": 0.02, + "learning_rate": 1.97436974789916e-05, + "logits/chosen": -2.124636650085449, + "logits/rejected": -2.006908655166626, + "logps/chosen": -178.78956604003906, + "logps/rejected": -279.22357177734375, + "loss": 0.4233, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6088790893554688, + "rewards/margins": 1.2809927463531494, + "rewards/rejected": -0.6721135377883911, + "step": 81 + }, + { + "epoch": 0.02, + "learning_rate": 1.973949579831933e-05, + "logits/chosen": -2.103243827819824, + "logits/rejected": -1.9861217737197876, + "logps/chosen": -389.2289123535156, + "logps/rejected": -385.99365234375, + "loss": 0.4647, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.7544265985488892, + "rewards/margins": 1.7135108709335327, + "rewards/rejected": -0.9590842723846436, + "step": 82 + }, + { + "epoch": 0.02, + "learning_rate": 1.973529411764706e-05, + "logits/chosen": -1.8935327529907227, + "logits/rejected": -1.8215477466583252, + "logps/chosen": -210.7542266845703, + "logps/rejected": -241.78558349609375, + "loss": 0.3477, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.3735242187976837, + "rewards/margins": 1.599092960357666, + "rewards/rejected": -1.2255687713623047, + "step": 83 + }, + { + "epoch": 0.02, + "learning_rate": 1.9731092436974793e-05, + "logits/chosen": -2.29447603225708, + "logits/rejected": -2.161421298980713, + "logps/chosen": -275.94500732421875, + "logps/rejected": -238.6147918701172, + "loss": 0.2717, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5089815855026245, + "rewards/margins": 1.4455078840255737, + "rewards/rejected": -0.9365262985229492, + "step": 84 + }, + { + "epoch": 0.02, + "learning_rate": 1.9726890756302524e-05, + "logits/chosen": -1.8601469993591309, + "logits/rejected": -1.8802802562713623, + "logps/chosen": -235.6843719482422, + "logps/rejected": -287.5454406738281, + "loss": 0.4983, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.013763606548309326, + "rewards/margins": 1.4783430099487305, + "rewards/rejected": -1.464579463005066, + "step": 85 + }, + { + "epoch": 0.02, + "learning_rate": 1.9722689075630254e-05, + "logits/chosen": -2.183777332305908, + "logits/rejected": -1.9824378490447998, + "logps/chosen": -268.580078125, + "logps/rejected": -227.19166564941406, + "loss": 0.3715, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.7063171863555908, + "rewards/margins": 2.124077320098877, + "rewards/rejected": -1.4177602529525757, + "step": 86 + }, + { + "epoch": 0.02, + "learning_rate": 1.9718487394957987e-05, + "logits/chosen": -1.9882242679595947, + "logits/rejected": -2.1597676277160645, + "logps/chosen": -240.87130737304688, + "logps/rejected": -204.73440551757812, + "loss": 0.3994, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.6949409246444702, + "rewards/margins": 1.5625944137573242, + "rewards/rejected": -0.8676536083221436, + "step": 87 + }, + { + "epoch": 0.02, + "learning_rate": 1.9714285714285718e-05, + "logits/chosen": -1.8498257398605347, + "logits/rejected": -1.7139078378677368, + "logps/chosen": -291.52764892578125, + "logps/rejected": -339.3037414550781, + "loss": 0.4453, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.44802042841911316, + "rewards/margins": 1.2824641466140747, + "rewards/rejected": -0.8344437479972839, + "step": 88 + }, + { + "epoch": 0.02, + "learning_rate": 1.9710084033613448e-05, + "logits/chosen": -2.162508726119995, + "logits/rejected": -1.9323761463165283, + "logps/chosen": -344.0239562988281, + "logps/rejected": -314.748046875, + "loss": 0.3641, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.7742578983306885, + "rewards/margins": 1.3817992210388184, + "rewards/rejected": -0.6075414419174194, + "step": 89 + }, + { + "epoch": 0.02, + "learning_rate": 1.9705882352941178e-05, + "logits/chosen": -1.9877519607543945, + "logits/rejected": -1.9254560470581055, + "logps/chosen": -287.8323974609375, + "logps/rejected": -344.7055358886719, + "loss": 0.4725, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.8281481862068176, + "rewards/margins": 1.6265738010406494, + "rewards/rejected": -0.798425555229187, + "step": 90 + }, + { + "epoch": 0.02, + "learning_rate": 1.9701680672268912e-05, + "logits/chosen": -2.1865835189819336, + "logits/rejected": -1.8122529983520508, + "logps/chosen": -297.27642822265625, + "logps/rejected": -297.00433349609375, + "loss": 0.6156, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2508367896080017, + "rewards/margins": 1.7117033004760742, + "rewards/rejected": -1.4608666896820068, + "step": 91 + }, + { + "epoch": 0.02, + "learning_rate": 1.9697478991596642e-05, + "logits/chosen": -2.2020838260650635, + "logits/rejected": -1.9021496772766113, + "logps/chosen": -326.5546875, + "logps/rejected": -308.9887390136719, + "loss": 0.3583, + "rewards/accuracies": 0.6875, + "rewards/chosen": 1.3239891529083252, + "rewards/margins": 2.97361421585083, + "rewards/rejected": -1.6496249437332153, + "step": 92 + }, + { + "epoch": 0.02, + "learning_rate": 1.9693277310924372e-05, + "logits/chosen": -2.2372398376464844, + "logits/rejected": -2.2184348106384277, + "logps/chosen": -292.1517028808594, + "logps/rejected": -269.4548034667969, + "loss": 0.3216, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.6820988655090332, + "rewards/margins": 1.7071502208709717, + "rewards/rejected": -1.025051236152649, + "step": 93 + }, + { + "epoch": 0.02, + "learning_rate": 1.9689075630252102e-05, + "logits/chosen": -2.0083584785461426, + "logits/rejected": -2.0635998249053955, + "logps/chosen": -276.5091552734375, + "logps/rejected": -315.46673583984375, + "loss": 0.5145, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.39702826738357544, + "rewards/margins": 1.4720776081085205, + "rewards/rejected": -1.0750494003295898, + "step": 94 + }, + { + "epoch": 0.02, + "learning_rate": 1.9684873949579833e-05, + "logits/chosen": -2.2963638305664062, + "logits/rejected": -1.4817904233932495, + "logps/chosen": -336.0533447265625, + "logps/rejected": -276.6947326660156, + "loss": 0.7605, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.5585237741470337, + "rewards/margins": 1.2757525444030762, + "rewards/rejected": -0.7172287702560425, + "step": 95 + }, + { + "epoch": 0.02, + "learning_rate": 1.9680672268907563e-05, + "logits/chosen": -2.1417922973632812, + "logits/rejected": -1.952133059501648, + "logps/chosen": -302.29949951171875, + "logps/rejected": -260.100341796875, + "loss": 0.3972, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.18869030475616455, + "rewards/margins": 1.1416288614273071, + "rewards/rejected": -1.3303192853927612, + "step": 96 + }, + { + "epoch": 0.02, + "learning_rate": 1.9676470588235293e-05, + "logits/chosen": -2.2551465034484863, + "logits/rejected": -2.1994338035583496, + "logps/chosen": -259.6070556640625, + "logps/rejected": -292.28729248046875, + "loss": 0.4951, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.12263701111078262, + "rewards/margins": 2.051563024520874, + "rewards/rejected": -1.9289261102676392, + "step": 97 + }, + { + "epoch": 0.02, + "learning_rate": 1.9672268907563027e-05, + "logits/chosen": -2.120534658432007, + "logits/rejected": -1.8351314067840576, + "logps/chosen": -325.5702819824219, + "logps/rejected": -244.7603759765625, + "loss": 0.3843, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.8541961312294006, + "rewards/margins": 2.112898826599121, + "rewards/rejected": -1.2587027549743652, + "step": 98 + }, + { + "epoch": 0.02, + "learning_rate": 1.9668067226890757e-05, + "logits/chosen": -2.172074317932129, + "logits/rejected": -2.0238358974456787, + "logps/chosen": -468.5791320800781, + "logps/rejected": -352.8669128417969, + "loss": 0.274, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.4931235313415527, + "rewards/margins": 2.0761213302612305, + "rewards/rejected": -0.5829975605010986, + "step": 99 + }, + { + "epoch": 0.02, + "learning_rate": 1.9663865546218487e-05, + "logits/chosen": -2.0707366466522217, + "logits/rejected": -2.0279312133789062, + "logps/chosen": -283.825439453125, + "logps/rejected": -292.2297668457031, + "loss": 0.5522, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.49507033824920654, + "rewards/margins": 1.2633992433547974, + "rewards/rejected": -0.7683289051055908, + "step": 100 + }, + { + "epoch": 0.02, + "learning_rate": 1.965966386554622e-05, + "logits/chosen": -2.2249021530151367, + "logits/rejected": -2.0144784450531006, + "logps/chosen": -388.1253967285156, + "logps/rejected": -267.12945556640625, + "loss": 0.4395, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.0910539627075195, + "rewards/margins": 1.1310594081878662, + "rewards/rejected": -0.04000537097454071, + "step": 101 + }, + { + "epoch": 0.02, + "learning_rate": 1.965546218487395e-05, + "logits/chosen": -2.3198399543762207, + "logits/rejected": -2.315199136734009, + "logps/chosen": -255.4583282470703, + "logps/rejected": -218.7842559814453, + "loss": 0.5806, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.6165440082550049, + "rewards/margins": 1.0216189622879028, + "rewards/rejected": -0.40507495403289795, + "step": 102 + }, + { + "epoch": 0.02, + "learning_rate": 1.965126050420168e-05, + "logits/chosen": -1.944331169128418, + "logits/rejected": -1.6027865409851074, + "logps/chosen": -299.79693603515625, + "logps/rejected": -264.605712890625, + "loss": 0.2472, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.1237390041351318, + "rewards/margins": 2.4650118350982666, + "rewards/rejected": -1.3412729501724243, + "step": 103 + }, + { + "epoch": 0.02, + "learning_rate": 1.964705882352941e-05, + "logits/chosen": -2.237401008605957, + "logits/rejected": -1.9374208450317383, + "logps/chosen": -291.79156494140625, + "logps/rejected": -227.7943115234375, + "loss": 0.2397, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.8778971433639526, + "rewards/margins": 2.1618452072143555, + "rewards/rejected": -1.2839481830596924, + "step": 104 + }, + { + "epoch": 0.02, + "learning_rate": 1.9642857142857145e-05, + "logits/chosen": -1.9214826822280884, + "logits/rejected": -1.8263112306594849, + "logps/chosen": -237.01693725585938, + "logps/rejected": -332.3307800292969, + "loss": 0.2999, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6954737901687622, + "rewards/margins": 2.454423427581787, + "rewards/rejected": -1.7589497566223145, + "step": 105 + }, + { + "epoch": 0.02, + "learning_rate": 1.9638655462184875e-05, + "logits/chosen": -2.1410441398620605, + "logits/rejected": -1.6226427555084229, + "logps/chosen": -322.0953369140625, + "logps/rejected": -296.8033447265625, + "loss": 0.7646, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3530750274658203, + "rewards/margins": 0.81498122215271, + "rewards/rejected": -0.46190622448921204, + "step": 106 + }, + { + "epoch": 0.02, + "learning_rate": 1.9634453781512605e-05, + "logits/chosen": -2.15517258644104, + "logits/rejected": -1.472048282623291, + "logps/chosen": -366.901611328125, + "logps/rejected": -310.87249755859375, + "loss": 0.3827, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5737791061401367, + "rewards/margins": 1.995422601699829, + "rewards/rejected": -1.4216433763504028, + "step": 107 + }, + { + "epoch": 0.02, + "learning_rate": 1.9630252100840336e-05, + "logits/chosen": -2.0106077194213867, + "logits/rejected": -2.048567056655884, + "logps/chosen": -219.6585235595703, + "logps/rejected": -252.0364990234375, + "loss": 0.3465, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3845852017402649, + "rewards/margins": 1.800605058670044, + "rewards/rejected": -1.4160197973251343, + "step": 108 + }, + { + "epoch": 0.02, + "learning_rate": 1.962605042016807e-05, + "logits/chosen": -2.107731819152832, + "logits/rejected": -1.3294410705566406, + "logps/chosen": -358.4281005859375, + "logps/rejected": -257.8708190917969, + "loss": 0.3206, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.3075557947158813, + "rewards/margins": 1.9053153991699219, + "rewards/rejected": -0.5977598428726196, + "step": 109 + }, + { + "epoch": 0.02, + "learning_rate": 1.96218487394958e-05, + "logits/chosen": -2.026577949523926, + "logits/rejected": -2.2460622787475586, + "logps/chosen": -220.59320068359375, + "logps/rejected": -325.16802978515625, + "loss": 0.2967, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.3612469732761383, + "rewards/margins": 2.7077808380126953, + "rewards/rejected": -2.346534013748169, + "step": 110 + }, + { + "epoch": 0.02, + "learning_rate": 1.961764705882353e-05, + "logits/chosen": -1.7622581720352173, + "logits/rejected": -1.7473357915878296, + "logps/chosen": -223.30801391601562, + "logps/rejected": -242.68637084960938, + "loss": 0.7774, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3355538249015808, + "rewards/margins": 1.0961945056915283, + "rewards/rejected": -0.7606406807899475, + "step": 111 + }, + { + "epoch": 0.02, + "learning_rate": 1.961344537815126e-05, + "logits/chosen": -2.2168211936950684, + "logits/rejected": -1.904067039489746, + "logps/chosen": -288.23004150390625, + "logps/rejected": -282.98504638671875, + "loss": 0.4975, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.9544592499732971, + "rewards/margins": 1.863938570022583, + "rewards/rejected": -0.9094793796539307, + "step": 112 + }, + { + "epoch": 0.02, + "learning_rate": 1.9609243697478994e-05, + "logits/chosen": -2.2095961570739746, + "logits/rejected": -2.1803181171417236, + "logps/chosen": -246.35244750976562, + "logps/rejected": -244.9532928466797, + "loss": 0.3461, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.6990171670913696, + "rewards/margins": 1.7915334701538086, + "rewards/rejected": -1.0925161838531494, + "step": 113 + }, + { + "epoch": 0.02, + "learning_rate": 1.9605042016806724e-05, + "logits/chosen": -2.1334476470947266, + "logits/rejected": -2.1948623657226562, + "logps/chosen": -326.85205078125, + "logps/rejected": -364.0404968261719, + "loss": 0.261, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.2773776054382324, + "rewards/margins": 1.8966569900512695, + "rewards/rejected": -0.6192792654037476, + "step": 114 + }, + { + "epoch": 0.02, + "learning_rate": 1.9600840336134454e-05, + "logits/chosen": -1.9861092567443848, + "logits/rejected": -2.0006840229034424, + "logps/chosen": -285.4367370605469, + "logps/rejected": -315.72027587890625, + "loss": 0.2797, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.927230179309845, + "rewards/margins": 2.433245897293091, + "rewards/rejected": -1.5060157775878906, + "step": 115 + }, + { + "epoch": 0.02, + "learning_rate": 1.9596638655462184e-05, + "logits/chosen": -1.9135236740112305, + "logits/rejected": -1.761857032775879, + "logps/chosen": -329.7629699707031, + "logps/rejected": -281.57208251953125, + "loss": 0.6673, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.19162462651729584, + "rewards/margins": 1.2261087894439697, + "rewards/rejected": -1.0344840288162231, + "step": 116 + }, + { + "epoch": 0.02, + "learning_rate": 1.9592436974789918e-05, + "logits/chosen": -2.148016929626465, + "logits/rejected": -2.074423313140869, + "logps/chosen": -225.41738891601562, + "logps/rejected": -238.00477600097656, + "loss": 0.2159, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8234500885009766, + "rewards/margins": 2.332329750061035, + "rewards/rejected": -1.5088801383972168, + "step": 117 + }, + { + "epoch": 0.02, + "learning_rate": 1.9588235294117648e-05, + "logits/chosen": -2.361898899078369, + "logits/rejected": -2.055607318878174, + "logps/chosen": -289.41802978515625, + "logps/rejected": -242.832275390625, + "loss": 0.2781, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.8082376718521118, + "rewards/margins": 2.2316887378692627, + "rewards/rejected": -1.4234510660171509, + "step": 118 + }, + { + "epoch": 0.02, + "learning_rate": 1.9584033613445378e-05, + "logits/chosen": -2.2593865394592285, + "logits/rejected": -1.8688687086105347, + "logps/chosen": -333.76416015625, + "logps/rejected": -245.79701232910156, + "loss": 0.2647, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.189161777496338, + "rewards/margins": 2.233233690261841, + "rewards/rejected": -1.044072151184082, + "step": 119 + }, + { + "epoch": 0.03, + "learning_rate": 1.957983193277311e-05, + "logits/chosen": -2.0981814861297607, + "logits/rejected": -1.8817328214645386, + "logps/chosen": -328.9977722167969, + "logps/rejected": -331.95281982421875, + "loss": 0.4821, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.7882914543151855, + "rewards/margins": 1.6770789623260498, + "rewards/rejected": -0.8887875080108643, + "step": 120 + }, + { + "epoch": 0.03, + "learning_rate": 1.9575630252100842e-05, + "logits/chosen": -2.294989585876465, + "logits/rejected": -2.0717599391937256, + "logps/chosen": -441.1888427734375, + "logps/rejected": -380.23779296875, + "loss": 0.3929, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.201527714729309, + "rewards/margins": 1.7749658823013306, + "rewards/rejected": -0.5734382271766663, + "step": 121 + }, + { + "epoch": 0.03, + "learning_rate": 1.9571428571428572e-05, + "logits/chosen": -2.1778323650360107, + "logits/rejected": -2.0903797149658203, + "logps/chosen": -267.32635498046875, + "logps/rejected": -308.8648681640625, + "loss": 0.2719, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.7462610602378845, + "rewards/margins": 2.151460647583008, + "rewards/rejected": -1.4051997661590576, + "step": 122 + }, + { + "epoch": 0.03, + "learning_rate": 1.9567226890756303e-05, + "logits/chosen": -1.9987823963165283, + "logits/rejected": -1.9070273637771606, + "logps/chosen": -291.7791442871094, + "logps/rejected": -235.40542602539062, + "loss": 0.7292, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4117959141731262, + "rewards/margins": 0.7061718702316284, + "rewards/rejected": -0.2943759262561798, + "step": 123 + }, + { + "epoch": 0.03, + "learning_rate": 1.9563025210084036e-05, + "logits/chosen": -2.3301608562469482, + "logits/rejected": -2.1610896587371826, + "logps/chosen": -312.8262023925781, + "logps/rejected": -219.18545532226562, + "loss": 0.7426, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.22050249576568604, + "rewards/margins": 1.0287131071090698, + "rewards/rejected": -0.8082106113433838, + "step": 124 + }, + { + "epoch": 0.03, + "learning_rate": 1.9558823529411766e-05, + "logits/chosen": -1.8436377048492432, + "logits/rejected": -2.0876355171203613, + "logps/chosen": -262.98211669921875, + "logps/rejected": -287.9124450683594, + "loss": 0.4098, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.7412158846855164, + "rewards/margins": 2.2547714710235596, + "rewards/rejected": -1.513555645942688, + "step": 125 + }, + { + "epoch": 0.03, + "learning_rate": 1.9554621848739497e-05, + "logits/chosen": -1.9680488109588623, + "logits/rejected": -1.5334417819976807, + "logps/chosen": -293.45220947265625, + "logps/rejected": -337.33306884765625, + "loss": 0.4283, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.4352495074272156, + "rewards/margins": 1.2213841676712036, + "rewards/rejected": -0.786134660243988, + "step": 126 + }, + { + "epoch": 0.03, + "learning_rate": 1.9550420168067227e-05, + "logits/chosen": -2.027254581451416, + "logits/rejected": -1.6008858680725098, + "logps/chosen": -292.0217590332031, + "logps/rejected": -231.9427490234375, + "loss": 0.3687, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3913624882698059, + "rewards/margins": 1.060205340385437, + "rewards/rejected": -0.6688427925109863, + "step": 127 + }, + { + "epoch": 0.03, + "learning_rate": 1.954621848739496e-05, + "logits/chosen": -2.1365368366241455, + "logits/rejected": -2.2989044189453125, + "logps/chosen": -280.9864196777344, + "logps/rejected": -294.4695129394531, + "loss": 0.6608, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.39271318912506104, + "rewards/margins": 1.1895842552185059, + "rewards/rejected": -0.7968710064888, + "step": 128 + }, + { + "epoch": 0.03, + "learning_rate": 1.954201680672269e-05, + "logits/chosen": -2.260463237762451, + "logits/rejected": -1.9508836269378662, + "logps/chosen": -325.3363952636719, + "logps/rejected": -304.3674621582031, + "loss": 0.3539, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.7612236738204956, + "rewards/margins": 2.525355339050293, + "rewards/rejected": -1.764131784439087, + "step": 129 + }, + { + "epoch": 0.03, + "learning_rate": 1.953781512605042e-05, + "logits/chosen": -2.052269697189331, + "logits/rejected": -1.9552733898162842, + "logps/chosen": -293.24993896484375, + "logps/rejected": -256.47119140625, + "loss": 0.5942, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.9430196285247803, + "rewards/margins": 1.513566255569458, + "rewards/rejected": -0.5705466866493225, + "step": 130 + }, + { + "epoch": 0.03, + "learning_rate": 1.953361344537815e-05, + "logits/chosen": -2.144200325012207, + "logits/rejected": -2.2826175689697266, + "logps/chosen": -380.23797607421875, + "logps/rejected": -341.62493896484375, + "loss": 0.4899, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.559058427810669, + "rewards/margins": 2.098733901977539, + "rewards/rejected": -1.539675235748291, + "step": 131 + }, + { + "epoch": 0.03, + "learning_rate": 1.9529411764705885e-05, + "logits/chosen": -2.0861480236053467, + "logits/rejected": -2.084181308746338, + "logps/chosen": -374.23980712890625, + "logps/rejected": -336.87451171875, + "loss": 0.2313, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.7874554395675659, + "rewards/margins": 2.2659218311309814, + "rewards/rejected": -1.478466272354126, + "step": 132 + }, + { + "epoch": 0.03, + "learning_rate": 1.9525210084033615e-05, + "logits/chosen": -2.1682991981506348, + "logits/rejected": -1.839908242225647, + "logps/chosen": -343.1785888671875, + "logps/rejected": -283.29254150390625, + "loss": 0.1942, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4628217816352844, + "rewards/margins": 3.1726136207580566, + "rewards/rejected": -2.709791660308838, + "step": 133 + }, + { + "epoch": 0.03, + "learning_rate": 1.9521008403361345e-05, + "logits/chosen": -2.3428895473480225, + "logits/rejected": -1.9651906490325928, + "logps/chosen": -357.1006164550781, + "logps/rejected": -305.46295166015625, + "loss": 0.4695, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.7344120740890503, + "rewards/margins": 1.314975619316101, + "rewards/rejected": -0.5805636048316956, + "step": 134 + }, + { + "epoch": 0.03, + "learning_rate": 1.9516806722689075e-05, + "logits/chosen": -1.938299536705017, + "logits/rejected": -1.843817949295044, + "logps/chosen": -298.67462158203125, + "logps/rejected": -329.4986267089844, + "loss": 0.5497, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.06151680648326874, + "rewards/margins": 1.017744779586792, + "rewards/rejected": -0.9562280178070068, + "step": 135 + }, + { + "epoch": 0.03, + "learning_rate": 1.951260504201681e-05, + "logits/chosen": -2.382833242416382, + "logits/rejected": -1.7691450119018555, + "logps/chosen": -395.1127014160156, + "logps/rejected": -326.5736389160156, + "loss": 0.2933, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.9504792094230652, + "rewards/margins": 2.197152614593506, + "rewards/rejected": -1.246673345565796, + "step": 136 + }, + { + "epoch": 0.03, + "learning_rate": 1.950840336134454e-05, + "logits/chosen": -2.1813700199127197, + "logits/rejected": -1.7723538875579834, + "logps/chosen": -415.26708984375, + "logps/rejected": -375.0914001464844, + "loss": 0.4077, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.8795511722564697, + "rewards/margins": 2.122403621673584, + "rewards/rejected": -1.2428523302078247, + "step": 137 + }, + { + "epoch": 0.03, + "learning_rate": 1.950420168067227e-05, + "logits/chosen": -2.014397621154785, + "logits/rejected": -1.6715497970581055, + "logps/chosen": -155.23806762695312, + "logps/rejected": -206.61004638671875, + "loss": 0.543, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.4505138397216797, + "rewards/margins": 1.0151722431182861, + "rewards/rejected": -0.5646584033966064, + "step": 138 + }, + { + "epoch": 0.03, + "learning_rate": 1.95e-05, + "logits/chosen": -1.9987766742706299, + "logits/rejected": -1.5939929485321045, + "logps/chosen": -346.7440490722656, + "logps/rejected": -199.17239379882812, + "loss": 0.5008, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6306272745132446, + "rewards/margins": 1.3868716955184937, + "rewards/rejected": -0.7562444806098938, + "step": 139 + }, + { + "epoch": 0.03, + "learning_rate": 1.9495798319327733e-05, + "logits/chosen": -2.088118314743042, + "logits/rejected": -1.984297752380371, + "logps/chosen": -283.76739501953125, + "logps/rejected": -404.423095703125, + "loss": 0.3662, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.7562440037727356, + "rewards/margins": 1.527753472328186, + "rewards/rejected": -0.7715095281600952, + "step": 140 + }, + { + "epoch": 0.03, + "learning_rate": 1.9491596638655463e-05, + "logits/chosen": -1.9548678398132324, + "logits/rejected": -1.7473572492599487, + "logps/chosen": -307.74554443359375, + "logps/rejected": -423.6452331542969, + "loss": 0.2522, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.5310712456703186, + "rewards/margins": 2.706507682800293, + "rewards/rejected": -2.175436496734619, + "step": 141 + }, + { + "epoch": 0.03, + "learning_rate": 1.9487394957983194e-05, + "logits/chosen": -1.915533423423767, + "logits/rejected": -1.689677119255066, + "logps/chosen": -243.7513427734375, + "logps/rejected": -244.32139587402344, + "loss": 0.4161, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.45054006576538086, + "rewards/margins": 1.5065770149230957, + "rewards/rejected": -1.0560369491577148, + "step": 142 + }, + { + "epoch": 0.03, + "learning_rate": 1.9483193277310924e-05, + "logits/chosen": -1.9550589323043823, + "logits/rejected": -2.287087917327881, + "logps/chosen": -168.13516235351562, + "logps/rejected": -239.2342071533203, + "loss": 0.5656, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.7052255868911743, + "rewards/margins": 1.1860077381134033, + "rewards/rejected": -0.4807822108268738, + "step": 143 + }, + { + "epoch": 0.03, + "learning_rate": 1.9478991596638658e-05, + "logits/chosen": -2.001689910888672, + "logits/rejected": -1.7274882793426514, + "logps/chosen": -241.14906311035156, + "logps/rejected": -238.80674743652344, + "loss": 0.4332, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2465830147266388, + "rewards/margins": 1.6028279066085815, + "rewards/rejected": -1.3562450408935547, + "step": 144 + }, + { + "epoch": 0.03, + "learning_rate": 1.9474789915966388e-05, + "logits/chosen": -2.160628318786621, + "logits/rejected": -2.014204978942871, + "logps/chosen": -388.7725830078125, + "logps/rejected": -345.66925048828125, + "loss": 0.2662, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.258589267730713, + "rewards/margins": 1.8814054727554321, + "rewards/rejected": -0.6228160858154297, + "step": 145 + }, + { + "epoch": 0.03, + "learning_rate": 1.9470588235294118e-05, + "logits/chosen": -1.9197282791137695, + "logits/rejected": -1.8468551635742188, + "logps/chosen": -345.3627624511719, + "logps/rejected": -291.37933349609375, + "loss": 0.2447, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.1265082359313965, + "rewards/margins": 2.682868003845215, + "rewards/rejected": -1.5563597679138184, + "step": 146 + }, + { + "epoch": 0.03, + "learning_rate": 1.946638655462185e-05, + "logits/chosen": -2.224470376968384, + "logits/rejected": -1.9717036485671997, + "logps/chosen": -241.99221801757812, + "logps/rejected": -235.1395263671875, + "loss": 0.3608, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.8303925395011902, + "rewards/margins": 1.9417951107025146, + "rewards/rejected": -1.1114026308059692, + "step": 147 + }, + { + "epoch": 0.03, + "learning_rate": 1.9462184873949582e-05, + "logits/chosen": -2.1907153129577637, + "logits/rejected": -1.4065743684768677, + "logps/chosen": -333.9032287597656, + "logps/rejected": -222.32427978515625, + "loss": 0.1965, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.7542409300804138, + "rewards/margins": 2.7903554439544678, + "rewards/rejected": -2.0361146926879883, + "step": 148 + }, + { + "epoch": 0.03, + "learning_rate": 1.9457983193277312e-05, + "logits/chosen": -2.287984609603882, + "logits/rejected": -1.894585132598877, + "logps/chosen": -337.87335205078125, + "logps/rejected": -346.18927001953125, + "loss": 0.1899, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.2012064456939697, + "rewards/margins": 3.105619430541992, + "rewards/rejected": -1.9044129848480225, + "step": 149 + }, + { + "epoch": 0.03, + "learning_rate": 1.9453781512605042e-05, + "logits/chosen": -1.955935001373291, + "logits/rejected": -2.054630994796753, + "logps/chosen": -302.446533203125, + "logps/rejected": -342.5522155761719, + "loss": 0.2561, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.1727423667907715, + "rewards/margins": 2.5674867630004883, + "rewards/rejected": -1.3947443962097168, + "step": 150 + }, + { + "epoch": 0.03, + "learning_rate": 1.9449579831932776e-05, + "logits/chosen": -1.795854091644287, + "logits/rejected": -1.9701902866363525, + "logps/chosen": -198.8837432861328, + "logps/rejected": -320.54412841796875, + "loss": 0.1421, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.45592638850212097, + "rewards/margins": 3.1315865516662598, + "rewards/rejected": -2.6756601333618164, + "step": 151 + }, + { + "epoch": 0.03, + "learning_rate": 1.9445378151260506e-05, + "logits/chosen": -1.774823546409607, + "logits/rejected": -1.9420603513717651, + "logps/chosen": -321.3607177734375, + "logps/rejected": -369.28729248046875, + "loss": 0.211, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.32832881808280945, + "rewards/margins": 2.673469066619873, + "rewards/rejected": -2.345140218734741, + "step": 152 + }, + { + "epoch": 0.03, + "learning_rate": 1.9441176470588236e-05, + "logits/chosen": -1.9179635047912598, + "logits/rejected": -1.5516554117202759, + "logps/chosen": -260.065185546875, + "logps/rejected": -296.73236083984375, + "loss": 0.2608, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.7521446347236633, + "rewards/margins": 3.222820997238159, + "rewards/rejected": -2.4706761837005615, + "step": 153 + }, + { + "epoch": 0.03, + "learning_rate": 1.9436974789915967e-05, + "logits/chosen": -1.9910225868225098, + "logits/rejected": -2.0822975635528564, + "logps/chosen": -267.6091003417969, + "logps/rejected": -328.9667663574219, + "loss": 0.2232, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.23971346020698547, + "rewards/margins": 3.2980337142944336, + "rewards/rejected": -3.0583202838897705, + "step": 154 + }, + { + "epoch": 0.03, + "learning_rate": 1.94327731092437e-05, + "logits/chosen": -2.140634536743164, + "logits/rejected": -1.8932344913482666, + "logps/chosen": -377.14013671875, + "logps/rejected": -335.8614501953125, + "loss": 0.4596, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.05733342468738556, + "rewards/margins": 1.7537956237792969, + "rewards/rejected": -1.811129093170166, + "step": 155 + }, + { + "epoch": 0.03, + "learning_rate": 1.942857142857143e-05, + "logits/chosen": -2.20592999458313, + "logits/rejected": -1.8383092880249023, + "logps/chosen": -354.6993103027344, + "logps/rejected": -219.24322509765625, + "loss": 0.5416, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.15585672855377197, + "rewards/margins": 1.7308855056762695, + "rewards/rejected": -1.8867424726486206, + "step": 156 + }, + { + "epoch": 0.03, + "learning_rate": 1.942436974789916e-05, + "logits/chosen": -2.0593392848968506, + "logits/rejected": -1.82268226146698, + "logps/chosen": -459.42303466796875, + "logps/rejected": -314.2120056152344, + "loss": 0.1246, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0004568099975586, + "rewards/margins": 2.7035768032073975, + "rewards/rejected": -1.7031197547912598, + "step": 157 + }, + { + "epoch": 0.03, + "learning_rate": 1.942016806722689e-05, + "logits/chosen": -2.0842440128326416, + "logits/rejected": -1.7106075286865234, + "logps/chosen": -301.0412902832031, + "logps/rejected": -318.5955810546875, + "loss": 0.1775, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.38655221462249756, + "rewards/margins": 2.980933904647827, + "rewards/rejected": -2.59438157081604, + "step": 158 + }, + { + "epoch": 0.03, + "learning_rate": 1.9415966386554624e-05, + "logits/chosen": -2.2098166942596436, + "logits/rejected": -2.253511905670166, + "logps/chosen": -233.94956970214844, + "logps/rejected": -315.84912109375, + "loss": 0.2124, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.35227689146995544, + "rewards/margins": 3.8535265922546387, + "rewards/rejected": -3.5012497901916504, + "step": 159 + }, + { + "epoch": 0.03, + "learning_rate": 1.9411764705882355e-05, + "logits/chosen": -2.0606324672698975, + "logits/rejected": -2.2401208877563477, + "logps/chosen": -283.51531982421875, + "logps/rejected": -335.6744079589844, + "loss": 0.3731, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.30113106966018677, + "rewards/margins": 2.1955299377441406, + "rewards/rejected": -1.8943989276885986, + "step": 160 + }, + { + "epoch": 0.03, + "learning_rate": 1.9407563025210085e-05, + "logits/chosen": -2.006823778152466, + "logits/rejected": -2.03440523147583, + "logps/chosen": -301.72845458984375, + "logps/rejected": -300.8660888671875, + "loss": 0.3647, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.05930629372596741, + "rewards/margins": 1.845715045928955, + "rewards/rejected": -1.9050211906433105, + "step": 161 + }, + { + "epoch": 0.03, + "learning_rate": 1.9403361344537815e-05, + "logits/chosen": -2.2592272758483887, + "logits/rejected": -1.4900158643722534, + "logps/chosen": -349.850341796875, + "logps/rejected": -237.9735870361328, + "loss": 0.4265, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4505152106285095, + "rewards/margins": 2.180917739868164, + "rewards/rejected": -2.6314330101013184, + "step": 162 + }, + { + "epoch": 0.03, + "learning_rate": 1.939915966386555e-05, + "logits/chosen": -2.138091802597046, + "logits/rejected": -2.0000557899475098, + "logps/chosen": -337.5413818359375, + "logps/rejected": -281.70013427734375, + "loss": 0.2175, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1180100366473198, + "rewards/margins": 2.639892578125, + "rewards/rejected": -2.5218825340270996, + "step": 163 + }, + { + "epoch": 0.03, + "learning_rate": 1.939495798319328e-05, + "logits/chosen": -2.351864814758301, + "logits/rejected": -1.7518774271011353, + "logps/chosen": -305.2463073730469, + "logps/rejected": -265.68487548828125, + "loss": 0.5453, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.15149277448654175, + "rewards/margins": 1.4267189502716064, + "rewards/rejected": -1.27522611618042, + "step": 164 + }, + { + "epoch": 0.03, + "learning_rate": 1.939075630252101e-05, + "logits/chosen": -2.159843683242798, + "logits/rejected": -1.8581840991973877, + "logps/chosen": -313.42755126953125, + "logps/rejected": -305.45281982421875, + "loss": 0.275, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.12428408861160278, + "rewards/margins": 3.574671506881714, + "rewards/rejected": -3.4503870010375977, + "step": 165 + }, + { + "epoch": 0.03, + "learning_rate": 1.9386554621848743e-05, + "logits/chosen": -2.1018590927124023, + "logits/rejected": -1.6583375930786133, + "logps/chosen": -251.76638793945312, + "logps/rejected": -222.0404052734375, + "loss": 0.2708, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.7818334698677063, + "rewards/margins": 3.106484889984131, + "rewards/rejected": -2.3246514797210693, + "step": 166 + }, + { + "epoch": 0.03, + "learning_rate": 1.9382352941176473e-05, + "logits/chosen": -2.204774856567383, + "logits/rejected": -2.085190773010254, + "logps/chosen": -310.175537109375, + "logps/rejected": -311.9328308105469, + "loss": 0.5247, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6363217830657959, + "rewards/margins": 2.2235963344573975, + "rewards/rejected": -1.5872745513916016, + "step": 167 + }, + { + "epoch": 0.04, + "learning_rate": 1.9378151260504203e-05, + "logits/chosen": -2.0087029933929443, + "logits/rejected": -1.9225444793701172, + "logps/chosen": -357.4611511230469, + "logps/rejected": -365.505615234375, + "loss": 0.2371, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.7516674995422363, + "rewards/margins": 2.9219133853912354, + "rewards/rejected": -2.170245885848999, + "step": 168 + }, + { + "epoch": 0.04, + "learning_rate": 1.9373949579831933e-05, + "logits/chosen": -2.308570623397827, + "logits/rejected": -1.8169952630996704, + "logps/chosen": -402.0672912597656, + "logps/rejected": -297.09906005859375, + "loss": 0.3505, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.7763336896896362, + "rewards/margins": 3.253223419189453, + "rewards/rejected": -2.4768898487091064, + "step": 169 + }, + { + "epoch": 0.04, + "learning_rate": 1.9369747899159667e-05, + "logits/chosen": -1.8160821199417114, + "logits/rejected": -2.0576789379119873, + "logps/chosen": -231.24964904785156, + "logps/rejected": -230.78431701660156, + "loss": 0.4406, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.29919058084487915, + "rewards/margins": 1.8738502264022827, + "rewards/rejected": -1.5746595859527588, + "step": 170 + }, + { + "epoch": 0.04, + "learning_rate": 1.9365546218487397e-05, + "logits/chosen": -2.0465896129608154, + "logits/rejected": -1.9130158424377441, + "logps/chosen": -308.604736328125, + "logps/rejected": -290.31341552734375, + "loss": 0.4036, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19760461151599884, + "rewards/margins": 2.215013265609741, + "rewards/rejected": -2.4126176834106445, + "step": 171 + }, + { + "epoch": 0.04, + "learning_rate": 1.9361344537815127e-05, + "logits/chosen": -2.220503568649292, + "logits/rejected": -1.9116710424423218, + "logps/chosen": -374.86773681640625, + "logps/rejected": -367.9362487792969, + "loss": 0.7568, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.501358151435852, + "rewards/margins": 1.8529269695281982, + "rewards/rejected": -2.3542847633361816, + "step": 172 + }, + { + "epoch": 0.04, + "learning_rate": 1.9357142857142858e-05, + "logits/chosen": -2.1519575119018555, + "logits/rejected": -2.108215808868408, + "logps/chosen": -273.619873046875, + "logps/rejected": -299.34429931640625, + "loss": 0.2548, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.659705400466919, + "rewards/margins": 2.3766002655029297, + "rewards/rejected": -1.7168949842453003, + "step": 173 + }, + { + "epoch": 0.04, + "learning_rate": 1.935294117647059e-05, + "logits/chosen": -2.099363327026367, + "logits/rejected": -1.8638235330581665, + "logps/chosen": -266.3851013183594, + "logps/rejected": -314.7110595703125, + "loss": 0.36, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.42388927936553955, + "rewards/margins": 2.994227886199951, + "rewards/rejected": -3.4181172847747803, + "step": 174 + }, + { + "epoch": 0.04, + "learning_rate": 1.934873949579832e-05, + "logits/chosen": -2.004194498062134, + "logits/rejected": -2.1080429553985596, + "logps/chosen": -271.04913330078125, + "logps/rejected": -303.30810546875, + "loss": 0.427, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.09644299745559692, + "rewards/margins": 1.8110917806625366, + "rewards/rejected": -1.714648723602295, + "step": 175 + }, + { + "epoch": 0.04, + "learning_rate": 1.9344537815126052e-05, + "logits/chosen": -1.9271230697631836, + "logits/rejected": -1.4533002376556396, + "logps/chosen": -379.56158447265625, + "logps/rejected": -251.24574279785156, + "loss": 0.3061, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.7267810106277466, + "rewards/margins": 1.7747788429260254, + "rewards/rejected": -1.0479978322982788, + "step": 176 + }, + { + "epoch": 0.04, + "learning_rate": 1.9340336134453782e-05, + "logits/chosen": -1.7387003898620605, + "logits/rejected": -1.6673742532730103, + "logps/chosen": -319.44677734375, + "logps/rejected": -328.6859130859375, + "loss": 0.2548, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.13148075342178345, + "rewards/margins": 2.836094856262207, + "rewards/rejected": -2.7046141624450684, + "step": 177 + }, + { + "epoch": 0.04, + "learning_rate": 1.9336134453781516e-05, + "logits/chosen": -2.1485331058502197, + "logits/rejected": -1.7516132593154907, + "logps/chosen": -312.14935302734375, + "logps/rejected": -263.0650634765625, + "loss": 0.3042, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.6039855480194092, + "rewards/margins": 2.0820467472076416, + "rewards/rejected": -1.4780610799789429, + "step": 178 + }, + { + "epoch": 0.04, + "learning_rate": 1.9331932773109246e-05, + "logits/chosen": -2.356072425842285, + "logits/rejected": -1.7140439748764038, + "logps/chosen": -474.1767272949219, + "logps/rejected": -322.3995056152344, + "loss": 0.481, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.0852775573730469, + "rewards/margins": 2.305659770965576, + "rewards/rejected": -1.2203824520111084, + "step": 179 + }, + { + "epoch": 0.04, + "learning_rate": 1.9327731092436976e-05, + "logits/chosen": -2.0827455520629883, + "logits/rejected": -1.9543653726577759, + "logps/chosen": -329.56024169921875, + "logps/rejected": -366.47454833984375, + "loss": 0.3702, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2157536894083023, + "rewards/margins": 2.586594343185425, + "rewards/rejected": -2.370840549468994, + "step": 180 + }, + { + "epoch": 0.04, + "learning_rate": 1.9323529411764706e-05, + "logits/chosen": -2.1309521198272705, + "logits/rejected": -2.09285044670105, + "logps/chosen": -197.16836547851562, + "logps/rejected": -216.26132202148438, + "loss": 0.4309, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6182193160057068, + "rewards/margins": 1.49525785446167, + "rewards/rejected": -0.8770386576652527, + "step": 181 + }, + { + "epoch": 0.04, + "learning_rate": 1.931932773109244e-05, + "logits/chosen": -1.9410741329193115, + "logits/rejected": -2.0862960815429688, + "logps/chosen": -191.31874084472656, + "logps/rejected": -267.4620056152344, + "loss": 0.5695, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.24719244241714478, + "rewards/margins": 2.0659613609313965, + "rewards/rejected": -2.3131542205810547, + "step": 182 + }, + { + "epoch": 0.04, + "learning_rate": 1.931512605042017e-05, + "logits/chosen": -1.8501081466674805, + "logits/rejected": -1.988316535949707, + "logps/chosen": -253.83676147460938, + "logps/rejected": -322.62493896484375, + "loss": 0.8646, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.08060411363840103, + "rewards/margins": 0.7999259829521179, + "rewards/rejected": -0.7193217873573303, + "step": 183 + }, + { + "epoch": 0.04, + "learning_rate": 1.93109243697479e-05, + "logits/chosen": -2.361492872238159, + "logits/rejected": -1.6896589994430542, + "logps/chosen": -373.93548583984375, + "logps/rejected": -351.4352722167969, + "loss": 0.2221, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.030282974243164, + "rewards/margins": 3.338300943374634, + "rewards/rejected": -2.3080177307128906, + "step": 184 + }, + { + "epoch": 0.04, + "learning_rate": 1.930672268907563e-05, + "logits/chosen": -1.9699373245239258, + "logits/rejected": -1.8167184591293335, + "logps/chosen": -260.5339050292969, + "logps/rejected": -273.19976806640625, + "loss": 0.4921, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6234461665153503, + "rewards/margins": 1.8294168710708618, + "rewards/rejected": -1.2059706449508667, + "step": 185 + }, + { + "epoch": 0.04, + "learning_rate": 1.9302521008403364e-05, + "logits/chosen": -2.2624993324279785, + "logits/rejected": -2.042703628540039, + "logps/chosen": -354.2139892578125, + "logps/rejected": -319.7133483886719, + "loss": 0.5529, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5018466711044312, + "rewards/margins": 2.1002578735351562, + "rewards/rejected": -1.598411202430725, + "step": 186 + }, + { + "epoch": 0.04, + "learning_rate": 1.9298319327731094e-05, + "logits/chosen": -1.943152904510498, + "logits/rejected": -2.1798253059387207, + "logps/chosen": -260.026611328125, + "logps/rejected": -357.4902648925781, + "loss": 0.4278, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.5993878245353699, + "rewards/margins": 2.099064826965332, + "rewards/rejected": -1.4996771812438965, + "step": 187 + }, + { + "epoch": 0.04, + "learning_rate": 1.9294117647058825e-05, + "logits/chosen": -2.321877956390381, + "logits/rejected": -1.9957356452941895, + "logps/chosen": -267.6776123046875, + "logps/rejected": -274.17572021484375, + "loss": 0.2828, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.015132449567317963, + "rewards/margins": 2.1086621284484863, + "rewards/rejected": -2.09352970123291, + "step": 188 + }, + { + "epoch": 0.04, + "learning_rate": 1.9289915966386558e-05, + "logits/chosen": -2.2177798748016357, + "logits/rejected": -1.7982139587402344, + "logps/chosen": -389.71221923828125, + "logps/rejected": -341.09716796875, + "loss": 0.3212, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.5387109518051147, + "rewards/margins": 2.2304818630218506, + "rewards/rejected": -1.6917710304260254, + "step": 189 + }, + { + "epoch": 0.04, + "learning_rate": 1.928571428571429e-05, + "logits/chosen": -2.0826494693756104, + "logits/rejected": -1.9526170492172241, + "logps/chosen": -244.26217651367188, + "logps/rejected": -226.49497985839844, + "loss": 0.686, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.2411191165447235, + "rewards/margins": 1.1531486511230469, + "rewards/rejected": -1.3942676782608032, + "step": 190 + }, + { + "epoch": 0.04, + "learning_rate": 1.928151260504202e-05, + "logits/chosen": -2.1943392753601074, + "logits/rejected": -2.0047335624694824, + "logps/chosen": -322.46368408203125, + "logps/rejected": -266.3966064453125, + "loss": 0.4314, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.7097187638282776, + "rewards/margins": 1.5068656206130981, + "rewards/rejected": -0.7971468567848206, + "step": 191 + }, + { + "epoch": 0.04, + "learning_rate": 1.927731092436975e-05, + "logits/chosen": -2.4108781814575195, + "logits/rejected": -2.2714500427246094, + "logps/chosen": -239.07113647460938, + "logps/rejected": -287.70806884765625, + "loss": 0.265, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.4333779811859131, + "rewards/margins": 2.222304344177246, + "rewards/rejected": -1.788926362991333, + "step": 192 + }, + { + "epoch": 0.04, + "learning_rate": 1.9273109243697482e-05, + "logits/chosen": -2.0882983207702637, + "logits/rejected": -1.599724531173706, + "logps/chosen": -306.90264892578125, + "logps/rejected": -306.5920104980469, + "loss": 0.4409, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.35444700717926025, + "rewards/margins": 3.6877753734588623, + "rewards/rejected": -3.3333282470703125, + "step": 193 + }, + { + "epoch": 0.04, + "learning_rate": 1.9268907563025213e-05, + "logits/chosen": -1.5826417207717896, + "logits/rejected": -1.6908752918243408, + "logps/chosen": -213.50213623046875, + "logps/rejected": -323.8135986328125, + "loss": 0.2619, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4410989284515381, + "rewards/margins": 3.3193366527557373, + "rewards/rejected": -2.878237724304199, + "step": 194 + }, + { + "epoch": 0.04, + "learning_rate": 1.9264705882352943e-05, + "logits/chosen": -2.1554768085479736, + "logits/rejected": -1.9104681015014648, + "logps/chosen": -263.67022705078125, + "logps/rejected": -263.89251708984375, + "loss": 0.3077, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4525309205055237, + "rewards/margins": 2.7871453762054443, + "rewards/rejected": -2.3346142768859863, + "step": 195 + }, + { + "epoch": 0.04, + "learning_rate": 1.9260504201680673e-05, + "logits/chosen": -1.8429673910140991, + "logits/rejected": -1.953737735748291, + "logps/chosen": -217.81649780273438, + "logps/rejected": -239.5341339111328, + "loss": 0.0909, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3554931581020355, + "rewards/margins": 3.4962148666381836, + "rewards/rejected": -3.1407217979431152, + "step": 196 + }, + { + "epoch": 0.04, + "learning_rate": 1.9256302521008407e-05, + "logits/chosen": -2.090711832046509, + "logits/rejected": -1.7367603778839111, + "logps/chosen": -295.37701416015625, + "logps/rejected": -233.35989379882812, + "loss": 0.5899, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.28794872760772705, + "rewards/margins": 1.9939861297607422, + "rewards/rejected": -1.7060374021530151, + "step": 197 + }, + { + "epoch": 0.04, + "learning_rate": 1.9252100840336137e-05, + "logits/chosen": -2.216714859008789, + "logits/rejected": -1.3142722845077515, + "logps/chosen": -379.0946350097656, + "logps/rejected": -216.58425903320312, + "loss": 0.5711, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.676600456237793, + "rewards/margins": 1.7255703210830688, + "rewards/rejected": -1.0489698648452759, + "step": 198 + }, + { + "epoch": 0.04, + "learning_rate": 1.9247899159663867e-05, + "logits/chosen": -1.886040210723877, + "logits/rejected": -2.2005977630615234, + "logps/chosen": -220.26943969726562, + "logps/rejected": -333.1991271972656, + "loss": 0.4988, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.5622369050979614, + "rewards/margins": 2.4243102073669434, + "rewards/rejected": -1.862073302268982, + "step": 199 + }, + { + "epoch": 0.04, + "learning_rate": 1.9243697478991597e-05, + "logits/chosen": -2.152661085128784, + "logits/rejected": -1.7738873958587646, + "logps/chosen": -231.73829650878906, + "logps/rejected": -238.894287109375, + "loss": 0.6994, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.24995756149291992, + "rewards/margins": 1.5514318943023682, + "rewards/rejected": -1.3014743328094482, + "step": 200 + }, + { + "epoch": 0.04, + "learning_rate": 1.923949579831933e-05, + "logits/chosen": -1.9182398319244385, + "logits/rejected": -1.8288938999176025, + "logps/chosen": -412.51019287109375, + "logps/rejected": -310.742919921875, + "loss": 0.5176, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2572101950645447, + "rewards/margins": 2.264309883117676, + "rewards/rejected": -2.0070996284484863, + "step": 201 + }, + { + "epoch": 0.04, + "learning_rate": 1.923529411764706e-05, + "logits/chosen": -2.0700290203094482, + "logits/rejected": -1.8612308502197266, + "logps/chosen": -325.8376159667969, + "logps/rejected": -348.48760986328125, + "loss": 0.2201, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3816840350627899, + "rewards/margins": 2.883958578109741, + "rewards/rejected": -2.502274513244629, + "step": 202 + }, + { + "epoch": 0.04, + "learning_rate": 1.923109243697479e-05, + "logits/chosen": -2.0668513774871826, + "logits/rejected": -1.7440872192382812, + "logps/chosen": -304.94110107421875, + "logps/rejected": -294.171875, + "loss": 0.6027, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6673010587692261, + "rewards/margins": 2.2133984565734863, + "rewards/rejected": -1.5460972785949707, + "step": 203 + }, + { + "epoch": 0.04, + "learning_rate": 1.922689075630252e-05, + "logits/chosen": -2.1958768367767334, + "logits/rejected": -1.6824707984924316, + "logps/chosen": -287.90673828125, + "logps/rejected": -251.39566040039062, + "loss": 0.5216, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.2339067459106445, + "rewards/margins": 1.9839694499969482, + "rewards/rejected": -0.7500627040863037, + "step": 204 + }, + { + "epoch": 0.04, + "learning_rate": 1.9222689075630255e-05, + "logits/chosen": -1.7814674377441406, + "logits/rejected": -1.9055691957473755, + "logps/chosen": -212.4736328125, + "logps/rejected": -259.2861022949219, + "loss": 0.5105, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.6588839888572693, + "rewards/margins": 1.3305463790893555, + "rewards/rejected": -0.6716625094413757, + "step": 205 + }, + { + "epoch": 0.04, + "learning_rate": 1.9218487394957985e-05, + "logits/chosen": -2.300837278366089, + "logits/rejected": -1.6157662868499756, + "logps/chosen": -377.23388671875, + "logps/rejected": -244.01519775390625, + "loss": 0.5409, + "rewards/accuracies": 0.625, + "rewards/chosen": 1.2487529516220093, + "rewards/margins": 1.6235289573669434, + "rewards/rejected": -0.3747760057449341, + "step": 206 + }, + { + "epoch": 0.04, + "learning_rate": 1.9214285714285716e-05, + "logits/chosen": -2.1403493881225586, + "logits/rejected": -1.8367571830749512, + "logps/chosen": -359.39410400390625, + "logps/rejected": -320.2593994140625, + "loss": 0.2947, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.9880273938179016, + "rewards/margins": 2.1673243045806885, + "rewards/rejected": -1.1792969703674316, + "step": 207 + }, + { + "epoch": 0.04, + "learning_rate": 1.9210084033613446e-05, + "logits/chosen": -2.1622567176818848, + "logits/rejected": -1.9735891819000244, + "logps/chosen": -348.40472412109375, + "logps/rejected": -389.960205078125, + "loss": 0.5641, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.7891466617584229, + "rewards/margins": 1.1746538877487183, + "rewards/rejected": -0.38550716638565063, + "step": 208 + }, + { + "epoch": 0.04, + "learning_rate": 1.920588235294118e-05, + "logits/chosen": -2.1938869953155518, + "logits/rejected": -1.7905843257904053, + "logps/chosen": -353.5918273925781, + "logps/rejected": -348.0268859863281, + "loss": 0.5327, + "rewards/accuracies": 0.6875, + "rewards/chosen": 1.1191864013671875, + "rewards/margins": 1.3848307132720947, + "rewards/rejected": -0.26564449071884155, + "step": 209 + }, + { + "epoch": 0.04, + "learning_rate": 1.920168067226891e-05, + "logits/chosen": -2.2892205715179443, + "logits/rejected": -2.0363335609436035, + "logps/chosen": -300.6004638671875, + "logps/rejected": -274.6438903808594, + "loss": 0.2904, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.7398238182067871, + "rewards/margins": 1.7600440979003906, + "rewards/rejected": -1.020220160484314, + "step": 210 + }, + { + "epoch": 0.04, + "learning_rate": 1.919747899159664e-05, + "logits/chosen": -2.097750186920166, + "logits/rejected": -1.3871397972106934, + "logps/chosen": -276.9554443359375, + "logps/rejected": -232.28392028808594, + "loss": 0.2275, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.6810877323150635, + "rewards/margins": 2.238926887512207, + "rewards/rejected": -1.5578391551971436, + "step": 211 + }, + { + "epoch": 0.04, + "learning_rate": 1.9193277310924374e-05, + "logits/chosen": -2.3199985027313232, + "logits/rejected": -1.8059470653533936, + "logps/chosen": -392.4477233886719, + "logps/rejected": -366.4036560058594, + "loss": 0.2247, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.8302945494651794, + "rewards/margins": 2.384056806564331, + "rewards/rejected": -1.5537623167037964, + "step": 212 + }, + { + "epoch": 0.04, + "learning_rate": 1.9189075630252104e-05, + "logits/chosen": -1.8848183155059814, + "logits/rejected": -1.9005846977233887, + "logps/chosen": -222.84994506835938, + "logps/rejected": -350.2418518066406, + "loss": 0.5068, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.389511376619339, + "rewards/margins": 1.5237209796905518, + "rewards/rejected": -1.1342095136642456, + "step": 213 + }, + { + "epoch": 0.04, + "learning_rate": 1.9184873949579834e-05, + "logits/chosen": -1.7811338901519775, + "logits/rejected": -1.7196321487426758, + "logps/chosen": -317.6466979980469, + "logps/rejected": -279.0945739746094, + "loss": 0.2893, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.8474152088165283, + "rewards/margins": 1.9977939128875732, + "rewards/rejected": -1.1503784656524658, + "step": 214 + }, + { + "epoch": 0.04, + "learning_rate": 1.9180672268907564e-05, + "logits/chosen": -2.217418909072876, + "logits/rejected": -2.235569715499878, + "logps/chosen": -209.42669677734375, + "logps/rejected": -303.7484436035156, + "loss": 0.3552, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.4288293719291687, + "rewards/margins": 1.933406114578247, + "rewards/rejected": -1.5045769214630127, + "step": 215 + }, + { + "epoch": 0.05, + "learning_rate": 1.9176470588235298e-05, + "logits/chosen": -2.2915008068084717, + "logits/rejected": -2.000013828277588, + "logps/chosen": -305.03656005859375, + "logps/rejected": -240.19345092773438, + "loss": 0.6803, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.02194342017173767, + "rewards/margins": 1.5948190689086914, + "rewards/rejected": -1.6167625188827515, + "step": 216 + }, + { + "epoch": 0.05, + "learning_rate": 1.9172268907563028e-05, + "logits/chosen": -2.33363676071167, + "logits/rejected": -2.2526702880859375, + "logps/chosen": -283.939697265625, + "logps/rejected": -242.2597198486328, + "loss": 0.3365, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.585136890411377, + "rewards/margins": 2.0768470764160156, + "rewards/rejected": -1.4917103052139282, + "step": 217 + }, + { + "epoch": 0.05, + "learning_rate": 1.9168067226890758e-05, + "logits/chosen": -1.9670711755752563, + "logits/rejected": -1.3943191766738892, + "logps/chosen": -300.0361328125, + "logps/rejected": -274.5738525390625, + "loss": 0.4034, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.20370739698410034, + "rewards/margins": 1.436436653137207, + "rewards/rejected": -1.232729196548462, + "step": 218 + }, + { + "epoch": 0.05, + "learning_rate": 1.916386554621849e-05, + "logits/chosen": -1.8724455833435059, + "logits/rejected": -1.7746968269348145, + "logps/chosen": -256.06634521484375, + "logps/rejected": -269.5330810546875, + "loss": 0.3588, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.3698304295539856, + "rewards/margins": 1.6467514038085938, + "rewards/rejected": -1.2769211530685425, + "step": 219 + }, + { + "epoch": 0.05, + "learning_rate": 1.9159663865546222e-05, + "logits/chosen": -1.8914530277252197, + "logits/rejected": -1.9324146509170532, + "logps/chosen": -282.6627502441406, + "logps/rejected": -291.2701721191406, + "loss": 0.1739, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.34912729263305664, + "rewards/margins": 3.305083751678467, + "rewards/rejected": -2.95595645904541, + "step": 220 + }, + { + "epoch": 0.05, + "learning_rate": 1.9155462184873952e-05, + "logits/chosen": -2.0394859313964844, + "logits/rejected": -1.800555944442749, + "logps/chosen": -289.435791015625, + "logps/rejected": -235.32615661621094, + "loss": 0.5441, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.4788973033428192, + "rewards/margins": 1.236483097076416, + "rewards/rejected": -0.7575857639312744, + "step": 221 + }, + { + "epoch": 0.05, + "learning_rate": 1.9151260504201683e-05, + "logits/chosen": -2.2122366428375244, + "logits/rejected": -2.196902275085449, + "logps/chosen": -299.8267517089844, + "logps/rejected": -268.9872741699219, + "loss": 0.4434, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1474180668592453, + "rewards/margins": 1.3896268606185913, + "rewards/rejected": -1.24220871925354, + "step": 222 + }, + { + "epoch": 0.05, + "learning_rate": 1.9147058823529413e-05, + "logits/chosen": -2.2167115211486816, + "logits/rejected": -2.136651039123535, + "logps/chosen": -445.08013916015625, + "logps/rejected": -368.6702880859375, + "loss": 0.271, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4029645323753357, + "rewards/margins": 2.2742044925689697, + "rewards/rejected": -1.8712400197982788, + "step": 223 + }, + { + "epoch": 0.05, + "learning_rate": 1.9142857142857146e-05, + "logits/chosen": -2.292498826980591, + "logits/rejected": -1.9991729259490967, + "logps/chosen": -290.1639404296875, + "logps/rejected": -351.75665283203125, + "loss": 0.176, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6641996502876282, + "rewards/margins": 3.4761962890625, + "rewards/rejected": -2.8119964599609375, + "step": 224 + }, + { + "epoch": 0.05, + "learning_rate": 1.9138655462184877e-05, + "logits/chosen": -2.0794904232025146, + "logits/rejected": -2.17048978805542, + "logps/chosen": -167.24432373046875, + "logps/rejected": -274.07122802734375, + "loss": 0.204, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.022808387875556946, + "rewards/margins": 3.534047842025757, + "rewards/rejected": -3.556856155395508, + "step": 225 + }, + { + "epoch": 0.05, + "learning_rate": 1.9134453781512607e-05, + "logits/chosen": -1.8843648433685303, + "logits/rejected": -2.0126261711120605, + "logps/chosen": -180.7335968017578, + "logps/rejected": -201.5263671875, + "loss": 0.4142, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.1616390347480774, + "rewards/margins": 1.5557749271392822, + "rewards/rejected": -1.7174140214920044, + "step": 226 + }, + { + "epoch": 0.05, + "learning_rate": 1.9130252100840337e-05, + "logits/chosen": -1.9826205968856812, + "logits/rejected": -1.7967023849487305, + "logps/chosen": -349.6240234375, + "logps/rejected": -337.2738037109375, + "loss": 0.2759, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.321922242641449, + "rewards/margins": 2.146087646484375, + "rewards/rejected": -1.8241654634475708, + "step": 227 + }, + { + "epoch": 0.05, + "learning_rate": 1.912605042016807e-05, + "logits/chosen": -2.325867176055908, + "logits/rejected": -2.057299852371216, + "logps/chosen": -391.8019714355469, + "logps/rejected": -323.2319641113281, + "loss": 0.1588, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3355216681957245, + "rewards/margins": 3.1053762435913086, + "rewards/rejected": -2.7698545455932617, + "step": 228 + }, + { + "epoch": 0.05, + "learning_rate": 1.91218487394958e-05, + "logits/chosen": -2.1116814613342285, + "logits/rejected": -1.9139583110809326, + "logps/chosen": -378.53472900390625, + "logps/rejected": -291.7130126953125, + "loss": 0.6155, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.35140061378479004, + "rewards/margins": 2.5469369888305664, + "rewards/rejected": -2.8983376026153564, + "step": 229 + }, + { + "epoch": 0.05, + "learning_rate": 1.911764705882353e-05, + "logits/chosen": -2.312340259552002, + "logits/rejected": -2.018899440765381, + "logps/chosen": -227.06011962890625, + "logps/rejected": -285.7034606933594, + "loss": 0.4459, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.29365068674087524, + "rewards/margins": 2.335214614868164, + "rewards/rejected": -2.6288652420043945, + "step": 230 + }, + { + "epoch": 0.05, + "learning_rate": 1.911344537815126e-05, + "logits/chosen": -2.029402017593384, + "logits/rejected": -1.6517996788024902, + "logps/chosen": -214.90447998046875, + "logps/rejected": -220.51390075683594, + "loss": 0.1437, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.18353751301765442, + "rewards/margins": 3.010864496231079, + "rewards/rejected": -2.827326774597168, + "step": 231 + }, + { + "epoch": 0.05, + "learning_rate": 1.9109243697478995e-05, + "logits/chosen": -2.1900229454040527, + "logits/rejected": -2.079422950744629, + "logps/chosen": -266.41741943359375, + "logps/rejected": -246.25958251953125, + "loss": 0.2645, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.3034582734107971, + "rewards/margins": 3.1612443923950195, + "rewards/rejected": -3.46470308303833, + "step": 232 + }, + { + "epoch": 0.05, + "learning_rate": 1.9105042016806725e-05, + "logits/chosen": -2.1312716007232666, + "logits/rejected": -2.0581488609313965, + "logps/chosen": -330.78875732421875, + "logps/rejected": -368.5685119628906, + "loss": 0.4995, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.09492186456918716, + "rewards/margins": 3.064556121826172, + "rewards/rejected": -2.9696342945098877, + "step": 233 + }, + { + "epoch": 0.05, + "learning_rate": 1.9100840336134455e-05, + "logits/chosen": -2.068007469177246, + "logits/rejected": -1.6838291883468628, + "logps/chosen": -257.7823181152344, + "logps/rejected": -202.9130859375, + "loss": 1.0677, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6025550365447998, + "rewards/margins": 0.6695663928985596, + "rewards/rejected": -1.272121548652649, + "step": 234 + }, + { + "epoch": 0.05, + "learning_rate": 1.909663865546219e-05, + "logits/chosen": -1.8926564455032349, + "logits/rejected": -1.7910258769989014, + "logps/chosen": -267.2583312988281, + "logps/rejected": -335.667236328125, + "loss": 0.1903, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.09820468723773956, + "rewards/margins": 2.3982319831848145, + "rewards/rejected": -2.496436595916748, + "step": 235 + }, + { + "epoch": 0.05, + "learning_rate": 1.909243697478992e-05, + "logits/chosen": -2.1656434535980225, + "logits/rejected": -1.5117740631103516, + "logps/chosen": -323.4513854980469, + "logps/rejected": -210.25413513183594, + "loss": 0.2304, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.12024146318435669, + "rewards/margins": 2.3969602584838867, + "rewards/rejected": -2.2767186164855957, + "step": 236 + }, + { + "epoch": 0.05, + "learning_rate": 1.908823529411765e-05, + "logits/chosen": -2.2615199089050293, + "logits/rejected": -2.3321115970611572, + "logps/chosen": -391.67791748046875, + "logps/rejected": -341.98870849609375, + "loss": 0.4762, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.07293607294559479, + "rewards/margins": 2.3192949295043945, + "rewards/rejected": -2.246358871459961, + "step": 237 + }, + { + "epoch": 0.05, + "learning_rate": 1.908403361344538e-05, + "logits/chosen": -2.0217580795288086, + "logits/rejected": -2.2776317596435547, + "logps/chosen": -242.20521545410156, + "logps/rejected": -317.45501708984375, + "loss": 0.5189, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.13696017861366272, + "rewards/margins": 1.7944462299346924, + "rewards/rejected": -1.6574859619140625, + "step": 238 + }, + { + "epoch": 0.05, + "learning_rate": 1.9079831932773113e-05, + "logits/chosen": -1.7655162811279297, + "logits/rejected": -1.946895956993103, + "logps/chosen": -250.6832733154297, + "logps/rejected": -407.4606628417969, + "loss": 0.3513, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.013535760343074799, + "rewards/margins": 3.0488224029541016, + "rewards/rejected": -3.0623579025268555, + "step": 239 + }, + { + "epoch": 0.05, + "learning_rate": 1.9075630252100844e-05, + "logits/chosen": -2.2607085704803467, + "logits/rejected": -2.2937917709350586, + "logps/chosen": -302.1427001953125, + "logps/rejected": -371.2340087890625, + "loss": 0.2655, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.45845815539360046, + "rewards/margins": 2.677032232284546, + "rewards/rejected": -2.218574047088623, + "step": 240 + }, + { + "epoch": 0.05, + "learning_rate": 1.9071428571428574e-05, + "logits/chosen": -2.1637423038482666, + "logits/rejected": -1.8416534662246704, + "logps/chosen": -308.8120422363281, + "logps/rejected": -242.09458923339844, + "loss": 0.3804, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.37554559111595154, + "rewards/margins": 1.8725324869155884, + "rewards/rejected": -2.2480781078338623, + "step": 241 + }, + { + "epoch": 0.05, + "learning_rate": 1.9067226890756304e-05, + "logits/chosen": -2.151689052581787, + "logits/rejected": -2.1586365699768066, + "logps/chosen": -376.4841003417969, + "logps/rejected": -369.2525634765625, + "loss": 0.2161, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.03918618708848953, + "rewards/margins": 3.1665728092193604, + "rewards/rejected": -3.205759048461914, + "step": 242 + }, + { + "epoch": 0.05, + "learning_rate": 1.9063025210084038e-05, + "logits/chosen": -1.8786776065826416, + "logits/rejected": -1.9437311887741089, + "logps/chosen": -249.69354248046875, + "logps/rejected": -200.59603881835938, + "loss": 0.4053, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7313644289970398, + "rewards/margins": 1.9353132247924805, + "rewards/rejected": -2.666677474975586, + "step": 243 + }, + { + "epoch": 0.05, + "learning_rate": 1.9058823529411764e-05, + "logits/chosen": -2.0630462169647217, + "logits/rejected": -1.9633992910385132, + "logps/chosen": -329.28411865234375, + "logps/rejected": -286.7142639160156, + "loss": 0.5749, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.43454915285110474, + "rewards/margins": 1.681921362876892, + "rewards/rejected": -2.1164703369140625, + "step": 244 + }, + { + "epoch": 0.05, + "learning_rate": 1.9054621848739495e-05, + "logits/chosen": -2.1442770957946777, + "logits/rejected": -1.7254831790924072, + "logps/chosen": -375.1960754394531, + "logps/rejected": -285.4134521484375, + "loss": 0.3865, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.5157605409622192, + "rewards/margins": 2.7622878551483154, + "rewards/rejected": -3.278048515319824, + "step": 245 + }, + { + "epoch": 0.05, + "learning_rate": 1.9050420168067228e-05, + "logits/chosen": -1.7018543481826782, + "logits/rejected": -2.0866665840148926, + "logps/chosen": -331.94183349609375, + "logps/rejected": -319.7412109375, + "loss": 0.4157, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6551017761230469, + "rewards/margins": 2.117560625076294, + "rewards/rejected": -2.7726621627807617, + "step": 246 + }, + { + "epoch": 0.05, + "learning_rate": 1.904621848739496e-05, + "logits/chosen": -2.163576126098633, + "logits/rejected": -1.554231882095337, + "logps/chosen": -452.3487548828125, + "logps/rejected": -313.6382751464844, + "loss": 0.4994, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.30681225657463074, + "rewards/margins": 2.259860038757324, + "rewards/rejected": -2.5666723251342773, + "step": 247 + }, + { + "epoch": 0.05, + "learning_rate": 1.904201680672269e-05, + "logits/chosen": -2.1186232566833496, + "logits/rejected": -2.1142830848693848, + "logps/chosen": -409.2235107421875, + "logps/rejected": -347.56829833984375, + "loss": 0.2788, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06957337260246277, + "rewards/margins": 1.7370926141738892, + "rewards/rejected": -1.8066661357879639, + "step": 248 + }, + { + "epoch": 0.05, + "learning_rate": 1.9037815126050422e-05, + "logits/chosen": -2.2889556884765625, + "logits/rejected": -1.9796427488327026, + "logps/chosen": -308.3511962890625, + "logps/rejected": -284.92462158203125, + "loss": 0.2701, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.16298097372055054, + "rewards/margins": 2.3532967567443848, + "rewards/rejected": -2.1903157234191895, + "step": 249 + }, + { + "epoch": 0.05, + "learning_rate": 1.9033613445378152e-05, + "logits/chosen": -1.8164410591125488, + "logits/rejected": -1.902475357055664, + "logps/chosen": -380.65618896484375, + "logps/rejected": -288.0320739746094, + "loss": 0.3634, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5005645155906677, + "rewards/margins": 2.498218059539795, + "rewards/rejected": -2.9987828731536865, + "step": 250 + }, + { + "epoch": 0.05, + "learning_rate": 1.9029411764705883e-05, + "logits/chosen": -2.247981071472168, + "logits/rejected": -1.8208394050598145, + "logps/chosen": -311.01983642578125, + "logps/rejected": -251.33236694335938, + "loss": 0.3057, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.45743709802627563, + "rewards/margins": 2.6454529762268066, + "rewards/rejected": -3.1028902530670166, + "step": 251 + }, + { + "epoch": 0.05, + "learning_rate": 1.9025210084033613e-05, + "logits/chosen": -2.20320463180542, + "logits/rejected": -2.198298454284668, + "logps/chosen": -422.0494384765625, + "logps/rejected": -486.51641845703125, + "loss": 0.2731, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.4743783473968506, + "rewards/margins": 2.0090861320495605, + "rewards/rejected": -2.483464241027832, + "step": 252 + }, + { + "epoch": 0.05, + "learning_rate": 1.9021008403361347e-05, + "logits/chosen": -2.2514636516571045, + "logits/rejected": -1.893636703491211, + "logps/chosen": -314.3566589355469, + "logps/rejected": -224.54420471191406, + "loss": 0.3205, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.48091644048690796, + "rewards/margins": 2.4646921157836914, + "rewards/rejected": -1.9837758541107178, + "step": 253 + }, + { + "epoch": 0.05, + "learning_rate": 1.9016806722689077e-05, + "logits/chosen": -2.4021365642547607, + "logits/rejected": -2.1228184700012207, + "logps/chosen": -343.2098388671875, + "logps/rejected": -316.01507568359375, + "loss": 0.3059, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.37482625246047974, + "rewards/margins": 2.382723569869995, + "rewards/rejected": -2.757550001144409, + "step": 254 + }, + { + "epoch": 0.05, + "learning_rate": 1.9012605042016807e-05, + "logits/chosen": -2.2218337059020996, + "logits/rejected": -2.286167860031128, + "logps/chosen": -253.73788452148438, + "logps/rejected": -309.34893798828125, + "loss": 0.2949, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7315078973770142, + "rewards/margins": 3.1146249771118164, + "rewards/rejected": -3.846132755279541, + "step": 255 + }, + { + "epoch": 0.05, + "learning_rate": 1.9008403361344537e-05, + "logits/chosen": -2.080099582672119, + "logits/rejected": -1.8542243242263794, + "logps/chosen": -402.4940185546875, + "logps/rejected": -320.7508850097656, + "loss": 0.2761, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.09946957230567932, + "rewards/margins": 2.6404869556427, + "rewards/rejected": -2.7399566173553467, + "step": 256 + }, + { + "epoch": 0.05, + "learning_rate": 1.900420168067227e-05, + "logits/chosen": -1.9595483541488647, + "logits/rejected": -1.8734363317489624, + "logps/chosen": -272.85577392578125, + "logps/rejected": -249.11328125, + "loss": 2.2891, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.9585080742835999, + "rewards/margins": 0.6084715723991394, + "rewards/rejected": -1.5669796466827393, + "step": 257 + }, + { + "epoch": 0.05, + "learning_rate": 1.9e-05, + "logits/chosen": -1.998580813407898, + "logits/rejected": -2.021963596343994, + "logps/chosen": -220.8612060546875, + "logps/rejected": -326.28802490234375, + "loss": 0.7624, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8799558877944946, + "rewards/margins": 2.375614881515503, + "rewards/rejected": -3.255570888519287, + "step": 258 + }, + { + "epoch": 0.05, + "learning_rate": 1.899579831932773e-05, + "logits/chosen": -2.2134571075439453, + "logits/rejected": -2.075129270553589, + "logps/chosen": -236.2484130859375, + "logps/rejected": -276.21527099609375, + "loss": 0.1764, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.06507375091314316, + "rewards/margins": 3.29520320892334, + "rewards/rejected": -3.2301292419433594, + "step": 259 + }, + { + "epoch": 0.05, + "learning_rate": 1.899159663865546e-05, + "logits/chosen": -2.4684386253356934, + "logits/rejected": -2.0644631385803223, + "logps/chosen": -332.96466064453125, + "logps/rejected": -267.83953857421875, + "loss": 0.2236, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.35851046442985535, + "rewards/margins": 3.5825247764587402, + "rewards/rejected": -3.2240142822265625, + "step": 260 + }, + { + "epoch": 0.05, + "learning_rate": 1.8987394957983195e-05, + "logits/chosen": -2.151036262512207, + "logits/rejected": -2.0170624256134033, + "logps/chosen": -303.647216796875, + "logps/rejected": -343.37701416015625, + "loss": 0.7025, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.6481125950813293, + "rewards/margins": 1.8401405811309814, + "rewards/rejected": -2.488253116607666, + "step": 261 + }, + { + "epoch": 0.05, + "learning_rate": 1.8983193277310925e-05, + "logits/chosen": -1.9874547719955444, + "logits/rejected": -1.628706693649292, + "logps/chosen": -212.7614288330078, + "logps/rejected": -187.1929473876953, + "loss": 0.553, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8250977993011475, + "rewards/margins": 1.5300283432006836, + "rewards/rejected": -2.355126142501831, + "step": 262 + }, + { + "epoch": 0.06, + "learning_rate": 1.8978991596638656e-05, + "logits/chosen": -1.567716360092163, + "logits/rejected": -1.9890470504760742, + "logps/chosen": -278.9992370605469, + "logps/rejected": -370.65118408203125, + "loss": 0.3707, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.41911080479621887, + "rewards/margins": 1.344163179397583, + "rewards/rejected": -1.763274073600769, + "step": 263 + }, + { + "epoch": 0.06, + "learning_rate": 1.8974789915966386e-05, + "logits/chosen": -2.0037648677825928, + "logits/rejected": -1.9156556129455566, + "logps/chosen": -332.451171875, + "logps/rejected": -446.02532958984375, + "loss": 0.2825, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6619440913200378, + "rewards/margins": 2.5153517723083496, + "rewards/rejected": -3.177295684814453, + "step": 264 + }, + { + "epoch": 0.06, + "learning_rate": 1.897058823529412e-05, + "logits/chosen": -2.224169969558716, + "logits/rejected": -1.6302118301391602, + "logps/chosen": -336.26580810546875, + "logps/rejected": -236.43392944335938, + "loss": 0.4961, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.29110512137413025, + "rewards/margins": 2.6822776794433594, + "rewards/rejected": -2.9733829498291016, + "step": 265 + }, + { + "epoch": 0.06, + "learning_rate": 1.896638655462185e-05, + "logits/chosen": -2.2972166538238525, + "logits/rejected": -1.8663579225540161, + "logps/chosen": -349.3228759765625, + "logps/rejected": -323.1174621582031, + "loss": 0.3396, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.28448155522346497, + "rewards/margins": 3.315586566925049, + "rewards/rejected": -3.6000683307647705, + "step": 266 + }, + { + "epoch": 0.06, + "learning_rate": 1.896218487394958e-05, + "logits/chosen": -2.286379814147949, + "logits/rejected": -1.89598548412323, + "logps/chosen": -326.5625305175781, + "logps/rejected": -281.0647277832031, + "loss": 0.3079, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.38871973752975464, + "rewards/margins": 2.892576217651367, + "rewards/rejected": -3.2812960147857666, + "step": 267 + }, + { + "epoch": 0.06, + "learning_rate": 1.8957983193277313e-05, + "logits/chosen": -2.266174077987671, + "logits/rejected": -2.196779727935791, + "logps/chosen": -225.83517456054688, + "logps/rejected": -273.6953430175781, + "loss": 0.8555, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.102109670639038, + "rewards/margins": 3.273979425430298, + "rewards/rejected": -4.376089096069336, + "step": 268 + }, + { + "epoch": 0.06, + "learning_rate": 1.8953781512605044e-05, + "logits/chosen": -1.931058645248413, + "logits/rejected": -1.9792654514312744, + "logps/chosen": -477.3355712890625, + "logps/rejected": -296.3453674316406, + "loss": 0.2875, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0060585737228393555, + "rewards/margins": 2.8947529792785645, + "rewards/rejected": -2.9008116722106934, + "step": 269 + }, + { + "epoch": 0.06, + "learning_rate": 1.8949579831932774e-05, + "logits/chosen": -2.336129665374756, + "logits/rejected": -2.0650296211242676, + "logps/chosen": -279.13299560546875, + "logps/rejected": -314.7516174316406, + "loss": 0.3795, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.09615585207939148, + "rewards/margins": 3.437487840652466, + "rewards/rejected": -3.5336437225341797, + "step": 270 + }, + { + "epoch": 0.06, + "learning_rate": 1.8945378151260504e-05, + "logits/chosen": -2.0036585330963135, + "logits/rejected": -1.9419636726379395, + "logps/chosen": -419.7447509765625, + "logps/rejected": -381.2106018066406, + "loss": 0.3541, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19862446188926697, + "rewards/margins": 2.218693494796753, + "rewards/rejected": -2.4173178672790527, + "step": 271 + }, + { + "epoch": 0.06, + "learning_rate": 1.8941176470588238e-05, + "logits/chosen": -2.2175207138061523, + "logits/rejected": -1.7529559135437012, + "logps/chosen": -252.12490844726562, + "logps/rejected": -297.6517028808594, + "loss": 0.1474, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5155524015426636, + "rewards/margins": 4.058847904205322, + "rewards/rejected": -4.574400424957275, + "step": 272 + }, + { + "epoch": 0.06, + "learning_rate": 1.8936974789915968e-05, + "logits/chosen": -2.0892577171325684, + "logits/rejected": -1.3442286252975464, + "logps/chosen": -254.39706420898438, + "logps/rejected": -274.16485595703125, + "loss": 0.2011, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5458801984786987, + "rewards/margins": 3.198228597640991, + "rewards/rejected": -3.7441086769104004, + "step": 273 + }, + { + "epoch": 0.06, + "learning_rate": 1.8932773109243698e-05, + "logits/chosen": -2.259138345718384, + "logits/rejected": -2.2775609493255615, + "logps/chosen": -521.3603515625, + "logps/rejected": -478.9537353515625, + "loss": 0.1142, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2233598828315735, + "rewards/margins": 3.9802184104919434, + "rewards/rejected": -4.203578948974609, + "step": 274 + }, + { + "epoch": 0.06, + "learning_rate": 1.892857142857143e-05, + "logits/chosen": -2.0820884704589844, + "logits/rejected": -1.5601340532302856, + "logps/chosen": -327.8568115234375, + "logps/rejected": -256.6611022949219, + "loss": 0.4629, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.4417157769203186, + "rewards/margins": 2.0913429260253906, + "rewards/rejected": -2.5330586433410645, + "step": 275 + }, + { + "epoch": 0.06, + "learning_rate": 1.8924369747899162e-05, + "logits/chosen": -2.283698320388794, + "logits/rejected": -2.0509748458862305, + "logps/chosen": -211.555419921875, + "logps/rejected": -264.49810791015625, + "loss": 0.2272, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3787537217140198, + "rewards/margins": 3.1879568099975586, + "rewards/rejected": -3.5667107105255127, + "step": 276 + }, + { + "epoch": 0.06, + "learning_rate": 1.8920168067226892e-05, + "logits/chosen": -2.255230188369751, + "logits/rejected": -2.3941588401794434, + "logps/chosen": -323.314697265625, + "logps/rejected": -339.1155090332031, + "loss": 0.197, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19374454021453857, + "rewards/margins": 2.264287233352661, + "rewards/rejected": -2.070542812347412, + "step": 277 + }, + { + "epoch": 0.06, + "learning_rate": 1.8915966386554622e-05, + "logits/chosen": -2.122606039047241, + "logits/rejected": -1.7213150262832642, + "logps/chosen": -305.01373291015625, + "logps/rejected": -372.7522277832031, + "loss": 0.4153, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.014068517833948135, + "rewards/margins": 3.018646717071533, + "rewards/rejected": -3.032715320587158, + "step": 278 + }, + { + "epoch": 0.06, + "learning_rate": 1.8911764705882353e-05, + "logits/chosen": -1.9908778667449951, + "logits/rejected": -1.467978596687317, + "logps/chosen": -272.57501220703125, + "logps/rejected": -250.95005798339844, + "loss": 0.4133, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.15837684273719788, + "rewards/margins": 2.1676838397979736, + "rewards/rejected": -2.3260607719421387, + "step": 279 + }, + { + "epoch": 0.06, + "learning_rate": 1.8907563025210086e-05, + "logits/chosen": -2.3047399520874023, + "logits/rejected": -2.1433892250061035, + "logps/chosen": -333.09716796875, + "logps/rejected": -371.75286865234375, + "loss": 0.2282, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.47221025824546814, + "rewards/margins": 3.88632869720459, + "rewards/rejected": -4.358538627624512, + "step": 280 + }, + { + "epoch": 0.06, + "learning_rate": 1.8903361344537816e-05, + "logits/chosen": -2.0384902954101562, + "logits/rejected": -1.8693532943725586, + "logps/chosen": -351.90643310546875, + "logps/rejected": -319.672607421875, + "loss": 0.4755, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5231740474700928, + "rewards/margins": 2.775447368621826, + "rewards/rejected": -3.298621654510498, + "step": 281 + }, + { + "epoch": 0.06, + "learning_rate": 1.8899159663865547e-05, + "logits/chosen": -1.7592918872833252, + "logits/rejected": -1.6648476123809814, + "logps/chosen": -246.908935546875, + "logps/rejected": -258.38177490234375, + "loss": 0.2218, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.705916166305542, + "rewards/margins": 3.2774713039398193, + "rewards/rejected": -3.9833874702453613, + "step": 282 + }, + { + "epoch": 0.06, + "learning_rate": 1.8894957983193277e-05, + "logits/chosen": -2.456876039505005, + "logits/rejected": -1.9175190925598145, + "logps/chosen": -382.9876708984375, + "logps/rejected": -289.6297607421875, + "loss": 0.334, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8396276235580444, + "rewards/margins": 2.283569574356079, + "rewards/rejected": -3.123197317123413, + "step": 283 + }, + { + "epoch": 0.06, + "learning_rate": 1.889075630252101e-05, + "logits/chosen": -1.8148776292800903, + "logits/rejected": -1.9756126403808594, + "logps/chosen": -227.39370727539062, + "logps/rejected": -298.6007995605469, + "loss": 0.2814, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2220604121685028, + "rewards/margins": 3.041797161102295, + "rewards/rejected": -3.263857364654541, + "step": 284 + }, + { + "epoch": 0.06, + "learning_rate": 1.888655462184874e-05, + "logits/chosen": -2.056464195251465, + "logits/rejected": -2.0410242080688477, + "logps/chosen": -232.30111694335938, + "logps/rejected": -307.7833251953125, + "loss": 0.3806, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5008337497711182, + "rewards/margins": 2.573366165161133, + "rewards/rejected": -3.074199676513672, + "step": 285 + }, + { + "epoch": 0.06, + "learning_rate": 1.888235294117647e-05, + "logits/chosen": -2.2546117305755615, + "logits/rejected": -2.1923813819885254, + "logps/chosen": -347.1071472167969, + "logps/rejected": -352.00140380859375, + "loss": 0.3022, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.2465924471616745, + "rewards/margins": 3.0192251205444336, + "rewards/rejected": -3.265817403793335, + "step": 286 + }, + { + "epoch": 0.06, + "learning_rate": 1.88781512605042e-05, + "logits/chosen": -2.086280584335327, + "logits/rejected": -1.8711687326431274, + "logps/chosen": -246.7713623046875, + "logps/rejected": -322.1441650390625, + "loss": 0.3882, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.22017957270145416, + "rewards/margins": 2.388051986694336, + "rewards/rejected": -2.608231544494629, + "step": 287 + }, + { + "epoch": 0.06, + "learning_rate": 1.8873949579831935e-05, + "logits/chosen": -2.0063281059265137, + "logits/rejected": -2.295560598373413, + "logps/chosen": -278.6932067871094, + "logps/rejected": -262.2654724121094, + "loss": 0.2467, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07579703629016876, + "rewards/margins": 3.047999143600464, + "rewards/rejected": -3.1237964630126953, + "step": 288 + }, + { + "epoch": 0.06, + "learning_rate": 1.8869747899159665e-05, + "logits/chosen": -2.074051856994629, + "logits/rejected": -2.1156082153320312, + "logps/chosen": -220.39712524414062, + "logps/rejected": -272.962890625, + "loss": 0.2378, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6701609492301941, + "rewards/margins": 2.572181224822998, + "rewards/rejected": -3.242342472076416, + "step": 289 + }, + { + "epoch": 0.06, + "learning_rate": 1.8865546218487395e-05, + "logits/chosen": -1.7695175409317017, + "logits/rejected": -1.9864470958709717, + "logps/chosen": -281.2634582519531, + "logps/rejected": -314.4126281738281, + "loss": 0.6327, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7106997966766357, + "rewards/margins": 3.1313889026641846, + "rewards/rejected": -3.842088222503662, + "step": 290 + }, + { + "epoch": 0.06, + "learning_rate": 1.886134453781513e-05, + "logits/chosen": -1.754223108291626, + "logits/rejected": -2.0436923503875732, + "logps/chosen": -244.408447265625, + "logps/rejected": -292.4617919921875, + "loss": 0.5628, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.268982172012329, + "rewards/margins": 1.5875968933105469, + "rewards/rejected": -2.856578826904297, + "step": 291 + }, + { + "epoch": 0.06, + "learning_rate": 1.885714285714286e-05, + "logits/chosen": -1.9902204275131226, + "logits/rejected": -1.825770378112793, + "logps/chosen": -278.1590270996094, + "logps/rejected": -252.11109924316406, + "loss": 0.3886, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9952681660652161, + "rewards/margins": 2.203479051589966, + "rewards/rejected": -3.198747158050537, + "step": 292 + }, + { + "epoch": 0.06, + "learning_rate": 1.885294117647059e-05, + "logits/chosen": -1.912260890007019, + "logits/rejected": -2.068472146987915, + "logps/chosen": -253.81625366210938, + "logps/rejected": -272.1974182128906, + "loss": 0.4716, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9447513818740845, + "rewards/margins": 1.7583000659942627, + "rewards/rejected": -2.7030515670776367, + "step": 293 + }, + { + "epoch": 0.06, + "learning_rate": 1.884873949579832e-05, + "logits/chosen": -2.188776969909668, + "logits/rejected": -1.8033409118652344, + "logps/chosen": -220.3309326171875, + "logps/rejected": -269.0039978027344, + "loss": 0.2543, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8321278095245361, + "rewards/margins": 2.7947778701782227, + "rewards/rejected": -3.626905679702759, + "step": 294 + }, + { + "epoch": 0.06, + "learning_rate": 1.8844537815126053e-05, + "logits/chosen": -2.291764736175537, + "logits/rejected": -2.0348153114318848, + "logps/chosen": -320.97943115234375, + "logps/rejected": -295.7033386230469, + "loss": 0.5771, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5811628699302673, + "rewards/margins": 3.624668598175049, + "rewards/rejected": -4.205831527709961, + "step": 295 + }, + { + "epoch": 0.06, + "learning_rate": 1.8840336134453783e-05, + "logits/chosen": -2.5415329933166504, + "logits/rejected": -1.9511866569519043, + "logps/chosen": -417.8042297363281, + "logps/rejected": -382.7701416015625, + "loss": 0.7825, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8851407766342163, + "rewards/margins": 2.9997334480285645, + "rewards/rejected": -3.8848745822906494, + "step": 296 + }, + { + "epoch": 0.06, + "learning_rate": 1.8836134453781514e-05, + "logits/chosen": -2.2607715129852295, + "logits/rejected": -1.6191909313201904, + "logps/chosen": -326.38037109375, + "logps/rejected": -314.59716796875, + "loss": 0.3887, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.36671337485313416, + "rewards/margins": 3.2380211353302, + "rewards/rejected": -3.6047346591949463, + "step": 297 + }, + { + "epoch": 0.06, + "learning_rate": 1.8831932773109244e-05, + "logits/chosen": -2.2181785106658936, + "logits/rejected": -2.300800085067749, + "logps/chosen": -255.30764770507812, + "logps/rejected": -309.82220458984375, + "loss": 0.5814, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5556820631027222, + "rewards/margins": 2.1392624378204346, + "rewards/rejected": -2.694944381713867, + "step": 298 + }, + { + "epoch": 0.06, + "learning_rate": 1.8827731092436977e-05, + "logits/chosen": -2.268632411956787, + "logits/rejected": -1.9622522592544556, + "logps/chosen": -249.33746337890625, + "logps/rejected": -247.32687377929688, + "loss": 0.326, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.3187025785446167, + "rewards/margins": 2.138777017593384, + "rewards/rejected": -2.457479476928711, + "step": 299 + }, + { + "epoch": 0.06, + "learning_rate": 1.8823529411764708e-05, + "logits/chosen": -2.109956741333008, + "logits/rejected": -1.9494073390960693, + "logps/chosen": -323.4368591308594, + "logps/rejected": -397.69049072265625, + "loss": 0.3467, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.34082433581352234, + "rewards/margins": 2.7976865768432617, + "rewards/rejected": -3.1385107040405273, + "step": 300 + }, + { + "epoch": 0.06, + "learning_rate": 1.8819327731092438e-05, + "logits/chosen": -2.0344057083129883, + "logits/rejected": -2.1402249336242676, + "logps/chosen": -217.72195434570312, + "logps/rejected": -290.2239990234375, + "loss": 0.4264, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4302474856376648, + "rewards/margins": 2.074941873550415, + "rewards/rejected": -2.5051894187927246, + "step": 301 + }, + { + "epoch": 0.06, + "learning_rate": 1.8815126050420168e-05, + "logits/chosen": -2.4479141235351562, + "logits/rejected": -1.9408173561096191, + "logps/chosen": -354.33929443359375, + "logps/rejected": -286.9940185546875, + "loss": 0.3129, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2552834153175354, + "rewards/margins": 2.541254758834839, + "rewards/rejected": -2.285971164703369, + "step": 302 + }, + { + "epoch": 0.06, + "learning_rate": 1.88109243697479e-05, + "logits/chosen": -2.211653709411621, + "logits/rejected": -1.9729622602462769, + "logps/chosen": -327.70196533203125, + "logps/rejected": -297.71484375, + "loss": 0.1917, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.12104669958353043, + "rewards/margins": 3.1653082370758057, + "rewards/rejected": -3.2863550186157227, + "step": 303 + }, + { + "epoch": 0.06, + "learning_rate": 1.8806722689075632e-05, + "logits/chosen": -2.3373312950134277, + "logits/rejected": -2.1898093223571777, + "logps/chosen": -392.64312744140625, + "logps/rejected": -428.7371826171875, + "loss": 0.2219, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.31619569659233093, + "rewards/margins": 2.3710060119628906, + "rewards/rejected": -2.687201976776123, + "step": 304 + }, + { + "epoch": 0.06, + "learning_rate": 1.8802521008403362e-05, + "logits/chosen": -2.27375864982605, + "logits/rejected": -2.2790725231170654, + "logps/chosen": -332.09051513671875, + "logps/rejected": -304.3880615234375, + "loss": 0.4099, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.4454871118068695, + "rewards/margins": 2.11026668548584, + "rewards/rejected": -2.555753707885742, + "step": 305 + }, + { + "epoch": 0.06, + "learning_rate": 1.8798319327731092e-05, + "logits/chosen": -1.9241347312927246, + "logits/rejected": -1.7147314548492432, + "logps/chosen": -312.8620300292969, + "logps/rejected": -332.3287353515625, + "loss": 0.3528, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.6963294148445129, + "rewards/margins": 3.4313244819641113, + "rewards/rejected": -4.127654075622559, + "step": 306 + }, + { + "epoch": 0.06, + "learning_rate": 1.8794117647058826e-05, + "logits/chosen": -2.44700288772583, + "logits/rejected": -2.201770782470703, + "logps/chosen": -333.5986328125, + "logps/rejected": -257.613037109375, + "loss": 0.2882, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3046763241291046, + "rewards/margins": 1.9939825534820557, + "rewards/rejected": -1.6893062591552734, + "step": 307 + }, + { + "epoch": 0.06, + "learning_rate": 1.8789915966386556e-05, + "logits/chosen": -1.9482791423797607, + "logits/rejected": -2.108036994934082, + "logps/chosen": -239.434326171875, + "logps/rejected": -284.23931884765625, + "loss": 0.1588, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.07599126547574997, + "rewards/margins": 3.667056083679199, + "rewards/rejected": -3.591064453125, + "step": 308 + }, + { + "epoch": 0.06, + "learning_rate": 1.8785714285714286e-05, + "logits/chosen": -2.2287769317626953, + "logits/rejected": -2.2077245712280273, + "logps/chosen": -269.6983642578125, + "logps/rejected": -325.807861328125, + "loss": 0.4458, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7661252617835999, + "rewards/margins": 2.473928928375244, + "rewards/rejected": -3.2400546073913574, + "step": 309 + }, + { + "epoch": 0.06, + "learning_rate": 1.8781512605042017e-05, + "logits/chosen": -2.2159597873687744, + "logits/rejected": -1.8691554069519043, + "logps/chosen": -339.8736572265625, + "logps/rejected": -294.8172302246094, + "loss": 0.3963, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9596534967422485, + "rewards/margins": 1.9829761981964111, + "rewards/rejected": -2.942629814147949, + "step": 310 + }, + { + "epoch": 0.07, + "learning_rate": 1.877731092436975e-05, + "logits/chosen": -1.7915797233581543, + "logits/rejected": -1.4641615152359009, + "logps/chosen": -329.60443115234375, + "logps/rejected": -242.4039764404297, + "loss": 0.3085, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.020845927298069, + "rewards/margins": 2.6320667266845703, + "rewards/rejected": -2.6529126167297363, + "step": 311 + }, + { + "epoch": 0.07, + "learning_rate": 1.877310924369748e-05, + "logits/chosen": -2.1414425373077393, + "logits/rejected": -2.300830364227295, + "logps/chosen": -269.4566650390625, + "logps/rejected": -341.6777038574219, + "loss": 0.4163, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.6433926820755005, + "rewards/margins": 2.5839271545410156, + "rewards/rejected": -3.2273197174072266, + "step": 312 + }, + { + "epoch": 0.07, + "learning_rate": 1.876890756302521e-05, + "logits/chosen": -1.96380615234375, + "logits/rejected": -1.8046247959136963, + "logps/chosen": -348.15655517578125, + "logps/rejected": -293.32647705078125, + "loss": 0.3273, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0788097381591797, + "rewards/margins": 2.217039108276367, + "rewards/rejected": -3.295849084854126, + "step": 313 + }, + { + "epoch": 0.07, + "learning_rate": 1.8764705882352944e-05, + "logits/chosen": -2.4041481018066406, + "logits/rejected": -1.9352842569351196, + "logps/chosen": -407.56939697265625, + "logps/rejected": -310.6763000488281, + "loss": 0.311, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.20415282249450684, + "rewards/margins": 3.2726519107818604, + "rewards/rejected": -3.0684990882873535, + "step": 314 + }, + { + "epoch": 0.07, + "learning_rate": 1.8760504201680674e-05, + "logits/chosen": -2.2820425033569336, + "logits/rejected": -2.1718294620513916, + "logps/chosen": -331.9612731933594, + "logps/rejected": -334.3269348144531, + "loss": 0.2979, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.43394508957862854, + "rewards/margins": 2.825974225997925, + "rewards/rejected": -3.2599191665649414, + "step": 315 + }, + { + "epoch": 0.07, + "learning_rate": 1.8756302521008405e-05, + "logits/chosen": -2.4595770835876465, + "logits/rejected": -2.195279598236084, + "logps/chosen": -308.5310363769531, + "logps/rejected": -235.79022216796875, + "loss": 0.5167, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.16394472122192383, + "rewards/margins": 1.5328145027160645, + "rewards/rejected": -1.3688697814941406, + "step": 316 + }, + { + "epoch": 0.07, + "learning_rate": 1.8752100840336135e-05, + "logits/chosen": -1.913292646408081, + "logits/rejected": -1.69960618019104, + "logps/chosen": -340.7834777832031, + "logps/rejected": -300.80084228515625, + "loss": 0.3818, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1078362911939621, + "rewards/margins": 3.6973748207092285, + "rewards/rejected": -3.58953857421875, + "step": 317 + }, + { + "epoch": 0.07, + "learning_rate": 1.874789915966387e-05, + "logits/chosen": -2.0673024654388428, + "logits/rejected": -1.9299508333206177, + "logps/chosen": -373.1454772949219, + "logps/rejected": -371.3177185058594, + "loss": 0.2814, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.4277085065841675, + "rewards/margins": 3.9399666786193848, + "rewards/rejected": -4.367674827575684, + "step": 318 + }, + { + "epoch": 0.07, + "learning_rate": 1.87436974789916e-05, + "logits/chosen": -2.0578806400299072, + "logits/rejected": -2.324023485183716, + "logps/chosen": -254.89859008789062, + "logps/rejected": -359.4049072265625, + "loss": 0.1694, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6514156460762024, + "rewards/margins": 3.6118431091308594, + "rewards/rejected": -2.9604272842407227, + "step": 319 + }, + { + "epoch": 0.07, + "learning_rate": 1.873949579831933e-05, + "logits/chosen": -1.7575109004974365, + "logits/rejected": -1.914217472076416, + "logps/chosen": -233.79579162597656, + "logps/rejected": -274.8104553222656, + "loss": 0.3963, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.5326837301254272, + "rewards/margins": 2.6236653327941895, + "rewards/rejected": -3.1563491821289062, + "step": 320 + }, + { + "epoch": 0.07, + "learning_rate": 1.873529411764706e-05, + "logits/chosen": -1.8810131549835205, + "logits/rejected": -2.008780002593994, + "logps/chosen": -419.2375183105469, + "logps/rejected": -430.22003173828125, + "loss": 0.2468, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6155575513839722, + "rewards/margins": 4.04728889465332, + "rewards/rejected": -4.662846565246582, + "step": 321 + }, + { + "epoch": 0.07, + "learning_rate": 1.8731092436974793e-05, + "logits/chosen": -1.6235010623931885, + "logits/rejected": -1.586295247077942, + "logps/chosen": -266.53515625, + "logps/rejected": -328.22259521484375, + "loss": 0.3211, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9877183437347412, + "rewards/margins": 3.058164358139038, + "rewards/rejected": -4.045882225036621, + "step": 322 + }, + { + "epoch": 0.07, + "learning_rate": 1.8726890756302523e-05, + "logits/chosen": -2.0542685985565186, + "logits/rejected": -2.024887800216675, + "logps/chosen": -233.3220977783203, + "logps/rejected": -302.41998291015625, + "loss": 0.3254, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3559727370738983, + "rewards/margins": 2.621556043624878, + "rewards/rejected": -2.9775285720825195, + "step": 323 + }, + { + "epoch": 0.07, + "learning_rate": 1.8722689075630253e-05, + "logits/chosen": -2.036128520965576, + "logits/rejected": -1.8352479934692383, + "logps/chosen": -313.8388977050781, + "logps/rejected": -397.1602783203125, + "loss": 0.3042, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.001976490020751953, + "rewards/margins": 2.834493637084961, + "rewards/rejected": -2.836470127105713, + "step": 324 + }, + { + "epoch": 0.07, + "learning_rate": 1.8718487394957983e-05, + "logits/chosen": -2.0788910388946533, + "logits/rejected": -1.8667393922805786, + "logps/chosen": -268.1964111328125, + "logps/rejected": -296.2843017578125, + "loss": 0.6825, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8272196650505066, + "rewards/margins": 1.1636391878128052, + "rewards/rejected": -1.9908589124679565, + "step": 325 + }, + { + "epoch": 0.07, + "learning_rate": 1.8714285714285717e-05, + "logits/chosen": -2.3844246864318848, + "logits/rejected": -1.6167030334472656, + "logps/chosen": -407.9466552734375, + "logps/rejected": -231.8106231689453, + "loss": 0.3028, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.042908817529678345, + "rewards/margins": 3.6082944869995117, + "rewards/rejected": -3.565385341644287, + "step": 326 + }, + { + "epoch": 0.07, + "learning_rate": 1.8710084033613447e-05, + "logits/chosen": -2.2293574810028076, + "logits/rejected": -1.669480800628662, + "logps/chosen": -249.737060546875, + "logps/rejected": -264.0361328125, + "loss": 0.2597, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.14727619290351868, + "rewards/margins": 2.817415952682495, + "rewards/rejected": -2.9646918773651123, + "step": 327 + }, + { + "epoch": 0.07, + "learning_rate": 1.8705882352941178e-05, + "logits/chosen": -1.481929063796997, + "logits/rejected": -1.774039626121521, + "logps/chosen": -241.51197814941406, + "logps/rejected": -283.94915771484375, + "loss": 0.5201, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1028923988342285, + "rewards/margins": 2.2659027576446533, + "rewards/rejected": -3.368795156478882, + "step": 328 + }, + { + "epoch": 0.07, + "learning_rate": 1.8701680672268908e-05, + "logits/chosen": -2.4668219089508057, + "logits/rejected": -2.148750066757202, + "logps/chosen": -373.35162353515625, + "logps/rejected": -303.9354248046875, + "loss": 0.2006, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.15432801842689514, + "rewards/margins": 3.2531533241271973, + "rewards/rejected": -3.098825454711914, + "step": 329 + }, + { + "epoch": 0.07, + "learning_rate": 1.869747899159664e-05, + "logits/chosen": -1.9888383150100708, + "logits/rejected": -1.939383625984192, + "logps/chosen": -262.57318115234375, + "logps/rejected": -344.1221923828125, + "loss": 0.5699, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4157131314277649, + "rewards/margins": 2.8421761989593506, + "rewards/rejected": -3.2578892707824707, + "step": 330 + }, + { + "epoch": 0.07, + "learning_rate": 1.869327731092437e-05, + "logits/chosen": -1.9494271278381348, + "logits/rejected": -2.0966906547546387, + "logps/chosen": -312.8277587890625, + "logps/rejected": -350.18145751953125, + "loss": 0.7105, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9800263047218323, + "rewards/margins": 1.9737093448638916, + "rewards/rejected": -2.953735828399658, + "step": 331 + }, + { + "epoch": 0.07, + "learning_rate": 1.8689075630252102e-05, + "logits/chosen": -2.0298752784729004, + "logits/rejected": -1.875227928161621, + "logps/chosen": -216.7428741455078, + "logps/rejected": -203.33787536621094, + "loss": 0.3845, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5260271430015564, + "rewards/margins": 1.8222005367279053, + "rewards/rejected": -2.3482275009155273, + "step": 332 + }, + { + "epoch": 0.07, + "learning_rate": 1.8684873949579832e-05, + "logits/chosen": -2.2422947883605957, + "logits/rejected": -1.9197068214416504, + "logps/chosen": -341.98828125, + "logps/rejected": -315.65814208984375, + "loss": 0.2355, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.4491279423236847, + "rewards/margins": 2.794490098953247, + "rewards/rejected": -3.2436182498931885, + "step": 333 + }, + { + "epoch": 0.07, + "learning_rate": 1.8680672268907566e-05, + "logits/chosen": -2.3078951835632324, + "logits/rejected": -1.8790884017944336, + "logps/chosen": -396.8863830566406, + "logps/rejected": -309.27490234375, + "loss": 0.5343, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.007239677011966705, + "rewards/margins": 2.369271993637085, + "rewards/rejected": -2.376511573791504, + "step": 334 + }, + { + "epoch": 0.07, + "learning_rate": 1.8676470588235296e-05, + "logits/chosen": -2.066531181335449, + "logits/rejected": -1.5957533121109009, + "logps/chosen": -365.20709228515625, + "logps/rejected": -291.71044921875, + "loss": 0.3208, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.17337965965271, + "rewards/margins": 3.090036630630493, + "rewards/rejected": -4.263416290283203, + "step": 335 + }, + { + "epoch": 0.07, + "learning_rate": 1.8672268907563026e-05, + "logits/chosen": -2.06984281539917, + "logits/rejected": -2.126033306121826, + "logps/chosen": -347.90960693359375, + "logps/rejected": -309.54217529296875, + "loss": 0.5627, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6166422963142395, + "rewards/margins": 1.3381564617156982, + "rewards/rejected": -1.9547988176345825, + "step": 336 + }, + { + "epoch": 0.07, + "learning_rate": 1.866806722689076e-05, + "logits/chosen": -1.8469455242156982, + "logits/rejected": -1.8469024896621704, + "logps/chosen": -352.01727294921875, + "logps/rejected": -376.5155029296875, + "loss": 0.6246, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.32721078395843506, + "rewards/margins": 1.8597333431243896, + "rewards/rejected": -2.186944007873535, + "step": 337 + }, + { + "epoch": 0.07, + "learning_rate": 1.866386554621849e-05, + "logits/chosen": -2.183018445968628, + "logits/rejected": -1.9800773859024048, + "logps/chosen": -341.5853576660156, + "logps/rejected": -325.4683532714844, + "loss": 0.3829, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0828176736831665, + "rewards/margins": 1.801600456237793, + "rewards/rejected": -2.884418249130249, + "step": 338 + }, + { + "epoch": 0.07, + "learning_rate": 1.865966386554622e-05, + "logits/chosen": -2.216343879699707, + "logits/rejected": -1.7860909700393677, + "logps/chosen": -324.175537109375, + "logps/rejected": -295.8573913574219, + "loss": 0.4519, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.10657677054405212, + "rewards/margins": 1.616962194442749, + "rewards/rejected": -1.723538875579834, + "step": 339 + }, + { + "epoch": 0.07, + "learning_rate": 1.865546218487395e-05, + "logits/chosen": -2.2852091789245605, + "logits/rejected": -2.203788995742798, + "logps/chosen": -263.1763610839844, + "logps/rejected": -265.32757568359375, + "loss": 0.2124, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.000583946704864502, + "rewards/margins": 3.75418758392334, + "rewards/rejected": -3.753603458404541, + "step": 340 + }, + { + "epoch": 0.07, + "learning_rate": 1.8651260504201684e-05, + "logits/chosen": -2.134643316268921, + "logits/rejected": -1.8497560024261475, + "logps/chosen": -256.9386291503906, + "logps/rejected": -291.808349609375, + "loss": 0.7313, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5955746173858643, + "rewards/margins": 2.1250791549682617, + "rewards/rejected": -2.720653772354126, + "step": 341 + }, + { + "epoch": 0.07, + "learning_rate": 1.8647058823529414e-05, + "logits/chosen": -2.129666805267334, + "logits/rejected": -2.147286891937256, + "logps/chosen": -297.31158447265625, + "logps/rejected": -316.26409912109375, + "loss": 0.4878, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.024975964799523354, + "rewards/margins": 2.073550224304199, + "rewards/rejected": -2.0985262393951416, + "step": 342 + }, + { + "epoch": 0.07, + "learning_rate": 1.8642857142857144e-05, + "logits/chosen": -2.2581629753112793, + "logits/rejected": -2.1744561195373535, + "logps/chosen": -371.3339538574219, + "logps/rejected": -305.3000183105469, + "loss": 0.406, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.0708468109369278, + "rewards/margins": 2.916726589202881, + "rewards/rejected": -2.9875733852386475, + "step": 343 + }, + { + "epoch": 0.07, + "learning_rate": 1.8638655462184875e-05, + "logits/chosen": -1.987851619720459, + "logits/rejected": -1.8836939334869385, + "logps/chosen": -264.5604248046875, + "logps/rejected": -257.2608337402344, + "loss": 0.2367, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3610621690750122, + "rewards/margins": 2.3745124340057373, + "rewards/rejected": -2.735574722290039, + "step": 344 + }, + { + "epoch": 0.07, + "learning_rate": 1.8634453781512608e-05, + "logits/chosen": -1.753413438796997, + "logits/rejected": -1.6945829391479492, + "logps/chosen": -200.69012451171875, + "logps/rejected": -283.2491149902344, + "loss": 0.5487, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.46275192499160767, + "rewards/margins": 1.767363429069519, + "rewards/rejected": -2.2301154136657715, + "step": 345 + }, + { + "epoch": 0.07, + "learning_rate": 1.863025210084034e-05, + "logits/chosen": -2.334855079650879, + "logits/rejected": -2.088732957839966, + "logps/chosen": -397.83343505859375, + "logps/rejected": -338.78179931640625, + "loss": 0.1275, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19868981838226318, + "rewards/margins": 3.2203967571258545, + "rewards/rejected": -3.0217068195343018, + "step": 346 + }, + { + "epoch": 0.07, + "learning_rate": 1.862605042016807e-05, + "logits/chosen": -2.3031864166259766, + "logits/rejected": -1.9945582151412964, + "logps/chosen": -385.1195068359375, + "logps/rejected": -343.19866943359375, + "loss": 0.5877, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5145111680030823, + "rewards/margins": 2.5496606826782227, + "rewards/rejected": -3.0641720294952393, + "step": 347 + }, + { + "epoch": 0.07, + "learning_rate": 1.86218487394958e-05, + "logits/chosen": -2.096303701400757, + "logits/rejected": -2.223142623901367, + "logps/chosen": -273.1221008300781, + "logps/rejected": -258.70733642578125, + "loss": 0.4627, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5140308737754822, + "rewards/margins": 1.6974841356277466, + "rewards/rejected": -2.211515188217163, + "step": 348 + }, + { + "epoch": 0.07, + "learning_rate": 1.8617647058823533e-05, + "logits/chosen": -1.9672343730926514, + "logits/rejected": -2.240957498550415, + "logps/chosen": -270.3798522949219, + "logps/rejected": -333.7206726074219, + "loss": 0.3858, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.17106646299362183, + "rewards/margins": 2.6529464721679688, + "rewards/rejected": -2.8240127563476562, + "step": 349 + }, + { + "epoch": 0.07, + "learning_rate": 1.8613445378151263e-05, + "logits/chosen": -2.130988359451294, + "logits/rejected": -1.7660956382751465, + "logps/chosen": -269.3900451660156, + "logps/rejected": -189.3504638671875, + "loss": 0.4339, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.2162952721118927, + "rewards/margins": 1.6339480876922607, + "rewards/rejected": -1.850243330001831, + "step": 350 + }, + { + "epoch": 0.07, + "learning_rate": 1.8609243697478993e-05, + "logits/chosen": -2.2752649784088135, + "logits/rejected": -1.899603247642517, + "logps/chosen": -398.2548522949219, + "logps/rejected": -360.26324462890625, + "loss": 0.229, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7394152283668518, + "rewards/margins": 2.6867666244506836, + "rewards/rejected": -3.4261817932128906, + "step": 351 + }, + { + "epoch": 0.07, + "learning_rate": 1.8605042016806723e-05, + "logits/chosen": -2.5080926418304443, + "logits/rejected": -2.2032487392425537, + "logps/chosen": -452.4739685058594, + "logps/rejected": -370.524658203125, + "loss": 0.4107, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.21627482771873474, + "rewards/margins": 2.1269168853759766, + "rewards/rejected": -2.343191623687744, + "step": 352 + }, + { + "epoch": 0.07, + "learning_rate": 1.8600840336134457e-05, + "logits/chosen": -2.0756988525390625, + "logits/rejected": -1.8713806867599487, + "logps/chosen": -221.3297882080078, + "logps/rejected": -252.8258056640625, + "loss": 0.6293, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1484546661376953, + "rewards/margins": 1.8259799480438232, + "rewards/rejected": -2.9744346141815186, + "step": 353 + }, + { + "epoch": 0.07, + "learning_rate": 1.8596638655462187e-05, + "logits/chosen": -2.014289140701294, + "logits/rejected": -1.9472618103027344, + "logps/chosen": -299.89471435546875, + "logps/rejected": -335.1524658203125, + "loss": 0.494, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.978775143623352, + "rewards/margins": 1.995316743850708, + "rewards/rejected": -2.9740920066833496, + "step": 354 + }, + { + "epoch": 0.07, + "learning_rate": 1.8592436974789917e-05, + "logits/chosen": -1.694716453552246, + "logits/rejected": -1.962080478668213, + "logps/chosen": -222.0389404296875, + "logps/rejected": -271.0969543457031, + "loss": 0.4964, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.03180364519357681, + "rewards/margins": 2.024240016937256, + "rewards/rejected": -2.0560436248779297, + "step": 355 + }, + { + "epoch": 0.07, + "learning_rate": 1.8588235294117647e-05, + "logits/chosen": -2.2815263271331787, + "logits/rejected": -1.7262376546859741, + "logps/chosen": -459.6976623535156, + "logps/rejected": -336.3739013671875, + "loss": 0.251, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8403612375259399, + "rewards/margins": 2.8347840309143066, + "rewards/rejected": -3.675145149230957, + "step": 356 + }, + { + "epoch": 0.07, + "learning_rate": 1.858403361344538e-05, + "logits/chosen": -1.8447967767715454, + "logits/rejected": -1.6927156448364258, + "logps/chosen": -329.1031494140625, + "logps/rejected": -462.51922607421875, + "loss": 0.3266, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.17482687532901764, + "rewards/margins": 4.396930694580078, + "rewards/rejected": -4.5717573165893555, + "step": 357 + }, + { + "epoch": 0.07, + "learning_rate": 1.857983193277311e-05, + "logits/chosen": -2.336015224456787, + "logits/rejected": -2.1152777671813965, + "logps/chosen": -381.9425354003906, + "logps/rejected": -426.5019836425781, + "loss": 0.2882, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.517792820930481, + "rewards/margins": 1.9772413969039917, + "rewards/rejected": -2.4950342178344727, + "step": 358 + }, + { + "epoch": 0.08, + "learning_rate": 1.857563025210084e-05, + "logits/chosen": -2.1697511672973633, + "logits/rejected": -1.469695806503296, + "logps/chosen": -297.51885986328125, + "logps/rejected": -271.3316345214844, + "loss": 0.4923, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.37828758358955383, + "rewards/margins": 2.0601744651794434, + "rewards/rejected": -2.438462018966675, + "step": 359 + }, + { + "epoch": 0.08, + "learning_rate": 1.8571428571428575e-05, + "logits/chosen": -2.045991897583008, + "logits/rejected": -1.8669451475143433, + "logps/chosen": -364.60028076171875, + "logps/rejected": -256.4580993652344, + "loss": 0.2367, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.22408923506736755, + "rewards/margins": 2.64922833442688, + "rewards/rejected": -2.8733177185058594, + "step": 360 + }, + { + "epoch": 0.08, + "learning_rate": 1.8567226890756305e-05, + "logits/chosen": -2.367703914642334, + "logits/rejected": -1.7133262157440186, + "logps/chosen": -325.2849426269531, + "logps/rejected": -233.791259765625, + "loss": 0.4509, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3806113600730896, + "rewards/margins": 1.185697078704834, + "rewards/rejected": -1.5663084983825684, + "step": 361 + }, + { + "epoch": 0.08, + "learning_rate": 1.8563025210084036e-05, + "logits/chosen": -1.9064981937408447, + "logits/rejected": -1.80112886428833, + "logps/chosen": -266.93218994140625, + "logps/rejected": -295.7356872558594, + "loss": 0.3796, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.09755747765302658, + "rewards/margins": 2.2983222007751465, + "rewards/rejected": -2.2007646560668945, + "step": 362 + }, + { + "epoch": 0.08, + "learning_rate": 1.8558823529411766e-05, + "logits/chosen": -2.099736213684082, + "logits/rejected": -1.7851954698562622, + "logps/chosen": -284.7494812011719, + "logps/rejected": -312.81378173828125, + "loss": 0.2357, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5242412686347961, + "rewards/margins": 2.6215972900390625, + "rewards/rejected": -3.1458382606506348, + "step": 363 + }, + { + "epoch": 0.08, + "learning_rate": 1.85546218487395e-05, + "logits/chosen": -2.456117630004883, + "logits/rejected": -2.2640092372894287, + "logps/chosen": -333.8551025390625, + "logps/rejected": -326.3819580078125, + "loss": 0.9617, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.24694955348968506, + "rewards/margins": 1.4985586404800415, + "rewards/rejected": -1.7455081939697266, + "step": 364 + }, + { + "epoch": 0.08, + "learning_rate": 1.855042016806723e-05, + "logits/chosen": -1.9883825778961182, + "logits/rejected": -1.817191481590271, + "logps/chosen": -229.99081420898438, + "logps/rejected": -208.41629028320312, + "loss": 0.2795, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0852956771850586, + "rewards/margins": 2.571378231048584, + "rewards/rejected": -2.6566736698150635, + "step": 365 + }, + { + "epoch": 0.08, + "learning_rate": 1.854621848739496e-05, + "logits/chosen": -1.9803918600082397, + "logits/rejected": -1.8723807334899902, + "logps/chosen": -386.2872314453125, + "logps/rejected": -357.6424255371094, + "loss": 0.3321, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3730486333370209, + "rewards/margins": 2.036879301071167, + "rewards/rejected": -2.4099278450012207, + "step": 366 + }, + { + "epoch": 0.08, + "learning_rate": 1.854201680672269e-05, + "logits/chosen": -1.9376304149627686, + "logits/rejected": -1.6873728036880493, + "logps/chosen": -266.780029296875, + "logps/rejected": -243.25033569335938, + "loss": 0.1883, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.0850076824426651, + "rewards/margins": 2.605006694793701, + "rewards/rejected": -2.5199992656707764, + "step": 367 + }, + { + "epoch": 0.08, + "learning_rate": 1.8537815126050424e-05, + "logits/chosen": -2.0228683948516846, + "logits/rejected": -2.041003942489624, + "logps/chosen": -204.14617919921875, + "logps/rejected": -217.356689453125, + "loss": 0.4402, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.6097668409347534, + "rewards/margins": 1.8481462001800537, + "rewards/rejected": -2.4579131603240967, + "step": 368 + }, + { + "epoch": 0.08, + "learning_rate": 1.8533613445378154e-05, + "logits/chosen": -2.023157835006714, + "logits/rejected": -1.913124442100525, + "logps/chosen": -273.25335693359375, + "logps/rejected": -218.37193298339844, + "loss": 0.2036, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.01626184582710266, + "rewards/margins": 3.1192469596862793, + "rewards/rejected": -3.1355087757110596, + "step": 369 + }, + { + "epoch": 0.08, + "learning_rate": 1.8529411764705884e-05, + "logits/chosen": -2.0578420162200928, + "logits/rejected": -2.2603094577789307, + "logps/chosen": -323.858642578125, + "logps/rejected": -400.9482421875, + "loss": 0.1523, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.2392684519290924, + "rewards/margins": 3.3774380683898926, + "rewards/rejected": -3.616706371307373, + "step": 370 + }, + { + "epoch": 0.08, + "learning_rate": 1.8525210084033614e-05, + "logits/chosen": -1.761908769607544, + "logits/rejected": -1.8511838912963867, + "logps/chosen": -310.01739501953125, + "logps/rejected": -336.5382080078125, + "loss": 0.2275, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.773158073425293, + "rewards/margins": 2.381699562072754, + "rewards/rejected": -3.154857635498047, + "step": 371 + }, + { + "epoch": 0.08, + "learning_rate": 1.8521008403361348e-05, + "logits/chosen": -2.2980709075927734, + "logits/rejected": -2.1128406524658203, + "logps/chosen": -313.9911804199219, + "logps/rejected": -297.58551025390625, + "loss": 0.3405, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.003889426589012146, + "rewards/margins": 2.741454839706421, + "rewards/rejected": -2.737565517425537, + "step": 372 + }, + { + "epoch": 0.08, + "learning_rate": 1.8516806722689078e-05, + "logits/chosen": -1.8783695697784424, + "logits/rejected": -1.7485692501068115, + "logps/chosen": -190.28128051757812, + "logps/rejected": -228.3871307373047, + "loss": 0.4097, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7916634678840637, + "rewards/margins": 1.9011632204055786, + "rewards/rejected": -2.692826747894287, + "step": 373 + }, + { + "epoch": 0.08, + "learning_rate": 1.851260504201681e-05, + "logits/chosen": -1.8537722826004028, + "logits/rejected": -1.660403847694397, + "logps/chosen": -255.82949829101562, + "logps/rejected": -304.01934814453125, + "loss": 0.5173, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1927258968353271, + "rewards/margins": 2.1019415855407715, + "rewards/rejected": -3.2946677207946777, + "step": 374 + }, + { + "epoch": 0.08, + "learning_rate": 1.850840336134454e-05, + "logits/chosen": -2.033233642578125, + "logits/rejected": -1.7248446941375732, + "logps/chosen": -342.703857421875, + "logps/rejected": -289.2439880371094, + "loss": 0.3215, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.33285123109817505, + "rewards/margins": 2.8797762393951416, + "rewards/rejected": -3.212627410888672, + "step": 375 + }, + { + "epoch": 0.08, + "learning_rate": 1.8504201680672272e-05, + "logits/chosen": -2.2549326419830322, + "logits/rejected": -1.728357195854187, + "logps/chosen": -347.8545837402344, + "logps/rejected": -270.393798828125, + "loss": 0.0825, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4776824712753296, + "rewards/margins": 4.028282165527344, + "rewards/rejected": -4.505964756011963, + "step": 376 + }, + { + "epoch": 0.08, + "learning_rate": 1.8500000000000002e-05, + "logits/chosen": -1.7460956573486328, + "logits/rejected": -1.9304550886154175, + "logps/chosen": -372.11505126953125, + "logps/rejected": -441.93463134765625, + "loss": 0.3237, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5257360935211182, + "rewards/margins": 3.7753725051879883, + "rewards/rejected": -4.301108360290527, + "step": 377 + }, + { + "epoch": 0.08, + "learning_rate": 1.8495798319327733e-05, + "logits/chosen": -2.3899779319763184, + "logits/rejected": -1.9155993461608887, + "logps/chosen": -304.52337646484375, + "logps/rejected": -241.9775848388672, + "loss": 0.4843, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.13882781565189362, + "rewards/margins": 2.850095748901367, + "rewards/rejected": -2.711268186569214, + "step": 378 + }, + { + "epoch": 0.08, + "learning_rate": 1.8491596638655466e-05, + "logits/chosen": -1.9093596935272217, + "logits/rejected": -1.9763383865356445, + "logps/chosen": -250.72787475585938, + "logps/rejected": -354.8570556640625, + "loss": 0.5871, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9904338121414185, + "rewards/margins": 1.7757227420806885, + "rewards/rejected": -2.7661566734313965, + "step": 379 + }, + { + "epoch": 0.08, + "learning_rate": 1.8487394957983196e-05, + "logits/chosen": -2.28853702545166, + "logits/rejected": -2.0587716102600098, + "logps/chosen": -429.751220703125, + "logps/rejected": -351.5947570800781, + "loss": 0.5801, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.238794207572937, + "rewards/margins": 2.1691994667053223, + "rewards/rejected": -3.407993793487549, + "step": 380 + }, + { + "epoch": 0.08, + "learning_rate": 1.8483193277310927e-05, + "logits/chosen": -2.10351824760437, + "logits/rejected": -1.8710891008377075, + "logps/chosen": -342.17236328125, + "logps/rejected": -328.80499267578125, + "loss": 0.2729, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3526344895362854, + "rewards/margins": 2.7666478157043457, + "rewards/rejected": -3.1192824840545654, + "step": 381 + }, + { + "epoch": 0.08, + "learning_rate": 1.8478991596638657e-05, + "logits/chosen": -2.0367956161499023, + "logits/rejected": -2.171997547149658, + "logps/chosen": -391.1765441894531, + "logps/rejected": -394.83306884765625, + "loss": 0.3867, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.61182701587677, + "rewards/margins": 2.4131217002868652, + "rewards/rejected": -3.024948835372925, + "step": 382 + }, + { + "epoch": 0.08, + "learning_rate": 1.847478991596639e-05, + "logits/chosen": -2.070009469985962, + "logits/rejected": -2.1090354919433594, + "logps/chosen": -337.0970153808594, + "logps/rejected": -356.8552551269531, + "loss": 0.1785, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.47881823778152466, + "rewards/margins": 3.552011013031006, + "rewards/rejected": -4.030829429626465, + "step": 383 + }, + { + "epoch": 0.08, + "learning_rate": 1.847058823529412e-05, + "logits/chosen": -2.2249088287353516, + "logits/rejected": -2.104274034500122, + "logps/chosen": -302.1180114746094, + "logps/rejected": -384.06268310546875, + "loss": 0.3293, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7090399861335754, + "rewards/margins": 2.8228683471679688, + "rewards/rejected": -3.5319085121154785, + "step": 384 + }, + { + "epoch": 0.08, + "learning_rate": 1.846638655462185e-05, + "logits/chosen": -2.3999202251434326, + "logits/rejected": -2.082932710647583, + "logps/chosen": -319.16180419921875, + "logps/rejected": -283.59246826171875, + "loss": 0.254, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5637760162353516, + "rewards/margins": 2.9321329593658447, + "rewards/rejected": -3.4959089756011963, + "step": 385 + }, + { + "epoch": 0.08, + "learning_rate": 1.846218487394958e-05, + "logits/chosen": -2.296931028366089, + "logits/rejected": -1.7105531692504883, + "logps/chosen": -335.31927490234375, + "logps/rejected": -289.3534240722656, + "loss": 0.5521, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9079472422599792, + "rewards/margins": 1.7789301872253418, + "rewards/rejected": -2.6868772506713867, + "step": 386 + }, + { + "epoch": 0.08, + "learning_rate": 1.8457983193277315e-05, + "logits/chosen": -1.9932727813720703, + "logits/rejected": -2.1460158824920654, + "logps/chosen": -510.9311218261719, + "logps/rejected": -411.8099060058594, + "loss": 0.2199, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21243184804916382, + "rewards/margins": 3.4859776496887207, + "rewards/rejected": -3.69840931892395, + "step": 387 + }, + { + "epoch": 0.08, + "learning_rate": 1.8453781512605045e-05, + "logits/chosen": -2.1076436042785645, + "logits/rejected": -1.8722412586212158, + "logps/chosen": -278.708251953125, + "logps/rejected": -431.20684814453125, + "loss": 0.528, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.4809589982032776, + "rewards/margins": 2.700566291809082, + "rewards/rejected": -3.181525230407715, + "step": 388 + }, + { + "epoch": 0.08, + "learning_rate": 1.8449579831932775e-05, + "logits/chosen": -2.1607489585876465, + "logits/rejected": -1.726609468460083, + "logps/chosen": -378.8289794921875, + "logps/rejected": -346.7107238769531, + "loss": 0.1821, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8186622858047485, + "rewards/margins": 2.972604274749756, + "rewards/rejected": -3.791266441345215, + "step": 389 + }, + { + "epoch": 0.08, + "learning_rate": 1.8445378151260505e-05, + "logits/chosen": -2.319326162338257, + "logits/rejected": -1.7188987731933594, + "logps/chosen": -342.0713806152344, + "logps/rejected": -288.20916748046875, + "loss": 0.4432, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0290842056274414, + "rewards/margins": 2.874004364013672, + "rewards/rejected": -3.9030888080596924, + "step": 390 + }, + { + "epoch": 0.08, + "learning_rate": 1.844117647058824e-05, + "logits/chosen": -2.2356772422790527, + "logits/rejected": -2.220611333847046, + "logps/chosen": -351.1100769042969, + "logps/rejected": -338.7408447265625, + "loss": 0.5559, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.503207266330719, + "rewards/margins": 1.8040645122528076, + "rewards/rejected": -2.307271718978882, + "step": 391 + }, + { + "epoch": 0.08, + "learning_rate": 1.8436974789915966e-05, + "logits/chosen": -2.1344399452209473, + "logits/rejected": -2.0783677101135254, + "logps/chosen": -261.3807678222656, + "logps/rejected": -330.52557373046875, + "loss": 0.2463, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0167301893234253, + "rewards/margins": 2.9774012565612793, + "rewards/rejected": -3.994131565093994, + "step": 392 + }, + { + "epoch": 0.08, + "learning_rate": 1.84327731092437e-05, + "logits/chosen": -1.9403150081634521, + "logits/rejected": -1.5085800886154175, + "logps/chosen": -249.44728088378906, + "logps/rejected": -210.45999145507812, + "loss": 0.8026, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2809702157974243, + "rewards/margins": 1.0851925611495972, + "rewards/rejected": -2.3661627769470215, + "step": 393 + }, + { + "epoch": 0.08, + "learning_rate": 1.842857142857143e-05, + "logits/chosen": -1.9822087287902832, + "logits/rejected": -2.161910057067871, + "logps/chosen": -254.12217712402344, + "logps/rejected": -345.6323547363281, + "loss": 0.1703, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8024090528488159, + "rewards/margins": 2.9197912216186523, + "rewards/rejected": -3.722200393676758, + "step": 394 + }, + { + "epoch": 0.08, + "learning_rate": 1.842436974789916e-05, + "logits/chosen": -1.6290154457092285, + "logits/rejected": -1.6783397197723389, + "logps/chosen": -340.3009948730469, + "logps/rejected": -310.3421936035156, + "loss": 0.1576, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.35950857400894165, + "rewards/margins": 3.5517892837524414, + "rewards/rejected": -3.9112980365753174, + "step": 395 + }, + { + "epoch": 0.08, + "learning_rate": 1.842016806722689e-05, + "logits/chosen": -2.0145533084869385, + "logits/rejected": -1.870767593383789, + "logps/chosen": -257.0691223144531, + "logps/rejected": -346.18621826171875, + "loss": 0.147, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5434238314628601, + "rewards/margins": 4.360313415527344, + "rewards/rejected": -4.903737545013428, + "step": 396 + }, + { + "epoch": 0.08, + "learning_rate": 1.8415966386554624e-05, + "logits/chosen": -2.049251079559326, + "logits/rejected": -1.7490243911743164, + "logps/chosen": -301.72601318359375, + "logps/rejected": -335.76373291015625, + "loss": 0.48, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.6389510035514832, + "rewards/margins": 2.212846279144287, + "rewards/rejected": -2.851797103881836, + "step": 397 + }, + { + "epoch": 0.08, + "learning_rate": 1.8411764705882354e-05, + "logits/chosen": -2.0430452823638916, + "logits/rejected": -2.023911952972412, + "logps/chosen": -321.87933349609375, + "logps/rejected": -410.43609619140625, + "loss": 0.5061, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4173829555511475, + "rewards/margins": 2.592723846435547, + "rewards/rejected": -4.010106563568115, + "step": 398 + }, + { + "epoch": 0.08, + "learning_rate": 1.8407563025210084e-05, + "logits/chosen": -1.9393033981323242, + "logits/rejected": -2.15283203125, + "logps/chosen": -218.0631866455078, + "logps/rejected": -263.5722351074219, + "loss": 0.306, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2121131420135498, + "rewards/margins": 2.416569948196411, + "rewards/rejected": -3.628683090209961, + "step": 399 + }, + { + "epoch": 0.08, + "learning_rate": 1.8403361344537814e-05, + "logits/chosen": -2.161271095275879, + "logits/rejected": -2.035555124282837, + "logps/chosen": -225.12344360351562, + "logps/rejected": -254.11572265625, + "loss": 0.212, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8717067241668701, + "rewards/margins": 3.1039505004882812, + "rewards/rejected": -3.9756572246551514, + "step": 400 + }, + { + "epoch": 0.08, + "learning_rate": 1.8399159663865548e-05, + "logits/chosen": -2.0231268405914307, + "logits/rejected": -1.8708224296569824, + "logps/chosen": -254.9494171142578, + "logps/rejected": -279.9722900390625, + "loss": 0.4026, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8832752704620361, + "rewards/margins": 3.6449685096740723, + "rewards/rejected": -4.528243541717529, + "step": 401 + }, + { + "epoch": 0.08, + "learning_rate": 1.839495798319328e-05, + "logits/chosen": -1.9824044704437256, + "logits/rejected": -2.199185371398926, + "logps/chosen": -332.078125, + "logps/rejected": -417.3800354003906, + "loss": 0.6519, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5442051887512207, + "rewards/margins": 1.857395887374878, + "rewards/rejected": -3.4016010761260986, + "step": 402 + }, + { + "epoch": 0.08, + "learning_rate": 1.839075630252101e-05, + "logits/chosen": -1.8881011009216309, + "logits/rejected": -2.0036232471466064, + "logps/chosen": -272.0240783691406, + "logps/rejected": -298.3892822265625, + "loss": 0.4006, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4687628746032715, + "rewards/margins": 3.4236550331115723, + "rewards/rejected": -4.892417907714844, + "step": 403 + }, + { + "epoch": 0.08, + "learning_rate": 1.838655462184874e-05, + "logits/chosen": -2.3152854442596436, + "logits/rejected": -1.3793350458145142, + "logps/chosen": -408.720947265625, + "logps/rejected": -259.06597900390625, + "loss": 0.2866, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6584242582321167, + "rewards/margins": 3.0207679271698, + "rewards/rejected": -4.679192066192627, + "step": 404 + }, + { + "epoch": 0.08, + "learning_rate": 1.8382352941176472e-05, + "logits/chosen": -2.289783477783203, + "logits/rejected": -1.8729841709136963, + "logps/chosen": -449.4166259765625, + "logps/rejected": -440.9778747558594, + "loss": 0.3789, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3342485427856445, + "rewards/margins": 2.9533896446228027, + "rewards/rejected": -4.287638187408447, + "step": 405 + }, + { + "epoch": 0.08, + "learning_rate": 1.8378151260504203e-05, + "logits/chosen": -2.270742416381836, + "logits/rejected": -2.2102785110473633, + "logps/chosen": -264.51678466796875, + "logps/rejected": -236.71121215820312, + "loss": 0.4186, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9885307550430298, + "rewards/margins": 1.7186174392700195, + "rewards/rejected": -2.7071480751037598, + "step": 406 + }, + { + "epoch": 0.09, + "learning_rate": 1.8373949579831933e-05, + "logits/chosen": -2.402445077896118, + "logits/rejected": -1.7826098203659058, + "logps/chosen": -416.5032043457031, + "logps/rejected": -298.052001953125, + "loss": 0.3506, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5602412223815918, + "rewards/margins": 2.2944812774658203, + "rewards/rejected": -2.854722499847412, + "step": 407 + }, + { + "epoch": 0.09, + "learning_rate": 1.8369747899159663e-05, + "logits/chosen": -2.239947557449341, + "logits/rejected": -2.2207236289978027, + "logps/chosen": -381.25054931640625, + "logps/rejected": -301.436767578125, + "loss": 0.4853, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1972332000732422, + "rewards/margins": 2.953791856765747, + "rewards/rejected": -4.15102481842041, + "step": 408 + }, + { + "epoch": 0.09, + "learning_rate": 1.8365546218487397e-05, + "logits/chosen": -2.1951308250427246, + "logits/rejected": -1.747607707977295, + "logps/chosen": -286.4284973144531, + "logps/rejected": -260.4895324707031, + "loss": 0.503, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4415708780288696, + "rewards/margins": 1.7284908294677734, + "rewards/rejected": -3.1700613498687744, + "step": 409 + }, + { + "epoch": 0.09, + "learning_rate": 1.8361344537815127e-05, + "logits/chosen": -1.9372808933258057, + "logits/rejected": -2.1047611236572266, + "logps/chosen": -199.15914916992188, + "logps/rejected": -289.2158203125, + "loss": 0.605, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5658094882965088, + "rewards/margins": 2.872337818145752, + "rewards/rejected": -3.43814754486084, + "step": 410 + }, + { + "epoch": 0.09, + "learning_rate": 1.8357142857142857e-05, + "logits/chosen": -2.3187527656555176, + "logits/rejected": -1.9968873262405396, + "logps/chosen": -387.2622375488281, + "logps/rejected": -331.4651794433594, + "loss": 0.1481, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3018587827682495, + "rewards/margins": 3.6443562507629395, + "rewards/rejected": -3.9462149143218994, + "step": 411 + }, + { + "epoch": 0.09, + "learning_rate": 1.8352941176470587e-05, + "logits/chosen": -2.2263684272766113, + "logits/rejected": -1.7158617973327637, + "logps/chosen": -283.41741943359375, + "logps/rejected": -374.18658447265625, + "loss": 0.3547, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.10880038887262344, + "rewards/margins": 2.185023069381714, + "rewards/rejected": -2.0762226581573486, + "step": 412 + }, + { + "epoch": 0.09, + "learning_rate": 1.834873949579832e-05, + "logits/chosen": -2.1612541675567627, + "logits/rejected": -2.051351547241211, + "logps/chosen": -413.21234130859375, + "logps/rejected": -358.8020324707031, + "loss": 0.4937, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3436061143875122, + "rewards/margins": 3.0096888542175293, + "rewards/rejected": -2.6660826206207275, + "step": 413 + }, + { + "epoch": 0.09, + "learning_rate": 1.834453781512605e-05, + "logits/chosen": -2.178018569946289, + "logits/rejected": -1.7989444732666016, + "logps/chosen": -366.8251953125, + "logps/rejected": -301.50555419921875, + "loss": 0.1768, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.47563356161117554, + "rewards/margins": 2.7578303813934326, + "rewards/rejected": -2.282196521759033, + "step": 414 + }, + { + "epoch": 0.09, + "learning_rate": 1.834033613445378e-05, + "logits/chosen": -2.2351861000061035, + "logits/rejected": -1.9579159021377563, + "logps/chosen": -255.4895782470703, + "logps/rejected": -261.81390380859375, + "loss": 0.3326, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.47969090938568115, + "rewards/margins": 2.6430249214172363, + "rewards/rejected": -3.122715950012207, + "step": 415 + }, + { + "epoch": 0.09, + "learning_rate": 1.8336134453781515e-05, + "logits/chosen": -1.8611608743667603, + "logits/rejected": -2.120920419692993, + "logps/chosen": -257.4296875, + "logps/rejected": -257.55029296875, + "loss": 0.192, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24374975264072418, + "rewards/margins": 3.0062265396118164, + "rewards/rejected": -3.24997615814209, + "step": 416 + }, + { + "epoch": 0.09, + "learning_rate": 1.8331932773109245e-05, + "logits/chosen": -1.9984084367752075, + "logits/rejected": -1.6272146701812744, + "logps/chosen": -280.01177978515625, + "logps/rejected": -275.6769714355469, + "loss": 0.4414, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8667396306991577, + "rewards/margins": 2.1595852375030518, + "rewards/rejected": -3.02632474899292, + "step": 417 + }, + { + "epoch": 0.09, + "learning_rate": 1.8327731092436975e-05, + "logits/chosen": -1.8926503658294678, + "logits/rejected": -1.1714746952056885, + "logps/chosen": -479.9905700683594, + "logps/rejected": -263.75341796875, + "loss": 0.4008, + "rewards/accuracies": 0.6875, + "rewards/chosen": 1.0290093421936035, + "rewards/margins": 2.638532876968384, + "rewards/rejected": -1.6095231771469116, + "step": 418 + }, + { + "epoch": 0.09, + "learning_rate": 1.8323529411764706e-05, + "logits/chosen": -2.300912380218506, + "logits/rejected": -2.214662551879883, + "logps/chosen": -399.84088134765625, + "logps/rejected": -403.6318664550781, + "loss": 0.2769, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5819576978683472, + "rewards/margins": 2.674020767211914, + "rewards/rejected": -2.0920627117156982, + "step": 419 + }, + { + "epoch": 0.09, + "learning_rate": 1.831932773109244e-05, + "logits/chosen": -1.975816011428833, + "logits/rejected": -2.032226085662842, + "logps/chosen": -272.3787841796875, + "logps/rejected": -283.6316833496094, + "loss": 0.4659, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.2719263732433319, + "rewards/margins": 1.9665776491165161, + "rewards/rejected": -2.238503932952881, + "step": 420 + }, + { + "epoch": 0.09, + "learning_rate": 1.831512605042017e-05, + "logits/chosen": -2.149604558944702, + "logits/rejected": -1.8828272819519043, + "logps/chosen": -262.6552734375, + "logps/rejected": -235.02587890625, + "loss": 0.2882, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.3024459183216095, + "rewards/margins": 3.7423324584960938, + "rewards/rejected": -4.044778823852539, + "step": 421 + }, + { + "epoch": 0.09, + "learning_rate": 1.83109243697479e-05, + "logits/chosen": -2.101395845413208, + "logits/rejected": -1.5884850025177002, + "logps/chosen": -332.09197998046875, + "logps/rejected": -277.72662353515625, + "loss": 0.3119, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.133995920419693, + "rewards/margins": 2.948085069656372, + "rewards/rejected": -2.81408953666687, + "step": 422 + }, + { + "epoch": 0.09, + "learning_rate": 1.830672268907563e-05, + "logits/chosen": -1.9242068529129028, + "logits/rejected": -2.0045411586761475, + "logps/chosen": -353.66900634765625, + "logps/rejected": -406.51416015625, + "loss": 0.3477, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5238945484161377, + "rewards/margins": 2.9817824363708496, + "rewards/rejected": -3.5056772232055664, + "step": 423 + }, + { + "epoch": 0.09, + "learning_rate": 1.8302521008403364e-05, + "logits/chosen": -2.0353612899780273, + "logits/rejected": -1.8938074111938477, + "logps/chosen": -286.2059631347656, + "logps/rejected": -292.628173828125, + "loss": 0.5462, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.02371799945831299, + "rewards/margins": 2.6758546829223633, + "rewards/rejected": -2.65213680267334, + "step": 424 + }, + { + "epoch": 0.09, + "learning_rate": 1.8298319327731094e-05, + "logits/chosen": -2.05151629447937, + "logits/rejected": -1.985163688659668, + "logps/chosen": -400.56103515625, + "logps/rejected": -320.9740295410156, + "loss": 0.4048, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5659486055374146, + "rewards/margins": 2.076845645904541, + "rewards/rejected": -1.5108968019485474, + "step": 425 + }, + { + "epoch": 0.09, + "learning_rate": 1.8294117647058824e-05, + "logits/chosen": -2.1431612968444824, + "logits/rejected": -2.0062804222106934, + "logps/chosen": -378.94451904296875, + "logps/rejected": -320.79449462890625, + "loss": 0.2322, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.05997392535209656, + "rewards/margins": 2.9121718406677246, + "rewards/rejected": -2.8521978855133057, + "step": 426 + }, + { + "epoch": 0.09, + "learning_rate": 1.8289915966386554e-05, + "logits/chosen": -1.6912953853607178, + "logits/rejected": -1.7182713747024536, + "logps/chosen": -303.0714111328125, + "logps/rejected": -350.28192138671875, + "loss": 0.2803, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.1679016798734665, + "rewards/margins": 2.768211603164673, + "rewards/rejected": -2.936113119125366, + "step": 427 + }, + { + "epoch": 0.09, + "learning_rate": 1.8285714285714288e-05, + "logits/chosen": -2.379438877105713, + "logits/rejected": -2.1407203674316406, + "logps/chosen": -374.27264404296875, + "logps/rejected": -381.09228515625, + "loss": 0.2139, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.26474815607070923, + "rewards/margins": 2.9242446422576904, + "rewards/rejected": -2.659496307373047, + "step": 428 + }, + { + "epoch": 0.09, + "learning_rate": 1.8281512605042018e-05, + "logits/chosen": -1.895861268043518, + "logits/rejected": -2.0332565307617188, + "logps/chosen": -337.95281982421875, + "logps/rejected": -378.36053466796875, + "loss": 0.2621, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4939233660697937, + "rewards/margins": 2.6256091594696045, + "rewards/rejected": -2.131685733795166, + "step": 429 + }, + { + "epoch": 0.09, + "learning_rate": 1.8277310924369748e-05, + "logits/chosen": -1.9959743022918701, + "logits/rejected": -1.8283321857452393, + "logps/chosen": -230.7371826171875, + "logps/rejected": -399.9266052246094, + "loss": 0.2077, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5777879357337952, + "rewards/margins": 3.7699341773986816, + "rewards/rejected": -4.347722053527832, + "step": 430 + }, + { + "epoch": 0.09, + "learning_rate": 1.827310924369748e-05, + "logits/chosen": -2.3392908573150635, + "logits/rejected": -2.0552642345428467, + "logps/chosen": -420.57196044921875, + "logps/rejected": -356.7542419433594, + "loss": 0.4148, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.16800589859485626, + "rewards/margins": 3.497415542602539, + "rewards/rejected": -3.3294098377227783, + "step": 431 + }, + { + "epoch": 0.09, + "learning_rate": 1.8268907563025212e-05, + "logits/chosen": -2.1076271533966064, + "logits/rejected": -1.7760194540023804, + "logps/chosen": -438.2403869628906, + "logps/rejected": -340.4200134277344, + "loss": 0.2498, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.29134923219680786, + "rewards/margins": 3.066493034362793, + "rewards/rejected": -2.77514386177063, + "step": 432 + }, + { + "epoch": 0.09, + "learning_rate": 1.8264705882352942e-05, + "logits/chosen": -1.9294633865356445, + "logits/rejected": -1.7531614303588867, + "logps/chosen": -268.9462890625, + "logps/rejected": -265.12286376953125, + "loss": 0.4439, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.32981252670288086, + "rewards/margins": 2.1313040256500244, + "rewards/rejected": -1.8014914989471436, + "step": 433 + }, + { + "epoch": 0.09, + "learning_rate": 1.8260504201680673e-05, + "logits/chosen": -2.2013320922851562, + "logits/rejected": -2.003314971923828, + "logps/chosen": -243.44723510742188, + "logps/rejected": -226.79901123046875, + "loss": 0.4459, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1876690834760666, + "rewards/margins": 2.4671425819396973, + "rewards/rejected": -2.279473304748535, + "step": 434 + }, + { + "epoch": 0.09, + "learning_rate": 1.8256302521008403e-05, + "logits/chosen": -2.0985069274902344, + "logits/rejected": -2.01230788230896, + "logps/chosen": -327.3497009277344, + "logps/rejected": -303.96533203125, + "loss": 0.3102, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.7715959548950195, + "rewards/margins": 3.2752957344055176, + "rewards/rejected": -2.503699779510498, + "step": 435 + }, + { + "epoch": 0.09, + "learning_rate": 1.8252100840336136e-05, + "logits/chosen": -2.1409311294555664, + "logits/rejected": -1.8834095001220703, + "logps/chosen": -344.905029296875, + "logps/rejected": -325.209716796875, + "loss": 0.3905, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.20048293471336365, + "rewards/margins": 1.9339041709899902, + "rewards/rejected": -1.7334210872650146, + "step": 436 + }, + { + "epoch": 0.09, + "learning_rate": 1.8247899159663867e-05, + "logits/chosen": -2.15683650970459, + "logits/rejected": -1.881533145904541, + "logps/chosen": -356.26531982421875, + "logps/rejected": -272.372314453125, + "loss": 0.4086, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.31405243277549744, + "rewards/margins": 2.866708755493164, + "rewards/rejected": -3.1807610988616943, + "step": 437 + }, + { + "epoch": 0.09, + "learning_rate": 1.8243697478991597e-05, + "logits/chosen": -2.448089599609375, + "logits/rejected": -1.938776969909668, + "logps/chosen": -255.0645294189453, + "logps/rejected": -228.92654418945312, + "loss": 0.4238, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.18938884139060974, + "rewards/margins": 2.685791254043579, + "rewards/rejected": -2.4964022636413574, + "step": 438 + }, + { + "epoch": 0.09, + "learning_rate": 1.823949579831933e-05, + "logits/chosen": -2.3020715713500977, + "logits/rejected": -1.7897909879684448, + "logps/chosen": -369.67706298828125, + "logps/rejected": -308.80322265625, + "loss": 0.2207, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.018297463655471802, + "rewards/margins": 2.9154388904571533, + "rewards/rejected": -2.9337363243103027, + "step": 439 + }, + { + "epoch": 0.09, + "learning_rate": 1.823529411764706e-05, + "logits/chosen": -2.04315185546875, + "logits/rejected": -2.176276206970215, + "logps/chosen": -344.2831115722656, + "logps/rejected": -365.1346435546875, + "loss": 0.2441, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.8508395552635193, + "rewards/margins": 3.6053500175476074, + "rewards/rejected": -2.7545104026794434, + "step": 440 + }, + { + "epoch": 0.09, + "learning_rate": 1.823109243697479e-05, + "logits/chosen": -1.9498186111450195, + "logits/rejected": -1.7883284091949463, + "logps/chosen": -214.4251708984375, + "logps/rejected": -277.6398620605469, + "loss": 0.3085, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.42159321904182434, + "rewards/margins": 2.0666284561157227, + "rewards/rejected": -1.6450350284576416, + "step": 441 + }, + { + "epoch": 0.09, + "learning_rate": 1.822689075630252e-05, + "logits/chosen": -2.3132596015930176, + "logits/rejected": -2.1059367656707764, + "logps/chosen": -330.3541564941406, + "logps/rejected": -354.6991271972656, + "loss": 0.5807, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.4959174692630768, + "rewards/margins": 2.7793803215026855, + "rewards/rejected": -3.2752978801727295, + "step": 442 + }, + { + "epoch": 0.09, + "learning_rate": 1.8222689075630255e-05, + "logits/chosen": -1.8083010911941528, + "logits/rejected": -1.611409306526184, + "logps/chosen": -264.4505920410156, + "logps/rejected": -268.5172424316406, + "loss": 0.2615, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.11492379009723663, + "rewards/margins": 3.0617222785949707, + "rewards/rejected": -3.1766459941864014, + "step": 443 + }, + { + "epoch": 0.09, + "learning_rate": 1.8218487394957985e-05, + "logits/chosen": -2.0951881408691406, + "logits/rejected": -1.8586359024047852, + "logps/chosen": -311.2127990722656, + "logps/rejected": -386.22259521484375, + "loss": 0.5194, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.17915694415569305, + "rewards/margins": 3.9140496253967285, + "rewards/rejected": -3.7348928451538086, + "step": 444 + }, + { + "epoch": 0.09, + "learning_rate": 1.8214285714285715e-05, + "logits/chosen": -2.2957074642181396, + "logits/rejected": -2.258702278137207, + "logps/chosen": -312.64788818359375, + "logps/rejected": -272.65411376953125, + "loss": 0.3874, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.40231961011886597, + "rewards/margins": 2.468316078186035, + "rewards/rejected": -2.870635747909546, + "step": 445 + }, + { + "epoch": 0.09, + "learning_rate": 1.8210084033613445e-05, + "logits/chosen": -2.11244535446167, + "logits/rejected": -1.9430699348449707, + "logps/chosen": -316.4300537109375, + "logps/rejected": -272.95037841796875, + "loss": 0.3922, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.45706817507743835, + "rewards/margins": 2.1827540397644043, + "rewards/rejected": -1.7256858348846436, + "step": 446 + }, + { + "epoch": 0.09, + "learning_rate": 1.820588235294118e-05, + "logits/chosen": -1.9815583229064941, + "logits/rejected": -1.6495671272277832, + "logps/chosen": -288.887451171875, + "logps/rejected": -277.43377685546875, + "loss": 0.2881, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.031989581882953644, + "rewards/margins": 2.7415804862976074, + "rewards/rejected": -2.7735700607299805, + "step": 447 + }, + { + "epoch": 0.09, + "learning_rate": 1.820168067226891e-05, + "logits/chosen": -1.7236675024032593, + "logits/rejected": -1.9960949420928955, + "logps/chosen": -238.47738647460938, + "logps/rejected": -283.5688781738281, + "loss": 0.7905, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5538796782493591, + "rewards/margins": 0.8750126361846924, + "rewards/rejected": -1.4288922548294067, + "step": 448 + }, + { + "epoch": 0.09, + "learning_rate": 1.819747899159664e-05, + "logits/chosen": -1.5379657745361328, + "logits/rejected": -1.375888705253601, + "logps/chosen": -241.681396484375, + "logps/rejected": -278.0999755859375, + "loss": 0.4768, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11749270558357239, + "rewards/margins": 2.251631498336792, + "rewards/rejected": -2.369123935699463, + "step": 449 + }, + { + "epoch": 0.09, + "learning_rate": 1.819327731092437e-05, + "logits/chosen": -1.990583062171936, + "logits/rejected": -2.044398546218872, + "logps/chosen": -333.5953369140625, + "logps/rejected": -397.26483154296875, + "loss": 0.4174, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.16924190521240234, + "rewards/margins": 1.9998054504394531, + "rewards/rejected": -2.1690473556518555, + "step": 450 + }, + { + "epoch": 0.09, + "learning_rate": 1.8189075630252103e-05, + "logits/chosen": -2.0269851684570312, + "logits/rejected": -2.1474108695983887, + "logps/chosen": -273.5993957519531, + "logps/rejected": -385.10882568359375, + "loss": 0.2793, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.12756304442882538, + "rewards/margins": 2.1246533393859863, + "rewards/rejected": -2.2522165775299072, + "step": 451 + }, + { + "epoch": 0.09, + "learning_rate": 1.8184873949579833e-05, + "logits/chosen": -2.1646690368652344, + "logits/rejected": -1.8509190082550049, + "logps/chosen": -248.11123657226562, + "logps/rejected": -261.6076965332031, + "loss": 0.2691, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.18941614031791687, + "rewards/margins": 2.55245304107666, + "rewards/rejected": -2.363036632537842, + "step": 452 + }, + { + "epoch": 0.09, + "learning_rate": 1.8180672268907564e-05, + "logits/chosen": -2.0878331661224365, + "logits/rejected": -1.8863886594772339, + "logps/chosen": -250.40542602539062, + "logps/rejected": -275.64508056640625, + "loss": 0.406, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.1500859409570694, + "rewards/margins": 2.256984233856201, + "rewards/rejected": -2.4070701599121094, + "step": 453 + }, + { + "epoch": 0.09, + "learning_rate": 1.8176470588235294e-05, + "logits/chosen": -1.9686896800994873, + "logits/rejected": -1.6723570823669434, + "logps/chosen": -274.258544921875, + "logps/rejected": -314.515869140625, + "loss": 0.4388, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.009008590131998062, + "rewards/margins": 1.8709897994995117, + "rewards/rejected": -1.8619811534881592, + "step": 454 + }, + { + "epoch": 0.1, + "learning_rate": 1.8172268907563027e-05, + "logits/chosen": -2.16030216217041, + "logits/rejected": -1.8089762926101685, + "logps/chosen": -468.20416259765625, + "logps/rejected": -275.9560546875, + "loss": 0.2804, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.0918165445327759, + "rewards/margins": 3.3607137203216553, + "rewards/rejected": -2.26889705657959, + "step": 455 + }, + { + "epoch": 0.1, + "learning_rate": 1.8168067226890758e-05, + "logits/chosen": -1.7544831037521362, + "logits/rejected": -1.9637465476989746, + "logps/chosen": -186.10647583007812, + "logps/rejected": -273.6681213378906, + "loss": 0.477, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.006809163838624954, + "rewards/margins": 1.667669415473938, + "rewards/rejected": -1.660860300064087, + "step": 456 + }, + { + "epoch": 0.1, + "learning_rate": 1.8163865546218488e-05, + "logits/chosen": -2.226810932159424, + "logits/rejected": -1.9543696641921997, + "logps/chosen": -408.641845703125, + "logps/rejected": -340.444580078125, + "loss": 0.2819, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.40470558404922485, + "rewards/margins": 2.5785889625549316, + "rewards/rejected": -2.1738831996917725, + "step": 457 + }, + { + "epoch": 0.1, + "learning_rate": 1.815966386554622e-05, + "logits/chosen": -1.8382527828216553, + "logits/rejected": -1.8056399822235107, + "logps/chosen": -353.8582763671875, + "logps/rejected": -348.326171875, + "loss": 0.5055, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.05595725029706955, + "rewards/margins": 1.7954365015029907, + "rewards/rejected": -1.851393699645996, + "step": 458 + }, + { + "epoch": 0.1, + "learning_rate": 1.8155462184873952e-05, + "logits/chosen": -2.136549472808838, + "logits/rejected": -1.4456157684326172, + "logps/chosen": -251.87380981445312, + "logps/rejected": -241.96295166015625, + "loss": 0.3531, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05324000120162964, + "rewards/margins": 2.4561290740966797, + "rewards/rejected": -2.509368896484375, + "step": 459 + }, + { + "epoch": 0.1, + "learning_rate": 1.8151260504201682e-05, + "logits/chosen": -1.7990928888320923, + "logits/rejected": -1.5568106174468994, + "logps/chosen": -321.53485107421875, + "logps/rejected": -309.19940185546875, + "loss": 0.5021, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.378719300031662, + "rewards/margins": 2.2479031085968018, + "rewards/rejected": -2.626622438430786, + "step": 460 + }, + { + "epoch": 0.1, + "learning_rate": 1.8147058823529412e-05, + "logits/chosen": -2.1900739669799805, + "logits/rejected": -1.9575533866882324, + "logps/chosen": -321.0294189453125, + "logps/rejected": -284.5794677734375, + "loss": 0.3111, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.2660277485847473, + "rewards/margins": 1.9817547798156738, + "rewards/rejected": -2.2477824687957764, + "step": 461 + }, + { + "epoch": 0.1, + "learning_rate": 1.8142857142857146e-05, + "logits/chosen": -2.106145143508911, + "logits/rejected": -1.78156316280365, + "logps/chosen": -314.9206848144531, + "logps/rejected": -266.0330810546875, + "loss": 0.3376, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.03151342272758484, + "rewards/margins": 2.3189806938171387, + "rewards/rejected": -2.2874674797058105, + "step": 462 + }, + { + "epoch": 0.1, + "learning_rate": 1.8138655462184876e-05, + "logits/chosen": -2.070035219192505, + "logits/rejected": -1.6459128856658936, + "logps/chosen": -354.7567138671875, + "logps/rejected": -307.1123352050781, + "loss": 0.3962, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.32077494263648987, + "rewards/margins": 1.6847877502441406, + "rewards/rejected": -1.3640128374099731, + "step": 463 + }, + { + "epoch": 0.1, + "learning_rate": 1.8134453781512606e-05, + "logits/chosen": -2.2774393558502197, + "logits/rejected": -1.9349925518035889, + "logps/chosen": -282.53900146484375, + "logps/rejected": -297.27093505859375, + "loss": 0.3103, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.005994489416480064, + "rewards/margins": 2.17958664894104, + "rewards/rejected": -2.1735920906066895, + "step": 464 + }, + { + "epoch": 0.1, + "learning_rate": 1.8130252100840336e-05, + "logits/chosen": -2.030294895172119, + "logits/rejected": -1.8782742023468018, + "logps/chosen": -285.802001953125, + "logps/rejected": -292.132568359375, + "loss": 0.3012, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.04754617065191269, + "rewards/margins": 1.7985600233078003, + "rewards/rejected": -1.7510137557983398, + "step": 465 + }, + { + "epoch": 0.1, + "learning_rate": 1.812605042016807e-05, + "logits/chosen": -1.79093599319458, + "logits/rejected": -1.7948484420776367, + "logps/chosen": -288.230224609375, + "logps/rejected": -319.9525146484375, + "loss": 0.2704, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.11549008637666702, + "rewards/margins": 2.987337112426758, + "rewards/rejected": -3.102827310562134, + "step": 466 + }, + { + "epoch": 0.1, + "learning_rate": 1.81218487394958e-05, + "logits/chosen": -2.0728960037231445, + "logits/rejected": -2.0928308963775635, + "logps/chosen": -272.58770751953125, + "logps/rejected": -307.96490478515625, + "loss": 0.1698, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3147255480289459, + "rewards/margins": 3.7766573429107666, + "rewards/rejected": -3.4619317054748535, + "step": 467 + }, + { + "epoch": 0.1, + "learning_rate": 1.811764705882353e-05, + "logits/chosen": -2.0510151386260986, + "logits/rejected": -1.9429408311843872, + "logps/chosen": -323.97467041015625, + "logps/rejected": -315.77252197265625, + "loss": 0.2078, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.547203540802002, + "rewards/margins": 3.029226303100586, + "rewards/rejected": -2.482023000717163, + "step": 468 + }, + { + "epoch": 0.1, + "learning_rate": 1.811344537815126e-05, + "logits/chosen": -2.0363616943359375, + "logits/rejected": -1.8428747653961182, + "logps/chosen": -343.7933654785156, + "logps/rejected": -302.945068359375, + "loss": 0.2906, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2114700973033905, + "rewards/margins": 3.440668821334839, + "rewards/rejected": -3.6521387100219727, + "step": 469 + }, + { + "epoch": 0.1, + "learning_rate": 1.8109243697478994e-05, + "logits/chosen": -2.177489757537842, + "logits/rejected": -1.9540671110153198, + "logps/chosen": -345.616455078125, + "logps/rejected": -389.21484375, + "loss": 0.2852, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.23222655057907104, + "rewards/margins": 2.5556788444519043, + "rewards/rejected": -2.3234524726867676, + "step": 470 + }, + { + "epoch": 0.1, + "learning_rate": 1.8105042016806725e-05, + "logits/chosen": -2.24438214302063, + "logits/rejected": -2.225999593734741, + "logps/chosen": -306.83306884765625, + "logps/rejected": -363.3036804199219, + "loss": 0.5031, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5617220997810364, + "rewards/margins": 2.1759278774261475, + "rewards/rejected": -2.737650156021118, + "step": 471 + }, + { + "epoch": 0.1, + "learning_rate": 1.8100840336134455e-05, + "logits/chosen": -2.2119107246398926, + "logits/rejected": -2.1409425735473633, + "logps/chosen": -456.27923583984375, + "logps/rejected": -367.75433349609375, + "loss": 0.237, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6669508218765259, + "rewards/margins": 2.9997236728668213, + "rewards/rejected": -3.6666743755340576, + "step": 472 + }, + { + "epoch": 0.1, + "learning_rate": 1.8096638655462185e-05, + "logits/chosen": -1.991231918334961, + "logits/rejected": -1.4948508739471436, + "logps/chosen": -297.85968017578125, + "logps/rejected": -290.2069396972656, + "loss": 0.3433, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5998925566673279, + "rewards/margins": 1.7855578660964966, + "rewards/rejected": -2.3854503631591797, + "step": 473 + }, + { + "epoch": 0.1, + "learning_rate": 1.809243697478992e-05, + "logits/chosen": -2.2086966037750244, + "logits/rejected": -2.0955843925476074, + "logps/chosen": -221.19500732421875, + "logps/rejected": -242.82528686523438, + "loss": 0.5517, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9625298380851746, + "rewards/margins": 2.123574733734131, + "rewards/rejected": -3.08610463142395, + "step": 474 + }, + { + "epoch": 0.1, + "learning_rate": 1.808823529411765e-05, + "logits/chosen": -2.027224063873291, + "logits/rejected": -1.9769313335418701, + "logps/chosen": -160.25396728515625, + "logps/rejected": -188.99205017089844, + "loss": 0.425, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7655349969863892, + "rewards/margins": 2.5798983573913574, + "rewards/rejected": -3.345433473587036, + "step": 475 + }, + { + "epoch": 0.1, + "learning_rate": 1.808403361344538e-05, + "logits/chosen": -1.9810261726379395, + "logits/rejected": -1.7768886089324951, + "logps/chosen": -362.0196533203125, + "logps/rejected": -354.4373779296875, + "loss": 0.5212, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6842369437217712, + "rewards/margins": 1.4541566371917725, + "rewards/rejected": -2.1383936405181885, + "step": 476 + }, + { + "epoch": 0.1, + "learning_rate": 1.807983193277311e-05, + "logits/chosen": -1.9409639835357666, + "logits/rejected": -1.6685116291046143, + "logps/chosen": -352.915283203125, + "logps/rejected": -366.5359191894531, + "loss": 0.3444, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.1256626546382904, + "rewards/margins": 3.746347665786743, + "rewards/rejected": -3.62068510055542, + "step": 477 + }, + { + "epoch": 0.1, + "learning_rate": 1.8075630252100843e-05, + "logits/chosen": -1.9778801202774048, + "logits/rejected": -1.526355266571045, + "logps/chosen": -396.0119934082031, + "logps/rejected": -335.4312438964844, + "loss": 0.1799, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.05994391441345215, + "rewards/margins": 4.086123943328857, + "rewards/rejected": -4.026180267333984, + "step": 478 + }, + { + "epoch": 0.1, + "learning_rate": 1.8071428571428573e-05, + "logits/chosen": -2.014559507369995, + "logits/rejected": -1.9628279209136963, + "logps/chosen": -287.56622314453125, + "logps/rejected": -300.8746032714844, + "loss": 0.2029, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.18346592783927917, + "rewards/margins": 3.5677192211151123, + "rewards/rejected": -3.751185178756714, + "step": 479 + }, + { + "epoch": 0.1, + "learning_rate": 1.8067226890756303e-05, + "logits/chosen": -2.129014015197754, + "logits/rejected": -1.7606970071792603, + "logps/chosen": -255.42312622070312, + "logps/rejected": -274.4722900390625, + "loss": 0.3294, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8223708271980286, + "rewards/margins": 2.844892978668213, + "rewards/rejected": -3.6672637462615967, + "step": 480 + }, + { + "epoch": 0.1, + "learning_rate": 1.8063025210084037e-05, + "logits/chosen": -2.204853057861328, + "logits/rejected": -1.9907968044281006, + "logps/chosen": -490.4839782714844, + "logps/rejected": -380.83978271484375, + "loss": 0.142, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.12432041764259338, + "rewards/margins": 3.2527475357055664, + "rewards/rejected": -3.377067804336548, + "step": 481 + }, + { + "epoch": 0.1, + "learning_rate": 1.8058823529411767e-05, + "logits/chosen": -2.3475499153137207, + "logits/rejected": -1.9604389667510986, + "logps/chosen": -304.75970458984375, + "logps/rejected": -287.97637939453125, + "loss": 0.2535, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9087990522384644, + "rewards/margins": 3.345973491668701, + "rewards/rejected": -4.254772186279297, + "step": 482 + }, + { + "epoch": 0.1, + "learning_rate": 1.8054621848739497e-05, + "logits/chosen": -2.079780340194702, + "logits/rejected": -2.1623263359069824, + "logps/chosen": -299.0748596191406, + "logps/rejected": -359.23858642578125, + "loss": 0.3861, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4798990786075592, + "rewards/margins": 3.2230770587921143, + "rewards/rejected": -3.7029762268066406, + "step": 483 + }, + { + "epoch": 0.1, + "learning_rate": 1.8050420168067228e-05, + "logits/chosen": -1.9328267574310303, + "logits/rejected": -1.971990942955017, + "logps/chosen": -213.33895874023438, + "logps/rejected": -288.5398864746094, + "loss": 0.4663, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7918972373008728, + "rewards/margins": 2.957124710083008, + "rewards/rejected": -3.749021530151367, + "step": 484 + }, + { + "epoch": 0.1, + "learning_rate": 1.804621848739496e-05, + "logits/chosen": -2.017963171005249, + "logits/rejected": -1.8460841178894043, + "logps/chosen": -289.8305969238281, + "logps/rejected": -289.7749938964844, + "loss": 1.0281, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.8532503843307495, + "rewards/margins": 1.8140392303466797, + "rewards/rejected": -3.6672897338867188, + "step": 485 + }, + { + "epoch": 0.1, + "learning_rate": 1.804201680672269e-05, + "logits/chosen": -2.2805466651916504, + "logits/rejected": -1.931815266609192, + "logps/chosen": -319.70196533203125, + "logps/rejected": -238.758056640625, + "loss": 0.3343, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.938738226890564, + "rewards/margins": 3.0914947986602783, + "rewards/rejected": -4.030232906341553, + "step": 486 + }, + { + "epoch": 0.1, + "learning_rate": 1.803781512605042e-05, + "logits/chosen": -2.1851680278778076, + "logits/rejected": -2.0835630893707275, + "logps/chosen": -274.6766052246094, + "logps/rejected": -432.77618408203125, + "loss": 0.2779, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8493441939353943, + "rewards/margins": 3.7885684967041016, + "rewards/rejected": -4.637912750244141, + "step": 487 + }, + { + "epoch": 0.1, + "learning_rate": 1.8033613445378152e-05, + "logits/chosen": -1.6215686798095703, + "logits/rejected": -1.8605685234069824, + "logps/chosen": -204.21066284179688, + "logps/rejected": -253.59251403808594, + "loss": 0.1575, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0725224018096924, + "rewards/margins": 3.6719846725463867, + "rewards/rejected": -4.744507312774658, + "step": 488 + }, + { + "epoch": 0.1, + "learning_rate": 1.8029411764705886e-05, + "logits/chosen": -1.9747123718261719, + "logits/rejected": -1.6317942142486572, + "logps/chosen": -457.21624755859375, + "logps/rejected": -340.9676208496094, + "loss": 0.3963, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.4580353796482086, + "rewards/margins": 2.5620951652526855, + "rewards/rejected": -3.0201303958892822, + "step": 489 + }, + { + "epoch": 0.1, + "learning_rate": 1.8025210084033616e-05, + "logits/chosen": -2.1669583320617676, + "logits/rejected": -1.611142635345459, + "logps/chosen": -335.9529113769531, + "logps/rejected": -296.7880554199219, + "loss": 0.2719, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.3707687556743622, + "rewards/margins": 2.8880085945129395, + "rewards/rejected": -3.258777141571045, + "step": 490 + }, + { + "epoch": 0.1, + "learning_rate": 1.8021008403361346e-05, + "logits/chosen": -2.119159460067749, + "logits/rejected": -1.8584749698638916, + "logps/chosen": -445.0024108886719, + "logps/rejected": -498.0438232421875, + "loss": 0.2557, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.10292220115661621, + "rewards/margins": 3.0323965549468994, + "rewards/rejected": -2.929474353790283, + "step": 491 + }, + { + "epoch": 0.1, + "learning_rate": 1.8016806722689076e-05, + "logits/chosen": -2.186915874481201, + "logits/rejected": -2.149617910385132, + "logps/chosen": -254.51690673828125, + "logps/rejected": -338.875244140625, + "loss": 0.3422, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8599196672439575, + "rewards/margins": 3.1991493701934814, + "rewards/rejected": -4.0590691566467285, + "step": 492 + }, + { + "epoch": 0.1, + "learning_rate": 1.801260504201681e-05, + "logits/chosen": -2.3135881423950195, + "logits/rejected": -2.0308024883270264, + "logps/chosen": -365.0280456542969, + "logps/rejected": -309.6629638671875, + "loss": 0.3966, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1584479808807373, + "rewards/margins": 3.358734607696533, + "rewards/rejected": -3.5171828269958496, + "step": 493 + }, + { + "epoch": 0.1, + "learning_rate": 1.800840336134454e-05, + "logits/chosen": -1.8752378225326538, + "logits/rejected": -1.6269867420196533, + "logps/chosen": -290.3400573730469, + "logps/rejected": -318.9837951660156, + "loss": 0.5858, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4499101638793945, + "rewards/margins": 1.7428889274597168, + "rewards/rejected": -3.1927988529205322, + "step": 494 + }, + { + "epoch": 0.1, + "learning_rate": 1.800420168067227e-05, + "logits/chosen": -2.2372565269470215, + "logits/rejected": -2.0063397884368896, + "logps/chosen": -354.43402099609375, + "logps/rejected": -352.0789489746094, + "loss": 0.6456, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7475398778915405, + "rewards/margins": 1.893190622329712, + "rewards/rejected": -2.640730381011963, + "step": 495 + }, + { + "epoch": 0.1, + "learning_rate": 1.8e-05, + "logits/chosen": -1.9513461589813232, + "logits/rejected": -1.806603193283081, + "logps/chosen": -261.93560791015625, + "logps/rejected": -308.74566650390625, + "loss": 0.6531, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3341524600982666, + "rewards/margins": 1.4642958641052246, + "rewards/rejected": -2.798448085784912, + "step": 496 + }, + { + "epoch": 0.1, + "learning_rate": 1.7995798319327734e-05, + "logits/chosen": -1.8942919969558716, + "logits/rejected": -1.8682172298431396, + "logps/chosen": -290.3580017089844, + "logps/rejected": -304.41546630859375, + "loss": 0.1651, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6255410313606262, + "rewards/margins": 3.0988292694091797, + "rewards/rejected": -3.7243704795837402, + "step": 497 + }, + { + "epoch": 0.1, + "learning_rate": 1.7991596638655464e-05, + "logits/chosen": -2.0446813106536865, + "logits/rejected": -1.956098198890686, + "logps/chosen": -370.5096740722656, + "logps/rejected": -436.359130859375, + "loss": 0.3016, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.5994164943695068, + "rewards/margins": 2.8163557052612305, + "rewards/rejected": -3.4157724380493164, + "step": 498 + }, + { + "epoch": 0.1, + "learning_rate": 1.7987394957983195e-05, + "logits/chosen": -2.373622417449951, + "logits/rejected": -2.1277596950531006, + "logps/chosen": -410.5650634765625, + "logps/rejected": -336.9373474121094, + "loss": 0.1102, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.22633305191993713, + "rewards/margins": 3.778165340423584, + "rewards/rejected": -3.5518321990966797, + "step": 499 + }, + { + "epoch": 0.1, + "learning_rate": 1.7983193277310925e-05, + "logits/chosen": -2.2056350708007812, + "logits/rejected": -1.6106029748916626, + "logps/chosen": -333.8369140625, + "logps/rejected": -255.19503784179688, + "loss": 0.1935, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.2770302891731262, + "rewards/margins": 3.305591106414795, + "rewards/rejected": -3.5826215744018555, + "step": 500 + }, + { + "epoch": 0.1, + "learning_rate": 1.797899159663866e-05, + "logits/chosen": -2.2231571674346924, + "logits/rejected": -2.1447365283966064, + "logps/chosen": -351.8971862792969, + "logps/rejected": -331.91748046875, + "loss": 0.6916, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7122281789779663, + "rewards/margins": 2.016400098800659, + "rewards/rejected": -2.728628396987915, + "step": 501 + }, + { + "epoch": 0.11, + "learning_rate": 1.797478991596639e-05, + "logits/chosen": -2.379485607147217, + "logits/rejected": -1.968024492263794, + "logps/chosen": -301.2986145019531, + "logps/rejected": -281.73876953125, + "loss": 0.3369, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6425855159759521, + "rewards/margins": 3.2059216499328613, + "rewards/rejected": -3.8485074043273926, + "step": 502 + }, + { + "epoch": 0.11, + "learning_rate": 1.797058823529412e-05, + "logits/chosen": -2.228400468826294, + "logits/rejected": -1.5500074625015259, + "logps/chosen": -419.002685546875, + "logps/rejected": -314.27691650390625, + "loss": 0.3732, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.3209814429283142, + "rewards/margins": 2.987415075302124, + "rewards/rejected": -3.308396339416504, + "step": 503 + }, + { + "epoch": 0.11, + "learning_rate": 1.7966386554621852e-05, + "logits/chosen": -1.9893072843551636, + "logits/rejected": -2.028350353240967, + "logps/chosen": -378.90264892578125, + "logps/rejected": -348.1063232421875, + "loss": 0.3905, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0967445820569992, + "rewards/margins": 2.305095911026001, + "rewards/rejected": -2.2083513736724854, + "step": 504 + }, + { + "epoch": 0.11, + "learning_rate": 1.7962184873949583e-05, + "logits/chosen": -2.348510265350342, + "logits/rejected": -2.255265712738037, + "logps/chosen": -321.7799377441406, + "logps/rejected": -277.0984191894531, + "loss": 0.2913, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3429746627807617, + "rewards/margins": 3.408268928527832, + "rewards/rejected": -3.7512435913085938, + "step": 505 + }, + { + "epoch": 0.11, + "learning_rate": 1.7957983193277313e-05, + "logits/chosen": -2.207750082015991, + "logits/rejected": -2.218050003051758, + "logps/chosen": -267.4234313964844, + "logps/rejected": -365.1224365234375, + "loss": 0.4753, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.36581629514694214, + "rewards/margins": 1.6360876560211182, + "rewards/rejected": -2.001904010772705, + "step": 506 + }, + { + "epoch": 0.11, + "learning_rate": 1.7953781512605043e-05, + "logits/chosen": -2.389249086380005, + "logits/rejected": -2.1676321029663086, + "logps/chosen": -297.28271484375, + "logps/rejected": -332.85797119140625, + "loss": 0.1328, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2257842868566513, + "rewards/margins": 3.5806705951690674, + "rewards/rejected": -3.806454658508301, + "step": 507 + }, + { + "epoch": 0.11, + "learning_rate": 1.7949579831932777e-05, + "logits/chosen": -2.205676317214966, + "logits/rejected": -2.053122043609619, + "logps/chosen": -449.7177734375, + "logps/rejected": -374.27874755859375, + "loss": 0.5052, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.2652510106563568, + "rewards/margins": 2.2145097255706787, + "rewards/rejected": -2.4797608852386475, + "step": 508 + }, + { + "epoch": 0.11, + "learning_rate": 1.7945378151260507e-05, + "logits/chosen": -2.2396016120910645, + "logits/rejected": -1.7451249361038208, + "logps/chosen": -240.06060791015625, + "logps/rejected": -221.070068359375, + "loss": 0.1928, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.20131206512451172, + "rewards/margins": 3.187617778778076, + "rewards/rejected": -3.388929843902588, + "step": 509 + }, + { + "epoch": 0.11, + "learning_rate": 1.7941176470588237e-05, + "logits/chosen": -2.279101848602295, + "logits/rejected": -1.6466021537780762, + "logps/chosen": -363.7454528808594, + "logps/rejected": -322.2547607421875, + "loss": 0.1723, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.05343270301818848, + "rewards/margins": 4.262198448181152, + "rewards/rejected": -4.315630912780762, + "step": 510 + }, + { + "epoch": 0.11, + "learning_rate": 1.7936974789915967e-05, + "logits/chosen": -2.101118803024292, + "logits/rejected": -2.1309876441955566, + "logps/chosen": -376.3255920410156, + "logps/rejected": -325.61444091796875, + "loss": 0.7477, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6300605535507202, + "rewards/margins": 1.8778836727142334, + "rewards/rejected": -2.507944107055664, + "step": 511 + }, + { + "epoch": 0.11, + "learning_rate": 1.79327731092437e-05, + "logits/chosen": -1.7612611055374146, + "logits/rejected": -1.7461483478546143, + "logps/chosen": -290.1326904296875, + "logps/rejected": -362.45892333984375, + "loss": 0.2199, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9684104323387146, + "rewards/margins": 3.9241747856140137, + "rewards/rejected": -4.892585277557373, + "step": 512 + }, + { + "epoch": 0.11, + "learning_rate": 1.792857142857143e-05, + "logits/chosen": -2.28304123878479, + "logits/rejected": -2.103008270263672, + "logps/chosen": -384.8870849609375, + "logps/rejected": -371.3544616699219, + "loss": 0.562, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7352423667907715, + "rewards/margins": 2.027202606201172, + "rewards/rejected": -2.7624449729919434, + "step": 513 + }, + { + "epoch": 0.11, + "learning_rate": 1.792436974789916e-05, + "logits/chosen": -2.233563184738159, + "logits/rejected": -2.147578001022339, + "logps/chosen": -337.4376525878906, + "logps/rejected": -295.6685485839844, + "loss": 0.2794, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.6074300408363342, + "rewards/margins": 3.869246006011963, + "rewards/rejected": -4.476675987243652, + "step": 514 + }, + { + "epoch": 0.11, + "learning_rate": 1.792016806722689e-05, + "logits/chosen": -1.6828869581222534, + "logits/rejected": -1.786900281906128, + "logps/chosen": -221.24606323242188, + "logps/rejected": -391.76629638671875, + "loss": 0.4184, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0562387704849243, + "rewards/margins": 3.1828885078430176, + "rewards/rejected": -4.239127159118652, + "step": 515 + }, + { + "epoch": 0.11, + "learning_rate": 1.7915966386554625e-05, + "logits/chosen": -2.1881747245788574, + "logits/rejected": -2.139512777328491, + "logps/chosen": -307.0408020019531, + "logps/rejected": -322.06536865234375, + "loss": 0.2386, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.27792808413505554, + "rewards/margins": 2.9610986709594727, + "rewards/rejected": -3.2390267848968506, + "step": 516 + }, + { + "epoch": 0.11, + "learning_rate": 1.7911764705882355e-05, + "logits/chosen": -2.1356711387634277, + "logits/rejected": -2.504804849624634, + "logps/chosen": -209.44581604003906, + "logps/rejected": -273.0061950683594, + "loss": 0.3096, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7350555062294006, + "rewards/margins": 2.5185976028442383, + "rewards/rejected": -3.253653049468994, + "step": 517 + }, + { + "epoch": 0.11, + "learning_rate": 1.7907563025210086e-05, + "logits/chosen": -2.1660633087158203, + "logits/rejected": -2.1959242820739746, + "logps/chosen": -333.2518615722656, + "logps/rejected": -337.03033447265625, + "loss": 0.4121, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2149726152420044, + "rewards/margins": 1.8880847692489624, + "rewards/rejected": -3.103057384490967, + "step": 518 + }, + { + "epoch": 0.11, + "learning_rate": 1.7903361344537816e-05, + "logits/chosen": -2.237673044204712, + "logits/rejected": -2.227508544921875, + "logps/chosen": -384.73095703125, + "logps/rejected": -402.16607666015625, + "loss": 0.7928, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.4252049922943115, + "rewards/margins": 2.3365488052368164, + "rewards/rejected": -2.761753559112549, + "step": 519 + }, + { + "epoch": 0.11, + "learning_rate": 1.789915966386555e-05, + "logits/chosen": -2.355433464050293, + "logits/rejected": -2.2755563259124756, + "logps/chosen": -283.7584533691406, + "logps/rejected": -316.8488464355469, + "loss": 0.2614, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.29245471954345703, + "rewards/margins": 2.292234420776367, + "rewards/rejected": -2.584689140319824, + "step": 520 + }, + { + "epoch": 0.11, + "learning_rate": 1.789495798319328e-05, + "logits/chosen": -2.184380292892456, + "logits/rejected": -1.882248878479004, + "logps/chosen": -272.8014221191406, + "logps/rejected": -222.315673828125, + "loss": 0.2975, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.46726298332214355, + "rewards/margins": 2.8321585655212402, + "rewards/rejected": -3.299421548843384, + "step": 521 + }, + { + "epoch": 0.11, + "learning_rate": 1.789075630252101e-05, + "logits/chosen": -2.010697841644287, + "logits/rejected": -1.7495272159576416, + "logps/chosen": -349.9699401855469, + "logps/rejected": -330.77490234375, + "loss": 0.7439, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.896056056022644, + "rewards/margins": 1.1062815189361572, + "rewards/rejected": -2.0023374557495117, + "step": 522 + }, + { + "epoch": 0.11, + "learning_rate": 1.788655462184874e-05, + "logits/chosen": -2.327357530593872, + "logits/rejected": -1.9403951168060303, + "logps/chosen": -405.6573486328125, + "logps/rejected": -302.9195861816406, + "loss": 0.5474, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7912900447845459, + "rewards/margins": 3.085538625717163, + "rewards/rejected": -3.876828670501709, + "step": 523 + }, + { + "epoch": 0.11, + "learning_rate": 1.7882352941176474e-05, + "logits/chosen": -1.9623956680297852, + "logits/rejected": -1.873540997505188, + "logps/chosen": -307.3180847167969, + "logps/rejected": -324.9881286621094, + "loss": 0.5614, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1170825958251953, + "rewards/margins": 2.3464694023132324, + "rewards/rejected": -3.463552236557007, + "step": 524 + }, + { + "epoch": 0.11, + "learning_rate": 1.7878151260504204e-05, + "logits/chosen": -2.4614877700805664, + "logits/rejected": -2.22268009185791, + "logps/chosen": -344.09326171875, + "logps/rejected": -350.6828918457031, + "loss": 0.2951, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10511809587478638, + "rewards/margins": 2.6819238662719727, + "rewards/rejected": -2.787041664123535, + "step": 525 + }, + { + "epoch": 0.11, + "learning_rate": 1.7873949579831934e-05, + "logits/chosen": -1.8780057430267334, + "logits/rejected": -1.9672216176986694, + "logps/chosen": -231.87127685546875, + "logps/rejected": -419.1820373535156, + "loss": 0.4756, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7887938022613525, + "rewards/margins": 1.867379069328308, + "rewards/rejected": -2.65617299079895, + "step": 526 + }, + { + "epoch": 0.11, + "learning_rate": 1.7869747899159668e-05, + "logits/chosen": -2.363224744796753, + "logits/rejected": -1.1982814073562622, + "logps/chosen": -480.41229248046875, + "logps/rejected": -321.1180419921875, + "loss": 0.6134, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0522228479385376, + "rewards/margins": 2.965284824371338, + "rewards/rejected": -4.017507553100586, + "step": 527 + }, + { + "epoch": 0.11, + "learning_rate": 1.7865546218487398e-05, + "logits/chosen": -1.6889622211456299, + "logits/rejected": -1.8687525987625122, + "logps/chosen": -249.06980895996094, + "logps/rejected": -330.18609619140625, + "loss": 0.1438, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.12231326103210449, + "rewards/margins": 4.1892313957214355, + "rewards/rejected": -4.066917896270752, + "step": 528 + }, + { + "epoch": 0.11, + "learning_rate": 1.7861344537815128e-05, + "logits/chosen": -2.195996046066284, + "logits/rejected": -2.0173168182373047, + "logps/chosen": -237.76255798339844, + "logps/rejected": -277.70904541015625, + "loss": 0.1484, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.40628090500831604, + "rewards/margins": 4.171627998352051, + "rewards/rejected": -4.577908515930176, + "step": 529 + }, + { + "epoch": 0.11, + "learning_rate": 1.785714285714286e-05, + "logits/chosen": -2.086538553237915, + "logits/rejected": -2.025927782058716, + "logps/chosen": -330.7870788574219, + "logps/rejected": -331.86041259765625, + "loss": 0.202, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0680372714996338, + "rewards/margins": 2.367128372192383, + "rewards/rejected": -3.4351658821105957, + "step": 530 + }, + { + "epoch": 0.11, + "learning_rate": 1.7852941176470592e-05, + "logits/chosen": -2.1370768547058105, + "logits/rejected": -1.6344835758209229, + "logps/chosen": -359.1519775390625, + "logps/rejected": -354.7419738769531, + "loss": 0.4844, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.3396291434764862, + "rewards/margins": 2.7243471145629883, + "rewards/rejected": -3.063976287841797, + "step": 531 + }, + { + "epoch": 0.11, + "learning_rate": 1.7848739495798322e-05, + "logits/chosen": -2.042631149291992, + "logits/rejected": -1.798729658126831, + "logps/chosen": -284.9844665527344, + "logps/rejected": -292.60137939453125, + "loss": 0.4142, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0338668823242188, + "rewards/margins": 2.046664237976074, + "rewards/rejected": -3.080531120300293, + "step": 532 + }, + { + "epoch": 0.11, + "learning_rate": 1.7844537815126053e-05, + "logits/chosen": -2.2620251178741455, + "logits/rejected": -2.0029969215393066, + "logps/chosen": -419.5677490234375, + "logps/rejected": -338.8554992675781, + "loss": 0.0406, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.45283305644989014, + "rewards/margins": 4.5514655113220215, + "rewards/rejected": -5.004298686981201, + "step": 533 + }, + { + "epoch": 0.11, + "learning_rate": 1.7840336134453783e-05, + "logits/chosen": -2.1992368698120117, + "logits/rejected": -1.9105582237243652, + "logps/chosen": -324.9189453125, + "logps/rejected": -299.4775085449219, + "loss": 0.1979, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2988405227661133, + "rewards/margins": 4.087227821350098, + "rewards/rejected": -5.386068344116211, + "step": 534 + }, + { + "epoch": 0.11, + "learning_rate": 1.7836134453781516e-05, + "logits/chosen": -2.2962145805358887, + "logits/rejected": -2.087346076965332, + "logps/chosen": -348.236572265625, + "logps/rejected": -322.84429931640625, + "loss": 0.5034, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.693610668182373, + "rewards/margins": 2.676043748855591, + "rewards/rejected": -4.369654655456543, + "step": 535 + }, + { + "epoch": 0.11, + "learning_rate": 1.7831932773109247e-05, + "logits/chosen": -1.9971843957901, + "logits/rejected": -1.7039964199066162, + "logps/chosen": -224.19512939453125, + "logps/rejected": -237.74679565429688, + "loss": 0.3386, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9335631132125854, + "rewards/margins": 3.0680341720581055, + "rewards/rejected": -4.0015974044799805, + "step": 536 + }, + { + "epoch": 0.11, + "learning_rate": 1.7827731092436977e-05, + "logits/chosen": -2.22658109664917, + "logits/rejected": -1.9773976802825928, + "logps/chosen": -395.08038330078125, + "logps/rejected": -408.44561767578125, + "loss": 0.2763, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8340373039245605, + "rewards/margins": 4.448478698730469, + "rewards/rejected": -5.2825164794921875, + "step": 537 + }, + { + "epoch": 0.11, + "learning_rate": 1.7823529411764707e-05, + "logits/chosen": -2.2323849201202393, + "logits/rejected": -1.7051142454147339, + "logps/chosen": -393.07769775390625, + "logps/rejected": -348.2321472167969, + "loss": 0.4894, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5521345138549805, + "rewards/margins": 3.7498769760131836, + "rewards/rejected": -5.302011489868164, + "step": 538 + }, + { + "epoch": 0.11, + "learning_rate": 1.781932773109244e-05, + "logits/chosen": -2.0877904891967773, + "logits/rejected": -2.138853073120117, + "logps/chosen": -446.02630615234375, + "logps/rejected": -451.53948974609375, + "loss": 0.6178, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.223884344100952, + "rewards/margins": 2.2587437629699707, + "rewards/rejected": -4.482627868652344, + "step": 539 + }, + { + "epoch": 0.11, + "learning_rate": 1.781512605042017e-05, + "logits/chosen": -2.0213232040405273, + "logits/rejected": -1.9649556875228882, + "logps/chosen": -306.36065673828125, + "logps/rejected": -388.1372375488281, + "loss": 0.1717, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5134642124176025, + "rewards/margins": 3.3441970348358154, + "rewards/rejected": -4.857661247253418, + "step": 540 + }, + { + "epoch": 0.11, + "learning_rate": 1.78109243697479e-05, + "logits/chosen": -2.0425426959991455, + "logits/rejected": -2.32568621635437, + "logps/chosen": -323.7206726074219, + "logps/rejected": -368.44854736328125, + "loss": 0.3465, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.2193355560302734, + "rewards/margins": 3.305173397064209, + "rewards/rejected": -5.524508953094482, + "step": 541 + }, + { + "epoch": 0.11, + "learning_rate": 1.780672268907563e-05, + "logits/chosen": -1.8869844675064087, + "logits/rejected": -1.5312440395355225, + "logps/chosen": -273.87091064453125, + "logps/rejected": -291.2333984375, + "loss": 0.4641, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3759486675262451, + "rewards/margins": 2.814509630203247, + "rewards/rejected": -4.190458297729492, + "step": 542 + }, + { + "epoch": 0.11, + "learning_rate": 1.780252100840336e-05, + "logits/chosen": -2.1331233978271484, + "logits/rejected": -1.5030254125595093, + "logps/chosen": -392.5062561035156, + "logps/rejected": -298.1678161621094, + "loss": 0.5079, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5962162017822266, + "rewards/margins": 2.7842936515808105, + "rewards/rejected": -4.380509853363037, + "step": 543 + }, + { + "epoch": 0.11, + "learning_rate": 1.7798319327731092e-05, + "logits/chosen": -2.0721113681793213, + "logits/rejected": -2.161677598953247, + "logps/chosen": -285.09503173828125, + "logps/rejected": -439.75732421875, + "loss": 0.1517, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3982731103897095, + "rewards/margins": 4.4617719650268555, + "rewards/rejected": -5.860044956207275, + "step": 544 + }, + { + "epoch": 0.11, + "learning_rate": 1.7794117647058825e-05, + "logits/chosen": -2.1232032775878906, + "logits/rejected": -2.0298287868499756, + "logps/chosen": -457.37432861328125, + "logps/rejected": -397.9346923828125, + "loss": 0.5972, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.329658031463623, + "rewards/margins": 2.617422103881836, + "rewards/rejected": -3.947080135345459, + "step": 545 + }, + { + "epoch": 0.11, + "learning_rate": 1.7789915966386556e-05, + "logits/chosen": -1.8775863647460938, + "logits/rejected": -1.6165963411331177, + "logps/chosen": -399.70208740234375, + "logps/rejected": -358.88043212890625, + "loss": 0.7381, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.5014736652374268, + "rewards/margins": 2.4977059364318848, + "rewards/rejected": -4.999179840087891, + "step": 546 + }, + { + "epoch": 0.11, + "learning_rate": 1.7785714285714286e-05, + "logits/chosen": -1.9926302433013916, + "logits/rejected": -1.9439363479614258, + "logps/chosen": -380.2137756347656, + "logps/rejected": -404.1746826171875, + "loss": 0.309, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.624013900756836, + "rewards/margins": 3.4728505611419678, + "rewards/rejected": -6.096864700317383, + "step": 547 + }, + { + "epoch": 0.11, + "learning_rate": 1.7781512605042016e-05, + "logits/chosen": -2.035512685775757, + "logits/rejected": -1.6787346601486206, + "logps/chosen": -229.11582946777344, + "logps/rejected": -218.19442749023438, + "loss": 0.5592, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.526768207550049, + "rewards/margins": 2.394801616668701, + "rewards/rejected": -4.921569347381592, + "step": 548 + }, + { + "epoch": 0.11, + "learning_rate": 1.777731092436975e-05, + "logits/chosen": -1.9025330543518066, + "logits/rejected": -2.224428176879883, + "logps/chosen": -271.4559631347656, + "logps/rejected": -312.6767578125, + "loss": 0.1611, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7880305051803589, + "rewards/margins": 4.676723003387451, + "rewards/rejected": -6.4647536277771, + "step": 549 + }, + { + "epoch": 0.12, + "learning_rate": 1.777310924369748e-05, + "logits/chosen": -1.925479531288147, + "logits/rejected": -1.9021575450897217, + "logps/chosen": -276.25048828125, + "logps/rejected": -224.90701293945312, + "loss": 0.3973, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.0463192462921143, + "rewards/margins": 2.7438788414001465, + "rewards/rejected": -5.790197372436523, + "step": 550 + }, + { + "epoch": 0.12, + "learning_rate": 1.776890756302521e-05, + "logits/chosen": -2.0473976135253906, + "logits/rejected": -1.4948766231536865, + "logps/chosen": -350.2103271484375, + "logps/rejected": -359.45599365234375, + "loss": 0.2173, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7156624794006348, + "rewards/margins": 3.4159865379333496, + "rewards/rejected": -5.131649017333984, + "step": 551 + }, + { + "epoch": 0.12, + "learning_rate": 1.776470588235294e-05, + "logits/chosen": -1.8817487955093384, + "logits/rejected": -1.5503897666931152, + "logps/chosen": -342.044189453125, + "logps/rejected": -267.8489990234375, + "loss": 0.297, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5470385551452637, + "rewards/margins": 3.4172656536102295, + "rewards/rejected": -4.964304447174072, + "step": 552 + }, + { + "epoch": 0.12, + "learning_rate": 1.7760504201680674e-05, + "logits/chosen": -2.0708799362182617, + "logits/rejected": -1.9367434978485107, + "logps/chosen": -333.230712890625, + "logps/rejected": -265.77862548828125, + "loss": 0.2934, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9380170106887817, + "rewards/margins": 2.589078187942505, + "rewards/rejected": -4.527094841003418, + "step": 553 + }, + { + "epoch": 0.12, + "learning_rate": 1.7756302521008404e-05, + "logits/chosen": -1.7775119543075562, + "logits/rejected": -1.761277675628662, + "logps/chosen": -295.5238037109375, + "logps/rejected": -280.0086364746094, + "loss": 0.4768, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8351261615753174, + "rewards/margins": 3.1918423175811768, + "rewards/rejected": -5.026968002319336, + "step": 554 + }, + { + "epoch": 0.12, + "learning_rate": 1.7752100840336134e-05, + "logits/chosen": -1.835569143295288, + "logits/rejected": -1.7730984687805176, + "logps/chosen": -369.357666015625, + "logps/rejected": -363.00396728515625, + "loss": 0.3492, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.367192029953003, + "rewards/margins": 2.3578941822052, + "rewards/rejected": -4.725086212158203, + "step": 555 + }, + { + "epoch": 0.12, + "learning_rate": 1.7747899159663865e-05, + "logits/chosen": -1.9479680061340332, + "logits/rejected": -1.6829419136047363, + "logps/chosen": -450.7618713378906, + "logps/rejected": -423.020751953125, + "loss": 0.2818, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9476662874221802, + "rewards/margins": 3.371506690979004, + "rewards/rejected": -5.319173336029053, + "step": 556 + }, + { + "epoch": 0.12, + "learning_rate": 1.7743697478991598e-05, + "logits/chosen": -2.0395281314849854, + "logits/rejected": -1.7483015060424805, + "logps/chosen": -455.34112548828125, + "logps/rejected": -335.931640625, + "loss": 0.7752, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.4735007286071777, + "rewards/margins": 1.4180452823638916, + "rewards/rejected": -3.8915462493896484, + "step": 557 + }, + { + "epoch": 0.12, + "learning_rate": 1.773949579831933e-05, + "logits/chosen": -2.1122360229492188, + "logits/rejected": -1.830791711807251, + "logps/chosen": -376.23486328125, + "logps/rejected": -285.22601318359375, + "loss": 0.5763, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.0258774757385254, + "rewards/margins": 2.2812435626983643, + "rewards/rejected": -4.3071208000183105, + "step": 558 + }, + { + "epoch": 0.12, + "learning_rate": 1.773529411764706e-05, + "logits/chosen": -2.440333366394043, + "logits/rejected": -1.9048335552215576, + "logps/chosen": -411.12677001953125, + "logps/rejected": -333.35699462890625, + "loss": 0.0741, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7845588326454163, + "rewards/margins": 4.232915878295898, + "rewards/rejected": -5.017475128173828, + "step": 559 + }, + { + "epoch": 0.12, + "learning_rate": 1.7731092436974792e-05, + "logits/chosen": -2.020054340362549, + "logits/rejected": -2.4066474437713623, + "logps/chosen": -221.4602813720703, + "logps/rejected": -356.1713562011719, + "loss": 0.1724, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1847712993621826, + "rewards/margins": 4.0312042236328125, + "rewards/rejected": -6.215975761413574, + "step": 560 + }, + { + "epoch": 0.12, + "learning_rate": 1.7726890756302522e-05, + "logits/chosen": -2.2433838844299316, + "logits/rejected": -1.7616820335388184, + "logps/chosen": -411.79901123046875, + "logps/rejected": -398.7806701660156, + "loss": 0.7543, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.376258611679077, + "rewards/margins": 2.3949642181396484, + "rewards/rejected": -4.7712225914001465, + "step": 561 + }, + { + "epoch": 0.12, + "learning_rate": 1.7722689075630253e-05, + "logits/chosen": -2.210214853286743, + "logits/rejected": -1.9913015365600586, + "logps/chosen": -373.44744873046875, + "logps/rejected": -340.349609375, + "loss": 0.5858, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6961324214935303, + "rewards/margins": 1.6905564069747925, + "rewards/rejected": -3.3866889476776123, + "step": 562 + }, + { + "epoch": 0.12, + "learning_rate": 1.7718487394957983e-05, + "logits/chosen": -2.246351480484009, + "logits/rejected": -1.8784517049789429, + "logps/chosen": -427.2381286621094, + "logps/rejected": -321.5286865234375, + "loss": 0.2151, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.4049296975135803, + "rewards/margins": 3.590799331665039, + "rewards/rejected": -3.9957292079925537, + "step": 563 + }, + { + "epoch": 0.12, + "learning_rate": 1.7714285714285717e-05, + "logits/chosen": -1.972480297088623, + "logits/rejected": -1.6647220849990845, + "logps/chosen": -322.11083984375, + "logps/rejected": -264.3584289550781, + "loss": 0.0943, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.063344120979309, + "rewards/margins": 4.47200870513916, + "rewards/rejected": -5.535353183746338, + "step": 564 + }, + { + "epoch": 0.12, + "learning_rate": 1.7710084033613447e-05, + "logits/chosen": -2.041421413421631, + "logits/rejected": -1.713474988937378, + "logps/chosen": -275.64727783203125, + "logps/rejected": -248.07948303222656, + "loss": 0.4807, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7853559255599976, + "rewards/margins": 2.016425132751465, + "rewards/rejected": -3.801781177520752, + "step": 565 + }, + { + "epoch": 0.12, + "learning_rate": 1.7705882352941177e-05, + "logits/chosen": -2.329613447189331, + "logits/rejected": -2.1407434940338135, + "logps/chosen": -287.83526611328125, + "logps/rejected": -281.6817626953125, + "loss": 0.7844, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.984637975692749, + "rewards/margins": 1.642470359802246, + "rewards/rejected": -3.627108335494995, + "step": 566 + }, + { + "epoch": 0.12, + "learning_rate": 1.7701680672268907e-05, + "logits/chosen": -1.9577643871307373, + "logits/rejected": -2.136990547180176, + "logps/chosen": -217.54519653320312, + "logps/rejected": -321.43487548828125, + "loss": 0.2944, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3599539995193481, + "rewards/margins": 2.861196994781494, + "rewards/rejected": -4.221151351928711, + "step": 567 + }, + { + "epoch": 0.12, + "learning_rate": 1.769747899159664e-05, + "logits/chosen": -2.3956363201141357, + "logits/rejected": -2.0991134643554688, + "logps/chosen": -331.6529541015625, + "logps/rejected": -287.12664794921875, + "loss": 0.2447, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0588245391845703, + "rewards/margins": 3.1444296836853027, + "rewards/rejected": -4.203254222869873, + "step": 568 + }, + { + "epoch": 0.12, + "learning_rate": 1.769327731092437e-05, + "logits/chosen": -1.9715251922607422, + "logits/rejected": -2.0345516204833984, + "logps/chosen": -362.01776123046875, + "logps/rejected": -347.6793518066406, + "loss": 0.2147, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9462583661079407, + "rewards/margins": 3.158165693283081, + "rewards/rejected": -4.104423522949219, + "step": 569 + }, + { + "epoch": 0.12, + "learning_rate": 1.76890756302521e-05, + "logits/chosen": -2.0222535133361816, + "logits/rejected": -1.746554970741272, + "logps/chosen": -347.338134765625, + "logps/rejected": -332.7222595214844, + "loss": 0.6918, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.6448588371276855, + "rewards/margins": 2.356677532196045, + "rewards/rejected": -5.0015363693237305, + "step": 570 + }, + { + "epoch": 0.12, + "learning_rate": 1.768487394957983e-05, + "logits/chosen": -2.1338720321655273, + "logits/rejected": -2.0698888301849365, + "logps/chosen": -310.0467224121094, + "logps/rejected": -341.3106689453125, + "loss": 0.3514, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5019992589950562, + "rewards/margins": 3.719092607498169, + "rewards/rejected": -5.221092224121094, + "step": 571 + }, + { + "epoch": 0.12, + "learning_rate": 1.7680672268907565e-05, + "logits/chosen": -2.1733458042144775, + "logits/rejected": -2.0725340843200684, + "logps/chosen": -337.7464599609375, + "logps/rejected": -284.135986328125, + "loss": 0.3166, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2706047296524048, + "rewards/margins": 3.42317795753479, + "rewards/rejected": -4.693782806396484, + "step": 572 + }, + { + "epoch": 0.12, + "learning_rate": 1.7676470588235295e-05, + "logits/chosen": -1.994964361190796, + "logits/rejected": -2.023188352584839, + "logps/chosen": -328.5353698730469, + "logps/rejected": -343.7881774902344, + "loss": 0.4296, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9336496591567993, + "rewards/margins": 2.9438352584838867, + "rewards/rejected": -3.8774852752685547, + "step": 573 + }, + { + "epoch": 0.12, + "learning_rate": 1.7672268907563026e-05, + "logits/chosen": -1.9034863710403442, + "logits/rejected": -1.650263786315918, + "logps/chosen": -322.71533203125, + "logps/rejected": -297.9567565917969, + "loss": 0.1074, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0696885585784912, + "rewards/margins": 4.231565952301025, + "rewards/rejected": -5.301254749298096, + "step": 574 + }, + { + "epoch": 0.12, + "learning_rate": 1.7668067226890756e-05, + "logits/chosen": -2.0441155433654785, + "logits/rejected": -1.703157663345337, + "logps/chosen": -284.8507385253906, + "logps/rejected": -342.7308044433594, + "loss": 0.6736, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4934263229370117, + "rewards/margins": 2.0427916049957275, + "rewards/rejected": -3.53621768951416, + "step": 575 + }, + { + "epoch": 0.12, + "learning_rate": 1.766386554621849e-05, + "logits/chosen": -2.3282277584075928, + "logits/rejected": -1.7080771923065186, + "logps/chosen": -300.5726318359375, + "logps/rejected": -210.52487182617188, + "loss": 0.2109, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.126968502998352, + "rewards/margins": 2.4636807441711426, + "rewards/rejected": -3.590648889541626, + "step": 576 + }, + { + "epoch": 0.12, + "learning_rate": 1.765966386554622e-05, + "logits/chosen": -2.173309326171875, + "logits/rejected": -1.8335094451904297, + "logps/chosen": -375.16552734375, + "logps/rejected": -304.8026428222656, + "loss": 0.3875, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5488709211349487, + "rewards/margins": 2.5619935989379883, + "rewards/rejected": -4.110864162445068, + "step": 577 + }, + { + "epoch": 0.12, + "learning_rate": 1.765546218487395e-05, + "logits/chosen": -1.9434499740600586, + "logits/rejected": -1.5711898803710938, + "logps/chosen": -274.57318115234375, + "logps/rejected": -380.7144775390625, + "loss": 0.4132, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1733087301254272, + "rewards/margins": 2.676362991333008, + "rewards/rejected": -3.8496716022491455, + "step": 578 + }, + { + "epoch": 0.12, + "learning_rate": 1.765126050420168e-05, + "logits/chosen": -1.9481709003448486, + "logits/rejected": -2.0177364349365234, + "logps/chosen": -151.82408142089844, + "logps/rejected": -198.28517150878906, + "loss": 0.23, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8105243444442749, + "rewards/margins": 2.2585389614105225, + "rewards/rejected": -3.069063186645508, + "step": 579 + }, + { + "epoch": 0.12, + "learning_rate": 1.7647058823529414e-05, + "logits/chosen": -2.138062000274658, + "logits/rejected": -1.9974300861358643, + "logps/chosen": -421.0310363769531, + "logps/rejected": -362.08807373046875, + "loss": 0.4583, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.41938063502311707, + "rewards/margins": 2.721446990966797, + "rewards/rejected": -3.1408276557922363, + "step": 580 + }, + { + "epoch": 0.12, + "learning_rate": 1.7642857142857144e-05, + "logits/chosen": -2.303051710128784, + "logits/rejected": -2.015049457550049, + "logps/chosen": -326.5808410644531, + "logps/rejected": -334.4703063964844, + "loss": 0.4056, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1618808507919312, + "rewards/margins": 2.3653814792633057, + "rewards/rejected": -3.5272624492645264, + "step": 581 + }, + { + "epoch": 0.12, + "learning_rate": 1.7638655462184874e-05, + "logits/chosen": -1.76221764087677, + "logits/rejected": -1.7318577766418457, + "logps/chosen": -363.66455078125, + "logps/rejected": -439.78448486328125, + "loss": 0.3356, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3488526940345764, + "rewards/margins": 2.6168365478515625, + "rewards/rejected": -2.965689182281494, + "step": 582 + }, + { + "epoch": 0.12, + "learning_rate": 1.7634453781512608e-05, + "logits/chosen": -1.7114746570587158, + "logits/rejected": -1.5570493936538696, + "logps/chosen": -279.304443359375, + "logps/rejected": -313.8551330566406, + "loss": 0.7882, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.0155909061431885, + "rewards/margins": 1.4448413848876953, + "rewards/rejected": -3.4604320526123047, + "step": 583 + }, + { + "epoch": 0.12, + "learning_rate": 1.7630252100840338e-05, + "logits/chosen": -2.102205276489258, + "logits/rejected": -2.0762243270874023, + "logps/chosen": -368.6748046875, + "logps/rejected": -386.53253173828125, + "loss": 0.331, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8447445631027222, + "rewards/margins": 2.411888599395752, + "rewards/rejected": -3.2566332817077637, + "step": 584 + }, + { + "epoch": 0.12, + "learning_rate": 1.7626050420168068e-05, + "logits/chosen": -2.3064589500427246, + "logits/rejected": -2.073137044906616, + "logps/chosen": -448.7082824707031, + "logps/rejected": -302.3025207519531, + "loss": 0.3575, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8835690021514893, + "rewards/margins": 1.9845120906829834, + "rewards/rejected": -2.8680808544158936, + "step": 585 + }, + { + "epoch": 0.12, + "learning_rate": 1.76218487394958e-05, + "logits/chosen": -2.0600881576538086, + "logits/rejected": -1.8609788417816162, + "logps/chosen": -229.59158325195312, + "logps/rejected": -287.3170166015625, + "loss": 0.2262, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.468012809753418, + "rewards/margins": 3.3571579456329346, + "rewards/rejected": -4.825170516967773, + "step": 586 + }, + { + "epoch": 0.12, + "learning_rate": 1.7617647058823532e-05, + "logits/chosen": -1.8887302875518799, + "logits/rejected": -1.94637131690979, + "logps/chosen": -347.83673095703125, + "logps/rejected": -359.481201171875, + "loss": 0.1922, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9356111288070679, + "rewards/margins": 3.4601073265075684, + "rewards/rejected": -4.395718574523926, + "step": 587 + }, + { + "epoch": 0.12, + "learning_rate": 1.7613445378151262e-05, + "logits/chosen": -2.2874603271484375, + "logits/rejected": -2.176391124725342, + "logps/chosen": -313.77178955078125, + "logps/rejected": -371.0591735839844, + "loss": 0.359, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5926164388656616, + "rewards/margins": 3.3008227348327637, + "rewards/rejected": -3.893439292907715, + "step": 588 + }, + { + "epoch": 0.12, + "learning_rate": 1.7609243697478992e-05, + "logits/chosen": -2.123997211456299, + "logits/rejected": -1.7127245664596558, + "logps/chosen": -366.6797790527344, + "logps/rejected": -322.2105712890625, + "loss": 0.3332, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5245232582092285, + "rewards/margins": 3.5848498344421387, + "rewards/rejected": -4.109373092651367, + "step": 589 + }, + { + "epoch": 0.12, + "learning_rate": 1.7605042016806723e-05, + "logits/chosen": -2.1327927112579346, + "logits/rejected": -1.6013575792312622, + "logps/chosen": -393.24249267578125, + "logps/rejected": -372.1217041015625, + "loss": 0.1196, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.17057377099990845, + "rewards/margins": 4.495490550994873, + "rewards/rejected": -4.324916839599609, + "step": 590 + }, + { + "epoch": 0.12, + "learning_rate": 1.7600840336134456e-05, + "logits/chosen": -2.1168289184570312, + "logits/rejected": -1.937970757484436, + "logps/chosen": -349.0628662109375, + "logps/rejected": -370.396484375, + "loss": 0.592, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3728522062301636, + "rewards/margins": 1.8734581470489502, + "rewards/rejected": -3.246310234069824, + "step": 591 + }, + { + "epoch": 0.12, + "learning_rate": 1.7596638655462186e-05, + "logits/chosen": -2.1187102794647217, + "logits/rejected": -2.194070339202881, + "logps/chosen": -336.94140625, + "logps/rejected": -362.3601989746094, + "loss": 0.3624, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.5056046843528748, + "rewards/margins": 2.729396343231201, + "rewards/rejected": -3.2350008487701416, + "step": 592 + }, + { + "epoch": 0.12, + "learning_rate": 1.7592436974789917e-05, + "logits/chosen": -1.8995732069015503, + "logits/rejected": -1.947818636894226, + "logps/chosen": -327.80487060546875, + "logps/rejected": -356.39263916015625, + "loss": 0.1776, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5217307806015015, + "rewards/margins": 3.156291961669922, + "rewards/rejected": -4.678022384643555, + "step": 593 + }, + { + "epoch": 0.12, + "learning_rate": 1.7588235294117647e-05, + "logits/chosen": -1.85849130153656, + "logits/rejected": -1.6905248165130615, + "logps/chosen": -251.05442810058594, + "logps/rejected": -282.79168701171875, + "loss": 0.2727, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7055599689483643, + "rewards/margins": 2.28725004196167, + "rewards/rejected": -2.992809772491455, + "step": 594 + }, + { + "epoch": 0.12, + "learning_rate": 1.758403361344538e-05, + "logits/chosen": -1.76235830783844, + "logits/rejected": -1.3964152336120605, + "logps/chosen": -374.6567077636719, + "logps/rejected": -350.7769470214844, + "loss": 0.1998, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1414450407028198, + "rewards/margins": 4.14404296875, + "rewards/rejected": -5.285488128662109, + "step": 595 + }, + { + "epoch": 0.12, + "learning_rate": 1.757983193277311e-05, + "logits/chosen": -2.1745529174804688, + "logits/rejected": -2.1625161170959473, + "logps/chosen": -351.01861572265625, + "logps/rejected": -325.62030029296875, + "loss": 0.5352, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2620129585266113, + "rewards/margins": 1.9831123352050781, + "rewards/rejected": -3.2451255321502686, + "step": 596 + }, + { + "epoch": 0.12, + "learning_rate": 1.757563025210084e-05, + "logits/chosen": -2.062751531600952, + "logits/rejected": -1.5992534160614014, + "logps/chosen": -334.5269470214844, + "logps/rejected": -334.413330078125, + "loss": 0.2543, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2660620212554932, + "rewards/margins": 2.692103385925293, + "rewards/rejected": -3.958165168762207, + "step": 597 + }, + { + "epoch": 0.13, + "learning_rate": 1.757142857142857e-05, + "logits/chosen": -2.329374313354492, + "logits/rejected": -1.8794341087341309, + "logps/chosen": -413.318603515625, + "logps/rejected": -356.1742248535156, + "loss": 0.3024, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2969900369644165, + "rewards/margins": 3.5236058235168457, + "rewards/rejected": -4.820595741271973, + "step": 598 + }, + { + "epoch": 0.13, + "learning_rate": 1.7567226890756305e-05, + "logits/chosen": -2.0708303451538086, + "logits/rejected": -1.8224103450775146, + "logps/chosen": -296.98626708984375, + "logps/rejected": -264.0754699707031, + "loss": 0.4375, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3862814903259277, + "rewards/margins": 3.3693251609802246, + "rewards/rejected": -4.755606651306152, + "step": 599 + }, + { + "epoch": 0.13, + "learning_rate": 1.7563025210084035e-05, + "logits/chosen": -1.8793928623199463, + "logits/rejected": -1.6184089183807373, + "logps/chosen": -330.78729248046875, + "logps/rejected": -307.60003662109375, + "loss": 0.3772, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1317617893218994, + "rewards/margins": 2.379058361053467, + "rewards/rejected": -3.5108203887939453, + "step": 600 + }, + { + "epoch": 0.13, + "learning_rate": 1.7558823529411765e-05, + "logits/chosen": -2.0106217861175537, + "logits/rejected": -1.9231221675872803, + "logps/chosen": -381.6519775390625, + "logps/rejected": -343.802490234375, + "loss": 0.1988, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8961509466171265, + "rewards/margins": 4.124539852142334, + "rewards/rejected": -5.02069091796875, + "step": 601 + }, + { + "epoch": 0.13, + "learning_rate": 1.7554621848739495e-05, + "logits/chosen": -2.065702199935913, + "logits/rejected": -1.61421799659729, + "logps/chosen": -336.43902587890625, + "logps/rejected": -274.4747314453125, + "loss": 0.3495, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4075486660003662, + "rewards/margins": 3.287841796875, + "rewards/rejected": -4.695390701293945, + "step": 602 + }, + { + "epoch": 0.13, + "learning_rate": 1.755042016806723e-05, + "logits/chosen": -2.1952593326568604, + "logits/rejected": -2.114231824874878, + "logps/chosen": -265.59100341796875, + "logps/rejected": -277.71368408203125, + "loss": 0.5747, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.651538372039795, + "rewards/margins": 1.7672960758209229, + "rewards/rejected": -3.4188344478607178, + "step": 603 + }, + { + "epoch": 0.13, + "learning_rate": 1.754621848739496e-05, + "logits/chosen": -1.7957954406738281, + "logits/rejected": -1.6911286115646362, + "logps/chosen": -237.08416748046875, + "logps/rejected": -292.2733154296875, + "loss": 0.1808, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.252558946609497, + "rewards/margins": 4.193234443664551, + "rewards/rejected": -5.445793151855469, + "step": 604 + }, + { + "epoch": 0.13, + "learning_rate": 1.754201680672269e-05, + "logits/chosen": -2.3046746253967285, + "logits/rejected": -1.6288542747497559, + "logps/chosen": -378.6341857910156, + "logps/rejected": -284.7864990234375, + "loss": 0.3798, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3336551189422607, + "rewards/margins": 3.163825273513794, + "rewards/rejected": -4.497480392456055, + "step": 605 + }, + { + "epoch": 0.13, + "learning_rate": 1.7537815126050423e-05, + "logits/chosen": -2.25481915473938, + "logits/rejected": -1.8874237537384033, + "logps/chosen": -424.5133056640625, + "logps/rejected": -318.4740295410156, + "loss": 0.1827, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.140905737876892, + "rewards/margins": 3.2062745094299316, + "rewards/rejected": -4.347179889678955, + "step": 606 + }, + { + "epoch": 0.13, + "learning_rate": 1.7533613445378153e-05, + "logits/chosen": -2.022418260574341, + "logits/rejected": -2.0001907348632812, + "logps/chosen": -395.0960693359375, + "logps/rejected": -314.1839294433594, + "loss": 0.8382, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.50838041305542, + "rewards/margins": 1.3427683115005493, + "rewards/rejected": -2.8511486053466797, + "step": 607 + }, + { + "epoch": 0.13, + "learning_rate": 1.7529411764705884e-05, + "logits/chosen": -2.142299175262451, + "logits/rejected": -1.4220865964889526, + "logps/chosen": -392.3373107910156, + "logps/rejected": -311.0252685546875, + "loss": 0.2207, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.12111552059650421, + "rewards/margins": 3.1376452445983887, + "rewards/rejected": -3.258760929107666, + "step": 608 + }, + { + "epoch": 0.13, + "learning_rate": 1.7525210084033614e-05, + "logits/chosen": -2.2132859230041504, + "logits/rejected": -1.585438847541809, + "logps/chosen": -351.05328369140625, + "logps/rejected": -285.6705322265625, + "loss": 0.1308, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3578495979309082, + "rewards/margins": 3.9617502689361572, + "rewards/rejected": -4.3196001052856445, + "step": 609 + }, + { + "epoch": 0.13, + "learning_rate": 1.7521008403361347e-05, + "logits/chosen": -2.41865873336792, + "logits/rejected": -2.4663453102111816, + "logps/chosen": -322.7279968261719, + "logps/rejected": -354.6105041503906, + "loss": 0.5306, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3983232975006104, + "rewards/margins": 2.2518470287323, + "rewards/rejected": -3.65017032623291, + "step": 610 + }, + { + "epoch": 0.13, + "learning_rate": 1.7516806722689078e-05, + "logits/chosen": -2.120915412902832, + "logits/rejected": -2.240330696105957, + "logps/chosen": -324.6904602050781, + "logps/rejected": -527.383056640625, + "loss": 0.3158, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.266683429479599, + "rewards/margins": 3.2078123092651367, + "rewards/rejected": -3.4744958877563477, + "step": 611 + }, + { + "epoch": 0.13, + "learning_rate": 1.7512605042016808e-05, + "logits/chosen": -1.903422474861145, + "logits/rejected": -2.035590648651123, + "logps/chosen": -246.1698760986328, + "logps/rejected": -290.1372375488281, + "loss": 0.1029, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0302534103393555, + "rewards/margins": 4.132861137390137, + "rewards/rejected": -5.163114547729492, + "step": 612 + }, + { + "epoch": 0.13, + "learning_rate": 1.7508403361344538e-05, + "logits/chosen": -2.25305438041687, + "logits/rejected": -1.8691236972808838, + "logps/chosen": -447.41748046875, + "logps/rejected": -392.9903564453125, + "loss": 0.2915, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.10366980731487274, + "rewards/margins": 3.635295867919922, + "rewards/rejected": -3.7389655113220215, + "step": 613 + }, + { + "epoch": 0.13, + "learning_rate": 1.750420168067227e-05, + "logits/chosen": -1.9130713939666748, + "logits/rejected": -1.7619062662124634, + "logps/chosen": -299.79046630859375, + "logps/rejected": -279.08502197265625, + "loss": 0.3796, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2327542304992676, + "rewards/margins": 2.162534713745117, + "rewards/rejected": -3.3952889442443848, + "step": 614 + }, + { + "epoch": 0.13, + "learning_rate": 1.7500000000000002e-05, + "logits/chosen": -2.184145927429199, + "logits/rejected": -1.763026237487793, + "logps/chosen": -356.28399658203125, + "logps/rejected": -326.8001708984375, + "loss": 0.3708, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6756588816642761, + "rewards/margins": 3.3722288608551025, + "rewards/rejected": -4.047887802124023, + "step": 615 + }, + { + "epoch": 0.13, + "learning_rate": 1.7495798319327732e-05, + "logits/chosen": -2.224501609802246, + "logits/rejected": -2.1770598888397217, + "logps/chosen": -288.3049011230469, + "logps/rejected": -297.07305908203125, + "loss": 0.1933, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1636801958084106, + "rewards/margins": 3.263641357421875, + "rewards/rejected": -4.427321910858154, + "step": 616 + }, + { + "epoch": 0.13, + "learning_rate": 1.7491596638655462e-05, + "logits/chosen": -2.1870622634887695, + "logits/rejected": -1.968467116355896, + "logps/chosen": -302.8945617675781, + "logps/rejected": -328.1313781738281, + "loss": 0.3485, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3654404878616333, + "rewards/margins": 2.6853227615356445, + "rewards/rejected": -4.050763130187988, + "step": 617 + }, + { + "epoch": 0.13, + "learning_rate": 1.7487394957983196e-05, + "logits/chosen": -2.115370750427246, + "logits/rejected": -2.1195783615112305, + "logps/chosen": -218.62728881835938, + "logps/rejected": -257.3113098144531, + "loss": 0.4401, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.213369369506836, + "rewards/margins": 1.8138105869293213, + "rewards/rejected": -3.0271799564361572, + "step": 618 + }, + { + "epoch": 0.13, + "learning_rate": 1.7483193277310926e-05, + "logits/chosen": -1.9826796054840088, + "logits/rejected": -1.8652582168579102, + "logps/chosen": -309.51348876953125, + "logps/rejected": -323.62872314453125, + "loss": 0.5281, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5351694822311401, + "rewards/margins": 2.707502841949463, + "rewards/rejected": -3.2426724433898926, + "step": 619 + }, + { + "epoch": 0.13, + "learning_rate": 1.7478991596638656e-05, + "logits/chosen": -1.883870244026184, + "logits/rejected": -2.0177013874053955, + "logps/chosen": -403.1067810058594, + "logps/rejected": -446.9234924316406, + "loss": 0.3197, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.6505222320556641, + "rewards/margins": 2.757884979248047, + "rewards/rejected": -3.408407211303711, + "step": 620 + }, + { + "epoch": 0.13, + "learning_rate": 1.7474789915966387e-05, + "logits/chosen": -1.7034316062927246, + "logits/rejected": -1.8480875492095947, + "logps/chosen": -245.02505493164062, + "logps/rejected": -246.24557495117188, + "loss": 0.5094, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2714037895202637, + "rewards/margins": 2.8076908588409424, + "rewards/rejected": -4.079094886779785, + "step": 621 + }, + { + "epoch": 0.13, + "learning_rate": 1.747058823529412e-05, + "logits/chosen": -2.2557621002197266, + "logits/rejected": -1.904726266860962, + "logps/chosen": -379.7161865234375, + "logps/rejected": -346.000732421875, + "loss": 0.0642, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.050634741783142, + "rewards/margins": 4.45014762878418, + "rewards/rejected": -5.500782012939453, + "step": 622 + }, + { + "epoch": 0.13, + "learning_rate": 1.746638655462185e-05, + "logits/chosen": -1.9042936563491821, + "logits/rejected": -1.8329639434814453, + "logps/chosen": -229.14935302734375, + "logps/rejected": -316.176025390625, + "loss": 0.4426, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9361891746520996, + "rewards/margins": 2.475944757461548, + "rewards/rejected": -3.4121341705322266, + "step": 623 + }, + { + "epoch": 0.13, + "learning_rate": 1.746218487394958e-05, + "logits/chosen": -1.8404033184051514, + "logits/rejected": -2.1285698413848877, + "logps/chosen": -326.9837341308594, + "logps/rejected": -401.146484375, + "loss": 0.2532, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0858327150344849, + "rewards/margins": 3.590076446533203, + "rewards/rejected": -4.675909042358398, + "step": 624 + }, + { + "epoch": 0.13, + "learning_rate": 1.745798319327731e-05, + "logits/chosen": -2.1854569911956787, + "logits/rejected": -2.437499523162842, + "logps/chosen": -320.7906494140625, + "logps/rejected": -383.99737548828125, + "loss": 0.2313, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4466704726219177, + "rewards/margins": 3.720686435699463, + "rewards/rejected": -4.167356491088867, + "step": 625 + }, + { + "epoch": 0.13, + "learning_rate": 1.7453781512605044e-05, + "logits/chosen": -2.2070181369781494, + "logits/rejected": -1.920109748840332, + "logps/chosen": -276.9339904785156, + "logps/rejected": -240.9857177734375, + "loss": 0.2328, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.3672330379486084, + "rewards/margins": 4.07345724105835, + "rewards/rejected": -6.440690517425537, + "step": 626 + }, + { + "epoch": 0.13, + "learning_rate": 1.7449579831932775e-05, + "logits/chosen": -2.192082166671753, + "logits/rejected": -2.0379700660705566, + "logps/chosen": -244.08450317382812, + "logps/rejected": -302.1907043457031, + "loss": 0.3003, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.371840238571167, + "rewards/margins": 2.1844491958618164, + "rewards/rejected": -3.5562894344329834, + "step": 627 + }, + { + "epoch": 0.13, + "learning_rate": 1.7445378151260505e-05, + "logits/chosen": -2.074871778488159, + "logits/rejected": -2.1112983226776123, + "logps/chosen": -191.67086791992188, + "logps/rejected": -337.5142822265625, + "loss": 0.2809, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1002492904663086, + "rewards/margins": 3.577155113220215, + "rewards/rejected": -4.677404403686523, + "step": 628 + }, + { + "epoch": 0.13, + "learning_rate": 1.744117647058824e-05, + "logits/chosen": -1.948610544204712, + "logits/rejected": -2.1481761932373047, + "logps/chosen": -320.94384765625, + "logps/rejected": -350.3824768066406, + "loss": 0.4805, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7741295695304871, + "rewards/margins": 1.5979022979736328, + "rewards/rejected": -2.3720319271087646, + "step": 629 + }, + { + "epoch": 0.13, + "learning_rate": 1.743697478991597e-05, + "logits/chosen": -2.1214332580566406, + "logits/rejected": -2.0843162536621094, + "logps/chosen": -267.44232177734375, + "logps/rejected": -327.6312255859375, + "loss": 0.2766, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9848929047584534, + "rewards/margins": 3.2845354080200195, + "rewards/rejected": -4.269428253173828, + "step": 630 + }, + { + "epoch": 0.13, + "learning_rate": 1.74327731092437e-05, + "logits/chosen": -2.274620771408081, + "logits/rejected": -2.1911637783050537, + "logps/chosen": -288.55218505859375, + "logps/rejected": -324.39996337890625, + "loss": 0.7385, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.6419466733932495, + "rewards/margins": 1.8183467388153076, + "rewards/rejected": -3.4602932929992676, + "step": 631 + }, + { + "epoch": 0.13, + "learning_rate": 1.742857142857143e-05, + "logits/chosen": -1.9665277004241943, + "logits/rejected": -1.8588104248046875, + "logps/chosen": -386.43743896484375, + "logps/rejected": -345.87762451171875, + "loss": 0.3855, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8597573041915894, + "rewards/margins": 2.6692023277282715, + "rewards/rejected": -3.5289597511291504, + "step": 632 + }, + { + "epoch": 0.13, + "learning_rate": 1.7424369747899163e-05, + "logits/chosen": -2.173203229904175, + "logits/rejected": -1.7052295207977295, + "logps/chosen": -243.45892333984375, + "logps/rejected": -221.82313537597656, + "loss": 0.1552, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9518831968307495, + "rewards/margins": 4.023397922515869, + "rewards/rejected": -4.975281238555908, + "step": 633 + }, + { + "epoch": 0.13, + "learning_rate": 1.7420168067226893e-05, + "logits/chosen": -2.4001569747924805, + "logits/rejected": -2.0408408641815186, + "logps/chosen": -389.422119140625, + "logps/rejected": -320.9310302734375, + "loss": 0.4071, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.07696533203125, + "rewards/margins": 3.422020435333252, + "rewards/rejected": -4.498985767364502, + "step": 634 + }, + { + "epoch": 0.13, + "learning_rate": 1.7415966386554623e-05, + "logits/chosen": -1.846866250038147, + "logits/rejected": -2.1061363220214844, + "logps/chosen": -261.47821044921875, + "logps/rejected": -300.9818420410156, + "loss": 0.5803, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8103018999099731, + "rewards/margins": 2.213616132736206, + "rewards/rejected": -4.023918151855469, + "step": 635 + }, + { + "epoch": 0.13, + "learning_rate": 1.7411764705882353e-05, + "logits/chosen": -2.018479824066162, + "logits/rejected": -1.6680288314819336, + "logps/chosen": -267.2178039550781, + "logps/rejected": -275.43145751953125, + "loss": 0.2268, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9537438154220581, + "rewards/margins": 3.1539361476898193, + "rewards/rejected": -4.107679843902588, + "step": 636 + }, + { + "epoch": 0.13, + "learning_rate": 1.7407563025210087e-05, + "logits/chosen": -1.9982413053512573, + "logits/rejected": -2.01298189163208, + "logps/chosen": -197.8479766845703, + "logps/rejected": -280.20562744140625, + "loss": 0.2362, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2701283693313599, + "rewards/margins": 3.0283617973327637, + "rewards/rejected": -4.298490047454834, + "step": 637 + }, + { + "epoch": 0.13, + "learning_rate": 1.7403361344537817e-05, + "logits/chosen": -2.0463712215423584, + "logits/rejected": -1.6519734859466553, + "logps/chosen": -375.9577331542969, + "logps/rejected": -357.0042724609375, + "loss": 0.4341, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6750936508178711, + "rewards/margins": 3.336115837097168, + "rewards/rejected": -4.011209487915039, + "step": 638 + }, + { + "epoch": 0.13, + "learning_rate": 1.7399159663865548e-05, + "logits/chosen": -2.1566858291625977, + "logits/rejected": -1.8198866844177246, + "logps/chosen": -523.0704956054688, + "logps/rejected": -479.570556640625, + "loss": 0.1695, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3096842765808105, + "rewards/margins": 4.248785972595215, + "rewards/rejected": -5.558470249176025, + "step": 639 + }, + { + "epoch": 0.13, + "learning_rate": 1.7394957983193278e-05, + "logits/chosen": -2.3514232635498047, + "logits/rejected": -1.799801230430603, + "logps/chosen": -402.88671875, + "logps/rejected": -378.341064453125, + "loss": 0.1681, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9623078107833862, + "rewards/margins": 3.933157444000244, + "rewards/rejected": -4.89546537399292, + "step": 640 + }, + { + "epoch": 0.13, + "learning_rate": 1.739075630252101e-05, + "logits/chosen": -2.018665075302124, + "logits/rejected": -1.97682523727417, + "logps/chosen": -335.3333435058594, + "logps/rejected": -375.4092102050781, + "loss": 0.2325, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0102876424789429, + "rewards/margins": 2.6628689765930176, + "rewards/rejected": -3.673156499862671, + "step": 641 + }, + { + "epoch": 0.13, + "learning_rate": 1.738655462184874e-05, + "logits/chosen": -2.100670576095581, + "logits/rejected": -1.8882553577423096, + "logps/chosen": -307.13751220703125, + "logps/rejected": -260.1299743652344, + "loss": 0.0737, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2428934574127197, + "rewards/margins": 3.9339916706085205, + "rewards/rejected": -5.17688512802124, + "step": 642 + }, + { + "epoch": 0.13, + "learning_rate": 1.7382352941176472e-05, + "logits/chosen": -2.0367884635925293, + "logits/rejected": -1.9806405305862427, + "logps/chosen": -326.42205810546875, + "logps/rejected": -299.490478515625, + "loss": 0.3915, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1731911897659302, + "rewards/margins": 3.3622539043426514, + "rewards/rejected": -4.535445213317871, + "step": 643 + }, + { + "epoch": 0.13, + "learning_rate": 1.7378151260504202e-05, + "logits/chosen": -2.2748634815216064, + "logits/rejected": -1.988891363143921, + "logps/chosen": -289.9861145019531, + "logps/rejected": -246.89251708984375, + "loss": 0.4475, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3171441555023193, + "rewards/margins": 2.587984085083008, + "rewards/rejected": -3.905128240585327, + "step": 644 + }, + { + "epoch": 0.13, + "learning_rate": 1.7373949579831936e-05, + "logits/chosen": -1.8365641832351685, + "logits/rejected": -1.6036657094955444, + "logps/chosen": -248.7274932861328, + "logps/rejected": -237.21253967285156, + "loss": 0.4101, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5616984367370605, + "rewards/margins": 2.34481143951416, + "rewards/rejected": -3.9065098762512207, + "step": 645 + }, + { + "epoch": 0.14, + "learning_rate": 1.7369747899159666e-05, + "logits/chosen": -2.258124828338623, + "logits/rejected": -1.7239854335784912, + "logps/chosen": -402.8916015625, + "logps/rejected": -326.09576416015625, + "loss": 0.1693, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5601903200149536, + "rewards/margins": 3.3523125648498535, + "rewards/rejected": -3.9125027656555176, + "step": 646 + }, + { + "epoch": 0.14, + "learning_rate": 1.7365546218487396e-05, + "logits/chosen": -2.1163954734802246, + "logits/rejected": -1.822643756866455, + "logps/chosen": -258.8406677246094, + "logps/rejected": -279.0697326660156, + "loss": 0.1607, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.72913658618927, + "rewards/margins": 4.059009075164795, + "rewards/rejected": -5.788146018981934, + "step": 647 + }, + { + "epoch": 0.14, + "learning_rate": 1.7361344537815126e-05, + "logits/chosen": -1.8672274351119995, + "logits/rejected": -1.8529818058013916, + "logps/chosen": -333.4593505859375, + "logps/rejected": -325.0284423828125, + "loss": 0.2338, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6187670230865479, + "rewards/margins": 2.986903190612793, + "rewards/rejected": -4.605669975280762, + "step": 648 + }, + { + "epoch": 0.14, + "learning_rate": 1.735714285714286e-05, + "logits/chosen": -2.069216251373291, + "logits/rejected": -1.6111959218978882, + "logps/chosen": -424.6082458496094, + "logps/rejected": -294.0614318847656, + "loss": 0.5072, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6537176370620728, + "rewards/margins": 2.450573205947876, + "rewards/rejected": -4.104290962219238, + "step": 649 + }, + { + "epoch": 0.14, + "learning_rate": 1.735294117647059e-05, + "logits/chosen": -2.2140450477600098, + "logits/rejected": -2.270001173019409, + "logps/chosen": -415.84710693359375, + "logps/rejected": -348.36083984375, + "loss": 0.5278, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.184394121170044, + "rewards/margins": 3.0564780235290527, + "rewards/rejected": -4.240871906280518, + "step": 650 + }, + { + "epoch": 0.14, + "learning_rate": 1.734873949579832e-05, + "logits/chosen": -2.2667994499206543, + "logits/rejected": -1.8827555179595947, + "logps/chosen": -440.50213623046875, + "logps/rejected": -470.83642578125, + "loss": 0.1936, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8355374932289124, + "rewards/margins": 3.2577900886535645, + "rewards/rejected": -4.093327522277832, + "step": 651 + }, + { + "epoch": 0.14, + "learning_rate": 1.7344537815126054e-05, + "logits/chosen": -2.031855344772339, + "logits/rejected": -1.4163709878921509, + "logps/chosen": -429.764892578125, + "logps/rejected": -299.6787414550781, + "loss": 0.1584, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1578457355499268, + "rewards/margins": 3.0670769214630127, + "rewards/rejected": -4.224922180175781, + "step": 652 + }, + { + "epoch": 0.14, + "learning_rate": 1.7340336134453784e-05, + "logits/chosen": -2.3417391777038574, + "logits/rejected": -2.0120458602905273, + "logps/chosen": -378.86724853515625, + "logps/rejected": -320.24847412109375, + "loss": 0.4301, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8696843981742859, + "rewards/margins": 3.079339027404785, + "rewards/rejected": -3.9490230083465576, + "step": 653 + }, + { + "epoch": 0.14, + "learning_rate": 1.7336134453781514e-05, + "logits/chosen": -1.899755835533142, + "logits/rejected": -2.2152485847473145, + "logps/chosen": -289.23052978515625, + "logps/rejected": -277.4675598144531, + "loss": 0.2919, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9454280138015747, + "rewards/margins": 2.594362258911133, + "rewards/rejected": -3.539790391921997, + "step": 654 + }, + { + "epoch": 0.14, + "learning_rate": 1.7331932773109245e-05, + "logits/chosen": -2.056635618209839, + "logits/rejected": -2.1157376766204834, + "logps/chosen": -267.3724670410156, + "logps/rejected": -340.13580322265625, + "loss": 0.4387, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.224498987197876, + "rewards/margins": 2.2869489192962646, + "rewards/rejected": -4.511447906494141, + "step": 655 + }, + { + "epoch": 0.14, + "learning_rate": 1.7327731092436978e-05, + "logits/chosen": -1.9028781652450562, + "logits/rejected": -2.004976511001587, + "logps/chosen": -274.9028015136719, + "logps/rejected": -434.4044189453125, + "loss": 0.2837, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2780661582946777, + "rewards/margins": 3.7909231185913086, + "rewards/rejected": -5.068988800048828, + "step": 656 + }, + { + "epoch": 0.14, + "learning_rate": 1.732352941176471e-05, + "logits/chosen": -1.7320234775543213, + "logits/rejected": -2.093090534210205, + "logps/chosen": -261.589599609375, + "logps/rejected": -349.3472900390625, + "loss": 0.2373, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0084866285324097, + "rewards/margins": 4.010622024536133, + "rewards/rejected": -5.019108772277832, + "step": 657 + }, + { + "epoch": 0.14, + "learning_rate": 1.731932773109244e-05, + "logits/chosen": -1.9890551567077637, + "logits/rejected": -1.9334732294082642, + "logps/chosen": -264.3675231933594, + "logps/rejected": -375.4937438964844, + "loss": 0.4021, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6627682447433472, + "rewards/margins": 3.1620559692382812, + "rewards/rejected": -4.824824333190918, + "step": 658 + }, + { + "epoch": 0.14, + "learning_rate": 1.731512605042017e-05, + "logits/chosen": -2.1906607151031494, + "logits/rejected": -2.0650694370269775, + "logps/chosen": -410.758544921875, + "logps/rejected": -411.5650939941406, + "loss": 0.5833, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2554771900177002, + "rewards/margins": 2.2665514945983887, + "rewards/rejected": -3.5220284461975098, + "step": 659 + }, + { + "epoch": 0.14, + "learning_rate": 1.7310924369747902e-05, + "logits/chosen": -1.9395146369934082, + "logits/rejected": -1.8063230514526367, + "logps/chosen": -442.1985778808594, + "logps/rejected": -372.2989196777344, + "loss": 0.2103, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5384433269500732, + "rewards/margins": 2.69295072555542, + "rewards/rejected": -4.231394290924072, + "step": 660 + }, + { + "epoch": 0.14, + "learning_rate": 1.7306722689075633e-05, + "logits/chosen": -1.9820976257324219, + "logits/rejected": -2.0732693672180176, + "logps/chosen": -185.29586791992188, + "logps/rejected": -235.94895935058594, + "loss": 0.6932, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7201175689697266, + "rewards/margins": 3.0689449310302734, + "rewards/rejected": -4.7890625, + "step": 661 + }, + { + "epoch": 0.14, + "learning_rate": 1.7302521008403363e-05, + "logits/chosen": -1.9752837419509888, + "logits/rejected": -1.9079817533493042, + "logps/chosen": -369.05499267578125, + "logps/rejected": -361.75048828125, + "loss": 0.1479, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1471731662750244, + "rewards/margins": 4.216258525848389, + "rewards/rejected": -5.363431930541992, + "step": 662 + }, + { + "epoch": 0.14, + "learning_rate": 1.7298319327731093e-05, + "logits/chosen": -1.8032164573669434, + "logits/rejected": -1.7057702541351318, + "logps/chosen": -329.5226745605469, + "logps/rejected": -334.4352111816406, + "loss": 0.3653, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8003126382827759, + "rewards/margins": 3.5742008686065674, + "rewards/rejected": -5.374513149261475, + "step": 663 + }, + { + "epoch": 0.14, + "learning_rate": 1.7294117647058827e-05, + "logits/chosen": -2.2539303302764893, + "logits/rejected": -1.832952618598938, + "logps/chosen": -356.5443115234375, + "logps/rejected": -388.2044372558594, + "loss": 0.1476, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.22541813552379608, + "rewards/margins": 4.171304225921631, + "rewards/rejected": -4.396722316741943, + "step": 664 + }, + { + "epoch": 0.14, + "learning_rate": 1.7289915966386557e-05, + "logits/chosen": -2.0836853981018066, + "logits/rejected": -1.8866572380065918, + "logps/chosen": -367.36224365234375, + "logps/rejected": -300.6065673828125, + "loss": 0.4465, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2593399286270142, + "rewards/margins": 2.918307304382324, + "rewards/rejected": -4.177647113800049, + "step": 665 + }, + { + "epoch": 0.14, + "learning_rate": 1.7285714285714287e-05, + "logits/chosen": -2.4005470275878906, + "logits/rejected": -1.7531957626342773, + "logps/chosen": -428.22235107421875, + "logps/rejected": -350.5605773925781, + "loss": 0.1366, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9964143633842468, + "rewards/margins": 3.841459274291992, + "rewards/rejected": -4.837873935699463, + "step": 666 + }, + { + "epoch": 0.14, + "learning_rate": 1.7281512605042017e-05, + "logits/chosen": -2.4517130851745605, + "logits/rejected": -2.0735883712768555, + "logps/chosen": -390.8885803222656, + "logps/rejected": -300.8526916503906, + "loss": 0.2846, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7490054368972778, + "rewards/margins": 2.4245247840881348, + "rewards/rejected": -4.173530101776123, + "step": 667 + }, + { + "epoch": 0.14, + "learning_rate": 1.727731092436975e-05, + "logits/chosen": -2.218165636062622, + "logits/rejected": -1.9664578437805176, + "logps/chosen": -390.86871337890625, + "logps/rejected": -365.5804443359375, + "loss": 0.3374, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.48010826110839844, + "rewards/margins": 3.423478603363037, + "rewards/rejected": -3.9035871028900146, + "step": 668 + }, + { + "epoch": 0.14, + "learning_rate": 1.727310924369748e-05, + "logits/chosen": -2.1214077472686768, + "logits/rejected": -1.6956534385681152, + "logps/chosen": -267.4032897949219, + "logps/rejected": -285.0607604980469, + "loss": 0.2044, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6615678668022156, + "rewards/margins": 3.4231882095336914, + "rewards/rejected": -4.084755897521973, + "step": 669 + }, + { + "epoch": 0.14, + "learning_rate": 1.726890756302521e-05, + "logits/chosen": -1.928623914718628, + "logits/rejected": -1.538848638534546, + "logps/chosen": -241.68421936035156, + "logps/rejected": -205.2398223876953, + "loss": 0.744, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.82919442653656, + "rewards/margins": 1.6892874240875244, + "rewards/rejected": -3.518481731414795, + "step": 670 + }, + { + "epoch": 0.14, + "learning_rate": 1.7264705882352945e-05, + "logits/chosen": -2.073869466781616, + "logits/rejected": -1.9732764959335327, + "logps/chosen": -356.5965576171875, + "logps/rejected": -307.3775634765625, + "loss": 0.2087, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8155758380889893, + "rewards/margins": 2.8724477291107178, + "rewards/rejected": -3.688023567199707, + "step": 671 + }, + { + "epoch": 0.14, + "learning_rate": 1.7260504201680675e-05, + "logits/chosen": -1.957749605178833, + "logits/rejected": -1.5791001319885254, + "logps/chosen": -336.7607116699219, + "logps/rejected": -325.90692138671875, + "loss": 0.142, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8536182641983032, + "rewards/margins": 3.7742176055908203, + "rewards/rejected": -4.627835273742676, + "step": 672 + }, + { + "epoch": 0.14, + "learning_rate": 1.7256302521008406e-05, + "logits/chosen": -1.7034693956375122, + "logits/rejected": -1.8518991470336914, + "logps/chosen": -230.3818359375, + "logps/rejected": -261.7847900390625, + "loss": 0.508, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9512046575546265, + "rewards/margins": 1.807443380355835, + "rewards/rejected": -3.758648157119751, + "step": 673 + }, + { + "epoch": 0.14, + "learning_rate": 1.7252100840336136e-05, + "logits/chosen": -2.013270378112793, + "logits/rejected": -1.9225053787231445, + "logps/chosen": -399.438720703125, + "logps/rejected": -281.4459228515625, + "loss": 0.4501, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.171720027923584, + "rewards/margins": 1.7998218536376953, + "rewards/rejected": -2.9715418815612793, + "step": 674 + }, + { + "epoch": 0.14, + "learning_rate": 1.724789915966387e-05, + "logits/chosen": -2.1820831298828125, + "logits/rejected": -2.063739776611328, + "logps/chosen": -281.82366943359375, + "logps/rejected": -291.52923583984375, + "loss": 0.6628, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3655743598937988, + "rewards/margins": 3.2180187702178955, + "rewards/rejected": -4.583592891693115, + "step": 675 + }, + { + "epoch": 0.14, + "learning_rate": 1.72436974789916e-05, + "logits/chosen": -2.045133113861084, + "logits/rejected": -1.7008068561553955, + "logps/chosen": -278.39117431640625, + "logps/rejected": -265.5382080078125, + "loss": 0.3397, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.613903284072876, + "rewards/margins": 2.2477810382843018, + "rewards/rejected": -2.861684560775757, + "step": 676 + }, + { + "epoch": 0.14, + "learning_rate": 1.723949579831933e-05, + "logits/chosen": -1.9759258031845093, + "logits/rejected": -1.7019156217575073, + "logps/chosen": -283.2732238769531, + "logps/rejected": -333.37255859375, + "loss": 0.3952, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.075766921043396, + "rewards/margins": 2.7298686504364014, + "rewards/rejected": -3.805635690689087, + "step": 677 + }, + { + "epoch": 0.14, + "learning_rate": 1.723529411764706e-05, + "logits/chosen": -2.427515983581543, + "logits/rejected": -2.209681749343872, + "logps/chosen": -410.6153564453125, + "logps/rejected": -492.728759765625, + "loss": 0.1751, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.1429438591003418, + "rewards/margins": 4.598230361938477, + "rewards/rejected": -4.455286026000977, + "step": 678 + }, + { + "epoch": 0.14, + "learning_rate": 1.7231092436974794e-05, + "logits/chosen": -2.0737991333007812, + "logits/rejected": -2.0608856678009033, + "logps/chosen": -266.1748046875, + "logps/rejected": -291.5542907714844, + "loss": 0.2251, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7223702669143677, + "rewards/margins": 3.3233089447021484, + "rewards/rejected": -4.045679092407227, + "step": 679 + }, + { + "epoch": 0.14, + "learning_rate": 1.7226890756302524e-05, + "logits/chosen": -2.1106553077697754, + "logits/rejected": -1.6831328868865967, + "logps/chosen": -427.328857421875, + "logps/rejected": -325.7439270019531, + "loss": 0.5774, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.104979395866394, + "rewards/margins": 3.377889633178711, + "rewards/rejected": -4.482868671417236, + "step": 680 + }, + { + "epoch": 0.14, + "learning_rate": 1.7222689075630254e-05, + "logits/chosen": -1.9135695695877075, + "logits/rejected": -2.0843729972839355, + "logps/chosen": -343.29833984375, + "logps/rejected": -408.4560546875, + "loss": 0.335, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5545223951339722, + "rewards/margins": 2.8233156204223633, + "rewards/rejected": -3.377838134765625, + "step": 681 + }, + { + "epoch": 0.14, + "learning_rate": 1.7218487394957984e-05, + "logits/chosen": -1.7749607563018799, + "logits/rejected": -1.782073974609375, + "logps/chosen": -310.2999572753906, + "logps/rejected": -361.42608642578125, + "loss": 0.1943, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8468155860900879, + "rewards/margins": 3.1006357669830322, + "rewards/rejected": -3.947451114654541, + "step": 682 + }, + { + "epoch": 0.14, + "learning_rate": 1.7214285714285718e-05, + "logits/chosen": -1.9808976650238037, + "logits/rejected": -2.119267463684082, + "logps/chosen": -258.61102294921875, + "logps/rejected": -317.12506103515625, + "loss": 0.1242, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6111612915992737, + "rewards/margins": 3.6671926975250244, + "rewards/rejected": -4.278353691101074, + "step": 683 + }, + { + "epoch": 0.14, + "learning_rate": 1.7210084033613448e-05, + "logits/chosen": -2.331777572631836, + "logits/rejected": -2.1018166542053223, + "logps/chosen": -336.4613037109375, + "logps/rejected": -345.8485412597656, + "loss": 0.3093, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6613254547119141, + "rewards/margins": 3.3645129203796387, + "rewards/rejected": -4.025838375091553, + "step": 684 + }, + { + "epoch": 0.14, + "learning_rate": 1.720588235294118e-05, + "logits/chosen": -2.323230266571045, + "logits/rejected": -1.4636880159378052, + "logps/chosen": -343.62652587890625, + "logps/rejected": -265.6925964355469, + "loss": 0.1822, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8246605396270752, + "rewards/margins": 3.6590194702148438, + "rewards/rejected": -4.48367977142334, + "step": 685 + }, + { + "epoch": 0.14, + "learning_rate": 1.720168067226891e-05, + "logits/chosen": -1.9645280838012695, + "logits/rejected": -2.000112533569336, + "logps/chosen": -412.9718933105469, + "logps/rejected": -385.92730712890625, + "loss": 0.2232, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.06948405504226685, + "rewards/margins": 3.859463691711426, + "rewards/rejected": -3.928947687149048, + "step": 686 + }, + { + "epoch": 0.14, + "learning_rate": 1.7197478991596642e-05, + "logits/chosen": -2.3017377853393555, + "logits/rejected": -2.226318359375, + "logps/chosen": -371.52813720703125, + "logps/rejected": -386.92352294921875, + "loss": 0.569, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8945224285125732, + "rewards/margins": 2.5431067943573, + "rewards/rejected": -3.437629222869873, + "step": 687 + }, + { + "epoch": 0.14, + "learning_rate": 1.7193277310924372e-05, + "logits/chosen": -2.076875686645508, + "logits/rejected": -2.1938681602478027, + "logps/chosen": -236.74893188476562, + "logps/rejected": -295.1387939453125, + "loss": 0.4666, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1517714262008667, + "rewards/margins": 3.644257068634033, + "rewards/rejected": -4.7960286140441895, + "step": 688 + }, + { + "epoch": 0.14, + "learning_rate": 1.7189075630252103e-05, + "logits/chosen": -1.9153176546096802, + "logits/rejected": -1.8392949104309082, + "logps/chosen": -319.8885498046875, + "logps/rejected": -331.46771240234375, + "loss": 0.5233, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9958435297012329, + "rewards/margins": 2.5478975772857666, + "rewards/rejected": -3.543741226196289, + "step": 689 + }, + { + "epoch": 0.14, + "learning_rate": 1.7184873949579833e-05, + "logits/chosen": -2.0593974590301514, + "logits/rejected": -1.7980951070785522, + "logps/chosen": -396.2750244140625, + "logps/rejected": -330.88067626953125, + "loss": 0.1758, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18590497970581055, + "rewards/margins": 2.9292798042297363, + "rewards/rejected": -3.115185022354126, + "step": 690 + }, + { + "epoch": 0.14, + "learning_rate": 1.7180672268907563e-05, + "logits/chosen": -2.0920934677124023, + "logits/rejected": -1.928424596786499, + "logps/chosen": -305.40850830078125, + "logps/rejected": -352.9814147949219, + "loss": 0.4255, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8967874050140381, + "rewards/margins": 3.9730615615844727, + "rewards/rejected": -4.86984920501709, + "step": 691 + }, + { + "epoch": 0.14, + "learning_rate": 1.7176470588235293e-05, + "logits/chosen": -1.8887488842010498, + "logits/rejected": -1.8493366241455078, + "logps/chosen": -380.24029541015625, + "logps/rejected": -452.76116943359375, + "loss": 0.1294, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5439873337745667, + "rewards/margins": 3.9800472259521484, + "rewards/rejected": -4.5240349769592285, + "step": 692 + }, + { + "epoch": 0.14, + "learning_rate": 1.7172268907563027e-05, + "logits/chosen": -1.9121209383010864, + "logits/rejected": -1.6266415119171143, + "logps/chosen": -254.69651794433594, + "logps/rejected": -304.2913513183594, + "loss": 0.2047, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.43852972984313965, + "rewards/margins": 3.38332462310791, + "rewards/rejected": -3.82185435295105, + "step": 693 + }, + { + "epoch": 0.15, + "learning_rate": 1.7168067226890757e-05, + "logits/chosen": -1.6789984703063965, + "logits/rejected": -1.8904715776443481, + "logps/chosen": -421.5914001464844, + "logps/rejected": -411.797119140625, + "loss": 0.3554, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.552094578742981, + "rewards/margins": 3.0789880752563477, + "rewards/rejected": -3.6310830116271973, + "step": 694 + }, + { + "epoch": 0.15, + "learning_rate": 1.7163865546218487e-05, + "logits/chosen": -2.0443294048309326, + "logits/rejected": -1.7869457006454468, + "logps/chosen": -344.9375, + "logps/rejected": -263.63287353515625, + "loss": 0.2826, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7583024501800537, + "rewards/margins": 3.060530662536621, + "rewards/rejected": -3.818833112716675, + "step": 695 + }, + { + "epoch": 0.15, + "learning_rate": 1.7159663865546218e-05, + "logits/chosen": -2.0378646850585938, + "logits/rejected": -2.0197155475616455, + "logps/chosen": -322.33917236328125, + "logps/rejected": -353.748291015625, + "loss": 0.7203, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.6857253313064575, + "rewards/margins": 1.2424486875534058, + "rewards/rejected": -2.9281740188598633, + "step": 696 + }, + { + "epoch": 0.15, + "learning_rate": 1.715546218487395e-05, + "logits/chosen": -2.2373576164245605, + "logits/rejected": -2.3120899200439453, + "logps/chosen": -371.76483154296875, + "logps/rejected": -389.9080810546875, + "loss": 0.3095, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.5310362577438354, + "rewards/margins": 3.078679084777832, + "rewards/rejected": -3.609715700149536, + "step": 697 + }, + { + "epoch": 0.15, + "learning_rate": 1.715126050420168e-05, + "logits/chosen": -2.0614230632781982, + "logits/rejected": -2.0350730419158936, + "logps/chosen": -329.12042236328125, + "logps/rejected": -334.38427734375, + "loss": 0.1935, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.19396349787712097, + "rewards/margins": 3.8307032585144043, + "rewards/rejected": -4.024666786193848, + "step": 698 + }, + { + "epoch": 0.15, + "learning_rate": 1.714705882352941e-05, + "logits/chosen": -2.2367663383483887, + "logits/rejected": -1.8684735298156738, + "logps/chosen": -333.8081359863281, + "logps/rejected": -328.0357666015625, + "loss": 0.2564, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1592206954956055, + "rewards/margins": 3.8257079124450684, + "rewards/rejected": -4.984929084777832, + "step": 699 + }, + { + "epoch": 0.15, + "learning_rate": 1.7142857142857142e-05, + "logits/chosen": -2.156998634338379, + "logits/rejected": -1.6880600452423096, + "logps/chosen": -379.6072082519531, + "logps/rejected": -370.2983093261719, + "loss": 0.2015, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7204073667526245, + "rewards/margins": 4.4051666259765625, + "rewards/rejected": -5.125573635101318, + "step": 700 + }, + { + "epoch": 0.15, + "learning_rate": 1.7138655462184875e-05, + "logits/chosen": -2.0436623096466064, + "logits/rejected": -1.5975549221038818, + "logps/chosen": -311.1147155761719, + "logps/rejected": -296.11279296875, + "loss": 0.2477, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.372866153717041, + "rewards/margins": 3.583177089691162, + "rewards/rejected": -4.956043243408203, + "step": 701 + }, + { + "epoch": 0.15, + "learning_rate": 1.7134453781512606e-05, + "logits/chosen": -2.10982346534729, + "logits/rejected": -1.935610294342041, + "logps/chosen": -346.2196960449219, + "logps/rejected": -352.525146484375, + "loss": 0.3001, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9308915734291077, + "rewards/margins": 3.532325029373169, + "rewards/rejected": -4.463216781616211, + "step": 702 + }, + { + "epoch": 0.15, + "learning_rate": 1.7130252100840336e-05, + "logits/chosen": -2.3135061264038086, + "logits/rejected": -2.128934621810913, + "logps/chosen": -383.6361083984375, + "logps/rejected": -321.07574462890625, + "loss": 0.3182, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6225247979164124, + "rewards/margins": 2.862924337387085, + "rewards/rejected": -3.4854490756988525, + "step": 703 + }, + { + "epoch": 0.15, + "learning_rate": 1.7126050420168066e-05, + "logits/chosen": -2.0983588695526123, + "logits/rejected": -2.1279656887054443, + "logps/chosen": -465.5097351074219, + "logps/rejected": -394.8746032714844, + "loss": 0.0862, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.588475227355957, + "rewards/margins": 3.6594579219818115, + "rewards/rejected": -4.2479329109191895, + "step": 704 + }, + { + "epoch": 0.15, + "learning_rate": 1.71218487394958e-05, + "logits/chosen": -2.2282066345214844, + "logits/rejected": -1.9480252265930176, + "logps/chosen": -341.08441162109375, + "logps/rejected": -319.0345153808594, + "loss": 0.0396, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9014024138450623, + "rewards/margins": 4.911703109741211, + "rewards/rejected": -5.813106060028076, + "step": 705 + }, + { + "epoch": 0.15, + "learning_rate": 1.711764705882353e-05, + "logits/chosen": -2.0968925952911377, + "logits/rejected": -1.6364997625350952, + "logps/chosen": -340.4033203125, + "logps/rejected": -342.8914794921875, + "loss": 0.2481, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.586866557598114, + "rewards/margins": 3.365581512451172, + "rewards/rejected": -3.9524483680725098, + "step": 706 + }, + { + "epoch": 0.15, + "learning_rate": 1.711344537815126e-05, + "logits/chosen": -2.2105777263641357, + "logits/rejected": -2.0825812816619873, + "logps/chosen": -353.2305908203125, + "logps/rejected": -412.3689880371094, + "loss": 0.3513, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.39845770597457886, + "rewards/margins": 3.2391905784606934, + "rewards/rejected": -3.637648582458496, + "step": 707 + }, + { + "epoch": 0.15, + "learning_rate": 1.7109243697478994e-05, + "logits/chosen": -2.11026668548584, + "logits/rejected": -1.5614155530929565, + "logps/chosen": -384.54168701171875, + "logps/rejected": -286.07867431640625, + "loss": 0.1942, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9091488122940063, + "rewards/margins": 3.0858840942382812, + "rewards/rejected": -3.995033025741577, + "step": 708 + }, + { + "epoch": 0.15, + "learning_rate": 1.7105042016806724e-05, + "logits/chosen": -2.072533130645752, + "logits/rejected": -1.6032205820083618, + "logps/chosen": -278.2815856933594, + "logps/rejected": -250.24232482910156, + "loss": 0.1631, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03345600143074989, + "rewards/margins": 3.1667966842651367, + "rewards/rejected": -3.2002525329589844, + "step": 709 + }, + { + "epoch": 0.15, + "learning_rate": 1.7100840336134454e-05, + "logits/chosen": -2.1043851375579834, + "logits/rejected": -1.4964663982391357, + "logps/chosen": -335.7789306640625, + "logps/rejected": -273.5032653808594, + "loss": 0.2496, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.2616996765136719, + "rewards/margins": 2.377262592315674, + "rewards/rejected": -2.6389622688293457, + "step": 710 + }, + { + "epoch": 0.15, + "learning_rate": 1.7096638655462184e-05, + "logits/chosen": -1.977784276008606, + "logits/rejected": -2.0842790603637695, + "logps/chosen": -374.2403259277344, + "logps/rejected": -359.81427001953125, + "loss": 0.2648, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0386358499526978, + "rewards/margins": 3.4374914169311523, + "rewards/rejected": -4.476127624511719, + "step": 711 + }, + { + "epoch": 0.15, + "learning_rate": 1.7092436974789918e-05, + "logits/chosen": -2.151043176651001, + "logits/rejected": -1.6124058961868286, + "logps/chosen": -352.8233642578125, + "logps/rejected": -310.1890869140625, + "loss": 0.107, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.38482666015625, + "rewards/margins": 4.073087215423584, + "rewards/rejected": -5.457913398742676, + "step": 712 + }, + { + "epoch": 0.15, + "learning_rate": 1.7088235294117648e-05, + "logits/chosen": -1.9535274505615234, + "logits/rejected": -1.2877719402313232, + "logps/chosen": -284.9102783203125, + "logps/rejected": -276.1152648925781, + "loss": 0.382, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4412153959274292, + "rewards/margins": 2.972545862197876, + "rewards/rejected": -4.413761138916016, + "step": 713 + }, + { + "epoch": 0.15, + "learning_rate": 1.708403361344538e-05, + "logits/chosen": -2.0154826641082764, + "logits/rejected": -1.9935276508331299, + "logps/chosen": -400.90594482421875, + "logps/rejected": -342.13616943359375, + "loss": 0.2915, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4212172031402588, + "rewards/margins": 3.207465171813965, + "rewards/rejected": -4.6286821365356445, + "step": 714 + }, + { + "epoch": 0.15, + "learning_rate": 1.707983193277311e-05, + "logits/chosen": -1.8846263885498047, + "logits/rejected": -1.741307020187378, + "logps/chosen": -346.0419921875, + "logps/rejected": -457.7156066894531, + "loss": 0.6746, + "rewards/accuracies": 0.5625, + "rewards/chosen": -2.2680182456970215, + "rewards/margins": 2.1655664443969727, + "rewards/rejected": -4.433585166931152, + "step": 715 + }, + { + "epoch": 0.15, + "learning_rate": 1.7075630252100842e-05, + "logits/chosen": -1.9114598035812378, + "logits/rejected": -2.1001482009887695, + "logps/chosen": -385.4615478515625, + "logps/rejected": -457.49261474609375, + "loss": 0.2002, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6733380556106567, + "rewards/margins": 4.430871963500977, + "rewards/rejected": -6.104209899902344, + "step": 716 + }, + { + "epoch": 0.15, + "learning_rate": 1.7071428571428573e-05, + "logits/chosen": -2.184648036956787, + "logits/rejected": -1.9081792831420898, + "logps/chosen": -359.1806640625, + "logps/rejected": -432.8729248046875, + "loss": 0.1026, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5722174644470215, + "rewards/margins": 4.043186187744141, + "rewards/rejected": -5.615403175354004, + "step": 717 + }, + { + "epoch": 0.15, + "learning_rate": 1.7067226890756303e-05, + "logits/chosen": -1.9436229467391968, + "logits/rejected": -1.7883977890014648, + "logps/chosen": -342.4736022949219, + "logps/rejected": -300.35491943359375, + "loss": 0.4697, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4920488595962524, + "rewards/margins": 2.424128532409668, + "rewards/rejected": -3.916177272796631, + "step": 718 + }, + { + "epoch": 0.15, + "learning_rate": 1.7063025210084033e-05, + "logits/chosen": -1.891648530960083, + "logits/rejected": -1.6593637466430664, + "logps/chosen": -192.44589233398438, + "logps/rejected": -186.4752197265625, + "loss": 0.2824, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.433328628540039, + "rewards/margins": 2.645397663116455, + "rewards/rejected": -5.078725814819336, + "step": 719 + }, + { + "epoch": 0.15, + "learning_rate": 1.7058823529411767e-05, + "logits/chosen": -1.926894187927246, + "logits/rejected": -1.8717057704925537, + "logps/chosen": -340.2047119140625, + "logps/rejected": -311.362060546875, + "loss": 0.6939, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.599405288696289, + "rewards/margins": 2.407620429992676, + "rewards/rejected": -4.007025718688965, + "step": 720 + }, + { + "epoch": 0.15, + "learning_rate": 1.7054621848739497e-05, + "logits/chosen": -2.0984959602355957, + "logits/rejected": -1.996431827545166, + "logps/chosen": -403.31640625, + "logps/rejected": -362.71942138671875, + "loss": 0.2613, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9902312755584717, + "rewards/margins": 3.4481632709503174, + "rewards/rejected": -5.438394546508789, + "step": 721 + }, + { + "epoch": 0.15, + "learning_rate": 1.7050420168067227e-05, + "logits/chosen": -1.9579122066497803, + "logits/rejected": -2.276113748550415, + "logps/chosen": -257.9245300292969, + "logps/rejected": -358.4361877441406, + "loss": 0.4497, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.913058876991272, + "rewards/margins": 3.0072388648986816, + "rewards/rejected": -4.920297622680664, + "step": 722 + }, + { + "epoch": 0.15, + "learning_rate": 1.7046218487394957e-05, + "logits/chosen": -2.154367208480835, + "logits/rejected": -1.9137614965438843, + "logps/chosen": -345.5387268066406, + "logps/rejected": -380.2763671875, + "loss": 0.6701, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.485901355743408, + "rewards/margins": 3.1181859970092773, + "rewards/rejected": -5.604086875915527, + "step": 723 + }, + { + "epoch": 0.15, + "learning_rate": 1.704201680672269e-05, + "logits/chosen": -2.0980918407440186, + "logits/rejected": -1.8121957778930664, + "logps/chosen": -268.64654541015625, + "logps/rejected": -273.2153015136719, + "loss": 0.4118, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.173196792602539, + "rewards/margins": 2.406545877456665, + "rewards/rejected": -4.579742431640625, + "step": 724 + }, + { + "epoch": 0.15, + "learning_rate": 1.703781512605042e-05, + "logits/chosen": -1.8285319805145264, + "logits/rejected": -2.174788475036621, + "logps/chosen": -206.33349609375, + "logps/rejected": -345.8277282714844, + "loss": 0.5907, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.9394997358322144, + "rewards/margins": 2.211231231689453, + "rewards/rejected": -4.150731086730957, + "step": 725 + }, + { + "epoch": 0.15, + "learning_rate": 1.703361344537815e-05, + "logits/chosen": -2.2193386554718018, + "logits/rejected": -2.2776079177856445, + "logps/chosen": -248.9822235107422, + "logps/rejected": -306.0496826171875, + "loss": 0.2325, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.1564745903015137, + "rewards/margins": 4.135500907897949, + "rewards/rejected": -6.291975021362305, + "step": 726 + }, + { + "epoch": 0.15, + "learning_rate": 1.702941176470588e-05, + "logits/chosen": -2.177299737930298, + "logits/rejected": -2.1963884830474854, + "logps/chosen": -312.93438720703125, + "logps/rejected": -401.07379150390625, + "loss": 0.1808, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3164889812469482, + "rewards/margins": 5.062575340270996, + "rewards/rejected": -6.379064559936523, + "step": 727 + }, + { + "epoch": 0.15, + "learning_rate": 1.7025210084033615e-05, + "logits/chosen": -1.960242509841919, + "logits/rejected": -1.5620043277740479, + "logps/chosen": -314.60357666015625, + "logps/rejected": -320.797119140625, + "loss": 0.4408, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.5881104469299316, + "rewards/margins": 3.4220874309539795, + "rewards/rejected": -6.010197639465332, + "step": 728 + }, + { + "epoch": 0.15, + "learning_rate": 1.7021008403361345e-05, + "logits/chosen": -2.244079113006592, + "logits/rejected": -1.9179579019546509, + "logps/chosen": -334.6098327636719, + "logps/rejected": -318.0796203613281, + "loss": 0.2865, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.4356093406677246, + "rewards/margins": 5.762797832489014, + "rewards/rejected": -8.198407173156738, + "step": 729 + }, + { + "epoch": 0.15, + "learning_rate": 1.7016806722689076e-05, + "logits/chosen": -2.2572948932647705, + "logits/rejected": -2.0175554752349854, + "logps/chosen": -336.3603515625, + "logps/rejected": -308.03411865234375, + "loss": 0.6594, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.2969253063201904, + "rewards/margins": 2.8958585262298584, + "rewards/rejected": -5.192783832550049, + "step": 730 + }, + { + "epoch": 0.15, + "learning_rate": 1.701260504201681e-05, + "logits/chosen": -1.9566154479980469, + "logits/rejected": -2.1031384468078613, + "logps/chosen": -163.4652557373047, + "logps/rejected": -218.31568908691406, + "loss": 0.4784, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.990065574645996, + "rewards/margins": 2.536547899246216, + "rewards/rejected": -5.526613235473633, + "step": 731 + }, + { + "epoch": 0.15, + "learning_rate": 1.700840336134454e-05, + "logits/chosen": -1.7026079893112183, + "logits/rejected": -2.092538595199585, + "logps/chosen": -382.9058837890625, + "logps/rejected": -344.6308898925781, + "loss": 0.3592, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.7175614833831787, + "rewards/margins": 3.662135124206543, + "rewards/rejected": -6.379697322845459, + "step": 732 + }, + { + "epoch": 0.15, + "learning_rate": 1.700420168067227e-05, + "logits/chosen": -2.3577661514282227, + "logits/rejected": -1.7167326211929321, + "logps/chosen": -291.07647705078125, + "logps/rejected": -293.70135498046875, + "loss": 0.3766, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.145514726638794, + "rewards/margins": 3.0079023838043213, + "rewards/rejected": -5.153417110443115, + "step": 733 + }, + { + "epoch": 0.15, + "learning_rate": 1.7e-05, + "logits/chosen": -1.9224162101745605, + "logits/rejected": -2.0145130157470703, + "logps/chosen": -301.79229736328125, + "logps/rejected": -379.64312744140625, + "loss": 0.3541, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.432405948638916, + "rewards/margins": 3.0382261276245117, + "rewards/rejected": -5.4706315994262695, + "step": 734 + }, + { + "epoch": 0.15, + "learning_rate": 1.6995798319327733e-05, + "logits/chosen": -1.698994517326355, + "logits/rejected": -1.6362982988357544, + "logps/chosen": -369.7939758300781, + "logps/rejected": -318.4671936035156, + "loss": 0.5482, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.7628350257873535, + "rewards/margins": 2.9005603790283203, + "rewards/rejected": -5.663394927978516, + "step": 735 + }, + { + "epoch": 0.15, + "learning_rate": 1.6991596638655464e-05, + "logits/chosen": -1.9325566291809082, + "logits/rejected": -1.7474031448364258, + "logps/chosen": -197.885498046875, + "logps/rejected": -297.7406005859375, + "loss": 0.3897, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.1120567321777344, + "rewards/margins": 2.5959575176239014, + "rewards/rejected": -4.708014011383057, + "step": 736 + }, + { + "epoch": 0.15, + "learning_rate": 1.6987394957983194e-05, + "logits/chosen": -2.037433624267578, + "logits/rejected": -1.5862784385681152, + "logps/chosen": -353.72991943359375, + "logps/rejected": -349.3614501953125, + "loss": 0.3791, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8576717376708984, + "rewards/margins": 2.9837183952331543, + "rewards/rejected": -4.841390132904053, + "step": 737 + }, + { + "epoch": 0.15, + "learning_rate": 1.6983193277310924e-05, + "logits/chosen": -2.2920360565185547, + "logits/rejected": -2.04499888420105, + "logps/chosen": -351.6866455078125, + "logps/rejected": -353.43218994140625, + "loss": 0.4162, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3684329986572266, + "rewards/margins": 1.8947117328643799, + "rewards/rejected": -3.2631447315216064, + "step": 738 + }, + { + "epoch": 0.15, + "learning_rate": 1.6978991596638658e-05, + "logits/chosen": -2.11529541015625, + "logits/rejected": -1.5084693431854248, + "logps/chosen": -350.6539611816406, + "logps/rejected": -284.243408203125, + "loss": 0.4971, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7144124507904053, + "rewards/margins": 2.403724431991577, + "rewards/rejected": -4.118136405944824, + "step": 739 + }, + { + "epoch": 0.15, + "learning_rate": 1.6974789915966388e-05, + "logits/chosen": -2.2437610626220703, + "logits/rejected": -2.058847665786743, + "logps/chosen": -365.35491943359375, + "logps/rejected": -385.7453918457031, + "loss": 0.5045, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2335970401763916, + "rewards/margins": 2.8418381214141846, + "rewards/rejected": -4.075435161590576, + "step": 740 + }, + { + "epoch": 0.16, + "learning_rate": 1.6970588235294118e-05, + "logits/chosen": -2.191389799118042, + "logits/rejected": -1.9575977325439453, + "logps/chosen": -381.34228515625, + "logps/rejected": -313.0478210449219, + "loss": 0.6515, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6277685165405273, + "rewards/margins": 1.6641924381256104, + "rewards/rejected": -3.291961193084717, + "step": 741 + }, + { + "epoch": 0.16, + "learning_rate": 1.696638655462185e-05, + "logits/chosen": -2.0234718322753906, + "logits/rejected": -1.8011419773101807, + "logps/chosen": -251.2014923095703, + "logps/rejected": -293.146728515625, + "loss": 0.2082, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7777428030967712, + "rewards/margins": 4.249463081359863, + "rewards/rejected": -5.027205944061279, + "step": 742 + }, + { + "epoch": 0.16, + "learning_rate": 1.6962184873949582e-05, + "logits/chosen": -1.864234447479248, + "logits/rejected": -2.0068678855895996, + "logps/chosen": -274.41192626953125, + "logps/rejected": -315.5820617675781, + "loss": 0.542, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.331112265586853, + "rewards/margins": 4.016895294189453, + "rewards/rejected": -5.3480072021484375, + "step": 743 + }, + { + "epoch": 0.16, + "learning_rate": 1.6957983193277312e-05, + "logits/chosen": -2.2664895057678223, + "logits/rejected": -2.3592734336853027, + "logps/chosen": -264.84539794921875, + "logps/rejected": -315.04931640625, + "loss": 0.1673, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0168516635894775, + "rewards/margins": 4.741488456726074, + "rewards/rejected": -5.758340358734131, + "step": 744 + }, + { + "epoch": 0.16, + "learning_rate": 1.6953781512605042e-05, + "logits/chosen": -2.037034034729004, + "logits/rejected": -1.3760628700256348, + "logps/chosen": -334.2329406738281, + "logps/rejected": -353.3927307128906, + "loss": 0.4428, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.82315993309021, + "rewards/margins": 2.4147167205810547, + "rewards/rejected": -4.237876892089844, + "step": 745 + }, + { + "epoch": 0.16, + "learning_rate": 1.6949579831932773e-05, + "logits/chosen": -1.9302103519439697, + "logits/rejected": -2.135316848754883, + "logps/chosen": -240.90794372558594, + "logps/rejected": -403.2741394042969, + "loss": 0.3176, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8007187247276306, + "rewards/margins": 3.941528558731079, + "rewards/rejected": -4.742247104644775, + "step": 746 + }, + { + "epoch": 0.16, + "learning_rate": 1.6945378151260506e-05, + "logits/chosen": -1.7697516679763794, + "logits/rejected": -1.861335039138794, + "logps/chosen": -274.8431701660156, + "logps/rejected": -358.81903076171875, + "loss": 0.5667, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.0028183460235596, + "rewards/margins": 3.1203484535217285, + "rewards/rejected": -5.123167037963867, + "step": 747 + }, + { + "epoch": 0.16, + "learning_rate": 1.6941176470588237e-05, + "logits/chosen": -2.134904384613037, + "logits/rejected": -1.4478917121887207, + "logps/chosen": -364.1796569824219, + "logps/rejected": -301.8296813964844, + "loss": 0.282, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9386551380157471, + "rewards/margins": 3.4062767028808594, + "rewards/rejected": -4.3449320793151855, + "step": 748 + }, + { + "epoch": 0.16, + "learning_rate": 1.6936974789915967e-05, + "logits/chosen": -2.1162972450256348, + "logits/rejected": -1.95829176902771, + "logps/chosen": -316.54095458984375, + "logps/rejected": -275.2511901855469, + "loss": 0.9433, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.344026803970337, + "rewards/margins": 1.6161044836044312, + "rewards/rejected": -2.9601311683654785, + "step": 749 + }, + { + "epoch": 0.16, + "learning_rate": 1.69327731092437e-05, + "logits/chosen": -2.0439603328704834, + "logits/rejected": -2.044919013977051, + "logps/chosen": -273.7200927734375, + "logps/rejected": -262.03143310546875, + "loss": 0.32, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1411241292953491, + "rewards/margins": 2.6149826049804688, + "rewards/rejected": -3.756106376647949, + "step": 750 + }, + { + "epoch": 0.16, + "learning_rate": 1.692857142857143e-05, + "logits/chosen": -2.0982487201690674, + "logits/rejected": -2.0909159183502197, + "logps/chosen": -224.33486938476562, + "logps/rejected": -326.01397705078125, + "loss": 0.1559, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.608189344406128, + "rewards/margins": 4.219444274902344, + "rewards/rejected": -5.827633857727051, + "step": 751 + }, + { + "epoch": 0.16, + "learning_rate": 1.692436974789916e-05, + "logits/chosen": -2.2449052333831787, + "logits/rejected": -1.840789794921875, + "logps/chosen": -396.5846862792969, + "logps/rejected": -487.5141906738281, + "loss": 0.6014, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.068540573120117, + "rewards/margins": 2.9587674140930176, + "rewards/rejected": -5.027308464050293, + "step": 752 + }, + { + "epoch": 0.16, + "learning_rate": 1.692016806722689e-05, + "logits/chosen": -2.167710781097412, + "logits/rejected": -2.1640191078186035, + "logps/chosen": -353.65362548828125, + "logps/rejected": -397.40386962890625, + "loss": 0.8355, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2302634716033936, + "rewards/margins": 2.131781578063965, + "rewards/rejected": -3.3620448112487793, + "step": 753 + }, + { + "epoch": 0.16, + "learning_rate": 1.6915966386554625e-05, + "logits/chosen": -1.7070573568344116, + "logits/rejected": -1.8579171895980835, + "logps/chosen": -252.21473693847656, + "logps/rejected": -327.1890869140625, + "loss": 0.3548, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6616603136062622, + "rewards/margins": 3.498135566711426, + "rewards/rejected": -5.159795761108398, + "step": 754 + }, + { + "epoch": 0.16, + "learning_rate": 1.6911764705882355e-05, + "logits/chosen": -2.249856472015381, + "logits/rejected": -1.8819094896316528, + "logps/chosen": -373.22430419921875, + "logps/rejected": -338.3874206542969, + "loss": 0.4051, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7668718099594116, + "rewards/margins": 2.1771368980407715, + "rewards/rejected": -2.9440085887908936, + "step": 755 + }, + { + "epoch": 0.16, + "learning_rate": 1.6907563025210085e-05, + "logits/chosen": -2.296297550201416, + "logits/rejected": -1.9027172327041626, + "logps/chosen": -415.1441650390625, + "logps/rejected": -323.9563903808594, + "loss": 0.2058, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8324196934700012, + "rewards/margins": 4.69835090637207, + "rewards/rejected": -5.530771255493164, + "step": 756 + }, + { + "epoch": 0.16, + "learning_rate": 1.6903361344537815e-05, + "logits/chosen": -2.232555866241455, + "logits/rejected": -1.6718475818634033, + "logps/chosen": -339.8697204589844, + "logps/rejected": -313.59759521484375, + "loss": 0.2137, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.11247356981039047, + "rewards/margins": 3.270537853240967, + "rewards/rejected": -3.38301157951355, + "step": 757 + }, + { + "epoch": 0.16, + "learning_rate": 1.689915966386555e-05, + "logits/chosen": -2.215729236602783, + "logits/rejected": -1.5736956596374512, + "logps/chosen": -286.5233459472656, + "logps/rejected": -239.6471405029297, + "loss": 0.3545, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2287719249725342, + "rewards/margins": 2.747135877609253, + "rewards/rejected": -3.975907802581787, + "step": 758 + }, + { + "epoch": 0.16, + "learning_rate": 1.689495798319328e-05, + "logits/chosen": -1.9748581647872925, + "logits/rejected": -1.919254183769226, + "logps/chosen": -288.23077392578125, + "logps/rejected": -351.3341369628906, + "loss": 0.2988, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7039434909820557, + "rewards/margins": 3.1637625694274902, + "rewards/rejected": -4.867705821990967, + "step": 759 + }, + { + "epoch": 0.16, + "learning_rate": 1.689075630252101e-05, + "logits/chosen": -1.8948873281478882, + "logits/rejected": -1.9115080833435059, + "logps/chosen": -259.0742492675781, + "logps/rejected": -404.9477233886719, + "loss": 0.2827, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1262116432189941, + "rewards/margins": 3.5397770404815674, + "rewards/rejected": -4.665988922119141, + "step": 760 + }, + { + "epoch": 0.16, + "learning_rate": 1.688655462184874e-05, + "logits/chosen": -2.2461085319519043, + "logits/rejected": -1.8397977352142334, + "logps/chosen": -323.16986083984375, + "logps/rejected": -281.90606689453125, + "loss": 0.2954, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.00258469581604, + "rewards/margins": 2.2382278442382812, + "rewards/rejected": -3.2408125400543213, + "step": 761 + }, + { + "epoch": 0.16, + "learning_rate": 1.6882352941176473e-05, + "logits/chosen": -2.2737436294555664, + "logits/rejected": -2.0214569568634033, + "logps/chosen": -253.75103759765625, + "logps/rejected": -264.873779296875, + "loss": 0.6854, + "rewards/accuracies": 0.5625, + "rewards/chosen": -2.4338724613189697, + "rewards/margins": 2.171159505844116, + "rewards/rejected": -4.605031967163086, + "step": 762 + }, + { + "epoch": 0.16, + "learning_rate": 1.6878151260504203e-05, + "logits/chosen": -2.2242767810821533, + "logits/rejected": -2.18857479095459, + "logps/chosen": -441.96405029296875, + "logps/rejected": -390.2977294921875, + "loss": 0.5068, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5113966464996338, + "rewards/margins": 2.2426154613494873, + "rewards/rejected": -3.754012107849121, + "step": 763 + }, + { + "epoch": 0.16, + "learning_rate": 1.6873949579831934e-05, + "logits/chosen": -2.274514675140381, + "logits/rejected": -1.9777116775512695, + "logps/chosen": -320.8673400878906, + "logps/rejected": -258.4037170410156, + "loss": 0.5442, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5064418315887451, + "rewards/margins": 2.0420022010803223, + "rewards/rejected": -3.5484437942504883, + "step": 764 + }, + { + "epoch": 0.16, + "learning_rate": 1.6869747899159664e-05, + "logits/chosen": -2.1914801597595215, + "logits/rejected": -2.0470032691955566, + "logps/chosen": -340.25311279296875, + "logps/rejected": -362.69256591796875, + "loss": 0.185, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1276934146881104, + "rewards/margins": 4.396705627441406, + "rewards/rejected": -5.524399280548096, + "step": 765 + }, + { + "epoch": 0.16, + "learning_rate": 1.6865546218487397e-05, + "logits/chosen": -2.424621343612671, + "logits/rejected": -1.9833660125732422, + "logps/chosen": -382.83404541015625, + "logps/rejected": -272.07977294921875, + "loss": 0.4559, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.056641697883606, + "rewards/margins": 1.9395010471343994, + "rewards/rejected": -2.996142864227295, + "step": 766 + }, + { + "epoch": 0.16, + "learning_rate": 1.6861344537815128e-05, + "logits/chosen": -2.056659698486328, + "logits/rejected": -2.1205928325653076, + "logps/chosen": -261.20574951171875, + "logps/rejected": -331.51470947265625, + "loss": 0.2219, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2542498111724854, + "rewards/margins": 3.1566002368927, + "rewards/rejected": -4.410849571228027, + "step": 767 + }, + { + "epoch": 0.16, + "learning_rate": 1.6857142857142858e-05, + "logits/chosen": -1.8308494091033936, + "logits/rejected": -1.9774675369262695, + "logps/chosen": -333.05780029296875, + "logps/rejected": -360.0506591796875, + "loss": 0.1289, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9276551604270935, + "rewards/margins": 3.8657102584838867, + "rewards/rejected": -4.793365478515625, + "step": 768 + }, + { + "epoch": 0.16, + "learning_rate": 1.6852941176470588e-05, + "logits/chosen": -2.085810422897339, + "logits/rejected": -2.072783946990967, + "logps/chosen": -315.7756042480469, + "logps/rejected": -279.2412109375, + "loss": 0.2876, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6299153566360474, + "rewards/margins": 2.3403263092041016, + "rewards/rejected": -3.9702417850494385, + "step": 769 + }, + { + "epoch": 0.16, + "learning_rate": 1.6848739495798322e-05, + "logits/chosen": -2.5391578674316406, + "logits/rejected": -1.9431068897247314, + "logps/chosen": -314.40704345703125, + "logps/rejected": -261.99078369140625, + "loss": 0.6811, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.90847647190094, + "rewards/margins": 1.9313626289367676, + "rewards/rejected": -3.839838981628418, + "step": 770 + }, + { + "epoch": 0.16, + "learning_rate": 1.6844537815126052e-05, + "logits/chosen": -2.3814117908477783, + "logits/rejected": -1.6420822143554688, + "logps/chosen": -422.2216796875, + "logps/rejected": -282.3962707519531, + "loss": 0.1183, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9640604257583618, + "rewards/margins": 3.6622517108917236, + "rewards/rejected": -4.626311779022217, + "step": 771 + }, + { + "epoch": 0.16, + "learning_rate": 1.6840336134453782e-05, + "logits/chosen": -1.9254976511001587, + "logits/rejected": -2.0351967811584473, + "logps/chosen": -272.7548828125, + "logps/rejected": -327.5586853027344, + "loss": 0.1996, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.559951663017273, + "rewards/margins": 3.5174148082733154, + "rewards/rejected": -5.077365875244141, + "step": 772 + }, + { + "epoch": 0.16, + "learning_rate": 1.6836134453781516e-05, + "logits/chosen": -2.0609374046325684, + "logits/rejected": -1.9988148212432861, + "logps/chosen": -364.296875, + "logps/rejected": -342.71685791015625, + "loss": 0.3271, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0422489643096924, + "rewards/margins": 2.7478864192962646, + "rewards/rejected": -3.790135383605957, + "step": 773 + }, + { + "epoch": 0.16, + "learning_rate": 1.6831932773109246e-05, + "logits/chosen": -2.2556371688842773, + "logits/rejected": -1.8432490825653076, + "logps/chosen": -359.98486328125, + "logps/rejected": -363.55517578125, + "loss": 0.4182, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2572693824768066, + "rewards/margins": 2.8683996200561523, + "rewards/rejected": -4.125669002532959, + "step": 774 + }, + { + "epoch": 0.16, + "learning_rate": 1.6827731092436976e-05, + "logits/chosen": -2.075491428375244, + "logits/rejected": -1.6872649192810059, + "logps/chosen": -248.0001220703125, + "logps/rejected": -273.78009033203125, + "loss": 0.4588, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9449328184127808, + "rewards/margins": 2.391058921813965, + "rewards/rejected": -4.335991859436035, + "step": 775 + }, + { + "epoch": 0.16, + "learning_rate": 1.6823529411764706e-05, + "logits/chosen": -2.1104815006256104, + "logits/rejected": -1.71555495262146, + "logps/chosen": -255.82373046875, + "logps/rejected": -241.81765747070312, + "loss": 0.2554, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7972748279571533, + "rewards/margins": 3.034355640411377, + "rewards/rejected": -4.831630706787109, + "step": 776 + }, + { + "epoch": 0.16, + "learning_rate": 1.681932773109244e-05, + "logits/chosen": -2.084442615509033, + "logits/rejected": -1.8904378414154053, + "logps/chosen": -375.68731689453125, + "logps/rejected": -372.9009704589844, + "loss": 0.362, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0301165580749512, + "rewards/margins": 3.63201904296875, + "rewards/rejected": -4.662135601043701, + "step": 777 + }, + { + "epoch": 0.16, + "learning_rate": 1.681512605042017e-05, + "logits/chosen": -2.0034523010253906, + "logits/rejected": -2.2124459743499756, + "logps/chosen": -207.46792602539062, + "logps/rejected": -279.690673828125, + "loss": 0.3807, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4096014499664307, + "rewards/margins": 3.102025032043457, + "rewards/rejected": -4.511626243591309, + "step": 778 + }, + { + "epoch": 0.16, + "learning_rate": 1.68109243697479e-05, + "logits/chosen": -2.0103211402893066, + "logits/rejected": -1.9724972248077393, + "logps/chosen": -331.57659912109375, + "logps/rejected": -597.255126953125, + "loss": 0.2127, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7209285497665405, + "rewards/margins": 3.5068488121032715, + "rewards/rejected": -5.227777481079102, + "step": 779 + }, + { + "epoch": 0.16, + "learning_rate": 1.680672268907563e-05, + "logits/chosen": -2.1506237983703613, + "logits/rejected": -2.103100538253784, + "logps/chosen": -268.5717468261719, + "logps/rejected": -291.78289794921875, + "loss": 0.4498, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.254150390625, + "rewards/margins": 2.6157407760620117, + "rewards/rejected": -4.86989164352417, + "step": 780 + }, + { + "epoch": 0.16, + "learning_rate": 1.6802521008403364e-05, + "logits/chosen": -2.157309055328369, + "logits/rejected": -1.8166359663009644, + "logps/chosen": -476.0903015136719, + "logps/rejected": -417.87567138671875, + "loss": 0.4418, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5498747825622559, + "rewards/margins": 2.613753318786621, + "rewards/rejected": -4.163628101348877, + "step": 781 + }, + { + "epoch": 0.16, + "learning_rate": 1.6798319327731095e-05, + "logits/chosen": -2.344633102416992, + "logits/rejected": -2.0754342079162598, + "logps/chosen": -395.18255615234375, + "logps/rejected": -377.35589599609375, + "loss": 0.4607, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.523148536682129, + "rewards/margins": 2.0642309188842773, + "rewards/rejected": -3.5873796939849854, + "step": 782 + }, + { + "epoch": 0.16, + "learning_rate": 1.6794117647058825e-05, + "logits/chosen": -2.101289987564087, + "logits/rejected": -1.602689504623413, + "logps/chosen": -312.36981201171875, + "logps/rejected": -350.0341796875, + "loss": 0.1088, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7673592567443848, + "rewards/margins": 3.626525640487671, + "rewards/rejected": -5.393884658813477, + "step": 783 + }, + { + "epoch": 0.16, + "learning_rate": 1.6789915966386555e-05, + "logits/chosen": -2.4433979988098145, + "logits/rejected": -2.4493441581726074, + "logps/chosen": -371.55267333984375, + "logps/rejected": -392.18963623046875, + "loss": 0.3999, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6184519529342651, + "rewards/margins": 1.6528499126434326, + "rewards/rejected": -3.2713019847869873, + "step": 784 + }, + { + "epoch": 0.16, + "learning_rate": 1.678571428571429e-05, + "logits/chosen": -2.0333924293518066, + "logits/rejected": -2.0256452560424805, + "logps/chosen": -319.7632141113281, + "logps/rejected": -403.7701110839844, + "loss": 0.0919, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9684919118881226, + "rewards/margins": 4.64276123046875, + "rewards/rejected": -5.611252784729004, + "step": 785 + }, + { + "epoch": 0.16, + "learning_rate": 1.678151260504202e-05, + "logits/chosen": -1.9279687404632568, + "logits/rejected": -1.769026279449463, + "logps/chosen": -356.0840148925781, + "logps/rejected": -363.72796630859375, + "loss": 0.349, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9372475147247314, + "rewards/margins": 3.156273365020752, + "rewards/rejected": -5.093520641326904, + "step": 786 + }, + { + "epoch": 0.16, + "learning_rate": 1.677731092436975e-05, + "logits/chosen": -1.9117867946624756, + "logits/rejected": -1.5678147077560425, + "logps/chosen": -448.7518310546875, + "logps/rejected": -339.37255859375, + "loss": 0.5029, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.287269115447998, + "rewards/margins": 1.6798906326293945, + "rewards/rejected": -3.9671595096588135, + "step": 787 + }, + { + "epoch": 0.16, + "learning_rate": 1.677310924369748e-05, + "logits/chosen": -2.0754055976867676, + "logits/rejected": -1.9076216220855713, + "logps/chosen": -281.4664611816406, + "logps/rejected": -297.4395751953125, + "loss": 0.4817, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.540342926979065, + "rewards/margins": 3.485705614089966, + "rewards/rejected": -5.02604866027832, + "step": 788 + }, + { + "epoch": 0.17, + "learning_rate": 1.6768907563025213e-05, + "logits/chosen": -2.2240195274353027, + "logits/rejected": -1.9126520156860352, + "logps/chosen": -228.79345703125, + "logps/rejected": -254.69720458984375, + "loss": 0.1187, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.7896902561187744, + "rewards/margins": 4.832139015197754, + "rewards/rejected": -7.621829509735107, + "step": 789 + }, + { + "epoch": 0.17, + "learning_rate": 1.6764705882352943e-05, + "logits/chosen": -2.1068506240844727, + "logits/rejected": -1.8672306537628174, + "logps/chosen": -302.209716796875, + "logps/rejected": -310.5289306640625, + "loss": 0.2376, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.1633200645446777, + "rewards/margins": 3.0395374298095703, + "rewards/rejected": -5.20285701751709, + "step": 790 + }, + { + "epoch": 0.17, + "learning_rate": 1.6760504201680673e-05, + "logits/chosen": -1.9448015689849854, + "logits/rejected": -1.9556479454040527, + "logps/chosen": -405.7635803222656, + "logps/rejected": -362.47381591796875, + "loss": 0.2758, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1993584632873535, + "rewards/margins": 3.4291772842407227, + "rewards/rejected": -4.628536224365234, + "step": 791 + }, + { + "epoch": 0.17, + "learning_rate": 1.6756302521008404e-05, + "logits/chosen": -1.7859869003295898, + "logits/rejected": -1.378179669380188, + "logps/chosen": -264.8746032714844, + "logps/rejected": -270.6776428222656, + "loss": 0.5102, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.3737199306488037, + "rewards/margins": 3.250246286392212, + "rewards/rejected": -5.623966217041016, + "step": 792 + }, + { + "epoch": 0.17, + "learning_rate": 1.6752100840336137e-05, + "logits/chosen": -2.1382250785827637, + "logits/rejected": -2.0095725059509277, + "logps/chosen": -308.4726257324219, + "logps/rejected": -288.9901428222656, + "loss": 0.1775, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.295863151550293, + "rewards/margins": 4.498048305511475, + "rewards/rejected": -5.793911933898926, + "step": 793 + }, + { + "epoch": 0.17, + "learning_rate": 1.6747899159663867e-05, + "logits/chosen": -2.27700138092041, + "logits/rejected": -1.8501858711242676, + "logps/chosen": -448.6407470703125, + "logps/rejected": -402.751220703125, + "loss": 0.3421, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2299268245697021, + "rewards/margins": 3.9400675296783447, + "rewards/rejected": -5.169994354248047, + "step": 794 + }, + { + "epoch": 0.17, + "learning_rate": 1.6743697478991598e-05, + "logits/chosen": -1.8190721273422241, + "logits/rejected": -1.5961840152740479, + "logps/chosen": -237.38885498046875, + "logps/rejected": -302.30084228515625, + "loss": 0.1003, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.062920570373535, + "rewards/margins": 4.599336624145508, + "rewards/rejected": -6.662257194519043, + "step": 795 + }, + { + "epoch": 0.17, + "learning_rate": 1.673949579831933e-05, + "logits/chosen": -2.145782947540283, + "logits/rejected": -1.9014884233474731, + "logps/chosen": -347.6669006347656, + "logps/rejected": -322.4659118652344, + "loss": 0.3878, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8265118598937988, + "rewards/margins": 2.844614267349243, + "rewards/rejected": -4.671126365661621, + "step": 796 + }, + { + "epoch": 0.17, + "learning_rate": 1.673529411764706e-05, + "logits/chosen": -2.083191394805908, + "logits/rejected": -2.0543155670166016, + "logps/chosen": -311.47314453125, + "logps/rejected": -414.01861572265625, + "loss": 0.2183, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5698487758636475, + "rewards/margins": 3.643481731414795, + "rewards/rejected": -5.213330268859863, + "step": 797 + }, + { + "epoch": 0.17, + "learning_rate": 1.673109243697479e-05, + "logits/chosen": -2.074673652648926, + "logits/rejected": -1.9473825693130493, + "logps/chosen": -284.04864501953125, + "logps/rejected": -263.177734375, + "loss": 0.5197, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.5385236740112305, + "rewards/margins": 1.558686375617981, + "rewards/rejected": -4.097209930419922, + "step": 798 + }, + { + "epoch": 0.17, + "learning_rate": 1.6726890756302522e-05, + "logits/chosen": -1.9169516563415527, + "logits/rejected": -2.1017541885375977, + "logps/chosen": -209.96096801757812, + "logps/rejected": -317.68707275390625, + "loss": 0.3793, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8720102310180664, + "rewards/margins": 3.108508348464966, + "rewards/rejected": -4.980518341064453, + "step": 799 + }, + { + "epoch": 0.17, + "learning_rate": 1.6722689075630255e-05, + "logits/chosen": -1.9574940204620361, + "logits/rejected": -1.3475319147109985, + "logps/chosen": -296.0467834472656, + "logps/rejected": -308.9198913574219, + "loss": 0.4556, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5379273891448975, + "rewards/margins": 3.087721347808838, + "rewards/rejected": -4.625648498535156, + "step": 800 + }, + { + "epoch": 0.17, + "learning_rate": 1.6718487394957986e-05, + "logits/chosen": -2.219754457473755, + "logits/rejected": -1.9547958374023438, + "logps/chosen": -409.7639465332031, + "logps/rejected": -411.67901611328125, + "loss": 0.4537, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3477132320404053, + "rewards/margins": 3.5002918243408203, + "rewards/rejected": -4.848004341125488, + "step": 801 + }, + { + "epoch": 0.17, + "learning_rate": 1.6714285714285716e-05, + "logits/chosen": -2.143476724624634, + "logits/rejected": -1.8891558647155762, + "logps/chosen": -360.0707092285156, + "logps/rejected": -391.6202087402344, + "loss": 0.1866, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6756775379180908, + "rewards/margins": 4.436486721038818, + "rewards/rejected": -6.112164497375488, + "step": 802 + }, + { + "epoch": 0.17, + "learning_rate": 1.6710084033613446e-05, + "logits/chosen": -1.8714195489883423, + "logits/rejected": -1.5319408178329468, + "logps/chosen": -298.5542907714844, + "logps/rejected": -298.84100341796875, + "loss": 0.2397, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2462854385375977, + "rewards/margins": 3.2517969608306885, + "rewards/rejected": -4.498082160949707, + "step": 803 + }, + { + "epoch": 0.17, + "learning_rate": 1.670588235294118e-05, + "logits/chosen": -2.017505645751953, + "logits/rejected": -1.9519598484039307, + "logps/chosen": -360.55242919921875, + "logps/rejected": -346.94952392578125, + "loss": 0.3809, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0962398052215576, + "rewards/margins": 3.0482583045959473, + "rewards/rejected": -4.144497871398926, + "step": 804 + }, + { + "epoch": 0.17, + "learning_rate": 1.670168067226891e-05, + "logits/chosen": -2.340400218963623, + "logits/rejected": -2.199429512023926, + "logps/chosen": -311.14617919921875, + "logps/rejected": -276.6678466796875, + "loss": 0.455, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7301533222198486, + "rewards/margins": 2.1489012241363525, + "rewards/rejected": -3.8790547847747803, + "step": 805 + }, + { + "epoch": 0.17, + "learning_rate": 1.669747899159664e-05, + "logits/chosen": -1.7521651983261108, + "logits/rejected": -2.2176084518432617, + "logps/chosen": -144.79974365234375, + "logps/rejected": -236.9078369140625, + "loss": 0.3722, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.937251329421997, + "rewards/margins": 2.42926025390625, + "rewards/rejected": -4.366511344909668, + "step": 806 + }, + { + "epoch": 0.17, + "learning_rate": 1.669327731092437e-05, + "logits/chosen": -2.192504644393921, + "logits/rejected": -1.9060466289520264, + "logps/chosen": -319.04949951171875, + "logps/rejected": -304.0797424316406, + "loss": 0.457, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3857057094573975, + "rewards/margins": 2.898308277130127, + "rewards/rejected": -4.284013748168945, + "step": 807 + }, + { + "epoch": 0.17, + "learning_rate": 1.6689075630252104e-05, + "logits/chosen": -2.053361177444458, + "logits/rejected": -1.803290605545044, + "logps/chosen": -276.19207763671875, + "logps/rejected": -323.74530029296875, + "loss": 0.6948, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.045436143875122, + "rewards/margins": 1.7818390130996704, + "rewards/rejected": -3.827275276184082, + "step": 808 + }, + { + "epoch": 0.17, + "learning_rate": 1.6684873949579834e-05, + "logits/chosen": -1.9378191232681274, + "logits/rejected": -1.8617969751358032, + "logps/chosen": -305.732666015625, + "logps/rejected": -403.3728942871094, + "loss": 0.1649, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0122315883636475, + "rewards/margins": 3.1898858547210693, + "rewards/rejected": -4.202117919921875, + "step": 809 + }, + { + "epoch": 0.17, + "learning_rate": 1.6680672268907564e-05, + "logits/chosen": -1.5843219757080078, + "logits/rejected": -1.7740013599395752, + "logps/chosen": -348.0705261230469, + "logps/rejected": -352.1595458984375, + "loss": 0.9016, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.067608594894409, + "rewards/margins": 1.5226538181304932, + "rewards/rejected": -4.590262413024902, + "step": 810 + }, + { + "epoch": 0.17, + "learning_rate": 1.6676470588235295e-05, + "logits/chosen": -2.06152606010437, + "logits/rejected": -1.969719409942627, + "logps/chosen": -382.32061767578125, + "logps/rejected": -324.88482666015625, + "loss": 0.3366, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2894357442855835, + "rewards/margins": 3.620511054992676, + "rewards/rejected": -4.909946918487549, + "step": 811 + }, + { + "epoch": 0.17, + "learning_rate": 1.6672268907563028e-05, + "logits/chosen": -2.0924124717712402, + "logits/rejected": -1.4267748594284058, + "logps/chosen": -368.8747863769531, + "logps/rejected": -271.54833984375, + "loss": 0.7101, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.334439754486084, + "rewards/margins": 2.4245223999023438, + "rewards/rejected": -4.7589616775512695, + "step": 812 + }, + { + "epoch": 0.17, + "learning_rate": 1.666806722689076e-05, + "logits/chosen": -2.0999631881713867, + "logits/rejected": -2.0761067867279053, + "logps/chosen": -249.6148681640625, + "logps/rejected": -257.709228515625, + "loss": 0.2051, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5720138549804688, + "rewards/margins": 3.9965007305145264, + "rewards/rejected": -5.568514823913574, + "step": 813 + }, + { + "epoch": 0.17, + "learning_rate": 1.666386554621849e-05, + "logits/chosen": -1.9476782083511353, + "logits/rejected": -1.9972881078720093, + "logps/chosen": -261.1157531738281, + "logps/rejected": -280.040283203125, + "loss": 0.8267, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.0977253913879395, + "rewards/margins": 1.4153145551681519, + "rewards/rejected": -3.5130398273468018, + "step": 814 + }, + { + "epoch": 0.17, + "learning_rate": 1.665966386554622e-05, + "logits/chosen": -1.8016812801361084, + "logits/rejected": -2.0127463340759277, + "logps/chosen": -167.29953002929688, + "logps/rejected": -262.3382263183594, + "loss": 0.347, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9207371473312378, + "rewards/margins": 2.5700247287750244, + "rewards/rejected": -4.490761756896973, + "step": 815 + }, + { + "epoch": 0.17, + "learning_rate": 1.6655462184873953e-05, + "logits/chosen": -2.1510488986968994, + "logits/rejected": -1.751096487045288, + "logps/chosen": -361.0743408203125, + "logps/rejected": -333.98529052734375, + "loss": 0.4481, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6933932304382324, + "rewards/margins": 1.940618634223938, + "rewards/rejected": -3.634012222290039, + "step": 816 + }, + { + "epoch": 0.17, + "learning_rate": 1.6651260504201683e-05, + "logits/chosen": -2.1181344985961914, + "logits/rejected": -1.837376356124878, + "logps/chosen": -268.8865966796875, + "logps/rejected": -276.5599060058594, + "loss": 0.1769, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2857040166854858, + "rewards/margins": 2.93165922164917, + "rewards/rejected": -4.217363357543945, + "step": 817 + }, + { + "epoch": 0.17, + "learning_rate": 1.6647058823529413e-05, + "logits/chosen": -2.012856960296631, + "logits/rejected": -1.591303825378418, + "logps/chosen": -417.49609375, + "logps/rejected": -322.93426513671875, + "loss": 0.3277, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9792369604110718, + "rewards/margins": 3.5162994861602783, + "rewards/rejected": -4.495536804199219, + "step": 818 + }, + { + "epoch": 0.17, + "learning_rate": 1.6642857142857147e-05, + "logits/chosen": -1.9322481155395508, + "logits/rejected": -2.034684181213379, + "logps/chosen": -213.40687561035156, + "logps/rejected": -292.88848876953125, + "loss": 0.3431, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.089569568634033, + "rewards/margins": 2.6935977935791016, + "rewards/rejected": -4.783167362213135, + "step": 819 + }, + { + "epoch": 0.17, + "learning_rate": 1.6638655462184877e-05, + "logits/chosen": -1.8864178657531738, + "logits/rejected": -2.1955924034118652, + "logps/chosen": -292.01617431640625, + "logps/rejected": -304.77667236328125, + "loss": 0.409, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.6079450845718384, + "rewards/margins": 2.152806043624878, + "rewards/rejected": -3.760751247406006, + "step": 820 + }, + { + "epoch": 0.17, + "learning_rate": 1.6634453781512607e-05, + "logits/chosen": -1.7228481769561768, + "logits/rejected": -2.0135490894317627, + "logps/chosen": -197.5811309814453, + "logps/rejected": -315.9188232421875, + "loss": 0.1827, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.016920328140259, + "rewards/margins": 3.6686692237854004, + "rewards/rejected": -5.685589790344238, + "step": 821 + }, + { + "epoch": 0.17, + "learning_rate": 1.6630252100840337e-05, + "logits/chosen": -1.976762294769287, + "logits/rejected": -1.8989331722259521, + "logps/chosen": -391.1483154296875, + "logps/rejected": -490.8774108886719, + "loss": 0.1226, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8597344756126404, + "rewards/margins": 4.330629348754883, + "rewards/rejected": -5.190363883972168, + "step": 822 + }, + { + "epoch": 0.17, + "learning_rate": 1.662605042016807e-05, + "logits/chosen": -2.1878890991210938, + "logits/rejected": -2.1649534702301025, + "logps/chosen": -340.9357604980469, + "logps/rejected": -296.375, + "loss": 0.2849, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1962876319885254, + "rewards/margins": 2.259341239929199, + "rewards/rejected": -3.4556286334991455, + "step": 823 + }, + { + "epoch": 0.17, + "learning_rate": 1.66218487394958e-05, + "logits/chosen": -1.9601835012435913, + "logits/rejected": -1.541343331336975, + "logps/chosen": -321.4696350097656, + "logps/rejected": -303.7713623046875, + "loss": 0.2457, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7580763697624207, + "rewards/margins": 4.0705742835998535, + "rewards/rejected": -4.82865047454834, + "step": 824 + }, + { + "epoch": 0.17, + "learning_rate": 1.661764705882353e-05, + "logits/chosen": -2.2011148929595947, + "logits/rejected": -1.8671495914459229, + "logps/chosen": -333.15191650390625, + "logps/rejected": -302.48480224609375, + "loss": 0.5476, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1909797191619873, + "rewards/margins": 2.1786837577819824, + "rewards/rejected": -3.369663715362549, + "step": 825 + }, + { + "epoch": 0.17, + "learning_rate": 1.661344537815126e-05, + "logits/chosen": -1.973969578742981, + "logits/rejected": -1.5397248268127441, + "logps/chosen": -454.9148254394531, + "logps/rejected": -419.6549377441406, + "loss": 0.0814, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6346754431724548, + "rewards/margins": 3.710078239440918, + "rewards/rejected": -4.344753742218018, + "step": 826 + }, + { + "epoch": 0.17, + "learning_rate": 1.6609243697478995e-05, + "logits/chosen": -2.0503973960876465, + "logits/rejected": -1.7811187505722046, + "logps/chosen": -194.16696166992188, + "logps/rejected": -224.06613159179688, + "loss": 0.2646, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3351134061813354, + "rewards/margins": 2.9045636653900146, + "rewards/rejected": -4.2396769523620605, + "step": 827 + }, + { + "epoch": 0.17, + "learning_rate": 1.6605042016806725e-05, + "logits/chosen": -2.504589080810547, + "logits/rejected": -1.7091355323791504, + "logps/chosen": -417.6837158203125, + "logps/rejected": -321.0303039550781, + "loss": 0.2736, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.787346601486206, + "rewards/margins": 3.2485744953155518, + "rewards/rejected": -4.035921096801758, + "step": 828 + }, + { + "epoch": 0.17, + "learning_rate": 1.6600840336134456e-05, + "logits/chosen": -2.1851143836975098, + "logits/rejected": -1.783128261566162, + "logps/chosen": -325.93475341796875, + "logps/rejected": -399.90081787109375, + "loss": 0.2158, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.199167251586914, + "rewards/margins": 3.5577316284179688, + "rewards/rejected": -4.756898880004883, + "step": 829 + }, + { + "epoch": 0.17, + "learning_rate": 1.6596638655462186e-05, + "logits/chosen": -1.9567270278930664, + "logits/rejected": -1.649950385093689, + "logps/chosen": -300.76953125, + "logps/rejected": -272.15728759765625, + "loss": 0.3177, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7872998714447021, + "rewards/margins": 2.7889013290405273, + "rewards/rejected": -4.576201438903809, + "step": 830 + }, + { + "epoch": 0.17, + "learning_rate": 1.659243697478992e-05, + "logits/chosen": -1.9760212898254395, + "logits/rejected": -1.7709592580795288, + "logps/chosen": -315.890380859375, + "logps/rejected": -335.98797607421875, + "loss": 0.5109, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7812278270721436, + "rewards/margins": 3.208256721496582, + "rewards/rejected": -4.989484786987305, + "step": 831 + }, + { + "epoch": 0.17, + "learning_rate": 1.658823529411765e-05, + "logits/chosen": -1.936383605003357, + "logits/rejected": -1.6465673446655273, + "logps/chosen": -330.939453125, + "logps/rejected": -316.66668701171875, + "loss": 0.165, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0802595615386963, + "rewards/margins": 3.58834171295166, + "rewards/rejected": -4.6686015129089355, + "step": 832 + }, + { + "epoch": 0.17, + "learning_rate": 1.658403361344538e-05, + "logits/chosen": -2.3320469856262207, + "logits/rejected": -2.003618001937866, + "logps/chosen": -337.9537048339844, + "logps/rejected": -284.35986328125, + "loss": 0.4036, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8827441930770874, + "rewards/margins": 2.307610511779785, + "rewards/rejected": -3.190354824066162, + "step": 833 + }, + { + "epoch": 0.17, + "learning_rate": 1.657983193277311e-05, + "logits/chosen": -2.048344612121582, + "logits/rejected": -1.9014158248901367, + "logps/chosen": -216.5763397216797, + "logps/rejected": -269.81683349609375, + "loss": 0.289, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.364983081817627, + "rewards/margins": 2.904245376586914, + "rewards/rejected": -4.269228458404541, + "step": 834 + }, + { + "epoch": 0.17, + "learning_rate": 1.6575630252100844e-05, + "logits/chosen": -1.6831631660461426, + "logits/rejected": -1.8635473251342773, + "logps/chosen": -227.89779663085938, + "logps/rejected": -315.51690673828125, + "loss": 0.4791, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.2472732067108154, + "rewards/margins": 2.108879327774048, + "rewards/rejected": -4.356152534484863, + "step": 835 + }, + { + "epoch": 0.17, + "learning_rate": 1.6571428571428574e-05, + "logits/chosen": -1.9859883785247803, + "logits/rejected": -1.6807833909988403, + "logps/chosen": -338.3689880371094, + "logps/rejected": -323.92547607421875, + "loss": 0.4065, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.162870407104492, + "rewards/margins": 3.001528263092041, + "rewards/rejected": -5.164398670196533, + "step": 836 + }, + { + "epoch": 0.18, + "learning_rate": 1.6567226890756304e-05, + "logits/chosen": -1.9614168405532837, + "logits/rejected": -1.7866511344909668, + "logps/chosen": -331.42034912109375, + "logps/rejected": -349.8808288574219, + "loss": 0.5605, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.6788291931152344, + "rewards/margins": 2.541520595550537, + "rewards/rejected": -5.220349311828613, + "step": 837 + }, + { + "epoch": 0.18, + "learning_rate": 1.6563025210084034e-05, + "logits/chosen": -2.057004928588867, + "logits/rejected": -1.7994532585144043, + "logps/chosen": -362.586669921875, + "logps/rejected": -332.658447265625, + "loss": 0.4787, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.59127938747406, + "rewards/margins": 2.264190196990967, + "rewards/rejected": -3.8554694652557373, + "step": 838 + }, + { + "epoch": 0.18, + "learning_rate": 1.6558823529411765e-05, + "logits/chosen": -1.9931203126907349, + "logits/rejected": -1.9703037738800049, + "logps/chosen": -357.2671203613281, + "logps/rejected": -421.91998291015625, + "loss": 0.5829, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6607378721237183, + "rewards/margins": 2.8475656509399414, + "rewards/rejected": -4.508303642272949, + "step": 839 + }, + { + "epoch": 0.18, + "learning_rate": 1.6554621848739495e-05, + "logits/chosen": -1.7824808359146118, + "logits/rejected": -2.113828659057617, + "logps/chosen": -304.1899108886719, + "logps/rejected": -386.22314453125, + "loss": 0.2567, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4706379175186157, + "rewards/margins": 3.265458583831787, + "rewards/rejected": -4.736096382141113, + "step": 840 + }, + { + "epoch": 0.18, + "learning_rate": 1.655042016806723e-05, + "logits/chosen": -1.8116238117218018, + "logits/rejected": -1.7838225364685059, + "logps/chosen": -268.669921875, + "logps/rejected": -266.62542724609375, + "loss": 0.2902, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4213013648986816, + "rewards/margins": 2.80891489982605, + "rewards/rejected": -4.2302165031433105, + "step": 841 + }, + { + "epoch": 0.18, + "learning_rate": 1.654621848739496e-05, + "logits/chosen": -2.097501277923584, + "logits/rejected": -1.7903889417648315, + "logps/chosen": -397.4696960449219, + "logps/rejected": -374.2898254394531, + "loss": 0.4986, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.5505564212799072, + "rewards/margins": 2.56294322013855, + "rewards/rejected": -5.113499641418457, + "step": 842 + }, + { + "epoch": 0.18, + "learning_rate": 1.654201680672269e-05, + "logits/chosen": -2.309311866760254, + "logits/rejected": -1.962480902671814, + "logps/chosen": -397.01068115234375, + "logps/rejected": -334.1055908203125, + "loss": 0.6108, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3525598049163818, + "rewards/margins": 2.0506162643432617, + "rewards/rejected": -3.4031758308410645, + "step": 843 + }, + { + "epoch": 0.18, + "learning_rate": 1.653781512605042e-05, + "logits/chosen": -1.9976651668548584, + "logits/rejected": -1.823923110961914, + "logps/chosen": -345.355224609375, + "logps/rejected": -402.2070007324219, + "loss": 0.1688, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3158338069915771, + "rewards/margins": 3.419156551361084, + "rewards/rejected": -4.734990119934082, + "step": 844 + }, + { + "epoch": 0.18, + "learning_rate": 1.6533613445378153e-05, + "logits/chosen": -1.9651539325714111, + "logits/rejected": -2.055600166320801, + "logps/chosen": -268.68121337890625, + "logps/rejected": -320.0924377441406, + "loss": 0.5162, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.2614715099334717, + "rewards/margins": 2.5186023712158203, + "rewards/rejected": -4.780073642730713, + "step": 845 + }, + { + "epoch": 0.18, + "learning_rate": 1.6529411764705883e-05, + "logits/chosen": -1.9466253519058228, + "logits/rejected": -1.5661593675613403, + "logps/chosen": -360.6986083984375, + "logps/rejected": -369.8818359375, + "loss": 0.3559, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6444405317306519, + "rewards/margins": 2.702845335006714, + "rewards/rejected": -4.347285747528076, + "step": 846 + }, + { + "epoch": 0.18, + "learning_rate": 1.6525210084033613e-05, + "logits/chosen": -2.030992031097412, + "logits/rejected": -1.8643478155136108, + "logps/chosen": -302.4504089355469, + "logps/rejected": -300.9117736816406, + "loss": 0.201, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0876160860061646, + "rewards/margins": 3.9334614276885986, + "rewards/rejected": -5.021077632904053, + "step": 847 + }, + { + "epoch": 0.18, + "learning_rate": 1.6521008403361343e-05, + "logits/chosen": -1.9273464679718018, + "logits/rejected": -1.8194384574890137, + "logps/chosen": -381.476806640625, + "logps/rejected": -406.6851501464844, + "loss": 0.306, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0414516925811768, + "rewards/margins": 2.5162088871002197, + "rewards/rejected": -3.5576605796813965, + "step": 848 + }, + { + "epoch": 0.18, + "learning_rate": 1.6516806722689077e-05, + "logits/chosen": -1.8308701515197754, + "logits/rejected": -1.709214210510254, + "logps/chosen": -380.6348876953125, + "logps/rejected": -396.2377624511719, + "loss": 0.3121, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2020562887191772, + "rewards/margins": 2.959399938583374, + "rewards/rejected": -4.16145658493042, + "step": 849 + }, + { + "epoch": 0.18, + "learning_rate": 1.6512605042016807e-05, + "logits/chosen": -2.2406840324401855, + "logits/rejected": -1.7540903091430664, + "logps/chosen": -352.529052734375, + "logps/rejected": -291.0569152832031, + "loss": 0.2422, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.038619041442871, + "rewards/margins": 3.9786553382873535, + "rewards/rejected": -5.017273902893066, + "step": 850 + }, + { + "epoch": 0.18, + "learning_rate": 1.6508403361344537e-05, + "logits/chosen": -2.294544219970703, + "logits/rejected": -2.259855270385742, + "logps/chosen": -350.5856628417969, + "logps/rejected": -408.121337890625, + "loss": 0.4345, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.87151038646698, + "rewards/margins": 2.057766914367676, + "rewards/rejected": -2.929277181625366, + "step": 851 + }, + { + "epoch": 0.18, + "learning_rate": 1.650420168067227e-05, + "logits/chosen": -2.2867562770843506, + "logits/rejected": -2.1423451900482178, + "logps/chosen": -176.26080322265625, + "logps/rejected": -226.0680694580078, + "loss": 0.3108, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2988001108169556, + "rewards/margins": 3.1676506996154785, + "rewards/rejected": -4.4664506912231445, + "step": 852 + }, + { + "epoch": 0.18, + "learning_rate": 1.65e-05, + "logits/chosen": -2.1745870113372803, + "logits/rejected": -2.0351154804229736, + "logps/chosen": -319.2963562011719, + "logps/rejected": -310.81927490234375, + "loss": 0.395, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1311728954315186, + "rewards/margins": 2.9557228088378906, + "rewards/rejected": -4.08689546585083, + "step": 853 + }, + { + "epoch": 0.18, + "learning_rate": 1.649579831932773e-05, + "logits/chosen": -1.8942925930023193, + "logits/rejected": -2.042156934738159, + "logps/chosen": -322.89141845703125, + "logps/rejected": -362.81488037109375, + "loss": 0.6047, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9837080240249634, + "rewards/margins": 2.8692259788513184, + "rewards/rejected": -3.852933883666992, + "step": 854 + }, + { + "epoch": 0.18, + "learning_rate": 1.6491596638655462e-05, + "logits/chosen": -1.9540199041366577, + "logits/rejected": -2.148458957672119, + "logps/chosen": -200.40017700195312, + "logps/rejected": -272.02374267578125, + "loss": 0.3267, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5883879661560059, + "rewards/margins": 2.9480838775634766, + "rewards/rejected": -4.536471843719482, + "step": 855 + }, + { + "epoch": 0.18, + "learning_rate": 1.6487394957983195e-05, + "logits/chosen": -2.0282158851623535, + "logits/rejected": -1.984845519065857, + "logps/chosen": -234.03158569335938, + "logps/rejected": -297.2955322265625, + "loss": 0.1634, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9600425958633423, + "rewards/margins": 3.9911789894104004, + "rewards/rejected": -4.951221942901611, + "step": 856 + }, + { + "epoch": 0.18, + "learning_rate": 1.6483193277310926e-05, + "logits/chosen": -2.0615880489349365, + "logits/rejected": -1.6632945537567139, + "logps/chosen": -348.844482421875, + "logps/rejected": -317.849365234375, + "loss": 0.4408, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9168353080749512, + "rewards/margins": 3.5147628784179688, + "rewards/rejected": -4.43159818649292, + "step": 857 + }, + { + "epoch": 0.18, + "learning_rate": 1.6478991596638656e-05, + "logits/chosen": -2.010307788848877, + "logits/rejected": -1.8121495246887207, + "logps/chosen": -238.03521728515625, + "logps/rejected": -237.03298950195312, + "loss": 0.1395, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5243815183639526, + "rewards/margins": 4.120826721191406, + "rewards/rejected": -4.645208358764648, + "step": 858 + }, + { + "epoch": 0.18, + "learning_rate": 1.6474789915966386e-05, + "logits/chosen": -2.1482393741607666, + "logits/rejected": -1.954933762550354, + "logps/chosen": -253.74533081054688, + "logps/rejected": -296.804443359375, + "loss": 0.2187, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9486840963363647, + "rewards/margins": 4.430158615112305, + "rewards/rejected": -5.378842353820801, + "step": 859 + }, + { + "epoch": 0.18, + "learning_rate": 1.647058823529412e-05, + "logits/chosen": -2.2143912315368652, + "logits/rejected": -1.9657034873962402, + "logps/chosen": -379.7645263671875, + "logps/rejected": -453.1151428222656, + "loss": 0.3534, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.390915870666504, + "rewards/margins": 3.3093655109405518, + "rewards/rejected": -4.700282096862793, + "step": 860 + }, + { + "epoch": 0.18, + "learning_rate": 1.646638655462185e-05, + "logits/chosen": -2.304980516433716, + "logits/rejected": -2.093060255050659, + "logps/chosen": -292.78802490234375, + "logps/rejected": -328.1954650878906, + "loss": 0.5197, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2974127531051636, + "rewards/margins": 1.7077938318252563, + "rewards/rejected": -3.005206346511841, + "step": 861 + }, + { + "epoch": 0.18, + "learning_rate": 1.646218487394958e-05, + "logits/chosen": -1.9428411722183228, + "logits/rejected": -1.5828006267547607, + "logps/chosen": -298.0184326171875, + "logps/rejected": -261.4896240234375, + "loss": 0.3705, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3908194303512573, + "rewards/margins": 1.7250456809997559, + "rewards/rejected": -3.1158649921417236, + "step": 862 + }, + { + "epoch": 0.18, + "learning_rate": 1.645798319327731e-05, + "logits/chosen": -2.2632293701171875, + "logits/rejected": -2.421980619430542, + "logps/chosen": -320.84283447265625, + "logps/rejected": -354.6207275390625, + "loss": 0.3628, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3431496620178223, + "rewards/margins": 2.89518404006958, + "rewards/rejected": -4.238333702087402, + "step": 863 + }, + { + "epoch": 0.18, + "learning_rate": 1.6453781512605044e-05, + "logits/chosen": -2.2326416969299316, + "logits/rejected": -1.7632266283035278, + "logps/chosen": -416.22735595703125, + "logps/rejected": -413.9497985839844, + "loss": 0.1922, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.05262170732021332, + "rewards/margins": 3.9401092529296875, + "rewards/rejected": -3.9927310943603516, + "step": 864 + }, + { + "epoch": 0.18, + "learning_rate": 1.6449579831932774e-05, + "logits/chosen": -2.2643580436706543, + "logits/rejected": -1.5607458353042603, + "logps/chosen": -383.83551025390625, + "logps/rejected": -321.55718994140625, + "loss": 0.1777, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.30315354466438293, + "rewards/margins": 3.521136522293091, + "rewards/rejected": -3.8242900371551514, + "step": 865 + }, + { + "epoch": 0.18, + "learning_rate": 1.6445378151260504e-05, + "logits/chosen": -1.699146032333374, + "logits/rejected": -1.956099271774292, + "logps/chosen": -245.18296813964844, + "logps/rejected": -312.64801025390625, + "loss": 0.3764, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6060068607330322, + "rewards/margins": 2.6803605556488037, + "rewards/rejected": -4.286367416381836, + "step": 866 + }, + { + "epoch": 0.18, + "learning_rate": 1.6441176470588235e-05, + "logits/chosen": -2.0695505142211914, + "logits/rejected": -1.6839985847473145, + "logps/chosen": -302.244873046875, + "logps/rejected": -303.36810302734375, + "loss": 0.4166, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0057995319366455, + "rewards/margins": 2.0823960304260254, + "rewards/rejected": -3.08819580078125, + "step": 867 + }, + { + "epoch": 0.18, + "learning_rate": 1.6436974789915968e-05, + "logits/chosen": -1.829862356185913, + "logits/rejected": -2.0130064487457275, + "logps/chosen": -224.52940368652344, + "logps/rejected": -285.8321533203125, + "loss": 0.3092, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3459399938583374, + "rewards/margins": 2.6092429161071777, + "rewards/rejected": -3.9551827907562256, + "step": 868 + }, + { + "epoch": 0.18, + "learning_rate": 1.64327731092437e-05, + "logits/chosen": -2.3632571697235107, + "logits/rejected": -1.8849446773529053, + "logps/chosen": -388.83197021484375, + "logps/rejected": -340.25634765625, + "loss": 0.3105, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5788600444793701, + "rewards/margins": 2.7900514602661133, + "rewards/rejected": -3.3689115047454834, + "step": 869 + }, + { + "epoch": 0.18, + "learning_rate": 1.642857142857143e-05, + "logits/chosen": -2.087378978729248, + "logits/rejected": -1.8918813467025757, + "logps/chosen": -241.5161590576172, + "logps/rejected": -268.33074951171875, + "loss": 0.4724, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0371454954147339, + "rewards/margins": 2.611354112625122, + "rewards/rejected": -3.6484994888305664, + "step": 870 + }, + { + "epoch": 0.18, + "learning_rate": 1.642436974789916e-05, + "logits/chosen": -2.0778613090515137, + "logits/rejected": -1.9503037929534912, + "logps/chosen": -239.8323516845703, + "logps/rejected": -262.5771789550781, + "loss": 0.1792, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8480715155601501, + "rewards/margins": 3.899735927581787, + "rewards/rejected": -4.747807502746582, + "step": 871 + }, + { + "epoch": 0.18, + "learning_rate": 1.6420168067226892e-05, + "logits/chosen": -1.9055280685424805, + "logits/rejected": -1.6988685131072998, + "logps/chosen": -207.60768127441406, + "logps/rejected": -249.6638641357422, + "loss": 0.214, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3459224700927734, + "rewards/margins": 2.718294382095337, + "rewards/rejected": -4.064216613769531, + "step": 872 + }, + { + "epoch": 0.18, + "learning_rate": 1.6415966386554623e-05, + "logits/chosen": -2.025665760040283, + "logits/rejected": -1.8769866228103638, + "logps/chosen": -365.9964599609375, + "logps/rejected": -341.34466552734375, + "loss": 0.1997, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0866661071777344, + "rewards/margins": 3.3571691513061523, + "rewards/rejected": -4.443835258483887, + "step": 873 + }, + { + "epoch": 0.18, + "learning_rate": 1.6411764705882353e-05, + "logits/chosen": -2.006777763366699, + "logits/rejected": -1.6285200119018555, + "logps/chosen": -291.044189453125, + "logps/rejected": -305.30926513671875, + "loss": 0.342, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0532143115997314, + "rewards/margins": 2.9847567081451416, + "rewards/rejected": -4.037971496582031, + "step": 874 + }, + { + "epoch": 0.18, + "learning_rate": 1.6407563025210086e-05, + "logits/chosen": -1.941154956817627, + "logits/rejected": -1.896328091621399, + "logps/chosen": -226.91986083984375, + "logps/rejected": -321.24261474609375, + "loss": 0.3292, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8371610641479492, + "rewards/margins": 2.636752128601074, + "rewards/rejected": -4.473913669586182, + "step": 875 + }, + { + "epoch": 0.18, + "learning_rate": 1.6403361344537817e-05, + "logits/chosen": -1.7844109535217285, + "logits/rejected": -1.823304295539856, + "logps/chosen": -229.37229919433594, + "logps/rejected": -377.96490478515625, + "loss": 0.1902, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5230915546417236, + "rewards/margins": 4.438862323760986, + "rewards/rejected": -5.961954116821289, + "step": 876 + }, + { + "epoch": 0.18, + "learning_rate": 1.6399159663865547e-05, + "logits/chosen": -2.117547035217285, + "logits/rejected": -1.943943738937378, + "logps/chosen": -208.5506591796875, + "logps/rejected": -260.7738342285156, + "loss": 0.655, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.129209280014038, + "rewards/margins": 3.2130351066589355, + "rewards/rejected": -5.342244625091553, + "step": 877 + }, + { + "epoch": 0.18, + "learning_rate": 1.6394957983193277e-05, + "logits/chosen": -1.8762304782867432, + "logits/rejected": -2.008063554763794, + "logps/chosen": -341.86859130859375, + "logps/rejected": -434.216796875, + "loss": 0.5347, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9512302875518799, + "rewards/margins": 2.597323417663574, + "rewards/rejected": -3.548553466796875, + "step": 878 + }, + { + "epoch": 0.18, + "learning_rate": 1.639075630252101e-05, + "logits/chosen": -1.9159164428710938, + "logits/rejected": -1.9696593284606934, + "logps/chosen": -201.14906311035156, + "logps/rejected": -301.0068054199219, + "loss": 0.6713, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.636357307434082, + "rewards/margins": 2.3764381408691406, + "rewards/rejected": -4.012795448303223, + "step": 879 + }, + { + "epoch": 0.18, + "learning_rate": 1.638655462184874e-05, + "logits/chosen": -2.1168620586395264, + "logits/rejected": -2.0773792266845703, + "logps/chosen": -270.95245361328125, + "logps/rejected": -291.47772216796875, + "loss": 0.15, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3984571695327759, + "rewards/margins": 3.4344632625579834, + "rewards/rejected": -4.832920074462891, + "step": 880 + }, + { + "epoch": 0.18, + "learning_rate": 1.638235294117647e-05, + "logits/chosen": -1.848799705505371, + "logits/rejected": -1.8525713682174683, + "logps/chosen": -355.29351806640625, + "logps/rejected": -310.83978271484375, + "loss": 0.1713, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.34663504362106323, + "rewards/margins": 3.317166805267334, + "rewards/rejected": -3.663801670074463, + "step": 881 + }, + { + "epoch": 0.18, + "learning_rate": 1.63781512605042e-05, + "logits/chosen": -1.7868198156356812, + "logits/rejected": -1.6365983486175537, + "logps/chosen": -247.5361785888672, + "logps/rejected": -322.70684814453125, + "loss": 0.1945, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.913313865661621, + "rewards/margins": 4.205984115600586, + "rewards/rejected": -6.119297981262207, + "step": 882 + }, + { + "epoch": 0.18, + "learning_rate": 1.6373949579831935e-05, + "logits/chosen": -1.8090060949325562, + "logits/rejected": -1.876386284828186, + "logps/chosen": -318.5756530761719, + "logps/rejected": -351.6622619628906, + "loss": 0.4159, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3252978324890137, + "rewards/margins": 2.5264720916748047, + "rewards/rejected": -3.8517696857452393, + "step": 883 + }, + { + "epoch": 0.18, + "learning_rate": 1.6369747899159665e-05, + "logits/chosen": -2.0742955207824707, + "logits/rejected": -1.9202224016189575, + "logps/chosen": -331.4919738769531, + "logps/rejected": -353.0850830078125, + "loss": 0.4026, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6761324405670166, + "rewards/margins": 4.3154191970825195, + "rewards/rejected": -5.991551876068115, + "step": 884 + }, + { + "epoch": 0.19, + "learning_rate": 1.6365546218487395e-05, + "logits/chosen": -1.9296650886535645, + "logits/rejected": -1.9566481113433838, + "logps/chosen": -250.49246215820312, + "logps/rejected": -281.1885986328125, + "loss": 0.3591, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2915005683898926, + "rewards/margins": 3.8833706378936768, + "rewards/rejected": -5.17487096786499, + "step": 885 + }, + { + "epoch": 0.19, + "learning_rate": 1.6361344537815126e-05, + "logits/chosen": -1.8366259336471558, + "logits/rejected": -1.8812752962112427, + "logps/chosen": -335.166259765625, + "logps/rejected": -344.3279113769531, + "loss": 0.9352, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.160041332244873, + "rewards/margins": 1.4995570182800293, + "rewards/rejected": -3.6595983505249023, + "step": 886 + }, + { + "epoch": 0.19, + "learning_rate": 1.635714285714286e-05, + "logits/chosen": -2.2893869876861572, + "logits/rejected": -2.0035924911499023, + "logps/chosen": -508.09478759765625, + "logps/rejected": -332.3266296386719, + "loss": 0.221, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2859076261520386, + "rewards/margins": 3.980938673019409, + "rewards/rejected": -5.266845703125, + "step": 887 + }, + { + "epoch": 0.19, + "learning_rate": 1.635294117647059e-05, + "logits/chosen": -2.169868230819702, + "logits/rejected": -1.9147133827209473, + "logps/chosen": -229.88088989257812, + "logps/rejected": -302.2627868652344, + "loss": 0.3355, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2932333946228027, + "rewards/margins": 3.5300803184509277, + "rewards/rejected": -4.8233137130737305, + "step": 888 + }, + { + "epoch": 0.19, + "learning_rate": 1.634873949579832e-05, + "logits/chosen": -1.8004443645477295, + "logits/rejected": -1.66507887840271, + "logps/chosen": -366.08819580078125, + "logps/rejected": -355.1624755859375, + "loss": 0.1311, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2386223077774048, + "rewards/margins": 4.955789566040039, + "rewards/rejected": -6.194411277770996, + "step": 889 + }, + { + "epoch": 0.19, + "learning_rate": 1.634453781512605e-05, + "logits/chosen": -1.8324013948440552, + "logits/rejected": -1.7521955966949463, + "logps/chosen": -276.56988525390625, + "logps/rejected": -264.121826171875, + "loss": 0.6167, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6577619314193726, + "rewards/margins": 1.9621384143829346, + "rewards/rejected": -3.6199002265930176, + "step": 890 + }, + { + "epoch": 0.19, + "learning_rate": 1.6340336134453784e-05, + "logits/chosen": -2.2964229583740234, + "logits/rejected": -1.6694972515106201, + "logps/chosen": -425.6768798828125, + "logps/rejected": -321.85736083984375, + "loss": 0.197, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5942802429199219, + "rewards/margins": 3.9484291076660156, + "rewards/rejected": -5.5427093505859375, + "step": 891 + }, + { + "epoch": 0.19, + "learning_rate": 1.6336134453781514e-05, + "logits/chosen": -1.870035171508789, + "logits/rejected": -1.6253803968429565, + "logps/chosen": -314.9829406738281, + "logps/rejected": -321.0347900390625, + "loss": 0.5274, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.105365037918091, + "rewards/margins": 2.045036554336548, + "rewards/rejected": -4.150401592254639, + "step": 892 + }, + { + "epoch": 0.19, + "learning_rate": 1.6331932773109244e-05, + "logits/chosen": -2.0040812492370605, + "logits/rejected": -2.1056299209594727, + "logps/chosen": -261.9435729980469, + "logps/rejected": -307.0917053222656, + "loss": 0.5958, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.740065097808838, + "rewards/margins": 1.639905333518982, + "rewards/rejected": -3.3799703121185303, + "step": 893 + }, + { + "epoch": 0.19, + "learning_rate": 1.6327731092436974e-05, + "logits/chosen": -1.840165376663208, + "logits/rejected": -1.7288068532943726, + "logps/chosen": -256.5667419433594, + "logps/rejected": -288.83355712890625, + "loss": 0.2113, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.358893632888794, + "rewards/margins": 2.9421653747558594, + "rewards/rejected": -4.301058769226074, + "step": 894 + }, + { + "epoch": 0.19, + "learning_rate": 1.6323529411764708e-05, + "logits/chosen": -2.0562233924865723, + "logits/rejected": -1.8470474481582642, + "logps/chosen": -287.37371826171875, + "logps/rejected": -330.51934814453125, + "loss": 0.3998, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.304749608039856, + "rewards/margins": 3.2410483360290527, + "rewards/rejected": -4.545797824859619, + "step": 895 + }, + { + "epoch": 0.19, + "learning_rate": 1.6319327731092438e-05, + "logits/chosen": -2.3353686332702637, + "logits/rejected": -2.2728850841522217, + "logps/chosen": -353.1683654785156, + "logps/rejected": -323.61065673828125, + "loss": 0.2424, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6277364492416382, + "rewards/margins": 2.9938278198242188, + "rewards/rejected": -4.621563911437988, + "step": 896 + }, + { + "epoch": 0.19, + "learning_rate": 1.6315126050420168e-05, + "logits/chosen": -2.0406594276428223, + "logits/rejected": -1.948315143585205, + "logps/chosen": -300.9489440917969, + "logps/rejected": -305.4774475097656, + "loss": 0.282, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2815067768096924, + "rewards/margins": 3.31465482711792, + "rewards/rejected": -4.596161365509033, + "step": 897 + }, + { + "epoch": 0.19, + "learning_rate": 1.6310924369747902e-05, + "logits/chosen": -2.0638017654418945, + "logits/rejected": -2.173253059387207, + "logps/chosen": -266.5672302246094, + "logps/rejected": -341.09552001953125, + "loss": 0.4387, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6528520584106445, + "rewards/margins": 1.8177275657653809, + "rewards/rejected": -3.4705796241760254, + "step": 898 + }, + { + "epoch": 0.19, + "learning_rate": 1.6306722689075632e-05, + "logits/chosen": -2.0284862518310547, + "logits/rejected": -1.877831220626831, + "logps/chosen": -242.322509765625, + "logps/rejected": -253.47434997558594, + "loss": 0.2654, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3718500137329102, + "rewards/margins": 3.1581337451934814, + "rewards/rejected": -4.5299835205078125, + "step": 899 + }, + { + "epoch": 0.19, + "learning_rate": 1.6302521008403362e-05, + "logits/chosen": -1.961199402809143, + "logits/rejected": -2.235602378845215, + "logps/chosen": -327.5709228515625, + "logps/rejected": -392.82080078125, + "loss": 0.207, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6375327110290527, + "rewards/margins": 3.892061710357666, + "rewards/rejected": -5.529594421386719, + "step": 900 + }, + { + "epoch": 0.19, + "learning_rate": 1.6298319327731093e-05, + "logits/chosen": -2.0290136337280273, + "logits/rejected": -1.696539044380188, + "logps/chosen": -334.60601806640625, + "logps/rejected": -262.968505859375, + "loss": 0.1021, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1712181568145752, + "rewards/margins": 4.417806625366211, + "rewards/rejected": -5.589025020599365, + "step": 901 + }, + { + "epoch": 0.19, + "learning_rate": 1.6294117647058826e-05, + "logits/chosen": -2.2502691745758057, + "logits/rejected": -2.1580700874328613, + "logps/chosen": -271.84765625, + "logps/rejected": -209.18585205078125, + "loss": 0.5402, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5988433361053467, + "rewards/margins": 2.7156684398651123, + "rewards/rejected": -4.314511775970459, + "step": 902 + }, + { + "epoch": 0.19, + "learning_rate": 1.6289915966386556e-05, + "logits/chosen": -2.0320043563842773, + "logits/rejected": -1.588046669960022, + "logps/chosen": -393.6407775878906, + "logps/rejected": -276.511962890625, + "loss": 0.4197, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.903933048248291, + "rewards/margins": 2.1499881744384766, + "rewards/rejected": -4.053921699523926, + "step": 903 + }, + { + "epoch": 0.19, + "learning_rate": 1.6285714285714287e-05, + "logits/chosen": -2.47733211517334, + "logits/rejected": -1.9541778564453125, + "logps/chosen": -443.1294250488281, + "logps/rejected": -330.012939453125, + "loss": 0.2284, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0894834995269775, + "rewards/margins": 3.156179189682007, + "rewards/rejected": -4.245662689208984, + "step": 904 + }, + { + "epoch": 0.19, + "learning_rate": 1.6281512605042017e-05, + "logits/chosen": -2.060857057571411, + "logits/rejected": -1.7653508186340332, + "logps/chosen": -318.81451416015625, + "logps/rejected": -241.42276000976562, + "loss": 0.4835, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8648567199707031, + "rewards/margins": 1.6097255945205688, + "rewards/rejected": -3.4745821952819824, + "step": 905 + }, + { + "epoch": 0.19, + "learning_rate": 1.627731092436975e-05, + "logits/chosen": -2.1348257064819336, + "logits/rejected": -1.7093878984451294, + "logps/chosen": -285.4877014160156, + "logps/rejected": -290.025634765625, + "loss": 0.3394, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.996217131614685, + "rewards/margins": 2.865814208984375, + "rewards/rejected": -4.862030982971191, + "step": 906 + }, + { + "epoch": 0.19, + "learning_rate": 1.627310924369748e-05, + "logits/chosen": -1.5797868967056274, + "logits/rejected": -1.4345656633377075, + "logps/chosen": -342.4110107421875, + "logps/rejected": -269.3785095214844, + "loss": 0.3566, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7635388374328613, + "rewards/margins": 2.652501344680786, + "rewards/rejected": -4.416040420532227, + "step": 907 + }, + { + "epoch": 0.19, + "learning_rate": 1.626890756302521e-05, + "logits/chosen": -2.080841064453125, + "logits/rejected": -1.4903613328933716, + "logps/chosen": -443.020263671875, + "logps/rejected": -397.37432861328125, + "loss": 0.1821, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0566246509552002, + "rewards/margins": 3.6935763359069824, + "rewards/rejected": -4.7502007484436035, + "step": 908 + }, + { + "epoch": 0.19, + "learning_rate": 1.626470588235294e-05, + "logits/chosen": -2.2648098468780518, + "logits/rejected": -1.8361196517944336, + "logps/chosen": -390.60595703125, + "logps/rejected": -379.9647216796875, + "loss": 0.0849, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.834219217300415, + "rewards/margins": 3.8679771423339844, + "rewards/rejected": -4.70219612121582, + "step": 909 + }, + { + "epoch": 0.19, + "learning_rate": 1.6260504201680675e-05, + "logits/chosen": -1.9963374137878418, + "logits/rejected": -2.1134889125823975, + "logps/chosen": -352.23309326171875, + "logps/rejected": -386.251953125, + "loss": 0.267, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7679274082183838, + "rewards/margins": 2.7292299270629883, + "rewards/rejected": -4.497157096862793, + "step": 910 + }, + { + "epoch": 0.19, + "learning_rate": 1.6256302521008405e-05, + "logits/chosen": -2.139585018157959, + "logits/rejected": -1.826736569404602, + "logps/chosen": -305.2021179199219, + "logps/rejected": -273.81103515625, + "loss": 0.2705, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1817421913146973, + "rewards/margins": 2.803069591522217, + "rewards/rejected": -3.984811782836914, + "step": 911 + }, + { + "epoch": 0.19, + "learning_rate": 1.6252100840336135e-05, + "logits/chosen": -1.981002688407898, + "logits/rejected": -2.0703396797180176, + "logps/chosen": -406.9721984863281, + "logps/rejected": -371.20440673828125, + "loss": 0.2799, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1434235572814941, + "rewards/margins": 3.0727317333221436, + "rewards/rejected": -4.216155052185059, + "step": 912 + }, + { + "epoch": 0.19, + "learning_rate": 1.6247899159663865e-05, + "logits/chosen": -1.7255853414535522, + "logits/rejected": -1.8718693256378174, + "logps/chosen": -286.42413330078125, + "logps/rejected": -319.8940124511719, + "loss": 0.3252, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.128069281578064, + "rewards/margins": 2.729300022125244, + "rewards/rejected": -3.8573689460754395, + "step": 913 + }, + { + "epoch": 0.19, + "learning_rate": 1.62436974789916e-05, + "logits/chosen": -1.887884259223938, + "logits/rejected": -1.9022674560546875, + "logps/chosen": -256.7900085449219, + "logps/rejected": -269.56280517578125, + "loss": 0.7025, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.0569393634796143, + "rewards/margins": 1.2357622385025024, + "rewards/rejected": -3.292701482772827, + "step": 914 + }, + { + "epoch": 0.19, + "learning_rate": 1.623949579831933e-05, + "logits/chosen": -2.2448368072509766, + "logits/rejected": -1.9794319868087769, + "logps/chosen": -379.6931457519531, + "logps/rejected": -299.73516845703125, + "loss": 0.2382, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9326602816581726, + "rewards/margins": 3.316673755645752, + "rewards/rejected": -4.249334335327148, + "step": 915 + }, + { + "epoch": 0.19, + "learning_rate": 1.623529411764706e-05, + "logits/chosen": -1.8805540800094604, + "logits/rejected": -1.561633825302124, + "logps/chosen": -298.3570556640625, + "logps/rejected": -316.96923828125, + "loss": 0.3102, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9303977489471436, + "rewards/margins": 2.5443007946014404, + "rewards/rejected": -4.474699020385742, + "step": 916 + }, + { + "epoch": 0.19, + "learning_rate": 1.623109243697479e-05, + "logits/chosen": -2.048431396484375, + "logits/rejected": -1.9180465936660767, + "logps/chosen": -295.2594909667969, + "logps/rejected": -264.12872314453125, + "loss": 0.2324, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1292060613632202, + "rewards/margins": 3.2113733291625977, + "rewards/rejected": -4.340579032897949, + "step": 917 + }, + { + "epoch": 0.19, + "learning_rate": 1.6226890756302523e-05, + "logits/chosen": -2.3067197799682617, + "logits/rejected": -2.1548309326171875, + "logps/chosen": -301.251708984375, + "logps/rejected": -332.4122009277344, + "loss": 0.4744, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0159692764282227, + "rewards/margins": 1.5090093612670898, + "rewards/rejected": -2.5249786376953125, + "step": 918 + }, + { + "epoch": 0.19, + "learning_rate": 1.6222689075630253e-05, + "logits/chosen": -1.8239529132843018, + "logits/rejected": -1.8729701042175293, + "logps/chosen": -301.16497802734375, + "logps/rejected": -319.0786437988281, + "loss": 0.3685, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8851511478424072, + "rewards/margins": 2.9885482788085938, + "rewards/rejected": -3.87369966506958, + "step": 919 + }, + { + "epoch": 0.19, + "learning_rate": 1.6218487394957984e-05, + "logits/chosen": -2.174138307571411, + "logits/rejected": -1.890104055404663, + "logps/chosen": -558.3136596679688, + "logps/rejected": -482.9833984375, + "loss": 0.157, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0088156461715698, + "rewards/margins": 3.641759157180786, + "rewards/rejected": -4.650574684143066, + "step": 920 + }, + { + "epoch": 0.19, + "learning_rate": 1.6214285714285717e-05, + "logits/chosen": -1.9609408378601074, + "logits/rejected": -1.9450379610061646, + "logps/chosen": -166.66647338867188, + "logps/rejected": -230.41082763671875, + "loss": 0.3405, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8020756840705872, + "rewards/margins": 2.7518205642700195, + "rewards/rejected": -3.553896188735962, + "step": 921 + }, + { + "epoch": 0.19, + "learning_rate": 1.6210084033613448e-05, + "logits/chosen": -2.2108054161071777, + "logits/rejected": -1.811680793762207, + "logps/chosen": -468.9318542480469, + "logps/rejected": -335.023193359375, + "loss": 0.2847, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9263225793838501, + "rewards/margins": 2.652930736541748, + "rewards/rejected": -3.5792531967163086, + "step": 922 + }, + { + "epoch": 0.19, + "learning_rate": 1.6205882352941178e-05, + "logits/chosen": -2.0254106521606445, + "logits/rejected": -1.9421586990356445, + "logps/chosen": -411.4443664550781, + "logps/rejected": -476.85455322265625, + "loss": 0.2413, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.49983686208724976, + "rewards/margins": 2.882030725479126, + "rewards/rejected": -3.3818676471710205, + "step": 923 + }, + { + "epoch": 0.19, + "learning_rate": 1.6201680672268908e-05, + "logits/chosen": -2.1283533573150635, + "logits/rejected": -1.5643560886383057, + "logps/chosen": -213.84304809570312, + "logps/rejected": -213.2480010986328, + "loss": 0.3522, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.468817949295044, + "rewards/margins": 2.8268935680389404, + "rewards/rejected": -4.295711517333984, + "step": 924 + }, + { + "epoch": 0.19, + "learning_rate": 1.619747899159664e-05, + "logits/chosen": -2.2803688049316406, + "logits/rejected": -2.024850606918335, + "logps/chosen": -513.8499145507812, + "logps/rejected": -487.3311767578125, + "loss": 0.2643, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.35037142038345337, + "rewards/margins": 5.109805107116699, + "rewards/rejected": -5.460176467895508, + "step": 925 + }, + { + "epoch": 0.19, + "learning_rate": 1.6193277310924372e-05, + "logits/chosen": -2.05611252784729, + "logits/rejected": -1.5205470323562622, + "logps/chosen": -338.35125732421875, + "logps/rejected": -348.0825500488281, + "loss": 0.1334, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2922396659851074, + "rewards/margins": 4.227748870849609, + "rewards/rejected": -5.519988059997559, + "step": 926 + }, + { + "epoch": 0.19, + "learning_rate": 1.6189075630252102e-05, + "logits/chosen": -1.9922364950180054, + "logits/rejected": -1.9709713459014893, + "logps/chosen": -340.4877014160156, + "logps/rejected": -306.7227478027344, + "loss": 0.4801, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.722760558128357, + "rewards/margins": 2.3541500568389893, + "rewards/rejected": -4.076910972595215, + "step": 927 + }, + { + "epoch": 0.19, + "learning_rate": 1.6184873949579832e-05, + "logits/chosen": -2.097243309020996, + "logits/rejected": -1.9899710416793823, + "logps/chosen": -349.98687744140625, + "logps/rejected": -340.2271423339844, + "loss": 0.4914, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.592383861541748, + "rewards/margins": 3.6400413513183594, + "rewards/rejected": -4.232425212860107, + "step": 928 + }, + { + "epoch": 0.19, + "learning_rate": 1.6180672268907566e-05, + "logits/chosen": -1.8802413940429688, + "logits/rejected": -1.7215006351470947, + "logps/chosen": -279.944091796875, + "logps/rejected": -281.08184814453125, + "loss": 0.4542, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5357730388641357, + "rewards/margins": 2.8300819396972656, + "rewards/rejected": -4.365854740142822, + "step": 929 + }, + { + "epoch": 0.19, + "learning_rate": 1.6176470588235296e-05, + "logits/chosen": -1.8888359069824219, + "logits/rejected": -1.9341886043548584, + "logps/chosen": -252.97305297851562, + "logps/rejected": -347.5448913574219, + "loss": 0.343, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2318804264068604, + "rewards/margins": 3.367450714111328, + "rewards/rejected": -4.599330902099609, + "step": 930 + }, + { + "epoch": 0.19, + "learning_rate": 1.6172268907563026e-05, + "logits/chosen": -2.2744109630584717, + "logits/rejected": -1.9098711013793945, + "logps/chosen": -331.54168701171875, + "logps/rejected": -309.3653259277344, + "loss": 0.4295, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0927395820617676, + "rewards/margins": 3.3619794845581055, + "rewards/rejected": -4.454718589782715, + "step": 931 + }, + { + "epoch": 0.19, + "learning_rate": 1.6168067226890757e-05, + "logits/chosen": -2.229959011077881, + "logits/rejected": -2.0564463138580322, + "logps/chosen": -337.414794921875, + "logps/rejected": -316.2709655761719, + "loss": 0.3448, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9802029728889465, + "rewards/margins": 2.461056709289551, + "rewards/rejected": -3.4412598609924316, + "step": 932 + }, + { + "epoch": 0.2, + "learning_rate": 1.616386554621849e-05, + "logits/chosen": -2.2918221950531006, + "logits/rejected": -1.9197709560394287, + "logps/chosen": -377.126220703125, + "logps/rejected": -350.18035888671875, + "loss": 0.376, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.168889045715332, + "rewards/margins": 3.2541346549987793, + "rewards/rejected": -4.4230241775512695, + "step": 933 + }, + { + "epoch": 0.2, + "learning_rate": 1.615966386554622e-05, + "logits/chosen": -2.046694755554199, + "logits/rejected": -1.7996652126312256, + "logps/chosen": -379.83453369140625, + "logps/rejected": -424.0287780761719, + "loss": 0.3029, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5068362951278687, + "rewards/margins": 3.8165125846862793, + "rewards/rejected": -5.323348522186279, + "step": 934 + }, + { + "epoch": 0.2, + "learning_rate": 1.615546218487395e-05, + "logits/chosen": -2.006044626235962, + "logits/rejected": -1.7604379653930664, + "logps/chosen": -358.02349853515625, + "logps/rejected": -372.83563232421875, + "loss": 0.2468, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1392922401428223, + "rewards/margins": 4.669717788696289, + "rewards/rejected": -5.809009552001953, + "step": 935 + }, + { + "epoch": 0.2, + "learning_rate": 1.615126050420168e-05, + "logits/chosen": -2.06150484085083, + "logits/rejected": -2.139726400375366, + "logps/chosen": -274.4190368652344, + "logps/rejected": -346.6410217285156, + "loss": 0.4083, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7062376737594604, + "rewards/margins": 2.6906228065490723, + "rewards/rejected": -4.396860122680664, + "step": 936 + }, + { + "epoch": 0.2, + "learning_rate": 1.6147058823529414e-05, + "logits/chosen": -2.36195707321167, + "logits/rejected": -2.273604393005371, + "logps/chosen": -265.2520446777344, + "logps/rejected": -262.8707275390625, + "loss": 0.4938, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.0171256065368652, + "rewards/margins": 2.4653913974761963, + "rewards/rejected": -4.482516765594482, + "step": 937 + }, + { + "epoch": 0.2, + "learning_rate": 1.6142857142857145e-05, + "logits/chosen": -1.8691480159759521, + "logits/rejected": -1.7311128377914429, + "logps/chosen": -275.6723937988281, + "logps/rejected": -302.32171630859375, + "loss": 0.2987, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.598949670791626, + "rewards/margins": 3.0386173725128174, + "rewards/rejected": -4.637567520141602, + "step": 938 + }, + { + "epoch": 0.2, + "learning_rate": 1.6138655462184875e-05, + "logits/chosen": -2.3988940715789795, + "logits/rejected": -2.1015825271606445, + "logps/chosen": -278.5812072753906, + "logps/rejected": -282.5924987792969, + "loss": 0.4784, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.0416860580444336, + "rewards/margins": 3.5054492950439453, + "rewards/rejected": -5.547134876251221, + "step": 939 + }, + { + "epoch": 0.2, + "learning_rate": 1.6134453781512605e-05, + "logits/chosen": -1.6708526611328125, + "logits/rejected": -1.3594670295715332, + "logps/chosen": -284.24395751953125, + "logps/rejected": -261.78753662109375, + "loss": 0.3832, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.343563437461853, + "rewards/margins": 3.172497510910034, + "rewards/rejected": -4.516060829162598, + "step": 940 + }, + { + "epoch": 0.2, + "learning_rate": 1.613025210084034e-05, + "logits/chosen": -2.0220630168914795, + "logits/rejected": -1.6935721635818481, + "logps/chosen": -284.0840148925781, + "logps/rejected": -305.60125732421875, + "loss": 0.2676, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1521260738372803, + "rewards/margins": 3.2628371715545654, + "rewards/rejected": -4.414963245391846, + "step": 941 + }, + { + "epoch": 0.2, + "learning_rate": 1.612605042016807e-05, + "logits/chosen": -2.0774691104888916, + "logits/rejected": -2.070549488067627, + "logps/chosen": -288.32379150390625, + "logps/rejected": -283.20526123046875, + "loss": 0.3461, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3477067947387695, + "rewards/margins": 3.0809428691864014, + "rewards/rejected": -4.42864990234375, + "step": 942 + }, + { + "epoch": 0.2, + "learning_rate": 1.61218487394958e-05, + "logits/chosen": -1.7696259021759033, + "logits/rejected": -1.865986704826355, + "logps/chosen": -253.49505615234375, + "logps/rejected": -442.7955322265625, + "loss": 0.1525, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4500681161880493, + "rewards/margins": 5.546562671661377, + "rewards/rejected": -6.996630668640137, + "step": 943 + }, + { + "epoch": 0.2, + "learning_rate": 1.6117647058823533e-05, + "logits/chosen": -2.2208175659179688, + "logits/rejected": -1.7669366598129272, + "logps/chosen": -264.23095703125, + "logps/rejected": -267.67083740234375, + "loss": 0.369, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2596253156661987, + "rewards/margins": 2.8266682624816895, + "rewards/rejected": -4.086293697357178, + "step": 944 + }, + { + "epoch": 0.2, + "learning_rate": 1.6113445378151263e-05, + "logits/chosen": -2.0319926738739014, + "logits/rejected": -1.9267170429229736, + "logps/chosen": -306.5227355957031, + "logps/rejected": -331.3794250488281, + "loss": 0.5253, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1882493495941162, + "rewards/margins": 2.494633436203003, + "rewards/rejected": -3.682882785797119, + "step": 945 + }, + { + "epoch": 0.2, + "learning_rate": 1.6109243697478993e-05, + "logits/chosen": -2.099152088165283, + "logits/rejected": -1.7469524145126343, + "logps/chosen": -306.03839111328125, + "logps/rejected": -273.3168029785156, + "loss": 0.1075, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8145140409469604, + "rewards/margins": 3.748594284057617, + "rewards/rejected": -4.563108444213867, + "step": 946 + }, + { + "epoch": 0.2, + "learning_rate": 1.6105042016806723e-05, + "logits/chosen": -2.0426230430603027, + "logits/rejected": -1.679762601852417, + "logps/chosen": -288.73663330078125, + "logps/rejected": -272.72576904296875, + "loss": 0.4731, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.6338205337524414, + "rewards/margins": 2.9622178077697754, + "rewards/rejected": -4.596037864685059, + "step": 947 + }, + { + "epoch": 0.2, + "learning_rate": 1.6100840336134457e-05, + "logits/chosen": -2.098888874053955, + "logits/rejected": -2.033669948577881, + "logps/chosen": -296.52008056640625, + "logps/rejected": -330.41473388671875, + "loss": 0.2741, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3321335315704346, + "rewards/margins": 3.7607579231262207, + "rewards/rejected": -5.092891216278076, + "step": 948 + }, + { + "epoch": 0.2, + "learning_rate": 1.6096638655462187e-05, + "logits/chosen": -1.8267366886138916, + "logits/rejected": -1.715399980545044, + "logps/chosen": -410.0887451171875, + "logps/rejected": -333.095703125, + "loss": 0.4216, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5467424392700195, + "rewards/margins": 2.395495653152466, + "rewards/rejected": -3.9422380924224854, + "step": 949 + }, + { + "epoch": 0.2, + "learning_rate": 1.6092436974789917e-05, + "logits/chosen": -2.006714344024658, + "logits/rejected": -2.23336124420166, + "logps/chosen": -288.5964050292969, + "logps/rejected": -427.2220458984375, + "loss": 0.0577, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6714967489242554, + "rewards/margins": 4.585170269012451, + "rewards/rejected": -6.256667613983154, + "step": 950 + }, + { + "epoch": 0.2, + "learning_rate": 1.6088235294117648e-05, + "logits/chosen": -2.2115395069122314, + "logits/rejected": -2.147038698196411, + "logps/chosen": -252.1461944580078, + "logps/rejected": -330.59161376953125, + "loss": 0.0979, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4695258140563965, + "rewards/margins": 3.851541519165039, + "rewards/rejected": -5.3210673332214355, + "step": 951 + }, + { + "epoch": 0.2, + "learning_rate": 1.608403361344538e-05, + "logits/chosen": -2.0036466121673584, + "logits/rejected": -1.905857801437378, + "logps/chosen": -389.0947265625, + "logps/rejected": -306.5209655761719, + "loss": 0.3509, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.507166862487793, + "rewards/margins": 3.9212779998779297, + "rewards/rejected": -5.4284443855285645, + "step": 952 + }, + { + "epoch": 0.2, + "learning_rate": 1.607983193277311e-05, + "logits/chosen": -2.4142422676086426, + "logits/rejected": -2.2419326305389404, + "logps/chosen": -288.0456237792969, + "logps/rejected": -310.0427551269531, + "loss": 0.1473, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4873964190483093, + "rewards/margins": 3.292844295501709, + "rewards/rejected": -3.780241012573242, + "step": 953 + }, + { + "epoch": 0.2, + "learning_rate": 1.6075630252100842e-05, + "logits/chosen": -2.146336555480957, + "logits/rejected": -2.030111789703369, + "logps/chosen": -250.44271850585938, + "logps/rejected": -314.92950439453125, + "loss": 0.2473, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6969060897827148, + "rewards/margins": 3.61592960357666, + "rewards/rejected": -5.312835693359375, + "step": 954 + }, + { + "epoch": 0.2, + "learning_rate": 1.6071428571428572e-05, + "logits/chosen": -2.2848095893859863, + "logits/rejected": -2.0192418098449707, + "logps/chosen": -396.1111755371094, + "logps/rejected": -312.5511169433594, + "loss": 0.4043, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8675146102905273, + "rewards/margins": 3.271200180053711, + "rewards/rejected": -5.138714790344238, + "step": 955 + }, + { + "epoch": 0.2, + "learning_rate": 1.6067226890756306e-05, + "logits/chosen": -2.0489754676818848, + "logits/rejected": -1.9869868755340576, + "logps/chosen": -400.83404541015625, + "logps/rejected": -427.11968994140625, + "loss": 0.1299, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9448701739311218, + "rewards/margins": 3.682661294937134, + "rewards/rejected": -4.6275315284729, + "step": 956 + }, + { + "epoch": 0.2, + "learning_rate": 1.6063025210084036e-05, + "logits/chosen": -2.131647825241089, + "logits/rejected": -1.7466652393341064, + "logps/chosen": -462.75537109375, + "logps/rejected": -384.59039306640625, + "loss": 0.3906, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8894122838974, + "rewards/margins": 2.7314529418945312, + "rewards/rejected": -4.620865345001221, + "step": 957 + }, + { + "epoch": 0.2, + "learning_rate": 1.6058823529411766e-05, + "logits/chosen": -1.8268980979919434, + "logits/rejected": -1.9135490655899048, + "logps/chosen": -181.1867218017578, + "logps/rejected": -257.34600830078125, + "loss": 0.2489, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9502403736114502, + "rewards/margins": 2.652364730834961, + "rewards/rejected": -4.602604866027832, + "step": 958 + }, + { + "epoch": 0.2, + "learning_rate": 1.6054621848739496e-05, + "logits/chosen": -1.9169213771820068, + "logits/rejected": -2.2156481742858887, + "logps/chosen": -292.47357177734375, + "logps/rejected": -370.2200927734375, + "loss": 0.2202, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8812296390533447, + "rewards/margins": 4.399232864379883, + "rewards/rejected": -6.280462265014648, + "step": 959 + }, + { + "epoch": 0.2, + "learning_rate": 1.605042016806723e-05, + "logits/chosen": -1.9813048839569092, + "logits/rejected": -2.1631431579589844, + "logps/chosen": -348.94384765625, + "logps/rejected": -346.21331787109375, + "loss": 0.8581, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.9477732181549072, + "rewards/margins": 2.3577558994293213, + "rewards/rejected": -4.3055291175842285, + "step": 960 + }, + { + "epoch": 0.2, + "learning_rate": 1.604621848739496e-05, + "logits/chosen": -1.8230655193328857, + "logits/rejected": -1.8407657146453857, + "logps/chosen": -180.0000457763672, + "logps/rejected": -295.68603515625, + "loss": 0.1646, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1433920860290527, + "rewards/margins": 4.4139885902404785, + "rewards/rejected": -5.557380676269531, + "step": 961 + }, + { + "epoch": 0.2, + "learning_rate": 1.604201680672269e-05, + "logits/chosen": -2.3089263439178467, + "logits/rejected": -1.8260228633880615, + "logps/chosen": -237.73977661132812, + "logps/rejected": -186.85098266601562, + "loss": 0.4226, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7359535694122314, + "rewards/margins": 1.7652562856674194, + "rewards/rejected": -3.5012099742889404, + "step": 962 + }, + { + "epoch": 0.2, + "learning_rate": 1.6037815126050424e-05, + "logits/chosen": -2.5158958435058594, + "logits/rejected": -2.2219765186309814, + "logps/chosen": -568.96728515625, + "logps/rejected": -465.2297668457031, + "loss": 0.648, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6339870095252991, + "rewards/margins": 3.2373623847961426, + "rewards/rejected": -3.871349334716797, + "step": 963 + }, + { + "epoch": 0.2, + "learning_rate": 1.6033613445378154e-05, + "logits/chosen": -2.211601734161377, + "logits/rejected": -2.054121255874634, + "logps/chosen": -405.22821044921875, + "logps/rejected": -430.16552734375, + "loss": 0.2456, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5112473964691162, + "rewards/margins": 3.313117504119873, + "rewards/rejected": -4.82436466217041, + "step": 964 + }, + { + "epoch": 0.2, + "learning_rate": 1.6029411764705884e-05, + "logits/chosen": -1.7952327728271484, + "logits/rejected": -1.3685411214828491, + "logps/chosen": -328.3746337890625, + "logps/rejected": -304.05780029296875, + "loss": 0.4384, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9979314804077148, + "rewards/margins": 2.496091365814209, + "rewards/rejected": -4.494022846221924, + "step": 965 + }, + { + "epoch": 0.2, + "learning_rate": 1.6025210084033615e-05, + "logits/chosen": -2.0978634357452393, + "logits/rejected": -1.5490803718566895, + "logps/chosen": -637.6278076171875, + "logps/rejected": -367.8577575683594, + "loss": 0.1327, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0601354837417603, + "rewards/margins": 3.7182583808898926, + "rewards/rejected": -4.7783942222595215, + "step": 966 + }, + { + "epoch": 0.2, + "learning_rate": 1.6021008403361348e-05, + "logits/chosen": -2.293529510498047, + "logits/rejected": -1.758965015411377, + "logps/chosen": -289.671630859375, + "logps/rejected": -268.2323303222656, + "loss": 0.516, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6899057626724243, + "rewards/margins": 2.7523252964019775, + "rewards/rejected": -4.442231178283691, + "step": 967 + }, + { + "epoch": 0.2, + "learning_rate": 1.601680672268908e-05, + "logits/chosen": -1.9055092334747314, + "logits/rejected": -2.0438497066497803, + "logps/chosen": -305.809326171875, + "logps/rejected": -352.200439453125, + "loss": 0.4915, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.1041617393493652, + "rewards/margins": 2.303605079650879, + "rewards/rejected": -4.407767295837402, + "step": 968 + }, + { + "epoch": 0.2, + "learning_rate": 1.601260504201681e-05, + "logits/chosen": -1.9326434135437012, + "logits/rejected": -2.179170846939087, + "logps/chosen": -407.10430908203125, + "logps/rejected": -450.77386474609375, + "loss": 0.7245, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.7418363094329834, + "rewards/margins": 1.7377456426620483, + "rewards/rejected": -3.4795820713043213, + "step": 969 + }, + { + "epoch": 0.2, + "learning_rate": 1.600840336134454e-05, + "logits/chosen": -2.1444361209869385, + "logits/rejected": -2.093311071395874, + "logps/chosen": -337.5771179199219, + "logps/rejected": -289.28472900390625, + "loss": 0.1159, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4055050611495972, + "rewards/margins": 4.7778730392456055, + "rewards/rejected": -6.183377742767334, + "step": 970 + }, + { + "epoch": 0.2, + "learning_rate": 1.6004201680672272e-05, + "logits/chosen": -2.1992075443267822, + "logits/rejected": -1.967665433883667, + "logps/chosen": -351.4008483886719, + "logps/rejected": -428.0211181640625, + "loss": 0.157, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8945674896240234, + "rewards/margins": 3.9394099712371826, + "rewards/rejected": -4.833977222442627, + "step": 971 + }, + { + "epoch": 0.2, + "learning_rate": 1.6000000000000003e-05, + "logits/chosen": -2.328667163848877, + "logits/rejected": -1.9910246133804321, + "logps/chosen": -316.12939453125, + "logps/rejected": -309.1440734863281, + "loss": 0.3179, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9753143191337585, + "rewards/margins": 3.3967621326446533, + "rewards/rejected": -4.372076511383057, + "step": 972 + }, + { + "epoch": 0.2, + "learning_rate": 1.5995798319327733e-05, + "logits/chosen": -2.0464725494384766, + "logits/rejected": -2.0297629833221436, + "logps/chosen": -430.61016845703125, + "logps/rejected": -437.5151062011719, + "loss": 0.1596, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9454315304756165, + "rewards/margins": 3.407010555267334, + "rewards/rejected": -4.352442264556885, + "step": 973 + }, + { + "epoch": 0.2, + "learning_rate": 1.5991596638655463e-05, + "logits/chosen": -2.111048698425293, + "logits/rejected": -1.7278022766113281, + "logps/chosen": -271.6231384277344, + "logps/rejected": -237.2025604248047, + "loss": 0.3344, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.518215298652649, + "rewards/margins": 3.550593376159668, + "rewards/rejected": -5.068808555603027, + "step": 974 + }, + { + "epoch": 0.2, + "learning_rate": 1.5987394957983197e-05, + "logits/chosen": -2.125736713409424, + "logits/rejected": -1.9389903545379639, + "logps/chosen": -356.8438415527344, + "logps/rejected": -345.59375, + "loss": 0.3553, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7286221981048584, + "rewards/margins": 3.0152809619903564, + "rewards/rejected": -4.743903160095215, + "step": 975 + }, + { + "epoch": 0.2, + "learning_rate": 1.5983193277310927e-05, + "logits/chosen": -2.135952949523926, + "logits/rejected": -2.1630730628967285, + "logps/chosen": -296.26287841796875, + "logps/rejected": -352.83026123046875, + "loss": 0.2042, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.042497158050537, + "rewards/margins": 3.567195415496826, + "rewards/rejected": -5.609692573547363, + "step": 976 + }, + { + "epoch": 0.2, + "learning_rate": 1.5978991596638657e-05, + "logits/chosen": -1.6039451360702515, + "logits/rejected": -1.5763413906097412, + "logps/chosen": -304.0045471191406, + "logps/rejected": -315.9101257324219, + "loss": 0.2926, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6680569648742676, + "rewards/margins": 3.900869131088257, + "rewards/rejected": -5.568925857543945, + "step": 977 + }, + { + "epoch": 0.2, + "learning_rate": 1.5974789915966387e-05, + "logits/chosen": -2.230966806411743, + "logits/rejected": -1.9008398056030273, + "logps/chosen": -348.0404357910156, + "logps/rejected": -309.02825927734375, + "loss": 0.0941, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0074272155761719, + "rewards/margins": 4.123209476470947, + "rewards/rejected": -5.130637168884277, + "step": 978 + }, + { + "epoch": 0.2, + "learning_rate": 1.597058823529412e-05, + "logits/chosen": -2.1211373805999756, + "logits/rejected": -2.225551128387451, + "logps/chosen": -280.7225341796875, + "logps/rejected": -319.66412353515625, + "loss": 0.2427, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.2527220249176025, + "rewards/margins": 3.760939836502075, + "rewards/rejected": -6.0136613845825195, + "step": 979 + }, + { + "epoch": 0.21, + "learning_rate": 1.596638655462185e-05, + "logits/chosen": -1.992407202720642, + "logits/rejected": -2.0630340576171875, + "logps/chosen": -273.6285400390625, + "logps/rejected": -267.4842529296875, + "loss": 1.3249, + "rewards/accuracies": 0.5625, + "rewards/chosen": -3.1059138774871826, + "rewards/margins": 0.3664852976799011, + "rewards/rejected": -3.4723992347717285, + "step": 980 + }, + { + "epoch": 0.21, + "learning_rate": 1.596218487394958e-05, + "logits/chosen": -2.4282758235931396, + "logits/rejected": -2.183845281600952, + "logps/chosen": -318.62451171875, + "logps/rejected": -338.48553466796875, + "loss": 0.411, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.801322340965271, + "rewards/margins": 3.6411235332489014, + "rewards/rejected": -5.442445755004883, + "step": 981 + }, + { + "epoch": 0.21, + "learning_rate": 1.595798319327731e-05, + "logits/chosen": -2.049459934234619, + "logits/rejected": -2.1267614364624023, + "logps/chosen": -488.1476745605469, + "logps/rejected": -434.264892578125, + "loss": 0.4719, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.7990267276763916, + "rewards/margins": 2.4907145500183105, + "rewards/rejected": -5.2897419929504395, + "step": 982 + }, + { + "epoch": 0.21, + "learning_rate": 1.5953781512605045e-05, + "logits/chosen": -2.1547482013702393, + "logits/rejected": -1.9187558889389038, + "logps/chosen": -355.8290100097656, + "logps/rejected": -354.94610595703125, + "loss": 0.7006, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.1883840560913086, + "rewards/margins": 2.9620940685272217, + "rewards/rejected": -5.150478363037109, + "step": 983 + }, + { + "epoch": 0.21, + "learning_rate": 1.5949579831932775e-05, + "logits/chosen": -2.2430317401885986, + "logits/rejected": -1.9718213081359863, + "logps/chosen": -345.955322265625, + "logps/rejected": -399.9026794433594, + "loss": 0.2591, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.6024341583251953, + "rewards/margins": 4.545600891113281, + "rewards/rejected": -7.148034572601318, + "step": 984 + }, + { + "epoch": 0.21, + "learning_rate": 1.5945378151260506e-05, + "logits/chosen": -1.7310130596160889, + "logits/rejected": -1.7342963218688965, + "logps/chosen": -255.662841796875, + "logps/rejected": -329.61865234375, + "loss": 0.2125, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8036235570907593, + "rewards/margins": 4.13872766494751, + "rewards/rejected": -5.942351341247559, + "step": 985 + }, + { + "epoch": 0.21, + "learning_rate": 1.594117647058824e-05, + "logits/chosen": -2.52667236328125, + "logits/rejected": -1.900354027748108, + "logps/chosen": -544.683837890625, + "logps/rejected": -395.1111755371094, + "loss": 0.5719, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.1777844429016113, + "rewards/margins": 3.926586627960205, + "rewards/rejected": -6.104371070861816, + "step": 986 + }, + { + "epoch": 0.21, + "learning_rate": 1.5936974789915966e-05, + "logits/chosen": -2.203887939453125, + "logits/rejected": -1.8927711248397827, + "logps/chosen": -296.181884765625, + "logps/rejected": -256.6437683105469, + "loss": 0.5472, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.595014810562134, + "rewards/margins": 2.6470212936401367, + "rewards/rejected": -5.242035388946533, + "step": 987 + }, + { + "epoch": 0.21, + "learning_rate": 1.5932773109243696e-05, + "logits/chosen": -2.147514820098877, + "logits/rejected": -2.0080254077911377, + "logps/chosen": -346.51324462890625, + "logps/rejected": -326.51898193359375, + "loss": 0.2249, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9157497882843018, + "rewards/margins": 4.018861770629883, + "rewards/rejected": -5.9346113204956055, + "step": 988 + }, + { + "epoch": 0.21, + "learning_rate": 1.592857142857143e-05, + "logits/chosen": -2.0567169189453125, + "logits/rejected": -1.9067251682281494, + "logps/chosen": -295.603759765625, + "logps/rejected": -307.12847900390625, + "loss": 0.3289, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.0667433738708496, + "rewards/margins": 3.1505918502807617, + "rewards/rejected": -5.217335224151611, + "step": 989 + }, + { + "epoch": 0.21, + "learning_rate": 1.592436974789916e-05, + "logits/chosen": -2.1492984294891357, + "logits/rejected": -1.9382822513580322, + "logps/chosen": -329.0425109863281, + "logps/rejected": -357.9788818359375, + "loss": 0.2395, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3744035959243774, + "rewards/margins": 3.94028639793396, + "rewards/rejected": -5.314690589904785, + "step": 990 + }, + { + "epoch": 0.21, + "learning_rate": 1.592016806722689e-05, + "logits/chosen": -2.06227970123291, + "logits/rejected": -1.793077826499939, + "logps/chosen": -383.4049072265625, + "logps/rejected": -380.588134765625, + "loss": 0.3827, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.2909398078918457, + "rewards/margins": 3.0851078033447266, + "rewards/rejected": -5.376047134399414, + "step": 991 + }, + { + "epoch": 0.21, + "learning_rate": 1.591596638655462e-05, + "logits/chosen": -2.1432809829711914, + "logits/rejected": -1.4809675216674805, + "logps/chosen": -287.3902587890625, + "logps/rejected": -244.61148071289062, + "loss": 0.1346, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.113315463066101, + "rewards/margins": 4.157505989074707, + "rewards/rejected": -5.270821571350098, + "step": 992 + }, + { + "epoch": 0.21, + "learning_rate": 1.5911764705882354e-05, + "logits/chosen": -2.3400068283081055, + "logits/rejected": -1.8157542943954468, + "logps/chosen": -381.60516357421875, + "logps/rejected": -373.2441711425781, + "loss": 0.1661, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.063373327255249, + "rewards/margins": 4.645111083984375, + "rewards/rejected": -5.708484649658203, + "step": 993 + }, + { + "epoch": 0.21, + "learning_rate": 1.5907563025210084e-05, + "logits/chosen": -2.1736326217651367, + "logits/rejected": -2.024364948272705, + "logps/chosen": -209.30006408691406, + "logps/rejected": -230.43218994140625, + "loss": 0.187, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.2895779609680176, + "rewards/margins": 2.84120512008667, + "rewards/rejected": -5.130782604217529, + "step": 994 + }, + { + "epoch": 0.21, + "learning_rate": 1.5903361344537815e-05, + "logits/chosen": -2.1777327060699463, + "logits/rejected": -1.7259260416030884, + "logps/chosen": -230.63426208496094, + "logps/rejected": -259.3458557128906, + "loss": 0.1616, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3928728103637695, + "rewards/margins": 3.788881540298462, + "rewards/rejected": -5.181754112243652, + "step": 995 + }, + { + "epoch": 0.21, + "learning_rate": 1.5899159663865545e-05, + "logits/chosen": -2.0159902572631836, + "logits/rejected": -1.9090015888214111, + "logps/chosen": -314.9594421386719, + "logps/rejected": -318.53546142578125, + "loss": 0.3031, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.0985493659973145, + "rewards/margins": 2.2413251399993896, + "rewards/rejected": -5.339874744415283, + "step": 996 + }, + { + "epoch": 0.21, + "learning_rate": 1.589495798319328e-05, + "logits/chosen": -2.1222875118255615, + "logits/rejected": -2.075869560241699, + "logps/chosen": -279.2803039550781, + "logps/rejected": -314.81561279296875, + "loss": 0.1371, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.860377311706543, + "rewards/margins": 4.949416160583496, + "rewards/rejected": -5.809793472290039, + "step": 997 + }, + { + "epoch": 0.21, + "learning_rate": 1.589075630252101e-05, + "logits/chosen": -2.203815460205078, + "logits/rejected": -2.1900599002838135, + "logps/chosen": -321.6600036621094, + "logps/rejected": -460.579345703125, + "loss": 0.2982, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.626387596130371, + "rewards/margins": 4.398473739624023, + "rewards/rejected": -6.024860858917236, + "step": 998 + }, + { + "epoch": 0.21, + "learning_rate": 1.588655462184874e-05, + "logits/chosen": -1.6782182455062866, + "logits/rejected": -1.4933725595474243, + "logps/chosen": -274.6654968261719, + "logps/rejected": -245.32205200195312, + "loss": 0.4175, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.060499906539917, + "rewards/margins": 3.4444291591644287, + "rewards/rejected": -5.5049285888671875, + "step": 999 + }, + { + "epoch": 0.21, + "learning_rate": 1.5882352941176473e-05, + "logits/chosen": -2.128310441970825, + "logits/rejected": -1.7982635498046875, + "logps/chosen": -336.26702880859375, + "logps/rejected": -310.9106750488281, + "loss": 0.5163, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.399898052215576, + "rewards/margins": 2.4173684120178223, + "rewards/rejected": -4.817266464233398, + "step": 1000 + }, + { + "epoch": 0.21, + "learning_rate": 1.5878151260504203e-05, + "logits/chosen": -2.1564605236053467, + "logits/rejected": -1.914797067642212, + "logps/chosen": -304.67596435546875, + "logps/rejected": -351.14013671875, + "loss": 0.2622, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.037938117980957, + "rewards/margins": 5.277014255523682, + "rewards/rejected": -7.314952850341797, + "step": 1001 + }, + { + "epoch": 0.21, + "learning_rate": 1.5873949579831933e-05, + "logits/chosen": -1.8247300386428833, + "logits/rejected": -1.5515823364257812, + "logps/chosen": -259.6643981933594, + "logps/rejected": -255.8385772705078, + "loss": 0.3908, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.675656795501709, + "rewards/margins": 2.1125659942626953, + "rewards/rejected": -4.788222789764404, + "step": 1002 + }, + { + "epoch": 0.21, + "learning_rate": 1.5869747899159663e-05, + "logits/chosen": -2.079982280731201, + "logits/rejected": -2.0852432250976562, + "logps/chosen": -389.28619384765625, + "logps/rejected": -397.09857177734375, + "loss": 0.2464, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1009445190429688, + "rewards/margins": 4.061807632446289, + "rewards/rejected": -6.162752151489258, + "step": 1003 + }, + { + "epoch": 0.21, + "learning_rate": 1.5865546218487397e-05, + "logits/chosen": -1.6550183296203613, + "logits/rejected": -1.98838210105896, + "logps/chosen": -273.01409912109375, + "logps/rejected": -359.2369384765625, + "loss": 0.4703, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.3376121520996094, + "rewards/margins": 3.1436686515808105, + "rewards/rejected": -5.48128080368042, + "step": 1004 + }, + { + "epoch": 0.21, + "learning_rate": 1.5861344537815127e-05, + "logits/chosen": -1.861385464668274, + "logits/rejected": -1.9208061695098877, + "logps/chosen": -344.4636535644531, + "logps/rejected": -335.368408203125, + "loss": 0.2375, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.3113930225372314, + "rewards/margins": 3.915203094482422, + "rewards/rejected": -6.226596832275391, + "step": 1005 + }, + { + "epoch": 0.21, + "learning_rate": 1.5857142857142857e-05, + "logits/chosen": -2.0344033241271973, + "logits/rejected": -2.0327882766723633, + "logps/chosen": -286.20703125, + "logps/rejected": -348.3990478515625, + "loss": 0.526, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.4683268070220947, + "rewards/margins": 2.743124485015869, + "rewards/rejected": -5.211451530456543, + "step": 1006 + }, + { + "epoch": 0.21, + "learning_rate": 1.5852941176470588e-05, + "logits/chosen": -2.1941375732421875, + "logits/rejected": -1.8723511695861816, + "logps/chosen": -381.8442687988281, + "logps/rejected": -328.5703125, + "loss": 0.4321, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.955676794052124, + "rewards/margins": 3.0702829360961914, + "rewards/rejected": -5.0259599685668945, + "step": 1007 + }, + { + "epoch": 0.21, + "learning_rate": 1.584873949579832e-05, + "logits/chosen": -2.0592691898345947, + "logits/rejected": -1.6820740699768066, + "logps/chosen": -340.93914794921875, + "logps/rejected": -297.41424560546875, + "loss": 0.3653, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.9973559379577637, + "rewards/margins": 2.6213486194610596, + "rewards/rejected": -5.618704319000244, + "step": 1008 + }, + { + "epoch": 0.21, + "learning_rate": 1.584453781512605e-05, + "logits/chosen": -2.322657585144043, + "logits/rejected": -2.3027448654174805, + "logps/chosen": -413.90625, + "logps/rejected": -369.0709228515625, + "loss": 0.4432, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8684520721435547, + "rewards/margins": 3.0394368171691895, + "rewards/rejected": -4.907888412475586, + "step": 1009 + }, + { + "epoch": 0.21, + "learning_rate": 1.584033613445378e-05, + "logits/chosen": -2.1634111404418945, + "logits/rejected": -1.6900907754898071, + "logps/chosen": -289.776123046875, + "logps/rejected": -303.7606506347656, + "loss": 0.2806, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.4645133018493652, + "rewards/margins": 2.172579526901245, + "rewards/rejected": -4.6370930671691895, + "step": 1010 + }, + { + "epoch": 0.21, + "learning_rate": 1.5836134453781512e-05, + "logits/chosen": -1.9188296794891357, + "logits/rejected": -1.9418878555297852, + "logps/chosen": -256.3186340332031, + "logps/rejected": -276.24151611328125, + "loss": 0.4689, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.3924880027770996, + "rewards/margins": 2.3059651851654053, + "rewards/rejected": -4.698453426361084, + "step": 1011 + }, + { + "epoch": 0.21, + "learning_rate": 1.5831932773109245e-05, + "logits/chosen": -2.28934383392334, + "logits/rejected": -1.9998149871826172, + "logps/chosen": -258.2101745605469, + "logps/rejected": -254.7913818359375, + "loss": 0.5938, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.0487024784088135, + "rewards/margins": 2.2096283435821533, + "rewards/rejected": -5.258330821990967, + "step": 1012 + }, + { + "epoch": 0.21, + "learning_rate": 1.5827731092436976e-05, + "logits/chosen": -2.004002571105957, + "logits/rejected": -1.9574902057647705, + "logps/chosen": -359.8652648925781, + "logps/rejected": -345.0855407714844, + "loss": 0.3681, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8650950193405151, + "rewards/margins": 3.7826039791107178, + "rewards/rejected": -5.647699356079102, + "step": 1013 + }, + { + "epoch": 0.21, + "learning_rate": 1.5823529411764706e-05, + "logits/chosen": -1.993525743484497, + "logits/rejected": -1.885300636291504, + "logps/chosen": -335.21929931640625, + "logps/rejected": -382.243408203125, + "loss": 0.5637, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.3249666690826416, + "rewards/margins": 1.8985376358032227, + "rewards/rejected": -4.223504066467285, + "step": 1014 + }, + { + "epoch": 0.21, + "learning_rate": 1.5819327731092436e-05, + "logits/chosen": -2.210437059402466, + "logits/rejected": -2.3024606704711914, + "logps/chosen": -287.51861572265625, + "logps/rejected": -275.244384765625, + "loss": 0.125, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5779825448989868, + "rewards/margins": 3.6603569984436035, + "rewards/rejected": -5.238339424133301, + "step": 1015 + }, + { + "epoch": 0.21, + "learning_rate": 1.581512605042017e-05, + "logits/chosen": -2.023965358734131, + "logits/rejected": -1.8840968608856201, + "logps/chosen": -416.53369140625, + "logps/rejected": -399.0089111328125, + "loss": 0.1224, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1955689191818237, + "rewards/margins": 4.53508996963501, + "rewards/rejected": -5.730658531188965, + "step": 1016 + }, + { + "epoch": 0.21, + "learning_rate": 1.58109243697479e-05, + "logits/chosen": -2.0806667804718018, + "logits/rejected": -2.154951333999634, + "logps/chosen": -314.016845703125, + "logps/rejected": -304.0430603027344, + "loss": 0.4446, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.4265894889831543, + "rewards/margins": 3.3580851554870605, + "rewards/rejected": -5.784675121307373, + "step": 1017 + }, + { + "epoch": 0.21, + "learning_rate": 1.580672268907563e-05, + "logits/chosen": -2.1001968383789062, + "logits/rejected": -1.943385362625122, + "logps/chosen": -367.1288757324219, + "logps/rejected": -324.0970458984375, + "loss": 0.5309, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.123927593231201, + "rewards/margins": 2.621354818344116, + "rewards/rejected": -4.745282173156738, + "step": 1018 + }, + { + "epoch": 0.21, + "learning_rate": 1.580252100840336e-05, + "logits/chosen": -2.298964023590088, + "logits/rejected": -1.869608998298645, + "logps/chosen": -276.7232971191406, + "logps/rejected": -284.82342529296875, + "loss": 0.3035, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.9368112087249756, + "rewards/margins": 4.026775360107422, + "rewards/rejected": -6.963586330413818, + "step": 1019 + }, + { + "epoch": 0.21, + "learning_rate": 1.5798319327731094e-05, + "logits/chosen": -2.269315719604492, + "logits/rejected": -2.1161513328552246, + "logps/chosen": -401.36480712890625, + "logps/rejected": -375.71954345703125, + "loss": 0.7493, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.101378917694092, + "rewards/margins": 2.2584598064422607, + "rewards/rejected": -4.359838962554932, + "step": 1020 + }, + { + "epoch": 0.21, + "learning_rate": 1.5794117647058824e-05, + "logits/chosen": -1.9500479698181152, + "logits/rejected": -1.7284438610076904, + "logps/chosen": -309.60943603515625, + "logps/rejected": -411.1970520019531, + "loss": 0.301, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.672088623046875, + "rewards/margins": 3.5757875442504883, + "rewards/rejected": -5.247876167297363, + "step": 1021 + }, + { + "epoch": 0.21, + "learning_rate": 1.5789915966386554e-05, + "logits/chosen": -2.0553410053253174, + "logits/rejected": -2.3254337310791016, + "logps/chosen": -342.6885681152344, + "logps/rejected": -358.798828125, + "loss": 0.0862, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6520211696624756, + "rewards/margins": 4.654041290283203, + "rewards/rejected": -6.3060622215271, + "step": 1022 + }, + { + "epoch": 0.21, + "learning_rate": 1.5785714285714288e-05, + "logits/chosen": -2.1380250453948975, + "logits/rejected": -2.264366388320923, + "logps/chosen": -227.76400756835938, + "logps/rejected": -280.9142761230469, + "loss": 0.4298, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8462836742401123, + "rewards/margins": 4.351341724395752, + "rewards/rejected": -6.197626113891602, + "step": 1023 + }, + { + "epoch": 0.21, + "learning_rate": 1.5781512605042018e-05, + "logits/chosen": -2.2289586067199707, + "logits/rejected": -1.7775155305862427, + "logps/chosen": -439.2747802734375, + "logps/rejected": -304.8201904296875, + "loss": 0.1713, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3938897848129272, + "rewards/margins": 4.525201320648193, + "rewards/rejected": -5.919090747833252, + "step": 1024 + }, + { + "epoch": 0.21, + "learning_rate": 1.577731092436975e-05, + "logits/chosen": -2.3004164695739746, + "logits/rejected": -2.111860752105713, + "logps/chosen": -393.759521484375, + "logps/rejected": -413.17742919921875, + "loss": 0.272, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6322078704833984, + "rewards/margins": 2.432725191116333, + "rewards/rejected": -4.064932823181152, + "step": 1025 + }, + { + "epoch": 0.21, + "learning_rate": 1.577310924369748e-05, + "logits/chosen": -2.1183876991271973, + "logits/rejected": -1.7869009971618652, + "logps/chosen": -350.95098876953125, + "logps/rejected": -358.25433349609375, + "loss": 0.2333, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.558724045753479, + "rewards/margins": 3.3837058544158936, + "rewards/rejected": -4.942430019378662, + "step": 1026 + }, + { + "epoch": 0.21, + "learning_rate": 1.5768907563025212e-05, + "logits/chosen": -1.6044032573699951, + "logits/rejected": -1.6205954551696777, + "logps/chosen": -197.78744506835938, + "logps/rejected": -260.8172912597656, + "loss": 0.6223, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.7520601749420166, + "rewards/margins": 1.552250623703003, + "rewards/rejected": -4.3043107986450195, + "step": 1027 + }, + { + "epoch": 0.22, + "learning_rate": 1.5764705882352943e-05, + "logits/chosen": -1.9246397018432617, + "logits/rejected": -2.16696834564209, + "logps/chosen": -293.6358947753906, + "logps/rejected": -279.64666748046875, + "loss": 0.5489, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.2128446102142334, + "rewards/margins": 2.0586791038513184, + "rewards/rejected": -4.271523475646973, + "step": 1028 + }, + { + "epoch": 0.22, + "learning_rate": 1.5760504201680673e-05, + "logits/chosen": -2.089893341064453, + "logits/rejected": -1.9854090213775635, + "logps/chosen": -344.54998779296875, + "logps/rejected": -339.42535400390625, + "loss": 0.3752, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.773606777191162, + "rewards/margins": 2.2489991188049316, + "rewards/rejected": -4.022605895996094, + "step": 1029 + }, + { + "epoch": 0.22, + "learning_rate": 1.5756302521008403e-05, + "logits/chosen": -2.307817220687866, + "logits/rejected": -1.9461517333984375, + "logps/chosen": -371.61962890625, + "logps/rejected": -342.8824462890625, + "loss": 0.1874, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6164699792861938, + "rewards/margins": 3.5407934188842773, + "rewards/rejected": -5.157263278961182, + "step": 1030 + }, + { + "epoch": 0.22, + "learning_rate": 1.5752100840336137e-05, + "logits/chosen": -2.0700907707214355, + "logits/rejected": -2.295524835586548, + "logps/chosen": -199.2720947265625, + "logps/rejected": -293.9393005371094, + "loss": 0.1716, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.475959300994873, + "rewards/margins": 3.9602999687194824, + "rewards/rejected": -5.4362592697143555, + "step": 1031 + }, + { + "epoch": 0.22, + "learning_rate": 1.5747899159663867e-05, + "logits/chosen": -2.1870431900024414, + "logits/rejected": -2.079000949859619, + "logps/chosen": -279.5403137207031, + "logps/rejected": -378.5204772949219, + "loss": 0.3308, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5660508871078491, + "rewards/margins": 2.89620304107666, + "rewards/rejected": -4.462253570556641, + "step": 1032 + }, + { + "epoch": 0.22, + "learning_rate": 1.5743697478991597e-05, + "logits/chosen": -2.233264446258545, + "logits/rejected": -2.1475863456726074, + "logps/chosen": -280.5260009765625, + "logps/rejected": -222.30731201171875, + "loss": 0.4507, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.30723237991333, + "rewards/margins": 1.550318956375122, + "rewards/rejected": -2.857551336288452, + "step": 1033 + }, + { + "epoch": 0.22, + "learning_rate": 1.5739495798319327e-05, + "logits/chosen": -2.116415500640869, + "logits/rejected": -1.9250054359436035, + "logps/chosen": -351.7734680175781, + "logps/rejected": -317.4046325683594, + "loss": 0.4731, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.3256168365478516, + "rewards/margins": 2.4478559494018555, + "rewards/rejected": -4.773472785949707, + "step": 1034 + }, + { + "epoch": 0.22, + "learning_rate": 1.573529411764706e-05, + "logits/chosen": -2.1922314167022705, + "logits/rejected": -2.1391685009002686, + "logps/chosen": -244.06134033203125, + "logps/rejected": -289.43951416015625, + "loss": 0.2559, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.726034164428711, + "rewards/margins": 3.462398052215576, + "rewards/rejected": -5.188432216644287, + "step": 1035 + }, + { + "epoch": 0.22, + "learning_rate": 1.573109243697479e-05, + "logits/chosen": -2.36611270904541, + "logits/rejected": -1.842142105102539, + "logps/chosen": -250.618896484375, + "logps/rejected": -273.9358215332031, + "loss": 0.376, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.614174246788025, + "rewards/margins": 3.283245086669922, + "rewards/rejected": -4.897418975830078, + "step": 1036 + }, + { + "epoch": 0.22, + "learning_rate": 1.572689075630252e-05, + "logits/chosen": -2.028487205505371, + "logits/rejected": -1.9487860202789307, + "logps/chosen": -339.63958740234375, + "logps/rejected": -304.4535217285156, + "loss": 0.4047, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5681397914886475, + "rewards/margins": 3.165180206298828, + "rewards/rejected": -4.733320236206055, + "step": 1037 + }, + { + "epoch": 0.22, + "learning_rate": 1.572268907563025e-05, + "logits/chosen": -2.0320651531219482, + "logits/rejected": -2.364687204360962, + "logps/chosen": -335.325927734375, + "logps/rejected": -440.23577880859375, + "loss": 0.4058, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7449978590011597, + "rewards/margins": 3.087007999420166, + "rewards/rejected": -4.832005977630615, + "step": 1038 + }, + { + "epoch": 0.22, + "learning_rate": 1.5718487394957985e-05, + "logits/chosen": -2.3069984912872314, + "logits/rejected": -2.179272174835205, + "logps/chosen": -444.83642578125, + "logps/rejected": -415.9942626953125, + "loss": 0.355, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5126739740371704, + "rewards/margins": 4.149467468261719, + "rewards/rejected": -5.662141799926758, + "step": 1039 + }, + { + "epoch": 0.22, + "learning_rate": 1.5714285714285715e-05, + "logits/chosen": -2.1814706325531006, + "logits/rejected": -1.832313060760498, + "logps/chosen": -437.3150634765625, + "logps/rejected": -303.0058898925781, + "loss": 0.5554, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6847037076950073, + "rewards/margins": 2.7606427669525146, + "rewards/rejected": -4.445346832275391, + "step": 1040 + }, + { + "epoch": 0.22, + "learning_rate": 1.5710084033613446e-05, + "logits/chosen": -2.351832866668701, + "logits/rejected": -1.9153738021850586, + "logps/chosen": -434.5226745605469, + "logps/rejected": -359.20660400390625, + "loss": 0.2545, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9174166321754456, + "rewards/margins": 3.314091682434082, + "rewards/rejected": -4.231508255004883, + "step": 1041 + }, + { + "epoch": 0.22, + "learning_rate": 1.570588235294118e-05, + "logits/chosen": -2.1452512741088867, + "logits/rejected": -2.02679443359375, + "logps/chosen": -278.96148681640625, + "logps/rejected": -345.1434326171875, + "loss": 0.2789, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.440082550048828, + "rewards/margins": 3.3516077995300293, + "rewards/rejected": -5.791690349578857, + "step": 1042 + }, + { + "epoch": 0.22, + "learning_rate": 1.570168067226891e-05, + "logits/chosen": -2.2455363273620605, + "logits/rejected": -1.8577537536621094, + "logps/chosen": -440.77410888671875, + "logps/rejected": -400.5489501953125, + "loss": 0.3369, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4547991752624512, + "rewards/margins": 3.333770990371704, + "rewards/rejected": -4.788569450378418, + "step": 1043 + }, + { + "epoch": 0.22, + "learning_rate": 1.569747899159664e-05, + "logits/chosen": -2.3846778869628906, + "logits/rejected": -1.8157987594604492, + "logps/chosen": -402.36468505859375, + "logps/rejected": -338.8222961425781, + "loss": 0.2073, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8999124765396118, + "rewards/margins": 3.6617870330810547, + "rewards/rejected": -4.561699390411377, + "step": 1044 + }, + { + "epoch": 0.22, + "learning_rate": 1.569327731092437e-05, + "logits/chosen": -2.145190715789795, + "logits/rejected": -1.7300249338150024, + "logps/chosen": -454.65838623046875, + "logps/rejected": -367.63519287109375, + "loss": 0.7194, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.7534492015838623, + "rewards/margins": 2.1044344902038574, + "rewards/rejected": -4.857883930206299, + "step": 1045 + }, + { + "epoch": 0.22, + "learning_rate": 1.5689075630252103e-05, + "logits/chosen": -2.297982692718506, + "logits/rejected": -1.787142276763916, + "logps/chosen": -299.640869140625, + "logps/rejected": -296.74981689453125, + "loss": 0.338, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3258482217788696, + "rewards/margins": 3.2778358459472656, + "rewards/rejected": -4.603683948516846, + "step": 1046 + }, + { + "epoch": 0.22, + "learning_rate": 1.5684873949579834e-05, + "logits/chosen": -2.0597774982452393, + "logits/rejected": -1.8493540287017822, + "logps/chosen": -363.616943359375, + "logps/rejected": -293.5428466796875, + "loss": 0.5212, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7531907558441162, + "rewards/margins": 2.836975336074829, + "rewards/rejected": -4.590166091918945, + "step": 1047 + }, + { + "epoch": 0.22, + "learning_rate": 1.5680672268907564e-05, + "logits/chosen": -2.1516826152801514, + "logits/rejected": -1.5491429567337036, + "logps/chosen": -333.3130187988281, + "logps/rejected": -262.74847412109375, + "loss": 0.3446, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8804630041122437, + "rewards/margins": 3.5086886882781982, + "rewards/rejected": -5.389151573181152, + "step": 1048 + }, + { + "epoch": 0.22, + "learning_rate": 1.5676470588235294e-05, + "logits/chosen": -2.4076790809631348, + "logits/rejected": -1.9318214654922485, + "logps/chosen": -326.7567138671875, + "logps/rejected": -362.8504333496094, + "loss": 0.5094, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4532806873321533, + "rewards/margins": 3.107287883758545, + "rewards/rejected": -4.560568332672119, + "step": 1049 + }, + { + "epoch": 0.22, + "learning_rate": 1.5672268907563028e-05, + "logits/chosen": -2.1510233879089355, + "logits/rejected": -1.7747035026550293, + "logps/chosen": -413.5495910644531, + "logps/rejected": -355.978271484375, + "loss": 0.2436, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9846274852752686, + "rewards/margins": 2.6355772018432617, + "rewards/rejected": -4.620204448699951, + "step": 1050 + }, + { + "epoch": 0.22, + "learning_rate": 1.5668067226890758e-05, + "logits/chosen": -1.7494980096817017, + "logits/rejected": -1.5653728246688843, + "logps/chosen": -559.1565551757812, + "logps/rejected": -329.03424072265625, + "loss": 0.6958, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.1177010536193848, + "rewards/margins": 2.7193045616149902, + "rewards/rejected": -5.837005615234375, + "step": 1051 + }, + { + "epoch": 0.22, + "learning_rate": 1.5663865546218488e-05, + "logits/chosen": -2.192047357559204, + "logits/rejected": -2.125075340270996, + "logps/chosen": -374.524658203125, + "logps/rejected": -397.61907958984375, + "loss": 0.0932, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.144680380821228, + "rewards/margins": 4.933655738830566, + "rewards/rejected": -6.078336238861084, + "step": 1052 + }, + { + "epoch": 0.22, + "learning_rate": 1.565966386554622e-05, + "logits/chosen": -1.9797996282577515, + "logits/rejected": -2.0858266353607178, + "logps/chosen": -442.8431396484375, + "logps/rejected": -421.2425537109375, + "loss": 0.2757, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7781563997268677, + "rewards/margins": 3.7903406620025635, + "rewards/rejected": -5.5684967041015625, + "step": 1053 + }, + { + "epoch": 0.22, + "learning_rate": 1.5655462184873952e-05, + "logits/chosen": -2.343562602996826, + "logits/rejected": -2.357333183288574, + "logps/chosen": -225.52113342285156, + "logps/rejected": -240.44729614257812, + "loss": 0.29, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6451144218444824, + "rewards/margins": 3.344367504119873, + "rewards/rejected": -4.9894819259643555, + "step": 1054 + }, + { + "epoch": 0.22, + "learning_rate": 1.5651260504201682e-05, + "logits/chosen": -2.129213571548462, + "logits/rejected": -1.8569040298461914, + "logps/chosen": -333.76177978515625, + "logps/rejected": -321.2130432128906, + "loss": 0.2842, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8709126710891724, + "rewards/margins": 2.6699986457824707, + "rewards/rejected": -4.540911674499512, + "step": 1055 + }, + { + "epoch": 0.22, + "learning_rate": 1.5647058823529412e-05, + "logits/chosen": -2.449032783508301, + "logits/rejected": -2.0392043590545654, + "logps/chosen": -482.1714172363281, + "logps/rejected": -445.0155029296875, + "loss": 0.1242, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3811352252960205, + "rewards/margins": 5.03152322769165, + "rewards/rejected": -6.412657737731934, + "step": 1056 + }, + { + "epoch": 0.22, + "learning_rate": 1.5642857142857143e-05, + "logits/chosen": -2.445967197418213, + "logits/rejected": -1.8239625692367554, + "logps/chosen": -355.8022155761719, + "logps/rejected": -265.74761962890625, + "loss": 0.3162, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.5941202640533447, + "rewards/margins": 3.00582218170166, + "rewards/rejected": -5.599942207336426, + "step": 1057 + }, + { + "epoch": 0.22, + "learning_rate": 1.5638655462184876e-05, + "logits/chosen": -2.1468396186828613, + "logits/rejected": -2.0982284545898438, + "logps/chosen": -420.23980712890625, + "logps/rejected": -371.2124328613281, + "loss": 0.1636, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.441194772720337, + "rewards/margins": 4.537558555603027, + "rewards/rejected": -5.978753089904785, + "step": 1058 + }, + { + "epoch": 0.22, + "learning_rate": 1.5634453781512606e-05, + "logits/chosen": -2.0872554779052734, + "logits/rejected": -1.7862269878387451, + "logps/chosen": -353.5707092285156, + "logps/rejected": -300.4449768066406, + "loss": 0.1958, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6572359800338745, + "rewards/margins": 3.3234400749206543, + "rewards/rejected": -4.980676174163818, + "step": 1059 + }, + { + "epoch": 0.22, + "learning_rate": 1.5630252100840337e-05, + "logits/chosen": -2.22546124458313, + "logits/rejected": -1.8920305967330933, + "logps/chosen": -392.8912353515625, + "logps/rejected": -421.10870361328125, + "loss": 0.3166, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4793956279754639, + "rewards/margins": 2.5840389728546143, + "rewards/rejected": -4.063434600830078, + "step": 1060 + }, + { + "epoch": 0.22, + "learning_rate": 1.5626050420168067e-05, + "logits/chosen": -2.0951898097991943, + "logits/rejected": -1.8646944761276245, + "logps/chosen": -447.3456115722656, + "logps/rejected": -400.37762451171875, + "loss": 0.2695, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3893158435821533, + "rewards/margins": 3.2697815895080566, + "rewards/rejected": -4.659097671508789, + "step": 1061 + }, + { + "epoch": 0.22, + "learning_rate": 1.56218487394958e-05, + "logits/chosen": -1.9456130266189575, + "logits/rejected": -2.1775624752044678, + "logps/chosen": -281.8665771484375, + "logps/rejected": -318.178466796875, + "loss": 0.2409, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.997543215751648, + "rewards/margins": 3.2904200553894043, + "rewards/rejected": -5.287962913513184, + "step": 1062 + }, + { + "epoch": 0.22, + "learning_rate": 1.561764705882353e-05, + "logits/chosen": -2.320709705352783, + "logits/rejected": -2.3321659564971924, + "logps/chosen": -312.359619140625, + "logps/rejected": -337.9840087890625, + "loss": 0.6249, + "rewards/accuracies": 0.5625, + "rewards/chosen": -2.15242075920105, + "rewards/margins": 2.5110092163085938, + "rewards/rejected": -4.6634297370910645, + "step": 1063 + }, + { + "epoch": 0.22, + "learning_rate": 1.561344537815126e-05, + "logits/chosen": -2.32196307182312, + "logits/rejected": -2.2228965759277344, + "logps/chosen": -264.49688720703125, + "logps/rejected": -297.9734191894531, + "loss": 0.3644, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.4416754245758057, + "rewards/margins": 3.9339993000030518, + "rewards/rejected": -6.375675201416016, + "step": 1064 + }, + { + "epoch": 0.22, + "learning_rate": 1.5609243697478995e-05, + "logits/chosen": -2.125380516052246, + "logits/rejected": -2.022549629211426, + "logps/chosen": -374.00384521484375, + "logps/rejected": -324.53106689453125, + "loss": 0.1067, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3432464599609375, + "rewards/margins": 4.418325901031494, + "rewards/rejected": -5.76157283782959, + "step": 1065 + }, + { + "epoch": 0.22, + "learning_rate": 1.5605042016806725e-05, + "logits/chosen": -2.1170427799224854, + "logits/rejected": -2.2868189811706543, + "logps/chosen": -382.4078369140625, + "logps/rejected": -346.73583984375, + "loss": 0.5472, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7097530364990234, + "rewards/margins": 2.548435926437378, + "rewards/rejected": -4.2581892013549805, + "step": 1066 + }, + { + "epoch": 0.22, + "learning_rate": 1.5600840336134455e-05, + "logits/chosen": -2.2237319946289062, + "logits/rejected": -1.9568169116973877, + "logps/chosen": -358.5841369628906, + "logps/rejected": -323.93072509765625, + "loss": 0.5891, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.226712942123413, + "rewards/margins": 1.9938093423843384, + "rewards/rejected": -4.220521926879883, + "step": 1067 + }, + { + "epoch": 0.22, + "learning_rate": 1.5596638655462185e-05, + "logits/chosen": -2.117027759552002, + "logits/rejected": -1.7533752918243408, + "logps/chosen": -404.026611328125, + "logps/rejected": -340.84881591796875, + "loss": 0.3717, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.6832351684570312, + "rewards/margins": 4.267095565795898, + "rewards/rejected": -6.95033073425293, + "step": 1068 + }, + { + "epoch": 0.22, + "learning_rate": 1.559243697478992e-05, + "logits/chosen": -2.1706459522247314, + "logits/rejected": -2.1421070098876953, + "logps/chosen": -332.73553466796875, + "logps/rejected": -387.4233703613281, + "loss": 0.1861, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.187776803970337, + "rewards/margins": 2.5737130641937256, + "rewards/rejected": -3.7614901065826416, + "step": 1069 + }, + { + "epoch": 0.22, + "learning_rate": 1.558823529411765e-05, + "logits/chosen": -1.998975157737732, + "logits/rejected": -1.8555452823638916, + "logps/chosen": -358.2004699707031, + "logps/rejected": -402.6553955078125, + "loss": 0.4058, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.752549648284912, + "rewards/margins": 2.4785211086273193, + "rewards/rejected": -4.231070518493652, + "step": 1070 + }, + { + "epoch": 0.22, + "learning_rate": 1.558403361344538e-05, + "logits/chosen": -2.1340579986572266, + "logits/rejected": -1.657408595085144, + "logps/chosen": -304.9349060058594, + "logps/rejected": -401.2822265625, + "loss": 0.0575, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1549335718154907, + "rewards/margins": 4.3107476234436035, + "rewards/rejected": -5.465681076049805, + "step": 1071 + }, + { + "epoch": 0.22, + "learning_rate": 1.557983193277311e-05, + "logits/chosen": -2.285947561264038, + "logits/rejected": -1.7917628288269043, + "logps/chosen": -302.83837890625, + "logps/rejected": -272.8876953125, + "loss": 0.1783, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3537662029266357, + "rewards/margins": 3.5185294151306152, + "rewards/rejected": -4.87229585647583, + "step": 1072 + }, + { + "epoch": 0.22, + "learning_rate": 1.5575630252100843e-05, + "logits/chosen": -2.2296924591064453, + "logits/rejected": -1.904763102531433, + "logps/chosen": -382.1318664550781, + "logps/rejected": -358.8062438964844, + "loss": 0.3233, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8686118125915527, + "rewards/margins": 3.838479995727539, + "rewards/rejected": -5.707091808319092, + "step": 1073 + }, + { + "epoch": 0.22, + "learning_rate": 1.5571428571428573e-05, + "logits/chosen": -2.313615322113037, + "logits/rejected": -1.852065086364746, + "logps/chosen": -360.0382080078125, + "logps/rejected": -366.3542785644531, + "loss": 0.4821, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.0509731769561768, + "rewards/margins": 2.7988686561584473, + "rewards/rejected": -4.849842071533203, + "step": 1074 + }, + { + "epoch": 0.22, + "learning_rate": 1.5567226890756304e-05, + "logits/chosen": -2.3373594284057617, + "logits/rejected": -1.7792909145355225, + "logps/chosen": -413.07147216796875, + "logps/rejected": -318.1133728027344, + "loss": 0.5406, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9407414197921753, + "rewards/margins": 2.8012142181396484, + "rewards/rejected": -3.741955518722534, + "step": 1075 + }, + { + "epoch": 0.23, + "learning_rate": 1.5563025210084034e-05, + "logits/chosen": -2.227137804031372, + "logits/rejected": -1.7259658575057983, + "logps/chosen": -266.745849609375, + "logps/rejected": -220.77035522460938, + "loss": 0.2263, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5586907863616943, + "rewards/margins": 3.1512322425842285, + "rewards/rejected": -4.709922790527344, + "step": 1076 + }, + { + "epoch": 0.23, + "learning_rate": 1.5558823529411767e-05, + "logits/chosen": -2.1326308250427246, + "logits/rejected": -1.9316304922103882, + "logps/chosen": -335.54302978515625, + "logps/rejected": -338.352294921875, + "loss": 0.4032, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.738598346710205, + "rewards/margins": 3.2073771953582764, + "rewards/rejected": -4.9459757804870605, + "step": 1077 + }, + { + "epoch": 0.23, + "learning_rate": 1.5554621848739498e-05, + "logits/chosen": -2.1827735900878906, + "logits/rejected": -1.9865981340408325, + "logps/chosen": -247.40914916992188, + "logps/rejected": -263.6688232421875, + "loss": 0.2609, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8604538440704346, + "rewards/margins": 3.1691832542419434, + "rewards/rejected": -5.029636859893799, + "step": 1078 + }, + { + "epoch": 0.23, + "learning_rate": 1.5550420168067228e-05, + "logits/chosen": -2.3564889430999756, + "logits/rejected": -1.8677804470062256, + "logps/chosen": -338.5962829589844, + "logps/rejected": -284.51513671875, + "loss": 0.1204, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3590996265411377, + "rewards/margins": 4.387566089630127, + "rewards/rejected": -5.746665954589844, + "step": 1079 + }, + { + "epoch": 0.23, + "learning_rate": 1.5546218487394958e-05, + "logits/chosen": -2.318563222885132, + "logits/rejected": -1.9634912014007568, + "logps/chosen": -312.05987548828125, + "logps/rejected": -288.6165771484375, + "loss": 0.5744, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9886575937271118, + "rewards/margins": 2.711333751678467, + "rewards/rejected": -4.699991226196289, + "step": 1080 + }, + { + "epoch": 0.23, + "learning_rate": 1.554201680672269e-05, + "logits/chosen": -2.0526483058929443, + "logits/rejected": -2.097865581512451, + "logps/chosen": -259.4316101074219, + "logps/rejected": -348.186279296875, + "loss": 0.157, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3162293434143066, + "rewards/margins": 4.7040276527404785, + "rewards/rejected": -6.020256519317627, + "step": 1081 + }, + { + "epoch": 0.23, + "learning_rate": 1.5537815126050422e-05, + "logits/chosen": -2.038674831390381, + "logits/rejected": -2.182054042816162, + "logps/chosen": -304.2615661621094, + "logps/rejected": -369.98541259765625, + "loss": 0.2917, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.508150339126587, + "rewards/margins": 3.3606181144714355, + "rewards/rejected": -4.868768215179443, + "step": 1082 + }, + { + "epoch": 0.23, + "learning_rate": 1.5533613445378152e-05, + "logits/chosen": -2.2732157707214355, + "logits/rejected": -2.1809120178222656, + "logps/chosen": -198.60305786132812, + "logps/rejected": -271.8071594238281, + "loss": 0.0673, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9015932083129883, + "rewards/margins": 4.2239227294921875, + "rewards/rejected": -6.125515937805176, + "step": 1083 + }, + { + "epoch": 0.23, + "learning_rate": 1.5529411764705882e-05, + "logits/chosen": -1.7566792964935303, + "logits/rejected": -1.9713528156280518, + "logps/chosen": -295.0103759765625, + "logps/rejected": -356.4507141113281, + "loss": 0.2124, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9597657322883606, + "rewards/margins": 3.1596593856811523, + "rewards/rejected": -4.119425296783447, + "step": 1084 + }, + { + "epoch": 0.23, + "learning_rate": 1.5525210084033616e-05, + "logits/chosen": -2.1095163822174072, + "logits/rejected": -1.8498040437698364, + "logps/chosen": -180.953857421875, + "logps/rejected": -244.48233032226562, + "loss": 0.3407, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7570704221725464, + "rewards/margins": 3.4634978771209717, + "rewards/rejected": -5.2205681800842285, + "step": 1085 + }, + { + "epoch": 0.23, + "learning_rate": 1.5521008403361346e-05, + "logits/chosen": -2.217158317565918, + "logits/rejected": -2.0744872093200684, + "logps/chosen": -308.2737121582031, + "logps/rejected": -311.7294921875, + "loss": 0.2425, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1905388832092285, + "rewards/margins": 3.3126914501190186, + "rewards/rejected": -5.503230094909668, + "step": 1086 + }, + { + "epoch": 0.23, + "learning_rate": 1.5516806722689076e-05, + "logits/chosen": -2.2046380043029785, + "logits/rejected": -1.9783538579940796, + "logps/chosen": -298.5821533203125, + "logps/rejected": -354.7811279296875, + "loss": 0.2855, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4117244482040405, + "rewards/margins": 3.6839473247528076, + "rewards/rejected": -5.095672130584717, + "step": 1087 + }, + { + "epoch": 0.23, + "learning_rate": 1.551260504201681e-05, + "logits/chosen": -1.9239532947540283, + "logits/rejected": -1.963545799255371, + "logps/chosen": -263.43341064453125, + "logps/rejected": -310.1040344238281, + "loss": 0.4771, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7402842044830322, + "rewards/margins": 3.6185336112976074, + "rewards/rejected": -5.358818054199219, + "step": 1088 + }, + { + "epoch": 0.23, + "learning_rate": 1.550840336134454e-05, + "logits/chosen": -2.3074653148651123, + "logits/rejected": -1.7325465679168701, + "logps/chosen": -335.1184997558594, + "logps/rejected": -249.3388214111328, + "loss": 0.4232, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9868955612182617, + "rewards/margins": 2.674745559692383, + "rewards/rejected": -4.6616411209106445, + "step": 1089 + }, + { + "epoch": 0.23, + "learning_rate": 1.550420168067227e-05, + "logits/chosen": -2.0268568992614746, + "logits/rejected": -2.0188844203948975, + "logps/chosen": -339.70538330078125, + "logps/rejected": -341.326904296875, + "loss": 0.6173, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8043559789657593, + "rewards/margins": 2.4153013229370117, + "rewards/rejected": -4.2196574211120605, + "step": 1090 + }, + { + "epoch": 0.23, + "learning_rate": 1.55e-05, + "logits/chosen": -2.002237319946289, + "logits/rejected": -1.7771501541137695, + "logps/chosen": -371.6806945800781, + "logps/rejected": -374.4664306640625, + "loss": 0.1993, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1405192613601685, + "rewards/margins": 4.472423076629639, + "rewards/rejected": -5.612942218780518, + "step": 1091 + }, + { + "epoch": 0.23, + "learning_rate": 1.5495798319327734e-05, + "logits/chosen": -2.1593756675720215, + "logits/rejected": -2.1301050186157227, + "logps/chosen": -391.8299560546875, + "logps/rejected": -381.370849609375, + "loss": 0.6286, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7045612335205078, + "rewards/margins": 2.896866798400879, + "rewards/rejected": -4.601428031921387, + "step": 1092 + }, + { + "epoch": 0.23, + "learning_rate": 1.5491596638655465e-05, + "logits/chosen": -1.9469997882843018, + "logits/rejected": -2.158026695251465, + "logps/chosen": -262.4390563964844, + "logps/rejected": -405.81622314453125, + "loss": 0.2472, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.338805675506592, + "rewards/margins": 2.7942445278167725, + "rewards/rejected": -5.133049964904785, + "step": 1093 + }, + { + "epoch": 0.23, + "learning_rate": 1.5487394957983195e-05, + "logits/chosen": -2.2205824851989746, + "logits/rejected": -2.04874849319458, + "logps/chosen": -324.73260498046875, + "logps/rejected": -373.7428894042969, + "loss": 0.2497, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.142815589904785, + "rewards/margins": 3.312791585922241, + "rewards/rejected": -5.455606937408447, + "step": 1094 + }, + { + "epoch": 0.23, + "learning_rate": 1.5483193277310925e-05, + "logits/chosen": -2.1143898963928223, + "logits/rejected": -1.974482536315918, + "logps/chosen": -287.5416259765625, + "logps/rejected": -325.3800354003906, + "loss": 0.2505, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.365784168243408, + "rewards/margins": 2.8378050327301025, + "rewards/rejected": -5.203588962554932, + "step": 1095 + }, + { + "epoch": 0.23, + "learning_rate": 1.547899159663866e-05, + "logits/chosen": -1.9907307624816895, + "logits/rejected": -1.8054124116897583, + "logps/chosen": -381.383056640625, + "logps/rejected": -390.400634765625, + "loss": 0.2303, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6027462482452393, + "rewards/margins": 3.7396140098571777, + "rewards/rejected": -5.342360019683838, + "step": 1096 + }, + { + "epoch": 0.23, + "learning_rate": 1.547478991596639e-05, + "logits/chosen": -2.516204357147217, + "logits/rejected": -2.2973971366882324, + "logps/chosen": -380.7392272949219, + "logps/rejected": -491.6644287109375, + "loss": 0.3093, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6022915840148926, + "rewards/margins": 2.3710646629333496, + "rewards/rejected": -3.973356246948242, + "step": 1097 + }, + { + "epoch": 0.23, + "learning_rate": 1.547058823529412e-05, + "logits/chosen": -2.0064024925231934, + "logits/rejected": -2.0810389518737793, + "logps/chosen": -301.1163024902344, + "logps/rejected": -320.45526123046875, + "loss": 0.3922, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7084771394729614, + "rewards/margins": 3.3058886528015137, + "rewards/rejected": -5.014366149902344, + "step": 1098 + }, + { + "epoch": 0.23, + "learning_rate": 1.546638655462185e-05, + "logits/chosen": -2.1477789878845215, + "logits/rejected": -1.7381205558776855, + "logps/chosen": -280.3091125488281, + "logps/rejected": -250.7599639892578, + "loss": 0.3194, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.1036622524261475, + "rewards/margins": 2.9275271892547607, + "rewards/rejected": -5.031189441680908, + "step": 1099 + }, + { + "epoch": 0.23, + "learning_rate": 1.5462184873949583e-05, + "logits/chosen": -2.0066843032836914, + "logits/rejected": -1.875346302986145, + "logps/chosen": -364.3885498046875, + "logps/rejected": -331.80474853515625, + "loss": 0.5688, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.2363851070404053, + "rewards/margins": 3.241832733154297, + "rewards/rejected": -5.478217124938965, + "step": 1100 + }, + { + "epoch": 0.23, + "learning_rate": 1.5457983193277313e-05, + "logits/chosen": -2.2245278358459473, + "logits/rejected": -2.165071725845337, + "logps/chosen": -385.35992431640625, + "logps/rejected": -378.7532958984375, + "loss": 0.4057, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3071773052215576, + "rewards/margins": 2.8613665103912354, + "rewards/rejected": -4.168543815612793, + "step": 1101 + }, + { + "epoch": 0.23, + "learning_rate": 1.5453781512605043e-05, + "logits/chosen": -1.9166970252990723, + "logits/rejected": -2.02241587638855, + "logps/chosen": -191.66220092773438, + "logps/rejected": -311.7247314453125, + "loss": 0.1897, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5967204570770264, + "rewards/margins": 4.551992416381836, + "rewards/rejected": -6.148713111877441, + "step": 1102 + }, + { + "epoch": 0.23, + "learning_rate": 1.5449579831932773e-05, + "logits/chosen": -2.1552774906158447, + "logits/rejected": -1.921294093132019, + "logps/chosen": -367.08697509765625, + "logps/rejected": -337.5814208984375, + "loss": 0.5372, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5889697074890137, + "rewards/margins": 3.4586124420166016, + "rewards/rejected": -5.047582149505615, + "step": 1103 + }, + { + "epoch": 0.23, + "learning_rate": 1.5445378151260507e-05, + "logits/chosen": -2.3032045364379883, + "logits/rejected": -2.0854313373565674, + "logps/chosen": -330.4469299316406, + "logps/rejected": -299.21771240234375, + "loss": 0.4528, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7191451787948608, + "rewards/margins": 3.301757335662842, + "rewards/rejected": -5.020902633666992, + "step": 1104 + }, + { + "epoch": 0.23, + "learning_rate": 1.5441176470588237e-05, + "logits/chosen": -1.7520146369934082, + "logits/rejected": -1.9824368953704834, + "logps/chosen": -188.16159057617188, + "logps/rejected": -291.055908203125, + "loss": 0.5205, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.774923324584961, + "rewards/margins": 2.4503207206726074, + "rewards/rejected": -5.225244045257568, + "step": 1105 + }, + { + "epoch": 0.23, + "learning_rate": 1.5436974789915968e-05, + "logits/chosen": -2.1155741214752197, + "logits/rejected": -2.234722852706909, + "logps/chosen": -248.34115600585938, + "logps/rejected": -326.1280517578125, + "loss": 0.1846, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.901938796043396, + "rewards/margins": 4.598575592041016, + "rewards/rejected": -5.500514507293701, + "step": 1106 + }, + { + "epoch": 0.23, + "learning_rate": 1.5432773109243698e-05, + "logits/chosen": -2.0443203449249268, + "logits/rejected": -1.9454327821731567, + "logps/chosen": -246.5683135986328, + "logps/rejected": -273.5752868652344, + "loss": 0.9168, + "rewards/accuracies": 0.5625, + "rewards/chosen": -2.8990705013275146, + "rewards/margins": 1.810771107673645, + "rewards/rejected": -4.709841728210449, + "step": 1107 + }, + { + "epoch": 0.23, + "learning_rate": 1.542857142857143e-05, + "logits/chosen": -2.3644843101501465, + "logits/rejected": -2.0450124740600586, + "logps/chosen": -321.63824462890625, + "logps/rejected": -299.8153381347656, + "loss": 0.3858, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6680455207824707, + "rewards/margins": 3.6659014225006104, + "rewards/rejected": -5.333946704864502, + "step": 1108 + }, + { + "epoch": 0.23, + "learning_rate": 1.542436974789916e-05, + "logits/chosen": -2.1518239974975586, + "logits/rejected": -1.5929746627807617, + "logps/chosen": -379.90313720703125, + "logps/rejected": -266.75, + "loss": 0.4633, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8649249076843262, + "rewards/margins": 2.570263385772705, + "rewards/rejected": -3.4351882934570312, + "step": 1109 + }, + { + "epoch": 0.23, + "learning_rate": 1.5420168067226892e-05, + "logits/chosen": -2.2537670135498047, + "logits/rejected": -1.780888319015503, + "logps/chosen": -324.98681640625, + "logps/rejected": -354.29119873046875, + "loss": 0.1541, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0924406051635742, + "rewards/margins": 3.7305901050567627, + "rewards/rejected": -4.823030471801758, + "step": 1110 + }, + { + "epoch": 0.23, + "learning_rate": 1.5415966386554625e-05, + "logits/chosen": -2.037766695022583, + "logits/rejected": -1.8503694534301758, + "logps/chosen": -229.6671905517578, + "logps/rejected": -244.55088806152344, + "loss": 0.3077, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7889714241027832, + "rewards/margins": 3.404996633529663, + "rewards/rejected": -5.193967819213867, + "step": 1111 + }, + { + "epoch": 0.23, + "learning_rate": 1.5411764705882356e-05, + "logits/chosen": -1.98320734500885, + "logits/rejected": -1.9117662906646729, + "logps/chosen": -227.1378631591797, + "logps/rejected": -248.35687255859375, + "loss": 0.4428, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5258548259735107, + "rewards/margins": 2.148599147796631, + "rewards/rejected": -3.6744537353515625, + "step": 1112 + }, + { + "epoch": 0.23, + "learning_rate": 1.5407563025210086e-05, + "logits/chosen": -2.256152868270874, + "logits/rejected": -2.0154685974121094, + "logps/chosen": -336.7105712890625, + "logps/rejected": -313.89361572265625, + "loss": 0.1727, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.451967477798462, + "rewards/margins": 3.9846582412719727, + "rewards/rejected": -5.4366254806518555, + "step": 1113 + }, + { + "epoch": 0.23, + "learning_rate": 1.5403361344537816e-05, + "logits/chosen": -2.311757802963257, + "logits/rejected": -1.9487649202346802, + "logps/chosen": -297.34844970703125, + "logps/rejected": -255.541015625, + "loss": 0.2071, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2653576135635376, + "rewards/margins": 4.1422038078308105, + "rewards/rejected": -5.407561302185059, + "step": 1114 + }, + { + "epoch": 0.23, + "learning_rate": 1.539915966386555e-05, + "logits/chosen": -2.1543688774108887, + "logits/rejected": -1.8869092464447021, + "logps/chosen": -375.5109558105469, + "logps/rejected": -310.30157470703125, + "loss": 0.4776, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1006371974945068, + "rewards/margins": 3.3518919944763184, + "rewards/rejected": -4.452528953552246, + "step": 1115 + }, + { + "epoch": 0.23, + "learning_rate": 1.539495798319328e-05, + "logits/chosen": -2.2909069061279297, + "logits/rejected": -2.140662431716919, + "logps/chosen": -333.17913818359375, + "logps/rejected": -360.0805358886719, + "loss": 0.0911, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4079232215881348, + "rewards/margins": 5.112290382385254, + "rewards/rejected": -6.520214080810547, + "step": 1116 + }, + { + "epoch": 0.23, + "learning_rate": 1.539075630252101e-05, + "logits/chosen": -2.166163682937622, + "logits/rejected": -1.9165594577789307, + "logps/chosen": -392.78741455078125, + "logps/rejected": -381.2095947265625, + "loss": 0.4071, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1723179817199707, + "rewards/margins": 2.4729695320129395, + "rewards/rejected": -3.6452877521514893, + "step": 1117 + }, + { + "epoch": 0.23, + "learning_rate": 1.538655462184874e-05, + "logits/chosen": -2.211686611175537, + "logits/rejected": -1.7472550868988037, + "logps/chosen": -389.500732421875, + "logps/rejected": -333.60577392578125, + "loss": 0.2015, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.303159475326538, + "rewards/margins": 3.5912461280822754, + "rewards/rejected": -4.894405364990234, + "step": 1118 + }, + { + "epoch": 0.23, + "learning_rate": 1.5382352941176474e-05, + "logits/chosen": -2.27036714553833, + "logits/rejected": -1.9544291496276855, + "logps/chosen": -320.41552734375, + "logps/rejected": -291.5990905761719, + "loss": 0.3973, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4500982761383057, + "rewards/margins": 2.831218719482422, + "rewards/rejected": -4.281316757202148, + "step": 1119 + }, + { + "epoch": 0.23, + "learning_rate": 1.5378151260504204e-05, + "logits/chosen": -2.2146658897399902, + "logits/rejected": -1.7627410888671875, + "logps/chosen": -249.08782958984375, + "logps/rejected": -220.69442749023438, + "loss": 0.2997, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7283214330673218, + "rewards/margins": 2.3475148677825928, + "rewards/rejected": -4.075836181640625, + "step": 1120 + }, + { + "epoch": 0.23, + "learning_rate": 1.5373949579831934e-05, + "logits/chosen": -2.0229198932647705, + "logits/rejected": -2.1845083236694336, + "logps/chosen": -351.8758239746094, + "logps/rejected": -332.13922119140625, + "loss": 0.3351, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4682971239089966, + "rewards/margins": 3.523670196533203, + "rewards/rejected": -4.99196720123291, + "step": 1121 + }, + { + "epoch": 0.23, + "learning_rate": 1.5369747899159665e-05, + "logits/chosen": -2.1117653846740723, + "logits/rejected": -1.7490055561065674, + "logps/chosen": -313.01495361328125, + "logps/rejected": -305.625732421875, + "loss": 0.3378, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8274101614952087, + "rewards/margins": 3.156586170196533, + "rewards/rejected": -3.9839963912963867, + "step": 1122 + }, + { + "epoch": 0.23, + "learning_rate": 1.5365546218487398e-05, + "logits/chosen": -2.1178884506225586, + "logits/rejected": -2.2283873558044434, + "logps/chosen": -216.9889678955078, + "logps/rejected": -223.59548950195312, + "loss": 0.4544, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.624551773071289, + "rewards/margins": 2.3504786491394043, + "rewards/rejected": -3.9750306606292725, + "step": 1123 + }, + { + "epoch": 0.24, + "learning_rate": 1.536134453781513e-05, + "logits/chosen": -1.6962761878967285, + "logits/rejected": -1.551252841949463, + "logps/chosen": -290.80126953125, + "logps/rejected": -290.831298828125, + "loss": 0.2137, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2125012874603271, + "rewards/margins": 4.068419933319092, + "rewards/rejected": -5.280921459197998, + "step": 1124 + }, + { + "epoch": 0.24, + "learning_rate": 1.535714285714286e-05, + "logits/chosen": -2.1728169918060303, + "logits/rejected": -1.9521615505218506, + "logps/chosen": -265.2166442871094, + "logps/rejected": -225.73260498046875, + "loss": 0.235, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0623376369476318, + "rewards/margins": 2.468724012374878, + "rewards/rejected": -3.531061887741089, + "step": 1125 + }, + { + "epoch": 0.24, + "learning_rate": 1.535294117647059e-05, + "logits/chosen": -2.416719675064087, + "logits/rejected": -1.825348973274231, + "logps/chosen": -434.5524597167969, + "logps/rejected": -350.5001220703125, + "loss": 0.6002, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3969985246658325, + "rewards/margins": 2.2322611808776855, + "rewards/rejected": -3.6292595863342285, + "step": 1126 + }, + { + "epoch": 0.24, + "learning_rate": 1.5348739495798323e-05, + "logits/chosen": -1.4729070663452148, + "logits/rejected": -1.7964214086532593, + "logps/chosen": -240.53033447265625, + "logps/rejected": -335.87872314453125, + "loss": 0.1162, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3010525703430176, + "rewards/margins": 5.342162132263184, + "rewards/rejected": -6.643215656280518, + "step": 1127 + }, + { + "epoch": 0.24, + "learning_rate": 1.5344537815126053e-05, + "logits/chosen": -2.0500781536102295, + "logits/rejected": -1.7759275436401367, + "logps/chosen": -379.8785400390625, + "logps/rejected": -381.34393310546875, + "loss": 0.2684, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3641512393951416, + "rewards/margins": 3.6229872703552246, + "rewards/rejected": -4.987138748168945, + "step": 1128 + }, + { + "epoch": 0.24, + "learning_rate": 1.5340336134453783e-05, + "logits/chosen": -2.2796554565429688, + "logits/rejected": -1.5617339611053467, + "logps/chosen": -342.2637023925781, + "logps/rejected": -270.35797119140625, + "loss": 0.2627, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3469939231872559, + "rewards/margins": 3.1188435554504395, + "rewards/rejected": -4.465837478637695, + "step": 1129 + }, + { + "epoch": 0.24, + "learning_rate": 1.5336134453781513e-05, + "logits/chosen": -1.9529991149902344, + "logits/rejected": -1.8362208604812622, + "logps/chosen": -280.33538818359375, + "logps/rejected": -325.3298034667969, + "loss": 0.2998, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9795182943344116, + "rewards/margins": 1.8745293617248535, + "rewards/rejected": -2.8540475368499756, + "step": 1130 + }, + { + "epoch": 0.24, + "learning_rate": 1.5331932773109247e-05, + "logits/chosen": -2.2299227714538574, + "logits/rejected": -1.8161027431488037, + "logps/chosen": -443.1059265136719, + "logps/rejected": -369.3968505859375, + "loss": 0.2148, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9800748825073242, + "rewards/margins": 3.3618011474609375, + "rewards/rejected": -4.341876029968262, + "step": 1131 + }, + { + "epoch": 0.24, + "learning_rate": 1.5327731092436977e-05, + "logits/chosen": -1.9197090864181519, + "logits/rejected": -1.7479817867279053, + "logps/chosen": -218.59765625, + "logps/rejected": -264.2467041015625, + "loss": 0.2305, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.237673044204712, + "rewards/margins": 4.283493518829346, + "rewards/rejected": -5.5211663246154785, + "step": 1132 + }, + { + "epoch": 0.24, + "learning_rate": 1.5323529411764707e-05, + "logits/chosen": -1.852963924407959, + "logits/rejected": -2.3476314544677734, + "logps/chosen": -232.69512939453125, + "logps/rejected": -415.89324951171875, + "loss": 0.4439, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7668110132217407, + "rewards/margins": 2.7833516597747803, + "rewards/rejected": -4.5501627922058105, + "step": 1133 + }, + { + "epoch": 0.24, + "learning_rate": 1.531932773109244e-05, + "logits/chosen": -2.0508055686950684, + "logits/rejected": -1.5809041261672974, + "logps/chosen": -332.0795593261719, + "logps/rejected": -310.4665832519531, + "loss": 0.3461, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9561271667480469, + "rewards/margins": 3.4839415550231934, + "rewards/rejected": -5.440068244934082, + "step": 1134 + }, + { + "epoch": 0.24, + "learning_rate": 1.531512605042017e-05, + "logits/chosen": -1.9303784370422363, + "logits/rejected": -1.9079740047454834, + "logps/chosen": -276.6993103027344, + "logps/rejected": -321.5356750488281, + "loss": 0.2891, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.3192062377929688, + "rewards/margins": 4.3332109451293945, + "rewards/rejected": -6.652416706085205, + "step": 1135 + }, + { + "epoch": 0.24, + "learning_rate": 1.5310924369747898e-05, + "logits/chosen": -2.265953302383423, + "logits/rejected": -2.0183496475219727, + "logps/chosen": -363.24395751953125, + "logps/rejected": -329.00091552734375, + "loss": 0.4092, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8615310788154602, + "rewards/margins": 2.2970175743103027, + "rewards/rejected": -3.1585488319396973, + "step": 1136 + }, + { + "epoch": 0.24, + "learning_rate": 1.530672268907563e-05, + "logits/chosen": -2.182778835296631, + "logits/rejected": -1.8062870502471924, + "logps/chosen": -290.48028564453125, + "logps/rejected": -242.54159545898438, + "loss": 0.463, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1942212581634521, + "rewards/margins": 1.5299921035766602, + "rewards/rejected": -2.7242136001586914, + "step": 1137 + }, + { + "epoch": 0.24, + "learning_rate": 1.5302521008403362e-05, + "logits/chosen": -2.180172920227051, + "logits/rejected": -1.9833571910858154, + "logps/chosen": -300.0404968261719, + "logps/rejected": -311.17547607421875, + "loss": 0.3538, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.848576545715332, + "rewards/margins": 2.6968636512756348, + "rewards/rejected": -4.545440673828125, + "step": 1138 + }, + { + "epoch": 0.24, + "learning_rate": 1.5298319327731092e-05, + "logits/chosen": -1.848240613937378, + "logits/rejected": -1.7186464071273804, + "logps/chosen": -291.4215393066406, + "logps/rejected": -259.6919860839844, + "loss": 0.6207, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.0645060539245605, + "rewards/margins": 1.3446435928344727, + "rewards/rejected": -3.409149646759033, + "step": 1139 + }, + { + "epoch": 0.24, + "learning_rate": 1.5294117647058822e-05, + "logits/chosen": -2.2319459915161133, + "logits/rejected": -2.116305351257324, + "logps/chosen": -285.8015441894531, + "logps/rejected": -268.8495788574219, + "loss": 0.4726, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2956557273864746, + "rewards/margins": 1.930763840675354, + "rewards/rejected": -3.226419687271118, + "step": 1140 + }, + { + "epoch": 0.24, + "learning_rate": 1.5289915966386556e-05, + "logits/chosen": -2.074985980987549, + "logits/rejected": -1.5709457397460938, + "logps/chosen": -317.5798034667969, + "logps/rejected": -310.46435546875, + "loss": 0.3275, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7155604362487793, + "rewards/margins": 2.4036169052124023, + "rewards/rejected": -4.119177341461182, + "step": 1141 + }, + { + "epoch": 0.24, + "learning_rate": 1.5285714285714286e-05, + "logits/chosen": -1.925581455230713, + "logits/rejected": -1.942786455154419, + "logps/chosen": -377.4695739746094, + "logps/rejected": -359.2512512207031, + "loss": 0.1962, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6716963648796082, + "rewards/margins": 3.584994316101074, + "rewards/rejected": -4.256690979003906, + "step": 1142 + }, + { + "epoch": 0.24, + "learning_rate": 1.5281512605042016e-05, + "logits/chosen": -2.1712486743927, + "logits/rejected": -2.071918487548828, + "logps/chosen": -304.91314697265625, + "logps/rejected": -350.95379638671875, + "loss": 0.5103, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2292113304138184, + "rewards/margins": 2.801612377166748, + "rewards/rejected": -4.030823707580566, + "step": 1143 + }, + { + "epoch": 0.24, + "learning_rate": 1.527731092436975e-05, + "logits/chosen": -2.0356624126434326, + "logits/rejected": -1.598527193069458, + "logps/chosen": -289.5799255371094, + "logps/rejected": -371.9056091308594, + "loss": 0.087, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6706043481826782, + "rewards/margins": 4.31977653503418, + "rewards/rejected": -4.990380764007568, + "step": 1144 + }, + { + "epoch": 0.24, + "learning_rate": 1.527310924369748e-05, + "logits/chosen": -2.3796591758728027, + "logits/rejected": -1.9921081066131592, + "logps/chosen": -256.31146240234375, + "logps/rejected": -307.0758056640625, + "loss": 0.4264, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6932662725448608, + "rewards/margins": 3.287510871887207, + "rewards/rejected": -4.980776786804199, + "step": 1145 + }, + { + "epoch": 0.24, + "learning_rate": 1.526890756302521e-05, + "logits/chosen": -2.0256996154785156, + "logits/rejected": -1.9255012273788452, + "logps/chosen": -170.7609405517578, + "logps/rejected": -289.58856201171875, + "loss": 0.1021, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6248992681503296, + "rewards/margins": 3.564387083053589, + "rewards/rejected": -5.189286231994629, + "step": 1146 + }, + { + "epoch": 0.24, + "learning_rate": 1.526470588235294e-05, + "logits/chosen": -2.3289265632629395, + "logits/rejected": -2.098623752593994, + "logps/chosen": -363.14874267578125, + "logps/rejected": -376.38555908203125, + "loss": 0.296, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9599385857582092, + "rewards/margins": 2.7599925994873047, + "rewards/rejected": -3.719931125640869, + "step": 1147 + }, + { + "epoch": 0.24, + "learning_rate": 1.5260504201680674e-05, + "logits/chosen": -2.3561441898345947, + "logits/rejected": -1.7958621978759766, + "logps/chosen": -361.17633056640625, + "logps/rejected": -386.96893310546875, + "loss": 0.1898, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0192713737487793, + "rewards/margins": 3.683149814605713, + "rewards/rejected": -4.702421188354492, + "step": 1148 + }, + { + "epoch": 0.24, + "learning_rate": 1.5256302521008404e-05, + "logits/chosen": -1.8521263599395752, + "logits/rejected": -2.0047295093536377, + "logps/chosen": -198.44480895996094, + "logps/rejected": -304.13226318359375, + "loss": 0.3763, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4949949979782104, + "rewards/margins": 2.9132418632507324, + "rewards/rejected": -4.408236980438232, + "step": 1149 + }, + { + "epoch": 0.24, + "learning_rate": 1.5252100840336135e-05, + "logits/chosen": -2.2550265789031982, + "logits/rejected": -1.8941617012023926, + "logps/chosen": -433.40771484375, + "logps/rejected": -396.194580078125, + "loss": 0.0882, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.745537281036377, + "rewards/margins": 4.053267955780029, + "rewards/rejected": -4.798805236816406, + "step": 1150 + }, + { + "epoch": 0.24, + "learning_rate": 1.5247899159663866e-05, + "logits/chosen": -2.058539867401123, + "logits/rejected": -1.7930188179016113, + "logps/chosen": -335.4761962890625, + "logps/rejected": -344.7674865722656, + "loss": 0.5355, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1005860567092896, + "rewards/margins": 2.015758514404297, + "rewards/rejected": -3.116344451904297, + "step": 1151 + }, + { + "epoch": 0.24, + "learning_rate": 1.5243697478991597e-05, + "logits/chosen": -2.295010805130005, + "logits/rejected": -1.729496955871582, + "logps/chosen": -253.00033569335938, + "logps/rejected": -252.05328369140625, + "loss": 0.3484, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1418451070785522, + "rewards/margins": 3.6182286739349365, + "rewards/rejected": -4.760074138641357, + "step": 1152 + }, + { + "epoch": 0.24, + "learning_rate": 1.5239495798319329e-05, + "logits/chosen": -2.195406436920166, + "logits/rejected": -1.7860475778579712, + "logps/chosen": -363.5596618652344, + "logps/rejected": -372.95013427734375, + "loss": 0.2198, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5104256868362427, + "rewards/margins": 3.341014862060547, + "rewards/rejected": -3.8514404296875, + "step": 1153 + }, + { + "epoch": 0.24, + "learning_rate": 1.5235294117647059e-05, + "logits/chosen": -2.098393440246582, + "logits/rejected": -2.1312763690948486, + "logps/chosen": -389.659423828125, + "logps/rejected": -314.89373779296875, + "loss": 0.263, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4257755279541016, + "rewards/margins": 2.055939197540283, + "rewards/rejected": -3.4817147254943848, + "step": 1154 + }, + { + "epoch": 0.24, + "learning_rate": 1.523109243697479e-05, + "logits/chosen": -2.2290303707122803, + "logits/rejected": -2.139991283416748, + "logps/chosen": -226.84371948242188, + "logps/rejected": -317.8481140136719, + "loss": 0.1434, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7654358148574829, + "rewards/margins": 4.401028633117676, + "rewards/rejected": -5.166464805603027, + "step": 1155 + }, + { + "epoch": 0.24, + "learning_rate": 1.5226890756302521e-05, + "logits/chosen": -1.8731436729431152, + "logits/rejected": -2.31772518157959, + "logps/chosen": -262.9060974121094, + "logps/rejected": -360.10992431640625, + "loss": 0.2233, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6219669580459595, + "rewards/margins": 4.034126281738281, + "rewards/rejected": -5.656093120574951, + "step": 1156 + }, + { + "epoch": 0.24, + "learning_rate": 1.5222689075630253e-05, + "logits/chosen": -2.472644090652466, + "logits/rejected": -2.0184128284454346, + "logps/chosen": -368.617919921875, + "logps/rejected": -336.496337890625, + "loss": 0.1419, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1643491983413696, + "rewards/margins": 4.170035362243652, + "rewards/rejected": -5.334384441375732, + "step": 1157 + }, + { + "epoch": 0.24, + "learning_rate": 1.5218487394957983e-05, + "logits/chosen": -2.2415294647216797, + "logits/rejected": -1.8104461431503296, + "logps/chosen": -399.25042724609375, + "logps/rejected": -375.8715515136719, + "loss": 0.2422, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0566871166229248, + "rewards/margins": 3.323225736618042, + "rewards/rejected": -4.379912853240967, + "step": 1158 + }, + { + "epoch": 0.24, + "learning_rate": 1.5214285714285715e-05, + "logits/chosen": -2.205252170562744, + "logits/rejected": -1.8324387073516846, + "logps/chosen": -359.388671875, + "logps/rejected": -363.02655029296875, + "loss": 0.3288, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.436235189437866, + "rewards/margins": 3.160069704055786, + "rewards/rejected": -5.596304893493652, + "step": 1159 + }, + { + "epoch": 0.24, + "learning_rate": 1.5210084033613445e-05, + "logits/chosen": -2.1195058822631836, + "logits/rejected": -1.8476227521896362, + "logps/chosen": -403.9994201660156, + "logps/rejected": -294.9276123046875, + "loss": 0.4712, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.329033613204956, + "rewards/margins": 3.0810399055480957, + "rewards/rejected": -4.410073757171631, + "step": 1160 + }, + { + "epoch": 0.24, + "learning_rate": 1.5205882352941177e-05, + "logits/chosen": -2.224428176879883, + "logits/rejected": -1.8038300275802612, + "logps/chosen": -262.11114501953125, + "logps/rejected": -295.90057373046875, + "loss": 0.3352, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.1146607398986816, + "rewards/margins": 3.3923401832580566, + "rewards/rejected": -5.507000923156738, + "step": 1161 + }, + { + "epoch": 0.24, + "learning_rate": 1.5201680672268907e-05, + "logits/chosen": -1.646331548690796, + "logits/rejected": -2.291196346282959, + "logps/chosen": -98.36325073242188, + "logps/rejected": -234.84335327148438, + "loss": 0.2678, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7265188694000244, + "rewards/margins": 3.967656135559082, + "rewards/rejected": -5.6941752433776855, + "step": 1162 + }, + { + "epoch": 0.24, + "learning_rate": 1.519747899159664e-05, + "logits/chosen": -2.0785071849823, + "logits/rejected": -2.061446189880371, + "logps/chosen": -359.7589111328125, + "logps/rejected": -330.1527404785156, + "loss": 0.2576, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2150859832763672, + "rewards/margins": 3.0104217529296875, + "rewards/rejected": -4.225507736206055, + "step": 1163 + }, + { + "epoch": 0.24, + "learning_rate": 1.519327731092437e-05, + "logits/chosen": -1.7263665199279785, + "logits/rejected": -2.04426646232605, + "logps/chosen": -196.4629364013672, + "logps/rejected": -380.02984619140625, + "loss": 0.1058, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4911531209945679, + "rewards/margins": 5.467153072357178, + "rewards/rejected": -6.958306312561035, + "step": 1164 + }, + { + "epoch": 0.24, + "learning_rate": 1.5189075630252101e-05, + "logits/chosen": -2.488992929458618, + "logits/rejected": -1.8200385570526123, + "logps/chosen": -383.03936767578125, + "logps/rejected": -298.67730712890625, + "loss": 0.2467, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3275595903396606, + "rewards/margins": 4.3523712158203125, + "rewards/rejected": -5.679930686950684, + "step": 1165 + }, + { + "epoch": 0.24, + "learning_rate": 1.5184873949579833e-05, + "logits/chosen": -1.8770177364349365, + "logits/rejected": -1.548773169517517, + "logps/chosen": -245.68344116210938, + "logps/rejected": -237.56875610351562, + "loss": 0.4612, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.7806426286697388, + "rewards/margins": 1.6030175685882568, + "rewards/rejected": -3.383660316467285, + "step": 1166 + }, + { + "epoch": 0.24, + "learning_rate": 1.5180672268907564e-05, + "logits/chosen": -2.0789072513580322, + "logits/rejected": -2.193075656890869, + "logps/chosen": -325.4642333984375, + "logps/rejected": -434.20294189453125, + "loss": 0.2357, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9273462295532227, + "rewards/margins": 3.5224170684814453, + "rewards/rejected": -5.449763298034668, + "step": 1167 + }, + { + "epoch": 0.24, + "learning_rate": 1.5176470588235295e-05, + "logits/chosen": -1.8905764818191528, + "logits/rejected": -1.5760489702224731, + "logps/chosen": -287.21142578125, + "logps/rejected": -286.29345703125, + "loss": 0.2064, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5765068531036377, + "rewards/margins": 4.297512531280518, + "rewards/rejected": -5.874019622802734, + "step": 1168 + }, + { + "epoch": 0.24, + "learning_rate": 1.5172268907563026e-05, + "logits/chosen": -2.0364441871643066, + "logits/rejected": -2.1332294940948486, + "logps/chosen": -324.4705505371094, + "logps/rejected": -291.461669921875, + "loss": 0.1753, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2496651411056519, + "rewards/margins": 4.754192352294922, + "rewards/rejected": -6.003857612609863, + "step": 1169 + }, + { + "epoch": 0.24, + "learning_rate": 1.5168067226890758e-05, + "logits/chosen": -1.92880117893219, + "logits/rejected": -2.3277297019958496, + "logps/chosen": -315.2311096191406, + "logps/rejected": -323.5077819824219, + "loss": 0.2561, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5292918682098389, + "rewards/margins": 4.4300312995910645, + "rewards/rejected": -5.959322929382324, + "step": 1170 + }, + { + "epoch": 0.24, + "learning_rate": 1.5163865546218488e-05, + "logits/chosen": -2.1607794761657715, + "logits/rejected": -1.8354965448379517, + "logps/chosen": -385.51019287109375, + "logps/rejected": -333.3912353515625, + "loss": 0.299, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.22383975982666, + "rewards/margins": 4.075891494750977, + "rewards/rejected": -6.299731254577637, + "step": 1171 + }, + { + "epoch": 0.25, + "learning_rate": 1.515966386554622e-05, + "logits/chosen": -2.104717254638672, + "logits/rejected": -2.279975652694702, + "logps/chosen": -207.76742553710938, + "logps/rejected": -389.9500732421875, + "loss": 0.3505, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9203474521636963, + "rewards/margins": 3.584094762802124, + "rewards/rejected": -5.50444221496582, + "step": 1172 + }, + { + "epoch": 0.25, + "learning_rate": 1.515546218487395e-05, + "logits/chosen": -2.137160062789917, + "logits/rejected": -2.1771063804626465, + "logps/chosen": -343.55914306640625, + "logps/rejected": -384.8305969238281, + "loss": 0.2173, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2285467386245728, + "rewards/margins": 4.091034412384033, + "rewards/rejected": -5.319580554962158, + "step": 1173 + }, + { + "epoch": 0.25, + "learning_rate": 1.5151260504201682e-05, + "logits/chosen": -2.150630474090576, + "logits/rejected": -2.2648842334747314, + "logps/chosen": -374.4417724609375, + "logps/rejected": -381.836669921875, + "loss": 0.43, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.69371497631073, + "rewards/margins": 2.7535622119903564, + "rewards/rejected": -4.447277069091797, + "step": 1174 + }, + { + "epoch": 0.25, + "learning_rate": 1.5147058823529412e-05, + "logits/chosen": -2.009212017059326, + "logits/rejected": -1.942438006401062, + "logps/chosen": -305.95404052734375, + "logps/rejected": -310.2395324707031, + "loss": 0.2136, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.851820468902588, + "rewards/margins": 4.212376117706299, + "rewards/rejected": -6.064196586608887, + "step": 1175 + }, + { + "epoch": 0.25, + "learning_rate": 1.5142857142857144e-05, + "logits/chosen": -2.0716683864593506, + "logits/rejected": -1.8512918949127197, + "logps/chosen": -341.6034240722656, + "logps/rejected": -356.722412109375, + "loss": 0.1297, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.0413408279418945, + "rewards/margins": 4.149041652679443, + "rewards/rejected": -6.190382480621338, + "step": 1176 + }, + { + "epoch": 0.25, + "learning_rate": 1.5138655462184874e-05, + "logits/chosen": -1.8365814685821533, + "logits/rejected": -1.7843756675720215, + "logps/chosen": -396.0382385253906, + "logps/rejected": -394.5455627441406, + "loss": 0.2291, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4077531099319458, + "rewards/margins": 3.8222298622131348, + "rewards/rejected": -5.229983329772949, + "step": 1177 + }, + { + "epoch": 0.25, + "learning_rate": 1.5134453781512606e-05, + "logits/chosen": -1.930666208267212, + "logits/rejected": -1.7580417394638062, + "logps/chosen": -310.0386962890625, + "logps/rejected": -384.09393310546875, + "loss": 0.4902, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.8741931915283203, + "rewards/margins": 3.8421387672424316, + "rewards/rejected": -6.71633243560791, + "step": 1178 + }, + { + "epoch": 0.25, + "learning_rate": 1.5130252100840336e-05, + "logits/chosen": -2.257823944091797, + "logits/rejected": -1.7922625541687012, + "logps/chosen": -331.9492492675781, + "logps/rejected": -247.1479949951172, + "loss": 0.3182, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.9673147201538086, + "rewards/margins": 2.276440143585205, + "rewards/rejected": -5.2437543869018555, + "step": 1179 + }, + { + "epoch": 0.25, + "learning_rate": 1.5126050420168068e-05, + "logits/chosen": -2.345992088317871, + "logits/rejected": -2.2462806701660156, + "logps/chosen": -355.7221374511719, + "logps/rejected": -426.6363525390625, + "loss": 0.2716, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4929978847503662, + "rewards/margins": 4.724306106567383, + "rewards/rejected": -6.21730375289917, + "step": 1180 + }, + { + "epoch": 0.25, + "learning_rate": 1.5121848739495799e-05, + "logits/chosen": -2.040586233139038, + "logits/rejected": -1.7297885417938232, + "logps/chosen": -343.8881530761719, + "logps/rejected": -352.7177734375, + "loss": 0.8962, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.852390766143799, + "rewards/margins": 2.0606026649475098, + "rewards/rejected": -4.912993431091309, + "step": 1181 + }, + { + "epoch": 0.25, + "learning_rate": 1.511764705882353e-05, + "logits/chosen": -2.219639778137207, + "logits/rejected": -2.1185994148254395, + "logps/chosen": -467.3554382324219, + "logps/rejected": -422.8388671875, + "loss": 0.4088, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.7696847915649414, + "rewards/margins": 3.0175669193267822, + "rewards/rejected": -4.7872514724731445, + "step": 1182 + }, + { + "epoch": 0.25, + "learning_rate": 1.511344537815126e-05, + "logits/chosen": -2.1489109992980957, + "logits/rejected": -1.979834794998169, + "logps/chosen": -255.49951171875, + "logps/rejected": -287.5498962402344, + "loss": 0.5424, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.4700961112976074, + "rewards/margins": 4.546854019165039, + "rewards/rejected": -7.016949653625488, + "step": 1183 + }, + { + "epoch": 0.25, + "learning_rate": 1.5109243697478993e-05, + "logits/chosen": -2.1440699100494385, + "logits/rejected": -2.0058140754699707, + "logps/chosen": -368.530517578125, + "logps/rejected": -352.6629943847656, + "loss": 0.0977, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6444863080978394, + "rewards/margins": 4.887570858001709, + "rewards/rejected": -5.532057285308838, + "step": 1184 + }, + { + "epoch": 0.25, + "learning_rate": 1.5105042016806723e-05, + "logits/chosen": -2.0883612632751465, + "logits/rejected": -1.9995131492614746, + "logps/chosen": -285.09320068359375, + "logps/rejected": -347.45361328125, + "loss": 0.1575, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.339712619781494, + "rewards/margins": 3.4179694652557373, + "rewards/rejected": -5.757681846618652, + "step": 1185 + }, + { + "epoch": 0.25, + "learning_rate": 1.5100840336134455e-05, + "logits/chosen": -2.131596088409424, + "logits/rejected": -1.7838683128356934, + "logps/chosen": -360.7657775878906, + "logps/rejected": -258.069580078125, + "loss": 0.1869, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5763764381408691, + "rewards/margins": 4.613051891326904, + "rewards/rejected": -6.189428329467773, + "step": 1186 + }, + { + "epoch": 0.25, + "learning_rate": 1.5096638655462185e-05, + "logits/chosen": -1.929497480392456, + "logits/rejected": -1.8926093578338623, + "logps/chosen": -227.36569213867188, + "logps/rejected": -228.517333984375, + "loss": 0.1753, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5404335260391235, + "rewards/margins": 4.4586310386657715, + "rewards/rejected": -5.9990644454956055, + "step": 1187 + }, + { + "epoch": 0.25, + "learning_rate": 1.5092436974789917e-05, + "logits/chosen": -2.088183879852295, + "logits/rejected": -2.080101490020752, + "logps/chosen": -269.8706359863281, + "logps/rejected": -295.9175109863281, + "loss": 0.3102, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8600308895111084, + "rewards/margins": 3.108163833618164, + "rewards/rejected": -4.968194961547852, + "step": 1188 + }, + { + "epoch": 0.25, + "learning_rate": 1.5088235294117649e-05, + "logits/chosen": -1.9852622747421265, + "logits/rejected": -1.657360315322876, + "logps/chosen": -348.3370361328125, + "logps/rejected": -423.8879699707031, + "loss": 0.5602, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.3043925762176514, + "rewards/margins": 3.0825114250183105, + "rewards/rejected": -5.386903762817383, + "step": 1189 + }, + { + "epoch": 0.25, + "learning_rate": 1.5084033613445379e-05, + "logits/chosen": -1.8977195024490356, + "logits/rejected": -1.7659292221069336, + "logps/chosen": -325.5350036621094, + "logps/rejected": -324.3315124511719, + "loss": 0.1718, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.330365538597107, + "rewards/margins": 5.726489543914795, + "rewards/rejected": -7.056855201721191, + "step": 1190 + }, + { + "epoch": 0.25, + "learning_rate": 1.5079831932773111e-05, + "logits/chosen": -2.3464407920837402, + "logits/rejected": -1.7052339315414429, + "logps/chosen": -382.53253173828125, + "logps/rejected": -333.1458740234375, + "loss": 0.6533, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7083666324615479, + "rewards/margins": 3.4127748012542725, + "rewards/rejected": -5.12114143371582, + "step": 1191 + }, + { + "epoch": 0.25, + "learning_rate": 1.5075630252100841e-05, + "logits/chosen": -2.2305030822753906, + "logits/rejected": -1.8566007614135742, + "logps/chosen": -301.3981628417969, + "logps/rejected": -268.1978759765625, + "loss": 0.347, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.473177671432495, + "rewards/margins": 2.4284377098083496, + "rewards/rejected": -4.901615142822266, + "step": 1192 + }, + { + "epoch": 0.25, + "learning_rate": 1.5071428571428573e-05, + "logits/chosen": -2.0026190280914307, + "logits/rejected": -2.090710163116455, + "logps/chosen": -174.71018981933594, + "logps/rejected": -192.7593994140625, + "loss": 0.7049, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.2492282390594482, + "rewards/margins": 0.8885723948478699, + "rewards/rejected": -3.137800693511963, + "step": 1193 + }, + { + "epoch": 0.25, + "learning_rate": 1.5067226890756303e-05, + "logits/chosen": -2.3190665245056152, + "logits/rejected": -1.9814680814743042, + "logps/chosen": -373.4332275390625, + "logps/rejected": -345.6336975097656, + "loss": 0.416, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.515828847885132, + "rewards/margins": 2.645512819290161, + "rewards/rejected": -5.161341667175293, + "step": 1194 + }, + { + "epoch": 0.25, + "learning_rate": 1.5063025210084035e-05, + "logits/chosen": -2.291168212890625, + "logits/rejected": -1.6695159673690796, + "logps/chosen": -280.7257385253906, + "logps/rejected": -347.4483642578125, + "loss": 0.8863, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.0984272956848145, + "rewards/margins": 2.378565788269043, + "rewards/rejected": -4.476993083953857, + "step": 1195 + }, + { + "epoch": 0.25, + "learning_rate": 1.5058823529411765e-05, + "logits/chosen": -1.8760955333709717, + "logits/rejected": -1.5016248226165771, + "logps/chosen": -269.7454833984375, + "logps/rejected": -264.7718811035156, + "loss": 0.3244, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.398189067840576, + "rewards/margins": 5.083847999572754, + "rewards/rejected": -7.482037544250488, + "step": 1196 + }, + { + "epoch": 0.25, + "learning_rate": 1.5054621848739497e-05, + "logits/chosen": -2.094642400741577, + "logits/rejected": -1.7723804712295532, + "logps/chosen": -208.8339385986328, + "logps/rejected": -184.4738006591797, + "loss": 0.2928, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.588585615158081, + "rewards/margins": 3.3819456100463867, + "rewards/rejected": -4.970530986785889, + "step": 1197 + }, + { + "epoch": 0.25, + "learning_rate": 1.5050420168067228e-05, + "logits/chosen": -2.2957119941711426, + "logits/rejected": -1.9593957662582397, + "logps/chosen": -418.91845703125, + "logps/rejected": -350.30194091796875, + "loss": 0.598, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5677578449249268, + "rewards/margins": 2.54343843460083, + "rewards/rejected": -4.111196517944336, + "step": 1198 + }, + { + "epoch": 0.25, + "learning_rate": 1.504621848739496e-05, + "logits/chosen": -2.02939772605896, + "logits/rejected": -1.7546391487121582, + "logps/chosen": -359.5293273925781, + "logps/rejected": -315.7480773925781, + "loss": 0.3058, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6924017667770386, + "rewards/margins": 3.448317050933838, + "rewards/rejected": -5.140718936920166, + "step": 1199 + }, + { + "epoch": 0.25, + "learning_rate": 1.504201680672269e-05, + "logits/chosen": -2.1023483276367188, + "logits/rejected": -1.813053846359253, + "logps/chosen": -424.4500427246094, + "logps/rejected": -445.86981201171875, + "loss": 0.0722, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4255785942077637, + "rewards/margins": 4.9957685470581055, + "rewards/rejected": -6.421347141265869, + "step": 1200 + }, + { + "epoch": 0.25, + "learning_rate": 1.5037815126050422e-05, + "logits/chosen": -2.163423538208008, + "logits/rejected": -1.7977042198181152, + "logps/chosen": -312.72906494140625, + "logps/rejected": -290.58489990234375, + "loss": 0.464, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.5628819465637207, + "rewards/margins": 2.356330394744873, + "rewards/rejected": -4.919212341308594, + "step": 1201 + }, + { + "epoch": 0.25, + "learning_rate": 1.5033613445378152e-05, + "logits/chosen": -2.3980746269226074, + "logits/rejected": -1.9703707695007324, + "logps/chosen": -354.43701171875, + "logps/rejected": -327.32403564453125, + "loss": 0.0771, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9741089344024658, + "rewards/margins": 4.94292688369751, + "rewards/rejected": -6.9170355796813965, + "step": 1202 + }, + { + "epoch": 0.25, + "learning_rate": 1.5029411764705884e-05, + "logits/chosen": -1.775939702987671, + "logits/rejected": -1.851386547088623, + "logps/chosen": -212.29611206054688, + "logps/rejected": -278.2469482421875, + "loss": 0.1344, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4006495475769043, + "rewards/margins": 4.711906433105469, + "rewards/rejected": -6.112556457519531, + "step": 1203 + }, + { + "epoch": 0.25, + "learning_rate": 1.5025210084033614e-05, + "logits/chosen": -1.8301061391830444, + "logits/rejected": -1.931321382522583, + "logps/chosen": -267.00152587890625, + "logps/rejected": -255.85873413085938, + "loss": 0.5117, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.289505958557129, + "rewards/margins": 3.6016359329223633, + "rewards/rejected": -5.891141891479492, + "step": 1204 + }, + { + "epoch": 0.25, + "learning_rate": 1.5021008403361346e-05, + "logits/chosen": -2.188115119934082, + "logits/rejected": -1.6208341121673584, + "logps/chosen": -352.37017822265625, + "logps/rejected": -311.46661376953125, + "loss": 0.1562, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1205482482910156, + "rewards/margins": 4.6318278312683105, + "rewards/rejected": -6.752375602722168, + "step": 1205 + }, + { + "epoch": 0.25, + "learning_rate": 1.5016806722689076e-05, + "logits/chosen": -2.007305860519409, + "logits/rejected": -1.346221923828125, + "logps/chosen": -353.40533447265625, + "logps/rejected": -273.8104553222656, + "loss": 0.2806, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.2381479740142822, + "rewards/margins": 3.3132522106170654, + "rewards/rejected": -6.551400184631348, + "step": 1206 + }, + { + "epoch": 0.25, + "learning_rate": 1.5012605042016808e-05, + "logits/chosen": -2.2197954654693604, + "logits/rejected": -2.0422654151916504, + "logps/chosen": -255.7327117919922, + "logps/rejected": -208.35516357421875, + "loss": 0.209, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1673836708068848, + "rewards/margins": 3.4634766578674316, + "rewards/rejected": -5.630859851837158, + "step": 1207 + }, + { + "epoch": 0.25, + "learning_rate": 1.5008403361344538e-05, + "logits/chosen": -2.260037899017334, + "logits/rejected": -2.1306285858154297, + "logps/chosen": -372.90069580078125, + "logps/rejected": -350.7979736328125, + "loss": 0.287, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.2288615703582764, + "rewards/margins": 3.9649460315704346, + "rewards/rejected": -6.193807601928711, + "step": 1208 + }, + { + "epoch": 0.25, + "learning_rate": 1.500420168067227e-05, + "logits/chosen": -1.9778294563293457, + "logits/rejected": -1.902571201324463, + "logps/chosen": -332.653564453125, + "logps/rejected": -352.37591552734375, + "loss": 0.15, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.7417478561401367, + "rewards/margins": 4.269123077392578, + "rewards/rejected": -7.010870933532715, + "step": 1209 + }, + { + "epoch": 0.25, + "learning_rate": 1.5000000000000002e-05, + "logits/chosen": -2.5007359981536865, + "logits/rejected": -2.257749319076538, + "logps/chosen": -425.8990478515625, + "logps/rejected": -416.16937255859375, + "loss": 0.1807, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.001979351043701, + "rewards/margins": 3.5866730213165283, + "rewards/rejected": -5.588652610778809, + "step": 1210 + }, + { + "epoch": 0.25, + "learning_rate": 1.4995798319327732e-05, + "logits/chosen": -2.112091541290283, + "logits/rejected": -1.5676326751708984, + "logps/chosen": -362.8662414550781, + "logps/rejected": -316.5755920410156, + "loss": 0.2354, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.3783485889434814, + "rewards/margins": 5.855016708374023, + "rewards/rejected": -8.233365058898926, + "step": 1211 + }, + { + "epoch": 0.25, + "learning_rate": 1.4991596638655464e-05, + "logits/chosen": -2.2172112464904785, + "logits/rejected": -1.773216962814331, + "logps/chosen": -347.5205078125, + "logps/rejected": -280.25506591796875, + "loss": 0.1626, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.1004161834716797, + "rewards/margins": 3.5782556533813477, + "rewards/rejected": -6.678671836853027, + "step": 1212 + }, + { + "epoch": 0.25, + "learning_rate": 1.4987394957983194e-05, + "logits/chosen": -2.1802289485931396, + "logits/rejected": -1.5975370407104492, + "logps/chosen": -308.1313171386719, + "logps/rejected": -263.97760009765625, + "loss": 0.2375, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.720303535461426, + "rewards/margins": 3.6396279335021973, + "rewards/rejected": -6.359931945800781, + "step": 1213 + }, + { + "epoch": 0.25, + "learning_rate": 1.4983193277310926e-05, + "logits/chosen": -2.058438539505005, + "logits/rejected": -1.876246690750122, + "logps/chosen": -391.8819274902344, + "logps/rejected": -530.6412963867188, + "loss": 0.0438, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8544301986694336, + "rewards/margins": 6.893628120422363, + "rewards/rejected": -8.748058319091797, + "step": 1214 + }, + { + "epoch": 0.25, + "learning_rate": 1.4978991596638657e-05, + "logits/chosen": -2.1150925159454346, + "logits/rejected": -2.0265910625457764, + "logps/chosen": -368.80072021484375, + "logps/rejected": -273.08599853515625, + "loss": 0.136, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8796675205230713, + "rewards/margins": 4.929985046386719, + "rewards/rejected": -6.809652328491211, + "step": 1215 + }, + { + "epoch": 0.25, + "learning_rate": 1.4974789915966388e-05, + "logits/chosen": -2.1040945053100586, + "logits/rejected": -1.812894582748413, + "logps/chosen": -223.75489807128906, + "logps/rejected": -294.23748779296875, + "loss": 0.1245, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8705954551696777, + "rewards/margins": 4.475533485412598, + "rewards/rejected": -7.346128940582275, + "step": 1216 + }, + { + "epoch": 0.25, + "learning_rate": 1.4970588235294119e-05, + "logits/chosen": -2.1186511516571045, + "logits/rejected": -2.0732200145721436, + "logps/chosen": -196.33123779296875, + "logps/rejected": -243.09954833984375, + "loss": 0.3386, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.925447702407837, + "rewards/margins": 3.0269150733947754, + "rewards/rejected": -5.952362537384033, + "step": 1217 + }, + { + "epoch": 0.25, + "learning_rate": 1.496638655462185e-05, + "logits/chosen": -2.140348434448242, + "logits/rejected": -2.054626941680908, + "logps/chosen": -365.5283203125, + "logps/rejected": -360.31658935546875, + "loss": 0.2577, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.55711030960083, + "rewards/margins": 7.321220397949219, + "rewards/rejected": -9.87833023071289, + "step": 1218 + }, + { + "epoch": 0.26, + "learning_rate": 1.496218487394958e-05, + "logits/chosen": -1.9368691444396973, + "logits/rejected": -2.0441527366638184, + "logps/chosen": -268.2828369140625, + "logps/rejected": -333.4814147949219, + "loss": 0.1697, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8126085996627808, + "rewards/margins": 5.2551655769348145, + "rewards/rejected": -7.067774295806885, + "step": 1219 + }, + { + "epoch": 0.26, + "learning_rate": 1.4957983193277313e-05, + "logits/chosen": -1.5235059261322021, + "logits/rejected": -1.9660358428955078, + "logps/chosen": -340.4520263671875, + "logps/rejected": -402.4558410644531, + "loss": 0.1864, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.3598809242248535, + "rewards/margins": 4.044033050537109, + "rewards/rejected": -6.403914451599121, + "step": 1220 + }, + { + "epoch": 0.26, + "learning_rate": 1.4953781512605043e-05, + "logits/chosen": -2.52192759513855, + "logits/rejected": -2.013643503189087, + "logps/chosen": -359.35919189453125, + "logps/rejected": -305.2646789550781, + "loss": 0.2536, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.6135849952697754, + "rewards/margins": 4.372774124145508, + "rewards/rejected": -6.986359596252441, + "step": 1221 + }, + { + "epoch": 0.26, + "learning_rate": 1.4949579831932775e-05, + "logits/chosen": -2.1180217266082764, + "logits/rejected": -2.2037787437438965, + "logps/chosen": -379.3891296386719, + "logps/rejected": -420.52386474609375, + "loss": 0.3081, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.5315847396850586, + "rewards/margins": 3.5132007598876953, + "rewards/rejected": -6.044785499572754, + "step": 1222 + }, + { + "epoch": 0.26, + "learning_rate": 1.4945378151260505e-05, + "logits/chosen": -2.2050085067749023, + "logits/rejected": -2.200244665145874, + "logps/chosen": -291.6566467285156, + "logps/rejected": -331.0263977050781, + "loss": 0.9311, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.7258543968200684, + "rewards/margins": 2.866224765777588, + "rewards/rejected": -5.592079162597656, + "step": 1223 + }, + { + "epoch": 0.26, + "learning_rate": 1.4941176470588237e-05, + "logits/chosen": -2.3285887241363525, + "logits/rejected": -2.107588291168213, + "logps/chosen": -400.912841796875, + "logps/rejected": -377.38128662109375, + "loss": 0.4701, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.6222760677337646, + "rewards/margins": 4.1373066902160645, + "rewards/rejected": -6.75958251953125, + "step": 1224 + }, + { + "epoch": 0.26, + "learning_rate": 1.4936974789915967e-05, + "logits/chosen": -2.198303699493408, + "logits/rejected": -1.980510950088501, + "logps/chosen": -343.60888671875, + "logps/rejected": -422.5761413574219, + "loss": 0.4523, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.5156986713409424, + "rewards/margins": 5.014289379119873, + "rewards/rejected": -7.529987812042236, + "step": 1225 + }, + { + "epoch": 0.26, + "learning_rate": 1.49327731092437e-05, + "logits/chosen": -2.3580985069274902, + "logits/rejected": -2.328951835632324, + "logps/chosen": -321.87957763671875, + "logps/rejected": -307.9559326171875, + "loss": 0.6262, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.0097098350524902, + "rewards/margins": 2.486520290374756, + "rewards/rejected": -5.496230125427246, + "step": 1226 + }, + { + "epoch": 0.26, + "learning_rate": 1.492857142857143e-05, + "logits/chosen": -2.226259469985962, + "logits/rejected": -2.0274949073791504, + "logps/chosen": -173.77804565429688, + "logps/rejected": -253.33920288085938, + "loss": 0.113, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7933404445648193, + "rewards/margins": 5.47879695892334, + "rewards/rejected": -8.272137641906738, + "step": 1227 + }, + { + "epoch": 0.26, + "learning_rate": 1.4924369747899161e-05, + "logits/chosen": -2.068190813064575, + "logits/rejected": -1.8367664813995361, + "logps/chosen": -436.0915222167969, + "logps/rejected": -471.1249694824219, + "loss": 0.2845, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.7218289375305176, + "rewards/margins": 3.41749906539917, + "rewards/rejected": -6.139328479766846, + "step": 1228 + }, + { + "epoch": 0.26, + "learning_rate": 1.4920168067226892e-05, + "logits/chosen": -2.171875238418579, + "logits/rejected": -2.072824716567993, + "logps/chosen": -380.54364013671875, + "logps/rejected": -369.61004638671875, + "loss": 0.4066, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.0371713638305664, + "rewards/margins": 2.700989246368408, + "rewards/rejected": -5.738160610198975, + "step": 1229 + }, + { + "epoch": 0.26, + "learning_rate": 1.4915966386554623e-05, + "logits/chosen": -2.262847423553467, + "logits/rejected": -2.2142810821533203, + "logps/chosen": -306.16766357421875, + "logps/rejected": -299.1580505371094, + "loss": 0.4377, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.5485153198242188, + "rewards/margins": 3.2537474632263184, + "rewards/rejected": -6.802263259887695, + "step": 1230 + }, + { + "epoch": 0.26, + "learning_rate": 1.4911764705882354e-05, + "logits/chosen": -1.872605800628662, + "logits/rejected": -1.5538092851638794, + "logps/chosen": -310.0887451171875, + "logps/rejected": -280.9057312011719, + "loss": 0.2836, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.3451385498046875, + "rewards/margins": 3.2125837802886963, + "rewards/rejected": -6.557722091674805, + "step": 1231 + }, + { + "epoch": 0.26, + "learning_rate": 1.4907563025210086e-05, + "logits/chosen": -2.133260726928711, + "logits/rejected": -2.011679172515869, + "logps/chosen": -351.28204345703125, + "logps/rejected": -568.9136352539062, + "loss": 0.0993, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1848526000976562, + "rewards/margins": 5.747891426086426, + "rewards/rejected": -7.932744026184082, + "step": 1232 + }, + { + "epoch": 0.26, + "learning_rate": 1.4903361344537817e-05, + "logits/chosen": -2.139843463897705, + "logits/rejected": -2.057265281677246, + "logps/chosen": -316.66693115234375, + "logps/rejected": -332.8795471191406, + "loss": 0.3868, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.246325969696045, + "rewards/margins": 4.390238285064697, + "rewards/rejected": -6.636564254760742, + "step": 1233 + }, + { + "epoch": 0.26, + "learning_rate": 1.4899159663865548e-05, + "logits/chosen": -2.131394386291504, + "logits/rejected": -2.0964486598968506, + "logps/chosen": -300.88348388671875, + "logps/rejected": -331.24310302734375, + "loss": 0.3583, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.0885329246520996, + "rewards/margins": 3.7785253524780273, + "rewards/rejected": -6.867058277130127, + "step": 1234 + }, + { + "epoch": 0.26, + "learning_rate": 1.489495798319328e-05, + "logits/chosen": -2.165121555328369, + "logits/rejected": -2.1415867805480957, + "logps/chosen": -395.57470703125, + "logps/rejected": -383.6015625, + "loss": 0.4236, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.561150074005127, + "rewards/margins": 3.0468127727508545, + "rewards/rejected": -6.607962608337402, + "step": 1235 + }, + { + "epoch": 0.26, + "learning_rate": 1.489075630252101e-05, + "logits/chosen": -2.044487476348877, + "logits/rejected": -2.0329229831695557, + "logps/chosen": -236.61151123046875, + "logps/rejected": -259.8937683105469, + "loss": 0.1177, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.607759714126587, + "rewards/margins": 5.409193992614746, + "rewards/rejected": -8.01695442199707, + "step": 1236 + }, + { + "epoch": 0.26, + "learning_rate": 1.4886554621848742e-05, + "logits/chosen": -2.4068093299865723, + "logits/rejected": -2.0248050689697266, + "logps/chosen": -343.762451171875, + "logps/rejected": -284.1758117675781, + "loss": 0.365, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.357558250427246, + "rewards/margins": 3.0251946449279785, + "rewards/rejected": -6.382753372192383, + "step": 1237 + }, + { + "epoch": 0.26, + "learning_rate": 1.4882352941176472e-05, + "logits/chosen": -2.171760082244873, + "logits/rejected": -1.9044723510742188, + "logps/chosen": -325.18353271484375, + "logps/rejected": -402.3902587890625, + "loss": 0.3132, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.185366630554199, + "rewards/margins": 4.973031520843506, + "rewards/rejected": -8.158397674560547, + "step": 1238 + }, + { + "epoch": 0.26, + "learning_rate": 1.4878151260504204e-05, + "logits/chosen": -2.098283529281616, + "logits/rejected": -1.8983526229858398, + "logps/chosen": -388.46197509765625, + "logps/rejected": -354.659423828125, + "loss": 0.2702, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.1840741634368896, + "rewards/margins": 4.17909574508667, + "rewards/rejected": -6.363170146942139, + "step": 1239 + }, + { + "epoch": 0.26, + "learning_rate": 1.4873949579831934e-05, + "logits/chosen": -2.224400520324707, + "logits/rejected": -2.192211627960205, + "logps/chosen": -285.0545654296875, + "logps/rejected": -337.7933654785156, + "loss": 0.3727, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.2407422065734863, + "rewards/margins": 2.813994884490967, + "rewards/rejected": -6.054736614227295, + "step": 1240 + }, + { + "epoch": 0.26, + "learning_rate": 1.4869747899159666e-05, + "logits/chosen": -2.2433032989501953, + "logits/rejected": -1.9871091842651367, + "logps/chosen": -332.70184326171875, + "logps/rejected": -339.0936279296875, + "loss": 0.7457, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.3039932250976562, + "rewards/margins": 3.8591604232788086, + "rewards/rejected": -7.163153648376465, + "step": 1241 + }, + { + "epoch": 0.26, + "learning_rate": 1.4865546218487396e-05, + "logits/chosen": -2.037442684173584, + "logits/rejected": -2.013826370239258, + "logps/chosen": -386.5946350097656, + "logps/rejected": -540.1484985351562, + "loss": 1.0011, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.319838762283325, + "rewards/margins": 3.8513245582580566, + "rewards/rejected": -7.171163558959961, + "step": 1242 + }, + { + "epoch": 0.26, + "learning_rate": 1.4861344537815128e-05, + "logits/chosen": -2.255568027496338, + "logits/rejected": -2.10587739944458, + "logps/chosen": -363.89447021484375, + "logps/rejected": -320.13983154296875, + "loss": 0.1884, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.78605055809021, + "rewards/margins": 3.9073190689086914, + "rewards/rejected": -5.693369388580322, + "step": 1243 + }, + { + "epoch": 0.26, + "learning_rate": 1.4857142857142858e-05, + "logits/chosen": -2.293961763381958, + "logits/rejected": -1.8324017524719238, + "logps/chosen": -414.66143798828125, + "logps/rejected": -318.05426025390625, + "loss": 0.5809, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.3593478202819824, + "rewards/margins": 3.067253589630127, + "rewards/rejected": -5.426601409912109, + "step": 1244 + }, + { + "epoch": 0.26, + "learning_rate": 1.485294117647059e-05, + "logits/chosen": -2.19781494140625, + "logits/rejected": -2.1973230838775635, + "logps/chosen": -320.99908447265625, + "logps/rejected": -331.1231689453125, + "loss": 0.302, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.4913461208343506, + "rewards/margins": 3.0858981609344482, + "rewards/rejected": -6.577244281768799, + "step": 1245 + }, + { + "epoch": 0.26, + "learning_rate": 1.484873949579832e-05, + "logits/chosen": -2.202479362487793, + "logits/rejected": -1.8314063549041748, + "logps/chosen": -324.1307678222656, + "logps/rejected": -309.2308654785156, + "loss": 0.372, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.7506794929504395, + "rewards/margins": 3.8087098598480225, + "rewards/rejected": -6.559390068054199, + "step": 1246 + }, + { + "epoch": 0.26, + "learning_rate": 1.4844537815126052e-05, + "logits/chosen": -2.0784807205200195, + "logits/rejected": -2.043990135192871, + "logps/chosen": -391.52606201171875, + "logps/rejected": -294.4495544433594, + "loss": 0.4006, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.3852477073669434, + "rewards/margins": 4.829652786254883, + "rewards/rejected": -7.214900970458984, + "step": 1247 + }, + { + "epoch": 0.26, + "learning_rate": 1.4840336134453783e-05, + "logits/chosen": -2.2601428031921387, + "logits/rejected": -1.4105021953582764, + "logps/chosen": -333.609375, + "logps/rejected": -311.25885009765625, + "loss": 0.1278, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8014662265777588, + "rewards/margins": 4.8279595375061035, + "rewards/rejected": -6.629426002502441, + "step": 1248 + }, + { + "epoch": 0.26, + "learning_rate": 1.4836134453781515e-05, + "logits/chosen": -2.1901583671569824, + "logits/rejected": -1.9587454795837402, + "logps/chosen": -262.41839599609375, + "logps/rejected": -346.70880126953125, + "loss": 0.4117, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.328275680541992, + "rewards/margins": 3.0793232917785645, + "rewards/rejected": -5.407598972320557, + "step": 1249 + }, + { + "epoch": 0.26, + "learning_rate": 1.4831932773109245e-05, + "logits/chosen": -2.3683345317840576, + "logits/rejected": -2.1028990745544434, + "logps/chosen": -375.647705078125, + "logps/rejected": -354.07373046875, + "loss": 0.3542, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.460684061050415, + "rewards/margins": 2.711263656616211, + "rewards/rejected": -4.171947956085205, + "step": 1250 + }, + { + "epoch": 0.26, + "learning_rate": 1.4827731092436977e-05, + "logits/chosen": -2.0579822063446045, + "logits/rejected": -2.1787192821502686, + "logps/chosen": -340.0102844238281, + "logps/rejected": -309.2815856933594, + "loss": 0.513, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.4277546405792236, + "rewards/margins": 3.2885842323303223, + "rewards/rejected": -5.716339111328125, + "step": 1251 + }, + { + "epoch": 0.26, + "learning_rate": 1.4823529411764707e-05, + "logits/chosen": -2.3111047744750977, + "logits/rejected": -1.9673919677734375, + "logps/chosen": -313.46514892578125, + "logps/rejected": -382.71282958984375, + "loss": 0.2811, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.5062832832336426, + "rewards/margins": 2.6285202503204346, + "rewards/rejected": -5.134803771972656, + "step": 1252 + }, + { + "epoch": 0.26, + "learning_rate": 1.4819327731092439e-05, + "logits/chosen": -2.0615389347076416, + "logits/rejected": -2.158968448638916, + "logps/chosen": -430.1170654296875, + "logps/rejected": -438.90911865234375, + "loss": 0.2845, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.925978422164917, + "rewards/margins": 3.712069272994995, + "rewards/rejected": -5.638047695159912, + "step": 1253 + }, + { + "epoch": 0.26, + "learning_rate": 1.4815126050420169e-05, + "logits/chosen": -1.9804422855377197, + "logits/rejected": -2.1082122325897217, + "logps/chosen": -276.42486572265625, + "logps/rejected": -311.48687744140625, + "loss": 0.1469, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8941290378570557, + "rewards/margins": 3.601116418838501, + "rewards/rejected": -5.495245933532715, + "step": 1254 + }, + { + "epoch": 0.26, + "learning_rate": 1.4810924369747901e-05, + "logits/chosen": -2.087886333465576, + "logits/rejected": -1.9520126581192017, + "logps/chosen": -410.7392272949219, + "logps/rejected": -337.5001525878906, + "loss": 0.4123, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7130568027496338, + "rewards/margins": 2.0264892578125, + "rewards/rejected": -3.739546298980713, + "step": 1255 + }, + { + "epoch": 0.26, + "learning_rate": 1.4806722689075633e-05, + "logits/chosen": -2.0098466873168945, + "logits/rejected": -1.8570048809051514, + "logps/chosen": -233.21795654296875, + "logps/rejected": -285.7274169921875, + "loss": 0.563, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.897686719894409, + "rewards/margins": 1.9628833532333374, + "rewards/rejected": -4.860569477081299, + "step": 1256 + }, + { + "epoch": 0.26, + "learning_rate": 1.4802521008403363e-05, + "logits/chosen": -2.0937600135803223, + "logits/rejected": -2.1659560203552246, + "logps/chosen": -307.9239501953125, + "logps/rejected": -371.7438659667969, + "loss": 0.2424, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.197117328643799, + "rewards/margins": 2.7394893169403076, + "rewards/rejected": -4.9366068840026855, + "step": 1257 + }, + { + "epoch": 0.26, + "learning_rate": 1.4798319327731095e-05, + "logits/chosen": -2.297569513320923, + "logits/rejected": -1.78489089012146, + "logps/chosen": -369.9482421875, + "logps/rejected": -388.3670654296875, + "loss": 0.1519, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9404587745666504, + "rewards/margins": 4.160933017730713, + "rewards/rejected": -6.101391792297363, + "step": 1258 + }, + { + "epoch": 0.26, + "learning_rate": 1.4794117647058825e-05, + "logits/chosen": -2.0682833194732666, + "logits/rejected": -1.9511396884918213, + "logps/chosen": -241.9205780029297, + "logps/rejected": -362.4847412109375, + "loss": 0.2041, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.0030581951141357, + "rewards/margins": 4.154212474822998, + "rewards/rejected": -6.157270431518555, + "step": 1259 + }, + { + "epoch": 0.26, + "learning_rate": 1.4789915966386557e-05, + "logits/chosen": -2.019435405731201, + "logits/rejected": -2.0918893814086914, + "logps/chosen": -296.945068359375, + "logps/rejected": -288.6117858886719, + "loss": 0.4968, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.8838021755218506, + "rewards/margins": 2.7718918323516846, + "rewards/rejected": -5.655694007873535, + "step": 1260 + }, + { + "epoch": 0.26, + "learning_rate": 1.4785714285714287e-05, + "logits/chosen": -2.21256160736084, + "logits/rejected": -1.863285779953003, + "logps/chosen": -422.43719482421875, + "logps/rejected": -348.0411376953125, + "loss": 0.4344, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.2615697383880615, + "rewards/margins": 2.5435805320739746, + "rewards/rejected": -4.805150032043457, + "step": 1261 + }, + { + "epoch": 0.26, + "learning_rate": 1.478151260504202e-05, + "logits/chosen": -2.252715587615967, + "logits/rejected": -2.1746504306793213, + "logps/chosen": -312.0749206542969, + "logps/rejected": -298.6753845214844, + "loss": 0.5479, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.2686710357666016, + "rewards/margins": 3.1939423084259033, + "rewards/rejected": -5.462613582611084, + "step": 1262 + }, + { + "epoch": 0.26, + "learning_rate": 1.477731092436975e-05, + "logits/chosen": -2.375532865524292, + "logits/rejected": -1.6960687637329102, + "logps/chosen": -349.6080017089844, + "logps/rejected": -293.5823974609375, + "loss": 0.2514, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8709702491760254, + "rewards/margins": 3.1332614421844482, + "rewards/rejected": -5.0042314529418945, + "step": 1263 + }, + { + "epoch": 0.26, + "learning_rate": 1.4773109243697481e-05, + "logits/chosen": -1.8829500675201416, + "logits/rejected": -2.027601957321167, + "logps/chosen": -220.30653381347656, + "logps/rejected": -346.98382568359375, + "loss": 0.4497, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.29658842086792, + "rewards/margins": 2.055175542831421, + "rewards/rejected": -4.351763725280762, + "step": 1264 + }, + { + "epoch": 0.26, + "learning_rate": 1.4768907563025212e-05, + "logits/chosen": -2.33864688873291, + "logits/rejected": -1.9569733142852783, + "logps/chosen": -424.92462158203125, + "logps/rejected": -360.5376892089844, + "loss": 0.3562, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.835164189338684, + "rewards/margins": 3.688948154449463, + "rewards/rejected": -5.524112224578857, + "step": 1265 + }, + { + "epoch": 0.26, + "learning_rate": 1.4764705882352944e-05, + "logits/chosen": -2.4514994621276855, + "logits/rejected": -2.103682041168213, + "logps/chosen": -295.4800109863281, + "logps/rejected": -256.933349609375, + "loss": 0.2922, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.148221969604492, + "rewards/margins": 3.3808376789093018, + "rewards/rejected": -5.529059886932373, + "step": 1266 + }, + { + "epoch": 0.27, + "learning_rate": 1.4760504201680674e-05, + "logits/chosen": -2.0571556091308594, + "logits/rejected": -1.548018217086792, + "logps/chosen": -398.4248046875, + "logps/rejected": -276.8052978515625, + "loss": 0.4752, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.9338788986206055, + "rewards/margins": 2.7316012382507324, + "rewards/rejected": -5.665480613708496, + "step": 1267 + }, + { + "epoch": 0.27, + "learning_rate": 1.4756302521008406e-05, + "logits/chosen": -2.417056083679199, + "logits/rejected": -1.7876685857772827, + "logps/chosen": -492.06158447265625, + "logps/rejected": -354.4727783203125, + "loss": 0.7385, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.2213516235351562, + "rewards/margins": 2.8694396018981934, + "rewards/rejected": -5.09079122543335, + "step": 1268 + }, + { + "epoch": 0.27, + "learning_rate": 1.4752100840336136e-05, + "logits/chosen": -1.624762773513794, + "logits/rejected": -1.976487398147583, + "logps/chosen": -233.65724182128906, + "logps/rejected": -442.7911682128906, + "loss": 0.2496, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4648206233978271, + "rewards/margins": 4.145246505737305, + "rewards/rejected": -5.610067367553711, + "step": 1269 + }, + { + "epoch": 0.27, + "learning_rate": 1.4747899159663868e-05, + "logits/chosen": -2.186835289001465, + "logits/rejected": -2.3797495365142822, + "logps/chosen": -293.5005187988281, + "logps/rejected": -321.178466796875, + "loss": 0.1936, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6081080436706543, + "rewards/margins": 3.19716739654541, + "rewards/rejected": -4.8052754402160645, + "step": 1270 + }, + { + "epoch": 0.27, + "learning_rate": 1.4743697478991598e-05, + "logits/chosen": -2.3055331707000732, + "logits/rejected": -2.0779318809509277, + "logps/chosen": -233.13351440429688, + "logps/rejected": -241.09288024902344, + "loss": 0.3474, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.4241771697998047, + "rewards/margins": 3.886657476425171, + "rewards/rejected": -6.310834884643555, + "step": 1271 + }, + { + "epoch": 0.27, + "learning_rate": 1.473949579831933e-05, + "logits/chosen": -2.095747470855713, + "logits/rejected": -1.788177251815796, + "logps/chosen": -359.4222106933594, + "logps/rejected": -326.16900634765625, + "loss": 0.1375, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9823145866394043, + "rewards/margins": 5.11967658996582, + "rewards/rejected": -7.101991653442383, + "step": 1272 + }, + { + "epoch": 0.27, + "learning_rate": 1.473529411764706e-05, + "logits/chosen": -2.210921287536621, + "logits/rejected": -1.7555038928985596, + "logps/chosen": -348.13568115234375, + "logps/rejected": -311.5467529296875, + "loss": 0.294, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7462973594665527, + "rewards/margins": 3.652886390686035, + "rewards/rejected": -5.399183750152588, + "step": 1273 + }, + { + "epoch": 0.27, + "learning_rate": 1.4731092436974792e-05, + "logits/chosen": -2.228372097015381, + "logits/rejected": -1.7779014110565186, + "logps/chosen": -242.40841674804688, + "logps/rejected": -266.3146057128906, + "loss": 0.4707, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.0871236324310303, + "rewards/margins": 1.6827473640441895, + "rewards/rejected": -3.769871234893799, + "step": 1274 + }, + { + "epoch": 0.27, + "learning_rate": 1.4726890756302522e-05, + "logits/chosen": -2.227362632751465, + "logits/rejected": -2.0143706798553467, + "logps/chosen": -365.6995849609375, + "logps/rejected": -424.6846923828125, + "loss": 0.0639, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7715792655944824, + "rewards/margins": 5.921769142150879, + "rewards/rejected": -6.693347930908203, + "step": 1275 + }, + { + "epoch": 0.27, + "learning_rate": 1.4722689075630254e-05, + "logits/chosen": -2.287069797515869, + "logits/rejected": -2.1423909664154053, + "logps/chosen": -382.0042419433594, + "logps/rejected": -445.89288330078125, + "loss": 0.3104, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9299278259277344, + "rewards/margins": 3.295947551727295, + "rewards/rejected": -5.225875377655029, + "step": 1276 + }, + { + "epoch": 0.27, + "learning_rate": 1.4718487394957986e-05, + "logits/chosen": -2.1588196754455566, + "logits/rejected": -2.1907949447631836, + "logps/chosen": -332.0426025390625, + "logps/rejected": -431.2523193359375, + "loss": 0.2578, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.533712387084961, + "rewards/margins": 3.361938714981079, + "rewards/rejected": -4.895650863647461, + "step": 1277 + }, + { + "epoch": 0.27, + "learning_rate": 1.4714285714285716e-05, + "logits/chosen": -2.192356586456299, + "logits/rejected": -2.303767204284668, + "logps/chosen": -349.8454895019531, + "logps/rejected": -368.3902893066406, + "loss": 0.4539, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9580715894699097, + "rewards/margins": 2.893841028213501, + "rewards/rejected": -4.851912498474121, + "step": 1278 + }, + { + "epoch": 0.27, + "learning_rate": 1.4710084033613448e-05, + "logits/chosen": -2.2149999141693115, + "logits/rejected": -1.8579695224761963, + "logps/chosen": -246.74853515625, + "logps/rejected": -243.54165649414062, + "loss": 0.3374, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.1781527996063232, + "rewards/margins": 2.468636989593506, + "rewards/rejected": -4.646790027618408, + "step": 1279 + }, + { + "epoch": 0.27, + "learning_rate": 1.4705882352941179e-05, + "logits/chosen": -2.1588666439056396, + "logits/rejected": -1.8670859336853027, + "logps/chosen": -348.0268859863281, + "logps/rejected": -362.1605529785156, + "loss": 0.1768, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1999893188476562, + "rewards/margins": 3.294858455657959, + "rewards/rejected": -5.494847774505615, + "step": 1280 + }, + { + "epoch": 0.27, + "learning_rate": 1.470168067226891e-05, + "logits/chosen": -2.2778701782226562, + "logits/rejected": -1.9129645824432373, + "logps/chosen": -324.49468994140625, + "logps/rejected": -364.21978759765625, + "loss": 0.2615, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.301697015762329, + "rewards/margins": 3.9454219341278076, + "rewards/rejected": -6.2471184730529785, + "step": 1281 + }, + { + "epoch": 0.27, + "learning_rate": 1.469747899159664e-05, + "logits/chosen": -2.1504523754119873, + "logits/rejected": -2.355855941772461, + "logps/chosen": -310.143798828125, + "logps/rejected": -343.95294189453125, + "loss": 0.6833, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.759343147277832, + "rewards/margins": 1.7168376445770264, + "rewards/rejected": -4.4761810302734375, + "step": 1282 + }, + { + "epoch": 0.27, + "learning_rate": 1.4693277310924373e-05, + "logits/chosen": -2.0251173973083496, + "logits/rejected": -1.9537947177886963, + "logps/chosen": -400.30206298828125, + "logps/rejected": -375.833984375, + "loss": 0.3784, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8339409828186035, + "rewards/margins": 2.0957093238830566, + "rewards/rejected": -3.92965030670166, + "step": 1283 + }, + { + "epoch": 0.27, + "learning_rate": 1.4689075630252103e-05, + "logits/chosen": -2.236034870147705, + "logits/rejected": -2.0384464263916016, + "logps/chosen": -464.8644104003906, + "logps/rejected": -341.2873229980469, + "loss": 0.1873, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4539563655853271, + "rewards/margins": 5.187774181365967, + "rewards/rejected": -6.641730308532715, + "step": 1284 + }, + { + "epoch": 0.27, + "learning_rate": 1.4684873949579831e-05, + "logits/chosen": -2.086874485015869, + "logits/rejected": -1.9425218105316162, + "logps/chosen": -325.1651916503906, + "logps/rejected": -437.2560729980469, + "loss": 0.2233, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.2628142833709717, + "rewards/margins": 4.152487754821777, + "rewards/rejected": -6.415302276611328, + "step": 1285 + }, + { + "epoch": 0.27, + "learning_rate": 1.4680672268907563e-05, + "logits/chosen": -2.2547316551208496, + "logits/rejected": -1.5304312705993652, + "logps/chosen": -377.79949951171875, + "logps/rejected": -314.02484130859375, + "loss": 0.174, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.972707748413086, + "rewards/margins": 3.568516969680786, + "rewards/rejected": -5.541224956512451, + "step": 1286 + }, + { + "epoch": 0.27, + "learning_rate": 1.4676470588235294e-05, + "logits/chosen": -1.993772029876709, + "logits/rejected": -2.102482795715332, + "logps/chosen": -369.169677734375, + "logps/rejected": -431.16851806640625, + "loss": 0.1246, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.942072868347168, + "rewards/margins": 3.7872238159179688, + "rewards/rejected": -5.7292962074279785, + "step": 1287 + }, + { + "epoch": 0.27, + "learning_rate": 1.4672268907563025e-05, + "logits/chosen": -2.1982686519622803, + "logits/rejected": -1.89798104763031, + "logps/chosen": -372.82952880859375, + "logps/rejected": -294.7918701171875, + "loss": 0.6966, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.335519313812256, + "rewards/margins": 2.2533819675445557, + "rewards/rejected": -4.588900566101074, + "step": 1288 + }, + { + "epoch": 0.27, + "learning_rate": 1.4668067226890756e-05, + "logits/chosen": -2.4971814155578613, + "logits/rejected": -2.1143112182617188, + "logps/chosen": -419.9573669433594, + "logps/rejected": -348.7078552246094, + "loss": 0.2194, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.1714119911193848, + "rewards/margins": 3.3058905601501465, + "rewards/rejected": -5.477302551269531, + "step": 1289 + }, + { + "epoch": 0.27, + "learning_rate": 1.4663865546218488e-05, + "logits/chosen": -2.149749279022217, + "logits/rejected": -1.6361687183380127, + "logps/chosen": -385.1405334472656, + "logps/rejected": -299.35797119140625, + "loss": 0.3123, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.231283187866211, + "rewards/margins": 3.6453938484191895, + "rewards/rejected": -5.8766770362854, + "step": 1290 + }, + { + "epoch": 0.27, + "learning_rate": 1.465966386554622e-05, + "logits/chosen": -2.481612205505371, + "logits/rejected": -1.9378042221069336, + "logps/chosen": -379.52447509765625, + "logps/rejected": -298.8292541503906, + "loss": 0.3319, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5607129335403442, + "rewards/margins": 3.0255861282348633, + "rewards/rejected": -4.586298942565918, + "step": 1291 + }, + { + "epoch": 0.27, + "learning_rate": 1.465546218487395e-05, + "logits/chosen": -2.3154311180114746, + "logits/rejected": -2.3894214630126953, + "logps/chosen": -388.3495178222656, + "logps/rejected": -428.69390869140625, + "loss": 0.5803, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.547250747680664, + "rewards/margins": 2.6211979389190674, + "rewards/rejected": -4.168448448181152, + "step": 1292 + }, + { + "epoch": 0.27, + "learning_rate": 1.4651260504201682e-05, + "logits/chosen": -2.0681214332580566, + "logits/rejected": -2.1275618076324463, + "logps/chosen": -288.99432373046875, + "logps/rejected": -265.9241943359375, + "loss": 1.1976, + "rewards/accuracies": 0.5625, + "rewards/chosen": -3.152647018432617, + "rewards/margins": 0.7481802105903625, + "rewards/rejected": -3.900827407836914, + "step": 1293 + }, + { + "epoch": 0.27, + "learning_rate": 1.4647058823529412e-05, + "logits/chosen": -2.1913821697235107, + "logits/rejected": -2.0273146629333496, + "logps/chosen": -326.0242919921875, + "logps/rejected": -324.28851318359375, + "loss": 0.2837, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.2924721240997314, + "rewards/margins": 3.1881661415100098, + "rewards/rejected": -5.48063850402832, + "step": 1294 + }, + { + "epoch": 0.27, + "learning_rate": 1.4642857142857144e-05, + "logits/chosen": -2.4202380180358887, + "logits/rejected": -1.8570458889007568, + "logps/chosen": -377.66375732421875, + "logps/rejected": -364.09552001953125, + "loss": 0.367, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.6750707626342773, + "rewards/margins": 3.2902719974517822, + "rewards/rejected": -5.9653425216674805, + "step": 1295 + }, + { + "epoch": 0.27, + "learning_rate": 1.4638655462184874e-05, + "logits/chosen": -2.4300355911254883, + "logits/rejected": -2.0552690029144287, + "logps/chosen": -327.7197570800781, + "logps/rejected": -359.2673034667969, + "loss": 0.2016, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.563568592071533, + "rewards/margins": 3.2751879692077637, + "rewards/rejected": -5.838756561279297, + "step": 1296 + }, + { + "epoch": 0.27, + "learning_rate": 1.4634453781512606e-05, + "logits/chosen": -1.87666916847229, + "logits/rejected": -1.7819392681121826, + "logps/chosen": -388.83917236328125, + "logps/rejected": -364.4532470703125, + "loss": 0.3303, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9207514524459839, + "rewards/margins": 2.494077682495117, + "rewards/rejected": -4.414829254150391, + "step": 1297 + }, + { + "epoch": 0.27, + "learning_rate": 1.4630252100840336e-05, + "logits/chosen": -2.4306719303131104, + "logits/rejected": -1.7703864574432373, + "logps/chosen": -455.485107421875, + "logps/rejected": -354.526123046875, + "loss": 0.9332, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.7424826622009277, + "rewards/margins": 2.436265707015991, + "rewards/rejected": -5.17874813079834, + "step": 1298 + }, + { + "epoch": 0.27, + "learning_rate": 1.4626050420168068e-05, + "logits/chosen": -2.37957763671875, + "logits/rejected": -1.4981766939163208, + "logps/chosen": -322.4650573730469, + "logps/rejected": -235.86851501464844, + "loss": 0.5699, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.1211328506469727, + "rewards/margins": 3.071442127227783, + "rewards/rejected": -6.192574977874756, + "step": 1299 + }, + { + "epoch": 0.27, + "learning_rate": 1.4621848739495798e-05, + "logits/chosen": -2.272067070007324, + "logits/rejected": -2.377992630004883, + "logps/chosen": -340.32928466796875, + "logps/rejected": -375.86334228515625, + "loss": 0.698, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5345244407653809, + "rewards/margins": 1.9435549974441528, + "rewards/rejected": -3.478079319000244, + "step": 1300 + }, + { + "epoch": 0.27, + "learning_rate": 1.461764705882353e-05, + "logits/chosen": -2.0112035274505615, + "logits/rejected": -1.68162202835083, + "logps/chosen": -326.16461181640625, + "logps/rejected": -250.62173461914062, + "loss": 0.2758, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8275446891784668, + "rewards/margins": 3.0117392539978027, + "rewards/rejected": -4.8392839431762695, + "step": 1301 + }, + { + "epoch": 0.27, + "learning_rate": 1.461344537815126e-05, + "logits/chosen": -2.1388676166534424, + "logits/rejected": -1.4437810182571411, + "logps/chosen": -277.62640380859375, + "logps/rejected": -212.07789611816406, + "loss": 0.4673, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.8518950939178467, + "rewards/margins": 2.283548355102539, + "rewards/rejected": -5.135443210601807, + "step": 1302 + }, + { + "epoch": 0.27, + "learning_rate": 1.4609243697478992e-05, + "logits/chosen": -2.077181577682495, + "logits/rejected": -1.7752044200897217, + "logps/chosen": -423.83868408203125, + "logps/rejected": -338.0631408691406, + "loss": 0.4177, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6175929307937622, + "rewards/margins": 2.147789716720581, + "rewards/rejected": -3.7653825283050537, + "step": 1303 + }, + { + "epoch": 0.27, + "learning_rate": 1.4605042016806723e-05, + "logits/chosen": -2.079444408416748, + "logits/rejected": -1.9716310501098633, + "logps/chosen": -331.0086364746094, + "logps/rejected": -394.33447265625, + "loss": 0.1341, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6591163873672485, + "rewards/margins": 3.6345081329345703, + "rewards/rejected": -5.2936248779296875, + "step": 1304 + }, + { + "epoch": 0.27, + "learning_rate": 1.4600840336134454e-05, + "logits/chosen": -2.2736055850982666, + "logits/rejected": -1.9814271926879883, + "logps/chosen": -369.89117431640625, + "logps/rejected": -418.7023620605469, + "loss": 0.2673, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.251814603805542, + "rewards/margins": 3.0796754360198975, + "rewards/rejected": -5.3314900398254395, + "step": 1305 + }, + { + "epoch": 0.27, + "learning_rate": 1.4596638655462185e-05, + "logits/chosen": -2.195260763168335, + "logits/rejected": -1.8201298713684082, + "logps/chosen": -327.15350341796875, + "logps/rejected": -280.43701171875, + "loss": 0.2084, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5229108333587646, + "rewards/margins": 3.891502857208252, + "rewards/rejected": -5.414413928985596, + "step": 1306 + }, + { + "epoch": 0.27, + "learning_rate": 1.4592436974789917e-05, + "logits/chosen": -2.049452066421509, + "logits/rejected": -1.7393059730529785, + "logps/chosen": -297.6163635253906, + "logps/rejected": -290.0152893066406, + "loss": 0.0824, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4388139247894287, + "rewards/margins": 3.801703929901123, + "rewards/rejected": -6.240517616271973, + "step": 1307 + }, + { + "epoch": 0.27, + "learning_rate": 1.4588235294117647e-05, + "logits/chosen": -2.1753246784210205, + "logits/rejected": -2.2490053176879883, + "logps/chosen": -290.8746032714844, + "logps/rejected": -377.81585693359375, + "loss": 0.6163, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.167635917663574, + "rewards/margins": 2.7528676986694336, + "rewards/rejected": -5.920504093170166, + "step": 1308 + }, + { + "epoch": 0.27, + "learning_rate": 1.4584033613445379e-05, + "logits/chosen": -2.2561471462249756, + "logits/rejected": -1.9270517826080322, + "logps/chosen": -389.7734375, + "logps/rejected": -400.7041015625, + "loss": 0.3211, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.382431983947754, + "rewards/margins": 3.2687344551086426, + "rewards/rejected": -4.6511664390563965, + "step": 1309 + }, + { + "epoch": 0.27, + "learning_rate": 1.4579831932773109e-05, + "logits/chosen": -2.1898486614227295, + "logits/rejected": -1.756376028060913, + "logps/chosen": -374.4280700683594, + "logps/rejected": -323.3240051269531, + "loss": 0.3636, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.966078519821167, + "rewards/margins": 2.720151424407959, + "rewards/rejected": -4.686229705810547, + "step": 1310 + }, + { + "epoch": 0.27, + "learning_rate": 1.4575630252100841e-05, + "logits/chosen": -2.33266544342041, + "logits/rejected": -1.8984947204589844, + "logps/chosen": -522.5299072265625, + "logps/rejected": -358.5198669433594, + "loss": 0.4265, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.7072179317474365, + "rewards/margins": 3.603027820587158, + "rewards/rejected": -6.310245990753174, + "step": 1311 + }, + { + "epoch": 0.27, + "learning_rate": 1.4571428571428573e-05, + "logits/chosen": -2.269780158996582, + "logits/rejected": -1.7072854042053223, + "logps/chosen": -409.7075500488281, + "logps/rejected": -286.8658142089844, + "loss": 0.2319, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6139099597930908, + "rewards/margins": 4.00421667098999, + "rewards/rejected": -5.61812686920166, + "step": 1312 + }, + { + "epoch": 0.27, + "learning_rate": 1.4567226890756303e-05, + "logits/chosen": -2.279707908630371, + "logits/rejected": -1.958705186843872, + "logps/chosen": -284.9219970703125, + "logps/rejected": -266.9258117675781, + "loss": 0.6472, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.398683786392212, + "rewards/margins": 1.7753677368164062, + "rewards/rejected": -4.174051284790039, + "step": 1313 + }, + { + "epoch": 0.27, + "learning_rate": 1.4563025210084035e-05, + "logits/chosen": -2.028229236602783, + "logits/rejected": -2.01723575592041, + "logps/chosen": -225.2945556640625, + "logps/rejected": -348.3422546386719, + "loss": 0.5339, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.0043818950653076, + "rewards/margins": 2.426387310028076, + "rewards/rejected": -4.430769443511963, + "step": 1314 + }, + { + "epoch": 0.28, + "learning_rate": 1.4558823529411765e-05, + "logits/chosen": -2.006513833999634, + "logits/rejected": -1.862075924873352, + "logps/chosen": -261.91943359375, + "logps/rejected": -263.3758850097656, + "loss": 0.4764, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.2605090141296387, + "rewards/margins": 3.3402509689331055, + "rewards/rejected": -5.600759983062744, + "step": 1315 + }, + { + "epoch": 0.28, + "learning_rate": 1.4554621848739497e-05, + "logits/chosen": -2.073103189468384, + "logits/rejected": -1.504417896270752, + "logps/chosen": -366.36212158203125, + "logps/rejected": -351.7047424316406, + "loss": 0.29, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.465597152709961, + "rewards/margins": 3.9981558322906494, + "rewards/rejected": -6.463752746582031, + "step": 1316 + }, + { + "epoch": 0.28, + "learning_rate": 1.4550420168067227e-05, + "logits/chosen": -1.9902853965759277, + "logits/rejected": -1.7849637269973755, + "logps/chosen": -382.65234375, + "logps/rejected": -327.2167663574219, + "loss": 0.2324, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.443115472793579, + "rewards/margins": 3.523207426071167, + "rewards/rejected": -4.966322898864746, + "step": 1317 + }, + { + "epoch": 0.28, + "learning_rate": 1.454621848739496e-05, + "logits/chosen": -2.1147031784057617, + "logits/rejected": -1.4941411018371582, + "logps/chosen": -310.183837890625, + "logps/rejected": -373.72235107421875, + "loss": 0.3102, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.981034278869629, + "rewards/margins": 3.8686423301696777, + "rewards/rejected": -5.849676609039307, + "step": 1318 + }, + { + "epoch": 0.28, + "learning_rate": 1.454201680672269e-05, + "logits/chosen": -2.1353440284729004, + "logits/rejected": -1.4690742492675781, + "logps/chosen": -449.19293212890625, + "logps/rejected": -315.3084716796875, + "loss": 0.3037, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3374102115631104, + "rewards/margins": 3.0057873725891113, + "rewards/rejected": -4.343197822570801, + "step": 1319 + }, + { + "epoch": 0.28, + "learning_rate": 1.4537815126050421e-05, + "logits/chosen": -2.21055006980896, + "logits/rejected": -2.2417702674865723, + "logps/chosen": -322.8713684082031, + "logps/rejected": -338.9656677246094, + "loss": 0.2249, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5615465641021729, + "rewards/margins": 3.093017578125, + "rewards/rejected": -4.654563903808594, + "step": 1320 + }, + { + "epoch": 0.28, + "learning_rate": 1.4533613445378152e-05, + "logits/chosen": -2.0902719497680664, + "logits/rejected": -1.757875919342041, + "logps/chosen": -409.47332763671875, + "logps/rejected": -330.07220458984375, + "loss": 0.7013, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.09310245513916, + "rewards/margins": 2.0264174938201904, + "rewards/rejected": -4.11952018737793, + "step": 1321 + }, + { + "epoch": 0.28, + "learning_rate": 1.4529411764705883e-05, + "logits/chosen": -1.9497241973876953, + "logits/rejected": -1.6293092966079712, + "logps/chosen": -269.0858459472656, + "logps/rejected": -393.73101806640625, + "loss": 0.1908, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9988043308258057, + "rewards/margins": 4.232430458068848, + "rewards/rejected": -6.231234550476074, + "step": 1322 + }, + { + "epoch": 0.28, + "learning_rate": 1.4525210084033614e-05, + "logits/chosen": -1.8930530548095703, + "logits/rejected": -1.6600115299224854, + "logps/chosen": -306.607666015625, + "logps/rejected": -291.586181640625, + "loss": 0.1753, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6383129358291626, + "rewards/margins": 4.642019271850586, + "rewards/rejected": -6.280332088470459, + "step": 1323 + }, + { + "epoch": 0.28, + "learning_rate": 1.4521008403361346e-05, + "logits/chosen": -2.08837890625, + "logits/rejected": -1.6526830196380615, + "logps/chosen": -401.0961608886719, + "logps/rejected": -346.9503173828125, + "loss": 0.4379, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.025801181793213, + "rewards/margins": 2.590909957885742, + "rewards/rejected": -4.616711139678955, + "step": 1324 + }, + { + "epoch": 0.28, + "learning_rate": 1.4516806722689076e-05, + "logits/chosen": -2.074071168899536, + "logits/rejected": -1.504553198814392, + "logps/chosen": -343.7025146484375, + "logps/rejected": -293.0475158691406, + "loss": 0.3903, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.9261579513549805, + "rewards/margins": 2.914755344390869, + "rewards/rejected": -5.840913772583008, + "step": 1325 + }, + { + "epoch": 0.28, + "learning_rate": 1.4512605042016808e-05, + "logits/chosen": -2.2069931030273438, + "logits/rejected": -1.898561716079712, + "logps/chosen": -250.13214111328125, + "logps/rejected": -231.3361053466797, + "loss": 0.1909, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.880393624305725, + "rewards/margins": 4.386942386627197, + "rewards/rejected": -6.267335891723633, + "step": 1326 + }, + { + "epoch": 0.28, + "learning_rate": 1.4508403361344538e-05, + "logits/chosen": -1.9026392698287964, + "logits/rejected": -1.8256945610046387, + "logps/chosen": -381.3522644042969, + "logps/rejected": -377.548095703125, + "loss": 0.3133, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.120358943939209, + "rewards/margins": 2.705876350402832, + "rewards/rejected": -4.826234817504883, + "step": 1327 + }, + { + "epoch": 0.28, + "learning_rate": 1.450420168067227e-05, + "logits/chosen": -1.9654139280319214, + "logits/rejected": -2.182123899459839, + "logps/chosen": -359.7715759277344, + "logps/rejected": -378.7041931152344, + "loss": 0.7192, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.767465353012085, + "rewards/margins": 1.6621711254119873, + "rewards/rejected": -3.4296364784240723, + "step": 1328 + }, + { + "epoch": 0.28, + "learning_rate": 1.45e-05, + "logits/chosen": -2.1207194328308105, + "logits/rejected": -1.7894389629364014, + "logps/chosen": -286.53564453125, + "logps/rejected": -209.88235473632812, + "loss": 0.1868, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.2361412048339844, + "rewards/margins": 3.677762985229492, + "rewards/rejected": -5.913904190063477, + "step": 1329 + }, + { + "epoch": 0.28, + "learning_rate": 1.4495798319327732e-05, + "logits/chosen": -2.329207420349121, + "logits/rejected": -1.724953055381775, + "logps/chosen": -291.082275390625, + "logps/rejected": -210.01910400390625, + "loss": 0.502, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.560848593711853, + "rewards/margins": 1.8143994808197021, + "rewards/rejected": -3.3752479553222656, + "step": 1330 + }, + { + "epoch": 0.28, + "learning_rate": 1.4491596638655462e-05, + "logits/chosen": -2.0099661350250244, + "logits/rejected": -2.170433282852173, + "logps/chosen": -372.9079284667969, + "logps/rejected": -428.889404296875, + "loss": 0.1353, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3571659326553345, + "rewards/margins": 4.624889373779297, + "rewards/rejected": -5.982055187225342, + "step": 1331 + }, + { + "epoch": 0.28, + "learning_rate": 1.4487394957983194e-05, + "logits/chosen": -1.8971928358078003, + "logits/rejected": -1.4239734411239624, + "logps/chosen": -375.3805847167969, + "logps/rejected": -327.14764404296875, + "loss": 0.2606, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8845016956329346, + "rewards/margins": 3.621929883956909, + "rewards/rejected": -5.506431579589844, + "step": 1332 + }, + { + "epoch": 0.28, + "learning_rate": 1.4483193277310924e-05, + "logits/chosen": -2.2307510375976562, + "logits/rejected": -1.8042258024215698, + "logps/chosen": -346.00054931640625, + "logps/rejected": -318.7489013671875, + "loss": 0.4428, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.0084006786346436, + "rewards/margins": 2.5001821517944336, + "rewards/rejected": -4.508583068847656, + "step": 1333 + }, + { + "epoch": 0.28, + "learning_rate": 1.4478991596638656e-05, + "logits/chosen": -1.5064363479614258, + "logits/rejected": -1.740829586982727, + "logps/chosen": -323.7245788574219, + "logps/rejected": -345.5672912597656, + "loss": 0.2669, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2592781782150269, + "rewards/margins": 3.6023240089416504, + "rewards/rejected": -4.861601829528809, + "step": 1334 + }, + { + "epoch": 0.28, + "learning_rate": 1.4474789915966388e-05, + "logits/chosen": -2.2216014862060547, + "logits/rejected": -2.0204238891601562, + "logps/chosen": -311.18768310546875, + "logps/rejected": -392.4696350097656, + "loss": 0.1025, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7154901027679443, + "rewards/margins": 3.872194290161133, + "rewards/rejected": -5.587684631347656, + "step": 1335 + }, + { + "epoch": 0.28, + "learning_rate": 1.4470588235294118e-05, + "logits/chosen": -1.9822871685028076, + "logits/rejected": -1.7488360404968262, + "logps/chosen": -280.7610168457031, + "logps/rejected": -235.25970458984375, + "loss": 0.321, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.3817272186279297, + "rewards/margins": 2.813412666320801, + "rewards/rejected": -5.1951398849487305, + "step": 1336 + }, + { + "epoch": 0.28, + "learning_rate": 1.446638655462185e-05, + "logits/chosen": -2.03188419342041, + "logits/rejected": -1.7353472709655762, + "logps/chosen": -416.252685546875, + "logps/rejected": -409.362060546875, + "loss": 0.2245, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6621981859207153, + "rewards/margins": 3.5806288719177246, + "rewards/rejected": -4.24282693862915, + "step": 1337 + }, + { + "epoch": 0.28, + "learning_rate": 1.446218487394958e-05, + "logits/chosen": -2.3418874740600586, + "logits/rejected": -2.1813251972198486, + "logps/chosen": -511.1329650878906, + "logps/rejected": -414.8935546875, + "loss": 0.1129, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8967446088790894, + "rewards/margins": 3.791290521621704, + "rewards/rejected": -4.688035011291504, + "step": 1338 + }, + { + "epoch": 0.28, + "learning_rate": 1.4457983193277312e-05, + "logits/chosen": -2.1939597129821777, + "logits/rejected": -2.1087088584899902, + "logps/chosen": -262.8658752441406, + "logps/rejected": -281.9622802734375, + "loss": 0.5942, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.960983157157898, + "rewards/margins": 1.6198943853378296, + "rewards/rejected": -3.5808775424957275, + "step": 1339 + }, + { + "epoch": 0.28, + "learning_rate": 1.4453781512605043e-05, + "logits/chosen": -1.946852207183838, + "logits/rejected": -2.133152484893799, + "logps/chosen": -316.7564697265625, + "logps/rejected": -367.07684326171875, + "loss": 0.5389, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.003140926361084, + "rewards/margins": 2.608355760574341, + "rewards/rejected": -4.611496925354004, + "step": 1340 + }, + { + "epoch": 0.28, + "learning_rate": 1.4449579831932775e-05, + "logits/chosen": -2.0431888103485107, + "logits/rejected": -2.161670684814453, + "logps/chosen": -264.0682373046875, + "logps/rejected": -371.82293701171875, + "loss": 0.0378, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.340522050857544, + "rewards/margins": 4.733572483062744, + "rewards/rejected": -6.074094772338867, + "step": 1341 + }, + { + "epoch": 0.28, + "learning_rate": 1.4445378151260505e-05, + "logits/chosen": -2.390133857727051, + "logits/rejected": -2.0135855674743652, + "logps/chosen": -314.830322265625, + "logps/rejected": -286.2728271484375, + "loss": 0.2072, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1324608325958252, + "rewards/margins": 3.554328441619873, + "rewards/rejected": -4.686789512634277, + "step": 1342 + }, + { + "epoch": 0.28, + "learning_rate": 1.4441176470588237e-05, + "logits/chosen": -2.2505812644958496, + "logits/rejected": -1.9314416646957397, + "logps/chosen": -270.4881591796875, + "logps/rejected": -309.67431640625, + "loss": 0.1867, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5261785984039307, + "rewards/margins": 4.273470878601074, + "rewards/rejected": -5.799649715423584, + "step": 1343 + }, + { + "epoch": 0.28, + "learning_rate": 1.4436974789915967e-05, + "logits/chosen": -2.0623788833618164, + "logits/rejected": -2.1383819580078125, + "logps/chosen": -438.3116455078125, + "logps/rejected": -351.8424987792969, + "loss": 0.6391, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.8496077060699463, + "rewards/margins": 2.317702531814575, + "rewards/rejected": -4.1673102378845215, + "step": 1344 + }, + { + "epoch": 0.28, + "learning_rate": 1.4432773109243699e-05, + "logits/chosen": -1.8724569082260132, + "logits/rejected": -1.6876424551010132, + "logps/chosen": -177.82357788085938, + "logps/rejected": -213.06161499023438, + "loss": 0.3606, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.142467975616455, + "rewards/margins": 2.969913959503174, + "rewards/rejected": -5.112381935119629, + "step": 1345 + }, + { + "epoch": 0.28, + "learning_rate": 1.4428571428571429e-05, + "logits/chosen": -2.0339412689208984, + "logits/rejected": -1.7853180170059204, + "logps/chosen": -353.97137451171875, + "logps/rejected": -302.7935791015625, + "loss": 0.4354, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6244193315505981, + "rewards/margins": 3.8671085834503174, + "rewards/rejected": -5.491527557373047, + "step": 1346 + }, + { + "epoch": 0.28, + "learning_rate": 1.4424369747899161e-05, + "logits/chosen": -1.5565789937973022, + "logits/rejected": -1.540840983390808, + "logps/chosen": -265.2157287597656, + "logps/rejected": -309.3747253417969, + "loss": 0.0784, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1418890953063965, + "rewards/margins": 5.682125091552734, + "rewards/rejected": -7.824014663696289, + "step": 1347 + }, + { + "epoch": 0.28, + "learning_rate": 1.4420168067226891e-05, + "logits/chosen": -1.9758803844451904, + "logits/rejected": -1.7769906520843506, + "logps/chosen": -356.5367431640625, + "logps/rejected": -361.5923767089844, + "loss": 0.487, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.301805257797241, + "rewards/margins": 1.9082293510437012, + "rewards/rejected": -4.210034370422363, + "step": 1348 + }, + { + "epoch": 0.28, + "learning_rate": 1.4415966386554623e-05, + "logits/chosen": -2.1623311042785645, + "logits/rejected": -1.3819507360458374, + "logps/chosen": -312.258056640625, + "logps/rejected": -269.7995300292969, + "loss": 0.2686, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.9637415409088135, + "rewards/margins": 3.338813066482544, + "rewards/rejected": -6.302555084228516, + "step": 1349 + }, + { + "epoch": 0.28, + "learning_rate": 1.4411764705882353e-05, + "logits/chosen": -2.185626268386841, + "logits/rejected": -1.970828890800476, + "logps/chosen": -305.52099609375, + "logps/rejected": -313.5020751953125, + "loss": 0.2481, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7606041431427002, + "rewards/margins": 3.6268043518066406, + "rewards/rejected": -5.387408256530762, + "step": 1350 + }, + { + "epoch": 0.28, + "learning_rate": 1.4407563025210085e-05, + "logits/chosen": -2.145057201385498, + "logits/rejected": -1.7470591068267822, + "logps/chosen": -342.51934814453125, + "logps/rejected": -304.72381591796875, + "loss": 0.5061, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.2816319465637207, + "rewards/margins": 2.417202949523926, + "rewards/rejected": -4.698835372924805, + "step": 1351 + }, + { + "epoch": 0.28, + "learning_rate": 1.4403361344537816e-05, + "logits/chosen": -2.219416379928589, + "logits/rejected": -2.1211817264556885, + "logps/chosen": -291.16815185546875, + "logps/rejected": -315.04644775390625, + "loss": 0.4846, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.0490496158599854, + "rewards/margins": 2.4418845176696777, + "rewards/rejected": -4.490933895111084, + "step": 1352 + }, + { + "epoch": 0.28, + "learning_rate": 1.4399159663865547e-05, + "logits/chosen": -2.3597846031188965, + "logits/rejected": -2.012449264526367, + "logps/chosen": -328.9736022949219, + "logps/rejected": -285.1427917480469, + "loss": 0.6528, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.1324307918548584, + "rewards/margins": 2.0794105529785156, + "rewards/rejected": -4.211841106414795, + "step": 1353 + }, + { + "epoch": 0.28, + "learning_rate": 1.4394957983193278e-05, + "logits/chosen": -2.3281662464141846, + "logits/rejected": -1.9261395931243896, + "logps/chosen": -277.23504638671875, + "logps/rejected": -291.3643798828125, + "loss": 0.1952, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5924992561340332, + "rewards/margins": 3.4374356269836426, + "rewards/rejected": -5.029934883117676, + "step": 1354 + }, + { + "epoch": 0.28, + "learning_rate": 1.439075630252101e-05, + "logits/chosen": -2.215686559677124, + "logits/rejected": -1.8025391101837158, + "logps/chosen": -368.61248779296875, + "logps/rejected": -334.749267578125, + "loss": 0.3733, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.251800298690796, + "rewards/margins": 3.607045888900757, + "rewards/rejected": -5.858846187591553, + "step": 1355 + }, + { + "epoch": 0.28, + "learning_rate": 1.4386554621848741e-05, + "logits/chosen": -2.04526948928833, + "logits/rejected": -1.8921620845794678, + "logps/chosen": -338.46295166015625, + "logps/rejected": -273.7668762207031, + "loss": 0.3371, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9005348682403564, + "rewards/margins": 4.121726036071777, + "rewards/rejected": -6.022260665893555, + "step": 1356 + }, + { + "epoch": 0.28, + "learning_rate": 1.4382352941176472e-05, + "logits/chosen": -2.20761775970459, + "logits/rejected": -2.3307690620422363, + "logps/chosen": -305.7279052734375, + "logps/rejected": -340.6954040527344, + "loss": 0.4888, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.021620035171509, + "rewards/margins": 4.108015537261963, + "rewards/rejected": -6.129635810852051, + "step": 1357 + }, + { + "epoch": 0.28, + "learning_rate": 1.4378151260504204e-05, + "logits/chosen": -2.196288585662842, + "logits/rejected": -2.134573459625244, + "logps/chosen": -357.29656982421875, + "logps/rejected": -398.37384033203125, + "loss": 0.2675, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2381525039672852, + "rewards/margins": 5.813896179199219, + "rewards/rejected": -7.052048683166504, + "step": 1358 + }, + { + "epoch": 0.28, + "learning_rate": 1.4373949579831934e-05, + "logits/chosen": -1.9696464538574219, + "logits/rejected": -1.7178163528442383, + "logps/chosen": -244.8395538330078, + "logps/rejected": -249.77944946289062, + "loss": 0.3165, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.1890835762023926, + "rewards/margins": 4.14453125, + "rewards/rejected": -6.333614826202393, + "step": 1359 + }, + { + "epoch": 0.28, + "learning_rate": 1.4369747899159666e-05, + "logits/chosen": -2.192800998687744, + "logits/rejected": -2.068331003189087, + "logps/chosen": -403.55072021484375, + "logps/rejected": -467.9599609375, + "loss": 0.5077, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5516631603240967, + "rewards/margins": 4.056065559387207, + "rewards/rejected": -5.607728958129883, + "step": 1360 + }, + { + "epoch": 0.28, + "learning_rate": 1.4365546218487396e-05, + "logits/chosen": -1.9563089609146118, + "logits/rejected": -1.8423224687576294, + "logps/chosen": -376.00592041015625, + "logps/rejected": -324.96246337890625, + "loss": 0.2359, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.748529314994812, + "rewards/margins": 2.0949015617370605, + "rewards/rejected": -3.843430995941162, + "step": 1361 + }, + { + "epoch": 0.28, + "learning_rate": 1.4361344537815128e-05, + "logits/chosen": -2.38944149017334, + "logits/rejected": -1.783832311630249, + "logps/chosen": -526.784423828125, + "logps/rejected": -452.2239074707031, + "loss": 0.1184, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2716701030731201, + "rewards/margins": 4.190761566162109, + "rewards/rejected": -5.462431907653809, + "step": 1362 + }, + { + "epoch": 0.29, + "learning_rate": 1.4357142857142858e-05, + "logits/chosen": -2.2259650230407715, + "logits/rejected": -1.9749398231506348, + "logps/chosen": -462.24713134765625, + "logps/rejected": -349.829833984375, + "loss": 0.1995, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6469653844833374, + "rewards/margins": 4.059388160705566, + "rewards/rejected": -5.706353187561035, + "step": 1363 + }, + { + "epoch": 0.29, + "learning_rate": 1.435294117647059e-05, + "logits/chosen": -1.866605281829834, + "logits/rejected": -1.9206607341766357, + "logps/chosen": -307.0938720703125, + "logps/rejected": -346.87939453125, + "loss": 0.6358, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9438176155090332, + "rewards/margins": 2.2784810066223145, + "rewards/rejected": -4.222298622131348, + "step": 1364 + }, + { + "epoch": 0.29, + "learning_rate": 1.434873949579832e-05, + "logits/chosen": -2.322901487350464, + "logits/rejected": -2.1697988510131836, + "logps/chosen": -293.63720703125, + "logps/rejected": -384.8137512207031, + "loss": 0.0871, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6706867218017578, + "rewards/margins": 4.994970321655273, + "rewards/rejected": -6.665657043457031, + "step": 1365 + }, + { + "epoch": 0.29, + "learning_rate": 1.4344537815126052e-05, + "logits/chosen": -2.1275501251220703, + "logits/rejected": -1.7042955160140991, + "logps/chosen": -274.61004638671875, + "logps/rejected": -272.31329345703125, + "loss": 0.2297, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5854241847991943, + "rewards/margins": 3.9313039779663086, + "rewards/rejected": -5.516728401184082, + "step": 1366 + }, + { + "epoch": 0.29, + "learning_rate": 1.4340336134453782e-05, + "logits/chosen": -2.3549113273620605, + "logits/rejected": -1.7642543315887451, + "logps/chosen": -307.1114807128906, + "logps/rejected": -257.6966247558594, + "loss": 0.2112, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9655675888061523, + "rewards/margins": 4.53652286529541, + "rewards/rejected": -6.502089977264404, + "step": 1367 + }, + { + "epoch": 0.29, + "learning_rate": 1.4336134453781514e-05, + "logits/chosen": -1.9896125793457031, + "logits/rejected": -1.9963213205337524, + "logps/chosen": -331.064208984375, + "logps/rejected": -333.5207214355469, + "loss": 0.3627, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6463813781738281, + "rewards/margins": 3.4043655395507812, + "rewards/rejected": -5.050746917724609, + "step": 1368 + }, + { + "epoch": 0.29, + "learning_rate": 1.4331932773109245e-05, + "logits/chosen": -1.8610866069793701, + "logits/rejected": -1.3744876384735107, + "logps/chosen": -483.6385803222656, + "logps/rejected": -394.76666259765625, + "loss": 0.2273, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8994272947311401, + "rewards/margins": 2.30643630027771, + "rewards/rejected": -4.205862998962402, + "step": 1369 + }, + { + "epoch": 0.29, + "learning_rate": 1.4327731092436976e-05, + "logits/chosen": -2.0155768394470215, + "logits/rejected": -1.7789020538330078, + "logps/chosen": -359.98345947265625, + "logps/rejected": -367.2741394042969, + "loss": 0.2914, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.6618175506591797, + "rewards/margins": 5.603056907653809, + "rewards/rejected": -8.264874458312988, + "step": 1370 + }, + { + "epoch": 0.29, + "learning_rate": 1.4323529411764707e-05, + "logits/chosen": -2.188124656677246, + "logits/rejected": -2.212658405303955, + "logps/chosen": -259.1091003417969, + "logps/rejected": -336.0252990722656, + "loss": 0.3649, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.294404983520508, + "rewards/margins": 2.6968204975128174, + "rewards/rejected": -4.991225719451904, + "step": 1371 + }, + { + "epoch": 0.29, + "learning_rate": 1.4319327731092439e-05, + "logits/chosen": -1.9551945924758911, + "logits/rejected": -2.2238640785217285, + "logps/chosen": -244.2034454345703, + "logps/rejected": -278.9943542480469, + "loss": 0.6181, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.210861921310425, + "rewards/margins": 2.8090715408325195, + "rewards/rejected": -5.019933223724365, + "step": 1372 + }, + { + "epoch": 0.29, + "learning_rate": 1.4315126050420169e-05, + "logits/chosen": -2.5590403079986572, + "logits/rejected": -1.902327537536621, + "logps/chosen": -402.0172424316406, + "logps/rejected": -352.5087890625, + "loss": 0.2206, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3248834609985352, + "rewards/margins": 3.991797685623169, + "rewards/rejected": -5.316681385040283, + "step": 1373 + }, + { + "epoch": 0.29, + "learning_rate": 1.43109243697479e-05, + "logits/chosen": -2.2591326236724854, + "logits/rejected": -2.1572072505950928, + "logps/chosen": -363.91986083984375, + "logps/rejected": -314.7950134277344, + "loss": 0.3981, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9238401651382446, + "rewards/margins": 2.5552515983581543, + "rewards/rejected": -4.479092121124268, + "step": 1374 + }, + { + "epoch": 0.29, + "learning_rate": 1.4306722689075631e-05, + "logits/chosen": -2.0042858123779297, + "logits/rejected": -1.991528034210205, + "logps/chosen": -216.24986267089844, + "logps/rejected": -282.8819885253906, + "loss": 0.3352, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.637871503829956, + "rewards/margins": 4.296302318572998, + "rewards/rejected": -5.934174537658691, + "step": 1375 + }, + { + "epoch": 0.29, + "learning_rate": 1.4302521008403363e-05, + "logits/chosen": -1.7971625328063965, + "logits/rejected": -1.8850367069244385, + "logps/chosen": -295.9991149902344, + "logps/rejected": -342.00982666015625, + "loss": 0.2578, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.4014968872070312, + "rewards/margins": 3.755101442337036, + "rewards/rejected": -6.1565985679626465, + "step": 1376 + }, + { + "epoch": 0.29, + "learning_rate": 1.4298319327731093e-05, + "logits/chosen": -1.9173038005828857, + "logits/rejected": -2.0198090076446533, + "logps/chosen": -236.44973754882812, + "logps/rejected": -332.3042297363281, + "loss": 0.7922, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.514732837677002, + "rewards/margins": 3.3359315395355225, + "rewards/rejected": -5.850664138793945, + "step": 1377 + }, + { + "epoch": 0.29, + "learning_rate": 1.4294117647058825e-05, + "logits/chosen": -1.889622688293457, + "logits/rejected": -1.6587648391723633, + "logps/chosen": -322.12762451171875, + "logps/rejected": -361.54913330078125, + "loss": 0.4025, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.3205466270446777, + "rewards/margins": 3.280498504638672, + "rewards/rejected": -5.60104513168335, + "step": 1378 + }, + { + "epoch": 0.29, + "learning_rate": 1.4289915966386557e-05, + "logits/chosen": -1.9957934617996216, + "logits/rejected": -1.8034297227859497, + "logps/chosen": -385.8522033691406, + "logps/rejected": -391.60791015625, + "loss": 0.2685, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.900431513786316, + "rewards/margins": 4.756706237792969, + "rewards/rejected": -6.657137870788574, + "step": 1379 + }, + { + "epoch": 0.29, + "learning_rate": 1.4285714285714287e-05, + "logits/chosen": -2.246464252471924, + "logits/rejected": -2.290904998779297, + "logps/chosen": -484.63482666015625, + "logps/rejected": -521.557861328125, + "loss": 0.2752, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4362725019454956, + "rewards/margins": 4.5701680183410645, + "rewards/rejected": -6.006440162658691, + "step": 1380 + }, + { + "epoch": 0.29, + "learning_rate": 1.4281512605042019e-05, + "logits/chosen": -2.1868128776550293, + "logits/rejected": -2.008037567138672, + "logps/chosen": -291.0711975097656, + "logps/rejected": -363.279541015625, + "loss": 0.7281, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.4228601455688477, + "rewards/margins": 1.9803717136383057, + "rewards/rejected": -4.403232097625732, + "step": 1381 + }, + { + "epoch": 0.29, + "learning_rate": 1.427731092436975e-05, + "logits/chosen": -2.1113853454589844, + "logits/rejected": -1.9716029167175293, + "logps/chosen": -206.6416015625, + "logps/rejected": -233.48175048828125, + "loss": 0.2923, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.611916422843933, + "rewards/margins": 2.6725914478302, + "rewards/rejected": -4.284507751464844, + "step": 1382 + }, + { + "epoch": 0.29, + "learning_rate": 1.4273109243697481e-05, + "logits/chosen": -2.266472339630127, + "logits/rejected": -1.6123251914978027, + "logps/chosen": -259.5782165527344, + "logps/rejected": -239.5970458984375, + "loss": 0.1817, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.0504469871520996, + "rewards/margins": 3.492048740386963, + "rewards/rejected": -5.5424957275390625, + "step": 1383 + }, + { + "epoch": 0.29, + "learning_rate": 1.4268907563025211e-05, + "logits/chosen": -2.2564308643341064, + "logits/rejected": -2.074984550476074, + "logps/chosen": -246.646240234375, + "logps/rejected": -261.6790771484375, + "loss": 0.5386, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7185564041137695, + "rewards/margins": 2.2213246822357178, + "rewards/rejected": -3.939880847930908, + "step": 1384 + }, + { + "epoch": 0.29, + "learning_rate": 1.4264705882352943e-05, + "logits/chosen": -2.042741060256958, + "logits/rejected": -2.086050510406494, + "logps/chosen": -249.09671020507812, + "logps/rejected": -305.54327392578125, + "loss": 0.3511, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.8466858863830566, + "rewards/margins": 2.257530689239502, + "rewards/rejected": -5.104217052459717, + "step": 1385 + }, + { + "epoch": 0.29, + "learning_rate": 1.4260504201680674e-05, + "logits/chosen": -1.9240500926971436, + "logits/rejected": -1.71843683719635, + "logps/chosen": -378.9152526855469, + "logps/rejected": -288.91363525390625, + "loss": 0.3061, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9485299587249756, + "rewards/margins": 2.8650171756744385, + "rewards/rejected": -4.813547134399414, + "step": 1386 + }, + { + "epoch": 0.29, + "learning_rate": 1.4256302521008405e-05, + "logits/chosen": -2.0926356315612793, + "logits/rejected": -1.8393793106079102, + "logps/chosen": -310.0332336425781, + "logps/rejected": -357.0968017578125, + "loss": 0.1234, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5772117376327515, + "rewards/margins": 4.06320333480835, + "rewards/rejected": -5.640415191650391, + "step": 1387 + }, + { + "epoch": 0.29, + "learning_rate": 1.4252100840336136e-05, + "logits/chosen": -2.328073263168335, + "logits/rejected": -1.8795721530914307, + "logps/chosen": -310.5036315917969, + "logps/rejected": -258.7889404296875, + "loss": 0.4366, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.696972608566284, + "rewards/margins": 2.1630706787109375, + "rewards/rejected": -4.860043525695801, + "step": 1388 + }, + { + "epoch": 0.29, + "learning_rate": 1.4247899159663868e-05, + "logits/chosen": -2.2217347621917725, + "logits/rejected": -1.9185413122177124, + "logps/chosen": -410.98291015625, + "logps/rejected": -374.36431884765625, + "loss": 0.1633, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1255168914794922, + "rewards/margins": 4.061927795410156, + "rewards/rejected": -5.187444686889648, + "step": 1389 + }, + { + "epoch": 0.29, + "learning_rate": 1.4243697478991598e-05, + "logits/chosen": -2.434920310974121, + "logits/rejected": -1.9310954809188843, + "logps/chosen": -352.0817565917969, + "logps/rejected": -306.16217041015625, + "loss": 0.402, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.3086655139923096, + "rewards/margins": 2.7785186767578125, + "rewards/rejected": -5.087183952331543, + "step": 1390 + }, + { + "epoch": 0.29, + "learning_rate": 1.423949579831933e-05, + "logits/chosen": -2.722306489944458, + "logits/rejected": -1.9645229578018188, + "logps/chosen": -463.1763000488281, + "logps/rejected": -381.077880859375, + "loss": 0.1992, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8835253715515137, + "rewards/margins": 5.22479248046875, + "rewards/rejected": -7.108318328857422, + "step": 1391 + }, + { + "epoch": 0.29, + "learning_rate": 1.423529411764706e-05, + "logits/chosen": -1.903465747833252, + "logits/rejected": -1.8752306699752808, + "logps/chosen": -294.4646911621094, + "logps/rejected": -315.959716796875, + "loss": 0.255, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.365931510925293, + "rewards/margins": 3.459804058074951, + "rewards/rejected": -5.825736045837402, + "step": 1392 + }, + { + "epoch": 0.29, + "learning_rate": 1.4231092436974792e-05, + "logits/chosen": -1.9860609769821167, + "logits/rejected": -1.8734023571014404, + "logps/chosen": -263.76318359375, + "logps/rejected": -421.1259765625, + "loss": 0.584, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8977365493774414, + "rewards/margins": 2.9429378509521484, + "rewards/rejected": -4.84067440032959, + "step": 1393 + }, + { + "epoch": 0.29, + "learning_rate": 1.4226890756302522e-05, + "logits/chosen": -2.0194225311279297, + "logits/rejected": -2.149589776992798, + "logps/chosen": -299.4510498046875, + "logps/rejected": -353.85516357421875, + "loss": 0.116, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5131078958511353, + "rewards/margins": 4.944719314575195, + "rewards/rejected": -6.457827091217041, + "step": 1394 + }, + { + "epoch": 0.29, + "learning_rate": 1.4222689075630254e-05, + "logits/chosen": -2.270944356918335, + "logits/rejected": -2.208620309829712, + "logps/chosen": -356.35430908203125, + "logps/rejected": -359.2975769042969, + "loss": 0.4048, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4218827486038208, + "rewards/margins": 2.709190607070923, + "rewards/rejected": -4.131073474884033, + "step": 1395 + }, + { + "epoch": 0.29, + "learning_rate": 1.4218487394957984e-05, + "logits/chosen": -2.1151790618896484, + "logits/rejected": -1.9973409175872803, + "logps/chosen": -378.75018310546875, + "logps/rejected": -381.0204162597656, + "loss": 0.511, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8107185363769531, + "rewards/margins": 2.4462521076202393, + "rewards/rejected": -4.256970405578613, + "step": 1396 + }, + { + "epoch": 0.29, + "learning_rate": 1.4214285714285716e-05, + "logits/chosen": -2.3903427124023438, + "logits/rejected": -1.8573503494262695, + "logps/chosen": -474.4723205566406, + "logps/rejected": -374.26409912109375, + "loss": 0.264, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2410386800765991, + "rewards/margins": 3.4629642963409424, + "rewards/rejected": -4.704002857208252, + "step": 1397 + }, + { + "epoch": 0.29, + "learning_rate": 1.4210084033613446e-05, + "logits/chosen": -1.777158260345459, + "logits/rejected": -1.8338042497634888, + "logps/chosen": -291.2160949707031, + "logps/rejected": -375.8805847167969, + "loss": 0.2922, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.819044828414917, + "rewards/margins": 4.440203666687012, + "rewards/rejected": -6.259248733520508, + "step": 1398 + }, + { + "epoch": 0.29, + "learning_rate": 1.4205882352941178e-05, + "logits/chosen": -2.0171923637390137, + "logits/rejected": -2.1588454246520996, + "logps/chosen": -378.64129638671875, + "logps/rejected": -435.9288330078125, + "loss": 0.2699, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9165548086166382, + "rewards/margins": 4.612350940704346, + "rewards/rejected": -6.528904914855957, + "step": 1399 + }, + { + "epoch": 0.29, + "learning_rate": 1.4201680672268908e-05, + "logits/chosen": -2.226290702819824, + "logits/rejected": -1.8148497343063354, + "logps/chosen": -428.9877624511719, + "logps/rejected": -361.9025573730469, + "loss": 0.3227, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.497218132019043, + "rewards/margins": 4.796828746795654, + "rewards/rejected": -6.294047832489014, + "step": 1400 + }, + { + "epoch": 0.29, + "learning_rate": 1.419747899159664e-05, + "logits/chosen": -2.370039463043213, + "logits/rejected": -1.8396408557891846, + "logps/chosen": -376.5686340332031, + "logps/rejected": -353.7760009765625, + "loss": 0.4752, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.027282953262329, + "rewards/margins": 2.6561946868896484, + "rewards/rejected": -4.683477401733398, + "step": 1401 + }, + { + "epoch": 0.29, + "learning_rate": 1.4193277310924372e-05, + "logits/chosen": -2.050440549850464, + "logits/rejected": -1.3979960680007935, + "logps/chosen": -429.3384094238281, + "logps/rejected": -331.03118896484375, + "loss": 0.1083, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4097628593444824, + "rewards/margins": 3.863023519515991, + "rewards/rejected": -5.2727861404418945, + "step": 1402 + }, + { + "epoch": 0.29, + "learning_rate": 1.4189075630252103e-05, + "logits/chosen": -2.343156337738037, + "logits/rejected": -2.148479700088501, + "logps/chosen": -340.35040283203125, + "logps/rejected": -346.0924987792969, + "loss": 0.3717, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.152132511138916, + "rewards/margins": 3.4840362071990967, + "rewards/rejected": -5.636168479919434, + "step": 1403 + }, + { + "epoch": 0.29, + "learning_rate": 1.4184873949579834e-05, + "logits/chosen": -2.289759635925293, + "logits/rejected": -1.5942713022232056, + "logps/chosen": -287.6488037109375, + "logps/rejected": -287.570556640625, + "loss": 0.2233, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8250104188919067, + "rewards/margins": 3.9715144634246826, + "rewards/rejected": -5.796525001525879, + "step": 1404 + }, + { + "epoch": 0.29, + "learning_rate": 1.4180672268907565e-05, + "logits/chosen": -2.307034492492676, + "logits/rejected": -1.817615032196045, + "logps/chosen": -266.3467712402344, + "logps/rejected": -298.97711181640625, + "loss": 0.3836, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.6582483053207397, + "rewards/margins": 3.8191475868225098, + "rewards/rejected": -5.477396011352539, + "step": 1405 + }, + { + "epoch": 0.29, + "learning_rate": 1.4176470588235297e-05, + "logits/chosen": -2.200026035308838, + "logits/rejected": -2.2222208976745605, + "logps/chosen": -372.63671875, + "logps/rejected": -288.7843017578125, + "loss": 0.6764, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.3803563117980957, + "rewards/margins": 2.51523494720459, + "rewards/rejected": -4.8955912590026855, + "step": 1406 + }, + { + "epoch": 0.29, + "learning_rate": 1.4172268907563027e-05, + "logits/chosen": -1.7498670816421509, + "logits/rejected": -1.4911487102508545, + "logps/chosen": -302.2646484375, + "logps/rejected": -356.66064453125, + "loss": 0.1221, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.326611876487732, + "rewards/margins": 6.048299312591553, + "rewards/rejected": -7.374910354614258, + "step": 1407 + }, + { + "epoch": 0.29, + "learning_rate": 1.4168067226890759e-05, + "logits/chosen": -1.918386697769165, + "logits/rejected": -2.01069712638855, + "logps/chosen": -318.57952880859375, + "logps/rejected": -327.1434631347656, + "loss": 0.139, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.995417833328247, + "rewards/margins": 4.115584373474121, + "rewards/rejected": -6.111002445220947, + "step": 1408 + }, + { + "epoch": 0.29, + "learning_rate": 1.4163865546218489e-05, + "logits/chosen": -2.5465407371520996, + "logits/rejected": -2.0716960430145264, + "logps/chosen": -307.10791015625, + "logps/rejected": -270.0506591796875, + "loss": 0.4094, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.426265239715576, + "rewards/margins": 3.5469722747802734, + "rewards/rejected": -5.97323751449585, + "step": 1409 + }, + { + "epoch": 0.29, + "learning_rate": 1.4159663865546221e-05, + "logits/chosen": -2.2793445587158203, + "logits/rejected": -1.9437533617019653, + "logps/chosen": -354.13043212890625, + "logps/rejected": -317.4933166503906, + "loss": 0.3388, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.526595115661621, + "rewards/margins": 4.063589096069336, + "rewards/rejected": -6.590184211730957, + "step": 1410 + }, + { + "epoch": 0.3, + "learning_rate": 1.4155462184873951e-05, + "logits/chosen": -2.152052640914917, + "logits/rejected": -2.047597646713257, + "logps/chosen": -282.5943603515625, + "logps/rejected": -351.9524230957031, + "loss": 0.3019, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7080355882644653, + "rewards/margins": 3.4433722496032715, + "rewards/rejected": -5.151407718658447, + "step": 1411 + }, + { + "epoch": 0.3, + "learning_rate": 1.4151260504201683e-05, + "logits/chosen": -1.9197349548339844, + "logits/rejected": -2.351728916168213, + "logps/chosen": -367.78955078125, + "logps/rejected": -420.56182861328125, + "loss": 0.2875, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.511461615562439, + "rewards/margins": 4.561715602874756, + "rewards/rejected": -6.073177337646484, + "step": 1412 + }, + { + "epoch": 0.3, + "learning_rate": 1.4147058823529413e-05, + "logits/chosen": -2.3096063137054443, + "logits/rejected": -1.6582013368606567, + "logps/chosen": -457.0124206542969, + "logps/rejected": -337.5274658203125, + "loss": 0.1846, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.0087244510650635, + "rewards/margins": 3.5492029190063477, + "rewards/rejected": -5.557927131652832, + "step": 1413 + }, + { + "epoch": 0.3, + "learning_rate": 1.4142857142857145e-05, + "logits/chosen": -2.1169986724853516, + "logits/rejected": -1.9908413887023926, + "logps/chosen": -303.75579833984375, + "logps/rejected": -331.300048828125, + "loss": 0.4889, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7114627361297607, + "rewards/margins": 3.6085968017578125, + "rewards/rejected": -5.320059299468994, + "step": 1414 + }, + { + "epoch": 0.3, + "learning_rate": 1.4138655462184875e-05, + "logits/chosen": -1.9522316455841064, + "logits/rejected": -1.7111420631408691, + "logps/chosen": -309.3537902832031, + "logps/rejected": -359.9637451171875, + "loss": 0.5529, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.2922170162200928, + "rewards/margins": 3.2318155765533447, + "rewards/rejected": -5.524031639099121, + "step": 1415 + }, + { + "epoch": 0.3, + "learning_rate": 1.4134453781512607e-05, + "logits/chosen": -2.042774200439453, + "logits/rejected": -2.008208751678467, + "logps/chosen": -307.2394104003906, + "logps/rejected": -354.38079833984375, + "loss": 0.2916, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.459295392036438, + "rewards/margins": 4.657646656036377, + "rewards/rejected": -6.116942405700684, + "step": 1416 + }, + { + "epoch": 0.3, + "learning_rate": 1.4130252100840338e-05, + "logits/chosen": -2.1195902824401855, + "logits/rejected": -2.215888261795044, + "logps/chosen": -345.33868408203125, + "logps/rejected": -351.12353515625, + "loss": 0.3433, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9914941787719727, + "rewards/margins": 3.133695602416992, + "rewards/rejected": -5.125189781188965, + "step": 1417 + }, + { + "epoch": 0.3, + "learning_rate": 1.412605042016807e-05, + "logits/chosen": -2.2169976234436035, + "logits/rejected": -2.1237587928771973, + "logps/chosen": -332.54766845703125, + "logps/rejected": -393.8397216796875, + "loss": 0.4222, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.5497066974639893, + "rewards/margins": 4.237919807434082, + "rewards/rejected": -6.787626266479492, + "step": 1418 + }, + { + "epoch": 0.3, + "learning_rate": 1.41218487394958e-05, + "logits/chosen": -2.218597650527954, + "logits/rejected": -2.225863456726074, + "logps/chosen": -284.6513366699219, + "logps/rejected": -314.7806396484375, + "loss": 0.1528, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5764003992080688, + "rewards/margins": 3.9082117080688477, + "rewards/rejected": -5.484612464904785, + "step": 1419 + }, + { + "epoch": 0.3, + "learning_rate": 1.4117647058823532e-05, + "logits/chosen": -2.3294286727905273, + "logits/rejected": -1.8868989944458008, + "logps/chosen": -221.39682006835938, + "logps/rejected": -273.1705017089844, + "loss": 0.0874, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5244605541229248, + "rewards/margins": 6.720041275024414, + "rewards/rejected": -8.244502067565918, + "step": 1420 + }, + { + "epoch": 0.3, + "learning_rate": 1.4113445378151262e-05, + "logits/chosen": -1.9156928062438965, + "logits/rejected": -2.148045063018799, + "logps/chosen": -355.34710693359375, + "logps/rejected": -393.51190185546875, + "loss": 0.6206, + "rewards/accuracies": 0.5625, + "rewards/chosen": -2.2118263244628906, + "rewards/margins": 1.231339454650879, + "rewards/rejected": -3.4431657791137695, + "step": 1421 + }, + { + "epoch": 0.3, + "learning_rate": 1.4109243697478994e-05, + "logits/chosen": -1.7248703241348267, + "logits/rejected": -1.6772284507751465, + "logps/chosen": -322.4893798828125, + "logps/rejected": -353.74322509765625, + "loss": 0.3688, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.3381917476654053, + "rewards/margins": 3.126230478286743, + "rewards/rejected": -5.464422225952148, + "step": 1422 + }, + { + "epoch": 0.3, + "learning_rate": 1.4105042016806726e-05, + "logits/chosen": -2.0415499210357666, + "logits/rejected": -1.9954216480255127, + "logps/chosen": -335.72906494140625, + "logps/rejected": -324.9419250488281, + "loss": 0.2606, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8611574172973633, + "rewards/margins": 4.32957124710083, + "rewards/rejected": -6.190728187561035, + "step": 1423 + }, + { + "epoch": 0.3, + "learning_rate": 1.4100840336134456e-05, + "logits/chosen": -2.2340097427368164, + "logits/rejected": -1.9404314756393433, + "logps/chosen": -409.8175048828125, + "logps/rejected": -471.13421630859375, + "loss": 0.2892, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4455287456512451, + "rewards/margins": 2.9760255813598633, + "rewards/rejected": -4.4215545654296875, + "step": 1424 + }, + { + "epoch": 0.3, + "learning_rate": 1.4096638655462188e-05, + "logits/chosen": -2.0100035667419434, + "logits/rejected": -2.005782127380371, + "logps/chosen": -354.4512634277344, + "logps/rejected": -388.3296813964844, + "loss": 0.2543, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.003450393676758, + "rewards/margins": 4.292349338531494, + "rewards/rejected": -6.295799732208252, + "step": 1425 + }, + { + "epoch": 0.3, + "learning_rate": 1.4092436974789918e-05, + "logits/chosen": -2.0746102333068848, + "logits/rejected": -1.8971270322799683, + "logps/chosen": -335.074951171875, + "logps/rejected": -378.8569641113281, + "loss": 0.4782, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.54323148727417, + "rewards/margins": 3.748412609100342, + "rewards/rejected": -5.291645050048828, + "step": 1426 + }, + { + "epoch": 0.3, + "learning_rate": 1.408823529411765e-05, + "logits/chosen": -2.40716814994812, + "logits/rejected": -1.339943766593933, + "logps/chosen": -423.3092346191406, + "logps/rejected": -227.74795532226562, + "loss": 0.1333, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8600265979766846, + "rewards/margins": 3.3094186782836914, + "rewards/rejected": -4.169445037841797, + "step": 1427 + }, + { + "epoch": 0.3, + "learning_rate": 1.408403361344538e-05, + "logits/chosen": -1.6201808452606201, + "logits/rejected": -1.7330325841903687, + "logps/chosen": -261.90325927734375, + "logps/rejected": -265.1957702636719, + "loss": 0.4388, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.4698610305786133, + "rewards/margins": 2.8409595489501953, + "rewards/rejected": -5.310820579528809, + "step": 1428 + }, + { + "epoch": 0.3, + "learning_rate": 1.4079831932773112e-05, + "logits/chosen": -2.33933162689209, + "logits/rejected": -2.242175579071045, + "logps/chosen": -342.8474426269531, + "logps/rejected": -339.5798645019531, + "loss": 0.194, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0780028104782104, + "rewards/margins": 4.006252288818359, + "rewards/rejected": -5.084254741668701, + "step": 1429 + }, + { + "epoch": 0.3, + "learning_rate": 1.4075630252100842e-05, + "logits/chosen": -2.217787981033325, + "logits/rejected": -1.9487733840942383, + "logps/chosen": -392.06915283203125, + "logps/rejected": -397.7019958496094, + "loss": 0.2811, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.412872552871704, + "rewards/margins": 2.923387050628662, + "rewards/rejected": -4.336259841918945, + "step": 1430 + }, + { + "epoch": 0.3, + "learning_rate": 1.4071428571428574e-05, + "logits/chosen": -2.429800510406494, + "logits/rejected": -2.1618807315826416, + "logps/chosen": -390.2091064453125, + "logps/rejected": -338.1448669433594, + "loss": 0.5931, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3931775093078613, + "rewards/margins": 3.402618885040283, + "rewards/rejected": -4.7957963943481445, + "step": 1431 + }, + { + "epoch": 0.3, + "learning_rate": 1.4067226890756304e-05, + "logits/chosen": -2.263923168182373, + "logits/rejected": -1.8978078365325928, + "logps/chosen": -312.78607177734375, + "logps/rejected": -325.0115051269531, + "loss": 0.1069, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5175653100013733, + "rewards/margins": 5.243579864501953, + "rewards/rejected": -5.761145114898682, + "step": 1432 + }, + { + "epoch": 0.3, + "learning_rate": 1.4063025210084036e-05, + "logits/chosen": -2.1740081310272217, + "logits/rejected": -1.7976738214492798, + "logps/chosen": -508.65997314453125, + "logps/rejected": -427.5522155761719, + "loss": 0.2798, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4478694200515747, + "rewards/margins": 3.086674213409424, + "rewards/rejected": -3.534543037414551, + "step": 1433 + }, + { + "epoch": 0.3, + "learning_rate": 1.4058823529411765e-05, + "logits/chosen": -1.932845950126648, + "logits/rejected": -2.2601325511932373, + "logps/chosen": -370.04498291015625, + "logps/rejected": -397.68121337890625, + "loss": 0.198, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0850374698638916, + "rewards/margins": 2.811128616333008, + "rewards/rejected": -2.8961658477783203, + "step": 1434 + }, + { + "epoch": 0.3, + "learning_rate": 1.4054621848739495e-05, + "logits/chosen": -2.4003355503082275, + "logits/rejected": -2.3966779708862305, + "logps/chosen": -366.4671630859375, + "logps/rejected": -337.98388671875, + "loss": 0.4612, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4155943393707275, + "rewards/margins": 3.2621355056762695, + "rewards/rejected": -4.677729606628418, + "step": 1435 + }, + { + "epoch": 0.3, + "learning_rate": 1.4050420168067227e-05, + "logits/chosen": -1.882743239402771, + "logits/rejected": -1.546684741973877, + "logps/chosen": -277.36663818359375, + "logps/rejected": -239.81103515625, + "loss": 0.281, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.165391445159912, + "rewards/margins": 2.6982457637786865, + "rewards/rejected": -3.8636374473571777, + "step": 1436 + }, + { + "epoch": 0.3, + "learning_rate": 1.4046218487394959e-05, + "logits/chosen": -1.6716489791870117, + "logits/rejected": -1.711898922920227, + "logps/chosen": -312.05230712890625, + "logps/rejected": -363.0792236328125, + "loss": 0.462, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3209638595581055, + "rewards/margins": 3.0096306800842285, + "rewards/rejected": -4.330594539642334, + "step": 1437 + }, + { + "epoch": 0.3, + "learning_rate": 1.4042016806722689e-05, + "logits/chosen": -1.9865370988845825, + "logits/rejected": -1.8661975860595703, + "logps/chosen": -358.96990966796875, + "logps/rejected": -427.45050048828125, + "loss": 0.6389, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6543587446212769, + "rewards/margins": 1.7921056747436523, + "rewards/rejected": -3.4464643001556396, + "step": 1438 + }, + { + "epoch": 0.3, + "learning_rate": 1.4037815126050421e-05, + "logits/chosen": -2.063514232635498, + "logits/rejected": -1.9396929740905762, + "logps/chosen": -303.96685791015625, + "logps/rejected": -316.91351318359375, + "loss": 0.4936, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5536534786224365, + "rewards/margins": 2.088104248046875, + "rewards/rejected": -3.6417577266693115, + "step": 1439 + }, + { + "epoch": 0.3, + "learning_rate": 1.4033613445378151e-05, + "logits/chosen": -2.2052083015441895, + "logits/rejected": -2.17655348777771, + "logps/chosen": -331.864013671875, + "logps/rejected": -397.267333984375, + "loss": 0.5044, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1957004070281982, + "rewards/margins": 3.524470567703247, + "rewards/rejected": -4.720170974731445, + "step": 1440 + }, + { + "epoch": 0.3, + "learning_rate": 1.4029411764705883e-05, + "logits/chosen": -2.3663828372955322, + "logits/rejected": -2.201965093612671, + "logps/chosen": -245.049560546875, + "logps/rejected": -223.2589111328125, + "loss": 0.2926, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5418893098831177, + "rewards/margins": 2.591193199157715, + "rewards/rejected": -3.133082628250122, + "step": 1441 + }, + { + "epoch": 0.3, + "learning_rate": 1.4025210084033613e-05, + "logits/chosen": -1.9682854413986206, + "logits/rejected": -2.1284408569335938, + "logps/chosen": -265.33026123046875, + "logps/rejected": -429.75689697265625, + "loss": 0.2527, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5952258110046387, + "rewards/margins": 3.873617172241211, + "rewards/rejected": -5.468842506408691, + "step": 1442 + }, + { + "epoch": 0.3, + "learning_rate": 1.4021008403361345e-05, + "logits/chosen": -2.1686043739318848, + "logits/rejected": -2.0361034870147705, + "logps/chosen": -249.85365295410156, + "logps/rejected": -280.7278137207031, + "loss": 0.0545, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2448259592056274, + "rewards/margins": 5.711902618408203, + "rewards/rejected": -6.956728458404541, + "step": 1443 + }, + { + "epoch": 0.3, + "learning_rate": 1.4016806722689076e-05, + "logits/chosen": -1.8979355096817017, + "logits/rejected": -2.087982654571533, + "logps/chosen": -300.1328430175781, + "logps/rejected": -384.6341857910156, + "loss": 0.2986, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9855170249938965, + "rewards/margins": 3.8924458026885986, + "rewards/rejected": -4.877963066101074, + "step": 1444 + }, + { + "epoch": 0.3, + "learning_rate": 1.4012605042016807e-05, + "logits/chosen": -2.076925277709961, + "logits/rejected": -1.8943102359771729, + "logps/chosen": -238.86732482910156, + "logps/rejected": -249.40692138671875, + "loss": 0.3663, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1582841873168945, + "rewards/margins": 2.887338399887085, + "rewards/rejected": -4.045622825622559, + "step": 1445 + }, + { + "epoch": 0.3, + "learning_rate": 1.4008403361344538e-05, + "logits/chosen": -1.8368775844573975, + "logits/rejected": -1.9516080617904663, + "logps/chosen": -248.00221252441406, + "logps/rejected": -302.8523254394531, + "loss": 0.373, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7918825149536133, + "rewards/margins": 3.4885544776916504, + "rewards/rejected": -5.2804365158081055, + "step": 1446 + }, + { + "epoch": 0.3, + "learning_rate": 1.400420168067227e-05, + "logits/chosen": -2.305478811264038, + "logits/rejected": -1.9125757217407227, + "logps/chosen": -379.840576171875, + "logps/rejected": -350.6776123046875, + "loss": 0.293, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6017919778823853, + "rewards/margins": 3.715904712677002, + "rewards/rejected": -4.317697048187256, + "step": 1447 + }, + { + "epoch": 0.3, + "learning_rate": 1.4e-05, + "logits/chosen": -2.017338752746582, + "logits/rejected": -1.982062578201294, + "logps/chosen": -262.70953369140625, + "logps/rejected": -327.8833312988281, + "loss": 0.45, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4727370738983154, + "rewards/margins": 2.93798565864563, + "rewards/rejected": -4.410722732543945, + "step": 1448 + }, + { + "epoch": 0.3, + "learning_rate": 1.3995798319327732e-05, + "logits/chosen": -1.9766700267791748, + "logits/rejected": -1.7306606769561768, + "logps/chosen": -381.85467529296875, + "logps/rejected": -277.19549560546875, + "loss": 0.1462, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7957472801208496, + "rewards/margins": 3.728302478790283, + "rewards/rejected": -5.524049758911133, + "step": 1449 + }, + { + "epoch": 0.3, + "learning_rate": 1.3991596638655462e-05, + "logits/chosen": -2.0307886600494385, + "logits/rejected": -1.9838840961456299, + "logps/chosen": -207.32003784179688, + "logps/rejected": -225.19398498535156, + "loss": 0.5563, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.3031766414642334, + "rewards/margins": 2.3507626056671143, + "rewards/rejected": -4.653939247131348, + "step": 1450 + }, + { + "epoch": 0.3, + "learning_rate": 1.3987394957983194e-05, + "logits/chosen": -2.244724988937378, + "logits/rejected": -1.6441290378570557, + "logps/chosen": -309.74267578125, + "logps/rejected": -258.3617858886719, + "loss": 0.4104, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.564531922340393, + "rewards/margins": 1.9238146543502808, + "rewards/rejected": -3.488346576690674, + "step": 1451 + }, + { + "epoch": 0.3, + "learning_rate": 1.3983193277310924e-05, + "logits/chosen": -2.0285205841064453, + "logits/rejected": -2.0154714584350586, + "logps/chosen": -313.3587951660156, + "logps/rejected": -338.3735046386719, + "loss": 0.4524, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.2947394847869873, + "rewards/margins": 2.834505558013916, + "rewards/rejected": -5.129245758056641, + "step": 1452 + }, + { + "epoch": 0.3, + "learning_rate": 1.3978991596638656e-05, + "logits/chosen": -2.079664707183838, + "logits/rejected": -1.7986183166503906, + "logps/chosen": -342.4872741699219, + "logps/rejected": -332.5512390136719, + "loss": 0.3734, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.215742588043213, + "rewards/margins": 3.3589580059051514, + "rewards/rejected": -4.574700355529785, + "step": 1453 + }, + { + "epoch": 0.3, + "learning_rate": 1.3974789915966386e-05, + "logits/chosen": -2.3691060543060303, + "logits/rejected": -2.168433904647827, + "logps/chosen": -300.65838623046875, + "logps/rejected": -248.2224884033203, + "loss": 0.5816, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3956382274627686, + "rewards/margins": 1.9216790199279785, + "rewards/rejected": -3.317317247390747, + "step": 1454 + }, + { + "epoch": 0.3, + "learning_rate": 1.3970588235294118e-05, + "logits/chosen": -1.884229063987732, + "logits/rejected": -2.1718380451202393, + "logps/chosen": -224.0662841796875, + "logps/rejected": -341.57086181640625, + "loss": 0.1135, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7332942485809326, + "rewards/margins": 4.621888160705566, + "rewards/rejected": -6.355182647705078, + "step": 1455 + }, + { + "epoch": 0.3, + "learning_rate": 1.3966386554621848e-05, + "logits/chosen": -2.1268794536590576, + "logits/rejected": -1.4593603610992432, + "logps/chosen": -386.80291748046875, + "logps/rejected": -360.8091735839844, + "loss": 0.2729, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0995765924453735, + "rewards/margins": 3.890530586242676, + "rewards/rejected": -4.99010705947876, + "step": 1456 + }, + { + "epoch": 0.3, + "learning_rate": 1.396218487394958e-05, + "logits/chosen": -2.7517549991607666, + "logits/rejected": -2.046962261199951, + "logps/chosen": -509.4554748535156, + "logps/rejected": -346.8223571777344, + "loss": 0.1636, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.186597466468811, + "rewards/margins": 3.055948257446289, + "rewards/rejected": -4.2425456047058105, + "step": 1457 + }, + { + "epoch": 0.31, + "learning_rate": 1.3957983193277312e-05, + "logits/chosen": -1.9073152542114258, + "logits/rejected": -1.9612417221069336, + "logps/chosen": -346.36553955078125, + "logps/rejected": -308.7762145996094, + "loss": 0.3268, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3581430912017822, + "rewards/margins": 2.0800538063049316, + "rewards/rejected": -3.4381966590881348, + "step": 1458 + }, + { + "epoch": 0.31, + "learning_rate": 1.3953781512605042e-05, + "logits/chosen": -2.021149158477783, + "logits/rejected": -1.6026772260665894, + "logps/chosen": -340.4598693847656, + "logps/rejected": -320.02813720703125, + "loss": 0.4163, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.2530276775360107, + "rewards/margins": 3.745903491973877, + "rewards/rejected": -5.998931407928467, + "step": 1459 + }, + { + "epoch": 0.31, + "learning_rate": 1.3949579831932774e-05, + "logits/chosen": -2.28706955909729, + "logits/rejected": -1.9779248237609863, + "logps/chosen": -378.193359375, + "logps/rejected": -329.69671630859375, + "loss": 0.2713, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9672653675079346, + "rewards/margins": 3.409691333770752, + "rewards/rejected": -5.376956462860107, + "step": 1460 + }, + { + "epoch": 0.31, + "learning_rate": 1.3945378151260505e-05, + "logits/chosen": -2.3210740089416504, + "logits/rejected": -1.943753957748413, + "logps/chosen": -360.6578674316406, + "logps/rejected": -307.4670715332031, + "loss": 0.2442, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.347554087638855, + "rewards/margins": 4.432538986206055, + "rewards/rejected": -5.780092716217041, + "step": 1461 + }, + { + "epoch": 0.31, + "learning_rate": 1.3941176470588236e-05, + "logits/chosen": -2.2012858390808105, + "logits/rejected": -1.7777936458587646, + "logps/chosen": -335.41705322265625, + "logps/rejected": -332.2799987792969, + "loss": 0.4697, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9669578075408936, + "rewards/margins": 2.368565559387207, + "rewards/rejected": -4.33552360534668, + "step": 1462 + }, + { + "epoch": 0.31, + "learning_rate": 1.3936974789915967e-05, + "logits/chosen": -2.1187491416931152, + "logits/rejected": -1.7113761901855469, + "logps/chosen": -281.1143798828125, + "logps/rejected": -240.6015167236328, + "loss": 0.4022, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9238791465759277, + "rewards/margins": 3.1040847301483154, + "rewards/rejected": -5.027963638305664, + "step": 1463 + }, + { + "epoch": 0.31, + "learning_rate": 1.3932773109243699e-05, + "logits/chosen": -2.2906455993652344, + "logits/rejected": -1.875886082649231, + "logps/chosen": -316.2896728515625, + "logps/rejected": -359.71014404296875, + "loss": 0.0806, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2699439525604248, + "rewards/margins": 5.328958988189697, + "rewards/rejected": -6.598902702331543, + "step": 1464 + }, + { + "epoch": 0.31, + "learning_rate": 1.3928571428571429e-05, + "logits/chosen": -2.2704389095306396, + "logits/rejected": -2.1847217082977295, + "logps/chosen": -323.0194091796875, + "logps/rejected": -363.40191650390625, + "loss": 0.4094, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.188535451889038, + "rewards/margins": 2.950287342071533, + "rewards/rejected": -5.13882303237915, + "step": 1465 + }, + { + "epoch": 0.31, + "learning_rate": 1.392436974789916e-05, + "logits/chosen": -2.203469753265381, + "logits/rejected": -2.120384931564331, + "logps/chosen": -336.221435546875, + "logps/rejected": -396.68536376953125, + "loss": 0.2712, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1328576803207397, + "rewards/margins": 5.192601203918457, + "rewards/rejected": -6.325458526611328, + "step": 1466 + }, + { + "epoch": 0.31, + "learning_rate": 1.3920168067226891e-05, + "logits/chosen": -2.3901824951171875, + "logits/rejected": -1.7001736164093018, + "logps/chosen": -418.62738037109375, + "logps/rejected": -267.2537841796875, + "loss": 0.2328, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.376701831817627, + "rewards/margins": 3.787548065185547, + "rewards/rejected": -5.164250373840332, + "step": 1467 + }, + { + "epoch": 0.31, + "learning_rate": 1.3915966386554623e-05, + "logits/chosen": -2.334265947341919, + "logits/rejected": -1.885516881942749, + "logps/chosen": -327.3313293457031, + "logps/rejected": -397.9049987792969, + "loss": 0.2187, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5809022188186646, + "rewards/margins": 4.74301290512085, + "rewards/rejected": -6.323915481567383, + "step": 1468 + }, + { + "epoch": 0.31, + "learning_rate": 1.3911764705882353e-05, + "logits/chosen": -2.3096418380737305, + "logits/rejected": -1.8101575374603271, + "logps/chosen": -403.4140625, + "logps/rejected": -348.06353759765625, + "loss": 0.1577, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.0830509662628174, + "rewards/margins": 4.245474338531494, + "rewards/rejected": -6.328524589538574, + "step": 1469 + }, + { + "epoch": 0.31, + "learning_rate": 1.3907563025210085e-05, + "logits/chosen": -1.9424877166748047, + "logits/rejected": -2.0393130779266357, + "logps/chosen": -393.0169677734375, + "logps/rejected": -432.44482421875, + "loss": 0.4121, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3869253396987915, + "rewards/margins": 3.0890190601348877, + "rewards/rejected": -4.475944519042969, + "step": 1470 + }, + { + "epoch": 0.31, + "learning_rate": 1.3903361344537815e-05, + "logits/chosen": -2.292696714401245, + "logits/rejected": -2.1200037002563477, + "logps/chosen": -347.6493225097656, + "logps/rejected": -364.51190185546875, + "loss": 0.2717, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.0046565532684326, + "rewards/margins": 3.614495277404785, + "rewards/rejected": -5.619152069091797, + "step": 1471 + }, + { + "epoch": 0.31, + "learning_rate": 1.3899159663865547e-05, + "logits/chosen": -2.054111957550049, + "logits/rejected": -1.5909301042556763, + "logps/chosen": -373.861572265625, + "logps/rejected": -281.73895263671875, + "loss": 0.2596, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.579409122467041, + "rewards/margins": 3.710562229156494, + "rewards/rejected": -5.289971351623535, + "step": 1472 + }, + { + "epoch": 0.31, + "learning_rate": 1.3894957983193277e-05, + "logits/chosen": -1.830489993095398, + "logits/rejected": -2.200249671936035, + "logps/chosen": -243.86029052734375, + "logps/rejected": -269.4947204589844, + "loss": 0.1208, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7853641510009766, + "rewards/margins": 4.046734809875488, + "rewards/rejected": -5.832098960876465, + "step": 1473 + }, + { + "epoch": 0.31, + "learning_rate": 1.389075630252101e-05, + "logits/chosen": -2.157987594604492, + "logits/rejected": -2.1717755794525146, + "logps/chosen": -422.9354248046875, + "logps/rejected": -461.49676513671875, + "loss": 0.7948, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.138658046722412, + "rewards/margins": 1.8930866718292236, + "rewards/rejected": -4.031744956970215, + "step": 1474 + }, + { + "epoch": 0.31, + "learning_rate": 1.388655462184874e-05, + "logits/chosen": -2.288348913192749, + "logits/rejected": -1.7758862972259521, + "logps/chosen": -348.1274719238281, + "logps/rejected": -287.52044677734375, + "loss": 0.3611, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.988929033279419, + "rewards/margins": 3.304992198944092, + "rewards/rejected": -5.293920993804932, + "step": 1475 + }, + { + "epoch": 0.31, + "learning_rate": 1.3882352941176471e-05, + "logits/chosen": -1.9256844520568848, + "logits/rejected": -1.8006024360656738, + "logps/chosen": -293.7676086425781, + "logps/rejected": -327.9270935058594, + "loss": 0.4368, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.7139800786972046, + "rewards/margins": 2.328761100769043, + "rewards/rejected": -4.042741298675537, + "step": 1476 + }, + { + "epoch": 0.31, + "learning_rate": 1.3878151260504202e-05, + "logits/chosen": -1.7193092107772827, + "logits/rejected": -1.9379358291625977, + "logps/chosen": -381.2054443359375, + "logps/rejected": -343.2018127441406, + "loss": 0.3519, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.5428836345672607, + "rewards/margins": 3.2864980697631836, + "rewards/rejected": -5.829381942749023, + "step": 1477 + }, + { + "epoch": 0.31, + "learning_rate": 1.3873949579831934e-05, + "logits/chosen": -2.099285125732422, + "logits/rejected": -1.6909632682800293, + "logps/chosen": -403.2125244140625, + "logps/rejected": -325.96710205078125, + "loss": 0.4383, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.6311864852905273, + "rewards/margins": 3.1062440872192383, + "rewards/rejected": -5.737430572509766, + "step": 1478 + }, + { + "epoch": 0.31, + "learning_rate": 1.3869747899159664e-05, + "logits/chosen": -2.607786178588867, + "logits/rejected": -2.2463717460632324, + "logps/chosen": -529.0430297851562, + "logps/rejected": -443.3426208496094, + "loss": 0.405, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.17839226126670837, + "rewards/margins": 3.6817080974578857, + "rewards/rejected": -3.860100746154785, + "step": 1479 + }, + { + "epoch": 0.31, + "learning_rate": 1.3865546218487396e-05, + "logits/chosen": -1.884071946144104, + "logits/rejected": -1.9651480913162231, + "logps/chosen": -279.3707275390625, + "logps/rejected": -312.4449768066406, + "loss": 0.325, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.823885917663574, + "rewards/margins": 3.289954900741577, + "rewards/rejected": -6.113840579986572, + "step": 1480 + }, + { + "epoch": 0.31, + "learning_rate": 1.3861344537815128e-05, + "logits/chosen": -1.9239745140075684, + "logits/rejected": -1.473430871963501, + "logps/chosen": -325.9935302734375, + "logps/rejected": -245.03480529785156, + "loss": 0.1864, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1942408084869385, + "rewards/margins": 3.5219736099243164, + "rewards/rejected": -5.716214179992676, + "step": 1481 + }, + { + "epoch": 0.31, + "learning_rate": 1.3857142857142858e-05, + "logits/chosen": -2.4463226795196533, + "logits/rejected": -1.8636772632598877, + "logps/chosen": -322.02203369140625, + "logps/rejected": -347.62799072265625, + "loss": 0.2507, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1330406665802, + "rewards/margins": 2.6586828231811523, + "rewards/rejected": -4.791723251342773, + "step": 1482 + }, + { + "epoch": 0.31, + "learning_rate": 1.385294117647059e-05, + "logits/chosen": -2.1206326484680176, + "logits/rejected": -1.9888421297073364, + "logps/chosen": -243.22801208496094, + "logps/rejected": -316.9754333496094, + "loss": 0.3883, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.1763663291931152, + "rewards/margins": 2.4952845573425293, + "rewards/rejected": -4.6716508865356445, + "step": 1483 + }, + { + "epoch": 0.31, + "learning_rate": 1.384873949579832e-05, + "logits/chosen": -1.9638179540634155, + "logits/rejected": -1.9315054416656494, + "logps/chosen": -282.06280517578125, + "logps/rejected": -330.5403137207031, + "loss": 0.2487, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2130484580993652, + "rewards/margins": 3.475529193878174, + "rewards/rejected": -4.688577651977539, + "step": 1484 + }, + { + "epoch": 0.31, + "learning_rate": 1.3844537815126052e-05, + "logits/chosen": -2.2887396812438965, + "logits/rejected": -1.9831876754760742, + "logps/chosen": -306.62750244140625, + "logps/rejected": -406.44354248046875, + "loss": 0.1397, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7298030853271484, + "rewards/margins": 3.4616518020629883, + "rewards/rejected": -5.1914544105529785, + "step": 1485 + }, + { + "epoch": 0.31, + "learning_rate": 1.3840336134453782e-05, + "logits/chosen": -2.236088752746582, + "logits/rejected": -1.503354549407959, + "logps/chosen": -408.9189453125, + "logps/rejected": -398.43756103515625, + "loss": 0.1694, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9077842235565186, + "rewards/margins": 4.053970813751221, + "rewards/rejected": -5.96175479888916, + "step": 1486 + }, + { + "epoch": 0.31, + "learning_rate": 1.3836134453781514e-05, + "logits/chosen": -2.0901851654052734, + "logits/rejected": -1.8386940956115723, + "logps/chosen": -309.65673828125, + "logps/rejected": -276.49676513671875, + "loss": 0.3927, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4351847171783447, + "rewards/margins": 3.065126657485962, + "rewards/rejected": -4.500311374664307, + "step": 1487 + }, + { + "epoch": 0.31, + "learning_rate": 1.3831932773109244e-05, + "logits/chosen": -2.3285272121429443, + "logits/rejected": -1.8131104707717896, + "logps/chosen": -369.39202880859375, + "logps/rejected": -257.6651306152344, + "loss": 0.2556, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.2563161849975586, + "rewards/margins": 3.7034010887145996, + "rewards/rejected": -5.959717750549316, + "step": 1488 + }, + { + "epoch": 0.31, + "learning_rate": 1.3827731092436976e-05, + "logits/chosen": -2.1904854774475098, + "logits/rejected": -1.9328203201293945, + "logps/chosen": -497.05462646484375, + "logps/rejected": -491.21868896484375, + "loss": 0.7401, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.7987942695617676, + "rewards/margins": 2.3589625358581543, + "rewards/rejected": -5.157756328582764, + "step": 1489 + }, + { + "epoch": 0.31, + "learning_rate": 1.3823529411764706e-05, + "logits/chosen": -2.2366514205932617, + "logits/rejected": -1.8316683769226074, + "logps/chosen": -524.6250610351562, + "logps/rejected": -513.5143432617188, + "loss": 0.6225, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2956128120422363, + "rewards/margins": 4.453464508056641, + "rewards/rejected": -5.749077320098877, + "step": 1490 + }, + { + "epoch": 0.31, + "learning_rate": 1.3819327731092438e-05, + "logits/chosen": -2.22904634475708, + "logits/rejected": -2.2269725799560547, + "logps/chosen": -403.003173828125, + "logps/rejected": -416.63543701171875, + "loss": 0.3929, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9268664121627808, + "rewards/margins": 4.095552921295166, + "rewards/rejected": -6.022418975830078, + "step": 1491 + }, + { + "epoch": 0.31, + "learning_rate": 1.3815126050420169e-05, + "logits/chosen": -2.5502898693084717, + "logits/rejected": -2.4671010971069336, + "logps/chosen": -358.4950256347656, + "logps/rejected": -432.85040283203125, + "loss": 0.1763, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.804923176765442, + "rewards/margins": 4.578037738800049, + "rewards/rejected": -6.382961273193359, + "step": 1492 + }, + { + "epoch": 0.31, + "learning_rate": 1.38109243697479e-05, + "logits/chosen": -2.1150963306427, + "logits/rejected": -1.798793077468872, + "logps/chosen": -375.71893310546875, + "logps/rejected": -373.99774169921875, + "loss": 0.6347, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3594367504119873, + "rewards/margins": 2.245410442352295, + "rewards/rejected": -3.6048471927642822, + "step": 1493 + }, + { + "epoch": 0.31, + "learning_rate": 1.380672268907563e-05, + "logits/chosen": -1.9354262351989746, + "logits/rejected": -1.7193901538848877, + "logps/chosen": -314.1204833984375, + "logps/rejected": -388.4920959472656, + "loss": 0.3218, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.189091205596924, + "rewards/margins": 3.745532989501953, + "rewards/rejected": -5.934624671936035, + "step": 1494 + }, + { + "epoch": 0.31, + "learning_rate": 1.3802521008403363e-05, + "logits/chosen": -2.2270185947418213, + "logits/rejected": -1.7530972957611084, + "logps/chosen": -293.441650390625, + "logps/rejected": -275.045654296875, + "loss": 0.3753, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.4976768493652344, + "rewards/margins": 3.213649272918701, + "rewards/rejected": -5.711325645446777, + "step": 1495 + }, + { + "epoch": 0.31, + "learning_rate": 1.3798319327731093e-05, + "logits/chosen": -2.329249143600464, + "logits/rejected": -1.483070969581604, + "logps/chosen": -409.2109375, + "logps/rejected": -339.3017578125, + "loss": 0.1405, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8761192560195923, + "rewards/margins": 4.959670543670654, + "rewards/rejected": -6.835789680480957, + "step": 1496 + }, + { + "epoch": 0.31, + "learning_rate": 1.3794117647058825e-05, + "logits/chosen": -2.076392889022827, + "logits/rejected": -1.6949396133422852, + "logps/chosen": -357.1734619140625, + "logps/rejected": -416.26593017578125, + "loss": 0.2775, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.605079174041748, + "rewards/margins": 4.900635242462158, + "rewards/rejected": -6.505714416503906, + "step": 1497 + }, + { + "epoch": 0.31, + "learning_rate": 1.3789915966386555e-05, + "logits/chosen": -1.8389716148376465, + "logits/rejected": -2.081244468688965, + "logps/chosen": -291.5338439941406, + "logps/rejected": -351.85089111328125, + "loss": 0.4227, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.177229404449463, + "rewards/margins": 2.7249724864959717, + "rewards/rejected": -3.9022021293640137, + "step": 1498 + }, + { + "epoch": 0.31, + "learning_rate": 1.3785714285714287e-05, + "logits/chosen": -2.1917097568511963, + "logits/rejected": -2.056948661804199, + "logps/chosen": -463.5747985839844, + "logps/rejected": -365.2350769042969, + "loss": 0.3869, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6626700162887573, + "rewards/margins": 3.252378463745117, + "rewards/rejected": -4.915048599243164, + "step": 1499 + }, + { + "epoch": 0.31, + "learning_rate": 1.3781512605042017e-05, + "logits/chosen": -2.14890456199646, + "logits/rejected": -1.7595752477645874, + "logps/chosen": -389.83038330078125, + "logps/rejected": -335.79974365234375, + "loss": 0.4572, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3210645914077759, + "rewards/margins": 2.0165557861328125, + "rewards/rejected": -3.337620496749878, + "step": 1500 + }, + { + "epoch": 0.31, + "learning_rate": 1.3777310924369749e-05, + "logits/chosen": -1.8883776664733887, + "logits/rejected": -1.7146973609924316, + "logps/chosen": -359.68695068359375, + "logps/rejected": -418.39569091796875, + "loss": 0.3067, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.3695051670074463, + "rewards/margins": 2.8430089950561523, + "rewards/rejected": -5.2125139236450195, + "step": 1501 + }, + { + "epoch": 0.31, + "learning_rate": 1.3773109243697481e-05, + "logits/chosen": -2.3197641372680664, + "logits/rejected": -2.1042861938476562, + "logps/chosen": -257.4062805175781, + "logps/rejected": -292.05487060546875, + "loss": 0.2399, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.731605291366577, + "rewards/margins": 3.0217349529266357, + "rewards/rejected": -5.753340721130371, + "step": 1502 + }, + { + "epoch": 0.31, + "learning_rate": 1.3768907563025211e-05, + "logits/chosen": -2.204181671142578, + "logits/rejected": -1.837156057357788, + "logps/chosen": -374.1438293457031, + "logps/rejected": -398.418212890625, + "loss": 0.3051, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.359133243560791, + "rewards/margins": 4.114659309387207, + "rewards/rejected": -5.473792552947998, + "step": 1503 + }, + { + "epoch": 0.31, + "learning_rate": 1.3764705882352943e-05, + "logits/chosen": -1.883932113647461, + "logits/rejected": -1.8983148336410522, + "logps/chosen": -353.15582275390625, + "logps/rejected": -368.32672119140625, + "loss": 0.2818, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.108870506286621, + "rewards/margins": 3.661158323287964, + "rewards/rejected": -5.770028591156006, + "step": 1504 + }, + { + "epoch": 0.31, + "learning_rate": 1.3760504201680673e-05, + "logits/chosen": -2.2103703022003174, + "logits/rejected": -2.035720109939575, + "logps/chosen": -302.2536926269531, + "logps/rejected": -324.2633361816406, + "loss": 0.193, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.0097177028656006, + "rewards/margins": 3.105665683746338, + "rewards/rejected": -5.115383625030518, + "step": 1505 + }, + { + "epoch": 0.32, + "learning_rate": 1.3756302521008405e-05, + "logits/chosen": -2.266658067703247, + "logits/rejected": -2.1572327613830566, + "logps/chosen": -287.13897705078125, + "logps/rejected": -312.1755065917969, + "loss": 0.2548, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.0984175205230713, + "rewards/margins": 3.4841713905334473, + "rewards/rejected": -5.582589149475098, + "step": 1506 + }, + { + "epoch": 0.32, + "learning_rate": 1.3752100840336135e-05, + "logits/chosen": -2.4099137783050537, + "logits/rejected": -2.189911127090454, + "logps/chosen": -393.07073974609375, + "logps/rejected": -372.637451171875, + "loss": 0.3239, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9389022588729858, + "rewards/margins": 3.230184316635132, + "rewards/rejected": -5.169086456298828, + "step": 1507 + }, + { + "epoch": 0.32, + "learning_rate": 1.3747899159663867e-05, + "logits/chosen": -2.5213332176208496, + "logits/rejected": -1.6966464519500732, + "logps/chosen": -520.781982421875, + "logps/rejected": -334.50439453125, + "loss": 0.3161, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5399690866470337, + "rewards/margins": 3.962235450744629, + "rewards/rejected": -5.502203941345215, + "step": 1508 + }, + { + "epoch": 0.32, + "learning_rate": 1.3743697478991598e-05, + "logits/chosen": -2.213902235031128, + "logits/rejected": -2.2826530933380127, + "logps/chosen": -550.6090087890625, + "logps/rejected": -494.15533447265625, + "loss": 0.4333, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.0108642578125, + "rewards/margins": 4.040138244628906, + "rewards/rejected": -6.051002502441406, + "step": 1509 + }, + { + "epoch": 0.32, + "learning_rate": 1.373949579831933e-05, + "logits/chosen": -2.0629067420959473, + "logits/rejected": -1.8256256580352783, + "logps/chosen": -363.9134826660156, + "logps/rejected": -384.60546875, + "loss": 0.1812, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8285789489746094, + "rewards/margins": 4.576192855834961, + "rewards/rejected": -5.4047722816467285, + "step": 1510 + }, + { + "epoch": 0.32, + "learning_rate": 1.373529411764706e-05, + "logits/chosen": -2.224418878555298, + "logits/rejected": -1.8851299285888672, + "logps/chosen": -380.7570495605469, + "logps/rejected": -293.6314392089844, + "loss": 0.4504, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.3901138305664062, + "rewards/margins": 3.098491907119751, + "rewards/rejected": -5.488605499267578, + "step": 1511 + }, + { + "epoch": 0.32, + "learning_rate": 1.3731092436974792e-05, + "logits/chosen": -2.150125026702881, + "logits/rejected": -1.5734336376190186, + "logps/chosen": -321.7039794921875, + "logps/rejected": -261.9718017578125, + "loss": 0.426, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.6680748462677, + "rewards/margins": 2.8966400623321533, + "rewards/rejected": -5.564715385437012, + "step": 1512 + }, + { + "epoch": 0.32, + "learning_rate": 1.3726890756302522e-05, + "logits/chosen": -1.7281129360198975, + "logits/rejected": -2.0702805519104004, + "logps/chosen": -293.47711181640625, + "logps/rejected": -375.4337158203125, + "loss": 0.1276, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.3544812202453613, + "rewards/margins": 4.82112455368042, + "rewards/rejected": -7.175605773925781, + "step": 1513 + }, + { + "epoch": 0.32, + "learning_rate": 1.3722689075630254e-05, + "logits/chosen": -2.35453200340271, + "logits/rejected": -1.9226280450820923, + "logps/chosen": -429.4017333984375, + "logps/rejected": -427.2686462402344, + "loss": 0.3681, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.928022861480713, + "rewards/margins": 3.355212926864624, + "rewards/rejected": -5.283236503601074, + "step": 1514 + }, + { + "epoch": 0.32, + "learning_rate": 1.3718487394957984e-05, + "logits/chosen": -2.3008413314819336, + "logits/rejected": -2.0070688724517822, + "logps/chosen": -327.9501953125, + "logps/rejected": -258.4668884277344, + "loss": 0.2023, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7532017230987549, + "rewards/margins": 3.323464870452881, + "rewards/rejected": -5.076666831970215, + "step": 1515 + }, + { + "epoch": 0.32, + "learning_rate": 1.3714285714285716e-05, + "logits/chosen": -2.2839841842651367, + "logits/rejected": -2.0053277015686035, + "logps/chosen": -294.37078857421875, + "logps/rejected": -305.677978515625, + "loss": 0.1584, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8836138248443604, + "rewards/margins": 4.847850322723389, + "rewards/rejected": -6.731464385986328, + "step": 1516 + }, + { + "epoch": 0.32, + "learning_rate": 1.3710084033613446e-05, + "logits/chosen": -1.8805383443832397, + "logits/rejected": -1.9337414503097534, + "logps/chosen": -314.4667663574219, + "logps/rejected": -362.4813232421875, + "loss": 0.7267, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.0425734519958496, + "rewards/margins": 3.1742258071899414, + "rewards/rejected": -6.216799259185791, + "step": 1517 + }, + { + "epoch": 0.32, + "learning_rate": 1.3705882352941178e-05, + "logits/chosen": -2.324641466140747, + "logits/rejected": -2.1905229091644287, + "logps/chosen": -354.46856689453125, + "logps/rejected": -381.2355041503906, + "loss": 1.1062, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.7712717056274414, + "rewards/margins": 1.989945411682129, + "rewards/rejected": -4.76121711730957, + "step": 1518 + }, + { + "epoch": 0.32, + "learning_rate": 1.3701680672268908e-05, + "logits/chosen": -1.9229813814163208, + "logits/rejected": -1.4142259359359741, + "logps/chosen": -364.59454345703125, + "logps/rejected": -324.77886962890625, + "loss": 0.4455, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.35444712638855, + "rewards/margins": 4.673713684082031, + "rewards/rejected": -7.028160572052002, + "step": 1519 + }, + { + "epoch": 0.32, + "learning_rate": 1.369747899159664e-05, + "logits/chosen": -1.9923980236053467, + "logits/rejected": -2.0252585411071777, + "logps/chosen": -297.9746398925781, + "logps/rejected": -328.2971496582031, + "loss": 0.2642, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.5249955654144287, + "rewards/margins": 2.997795343399048, + "rewards/rejected": -5.522790908813477, + "step": 1520 + }, + { + "epoch": 0.32, + "learning_rate": 1.369327731092437e-05, + "logits/chosen": -2.101191282272339, + "logits/rejected": -2.0765464305877686, + "logps/chosen": -302.0186767578125, + "logps/rejected": -423.00286865234375, + "loss": 0.6472, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.7222628593444824, + "rewards/margins": 4.560105323791504, + "rewards/rejected": -7.282368183135986, + "step": 1521 + }, + { + "epoch": 0.32, + "learning_rate": 1.3689075630252102e-05, + "logits/chosen": -2.2064096927642822, + "logits/rejected": -2.107010841369629, + "logps/chosen": -303.9696044921875, + "logps/rejected": -335.8317565917969, + "loss": 0.142, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5918830633163452, + "rewards/margins": 5.0224833488464355, + "rewards/rejected": -6.61436653137207, + "step": 1522 + }, + { + "epoch": 0.32, + "learning_rate": 1.3684873949579832e-05, + "logits/chosen": -2.2449867725372314, + "logits/rejected": -1.964526891708374, + "logps/chosen": -372.4798278808594, + "logps/rejected": -383.7760009765625, + "loss": 0.4904, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.1155154705047607, + "rewards/margins": 3.672675132751465, + "rewards/rejected": -5.788190841674805, + "step": 1523 + }, + { + "epoch": 0.32, + "learning_rate": 1.3680672268907564e-05, + "logits/chosen": -2.2364954948425293, + "logits/rejected": -1.6362565755844116, + "logps/chosen": -376.10650634765625, + "logps/rejected": -370.7212829589844, + "loss": 0.1605, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.3037221431732178, + "rewards/margins": 5.390576362609863, + "rewards/rejected": -7.69429874420166, + "step": 1524 + }, + { + "epoch": 0.32, + "learning_rate": 1.3676470588235296e-05, + "logits/chosen": -2.167163372039795, + "logits/rejected": -2.133495330810547, + "logps/chosen": -331.7828063964844, + "logps/rejected": -350.56689453125, + "loss": 0.2241, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.061875104904175, + "rewards/margins": 3.3810033798217773, + "rewards/rejected": -5.442878723144531, + "step": 1525 + }, + { + "epoch": 0.32, + "learning_rate": 1.3672268907563027e-05, + "logits/chosen": -2.3961386680603027, + "logits/rejected": -2.074831008911133, + "logps/chosen": -406.5638122558594, + "logps/rejected": -332.0755920410156, + "loss": 0.2613, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5826833248138428, + "rewards/margins": 3.226261615753174, + "rewards/rejected": -4.808945178985596, + "step": 1526 + }, + { + "epoch": 0.32, + "learning_rate": 1.3668067226890758e-05, + "logits/chosen": -1.8209517002105713, + "logits/rejected": -1.583056092262268, + "logps/chosen": -396.4560546875, + "logps/rejected": -459.8486328125, + "loss": 0.155, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5934081077575684, + "rewards/margins": 4.904259204864502, + "rewards/rejected": -6.497666835784912, + "step": 1527 + }, + { + "epoch": 0.32, + "learning_rate": 1.3663865546218489e-05, + "logits/chosen": -2.3694851398468018, + "logits/rejected": -2.150752305984497, + "logps/chosen": -301.579833984375, + "logps/rejected": -278.3642578125, + "loss": 0.2743, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8127012252807617, + "rewards/margins": 3.071471929550171, + "rewards/rejected": -4.884173393249512, + "step": 1528 + }, + { + "epoch": 0.32, + "learning_rate": 1.365966386554622e-05, + "logits/chosen": -2.119192123413086, + "logits/rejected": -1.8645139932632446, + "logps/chosen": -317.47686767578125, + "logps/rejected": -351.76104736328125, + "loss": 0.127, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.608067512512207, + "rewards/margins": 4.637997627258301, + "rewards/rejected": -6.246065616607666, + "step": 1529 + }, + { + "epoch": 0.32, + "learning_rate": 1.365546218487395e-05, + "logits/chosen": -2.2033910751342773, + "logits/rejected": -2.280977249145508, + "logps/chosen": -371.7592468261719, + "logps/rejected": -423.4562072753906, + "loss": 0.0931, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8018500804901123, + "rewards/margins": 5.040394306182861, + "rewards/rejected": -6.842245101928711, + "step": 1530 + }, + { + "epoch": 0.32, + "learning_rate": 1.3651260504201683e-05, + "logits/chosen": -2.2729990482330322, + "logits/rejected": -1.9864797592163086, + "logps/chosen": -314.2420349121094, + "logps/rejected": -250.8751678466797, + "loss": 0.3364, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.0307931900024414, + "rewards/margins": 3.4755325317382812, + "rewards/rejected": -5.506325721740723, + "step": 1531 + }, + { + "epoch": 0.32, + "learning_rate": 1.3647058823529413e-05, + "logits/chosen": -2.1584627628326416, + "logits/rejected": -2.263535261154175, + "logps/chosen": -268.9049987792969, + "logps/rejected": -347.4558410644531, + "loss": 0.2831, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2791728973388672, + "rewards/margins": 3.707303762435913, + "rewards/rejected": -4.986476898193359, + "step": 1532 + }, + { + "epoch": 0.32, + "learning_rate": 1.3642857142857145e-05, + "logits/chosen": -2.0850987434387207, + "logits/rejected": -2.1552116870880127, + "logps/chosen": -342.29730224609375, + "logps/rejected": -416.7658386230469, + "loss": 0.5831, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.244588613510132, + "rewards/margins": 3.867178201675415, + "rewards/rejected": -6.111766815185547, + "step": 1533 + }, + { + "epoch": 0.32, + "learning_rate": 1.3638655462184875e-05, + "logits/chosen": -1.9951586723327637, + "logits/rejected": -1.5541911125183105, + "logps/chosen": -322.2633056640625, + "logps/rejected": -295.8976745605469, + "loss": 0.2129, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8857260942459106, + "rewards/margins": 3.7069284915924072, + "rewards/rejected": -5.592655181884766, + "step": 1534 + }, + { + "epoch": 0.32, + "learning_rate": 1.3634453781512607e-05, + "logits/chosen": -2.1387205123901367, + "logits/rejected": -2.0219874382019043, + "logps/chosen": -248.5108642578125, + "logps/rejected": -242.661376953125, + "loss": 0.311, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.3165674209594727, + "rewards/margins": 3.790159225463867, + "rewards/rejected": -6.10672664642334, + "step": 1535 + }, + { + "epoch": 0.32, + "learning_rate": 1.3630252100840337e-05, + "logits/chosen": -2.195915699005127, + "logits/rejected": -1.7216646671295166, + "logps/chosen": -230.2270050048828, + "logps/rejected": -247.8502960205078, + "loss": 0.3246, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8406018018722534, + "rewards/margins": 3.7172141075134277, + "rewards/rejected": -5.557816505432129, + "step": 1536 + }, + { + "epoch": 0.32, + "learning_rate": 1.3626050420168069e-05, + "logits/chosen": -1.831616759300232, + "logits/rejected": -1.6921265125274658, + "logps/chosen": -416.4903869628906, + "logps/rejected": -335.02154541015625, + "loss": 0.3942, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5379319190979004, + "rewards/margins": 3.0702579021453857, + "rewards/rejected": -4.608189582824707, + "step": 1537 + }, + { + "epoch": 0.32, + "learning_rate": 1.36218487394958e-05, + "logits/chosen": -1.9427571296691895, + "logits/rejected": -1.903017520904541, + "logps/chosen": -269.3683776855469, + "logps/rejected": -294.51824951171875, + "loss": 0.3237, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.74898099899292, + "rewards/margins": 3.7656946182250977, + "rewards/rejected": -5.514675617218018, + "step": 1538 + }, + { + "epoch": 0.32, + "learning_rate": 1.3617647058823531e-05, + "logits/chosen": -2.1402130126953125, + "logits/rejected": -2.1775643825531006, + "logps/chosen": -326.2735595703125, + "logps/rejected": -396.31488037109375, + "loss": 0.4029, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9723838567733765, + "rewards/margins": 3.529466152191162, + "rewards/rejected": -5.501850128173828, + "step": 1539 + }, + { + "epoch": 0.32, + "learning_rate": 1.3613445378151261e-05, + "logits/chosen": -2.238013744354248, + "logits/rejected": -2.1054842472076416, + "logps/chosen": -286.1468811035156, + "logps/rejected": -289.3240966796875, + "loss": 0.3241, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3906116485595703, + "rewards/margins": 3.4860012531280518, + "rewards/rejected": -4.876612663269043, + "step": 1540 + }, + { + "epoch": 0.32, + "learning_rate": 1.3609243697478993e-05, + "logits/chosen": -2.4581689834594727, + "logits/rejected": -2.0619163513183594, + "logps/chosen": -325.19097900390625, + "logps/rejected": -281.8824462890625, + "loss": 0.1773, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.159588575363159, + "rewards/margins": 4.178308963775635, + "rewards/rejected": -6.337897300720215, + "step": 1541 + }, + { + "epoch": 0.32, + "learning_rate": 1.3605042016806724e-05, + "logits/chosen": -2.159458875656128, + "logits/rejected": -1.886964201927185, + "logps/chosen": -404.42779541015625, + "logps/rejected": -361.19317626953125, + "loss": 0.6505, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.0231292247772217, + "rewards/margins": 2.7935128211975098, + "rewards/rejected": -4.8166422843933105, + "step": 1542 + }, + { + "epoch": 0.32, + "learning_rate": 1.3600840336134456e-05, + "logits/chosen": -2.320007801055908, + "logits/rejected": -1.4519001245498657, + "logps/chosen": -353.6142578125, + "logps/rejected": -286.37030029296875, + "loss": 0.091, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0375373363494873, + "rewards/margins": 4.341235160827637, + "rewards/rejected": -6.378772258758545, + "step": 1543 + }, + { + "epoch": 0.32, + "learning_rate": 1.3596638655462186e-05, + "logits/chosen": -2.3219215869903564, + "logits/rejected": -2.1535050868988037, + "logps/chosen": -387.14959716796875, + "logps/rejected": -345.3487854003906, + "loss": 0.1528, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.2154974937438965, + "rewards/margins": 3.955014944076538, + "rewards/rejected": -6.170513153076172, + "step": 1544 + }, + { + "epoch": 0.32, + "learning_rate": 1.3592436974789918e-05, + "logits/chosen": -2.2055633068084717, + "logits/rejected": -1.9450434446334839, + "logps/chosen": -406.12213134765625, + "logps/rejected": -436.635986328125, + "loss": 0.2705, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9468222856521606, + "rewards/margins": 4.452834129333496, + "rewards/rejected": -6.399656295776367, + "step": 1545 + }, + { + "epoch": 0.32, + "learning_rate": 1.3588235294117648e-05, + "logits/chosen": -2.125680685043335, + "logits/rejected": -2.161830186843872, + "logps/chosen": -265.9605712890625, + "logps/rejected": -342.02362060546875, + "loss": 0.8159, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.7846896648406982, + "rewards/margins": 1.8813824653625488, + "rewards/rejected": -3.666072368621826, + "step": 1546 + }, + { + "epoch": 0.32, + "learning_rate": 1.358403361344538e-05, + "logits/chosen": -2.17728853225708, + "logits/rejected": -1.5438613891601562, + "logps/chosen": -429.7187805175781, + "logps/rejected": -414.5464172363281, + "loss": 0.2032, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.5351991653442383, + "rewards/margins": 3.728818893432617, + "rewards/rejected": -6.264018535614014, + "step": 1547 + }, + { + "epoch": 0.32, + "learning_rate": 1.3579831932773112e-05, + "logits/chosen": -2.4130184650421143, + "logits/rejected": -2.1025772094726562, + "logps/chosen": -413.2592468261719, + "logps/rejected": -372.26910400390625, + "loss": 0.3896, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.985079288482666, + "rewards/margins": 2.734118938446045, + "rewards/rejected": -4.719197750091553, + "step": 1548 + }, + { + "epoch": 0.32, + "learning_rate": 1.3575630252100842e-05, + "logits/chosen": -2.1688692569732666, + "logits/rejected": -2.0466580390930176, + "logps/chosen": -322.7155456542969, + "logps/rejected": -362.5126037597656, + "loss": 0.3998, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.2712650299072266, + "rewards/margins": 2.960850238800049, + "rewards/rejected": -5.232114791870117, + "step": 1549 + }, + { + "epoch": 0.32, + "learning_rate": 1.3571428571428574e-05, + "logits/chosen": -1.9294326305389404, + "logits/rejected": -2.00925874710083, + "logps/chosen": -264.7958068847656, + "logps/rejected": -318.3216857910156, + "loss": 0.278, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6849164962768555, + "rewards/margins": 4.047970771789551, + "rewards/rejected": -5.732887268066406, + "step": 1550 + }, + { + "epoch": 0.32, + "learning_rate": 1.3567226890756304e-05, + "logits/chosen": -2.2406880855560303, + "logits/rejected": -1.7296757698059082, + "logps/chosen": -420.7481994628906, + "logps/rejected": -354.55828857421875, + "loss": 0.2845, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.687650442123413, + "rewards/margins": 3.1254658699035645, + "rewards/rejected": -5.813116073608398, + "step": 1551 + }, + { + "epoch": 0.32, + "learning_rate": 1.3563025210084036e-05, + "logits/chosen": -2.248481512069702, + "logits/rejected": -2.3428001403808594, + "logps/chosen": -234.35072326660156, + "logps/rejected": -266.1012268066406, + "loss": 0.2467, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.225402593612671, + "rewards/margins": 2.3584494590759277, + "rewards/rejected": -4.583852291107178, + "step": 1552 + }, + { + "epoch": 0.32, + "learning_rate": 1.3558823529411766e-05, + "logits/chosen": -1.8489086627960205, + "logits/rejected": -1.7926712036132812, + "logps/chosen": -300.8004150390625, + "logps/rejected": -226.19790649414062, + "loss": 0.4611, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7169501781463623, + "rewards/margins": 2.567204713821411, + "rewards/rejected": -4.284154891967773, + "step": 1553 + }, + { + "epoch": 0.33, + "learning_rate": 1.3554621848739498e-05, + "logits/chosen": -1.6713650226593018, + "logits/rejected": -1.644329309463501, + "logps/chosen": -350.7186584472656, + "logps/rejected": -419.7698669433594, + "loss": 0.2679, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.1753358840942383, + "rewards/margins": 2.6158359050750732, + "rewards/rejected": -4.791172027587891, + "step": 1554 + }, + { + "epoch": 0.33, + "learning_rate": 1.3550420168067228e-05, + "logits/chosen": -2.1645591259002686, + "logits/rejected": -1.9457404613494873, + "logps/chosen": -378.60113525390625, + "logps/rejected": -404.9617614746094, + "loss": 0.3379, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9129743576049805, + "rewards/margins": 4.23444128036499, + "rewards/rejected": -6.1474151611328125, + "step": 1555 + }, + { + "epoch": 0.33, + "learning_rate": 1.354621848739496e-05, + "logits/chosen": -2.1021556854248047, + "logits/rejected": -1.9415308237075806, + "logps/chosen": -338.2377624511719, + "logps/rejected": -343.495849609375, + "loss": 0.4451, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.0029456615448, + "rewards/margins": 4.4245123863220215, + "rewards/rejected": -6.427458763122559, + "step": 1556 + }, + { + "epoch": 0.33, + "learning_rate": 1.354201680672269e-05, + "logits/chosen": -2.2972755432128906, + "logits/rejected": -2.0061802864074707, + "logps/chosen": -292.63671875, + "logps/rejected": -244.11346435546875, + "loss": 0.3559, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2151107788085938, + "rewards/margins": 3.3020784854888916, + "rewards/rejected": -4.5171895027160645, + "step": 1557 + }, + { + "epoch": 0.33, + "learning_rate": 1.3537815126050422e-05, + "logits/chosen": -2.1243882179260254, + "logits/rejected": -2.206490993499756, + "logps/chosen": -317.6093444824219, + "logps/rejected": -319.8685607910156, + "loss": 0.5334, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.480548858642578, + "rewards/margins": 1.9016865491867065, + "rewards/rejected": -4.382235527038574, + "step": 1558 + }, + { + "epoch": 0.33, + "learning_rate": 1.3533613445378153e-05, + "logits/chosen": -2.2083888053894043, + "logits/rejected": -1.625854730606079, + "logps/chosen": -295.3682861328125, + "logps/rejected": -243.1657257080078, + "loss": 0.1252, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3686109781265259, + "rewards/margins": 4.547513484954834, + "rewards/rejected": -5.91612434387207, + "step": 1559 + }, + { + "epoch": 0.33, + "learning_rate": 1.3529411764705885e-05, + "logits/chosen": -2.3533897399902344, + "logits/rejected": -2.2308616638183594, + "logps/chosen": -264.24200439453125, + "logps/rejected": -262.6607360839844, + "loss": 0.2788, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.8086416721343994, + "rewards/margins": 4.327005386352539, + "rewards/rejected": -7.135647296905518, + "step": 1560 + }, + { + "epoch": 0.33, + "learning_rate": 1.3525210084033615e-05, + "logits/chosen": -2.256122589111328, + "logits/rejected": -1.7468054294586182, + "logps/chosen": -362.7600402832031, + "logps/rejected": -300.8109130859375, + "loss": 0.1618, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.7323319911956787, + "rewards/margins": 4.372570037841797, + "rewards/rejected": -7.104902267456055, + "step": 1561 + }, + { + "epoch": 0.33, + "learning_rate": 1.3521008403361347e-05, + "logits/chosen": -2.0059309005737305, + "logits/rejected": -1.9676084518432617, + "logps/chosen": -218.7572784423828, + "logps/rejected": -259.714599609375, + "loss": 0.3778, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.1661858558654785, + "rewards/margins": 2.790099620819092, + "rewards/rejected": -4.95628547668457, + "step": 1562 + }, + { + "epoch": 0.33, + "learning_rate": 1.3516806722689077e-05, + "logits/chosen": -1.9147124290466309, + "logits/rejected": -2.0551693439483643, + "logps/chosen": -235.26498413085938, + "logps/rejected": -268.75628662109375, + "loss": 0.2156, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4063061475753784, + "rewards/margins": 3.8116002082824707, + "rewards/rejected": -5.2179059982299805, + "step": 1563 + }, + { + "epoch": 0.33, + "learning_rate": 1.3512605042016809e-05, + "logits/chosen": -2.004727363586426, + "logits/rejected": -1.9379262924194336, + "logps/chosen": -251.23013305664062, + "logps/rejected": -284.62158203125, + "loss": 0.2364, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.857201099395752, + "rewards/margins": 3.7103936672210693, + "rewards/rejected": -5.5675950050354, + "step": 1564 + }, + { + "epoch": 0.33, + "learning_rate": 1.3508403361344539e-05, + "logits/chosen": -2.0529003143310547, + "logits/rejected": -1.6386899948120117, + "logps/chosen": -348.4374694824219, + "logps/rejected": -318.3455810546875, + "loss": 0.1693, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1770364046096802, + "rewards/margins": 3.762047052383423, + "rewards/rejected": -4.939083576202393, + "step": 1565 + }, + { + "epoch": 0.33, + "learning_rate": 1.3504201680672271e-05, + "logits/chosen": -1.9407405853271484, + "logits/rejected": -1.967835545539856, + "logps/chosen": -297.806884765625, + "logps/rejected": -417.64300537109375, + "loss": 0.5938, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.647566556930542, + "rewards/margins": 2.837312698364258, + "rewards/rejected": -5.484879016876221, + "step": 1566 + }, + { + "epoch": 0.33, + "learning_rate": 1.3500000000000001e-05, + "logits/chosen": -2.065253257751465, + "logits/rejected": -1.7523162364959717, + "logps/chosen": -411.849853515625, + "logps/rejected": -347.3288269042969, + "loss": 0.3419, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7883737087249756, + "rewards/margins": 3.5711944103240967, + "rewards/rejected": -5.359567642211914, + "step": 1567 + }, + { + "epoch": 0.33, + "learning_rate": 1.3495798319327733e-05, + "logits/chosen": -1.73996102809906, + "logits/rejected": -2.1035900115966797, + "logps/chosen": -267.34564208984375, + "logps/rejected": -309.44073486328125, + "loss": 0.0564, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5502264499664307, + "rewards/margins": 4.979929447174072, + "rewards/rejected": -6.530155658721924, + "step": 1568 + }, + { + "epoch": 0.33, + "learning_rate": 1.3491596638655465e-05, + "logits/chosen": -2.1515371799468994, + "logits/rejected": -1.896789312362671, + "logps/chosen": -306.22601318359375, + "logps/rejected": -347.0390930175781, + "loss": 0.3711, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8402398824691772, + "rewards/margins": 4.423365592956543, + "rewards/rejected": -6.26360559463501, + "step": 1569 + }, + { + "epoch": 0.33, + "learning_rate": 1.3487394957983195e-05, + "logits/chosen": -1.6260254383087158, + "logits/rejected": -1.356723666191101, + "logps/chosen": -284.29986572265625, + "logps/rejected": -299.8603820800781, + "loss": 0.2447, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8083734512329102, + "rewards/margins": 4.358606338500977, + "rewards/rejected": -6.166979789733887, + "step": 1570 + }, + { + "epoch": 0.33, + "learning_rate": 1.3483193277310927e-05, + "logits/chosen": -2.114750385284424, + "logits/rejected": -2.10658597946167, + "logps/chosen": -271.921875, + "logps/rejected": -352.1289978027344, + "loss": 0.0758, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.115615129470825, + "rewards/margins": 4.238959312438965, + "rewards/rejected": -6.354574680328369, + "step": 1571 + }, + { + "epoch": 0.33, + "learning_rate": 1.3478991596638657e-05, + "logits/chosen": -2.02907133102417, + "logits/rejected": -1.9975831508636475, + "logps/chosen": -378.42486572265625, + "logps/rejected": -327.30615234375, + "loss": 0.5258, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.0004987716674805, + "rewards/margins": 4.245112419128418, + "rewards/rejected": -6.245611190795898, + "step": 1572 + }, + { + "epoch": 0.33, + "learning_rate": 1.347478991596639e-05, + "logits/chosen": -2.024139642715454, + "logits/rejected": -1.9395785331726074, + "logps/chosen": -337.4092102050781, + "logps/rejected": -343.1611022949219, + "loss": 0.352, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.1575865745544434, + "rewards/margins": 4.801280975341797, + "rewards/rejected": -7.958868026733398, + "step": 1573 + }, + { + "epoch": 0.33, + "learning_rate": 1.347058823529412e-05, + "logits/chosen": -2.7350261211395264, + "logits/rejected": -2.4601964950561523, + "logps/chosen": -372.3827209472656, + "logps/rejected": -355.8847351074219, + "loss": 0.1385, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3894000053405762, + "rewards/margins": 5.207067012786865, + "rewards/rejected": -6.596466064453125, + "step": 1574 + }, + { + "epoch": 0.33, + "learning_rate": 1.3466386554621851e-05, + "logits/chosen": -2.297157049179077, + "logits/rejected": -2.0700793266296387, + "logps/chosen": -461.0620422363281, + "logps/rejected": -433.7490539550781, + "loss": 0.8345, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1593585014343262, + "rewards/margins": 1.5880155563354492, + "rewards/rejected": -2.7473740577697754, + "step": 1575 + }, + { + "epoch": 0.33, + "learning_rate": 1.3462184873949582e-05, + "logits/chosen": -2.4267542362213135, + "logits/rejected": -2.0636322498321533, + "logps/chosen": -466.5465393066406, + "logps/rejected": -398.1875915527344, + "loss": 0.0363, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1418555974960327, + "rewards/margins": 5.091866493225098, + "rewards/rejected": -6.233721733093262, + "step": 1576 + }, + { + "epoch": 0.33, + "learning_rate": 1.3457983193277314e-05, + "logits/chosen": -2.1529932022094727, + "logits/rejected": -2.2697815895080566, + "logps/chosen": -293.11346435546875, + "logps/rejected": -287.35882568359375, + "loss": 0.2801, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.4375574588775635, + "rewards/margins": 3.544675827026367, + "rewards/rejected": -5.982232570648193, + "step": 1577 + }, + { + "epoch": 0.33, + "learning_rate": 1.3453781512605044e-05, + "logits/chosen": -2.2474112510681152, + "logits/rejected": -1.8983111381530762, + "logps/chosen": -318.96844482421875, + "logps/rejected": -273.52911376953125, + "loss": 0.6269, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.342780351638794, + "rewards/margins": 2.3649682998657227, + "rewards/rejected": -4.7077484130859375, + "step": 1578 + }, + { + "epoch": 0.33, + "learning_rate": 1.3449579831932776e-05, + "logits/chosen": -2.4370815753936768, + "logits/rejected": -1.7357474565505981, + "logps/chosen": -281.50775146484375, + "logps/rejected": -270.159912109375, + "loss": 0.33, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.201629638671875, + "rewards/margins": 3.859788179397583, + "rewards/rejected": -7.061417579650879, + "step": 1579 + }, + { + "epoch": 0.33, + "learning_rate": 1.3445378151260506e-05, + "logits/chosen": -2.2266438007354736, + "logits/rejected": -1.5917251110076904, + "logps/chosen": -461.8985290527344, + "logps/rejected": -317.9659423828125, + "loss": 0.2462, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.3814454078674316, + "rewards/margins": 4.184391975402832, + "rewards/rejected": -6.565837860107422, + "step": 1580 + }, + { + "epoch": 0.33, + "learning_rate": 1.3441176470588238e-05, + "logits/chosen": -1.9953975677490234, + "logits/rejected": -1.895160436630249, + "logps/chosen": -355.9602966308594, + "logps/rejected": -389.32464599609375, + "loss": 0.3802, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.3407206535339355, + "rewards/margins": 3.418330430984497, + "rewards/rejected": -5.759051322937012, + "step": 1581 + }, + { + "epoch": 0.33, + "learning_rate": 1.3436974789915966e-05, + "logits/chosen": -2.139695405960083, + "logits/rejected": -2.2296462059020996, + "logps/chosen": -266.51953125, + "logps/rejected": -270.2725524902344, + "loss": 0.5884, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.8951754570007324, + "rewards/margins": 1.4570629596710205, + "rewards/rejected": -4.352237701416016, + "step": 1582 + }, + { + "epoch": 0.33, + "learning_rate": 1.3432773109243698e-05, + "logits/chosen": -2.320556163787842, + "logits/rejected": -1.923557162284851, + "logps/chosen": -382.3410949707031, + "logps/rejected": -337.8780212402344, + "loss": 0.2495, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8711174726486206, + "rewards/margins": 3.854853391647339, + "rewards/rejected": -5.72597074508667, + "step": 1583 + }, + { + "epoch": 0.33, + "learning_rate": 1.3428571428571429e-05, + "logits/chosen": -2.2081713676452637, + "logits/rejected": -2.0048463344573975, + "logps/chosen": -356.354736328125, + "logps/rejected": -312.5806884765625, + "loss": 0.1059, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9250987768173218, + "rewards/margins": 4.980665683746338, + "rewards/rejected": -6.905764579772949, + "step": 1584 + }, + { + "epoch": 0.33, + "learning_rate": 1.342436974789916e-05, + "logits/chosen": -2.0496139526367188, + "logits/rejected": -2.1422438621520996, + "logps/chosen": -326.78680419921875, + "logps/rejected": -350.79803466796875, + "loss": 0.384, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.362083911895752, + "rewards/margins": 2.575965404510498, + "rewards/rejected": -5.93804931640625, + "step": 1585 + }, + { + "epoch": 0.33, + "learning_rate": 1.342016806722689e-05, + "logits/chosen": -2.0161404609680176, + "logits/rejected": -1.8577167987823486, + "logps/chosen": -273.4517517089844, + "logps/rejected": -384.1920166015625, + "loss": 0.1485, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.296023368835449, + "rewards/margins": 4.978274822235107, + "rewards/rejected": -7.274298667907715, + "step": 1586 + }, + { + "epoch": 0.33, + "learning_rate": 1.3415966386554623e-05, + "logits/chosen": -2.0820136070251465, + "logits/rejected": -1.3261100053787231, + "logps/chosen": -288.47271728515625, + "logps/rejected": -273.7978210449219, + "loss": 0.4761, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.16377592086792, + "rewards/margins": 2.335578441619873, + "rewards/rejected": -5.499354362487793, + "step": 1587 + }, + { + "epoch": 0.33, + "learning_rate": 1.3411764705882353e-05, + "logits/chosen": -2.101217746734619, + "logits/rejected": -1.9143552780151367, + "logps/chosen": -262.65216064453125, + "logps/rejected": -306.89154052734375, + "loss": 0.325, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.4857940673828125, + "rewards/margins": 4.514498710632324, + "rewards/rejected": -7.000293254852295, + "step": 1588 + }, + { + "epoch": 0.33, + "learning_rate": 1.3407563025210085e-05, + "logits/chosen": -1.752553939819336, + "logits/rejected": -1.8540618419647217, + "logps/chosen": -254.5794677734375, + "logps/rejected": -311.7756652832031, + "loss": 0.1714, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.510009288787842, + "rewards/margins": 3.2181200981140137, + "rewards/rejected": -5.728129863739014, + "step": 1589 + }, + { + "epoch": 0.33, + "learning_rate": 1.3403361344537815e-05, + "logits/chosen": -1.9992791414260864, + "logits/rejected": -1.8400518894195557, + "logps/chosen": -334.0417175292969, + "logps/rejected": -314.65203857421875, + "loss": 0.3031, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.1570544242858887, + "rewards/margins": 2.4814817905426025, + "rewards/rejected": -4.63853645324707, + "step": 1590 + }, + { + "epoch": 0.33, + "learning_rate": 1.3399159663865547e-05, + "logits/chosen": -2.3284952640533447, + "logits/rejected": -1.8115315437316895, + "logps/chosen": -456.9496154785156, + "logps/rejected": -394.57647705078125, + "loss": 0.2951, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.4232962131500244, + "rewards/margins": 3.6977291107177734, + "rewards/rejected": -6.121025085449219, + "step": 1591 + }, + { + "epoch": 0.33, + "learning_rate": 1.3394957983193277e-05, + "logits/chosen": -2.32025146484375, + "logits/rejected": -1.61851167678833, + "logps/chosen": -358.76959228515625, + "logps/rejected": -292.3694152832031, + "loss": 0.3308, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.7484536170959473, + "rewards/margins": 4.471254348754883, + "rewards/rejected": -7.219708442687988, + "step": 1592 + }, + { + "epoch": 0.33, + "learning_rate": 1.3390756302521009e-05, + "logits/chosen": -1.9726165533065796, + "logits/rejected": -1.9987127780914307, + "logps/chosen": -339.5418395996094, + "logps/rejected": -375.7403259277344, + "loss": 0.2997, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1259608268737793, + "rewards/margins": 4.168889045715332, + "rewards/rejected": -6.294849872589111, + "step": 1593 + }, + { + "epoch": 0.33, + "learning_rate": 1.338655462184874e-05, + "logits/chosen": -1.9736922979354858, + "logits/rejected": -2.131272315979004, + "logps/chosen": -250.42123413085938, + "logps/rejected": -293.5691833496094, + "loss": 0.1682, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9538503885269165, + "rewards/margins": 4.706883907318115, + "rewards/rejected": -6.660734176635742, + "step": 1594 + }, + { + "epoch": 0.33, + "learning_rate": 1.3382352941176471e-05, + "logits/chosen": -1.9635119438171387, + "logits/rejected": -1.9307979345321655, + "logps/chosen": -361.7359313964844, + "logps/rejected": -383.010986328125, + "loss": 0.4816, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.316967487335205, + "rewards/margins": 3.3284764289855957, + "rewards/rejected": -6.645443916320801, + "step": 1595 + }, + { + "epoch": 0.33, + "learning_rate": 1.3378151260504201e-05, + "logits/chosen": -1.9345074892044067, + "logits/rejected": -1.4651312828063965, + "logps/chosen": -299.09796142578125, + "logps/rejected": -350.7173156738281, + "loss": 0.2306, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.1147069931030273, + "rewards/margins": 5.237401962280273, + "rewards/rejected": -8.352108001708984, + "step": 1596 + }, + { + "epoch": 0.33, + "learning_rate": 1.3373949579831933e-05, + "logits/chosen": -1.8867934942245483, + "logits/rejected": -2.0758984088897705, + "logps/chosen": -397.86029052734375, + "logps/rejected": -445.2793273925781, + "loss": 0.4863, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.097174644470215, + "rewards/margins": 5.038077354431152, + "rewards/rejected": -8.135251998901367, + "step": 1597 + }, + { + "epoch": 0.33, + "learning_rate": 1.3369747899159663e-05, + "logits/chosen": -1.8559684753417969, + "logits/rejected": -1.8518497943878174, + "logps/chosen": -257.2762451171875, + "logps/rejected": -309.48760986328125, + "loss": 0.5987, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.4215240478515625, + "rewards/margins": 3.0474164485931396, + "rewards/rejected": -5.468940258026123, + "step": 1598 + }, + { + "epoch": 0.33, + "learning_rate": 1.3365546218487395e-05, + "logits/chosen": -2.084303617477417, + "logits/rejected": -1.8956245183944702, + "logps/chosen": -282.5265197753906, + "logps/rejected": -314.05340576171875, + "loss": 0.2218, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.216721534729004, + "rewards/margins": 3.907532215118408, + "rewards/rejected": -6.124253749847412, + "step": 1599 + }, + { + "epoch": 0.33, + "learning_rate": 1.3361344537815126e-05, + "logits/chosen": -2.170419216156006, + "logits/rejected": -1.6748894453048706, + "logps/chosen": -311.447021484375, + "logps/rejected": -375.8804931640625, + "loss": 0.1649, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9995734691619873, + "rewards/margins": 6.639828681945801, + "rewards/rejected": -8.639402389526367, + "step": 1600 + }, + { + "epoch": 0.33, + "learning_rate": 1.3357142857142858e-05, + "logits/chosen": -2.301140785217285, + "logits/rejected": -2.0822901725769043, + "logps/chosen": -398.6423645019531, + "logps/rejected": -423.72247314453125, + "loss": 0.2527, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.662114143371582, + "rewards/margins": 4.394876956939697, + "rewards/rejected": -7.0569915771484375, + "step": 1601 + }, + { + "epoch": 0.34, + "learning_rate": 1.3352941176470588e-05, + "logits/chosen": -2.2291455268859863, + "logits/rejected": -1.717329978942871, + "logps/chosen": -428.1994934082031, + "logps/rejected": -380.1053771972656, + "loss": 0.0603, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5141818523406982, + "rewards/margins": 4.748443603515625, + "rewards/rejected": -7.262625694274902, + "step": 1602 + }, + { + "epoch": 0.34, + "learning_rate": 1.334873949579832e-05, + "logits/chosen": -1.7454628944396973, + "logits/rejected": -1.5623927116394043, + "logps/chosen": -356.7296142578125, + "logps/rejected": -285.88934326171875, + "loss": 0.4121, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.248469829559326, + "rewards/margins": 3.267338991165161, + "rewards/rejected": -6.515809059143066, + "step": 1603 + }, + { + "epoch": 0.34, + "learning_rate": 1.3344537815126052e-05, + "logits/chosen": -1.997895359992981, + "logits/rejected": -2.042762041091919, + "logps/chosen": -325.62799072265625, + "logps/rejected": -427.9535217285156, + "loss": 0.5784, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.2319893836975098, + "rewards/margins": 2.6737725734710693, + "rewards/rejected": -5.90576171875, + "step": 1604 + }, + { + "epoch": 0.34, + "learning_rate": 1.3340336134453782e-05, + "logits/chosen": -2.14200496673584, + "logits/rejected": -1.776376485824585, + "logps/chosen": -338.14031982421875, + "logps/rejected": -341.0394287109375, + "loss": 0.2672, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.81601619720459, + "rewards/margins": 4.751769065856934, + "rewards/rejected": -7.567784786224365, + "step": 1605 + }, + { + "epoch": 0.34, + "learning_rate": 1.3336134453781514e-05, + "logits/chosen": -2.154301166534424, + "logits/rejected": -2.2191789150238037, + "logps/chosen": -245.09988403320312, + "logps/rejected": -270.04296875, + "loss": 0.4949, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.518545150756836, + "rewards/margins": 3.6310489177703857, + "rewards/rejected": -6.149594306945801, + "step": 1606 + }, + { + "epoch": 0.34, + "learning_rate": 1.3331932773109244e-05, + "logits/chosen": -2.1032347679138184, + "logits/rejected": -1.9936023950576782, + "logps/chosen": -279.7778015136719, + "logps/rejected": -286.44573974609375, + "loss": 0.5778, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.5201468467712402, + "rewards/margins": 4.267714977264404, + "rewards/rejected": -6.7878618240356445, + "step": 1607 + }, + { + "epoch": 0.34, + "learning_rate": 1.3327731092436976e-05, + "logits/chosen": -2.084012269973755, + "logits/rejected": -2.074237585067749, + "logps/chosen": -279.6593017578125, + "logps/rejected": -316.08209228515625, + "loss": 0.203, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.870898962020874, + "rewards/margins": 4.5558671951293945, + "rewards/rejected": -7.426766395568848, + "step": 1608 + }, + { + "epoch": 0.34, + "learning_rate": 1.3323529411764706e-05, + "logits/chosen": -1.9728329181671143, + "logits/rejected": -2.1290392875671387, + "logps/chosen": -295.37481689453125, + "logps/rejected": -329.74774169921875, + "loss": 0.6767, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.4919238090515137, + "rewards/margins": 2.4638986587524414, + "rewards/rejected": -5.955822467803955, + "step": 1609 + }, + { + "epoch": 0.34, + "learning_rate": 1.3319327731092438e-05, + "logits/chosen": -2.03446102142334, + "logits/rejected": -2.120352029800415, + "logps/chosen": -320.86419677734375, + "logps/rejected": -396.77557373046875, + "loss": 0.444, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.7813310623168945, + "rewards/margins": 4.058514595031738, + "rewards/rejected": -6.839846134185791, + "step": 1610 + }, + { + "epoch": 0.34, + "learning_rate": 1.3315126050420168e-05, + "logits/chosen": -2.34096097946167, + "logits/rejected": -2.0522515773773193, + "logps/chosen": -285.84429931640625, + "logps/rejected": -268.8486633300781, + "loss": 0.7912, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.580796480178833, + "rewards/margins": 2.9453368186950684, + "rewards/rejected": -6.5261335372924805, + "step": 1611 + }, + { + "epoch": 0.34, + "learning_rate": 1.33109243697479e-05, + "logits/chosen": -2.0140230655670166, + "logits/rejected": -2.0360476970672607, + "logps/chosen": -332.80133056640625, + "logps/rejected": -365.2991943359375, + "loss": 0.1464, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9450170993804932, + "rewards/margins": 3.3869357109069824, + "rewards/rejected": -5.331953048706055, + "step": 1612 + }, + { + "epoch": 0.34, + "learning_rate": 1.330672268907563e-05, + "logits/chosen": -2.324519634246826, + "logits/rejected": -2.0606119632720947, + "logps/chosen": -318.5538635253906, + "logps/rejected": -316.2134094238281, + "loss": 0.2262, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.42474102973938, + "rewards/margins": 3.705200433731079, + "rewards/rejected": -6.129941463470459, + "step": 1613 + }, + { + "epoch": 0.34, + "learning_rate": 1.3302521008403362e-05, + "logits/chosen": -1.748196005821228, + "logits/rejected": -1.6406477689743042, + "logps/chosen": -226.12464904785156, + "logps/rejected": -332.0560607910156, + "loss": 0.0837, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4759981632232666, + "rewards/margins": 4.606532096862793, + "rewards/rejected": -7.0825300216674805, + "step": 1614 + }, + { + "epoch": 0.34, + "learning_rate": 1.3298319327731092e-05, + "logits/chosen": -1.9413701295852661, + "logits/rejected": -1.6713558435440063, + "logps/chosen": -279.9207763671875, + "logps/rejected": -290.0434875488281, + "loss": 0.4252, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.070911884307861, + "rewards/margins": 3.3115811347961426, + "rewards/rejected": -7.382493019104004, + "step": 1615 + }, + { + "epoch": 0.34, + "learning_rate": 1.3294117647058824e-05, + "logits/chosen": -1.7115473747253418, + "logits/rejected": -1.4939265251159668, + "logps/chosen": -304.6336975097656, + "logps/rejected": -362.10821533203125, + "loss": 0.086, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4965295791625977, + "rewards/margins": 4.550251007080078, + "rewards/rejected": -6.046780109405518, + "step": 1616 + }, + { + "epoch": 0.34, + "learning_rate": 1.3289915966386555e-05, + "logits/chosen": -2.1417582035064697, + "logits/rejected": -1.6566880941390991, + "logps/chosen": -325.8069763183594, + "logps/rejected": -319.3828125, + "loss": 0.1858, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.1280019283294678, + "rewards/margins": 4.70501184463501, + "rewards/rejected": -7.833014011383057, + "step": 1617 + }, + { + "epoch": 0.34, + "learning_rate": 1.3285714285714287e-05, + "logits/chosen": -2.29227876663208, + "logits/rejected": -1.990504503250122, + "logps/chosen": -370.6865539550781, + "logps/rejected": -328.5603332519531, + "loss": 0.3645, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.699510931968689, + "rewards/margins": 3.1582775115966797, + "rewards/rejected": -4.857788562774658, + "step": 1618 + }, + { + "epoch": 0.34, + "learning_rate": 1.3281512605042017e-05, + "logits/chosen": -2.13417649269104, + "logits/rejected": -1.991564393043518, + "logps/chosen": -353.3240661621094, + "logps/rejected": -299.34173583984375, + "loss": 0.5271, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.6974196434020996, + "rewards/margins": 2.587613105773926, + "rewards/rejected": -5.285032749176025, + "step": 1619 + }, + { + "epoch": 0.34, + "learning_rate": 1.3277310924369749e-05, + "logits/chosen": -2.203852415084839, + "logits/rejected": -2.0532944202423096, + "logps/chosen": -406.81585693359375, + "logps/rejected": -375.972412109375, + "loss": 0.3639, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.869723081588745, + "rewards/margins": 3.372016668319702, + "rewards/rejected": -6.241739749908447, + "step": 1620 + }, + { + "epoch": 0.34, + "learning_rate": 1.3273109243697479e-05, + "logits/chosen": -2.300136089324951, + "logits/rejected": -2.291567325592041, + "logps/chosen": -359.29046630859375, + "logps/rejected": -418.3973388671875, + "loss": 0.3779, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.1010377407073975, + "rewards/margins": 2.3206593990325928, + "rewards/rejected": -5.42169713973999, + "step": 1621 + }, + { + "epoch": 0.34, + "learning_rate": 1.326890756302521e-05, + "logits/chosen": -2.215416669845581, + "logits/rejected": -2.0284652709960938, + "logps/chosen": -443.34149169921875, + "logps/rejected": -392.05841064453125, + "loss": 0.7605, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.803683042526245, + "rewards/margins": 2.662907123565674, + "rewards/rejected": -5.466590881347656, + "step": 1622 + }, + { + "epoch": 0.34, + "learning_rate": 1.3264705882352941e-05, + "logits/chosen": -1.876626968383789, + "logits/rejected": -1.7013816833496094, + "logps/chosen": -291.8648681640625, + "logps/rejected": -270.8343811035156, + "loss": 0.334, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.566002368927002, + "rewards/margins": 3.1309456825256348, + "rewards/rejected": -5.696948051452637, + "step": 1623 + }, + { + "epoch": 0.34, + "learning_rate": 1.3260504201680673e-05, + "logits/chosen": -2.1808621883392334, + "logits/rejected": -1.827493667602539, + "logps/chosen": -394.2679138183594, + "logps/rejected": -330.83563232421875, + "loss": 0.2369, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.4910149574279785, + "rewards/margins": 3.987213373184204, + "rewards/rejected": -6.478228569030762, + "step": 1624 + }, + { + "epoch": 0.34, + "learning_rate": 1.3256302521008403e-05, + "logits/chosen": -2.3626720905303955, + "logits/rejected": -1.9575340747833252, + "logps/chosen": -369.42230224609375, + "logps/rejected": -374.7626037597656, + "loss": 0.0749, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5884933471679688, + "rewards/margins": 5.056520462036133, + "rewards/rejected": -7.645013809204102, + "step": 1625 + }, + { + "epoch": 0.34, + "learning_rate": 1.3252100840336135e-05, + "logits/chosen": -1.9236102104187012, + "logits/rejected": -1.849417805671692, + "logps/chosen": -352.7290954589844, + "logps/rejected": -333.08489990234375, + "loss": 0.2424, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.1833720207214355, + "rewards/margins": 4.010716915130615, + "rewards/rejected": -6.194088459014893, + "step": 1626 + }, + { + "epoch": 0.34, + "learning_rate": 1.3247899159663867e-05, + "logits/chosen": -2.1266355514526367, + "logits/rejected": -2.061347484588623, + "logps/chosen": -421.3648681640625, + "logps/rejected": -310.83734130859375, + "loss": 0.4052, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.0288095474243164, + "rewards/margins": 4.6182332038879395, + "rewards/rejected": -6.647042274475098, + "step": 1627 + }, + { + "epoch": 0.34, + "learning_rate": 1.3243697478991597e-05, + "logits/chosen": -1.8946964740753174, + "logits/rejected": -1.628699779510498, + "logps/chosen": -332.45196533203125, + "logps/rejected": -315.9475402832031, + "loss": 0.1712, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.3637945652008057, + "rewards/margins": 5.166635513305664, + "rewards/rejected": -7.530430316925049, + "step": 1628 + }, + { + "epoch": 0.34, + "learning_rate": 1.3239495798319329e-05, + "logits/chosen": -2.267000675201416, + "logits/rejected": -2.2757129669189453, + "logps/chosen": -337.1736145019531, + "logps/rejected": -299.029541015625, + "loss": 0.4137, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.241429328918457, + "rewards/margins": 2.8422224521636963, + "rewards/rejected": -6.083651542663574, + "step": 1629 + }, + { + "epoch": 0.34, + "learning_rate": 1.323529411764706e-05, + "logits/chosen": -2.2052745819091797, + "logits/rejected": -2.096834897994995, + "logps/chosen": -361.2476806640625, + "logps/rejected": -363.7479553222656, + "loss": 0.3502, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6909987926483154, + "rewards/margins": 3.670335054397583, + "rewards/rejected": -5.361333847045898, + "step": 1630 + }, + { + "epoch": 0.34, + "learning_rate": 1.3231092436974791e-05, + "logits/chosen": -1.9855976104736328, + "logits/rejected": -2.1909592151641846, + "logps/chosen": -375.8970642089844, + "logps/rejected": -350.7624206542969, + "loss": 0.4673, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4242157936096191, + "rewards/margins": 3.23439884185791, + "rewards/rejected": -4.658614635467529, + "step": 1631 + }, + { + "epoch": 0.34, + "learning_rate": 1.3226890756302521e-05, + "logits/chosen": -2.0138039588928223, + "logits/rejected": -2.0690550804138184, + "logps/chosen": -201.61300659179688, + "logps/rejected": -325.20428466796875, + "loss": 0.2314, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.3579530715942383, + "rewards/margins": 5.00855827331543, + "rewards/rejected": -7.366511344909668, + "step": 1632 + }, + { + "epoch": 0.34, + "learning_rate": 1.3222689075630253e-05, + "logits/chosen": -2.2799272537231445, + "logits/rejected": -1.7658333778381348, + "logps/chosen": -409.2884826660156, + "logps/rejected": -312.7856750488281, + "loss": 0.4545, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5605814456939697, + "rewards/margins": 2.6428394317626953, + "rewards/rejected": -4.203420639038086, + "step": 1633 + }, + { + "epoch": 0.34, + "learning_rate": 1.3218487394957984e-05, + "logits/chosen": -2.151341438293457, + "logits/rejected": -1.793632984161377, + "logps/chosen": -322.8631591796875, + "logps/rejected": -288.47479248046875, + "loss": 0.1409, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0815314054489136, + "rewards/margins": 4.677453517913818, + "rewards/rejected": -5.7589850425720215, + "step": 1634 + }, + { + "epoch": 0.34, + "learning_rate": 1.3214285714285716e-05, + "logits/chosen": -2.164707899093628, + "logits/rejected": -1.5787895917892456, + "logps/chosen": -325.22772216796875, + "logps/rejected": -293.02630615234375, + "loss": 0.2226, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.0245189666748047, + "rewards/margins": 2.891779899597168, + "rewards/rejected": -4.916299343109131, + "step": 1635 + }, + { + "epoch": 0.34, + "learning_rate": 1.3210084033613446e-05, + "logits/chosen": -2.020193576812744, + "logits/rejected": -1.8130548000335693, + "logps/chosen": -280.83221435546875, + "logps/rejected": -333.7970886230469, + "loss": 0.2689, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9360840320587158, + "rewards/margins": 2.9368505477905273, + "rewards/rejected": -4.872934818267822, + "step": 1636 + }, + { + "epoch": 0.34, + "learning_rate": 1.3205882352941178e-05, + "logits/chosen": -2.1149120330810547, + "logits/rejected": -1.5744532346725464, + "logps/chosen": -383.81488037109375, + "logps/rejected": -387.3052978515625, + "loss": 0.1363, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.796541154384613, + "rewards/margins": 4.915252208709717, + "rewards/rejected": -5.711793899536133, + "step": 1637 + }, + { + "epoch": 0.34, + "learning_rate": 1.3201680672268908e-05, + "logits/chosen": -1.8643388748168945, + "logits/rejected": -2.0202724933624268, + "logps/chosen": -333.83203125, + "logps/rejected": -293.6918029785156, + "loss": 0.4123, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.1033411026000977, + "rewards/margins": 1.726037859916687, + "rewards/rejected": -3.829378843307495, + "step": 1638 + }, + { + "epoch": 0.34, + "learning_rate": 1.319747899159664e-05, + "logits/chosen": -2.349349021911621, + "logits/rejected": -1.67177152633667, + "logps/chosen": -328.877197265625, + "logps/rejected": -352.9701232910156, + "loss": 0.3002, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7484947443008423, + "rewards/margins": 4.518703460693359, + "rewards/rejected": -6.26719856262207, + "step": 1639 + }, + { + "epoch": 0.34, + "learning_rate": 1.319327731092437e-05, + "logits/chosen": -2.196012496948242, + "logits/rejected": -1.8797670602798462, + "logps/chosen": -371.23480224609375, + "logps/rejected": -340.79345703125, + "loss": 0.2041, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7328266501426697, + "rewards/margins": 3.8800745010375977, + "rewards/rejected": -4.612901210784912, + "step": 1640 + }, + { + "epoch": 0.34, + "learning_rate": 1.3189075630252102e-05, + "logits/chosen": -2.105725049972534, + "logits/rejected": -2.1755247116088867, + "logps/chosen": -318.401123046875, + "logps/rejected": -389.2422790527344, + "loss": 0.6833, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.0785398483276367, + "rewards/margins": 2.153319835662842, + "rewards/rejected": -4.23185920715332, + "step": 1641 + }, + { + "epoch": 0.34, + "learning_rate": 1.3184873949579832e-05, + "logits/chosen": -1.8373757600784302, + "logits/rejected": -1.813422441482544, + "logps/chosen": -252.28561401367188, + "logps/rejected": -243.92356872558594, + "loss": 0.5042, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.691725254058838, + "rewards/margins": 2.1929168701171875, + "rewards/rejected": -3.8846421241760254, + "step": 1642 + }, + { + "epoch": 0.34, + "learning_rate": 1.3180672268907564e-05, + "logits/chosen": -2.041912317276001, + "logits/rejected": -1.4242440462112427, + "logps/chosen": -298.460693359375, + "logps/rejected": -293.2347106933594, + "loss": 0.1318, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1325955390930176, + "rewards/margins": 2.9155337810516357, + "rewards/rejected": -4.048129081726074, + "step": 1643 + }, + { + "epoch": 0.34, + "learning_rate": 1.3176470588235294e-05, + "logits/chosen": -1.719459056854248, + "logits/rejected": -1.5744354724884033, + "logps/chosen": -363.67266845703125, + "logps/rejected": -313.31292724609375, + "loss": 0.2477, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8975053429603577, + "rewards/margins": 4.022075653076172, + "rewards/rejected": -4.919580936431885, + "step": 1644 + }, + { + "epoch": 0.34, + "learning_rate": 1.3172268907563026e-05, + "logits/chosen": -2.153738498687744, + "logits/rejected": -1.793424129486084, + "logps/chosen": -334.5963134765625, + "logps/rejected": -268.81036376953125, + "loss": 0.1327, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6021678447723389, + "rewards/margins": 4.574833393096924, + "rewards/rejected": -5.177000999450684, + "step": 1645 + }, + { + "epoch": 0.34, + "learning_rate": 1.3168067226890756e-05, + "logits/chosen": -2.1177711486816406, + "logits/rejected": -2.199002742767334, + "logps/chosen": -380.64990234375, + "logps/rejected": -387.0482177734375, + "loss": 0.406, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.27392637729644775, + "rewards/margins": 3.4210567474365234, + "rewards/rejected": -3.1471304893493652, + "step": 1646 + }, + { + "epoch": 0.34, + "learning_rate": 1.3163865546218488e-05, + "logits/chosen": -2.1755337715148926, + "logits/rejected": -1.8445426225662231, + "logps/chosen": -306.31964111328125, + "logps/rejected": -261.7821960449219, + "loss": 0.4009, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.680124282836914, + "rewards/margins": 1.8594114780426025, + "rewards/rejected": -3.5395355224609375, + "step": 1647 + }, + { + "epoch": 0.34, + "learning_rate": 1.315966386554622e-05, + "logits/chosen": -2.325439929962158, + "logits/rejected": -1.9233131408691406, + "logps/chosen": -283.2689208984375, + "logps/rejected": -243.05316162109375, + "loss": 0.4014, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9223891496658325, + "rewards/margins": 2.5972461700439453, + "rewards/rejected": -3.5196352005004883, + "step": 1648 + }, + { + "epoch": 0.34, + "learning_rate": 1.315546218487395e-05, + "logits/chosen": -1.7490565776824951, + "logits/rejected": -1.658849835395813, + "logps/chosen": -238.8974609375, + "logps/rejected": -272.6612548828125, + "loss": 0.2375, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3518321514129639, + "rewards/margins": 3.057460069656372, + "rewards/rejected": -4.409292221069336, + "step": 1649 + }, + { + "epoch": 0.35, + "learning_rate": 1.3151260504201682e-05, + "logits/chosen": -2.298921823501587, + "logits/rejected": -2.1890370845794678, + "logps/chosen": -340.6194152832031, + "logps/rejected": -415.7105407714844, + "loss": 0.5048, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0801947116851807, + "rewards/margins": 2.944852352142334, + "rewards/rejected": -4.0250468254089355, + "step": 1650 + }, + { + "epoch": 0.35, + "learning_rate": 1.3147058823529413e-05, + "logits/chosen": -2.0056240558624268, + "logits/rejected": -2.009559154510498, + "logps/chosen": -328.42962646484375, + "logps/rejected": -442.249267578125, + "loss": 0.0666, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7312971949577332, + "rewards/margins": 5.398517608642578, + "rewards/rejected": -6.129814624786377, + "step": 1651 + }, + { + "epoch": 0.35, + "learning_rate": 1.3142857142857145e-05, + "logits/chosen": -2.2275142669677734, + "logits/rejected": -1.7594361305236816, + "logps/chosen": -312.7616271972656, + "logps/rejected": -246.13156127929688, + "loss": 0.2979, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8992899060249329, + "rewards/margins": 3.912074089050293, + "rewards/rejected": -4.81136417388916, + "step": 1652 + }, + { + "epoch": 0.35, + "learning_rate": 1.3138655462184875e-05, + "logits/chosen": -2.0658066272735596, + "logits/rejected": -1.7544759511947632, + "logps/chosen": -440.46502685546875, + "logps/rejected": -368.6729431152344, + "loss": 0.1422, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9311587810516357, + "rewards/margins": 5.469476222991943, + "rewards/rejected": -6.400634765625, + "step": 1653 + }, + { + "epoch": 0.35, + "learning_rate": 1.3134453781512607e-05, + "logits/chosen": -2.035953998565674, + "logits/rejected": -2.1727447509765625, + "logps/chosen": -276.2041015625, + "logps/rejected": -363.9278564453125, + "loss": 0.3343, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0082136392593384, + "rewards/margins": 3.557870626449585, + "rewards/rejected": -4.566083908081055, + "step": 1654 + }, + { + "epoch": 0.35, + "learning_rate": 1.3130252100840337e-05, + "logits/chosen": -2.1151387691497803, + "logits/rejected": -1.5189028978347778, + "logps/chosen": -315.59197998046875, + "logps/rejected": -293.00732421875, + "loss": 0.2988, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.333604097366333, + "rewards/margins": 2.8727025985717773, + "rewards/rejected": -4.206306457519531, + "step": 1655 + }, + { + "epoch": 0.35, + "learning_rate": 1.3126050420168069e-05, + "logits/chosen": -2.190833568572998, + "logits/rejected": -1.8973686695098877, + "logps/chosen": -326.2862548828125, + "logps/rejected": -229.34274291992188, + "loss": 0.2783, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0256329774856567, + "rewards/margins": 2.987743854522705, + "rewards/rejected": -4.013376712799072, + "step": 1656 + }, + { + "epoch": 0.35, + "learning_rate": 1.3121848739495799e-05, + "logits/chosen": -2.1583807468414307, + "logits/rejected": -1.9810147285461426, + "logps/chosen": -289.7817687988281, + "logps/rejected": -330.85675048828125, + "loss": 0.104, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2317479848861694, + "rewards/margins": 5.332660675048828, + "rewards/rejected": -6.564408779144287, + "step": 1657 + }, + { + "epoch": 0.35, + "learning_rate": 1.3117647058823531e-05, + "logits/chosen": -1.7619329690933228, + "logits/rejected": -1.868417501449585, + "logps/chosen": -224.9616241455078, + "logps/rejected": -288.0164794921875, + "loss": 0.1409, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.1625595092773438, + "rewards/margins": 4.720888614654541, + "rewards/rejected": -6.883448600769043, + "step": 1658 + }, + { + "epoch": 0.35, + "learning_rate": 1.3113445378151261e-05, + "logits/chosen": -2.2975878715515137, + "logits/rejected": -1.8441202640533447, + "logps/chosen": -465.9472351074219, + "logps/rejected": -357.5421142578125, + "loss": 0.1762, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5699071884155273, + "rewards/margins": 4.306576728820801, + "rewards/rejected": -4.876484394073486, + "step": 1659 + }, + { + "epoch": 0.35, + "learning_rate": 1.3109243697478993e-05, + "logits/chosen": -2.096003532409668, + "logits/rejected": -1.9058789014816284, + "logps/chosen": -388.0535888671875, + "logps/rejected": -395.0906066894531, + "loss": 0.5501, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2597575187683105, + "rewards/margins": 2.4487240314483643, + "rewards/rejected": -3.708481550216675, + "step": 1660 + }, + { + "epoch": 0.35, + "learning_rate": 1.3105042016806723e-05, + "logits/chosen": -2.369222402572632, + "logits/rejected": -2.5502641201019287, + "logps/chosen": -394.6534729003906, + "logps/rejected": -359.2616271972656, + "loss": 0.4286, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1266310214996338, + "rewards/margins": 2.82256817817688, + "rewards/rejected": -3.9491991996765137, + "step": 1661 + }, + { + "epoch": 0.35, + "learning_rate": 1.3100840336134455e-05, + "logits/chosen": -1.8997111320495605, + "logits/rejected": -2.1685891151428223, + "logps/chosen": -400.38848876953125, + "logps/rejected": -414.1773681640625, + "loss": 0.0796, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8077974319458008, + "rewards/margins": 4.451169967651367, + "rewards/rejected": -5.258967399597168, + "step": 1662 + }, + { + "epoch": 0.35, + "learning_rate": 1.3096638655462185e-05, + "logits/chosen": -2.1664371490478516, + "logits/rejected": -1.9259426593780518, + "logps/chosen": -316.7283935546875, + "logps/rejected": -343.28485107421875, + "loss": 0.1062, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8430044651031494, + "rewards/margins": 4.996984481811523, + "rewards/rejected": -6.839988708496094, + "step": 1663 + }, + { + "epoch": 0.35, + "learning_rate": 1.3092436974789917e-05, + "logits/chosen": -2.1360716819763184, + "logits/rejected": -1.9247126579284668, + "logps/chosen": -354.1589050292969, + "logps/rejected": -270.7493896484375, + "loss": 0.2434, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0694624185562134, + "rewards/margins": 3.043073892593384, + "rewards/rejected": -4.1125359535217285, + "step": 1664 + }, + { + "epoch": 0.35, + "learning_rate": 1.3088235294117648e-05, + "logits/chosen": -2.1518170833587646, + "logits/rejected": -1.7931030988693237, + "logps/chosen": -371.633544921875, + "logps/rejected": -380.6823425292969, + "loss": 0.4844, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.833035409450531, + "rewards/margins": 4.394563674926758, + "rewards/rejected": -5.227599143981934, + "step": 1665 + }, + { + "epoch": 0.35, + "learning_rate": 1.308403361344538e-05, + "logits/chosen": -2.020862102508545, + "logits/rejected": -1.6553512811660767, + "logps/chosen": -469.6575927734375, + "logps/rejected": -349.978515625, + "loss": 0.1766, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8553345203399658, + "rewards/margins": 4.1483259201049805, + "rewards/rejected": -5.003661155700684, + "step": 1666 + }, + { + "epoch": 0.35, + "learning_rate": 1.307983193277311e-05, + "logits/chosen": -1.8492913246154785, + "logits/rejected": -1.68524169921875, + "logps/chosen": -261.5853271484375, + "logps/rejected": -290.50189208984375, + "loss": 0.5328, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9632350206375122, + "rewards/margins": 2.1344101428985596, + "rewards/rejected": -4.097645282745361, + "step": 1667 + }, + { + "epoch": 0.35, + "learning_rate": 1.3075630252100842e-05, + "logits/chosen": -2.036646842956543, + "logits/rejected": -1.932445764541626, + "logps/chosen": -334.4857482910156, + "logps/rejected": -478.5420227050781, + "loss": 0.3821, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.255039930343628, + "rewards/margins": 4.666387557983398, + "rewards/rejected": -5.9214277267456055, + "step": 1668 + }, + { + "epoch": 0.35, + "learning_rate": 1.3071428571428572e-05, + "logits/chosen": -2.1748783588409424, + "logits/rejected": -2.013597011566162, + "logps/chosen": -269.46929931640625, + "logps/rejected": -242.2169189453125, + "loss": 0.3967, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2207040786743164, + "rewards/margins": 3.145719289779663, + "rewards/rejected": -4.3664231300354, + "step": 1669 + }, + { + "epoch": 0.35, + "learning_rate": 1.3067226890756304e-05, + "logits/chosen": -1.9189457893371582, + "logits/rejected": -1.7714476585388184, + "logps/chosen": -447.6824951171875, + "logps/rejected": -468.66583251953125, + "loss": 0.2026, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8013466000556946, + "rewards/margins": 4.112392425537109, + "rewards/rejected": -4.913739204406738, + "step": 1670 + }, + { + "epoch": 0.35, + "learning_rate": 1.3063025210084036e-05, + "logits/chosen": -1.7721881866455078, + "logits/rejected": -1.9631613492965698, + "logps/chosen": -361.6551513671875, + "logps/rejected": -399.54364013671875, + "loss": 0.4022, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5698039531707764, + "rewards/margins": 3.922056198120117, + "rewards/rejected": -5.491860389709473, + "step": 1671 + }, + { + "epoch": 0.35, + "learning_rate": 1.3058823529411766e-05, + "logits/chosen": -2.0548341274261475, + "logits/rejected": -1.5207706689834595, + "logps/chosen": -327.8741149902344, + "logps/rejected": -278.2254638671875, + "loss": 0.4888, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5704176425933838, + "rewards/margins": 3.4351437091827393, + "rewards/rejected": -5.005561828613281, + "step": 1672 + }, + { + "epoch": 0.35, + "learning_rate": 1.3054621848739498e-05, + "logits/chosen": -1.7771590948104858, + "logits/rejected": -2.0469908714294434, + "logps/chosen": -230.1295166015625, + "logps/rejected": -401.906005859375, + "loss": 0.1166, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2550369501113892, + "rewards/margins": 4.642829895019531, + "rewards/rejected": -5.897866725921631, + "step": 1673 + }, + { + "epoch": 0.35, + "learning_rate": 1.3050420168067228e-05, + "logits/chosen": -2.1374335289001465, + "logits/rejected": -1.9265004396438599, + "logps/chosen": -384.01922607421875, + "logps/rejected": -340.26812744140625, + "loss": 0.2386, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.002604365348816, + "rewards/margins": 3.3432116508483887, + "rewards/rejected": -4.345816135406494, + "step": 1674 + }, + { + "epoch": 0.35, + "learning_rate": 1.304621848739496e-05, + "logits/chosen": -1.818610429763794, + "logits/rejected": -1.6062915325164795, + "logps/chosen": -314.78363037109375, + "logps/rejected": -473.2773132324219, + "loss": 0.1387, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6892642974853516, + "rewards/margins": 4.108543872833252, + "rewards/rejected": -4.7978081703186035, + "step": 1675 + }, + { + "epoch": 0.35, + "learning_rate": 1.304201680672269e-05, + "logits/chosen": -1.9655473232269287, + "logits/rejected": -1.8932485580444336, + "logps/chosen": -329.175048828125, + "logps/rejected": -301.1618347167969, + "loss": 0.5714, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.2502942085266113, + "rewards/margins": 1.7439815998077393, + "rewards/rejected": -3.9942760467529297, + "step": 1676 + }, + { + "epoch": 0.35, + "learning_rate": 1.3037815126050422e-05, + "logits/chosen": -2.179201602935791, + "logits/rejected": -1.374028205871582, + "logps/chosen": -383.7030334472656, + "logps/rejected": -332.54937744140625, + "loss": 0.3797, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0847861766815186, + "rewards/margins": 3.713181734085083, + "rewards/rejected": -4.79796838760376, + "step": 1677 + }, + { + "epoch": 0.35, + "learning_rate": 1.3033613445378152e-05, + "logits/chosen": -2.0480589866638184, + "logits/rejected": -1.5211020708084106, + "logps/chosen": -469.7742004394531, + "logps/rejected": -297.0853271484375, + "loss": 0.0797, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6429076194763184, + "rewards/margins": 4.332094192504883, + "rewards/rejected": -4.975001335144043, + "step": 1678 + }, + { + "epoch": 0.35, + "learning_rate": 1.3029411764705884e-05, + "logits/chosen": -2.1266839504241943, + "logits/rejected": -1.9897475242614746, + "logps/chosen": -259.80059814453125, + "logps/rejected": -320.8403015136719, + "loss": 0.2212, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4387110471725464, + "rewards/margins": 3.5361859798431396, + "rewards/rejected": -4.9748969078063965, + "step": 1679 + }, + { + "epoch": 0.35, + "learning_rate": 1.3025210084033614e-05, + "logits/chosen": -2.203198194503784, + "logits/rejected": -1.957892656326294, + "logps/chosen": -353.9150085449219, + "logps/rejected": -285.92095947265625, + "loss": 0.2757, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6760694980621338, + "rewards/margins": 2.700798988342285, + "rewards/rejected": -4.376868724822998, + "step": 1680 + }, + { + "epoch": 0.35, + "learning_rate": 1.3021008403361346e-05, + "logits/chosen": -2.254056692123413, + "logits/rejected": -1.9210858345031738, + "logps/chosen": -306.054931640625, + "logps/rejected": -313.51678466796875, + "loss": 0.1053, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.317150592803955, + "rewards/margins": 4.63206672668457, + "rewards/rejected": -5.949216842651367, + "step": 1681 + }, + { + "epoch": 0.35, + "learning_rate": 1.3016806722689077e-05, + "logits/chosen": -2.178771495819092, + "logits/rejected": -1.9883192777633667, + "logps/chosen": -380.27874755859375, + "logps/rejected": -404.1409912109375, + "loss": 0.1743, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.469873309135437, + "rewards/margins": 5.4599432945251465, + "rewards/rejected": -5.929816722869873, + "step": 1682 + }, + { + "epoch": 0.35, + "learning_rate": 1.3012605042016809e-05, + "logits/chosen": -2.228998899459839, + "logits/rejected": -1.970962643623352, + "logps/chosen": -261.6079406738281, + "logps/rejected": -294.96038818359375, + "loss": 0.3451, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3882781267166138, + "rewards/margins": 3.478013753890991, + "rewards/rejected": -4.8662919998168945, + "step": 1683 + }, + { + "epoch": 0.35, + "learning_rate": 1.3008403361344539e-05, + "logits/chosen": -2.1628856658935547, + "logits/rejected": -1.9326066970825195, + "logps/chosen": -355.4634094238281, + "logps/rejected": -398.8327331542969, + "loss": 0.4217, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5831526517868042, + "rewards/margins": 3.4703168869018555, + "rewards/rejected": -5.053469657897949, + "step": 1684 + }, + { + "epoch": 0.35, + "learning_rate": 1.300420168067227e-05, + "logits/chosen": -1.8383350372314453, + "logits/rejected": -1.75159752368927, + "logps/chosen": -292.02459716796875, + "logps/rejected": -351.68597412109375, + "loss": 0.5695, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.371532917022705, + "rewards/margins": 2.804790496826172, + "rewards/rejected": -5.176322937011719, + "step": 1685 + }, + { + "epoch": 0.35, + "learning_rate": 1.3000000000000001e-05, + "logits/chosen": -1.9471732378005981, + "logits/rejected": -1.7567026615142822, + "logps/chosen": -310.8711853027344, + "logps/rejected": -369.2457275390625, + "loss": 0.2648, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2930207252502441, + "rewards/margins": 4.7606048583984375, + "rewards/rejected": -6.053625583648682, + "step": 1686 + }, + { + "epoch": 0.35, + "learning_rate": 1.2995798319327733e-05, + "logits/chosen": -2.4826221466064453, + "logits/rejected": -1.883579134941101, + "logps/chosen": -353.9053039550781, + "logps/rejected": -299.3062744140625, + "loss": 0.2734, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2804255485534668, + "rewards/margins": 3.731477737426758, + "rewards/rejected": -5.011903285980225, + "step": 1687 + }, + { + "epoch": 0.35, + "learning_rate": 1.2991596638655463e-05, + "logits/chosen": -1.949670433998108, + "logits/rejected": -2.1552414894104004, + "logps/chosen": -287.7423400878906, + "logps/rejected": -339.22406005859375, + "loss": 0.2297, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.794440507888794, + "rewards/margins": 3.315462350845337, + "rewards/rejected": -5.109902858734131, + "step": 1688 + }, + { + "epoch": 0.35, + "learning_rate": 1.2987394957983195e-05, + "logits/chosen": -2.0938405990600586, + "logits/rejected": -1.6132357120513916, + "logps/chosen": -408.9571533203125, + "logps/rejected": -335.2628479003906, + "loss": 0.1852, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.367767095565796, + "rewards/margins": 5.658818244934082, + "rewards/rejected": -7.026585578918457, + "step": 1689 + }, + { + "epoch": 0.35, + "learning_rate": 1.2983193277310925e-05, + "logits/chosen": -2.1019012928009033, + "logits/rejected": -2.130849599838257, + "logps/chosen": -369.7879638671875, + "logps/rejected": -396.8240966796875, + "loss": 0.5204, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4950433373451233, + "rewards/margins": 3.2884771823883057, + "rewards/rejected": -3.783520221710205, + "step": 1690 + }, + { + "epoch": 0.35, + "learning_rate": 1.2978991596638657e-05, + "logits/chosen": -2.2379186153411865, + "logits/rejected": -2.2304952144622803, + "logps/chosen": -280.2316589355469, + "logps/rejected": -361.62396240234375, + "loss": 0.4174, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9434300661087036, + "rewards/margins": 3.52789306640625, + "rewards/rejected": -5.471323013305664, + "step": 1691 + }, + { + "epoch": 0.35, + "learning_rate": 1.2974789915966387e-05, + "logits/chosen": -2.225423574447632, + "logits/rejected": -1.4466331005096436, + "logps/chosen": -308.32159423828125, + "logps/rejected": -233.5181884765625, + "loss": 0.6001, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.222628116607666, + "rewards/margins": 1.9431620836257935, + "rewards/rejected": -4.16579008102417, + "step": 1692 + }, + { + "epoch": 0.35, + "learning_rate": 1.297058823529412e-05, + "logits/chosen": -1.9872664213180542, + "logits/rejected": -1.6573081016540527, + "logps/chosen": -271.5209655761719, + "logps/rejected": -313.69921875, + "loss": 0.4143, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.1143531799316406, + "rewards/margins": 2.284224510192871, + "rewards/rejected": -4.398577690124512, + "step": 1693 + }, + { + "epoch": 0.35, + "learning_rate": 1.2966386554621851e-05, + "logits/chosen": -2.2252771854400635, + "logits/rejected": -2.257521390914917, + "logps/chosen": -238.01895141601562, + "logps/rejected": -240.06521606445312, + "loss": 0.2024, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.4267501831054688, + "rewards/margins": 2.910149097442627, + "rewards/rejected": -5.3368988037109375, + "step": 1694 + }, + { + "epoch": 0.35, + "learning_rate": 1.2962184873949581e-05, + "logits/chosen": -2.2101621627807617, + "logits/rejected": -1.762978196144104, + "logps/chosen": -242.93405151367188, + "logps/rejected": -236.29269409179688, + "loss": 0.3158, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.816237449645996, + "rewards/margins": 2.754617214202881, + "rewards/rejected": -4.570854663848877, + "step": 1695 + }, + { + "epoch": 0.35, + "learning_rate": 1.2957983193277313e-05, + "logits/chosen": -2.1546010971069336, + "logits/rejected": -1.8238812685012817, + "logps/chosen": -285.2514953613281, + "logps/rejected": -349.20550537109375, + "loss": 0.1674, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3295536041259766, + "rewards/margins": 3.6256442070007324, + "rewards/rejected": -4.955197811126709, + "step": 1696 + }, + { + "epoch": 0.36, + "learning_rate": 1.2953781512605043e-05, + "logits/chosen": -2.1779394149780273, + "logits/rejected": -1.8502485752105713, + "logps/chosen": -292.621826171875, + "logps/rejected": -299.5543212890625, + "loss": 0.317, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6964635848999023, + "rewards/margins": 2.1967406272888184, + "rewards/rejected": -3.8932042121887207, + "step": 1697 + }, + { + "epoch": 0.36, + "learning_rate": 1.2949579831932775e-05, + "logits/chosen": -2.0696017742156982, + "logits/rejected": -1.9356679916381836, + "logps/chosen": -313.1915283203125, + "logps/rejected": -340.41754150390625, + "loss": 0.2216, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.732241153717041, + "rewards/margins": 4.313253879547119, + "rewards/rejected": -6.045494556427002, + "step": 1698 + }, + { + "epoch": 0.36, + "learning_rate": 1.2945378151260506e-05, + "logits/chosen": -2.3744168281555176, + "logits/rejected": -1.8174413442611694, + "logps/chosen": -354.8280944824219, + "logps/rejected": -304.28192138671875, + "loss": 0.3596, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7457529306411743, + "rewards/margins": 3.110261917114258, + "rewards/rejected": -3.856015205383301, + "step": 1699 + }, + { + "epoch": 0.36, + "learning_rate": 1.2941176470588238e-05, + "logits/chosen": -2.397106170654297, + "logits/rejected": -2.2451157569885254, + "logps/chosen": -337.36480712890625, + "logps/rejected": -287.72027587890625, + "loss": 0.4298, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.0775704383850098, + "rewards/margins": 2.816797971725464, + "rewards/rejected": -4.894368648529053, + "step": 1700 + }, + { + "epoch": 0.36, + "learning_rate": 1.2936974789915968e-05, + "logits/chosen": -1.84895920753479, + "logits/rejected": -1.7207789421081543, + "logps/chosen": -374.7227478027344, + "logps/rejected": -318.84088134765625, + "loss": 0.3819, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4286924600601196, + "rewards/margins": 2.8925764560699463, + "rewards/rejected": -4.3212690353393555, + "step": 1701 + }, + { + "epoch": 0.36, + "learning_rate": 1.29327731092437e-05, + "logits/chosen": -2.4556751251220703, + "logits/rejected": -1.8722951412200928, + "logps/chosen": -341.0693664550781, + "logps/rejected": -348.319091796875, + "loss": 0.2113, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9394290447235107, + "rewards/margins": 3.7422666549682617, + "rewards/rejected": -5.681694984436035, + "step": 1702 + }, + { + "epoch": 0.36, + "learning_rate": 1.292857142857143e-05, + "logits/chosen": -2.3573899269104004, + "logits/rejected": -2.2098684310913086, + "logps/chosen": -405.17132568359375, + "logps/rejected": -351.2940673828125, + "loss": 0.1851, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8742220401763916, + "rewards/margins": 3.484382152557373, + "rewards/rejected": -5.3586039543151855, + "step": 1703 + }, + { + "epoch": 0.36, + "learning_rate": 1.2924369747899162e-05, + "logits/chosen": -2.083980083465576, + "logits/rejected": -2.036158323287964, + "logps/chosen": -341.2261962890625, + "logps/rejected": -345.0203857421875, + "loss": 0.237, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.434563159942627, + "rewards/margins": 3.8078739643096924, + "rewards/rejected": -5.242437362670898, + "step": 1704 + }, + { + "epoch": 0.36, + "learning_rate": 1.2920168067226892e-05, + "logits/chosen": -2.2365522384643555, + "logits/rejected": -1.835016131401062, + "logps/chosen": -326.5932312011719, + "logps/rejected": -302.18243408203125, + "loss": 0.2322, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9178954362869263, + "rewards/margins": 3.2546377182006836, + "rewards/rejected": -5.17253303527832, + "step": 1705 + }, + { + "epoch": 0.36, + "learning_rate": 1.2915966386554624e-05, + "logits/chosen": -1.9193065166473389, + "logits/rejected": -1.8553701639175415, + "logps/chosen": -295.0126953125, + "logps/rejected": -276.7940979003906, + "loss": 0.5254, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.5003440380096436, + "rewards/margins": 3.9395179748535156, + "rewards/rejected": -6.43986177444458, + "step": 1706 + }, + { + "epoch": 0.36, + "learning_rate": 1.2911764705882354e-05, + "logits/chosen": -2.281768798828125, + "logits/rejected": -1.9560492038726807, + "logps/chosen": -360.1552734375, + "logps/rejected": -312.75018310546875, + "loss": 0.2035, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6499032974243164, + "rewards/margins": 3.5419821739196777, + "rewards/rejected": -5.191885948181152, + "step": 1707 + }, + { + "epoch": 0.36, + "learning_rate": 1.2907563025210086e-05, + "logits/chosen": -2.2163238525390625, + "logits/rejected": -2.014315128326416, + "logps/chosen": -292.4384765625, + "logps/rejected": -328.5286560058594, + "loss": 0.2725, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8796565532684326, + "rewards/margins": 3.0182971954345703, + "rewards/rejected": -4.897953510284424, + "step": 1708 + }, + { + "epoch": 0.36, + "learning_rate": 1.2903361344537816e-05, + "logits/chosen": -2.0062756538391113, + "logits/rejected": -1.6757999658584595, + "logps/chosen": -375.4515686035156, + "logps/rejected": -343.0420837402344, + "loss": 0.3978, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.3917741775512695, + "rewards/margins": 3.356097459793091, + "rewards/rejected": -5.747871398925781, + "step": 1709 + }, + { + "epoch": 0.36, + "learning_rate": 1.2899159663865548e-05, + "logits/chosen": -1.9768619537353516, + "logits/rejected": -2.1316959857940674, + "logps/chosen": -263.69525146484375, + "logps/rejected": -283.38055419921875, + "loss": 0.1474, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.441432237625122, + "rewards/margins": 3.5668373107910156, + "rewards/rejected": -5.008269309997559, + "step": 1710 + }, + { + "epoch": 0.36, + "learning_rate": 1.2894957983193278e-05, + "logits/chosen": -2.216149091720581, + "logits/rejected": -1.718430995941162, + "logps/chosen": -354.5867004394531, + "logps/rejected": -282.2593078613281, + "loss": 0.165, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6971702575683594, + "rewards/margins": 4.483598232269287, + "rewards/rejected": -6.180768013000488, + "step": 1711 + }, + { + "epoch": 0.36, + "learning_rate": 1.289075630252101e-05, + "logits/chosen": -1.678892731666565, + "logits/rejected": -1.6397696733474731, + "logps/chosen": -243.97219848632812, + "logps/rejected": -267.4880065917969, + "loss": 0.3449, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9729036092758179, + "rewards/margins": 3.4702534675598145, + "rewards/rejected": -5.443157196044922, + "step": 1712 + }, + { + "epoch": 0.36, + "learning_rate": 1.288655462184874e-05, + "logits/chosen": -2.331613302230835, + "logits/rejected": -2.3062655925750732, + "logps/chosen": -310.0999755859375, + "logps/rejected": -326.1746520996094, + "loss": 0.4395, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.202488660812378, + "rewards/margins": 2.093700408935547, + "rewards/rejected": -4.296189308166504, + "step": 1713 + }, + { + "epoch": 0.36, + "learning_rate": 1.2882352941176473e-05, + "logits/chosen": -2.1066927909851074, + "logits/rejected": -1.738647222518921, + "logps/chosen": -486.341796875, + "logps/rejected": -322.0362243652344, + "loss": 0.3909, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0142428874969482, + "rewards/margins": 4.968484878540039, + "rewards/rejected": -5.982728004455566, + "step": 1714 + }, + { + "epoch": 0.36, + "learning_rate": 1.2878151260504204e-05, + "logits/chosen": -2.419487237930298, + "logits/rejected": -2.159360408782959, + "logps/chosen": -434.7996826171875, + "logps/rejected": -349.596923828125, + "loss": 0.4208, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8335366249084473, + "rewards/margins": 3.2302937507629395, + "rewards/rejected": -5.063830375671387, + "step": 1715 + }, + { + "epoch": 0.36, + "learning_rate": 1.2873949579831935e-05, + "logits/chosen": -2.220304489135742, + "logits/rejected": -1.7708113193511963, + "logps/chosen": -379.8236083984375, + "logps/rejected": -368.7391357421875, + "loss": 0.2752, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.0264580249786377, + "rewards/margins": 4.4852471351623535, + "rewards/rejected": -6.51170539855957, + "step": 1716 + }, + { + "epoch": 0.36, + "learning_rate": 1.2869747899159667e-05, + "logits/chosen": -2.1748082637786865, + "logits/rejected": -1.5972838401794434, + "logps/chosen": -407.4959716796875, + "logps/rejected": -305.56988525390625, + "loss": 0.1021, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.893852949142456, + "rewards/margins": 4.107840538024902, + "rewards/rejected": -6.0016937255859375, + "step": 1717 + }, + { + "epoch": 0.36, + "learning_rate": 1.2865546218487397e-05, + "logits/chosen": -2.144066333770752, + "logits/rejected": -1.5571010112762451, + "logps/chosen": -267.53857421875, + "logps/rejected": -250.59426879882812, + "loss": 0.2017, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.165712833404541, + "rewards/margins": 4.660351753234863, + "rewards/rejected": -6.826064586639404, + "step": 1718 + }, + { + "epoch": 0.36, + "learning_rate": 1.2861344537815129e-05, + "logits/chosen": -2.232433795928955, + "logits/rejected": -1.9537347555160522, + "logps/chosen": -430.7542724609375, + "logps/rejected": -387.421630859375, + "loss": 0.1598, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.114269733428955, + "rewards/margins": 4.251499176025391, + "rewards/rejected": -5.365768909454346, + "step": 1719 + }, + { + "epoch": 0.36, + "learning_rate": 1.2857142857142859e-05, + "logits/chosen": -1.731231927871704, + "logits/rejected": -1.8613042831420898, + "logps/chosen": -224.86611938476562, + "logps/rejected": -280.61004638671875, + "loss": 0.4473, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4507578611373901, + "rewards/margins": 3.0061612129211426, + "rewards/rejected": -4.456918716430664, + "step": 1720 + }, + { + "epoch": 0.36, + "learning_rate": 1.285294117647059e-05, + "logits/chosen": -2.3639960289001465, + "logits/rejected": -1.8054587841033936, + "logps/chosen": -391.4491271972656, + "logps/rejected": -310.6698913574219, + "loss": 0.1761, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1012992858886719, + "rewards/margins": 3.7864742279052734, + "rewards/rejected": -4.887773513793945, + "step": 1721 + }, + { + "epoch": 0.36, + "learning_rate": 1.2848739495798321e-05, + "logits/chosen": -2.1803512573242188, + "logits/rejected": -2.035006046295166, + "logps/chosen": -230.33721923828125, + "logps/rejected": -293.82275390625, + "loss": 0.2387, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.971229076385498, + "rewards/margins": 2.6798977851867676, + "rewards/rejected": -4.651126861572266, + "step": 1722 + }, + { + "epoch": 0.36, + "learning_rate": 1.2844537815126053e-05, + "logits/chosen": -1.9941805601119995, + "logits/rejected": -1.542644739151001, + "logps/chosen": -399.8520202636719, + "logps/rejected": -358.58721923828125, + "loss": 0.1589, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1562397480010986, + "rewards/margins": 3.811459541320801, + "rewards/rejected": -5.96769905090332, + "step": 1723 + }, + { + "epoch": 0.36, + "learning_rate": 1.2840336134453783e-05, + "logits/chosen": -1.867579460144043, + "logits/rejected": -2.0439634323120117, + "logps/chosen": -263.67864990234375, + "logps/rejected": -341.67333984375, + "loss": 0.5047, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.1695754528045654, + "rewards/margins": 3.127701997756958, + "rewards/rejected": -5.297277450561523, + "step": 1724 + }, + { + "epoch": 0.36, + "learning_rate": 1.2836134453781515e-05, + "logits/chosen": -2.253267765045166, + "logits/rejected": -2.07066011428833, + "logps/chosen": -425.99786376953125, + "logps/rejected": -325.1829833984375, + "loss": 0.7645, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5230449438095093, + "rewards/margins": 3.2469072341918945, + "rewards/rejected": -4.769951820373535, + "step": 1725 + }, + { + "epoch": 0.36, + "learning_rate": 1.2831932773109245e-05, + "logits/chosen": -2.3167994022369385, + "logits/rejected": -2.1905324459075928, + "logps/chosen": -373.3262634277344, + "logps/rejected": -357.4512939453125, + "loss": 0.1458, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1414622068405151, + "rewards/margins": 4.395660400390625, + "rewards/rejected": -5.53712272644043, + "step": 1726 + }, + { + "epoch": 0.36, + "learning_rate": 1.2827731092436977e-05, + "logits/chosen": -2.2212750911712646, + "logits/rejected": -1.9495553970336914, + "logps/chosen": -336.9986877441406, + "logps/rejected": -290.710693359375, + "loss": 0.3887, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9888758659362793, + "rewards/margins": 4.125742435455322, + "rewards/rejected": -6.11461877822876, + "step": 1727 + }, + { + "epoch": 0.36, + "learning_rate": 1.2823529411764707e-05, + "logits/chosen": -1.922631025314331, + "logits/rejected": -1.7247275114059448, + "logps/chosen": -405.75860595703125, + "logps/rejected": -395.06939697265625, + "loss": 0.4779, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.4472742080688477, + "rewards/margins": 3.2764339447021484, + "rewards/rejected": -6.723708152770996, + "step": 1728 + }, + { + "epoch": 0.36, + "learning_rate": 1.281932773109244e-05, + "logits/chosen": -1.857358455657959, + "logits/rejected": -2.1009011268615723, + "logps/chosen": -262.1487731933594, + "logps/rejected": -258.7678527832031, + "loss": 0.2587, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.333517074584961, + "rewards/margins": 2.741753101348877, + "rewards/rejected": -5.07526969909668, + "step": 1729 + }, + { + "epoch": 0.36, + "learning_rate": 1.281512605042017e-05, + "logits/chosen": -1.9830610752105713, + "logits/rejected": -1.7838393449783325, + "logps/chosen": -349.3528137207031, + "logps/rejected": -388.6442565917969, + "loss": 0.6238, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6089346408843994, + "rewards/margins": 4.052203178405762, + "rewards/rejected": -5.661137580871582, + "step": 1730 + }, + { + "epoch": 0.36, + "learning_rate": 1.28109243697479e-05, + "logits/chosen": -2.016331672668457, + "logits/rejected": -1.0829658508300781, + "logps/chosen": -321.3553466796875, + "logps/rejected": -256.7331848144531, + "loss": 0.101, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.072201728820801, + "rewards/margins": 4.869070529937744, + "rewards/rejected": -6.941271781921387, + "step": 1731 + }, + { + "epoch": 0.36, + "learning_rate": 1.280672268907563e-05, + "logits/chosen": -1.9750113487243652, + "logits/rejected": -1.803163766860962, + "logps/chosen": -400.969482421875, + "logps/rejected": -338.1429443359375, + "loss": 0.1438, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5870110988616943, + "rewards/margins": 5.043107986450195, + "rewards/rejected": -6.630119323730469, + "step": 1732 + }, + { + "epoch": 0.36, + "learning_rate": 1.2802521008403362e-05, + "logits/chosen": -1.8410203456878662, + "logits/rejected": -1.9059548377990723, + "logps/chosen": -324.8757629394531, + "logps/rejected": -383.896728515625, + "loss": 0.7075, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.1457061767578125, + "rewards/margins": 2.7147622108459473, + "rewards/rejected": -4.860467910766602, + "step": 1733 + }, + { + "epoch": 0.36, + "learning_rate": 1.2798319327731092e-05, + "logits/chosen": -1.937516689300537, + "logits/rejected": -1.4987492561340332, + "logps/chosen": -252.4307098388672, + "logps/rejected": -281.6831359863281, + "loss": 0.2898, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8815139532089233, + "rewards/margins": 3.419353723526001, + "rewards/rejected": -5.300867557525635, + "step": 1734 + }, + { + "epoch": 0.36, + "learning_rate": 1.2794117647058824e-05, + "logits/chosen": -2.0901570320129395, + "logits/rejected": -1.9519517421722412, + "logps/chosen": -375.2912902832031, + "logps/rejected": -329.3047790527344, + "loss": 0.2603, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5332554578781128, + "rewards/margins": 3.8134186267852783, + "rewards/rejected": -5.346673965454102, + "step": 1735 + }, + { + "epoch": 0.36, + "learning_rate": 1.2789915966386554e-05, + "logits/chosen": -1.781891942024231, + "logits/rejected": -1.8139662742614746, + "logps/chosen": -300.4434814453125, + "logps/rejected": -304.4271240234375, + "loss": 0.4837, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.9548029899597168, + "rewards/margins": 3.0653021335601807, + "rewards/rejected": -5.020104885101318, + "step": 1736 + }, + { + "epoch": 0.36, + "learning_rate": 1.2785714285714286e-05, + "logits/chosen": -2.093195676803589, + "logits/rejected": -1.839459776878357, + "logps/chosen": -292.4847106933594, + "logps/rejected": -354.5055236816406, + "loss": 0.2414, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5953119993209839, + "rewards/margins": 3.8514628410339355, + "rewards/rejected": -5.446774959564209, + "step": 1737 + }, + { + "epoch": 0.36, + "learning_rate": 1.2781512605042016e-05, + "logits/chosen": -1.785282850265503, + "logits/rejected": -1.9226555824279785, + "logps/chosen": -287.1199951171875, + "logps/rejected": -390.8167419433594, + "loss": 0.2232, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9651846885681152, + "rewards/margins": 4.778981685638428, + "rewards/rejected": -6.744166374206543, + "step": 1738 + }, + { + "epoch": 0.36, + "learning_rate": 1.2777310924369748e-05, + "logits/chosen": -1.881270408630371, + "logits/rejected": -2.089820146560669, + "logps/chosen": -214.3849334716797, + "logps/rejected": -250.7349853515625, + "loss": 0.3226, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.861527919769287, + "rewards/margins": 3.774306297302246, + "rewards/rejected": -5.635834217071533, + "step": 1739 + }, + { + "epoch": 0.36, + "learning_rate": 1.2773109243697479e-05, + "logits/chosen": -2.109286308288574, + "logits/rejected": -1.6026495695114136, + "logps/chosen": -299.84893798828125, + "logps/rejected": -299.3256530761719, + "loss": 0.575, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.815997838973999, + "rewards/margins": 2.7296788692474365, + "rewards/rejected": -4.5456767082214355, + "step": 1740 + }, + { + "epoch": 0.36, + "learning_rate": 1.276890756302521e-05, + "logits/chosen": -1.9613996744155884, + "logits/rejected": -1.8531324863433838, + "logps/chosen": -271.5999755859375, + "logps/rejected": -299.2306213378906, + "loss": 0.4834, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6028615236282349, + "rewards/margins": 3.492419958114624, + "rewards/rejected": -5.095281600952148, + "step": 1741 + }, + { + "epoch": 0.36, + "learning_rate": 1.276470588235294e-05, + "logits/chosen": -1.491971731185913, + "logits/rejected": -1.623412847518921, + "logps/chosen": -319.4019775390625, + "logps/rejected": -435.154052734375, + "loss": 0.0974, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7717623710632324, + "rewards/margins": 4.269248962402344, + "rewards/rejected": -6.041011333465576, + "step": 1742 + }, + { + "epoch": 0.36, + "learning_rate": 1.2760504201680673e-05, + "logits/chosen": -2.0967891216278076, + "logits/rejected": -1.8779772520065308, + "logps/chosen": -376.2422180175781, + "logps/rejected": -333.02972412109375, + "loss": 0.6872, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.602220058441162, + "rewards/margins": 2.32466459274292, + "rewards/rejected": -4.926884651184082, + "step": 1743 + }, + { + "epoch": 0.36, + "learning_rate": 1.2756302521008403e-05, + "logits/chosen": -1.8823425769805908, + "logits/rejected": -1.8060266971588135, + "logps/chosen": -405.7274475097656, + "logps/rejected": -409.785400390625, + "loss": 0.2944, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.4691033363342285, + "rewards/margins": 5.218448638916016, + "rewards/rejected": -7.687552452087402, + "step": 1744 + }, + { + "epoch": 0.37, + "learning_rate": 1.2752100840336135e-05, + "logits/chosen": -2.1131157875061035, + "logits/rejected": -1.805212140083313, + "logps/chosen": -228.3209686279297, + "logps/rejected": -298.92724609375, + "loss": 0.1408, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1740851402282715, + "rewards/margins": 4.319099426269531, + "rewards/rejected": -6.4931840896606445, + "step": 1745 + }, + { + "epoch": 0.37, + "learning_rate": 1.2747899159663865e-05, + "logits/chosen": -1.852203369140625, + "logits/rejected": -2.099804639816284, + "logps/chosen": -266.0347595214844, + "logps/rejected": -323.8255310058594, + "loss": 0.6344, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.4451680183410645, + "rewards/margins": 3.8588805198669434, + "rewards/rejected": -6.304048538208008, + "step": 1746 + }, + { + "epoch": 0.37, + "learning_rate": 1.2743697478991597e-05, + "logits/chosen": -2.2081375122070312, + "logits/rejected": -1.62109375, + "logps/chosen": -323.3291931152344, + "logps/rejected": -277.005615234375, + "loss": 0.1506, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.2035279273986816, + "rewards/margins": 4.323655128479004, + "rewards/rejected": -6.527183532714844, + "step": 1747 + }, + { + "epoch": 0.37, + "learning_rate": 1.2739495798319327e-05, + "logits/chosen": -2.1493332386016846, + "logits/rejected": -1.4306750297546387, + "logps/chosen": -322.77001953125, + "logps/rejected": -262.59930419921875, + "loss": 0.1992, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.7850747108459473, + "rewards/margins": 2.7682077884674072, + "rewards/rejected": -5.553282260894775, + "step": 1748 + }, + { + "epoch": 0.37, + "learning_rate": 1.2735294117647059e-05, + "logits/chosen": -1.9452846050262451, + "logits/rejected": -1.942493200302124, + "logps/chosen": -250.05810546875, + "logps/rejected": -329.5069274902344, + "loss": 0.1991, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.417715549468994, + "rewards/margins": 3.924509048461914, + "rewards/rejected": -6.342224597930908, + "step": 1749 + }, + { + "epoch": 0.37, + "learning_rate": 1.2731092436974791e-05, + "logits/chosen": -2.0381953716278076, + "logits/rejected": -1.6223368644714355, + "logps/chosen": -346.24871826171875, + "logps/rejected": -302.119384765625, + "loss": 0.4739, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.098102331161499, + "rewards/margins": 3.149576187133789, + "rewards/rejected": -5.247678756713867, + "step": 1750 + }, + { + "epoch": 0.37, + "learning_rate": 1.2726890756302521e-05, + "logits/chosen": -2.0168075561523438, + "logits/rejected": -2.028434991836548, + "logps/chosen": -261.8702697753906, + "logps/rejected": -310.6669006347656, + "loss": 0.367, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.033168315887451, + "rewards/margins": 2.440186023712158, + "rewards/rejected": -4.473354339599609, + "step": 1751 + }, + { + "epoch": 0.37, + "learning_rate": 1.2722689075630253e-05, + "logits/chosen": -2.227324962615967, + "logits/rejected": -1.8990821838378906, + "logps/chosen": -516.052978515625, + "logps/rejected": -438.0694580078125, + "loss": 0.5383, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.112281084060669, + "rewards/margins": 3.0466365814208984, + "rewards/rejected": -5.1589179039001465, + "step": 1752 + }, + { + "epoch": 0.37, + "learning_rate": 1.2718487394957983e-05, + "logits/chosen": -1.8136128187179565, + "logits/rejected": -2.124232769012451, + "logps/chosen": -258.5289306640625, + "logps/rejected": -339.23126220703125, + "loss": 0.2653, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.013992190361023, + "rewards/margins": 2.954085350036621, + "rewards/rejected": -3.9680776596069336, + "step": 1753 + }, + { + "epoch": 0.37, + "learning_rate": 1.2714285714285715e-05, + "logits/chosen": -2.2477266788482666, + "logits/rejected": -1.9282069206237793, + "logps/chosen": -310.39239501953125, + "logps/rejected": -351.3646545410156, + "loss": 0.306, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4616888761520386, + "rewards/margins": 4.0910491943359375, + "rewards/rejected": -5.552738189697266, + "step": 1754 + }, + { + "epoch": 0.37, + "learning_rate": 1.2710084033613445e-05, + "logits/chosen": -1.712146282196045, + "logits/rejected": -1.875839114189148, + "logps/chosen": -206.52664184570312, + "logps/rejected": -245.27178955078125, + "loss": 0.1806, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6540236473083496, + "rewards/margins": 4.343644618988037, + "rewards/rejected": -5.997668266296387, + "step": 1755 + }, + { + "epoch": 0.37, + "learning_rate": 1.2705882352941177e-05, + "logits/chosen": -2.070586919784546, + "logits/rejected": -1.6574022769927979, + "logps/chosen": -286.0633544921875, + "logps/rejected": -268.3511962890625, + "loss": 0.4091, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6145892143249512, + "rewards/margins": 3.4328179359436035, + "rewards/rejected": -5.047407150268555, + "step": 1756 + }, + { + "epoch": 0.37, + "learning_rate": 1.2701680672268908e-05, + "logits/chosen": -2.1463136672973633, + "logits/rejected": -1.8782405853271484, + "logps/chosen": -284.5740661621094, + "logps/rejected": -271.18231201171875, + "loss": 0.1689, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2716894149780273, + "rewards/margins": 4.896943092346191, + "rewards/rejected": -6.168633460998535, + "step": 1757 + }, + { + "epoch": 0.37, + "learning_rate": 1.269747899159664e-05, + "logits/chosen": -2.218388795852661, + "logits/rejected": -1.7556936740875244, + "logps/chosen": -237.75625610351562, + "logps/rejected": -241.58062744140625, + "loss": 0.2472, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8457057476043701, + "rewards/margins": 3.9747471809387207, + "rewards/rejected": -5.82045316696167, + "step": 1758 + }, + { + "epoch": 0.37, + "learning_rate": 1.269327731092437e-05, + "logits/chosen": -2.075504779815674, + "logits/rejected": -1.9565949440002441, + "logps/chosen": -424.69708251953125, + "logps/rejected": -400.16796875, + "loss": 0.5175, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.5709381103515625, + "rewards/margins": 3.244272232055664, + "rewards/rejected": -5.815209865570068, + "step": 1759 + }, + { + "epoch": 0.37, + "learning_rate": 1.2689075630252102e-05, + "logits/chosen": -1.6349000930786133, + "logits/rejected": -1.7339026927947998, + "logps/chosen": -326.32177734375, + "logps/rejected": -389.75250244140625, + "loss": 0.4943, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.3336246013641357, + "rewards/margins": 2.4318251609802246, + "rewards/rejected": -4.765449523925781, + "step": 1760 + }, + { + "epoch": 0.37, + "learning_rate": 1.2684873949579832e-05, + "logits/chosen": -2.1046242713928223, + "logits/rejected": -1.6036499738693237, + "logps/chosen": -264.6695556640625, + "logps/rejected": -248.53213500976562, + "loss": 0.2808, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1439019441604614, + "rewards/margins": 3.315765380859375, + "rewards/rejected": -4.459667205810547, + "step": 1761 + }, + { + "epoch": 0.37, + "learning_rate": 1.2680672268907564e-05, + "logits/chosen": -2.101350784301758, + "logits/rejected": -2.019341468811035, + "logps/chosen": -265.4784240722656, + "logps/rejected": -354.42535400390625, + "loss": 0.0987, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0359983444213867, + "rewards/margins": 4.140917778015137, + "rewards/rejected": -6.176916122436523, + "step": 1762 + }, + { + "epoch": 0.37, + "learning_rate": 1.2676470588235294e-05, + "logits/chosen": -2.0482287406921387, + "logits/rejected": -1.8655407428741455, + "logps/chosen": -331.2798767089844, + "logps/rejected": -361.16424560546875, + "loss": 0.2392, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.2306530475616455, + "rewards/margins": 3.424342155456543, + "rewards/rejected": -5.654995441436768, + "step": 1763 + }, + { + "epoch": 0.37, + "learning_rate": 1.2672268907563026e-05, + "logits/chosen": -1.7039258480072021, + "logits/rejected": -1.7191208600997925, + "logps/chosen": -304.27294921875, + "logps/rejected": -319.51922607421875, + "loss": 0.5112, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9424810409545898, + "rewards/margins": 5.4296650886535645, + "rewards/rejected": -7.372146129608154, + "step": 1764 + }, + { + "epoch": 0.37, + "learning_rate": 1.2668067226890756e-05, + "logits/chosen": -2.05804443359375, + "logits/rejected": -1.5029891729354858, + "logps/chosen": -347.4152526855469, + "logps/rejected": -315.47039794921875, + "loss": 0.1379, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3976651430130005, + "rewards/margins": 5.146216869354248, + "rewards/rejected": -6.543881893157959, + "step": 1765 + }, + { + "epoch": 0.37, + "learning_rate": 1.2663865546218488e-05, + "logits/chosen": -2.1624886989593506, + "logits/rejected": -1.7037053108215332, + "logps/chosen": -347.3921813964844, + "logps/rejected": -356.04449462890625, + "loss": 0.1601, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8009859323501587, + "rewards/margins": 4.418320178985596, + "rewards/rejected": -6.219306468963623, + "step": 1766 + }, + { + "epoch": 0.37, + "learning_rate": 1.2659663865546218e-05, + "logits/chosen": -2.02056884765625, + "logits/rejected": -1.9386581182479858, + "logps/chosen": -345.57525634765625, + "logps/rejected": -402.684326171875, + "loss": 0.2409, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8323227167129517, + "rewards/margins": 4.6929826736450195, + "rewards/rejected": -6.52530574798584, + "step": 1767 + }, + { + "epoch": 0.37, + "learning_rate": 1.265546218487395e-05, + "logits/chosen": -2.2112550735473633, + "logits/rejected": -1.8923091888427734, + "logps/chosen": -477.8106384277344, + "logps/rejected": -340.67205810546875, + "loss": 0.1922, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.971642017364502, + "rewards/margins": 4.327385902404785, + "rewards/rejected": -5.299027442932129, + "step": 1768 + }, + { + "epoch": 0.37, + "learning_rate": 1.265126050420168e-05, + "logits/chosen": -2.1023073196411133, + "logits/rejected": -1.7306984663009644, + "logps/chosen": -303.4323425292969, + "logps/rejected": -323.16973876953125, + "loss": 0.2769, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.486727237701416, + "rewards/margins": 4.5621747970581055, + "rewards/rejected": -7.04890251159668, + "step": 1769 + }, + { + "epoch": 0.37, + "learning_rate": 1.2647058823529412e-05, + "logits/chosen": -1.9863693714141846, + "logits/rejected": -1.9215013980865479, + "logps/chosen": -454.6671447753906, + "logps/rejected": -401.3580322265625, + "loss": 0.392, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4198495149612427, + "rewards/margins": 2.676342010498047, + "rewards/rejected": -4.09619140625, + "step": 1770 + }, + { + "epoch": 0.37, + "learning_rate": 1.2642857142857143e-05, + "logits/chosen": -1.9747538566589355, + "logits/rejected": -1.675748348236084, + "logps/chosen": -375.435546875, + "logps/rejected": -293.918701171875, + "loss": 0.6753, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.045513391494751, + "rewards/margins": 2.489720106124878, + "rewards/rejected": -5.535233497619629, + "step": 1771 + }, + { + "epoch": 0.37, + "learning_rate": 1.2638655462184874e-05, + "logits/chosen": -1.6234101057052612, + "logits/rejected": -1.4660919904708862, + "logps/chosen": -333.38330078125, + "logps/rejected": -311.9102478027344, + "loss": 0.2456, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.617193579673767, + "rewards/margins": 3.9928269386291504, + "rewards/rejected": -5.610020637512207, + "step": 1772 + }, + { + "epoch": 0.37, + "learning_rate": 1.2634453781512606e-05, + "logits/chosen": -1.9884085655212402, + "logits/rejected": -1.7980440855026245, + "logps/chosen": -207.3913116455078, + "logps/rejected": -298.2470397949219, + "loss": 0.2065, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.4766228199005127, + "rewards/margins": 5.363602638244629, + "rewards/rejected": -7.8402252197265625, + "step": 1773 + }, + { + "epoch": 0.37, + "learning_rate": 1.2630252100840337e-05, + "logits/chosen": -2.0810811519622803, + "logits/rejected": -1.9490880966186523, + "logps/chosen": -271.8350830078125, + "logps/rejected": -317.02825927734375, + "loss": 0.3528, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.03255033493042, + "rewards/margins": 3.89451265335083, + "rewards/rejected": -5.927062511444092, + "step": 1774 + }, + { + "epoch": 0.37, + "learning_rate": 1.2626050420168069e-05, + "logits/chosen": -2.2524514198303223, + "logits/rejected": -1.6927098035812378, + "logps/chosen": -302.945556640625, + "logps/rejected": -291.4638366699219, + "loss": 0.2733, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8773443698883057, + "rewards/margins": 4.05056619644165, + "rewards/rejected": -5.927910327911377, + "step": 1775 + }, + { + "epoch": 0.37, + "learning_rate": 1.2621848739495799e-05, + "logits/chosen": -2.152149200439453, + "logits/rejected": -1.9415919780731201, + "logps/chosen": -373.915771484375, + "logps/rejected": -305.147216796875, + "loss": 0.3026, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.968032121658325, + "rewards/margins": 4.385983943939209, + "rewards/rejected": -7.354016304016113, + "step": 1776 + }, + { + "epoch": 0.37, + "learning_rate": 1.261764705882353e-05, + "logits/chosen": -2.1463778018951416, + "logits/rejected": -2.111959218978882, + "logps/chosen": -310.9941711425781, + "logps/rejected": -276.97479248046875, + "loss": 0.1454, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.508514404296875, + "rewards/margins": 4.306140899658203, + "rewards/rejected": -5.814655303955078, + "step": 1777 + }, + { + "epoch": 0.37, + "learning_rate": 1.2613445378151261e-05, + "logits/chosen": -1.9919275045394897, + "logits/rejected": -2.049964666366577, + "logps/chosen": -310.8265686035156, + "logps/rejected": -322.8869934082031, + "loss": 0.3032, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.978891372680664, + "rewards/margins": 3.948613405227661, + "rewards/rejected": -6.927504539489746, + "step": 1778 + }, + { + "epoch": 0.37, + "learning_rate": 1.2609243697478993e-05, + "logits/chosen": -2.1305489540100098, + "logits/rejected": -2.024134635925293, + "logps/chosen": -350.2703857421875, + "logps/rejected": -362.7064514160156, + "loss": 0.0997, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6748934984207153, + "rewards/margins": 6.170192241668701, + "rewards/rejected": -7.845085620880127, + "step": 1779 + }, + { + "epoch": 0.37, + "learning_rate": 1.2605042016806723e-05, + "logits/chosen": -2.060441493988037, + "logits/rejected": -1.7549536228179932, + "logps/chosen": -324.79705810546875, + "logps/rejected": -316.6864929199219, + "loss": 0.5761, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.6303749084472656, + "rewards/margins": 3.6565146446228027, + "rewards/rejected": -6.286889553070068, + "step": 1780 + }, + { + "epoch": 0.37, + "learning_rate": 1.2600840336134455e-05, + "logits/chosen": -2.2033870220184326, + "logits/rejected": -2.097784996032715, + "logps/chosen": -274.9949645996094, + "logps/rejected": -363.62841796875, + "loss": 0.2504, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.490706205368042, + "rewards/margins": 3.516636848449707, + "rewards/rejected": -6.007343292236328, + "step": 1781 + }, + { + "epoch": 0.37, + "learning_rate": 1.2596638655462185e-05, + "logits/chosen": -2.078474521636963, + "logits/rejected": -2.1691579818725586, + "logps/chosen": -321.22637939453125, + "logps/rejected": -319.9599914550781, + "loss": 0.3183, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.421416997909546, + "rewards/margins": 3.7803854942321777, + "rewards/rejected": -6.201802730560303, + "step": 1782 + }, + { + "epoch": 0.37, + "learning_rate": 1.2592436974789917e-05, + "logits/chosen": -2.055881977081299, + "logits/rejected": -1.9867157936096191, + "logps/chosen": -284.76165771484375, + "logps/rejected": -295.0223388671875, + "loss": 0.1942, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6157869100570679, + "rewards/margins": 3.34968638420105, + "rewards/rejected": -4.965473175048828, + "step": 1783 + }, + { + "epoch": 0.37, + "learning_rate": 1.2588235294117647e-05, + "logits/chosen": -2.031898021697998, + "logits/rejected": -1.9355424642562866, + "logps/chosen": -348.9483642578125, + "logps/rejected": -362.38568115234375, + "loss": 0.1382, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9775464534759521, + "rewards/margins": 3.9106125831604004, + "rewards/rejected": -5.888158798217773, + "step": 1784 + }, + { + "epoch": 0.37, + "learning_rate": 1.258403361344538e-05, + "logits/chosen": -2.340874671936035, + "logits/rejected": -2.0267221927642822, + "logps/chosen": -359.5558776855469, + "logps/rejected": -412.1551513671875, + "loss": 0.2687, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.624579668045044, + "rewards/margins": 5.184628486633301, + "rewards/rejected": -7.809207439422607, + "step": 1785 + }, + { + "epoch": 0.37, + "learning_rate": 1.257983193277311e-05, + "logits/chosen": -2.0666234493255615, + "logits/rejected": -1.7046170234680176, + "logps/chosen": -346.9346923828125, + "logps/rejected": -302.680419921875, + "loss": 0.4075, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.1801719665527344, + "rewards/margins": 4.3180341720581055, + "rewards/rejected": -7.49820613861084, + "step": 1786 + }, + { + "epoch": 0.37, + "learning_rate": 1.2575630252100841e-05, + "logits/chosen": -1.8434706926345825, + "logits/rejected": -1.7718141078948975, + "logps/chosen": -180.12802124023438, + "logps/rejected": -277.0544738769531, + "loss": 0.1289, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.9735708236694336, + "rewards/margins": 5.332880020141602, + "rewards/rejected": -8.306450843811035, + "step": 1787 + }, + { + "epoch": 0.37, + "learning_rate": 1.2571428571428572e-05, + "logits/chosen": -1.8972259759902954, + "logits/rejected": -1.821812629699707, + "logps/chosen": -302.0366516113281, + "logps/rejected": -306.9195556640625, + "loss": 0.2788, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.321805477142334, + "rewards/margins": 3.3359506130218506, + "rewards/rejected": -5.6577558517456055, + "step": 1788 + }, + { + "epoch": 0.37, + "learning_rate": 1.2567226890756304e-05, + "logits/chosen": -1.8635022640228271, + "logits/rejected": -1.8238285779953003, + "logps/chosen": -432.66510009765625, + "logps/rejected": -597.79248046875, + "loss": 0.2711, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.3408639430999756, + "rewards/margins": 3.872145414352417, + "rewards/rejected": -7.213009357452393, + "step": 1789 + }, + { + "epoch": 0.37, + "learning_rate": 1.2563025210084034e-05, + "logits/chosen": -2.29345703125, + "logits/rejected": -1.7246224880218506, + "logps/chosen": -506.98736572265625, + "logps/rejected": -349.2126159667969, + "loss": 0.5001, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9519894123077393, + "rewards/margins": 4.988216400146484, + "rewards/rejected": -6.940206050872803, + "step": 1790 + }, + { + "epoch": 0.37, + "learning_rate": 1.2558823529411766e-05, + "logits/chosen": -2.330901861190796, + "logits/rejected": -1.635657548904419, + "logps/chosen": -424.2184143066406, + "logps/rejected": -334.1633605957031, + "loss": 0.1574, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.3283421993255615, + "rewards/margins": 4.64754581451416, + "rewards/rejected": -6.975888252258301, + "step": 1791 + }, + { + "epoch": 0.37, + "learning_rate": 1.2554621848739496e-05, + "logits/chosen": -2.1808953285217285, + "logits/rejected": -1.8212199211120605, + "logps/chosen": -305.38995361328125, + "logps/rejected": -276.0369567871094, + "loss": 0.2905, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.2360754013061523, + "rewards/margins": 2.9036636352539062, + "rewards/rejected": -6.139739036560059, + "step": 1792 + }, + { + "epoch": 0.38, + "learning_rate": 1.2550420168067228e-05, + "logits/chosen": -1.8467988967895508, + "logits/rejected": -2.1311261653900146, + "logps/chosen": -357.418212890625, + "logps/rejected": -379.6018371582031, + "loss": 0.3273, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.3376049995422363, + "rewards/margins": 3.7321510314941406, + "rewards/rejected": -6.069755554199219, + "step": 1793 + }, + { + "epoch": 0.38, + "learning_rate": 1.254621848739496e-05, + "logits/chosen": -2.3632607460021973, + "logits/rejected": -1.9759379625320435, + "logps/chosen": -416.6093444824219, + "logps/rejected": -334.78594970703125, + "loss": 0.5734, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.3181886672973633, + "rewards/margins": 3.4687159061431885, + "rewards/rejected": -6.786904335021973, + "step": 1794 + }, + { + "epoch": 0.38, + "learning_rate": 1.254201680672269e-05, + "logits/chosen": -2.1788644790649414, + "logits/rejected": -1.834309697151184, + "logps/chosen": -338.6228332519531, + "logps/rejected": -351.3233337402344, + "loss": 0.0836, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.84698748588562, + "rewards/margins": 4.984423637390137, + "rewards/rejected": -7.831410884857178, + "step": 1795 + }, + { + "epoch": 0.38, + "learning_rate": 1.2537815126050422e-05, + "logits/chosen": -2.167393207550049, + "logits/rejected": -1.8985700607299805, + "logps/chosen": -296.945556640625, + "logps/rejected": -323.76593017578125, + "loss": 0.5838, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.3066606521606445, + "rewards/margins": 2.161682605743408, + "rewards/rejected": -5.468343257904053, + "step": 1796 + }, + { + "epoch": 0.38, + "learning_rate": 1.2533613445378152e-05, + "logits/chosen": -2.003908634185791, + "logits/rejected": -2.1356709003448486, + "logps/chosen": -451.6329345703125, + "logps/rejected": -366.5298156738281, + "loss": 0.2519, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.0159528255462646, + "rewards/margins": 4.419299125671387, + "rewards/rejected": -7.435251712799072, + "step": 1797 + }, + { + "epoch": 0.38, + "learning_rate": 1.2529411764705884e-05, + "logits/chosen": -2.101807117462158, + "logits/rejected": -1.8441613912582397, + "logps/chosen": -382.4060974121094, + "logps/rejected": -411.38531494140625, + "loss": 0.1702, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.4624245166778564, + "rewards/margins": 4.282219886779785, + "rewards/rejected": -6.744643688201904, + "step": 1798 + }, + { + "epoch": 0.38, + "learning_rate": 1.2525210084033614e-05, + "logits/chosen": -2.0932106971740723, + "logits/rejected": -1.756744384765625, + "logps/chosen": -321.96435546875, + "logps/rejected": -413.52667236328125, + "loss": 0.1299, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.6015396118164062, + "rewards/margins": 4.415146827697754, + "rewards/rejected": -7.01668643951416, + "step": 1799 + }, + { + "epoch": 0.38, + "learning_rate": 1.2521008403361346e-05, + "logits/chosen": -2.006406545639038, + "logits/rejected": -1.9936182498931885, + "logps/chosen": -381.1437683105469, + "logps/rejected": -385.24273681640625, + "loss": 0.1648, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6243560314178467, + "rewards/margins": 4.992252349853516, + "rewards/rejected": -6.616608142852783, + "step": 1800 + }, + { + "epoch": 0.38, + "learning_rate": 1.2516806722689076e-05, + "logits/chosen": -1.9763023853302002, + "logits/rejected": -2.1236701011657715, + "logps/chosen": -384.1940612792969, + "logps/rejected": -301.92425537109375, + "loss": 0.3579, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.282367706298828, + "rewards/margins": 3.205954074859619, + "rewards/rejected": -6.488321781158447, + "step": 1801 + }, + { + "epoch": 0.38, + "learning_rate": 1.2512605042016808e-05, + "logits/chosen": -1.9965115785598755, + "logits/rejected": -2.1180171966552734, + "logps/chosen": -328.7938537597656, + "logps/rejected": -369.47601318359375, + "loss": 0.3751, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.850086212158203, + "rewards/margins": 4.269739627838135, + "rewards/rejected": -7.119826316833496, + "step": 1802 + }, + { + "epoch": 0.38, + "learning_rate": 1.2508403361344538e-05, + "logits/chosen": -1.8857592344284058, + "logits/rejected": -1.5286210775375366, + "logps/chosen": -413.4532165527344, + "logps/rejected": -309.3803405761719, + "loss": 0.8262, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.072671413421631, + "rewards/margins": 2.1183438301086426, + "rewards/rejected": -5.191015243530273, + "step": 1803 + }, + { + "epoch": 0.38, + "learning_rate": 1.250420168067227e-05, + "logits/chosen": -1.9694397449493408, + "logits/rejected": -1.7799674272537231, + "logps/chosen": -340.10809326171875, + "logps/rejected": -273.68365478515625, + "loss": 0.2487, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.5552663803100586, + "rewards/margins": 4.07852029800415, + "rewards/rejected": -6.633787155151367, + "step": 1804 + }, + { + "epoch": 0.38, + "learning_rate": 1.25e-05, + "logits/chosen": -2.1740562915802, + "logits/rejected": -1.5397841930389404, + "logps/chosen": -398.8807678222656, + "logps/rejected": -398.06365966796875, + "loss": 0.0683, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.0995306968688965, + "rewards/margins": 5.263266563415527, + "rewards/rejected": -8.362797737121582, + "step": 1805 + }, + { + "epoch": 0.38, + "learning_rate": 1.2495798319327733e-05, + "logits/chosen": -2.008077621459961, + "logits/rejected": -1.8429821729660034, + "logps/chosen": -206.87705993652344, + "logps/rejected": -377.868408203125, + "loss": 0.0593, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.930143117904663, + "rewards/margins": 6.427976131439209, + "rewards/rejected": -9.358119010925293, + "step": 1806 + }, + { + "epoch": 0.38, + "learning_rate": 1.2491596638655463e-05, + "logits/chosen": -2.292978286743164, + "logits/rejected": -1.769348382949829, + "logps/chosen": -449.630126953125, + "logps/rejected": -356.318115234375, + "loss": 0.3196, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.5748438835144043, + "rewards/margins": 3.6595332622528076, + "rewards/rejected": -6.234376907348633, + "step": 1807 + }, + { + "epoch": 0.38, + "learning_rate": 1.2487394957983195e-05, + "logits/chosen": -1.9882423877716064, + "logits/rejected": -1.6709835529327393, + "logps/chosen": -363.9814453125, + "logps/rejected": -350.1273193359375, + "loss": 0.4197, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.1633903980255127, + "rewards/margins": 3.874863386154175, + "rewards/rejected": -7.0382537841796875, + "step": 1808 + }, + { + "epoch": 0.38, + "learning_rate": 1.2483193277310925e-05, + "logits/chosen": -1.8266547918319702, + "logits/rejected": -2.1173272132873535, + "logps/chosen": -343.9117126464844, + "logps/rejected": -471.7044372558594, + "loss": 0.2111, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.2196078300476074, + "rewards/margins": 4.750805854797363, + "rewards/rejected": -7.9704132080078125, + "step": 1809 + }, + { + "epoch": 0.38, + "learning_rate": 1.2478991596638657e-05, + "logits/chosen": -2.2981908321380615, + "logits/rejected": -2.038205623626709, + "logps/chosen": -289.29864501953125, + "logps/rejected": -438.4106140136719, + "loss": 0.4885, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.077671766281128, + "rewards/margins": 3.2957730293273926, + "rewards/rejected": -6.373445510864258, + "step": 1810 + }, + { + "epoch": 0.38, + "learning_rate": 1.2474789915966387e-05, + "logits/chosen": -2.1740357875823975, + "logits/rejected": -1.7964246273040771, + "logps/chosen": -311.854248046875, + "logps/rejected": -297.034423828125, + "loss": 0.2577, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.4747464656829834, + "rewards/margins": 5.402400493621826, + "rewards/rejected": -8.87714672088623, + "step": 1811 + }, + { + "epoch": 0.38, + "learning_rate": 1.2470588235294119e-05, + "logits/chosen": -1.9762654304504395, + "logits/rejected": -2.1120619773864746, + "logps/chosen": -319.1143493652344, + "logps/rejected": -372.3591003417969, + "loss": 0.3884, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.8194522857666016, + "rewards/margins": 3.894730806350708, + "rewards/rejected": -6.714183807373047, + "step": 1812 + }, + { + "epoch": 0.38, + "learning_rate": 1.2466386554621849e-05, + "logits/chosen": -1.7402873039245605, + "logits/rejected": -1.7582998275756836, + "logps/chosen": -412.2801208496094, + "logps/rejected": -320.85272216796875, + "loss": 0.377, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.420719623565674, + "rewards/margins": 4.192373752593994, + "rewards/rejected": -7.613093376159668, + "step": 1813 + }, + { + "epoch": 0.38, + "learning_rate": 1.2462184873949581e-05, + "logits/chosen": -2.2431375980377197, + "logits/rejected": -1.6594526767730713, + "logps/chosen": -325.0775146484375, + "logps/rejected": -374.4784240722656, + "loss": 0.3304, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.5062460899353027, + "rewards/margins": 4.599099636077881, + "rewards/rejected": -7.105345726013184, + "step": 1814 + }, + { + "epoch": 0.38, + "learning_rate": 1.2457983193277311e-05, + "logits/chosen": -2.1021182537078857, + "logits/rejected": -1.9772509336471558, + "logps/chosen": -365.6492614746094, + "logps/rejected": -404.7083740234375, + "loss": 0.1094, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.4489972591400146, + "rewards/margins": 7.313340663909912, + "rewards/rejected": -9.762338638305664, + "step": 1815 + }, + { + "epoch": 0.38, + "learning_rate": 1.2453781512605043e-05, + "logits/chosen": -2.2247331142425537, + "logits/rejected": -1.5387839078903198, + "logps/chosen": -421.32513427734375, + "logps/rejected": -293.9612731933594, + "loss": 0.2203, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6782078742980957, + "rewards/margins": 4.486258506774902, + "rewards/rejected": -6.164466381072998, + "step": 1816 + }, + { + "epoch": 0.38, + "learning_rate": 1.2449579831932775e-05, + "logits/chosen": -1.4192893505096436, + "logits/rejected": -1.1593286991119385, + "logps/chosen": -247.40869140625, + "logps/rejected": -279.9345397949219, + "loss": 0.321, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.771129608154297, + "rewards/margins": 3.8587136268615723, + "rewards/rejected": -6.629842758178711, + "step": 1817 + }, + { + "epoch": 0.38, + "learning_rate": 1.2445378151260505e-05, + "logits/chosen": -2.0051915645599365, + "logits/rejected": -1.88638436794281, + "logps/chosen": -332.06353759765625, + "logps/rejected": -364.3916015625, + "loss": 0.1771, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.8817567825317383, + "rewards/margins": 4.202718257904053, + "rewards/rejected": -7.084474563598633, + "step": 1818 + }, + { + "epoch": 0.38, + "learning_rate": 1.2441176470588237e-05, + "logits/chosen": -2.141613006591797, + "logits/rejected": -1.7716871500015259, + "logps/chosen": -342.0851135253906, + "logps/rejected": -266.565185546875, + "loss": 0.9662, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.471552610397339, + "rewards/margins": 3.467884063720703, + "rewards/rejected": -6.939435958862305, + "step": 1819 + }, + { + "epoch": 0.38, + "learning_rate": 1.2436974789915967e-05, + "logits/chosen": -1.7992281913757324, + "logits/rejected": -1.7195043563842773, + "logps/chosen": -266.8771057128906, + "logps/rejected": -328.7740783691406, + "loss": 0.5228, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.260608434677124, + "rewards/margins": 2.2436792850494385, + "rewards/rejected": -5.5042877197265625, + "step": 1820 + }, + { + "epoch": 0.38, + "learning_rate": 1.24327731092437e-05, + "logits/chosen": -1.9437739849090576, + "logits/rejected": -1.8591917753219604, + "logps/chosen": -241.35433959960938, + "logps/rejected": -244.63584899902344, + "loss": 0.306, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.722043752670288, + "rewards/margins": 3.924424171447754, + "rewards/rejected": -6.646467685699463, + "step": 1821 + }, + { + "epoch": 0.38, + "learning_rate": 1.242857142857143e-05, + "logits/chosen": -2.0968518257141113, + "logits/rejected": -1.806695818901062, + "logps/chosen": -267.68896484375, + "logps/rejected": -335.9344482421875, + "loss": 0.2301, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.919706344604492, + "rewards/margins": 3.8995752334594727, + "rewards/rejected": -6.819281101226807, + "step": 1822 + }, + { + "epoch": 0.38, + "learning_rate": 1.2424369747899162e-05, + "logits/chosen": -2.2418646812438965, + "logits/rejected": -1.9673755168914795, + "logps/chosen": -470.3053283691406, + "logps/rejected": -350.61846923828125, + "loss": 0.3785, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.542850971221924, + "rewards/margins": 3.6771252155303955, + "rewards/rejected": -6.219976425170898, + "step": 1823 + }, + { + "epoch": 0.38, + "learning_rate": 1.2420168067226892e-05, + "logits/chosen": -2.122089385986328, + "logits/rejected": -1.7735962867736816, + "logps/chosen": -298.9089050292969, + "logps/rejected": -381.55462646484375, + "loss": 0.8752, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.350724458694458, + "rewards/margins": 2.9452266693115234, + "rewards/rejected": -6.295950889587402, + "step": 1824 + }, + { + "epoch": 0.38, + "learning_rate": 1.2415966386554624e-05, + "logits/chosen": -1.8487603664398193, + "logits/rejected": -1.6678993701934814, + "logps/chosen": -399.3428649902344, + "logps/rejected": -364.6905517578125, + "loss": 0.1929, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.2664742469787598, + "rewards/margins": 3.8054516315460205, + "rewards/rejected": -6.071925163269043, + "step": 1825 + }, + { + "epoch": 0.38, + "learning_rate": 1.2411764705882354e-05, + "logits/chosen": -2.1750378608703613, + "logits/rejected": -2.0403733253479004, + "logps/chosen": -320.1820373535156, + "logps/rejected": -306.345458984375, + "loss": 0.3672, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.400602102279663, + "rewards/margins": 3.223055601119995, + "rewards/rejected": -5.623657703399658, + "step": 1826 + }, + { + "epoch": 0.38, + "learning_rate": 1.2407563025210086e-05, + "logits/chosen": -2.0814719200134277, + "logits/rejected": -1.6131364107131958, + "logps/chosen": -378.1086120605469, + "logps/rejected": -374.85986328125, + "loss": 0.1537, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.997178554534912, + "rewards/margins": 2.981276750564575, + "rewards/rejected": -5.978455543518066, + "step": 1827 + }, + { + "epoch": 0.38, + "learning_rate": 1.2403361344537816e-05, + "logits/chosen": -2.1030631065368652, + "logits/rejected": -1.572075366973877, + "logps/chosen": -389.2618408203125, + "logps/rejected": -345.2249755859375, + "loss": 0.3309, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.9812209606170654, + "rewards/margins": 4.42304801940918, + "rewards/rejected": -7.404269695281982, + "step": 1828 + }, + { + "epoch": 0.38, + "learning_rate": 1.2399159663865548e-05, + "logits/chosen": -1.9673033952713013, + "logits/rejected": -1.3781259059906006, + "logps/chosen": -368.08544921875, + "logps/rejected": -273.9477233886719, + "loss": 0.3418, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.3179750442504883, + "rewards/margins": 4.421310901641846, + "rewards/rejected": -7.739285469055176, + "step": 1829 + }, + { + "epoch": 0.38, + "learning_rate": 1.2394957983193278e-05, + "logits/chosen": -1.919838547706604, + "logits/rejected": -1.8430510759353638, + "logps/chosen": -411.90325927734375, + "logps/rejected": -366.18853759765625, + "loss": 0.124, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.576273202896118, + "rewards/margins": 3.9834365844726562, + "rewards/rejected": -6.5597100257873535, + "step": 1830 + }, + { + "epoch": 0.38, + "learning_rate": 1.239075630252101e-05, + "logits/chosen": -1.9190537929534912, + "logits/rejected": -1.440187692642212, + "logps/chosen": -209.78269958496094, + "logps/rejected": -237.6981201171875, + "loss": 0.1538, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.269538640975952, + "rewards/margins": 4.755407333374023, + "rewards/rejected": -7.024946212768555, + "step": 1831 + }, + { + "epoch": 0.38, + "learning_rate": 1.238655462184874e-05, + "logits/chosen": -2.2594292163848877, + "logits/rejected": -2.1560447216033936, + "logps/chosen": -414.5213623046875, + "logps/rejected": -395.2038879394531, + "loss": 0.6985, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.41933012008667, + "rewards/margins": 4.11453914642334, + "rewards/rejected": -6.53386926651001, + "step": 1832 + }, + { + "epoch": 0.38, + "learning_rate": 1.2382352941176472e-05, + "logits/chosen": -2.161709785461426, + "logits/rejected": -1.6761085987091064, + "logps/chosen": -343.27886962890625, + "logps/rejected": -285.697021484375, + "loss": 0.2704, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8251807689666748, + "rewards/margins": 3.791175365447998, + "rewards/rejected": -5.616356372833252, + "step": 1833 + }, + { + "epoch": 0.38, + "learning_rate": 1.2378151260504202e-05, + "logits/chosen": -2.0671329498291016, + "logits/rejected": -1.7918500900268555, + "logps/chosen": -342.55145263671875, + "logps/rejected": -354.6083984375, + "loss": 0.5277, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.6176323890686035, + "rewards/margins": 4.317749977111816, + "rewards/rejected": -6.93538236618042, + "step": 1834 + }, + { + "epoch": 0.38, + "learning_rate": 1.2373949579831934e-05, + "logits/chosen": -2.1687378883361816, + "logits/rejected": -1.8899266719818115, + "logps/chosen": -225.34622192382812, + "logps/rejected": -234.0809783935547, + "loss": 0.233, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9901856184005737, + "rewards/margins": 3.417259931564331, + "rewards/rejected": -5.407445907592773, + "step": 1835 + }, + { + "epoch": 0.38, + "learning_rate": 1.2369747899159665e-05, + "logits/chosen": -2.0599138736724854, + "logits/rejected": -2.1695938110351562, + "logps/chosen": -336.1058044433594, + "logps/rejected": -477.3143310546875, + "loss": 0.5413, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.623231887817383, + "rewards/margins": 3.3073599338531494, + "rewards/rejected": -5.930591583251953, + "step": 1836 + }, + { + "epoch": 0.38, + "learning_rate": 1.2365546218487396e-05, + "logits/chosen": -2.415325164794922, + "logits/rejected": -2.036621332168579, + "logps/chosen": -399.6528625488281, + "logps/rejected": -404.95147705078125, + "loss": 0.3727, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.274367094039917, + "rewards/margins": 3.932096242904663, + "rewards/rejected": -6.206463813781738, + "step": 1837 + }, + { + "epoch": 0.38, + "learning_rate": 1.2361344537815127e-05, + "logits/chosen": -1.9624621868133545, + "logits/rejected": -1.967362403869629, + "logps/chosen": -290.8499755859375, + "logps/rejected": -354.1822509765625, + "loss": 0.3604, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.795387029647827, + "rewards/margins": 4.234407424926758, + "rewards/rejected": -7.029794692993164, + "step": 1838 + }, + { + "epoch": 0.38, + "learning_rate": 1.2357142857142859e-05, + "logits/chosen": -2.2429933547973633, + "logits/rejected": -1.9684207439422607, + "logps/chosen": -313.1953125, + "logps/rejected": -408.3805236816406, + "loss": 0.0586, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9226566553115845, + "rewards/margins": 5.263579368591309, + "rewards/rejected": -7.1862359046936035, + "step": 1839 + }, + { + "epoch": 0.38, + "learning_rate": 1.235294117647059e-05, + "logits/chosen": -1.9952725172042847, + "logits/rejected": -1.7642802000045776, + "logps/chosen": -366.318115234375, + "logps/rejected": -391.9005126953125, + "loss": 0.144, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.2164859771728516, + "rewards/margins": 3.9159996509552, + "rewards/rejected": -7.132485389709473, + "step": 1840 + }, + { + "epoch": 0.39, + "learning_rate": 1.234873949579832e-05, + "logits/chosen": -2.1719250679016113, + "logits/rejected": -2.069453716278076, + "logps/chosen": -291.865234375, + "logps/rejected": -282.1805114746094, + "loss": 0.3678, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.3691577911376953, + "rewards/margins": 4.0803680419921875, + "rewards/rejected": -6.449525356292725, + "step": 1841 + }, + { + "epoch": 0.39, + "learning_rate": 1.2344537815126053e-05, + "logits/chosen": -2.4731712341308594, + "logits/rejected": -2.0194525718688965, + "logps/chosen": -328.2544250488281, + "logps/rejected": -348.77032470703125, + "loss": 0.395, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9667929410934448, + "rewards/margins": 3.2032198905944824, + "rewards/rejected": -5.170012474060059, + "step": 1842 + }, + { + "epoch": 0.39, + "learning_rate": 1.2340336134453783e-05, + "logits/chosen": -2.178130865097046, + "logits/rejected": -1.832926869392395, + "logps/chosen": -453.9772644042969, + "logps/rejected": -282.71417236328125, + "loss": 0.3241, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9788473844528198, + "rewards/margins": 3.1136436462402344, + "rewards/rejected": -5.0924906730651855, + "step": 1843 + }, + { + "epoch": 0.39, + "learning_rate": 1.2336134453781515e-05, + "logits/chosen": -2.3749067783355713, + "logits/rejected": -1.684998869895935, + "logps/chosen": -416.4408264160156, + "logps/rejected": -350.44952392578125, + "loss": 0.0157, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3178791999816895, + "rewards/margins": 5.601921558380127, + "rewards/rejected": -6.919800758361816, + "step": 1844 + }, + { + "epoch": 0.39, + "learning_rate": 1.2331932773109245e-05, + "logits/chosen": -2.029813528060913, + "logits/rejected": -1.5848532915115356, + "logps/chosen": -403.4461669921875, + "logps/rejected": -388.6234436035156, + "loss": 0.5381, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.925960063934326, + "rewards/margins": 2.8877108097076416, + "rewards/rejected": -5.813670635223389, + "step": 1845 + }, + { + "epoch": 0.39, + "learning_rate": 1.2327731092436977e-05, + "logits/chosen": -2.211223840713501, + "logits/rejected": -1.8415236473083496, + "logps/chosen": -335.36273193359375, + "logps/rejected": -318.82379150390625, + "loss": 0.2432, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.6796250343322754, + "rewards/margins": 4.041940212249756, + "rewards/rejected": -6.721564769744873, + "step": 1846 + }, + { + "epoch": 0.39, + "learning_rate": 1.2323529411764707e-05, + "logits/chosen": -2.3169937133789062, + "logits/rejected": -2.1106038093566895, + "logps/chosen": -226.32061767578125, + "logps/rejected": -293.0423583984375, + "loss": 0.0759, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.950911521911621, + "rewards/margins": 4.783463478088379, + "rewards/rejected": -7.734374523162842, + "step": 1847 + }, + { + "epoch": 0.39, + "learning_rate": 1.2319327731092439e-05, + "logits/chosen": -1.883255958557129, + "logits/rejected": -1.7112150192260742, + "logps/chosen": -315.20269775390625, + "logps/rejected": -364.6358642578125, + "loss": 0.1057, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.330933094024658, + "rewards/margins": 5.704405784606934, + "rewards/rejected": -9.03533935546875, + "step": 1848 + }, + { + "epoch": 0.39, + "learning_rate": 1.231512605042017e-05, + "logits/chosen": -1.8779338598251343, + "logits/rejected": -1.4539966583251953, + "logps/chosen": -280.99468994140625, + "logps/rejected": -277.5101623535156, + "loss": 0.4459, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.0637292861938477, + "rewards/margins": 3.6735315322875977, + "rewards/rejected": -5.7372612953186035, + "step": 1849 + }, + { + "epoch": 0.39, + "learning_rate": 1.2310924369747901e-05, + "logits/chosen": -2.295351028442383, + "logits/rejected": -1.9756437540054321, + "logps/chosen": -307.5069885253906, + "logps/rejected": -249.9366455078125, + "loss": 0.145, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.8945703506469727, + "rewards/margins": 4.176760196685791, + "rewards/rejected": -8.071331024169922, + "step": 1850 + }, + { + "epoch": 0.39, + "learning_rate": 1.2306722689075631e-05, + "logits/chosen": -1.9587208032608032, + "logits/rejected": -1.7900536060333252, + "logps/chosen": -344.981689453125, + "logps/rejected": -399.29254150390625, + "loss": 0.3901, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.953500747680664, + "rewards/margins": 2.9267892837524414, + "rewards/rejected": -5.8802900314331055, + "step": 1851 + }, + { + "epoch": 0.39, + "learning_rate": 1.2302521008403363e-05, + "logits/chosen": -2.180280923843384, + "logits/rejected": -1.8776546716690063, + "logps/chosen": -298.33251953125, + "logps/rejected": -330.929931640625, + "loss": 0.2205, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.1914925575256348, + "rewards/margins": 3.972294569015503, + "rewards/rejected": -7.163786888122559, + "step": 1852 + }, + { + "epoch": 0.39, + "learning_rate": 1.2298319327731094e-05, + "logits/chosen": -2.248622179031372, + "logits/rejected": -1.5207862854003906, + "logps/chosen": -312.92364501953125, + "logps/rejected": -269.34326171875, + "loss": 0.2348, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.079177141189575, + "rewards/margins": 4.692597389221191, + "rewards/rejected": -6.771773815155029, + "step": 1853 + }, + { + "epoch": 0.39, + "learning_rate": 1.2294117647058826e-05, + "logits/chosen": -1.6843230724334717, + "logits/rejected": -1.6593902111053467, + "logps/chosen": -296.5410461425781, + "logps/rejected": -300.2232360839844, + "loss": 0.6384, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.180907726287842, + "rewards/margins": 2.1584579944610596, + "rewards/rejected": -5.3393659591674805, + "step": 1854 + }, + { + "epoch": 0.39, + "learning_rate": 1.2289915966386556e-05, + "logits/chosen": -1.933302879333496, + "logits/rejected": -1.5852653980255127, + "logps/chosen": -334.7741394042969, + "logps/rejected": -335.3190002441406, + "loss": 0.318, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.1619303226470947, + "rewards/margins": 4.065133571624756, + "rewards/rejected": -7.22706413269043, + "step": 1855 + }, + { + "epoch": 0.39, + "learning_rate": 1.2285714285714288e-05, + "logits/chosen": -1.9559996128082275, + "logits/rejected": -2.0741889476776123, + "logps/chosen": -335.0888671875, + "logps/rejected": -390.7261962890625, + "loss": 0.6481, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.708482503890991, + "rewards/margins": 3.836520195007324, + "rewards/rejected": -6.5450029373168945, + "step": 1856 + }, + { + "epoch": 0.39, + "learning_rate": 1.2281512605042018e-05, + "logits/chosen": -2.2684614658355713, + "logits/rejected": -1.9043837785720825, + "logps/chosen": -387.27740478515625, + "logps/rejected": -390.257568359375, + "loss": 0.2113, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9054386615753174, + "rewards/margins": 5.122135162353516, + "rewards/rejected": -7.027573585510254, + "step": 1857 + }, + { + "epoch": 0.39, + "learning_rate": 1.227731092436975e-05, + "logits/chosen": -2.1066670417785645, + "logits/rejected": -1.9788291454315186, + "logps/chosen": -367.8558349609375, + "logps/rejected": -354.92803955078125, + "loss": 0.173, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.427006483078003, + "rewards/margins": 3.7116971015930176, + "rewards/rejected": -6.1387038230896, + "step": 1858 + }, + { + "epoch": 0.39, + "learning_rate": 1.227310924369748e-05, + "logits/chosen": -2.207350730895996, + "logits/rejected": -1.7087862491607666, + "logps/chosen": -319.0138244628906, + "logps/rejected": -294.2601318359375, + "loss": 0.3015, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6084644794464111, + "rewards/margins": 4.778830528259277, + "rewards/rejected": -6.387295246124268, + "step": 1859 + }, + { + "epoch": 0.39, + "learning_rate": 1.2268907563025212e-05, + "logits/chosen": -1.8574326038360596, + "logits/rejected": -1.946333646774292, + "logps/chosen": -219.79058837890625, + "logps/rejected": -243.98524475097656, + "loss": 0.7268, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.346604824066162, + "rewards/margins": 2.4032351970672607, + "rewards/rejected": -5.74984073638916, + "step": 1860 + }, + { + "epoch": 0.39, + "learning_rate": 1.2264705882352944e-05, + "logits/chosen": -2.2645788192749023, + "logits/rejected": -1.8685322999954224, + "logps/chosen": -400.17987060546875, + "logps/rejected": -336.9418640136719, + "loss": 0.4802, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.426004648208618, + "rewards/margins": 2.8071141242980957, + "rewards/rejected": -5.233119010925293, + "step": 1861 + }, + { + "epoch": 0.39, + "learning_rate": 1.2260504201680674e-05, + "logits/chosen": -1.938511610031128, + "logits/rejected": -1.6733431816101074, + "logps/chosen": -272.3828125, + "logps/rejected": -316.3158264160156, + "loss": 0.229, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.9613900184631348, + "rewards/margins": 5.299942970275879, + "rewards/rejected": -8.261332511901855, + "step": 1862 + }, + { + "epoch": 0.39, + "learning_rate": 1.2256302521008406e-05, + "logits/chosen": -2.374242067337036, + "logits/rejected": -1.8657582998275757, + "logps/chosen": -345.5589294433594, + "logps/rejected": -280.80706787109375, + "loss": 0.2125, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.298710346221924, + "rewards/margins": 3.296266794204712, + "rewards/rejected": -5.594976902008057, + "step": 1863 + }, + { + "epoch": 0.39, + "learning_rate": 1.2252100840336136e-05, + "logits/chosen": -2.059525728225708, + "logits/rejected": -1.8084293603897095, + "logps/chosen": -305.459716796875, + "logps/rejected": -347.12939453125, + "loss": 0.1119, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.765744924545288, + "rewards/margins": 5.005342960357666, + "rewards/rejected": -6.771087646484375, + "step": 1864 + }, + { + "epoch": 0.39, + "learning_rate": 1.2247899159663868e-05, + "logits/chosen": -2.169187545776367, + "logits/rejected": -1.6504592895507812, + "logps/chosen": -378.8183898925781, + "logps/rejected": -385.246337890625, + "loss": 0.1025, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1404271125793457, + "rewards/margins": 5.499130725860596, + "rewards/rejected": -6.639557838439941, + "step": 1865 + }, + { + "epoch": 0.39, + "learning_rate": 1.2243697478991598e-05, + "logits/chosen": -2.105647087097168, + "logits/rejected": -1.946838140487671, + "logps/chosen": -451.7643737792969, + "logps/rejected": -392.23602294921875, + "loss": 0.201, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2099411487579346, + "rewards/margins": 4.705042839050293, + "rewards/rejected": -5.914983749389648, + "step": 1866 + }, + { + "epoch": 0.39, + "learning_rate": 1.223949579831933e-05, + "logits/chosen": -2.0883190631866455, + "logits/rejected": -1.921244740486145, + "logps/chosen": -341.49237060546875, + "logps/rejected": -408.92071533203125, + "loss": 0.2228, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.4461169242858887, + "rewards/margins": 3.4331729412078857, + "rewards/rejected": -5.879289627075195, + "step": 1867 + }, + { + "epoch": 0.39, + "learning_rate": 1.223529411764706e-05, + "logits/chosen": -2.150362730026245, + "logits/rejected": -2.0336923599243164, + "logps/chosen": -325.43865966796875, + "logps/rejected": -318.94384765625, + "loss": 0.4782, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.354466199874878, + "rewards/margins": 2.811051368713379, + "rewards/rejected": -5.165517807006836, + "step": 1868 + }, + { + "epoch": 0.39, + "learning_rate": 1.2231092436974792e-05, + "logits/chosen": -2.0908029079437256, + "logits/rejected": -2.0888607501983643, + "logps/chosen": -376.7675476074219, + "logps/rejected": -401.38800048828125, + "loss": 0.0838, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6777400970458984, + "rewards/margins": 5.11637020111084, + "rewards/rejected": -7.7941107749938965, + "step": 1869 + }, + { + "epoch": 0.39, + "learning_rate": 1.2226890756302523e-05, + "logits/chosen": -2.0638866424560547, + "logits/rejected": -2.0652823448181152, + "logps/chosen": -240.63055419921875, + "logps/rejected": -320.2286682128906, + "loss": 0.4652, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.447099447250366, + "rewards/margins": 3.8420157432556152, + "rewards/rejected": -7.289114952087402, + "step": 1870 + }, + { + "epoch": 0.39, + "learning_rate": 1.2222689075630255e-05, + "logits/chosen": -2.0308241844177246, + "logits/rejected": -1.9425818920135498, + "logps/chosen": -311.0562744140625, + "logps/rejected": -281.4987487792969, + "loss": 0.5196, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.1683919429779053, + "rewards/margins": 3.7399520874023438, + "rewards/rejected": -5.908344268798828, + "step": 1871 + }, + { + "epoch": 0.39, + "learning_rate": 1.2218487394957985e-05, + "logits/chosen": -2.0704848766326904, + "logits/rejected": -2.019864797592163, + "logps/chosen": -262.2308349609375, + "logps/rejected": -347.4579772949219, + "loss": 0.2654, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.4987740516662598, + "rewards/margins": 4.221687316894531, + "rewards/rejected": -6.720461368560791, + "step": 1872 + }, + { + "epoch": 0.39, + "learning_rate": 1.2214285714285717e-05, + "logits/chosen": -2.205272674560547, + "logits/rejected": -1.9787051677703857, + "logps/chosen": -421.778564453125, + "logps/rejected": -324.6605529785156, + "loss": 0.4956, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.56027889251709, + "rewards/margins": 3.219339370727539, + "rewards/rejected": -5.779618263244629, + "step": 1873 + }, + { + "epoch": 0.39, + "learning_rate": 1.2210084033613447e-05, + "logits/chosen": -2.021812915802002, + "logits/rejected": -1.8759019374847412, + "logps/chosen": -241.0976104736328, + "logps/rejected": -316.0867919921875, + "loss": 0.2228, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.9314661026000977, + "rewards/margins": 4.005714416503906, + "rewards/rejected": -6.937180519104004, + "step": 1874 + }, + { + "epoch": 0.39, + "learning_rate": 1.2205882352941179e-05, + "logits/chosen": -1.9348081350326538, + "logits/rejected": -1.3799512386322021, + "logps/chosen": -351.6767272949219, + "logps/rejected": -270.9066467285156, + "loss": 0.5187, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.621941089630127, + "rewards/margins": 3.3479952812194824, + "rewards/rejected": -5.969936370849609, + "step": 1875 + }, + { + "epoch": 0.39, + "learning_rate": 1.2201680672268909e-05, + "logits/chosen": -1.9474555253982544, + "logits/rejected": -1.9348561763763428, + "logps/chosen": -437.6739501953125, + "logps/rejected": -411.3087158203125, + "loss": 0.3923, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.4875917434692383, + "rewards/margins": 1.853659987449646, + "rewards/rejected": -4.341251850128174, + "step": 1876 + }, + { + "epoch": 0.39, + "learning_rate": 1.2197478991596641e-05, + "logits/chosen": -1.7438081502914429, + "logits/rejected": -1.4689557552337646, + "logps/chosen": -440.9767761230469, + "logps/rejected": -310.000732421875, + "loss": 0.2247, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.828077793121338, + "rewards/margins": 3.6447439193725586, + "rewards/rejected": -6.4728217124938965, + "step": 1877 + }, + { + "epoch": 0.39, + "learning_rate": 1.2193277310924371e-05, + "logits/chosen": -1.959306001663208, + "logits/rejected": -2.010695219039917, + "logps/chosen": -330.3531494140625, + "logps/rejected": -386.5733642578125, + "loss": 0.6733, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.080143690109253, + "rewards/margins": 2.886941909790039, + "rewards/rejected": -4.967085838317871, + "step": 1878 + }, + { + "epoch": 0.39, + "learning_rate": 1.2189075630252103e-05, + "logits/chosen": -1.805151104927063, + "logits/rejected": -1.7687690258026123, + "logps/chosen": -322.3372802734375, + "logps/rejected": -356.020751953125, + "loss": 0.1384, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.123734951019287, + "rewards/margins": 4.347909927368164, + "rewards/rejected": -6.471644878387451, + "step": 1879 + }, + { + "epoch": 0.39, + "learning_rate": 1.2184873949579832e-05, + "logits/chosen": -2.183786153793335, + "logits/rejected": -1.9782054424285889, + "logps/chosen": -302.1595458984375, + "logps/rejected": -330.3889465332031, + "loss": 0.4196, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5051831007003784, + "rewards/margins": 2.912980556488037, + "rewards/rejected": -4.418163299560547, + "step": 1880 + }, + { + "epoch": 0.39, + "learning_rate": 1.2180672268907564e-05, + "logits/chosen": -1.95109224319458, + "logits/rejected": -2.0503132343292236, + "logps/chosen": -272.64251708984375, + "logps/rejected": -363.9761962890625, + "loss": 0.2787, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.5736207962036133, + "rewards/margins": 4.0049285888671875, + "rewards/rejected": -6.578548431396484, + "step": 1881 + }, + { + "epoch": 0.39, + "learning_rate": 1.2176470588235294e-05, + "logits/chosen": -2.1732020378112793, + "logits/rejected": -2.153201103210449, + "logps/chosen": -457.95361328125, + "logps/rejected": -378.83099365234375, + "loss": 0.4302, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.479139566421509, + "rewards/margins": 2.250716209411621, + "rewards/rejected": -4.729856014251709, + "step": 1882 + }, + { + "epoch": 0.39, + "learning_rate": 1.2172268907563026e-05, + "logits/chosen": -2.315631866455078, + "logits/rejected": -1.9229233264923096, + "logps/chosen": -488.20550537109375, + "logps/rejected": -354.6662902832031, + "loss": 0.2978, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7448663711547852, + "rewards/margins": 3.9652786254882812, + "rewards/rejected": -5.710144519805908, + "step": 1883 + }, + { + "epoch": 0.39, + "learning_rate": 1.2168067226890756e-05, + "logits/chosen": -2.077934980392456, + "logits/rejected": -1.9388132095336914, + "logps/chosen": -387.1546630859375, + "logps/rejected": -347.7235412597656, + "loss": 0.2535, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1852521896362305, + "rewards/margins": 3.9739584922790527, + "rewards/rejected": -6.159210681915283, + "step": 1884 + }, + { + "epoch": 0.39, + "learning_rate": 1.2163865546218488e-05, + "logits/chosen": -2.117459774017334, + "logits/rejected": -2.051194906234741, + "logps/chosen": -279.4011535644531, + "logps/rejected": -305.2198486328125, + "loss": 0.2484, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9879226684570312, + "rewards/margins": 3.9829320907592773, + "rewards/rejected": -5.970855236053467, + "step": 1885 + }, + { + "epoch": 0.39, + "learning_rate": 1.2159663865546218e-05, + "logits/chosen": -2.4349608421325684, + "logits/rejected": -2.1075141429901123, + "logps/chosen": -362.0382080078125, + "logps/rejected": -391.5584716796875, + "loss": 0.255, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.621695637702942, + "rewards/margins": 4.289048194885254, + "rewards/rejected": -5.910743713378906, + "step": 1886 + }, + { + "epoch": 0.39, + "learning_rate": 1.215546218487395e-05, + "logits/chosen": -1.9408646821975708, + "logits/rejected": -2.12056827545166, + "logps/chosen": -273.6898498535156, + "logps/rejected": -376.93243408203125, + "loss": 0.5742, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.8257675170898438, + "rewards/margins": 3.0630791187286377, + "rewards/rejected": -5.888847351074219, + "step": 1887 + }, + { + "epoch": 0.39, + "learning_rate": 1.215126050420168e-05, + "logits/chosen": -2.063706159591675, + "logits/rejected": -1.9158318042755127, + "logps/chosen": -329.215087890625, + "logps/rejected": -332.4383850097656, + "loss": 0.5738, + "rewards/accuracies": 0.5625, + "rewards/chosen": -2.1032702922821045, + "rewards/margins": 3.3517837524414062, + "rewards/rejected": -5.45505428314209, + "step": 1888 + }, + { + "epoch": 0.4, + "learning_rate": 1.2147058823529412e-05, + "logits/chosen": -1.6034259796142578, + "logits/rejected": -1.8200292587280273, + "logps/chosen": -262.8677673339844, + "logps/rejected": -364.11285400390625, + "loss": 0.3117, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.900654911994934, + "rewards/margins": 3.036217212677002, + "rewards/rejected": -4.9368720054626465, + "step": 1889 + }, + { + "epoch": 0.4, + "learning_rate": 1.2142857142857142e-05, + "logits/chosen": -2.2216062545776367, + "logits/rejected": -1.4916836023330688, + "logps/chosen": -337.0675048828125, + "logps/rejected": -309.1094055175781, + "loss": 0.2597, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.1138360500335693, + "rewards/margins": 3.1984457969665527, + "rewards/rejected": -5.312282085418701, + "step": 1890 + }, + { + "epoch": 0.4, + "learning_rate": 1.2138655462184874e-05, + "logits/chosen": -1.8620685338974, + "logits/rejected": -1.8574579954147339, + "logps/chosen": -267.0176696777344, + "logps/rejected": -299.7701721191406, + "loss": 0.2454, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.9521660804748535, + "rewards/margins": 3.5621039867401123, + "rewards/rejected": -6.514269828796387, + "step": 1891 + }, + { + "epoch": 0.4, + "learning_rate": 1.2134453781512604e-05, + "logits/chosen": -2.2182257175445557, + "logits/rejected": -2.009401798248291, + "logps/chosen": -350.63861083984375, + "logps/rejected": -344.4203796386719, + "loss": 0.0989, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.450014352798462, + "rewards/margins": 5.047044277191162, + "rewards/rejected": -6.497058868408203, + "step": 1892 + }, + { + "epoch": 0.4, + "learning_rate": 1.2130252100840336e-05, + "logits/chosen": -2.178387403488159, + "logits/rejected": -1.8148460388183594, + "logps/chosen": -261.49285888671875, + "logps/rejected": -250.3060302734375, + "loss": 0.2948, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.348268508911133, + "rewards/margins": 4.149266719818115, + "rewards/rejected": -6.49753475189209, + "step": 1893 + }, + { + "epoch": 0.4, + "learning_rate": 1.2126050420168067e-05, + "logits/chosen": -1.7209651470184326, + "logits/rejected": -1.977144718170166, + "logps/chosen": -221.2631072998047, + "logps/rejected": -295.52752685546875, + "loss": 0.0913, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.125429153442383, + "rewards/margins": 4.741949081420898, + "rewards/rejected": -7.867377758026123, + "step": 1894 + }, + { + "epoch": 0.4, + "learning_rate": 1.2121848739495798e-05, + "logits/chosen": -2.089787483215332, + "logits/rejected": -1.8356645107269287, + "logps/chosen": -350.14373779296875, + "logps/rejected": -305.8224792480469, + "loss": 0.5869, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5162816047668457, + "rewards/margins": 4.68880558013916, + "rewards/rejected": -6.205087184906006, + "step": 1895 + }, + { + "epoch": 0.4, + "learning_rate": 1.211764705882353e-05, + "logits/chosen": -2.0634453296661377, + "logits/rejected": -1.5936388969421387, + "logps/chosen": -335.98223876953125, + "logps/rejected": -396.586181640625, + "loss": 0.404, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.992807149887085, + "rewards/margins": 3.9616644382476807, + "rewards/rejected": -5.954472064971924, + "step": 1896 + }, + { + "epoch": 0.4, + "learning_rate": 1.211344537815126e-05, + "logits/chosen": -2.292320966720581, + "logits/rejected": -1.9044640064239502, + "logps/chosen": -400.520263671875, + "logps/rejected": -357.2868347167969, + "loss": 0.3288, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5305293798446655, + "rewards/margins": 5.195306301116943, + "rewards/rejected": -6.72583532333374, + "step": 1897 + }, + { + "epoch": 0.4, + "learning_rate": 1.2109243697478993e-05, + "logits/chosen": -2.240651845932007, + "logits/rejected": -1.7099661827087402, + "logps/chosen": -342.96990966796875, + "logps/rejected": -262.08331298828125, + "loss": 0.8412, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.9160306453704834, + "rewards/margins": 3.297761917114258, + "rewards/rejected": -6.21379280090332, + "step": 1898 + }, + { + "epoch": 0.4, + "learning_rate": 1.2105042016806723e-05, + "logits/chosen": -1.9975688457489014, + "logits/rejected": -1.7819318771362305, + "logps/chosen": -241.73753356933594, + "logps/rejected": -228.4881134033203, + "loss": 0.289, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.0775880813598633, + "rewards/margins": 3.5786678791046143, + "rewards/rejected": -5.656255722045898, + "step": 1899 + }, + { + "epoch": 0.4, + "learning_rate": 1.2100840336134455e-05, + "logits/chosen": -1.7145962715148926, + "logits/rejected": -1.9114662408828735, + "logps/chosen": -354.4574279785156, + "logps/rejected": -506.02679443359375, + "loss": 0.0368, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.365055561065674, + "rewards/margins": 5.540973663330078, + "rewards/rejected": -7.906028747558594, + "step": 1900 + }, + { + "epoch": 0.4, + "learning_rate": 1.2096638655462185e-05, + "logits/chosen": -1.8172868490219116, + "logits/rejected": -1.8095623254776, + "logps/chosen": -382.0679931640625, + "logps/rejected": -406.56988525390625, + "loss": 0.1669, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6362740993499756, + "rewards/margins": 4.088894367218018, + "rewards/rejected": -5.725168228149414, + "step": 1901 + }, + { + "epoch": 0.4, + "learning_rate": 1.2092436974789917e-05, + "logits/chosen": -2.0657129287719727, + "logits/rejected": -1.6017169952392578, + "logps/chosen": -367.9393310546875, + "logps/rejected": -360.8873596191406, + "loss": 0.2588, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.119049072265625, + "rewards/margins": 3.978104591369629, + "rewards/rejected": -6.097153663635254, + "step": 1902 + }, + { + "epoch": 0.4, + "learning_rate": 1.2088235294117647e-05, + "logits/chosen": -1.9344240427017212, + "logits/rejected": -1.7728099822998047, + "logps/chosen": -351.71624755859375, + "logps/rejected": -399.22308349609375, + "loss": 0.2671, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.720890522003174, + "rewards/margins": 4.908173561096191, + "rewards/rejected": -7.629063606262207, + "step": 1903 + }, + { + "epoch": 0.4, + "learning_rate": 1.2084033613445379e-05, + "logits/chosen": -2.136500597000122, + "logits/rejected": -1.817636251449585, + "logps/chosen": -378.0855407714844, + "logps/rejected": -467.0699462890625, + "loss": 0.2881, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8037837743759155, + "rewards/margins": 4.506923675537109, + "rewards/rejected": -6.310707092285156, + "step": 1904 + }, + { + "epoch": 0.4, + "learning_rate": 1.207983193277311e-05, + "logits/chosen": -1.924896001815796, + "logits/rejected": -1.7402033805847168, + "logps/chosen": -317.7908020019531, + "logps/rejected": -314.0490417480469, + "loss": 0.393, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.650522232055664, + "rewards/margins": 3.0206260681152344, + "rewards/rejected": -5.671148777008057, + "step": 1905 + }, + { + "epoch": 0.4, + "learning_rate": 1.2075630252100841e-05, + "logits/chosen": -2.1404194831848145, + "logits/rejected": -2.0721638202667236, + "logps/chosen": -376.66436767578125, + "logps/rejected": -394.8505859375, + "loss": 0.156, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9328482151031494, + "rewards/margins": 3.471165180206299, + "rewards/rejected": -5.404013633728027, + "step": 1906 + }, + { + "epoch": 0.4, + "learning_rate": 1.2071428571428571e-05, + "logits/chosen": -2.142035961151123, + "logits/rejected": -1.7251577377319336, + "logps/chosen": -376.7251892089844, + "logps/rejected": -284.1921691894531, + "loss": 0.3614, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.8773622512817383, + "rewards/margins": 2.6040382385253906, + "rewards/rejected": -5.481400489807129, + "step": 1907 + }, + { + "epoch": 0.4, + "learning_rate": 1.2067226890756303e-05, + "logits/chosen": -2.1734681129455566, + "logits/rejected": -2.0065765380859375, + "logps/chosen": -334.5392150878906, + "logps/rejected": -393.1302185058594, + "loss": 0.1577, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.3483502864837646, + "rewards/margins": 6.308602333068848, + "rewards/rejected": -8.656951904296875, + "step": 1908 + }, + { + "epoch": 0.4, + "learning_rate": 1.2063025210084033e-05, + "logits/chosen": -1.9934320449829102, + "logits/rejected": -2.2016761302948, + "logps/chosen": -228.0828399658203, + "logps/rejected": -319.03955078125, + "loss": 0.5653, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.9895856380462646, + "rewards/margins": 3.5494492053985596, + "rewards/rejected": -6.539034843444824, + "step": 1909 + }, + { + "epoch": 0.4, + "learning_rate": 1.2058823529411765e-05, + "logits/chosen": -2.2420475482940674, + "logits/rejected": -2.120718002319336, + "logps/chosen": -301.90911865234375, + "logps/rejected": -350.76873779296875, + "loss": 0.3288, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.2650742530822754, + "rewards/margins": 3.5707528591156006, + "rewards/rejected": -5.835826873779297, + "step": 1910 + }, + { + "epoch": 0.4, + "learning_rate": 1.2054621848739496e-05, + "logits/chosen": -1.7912968397140503, + "logits/rejected": -1.8730591535568237, + "logps/chosen": -295.6524658203125, + "logps/rejected": -340.78265380859375, + "loss": 0.1912, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5242538452148438, + "rewards/margins": 4.108933925628662, + "rewards/rejected": -5.633187770843506, + "step": 1911 + }, + { + "epoch": 0.4, + "learning_rate": 1.2050420168067227e-05, + "logits/chosen": -2.1771109104156494, + "logits/rejected": -1.5272819995880127, + "logps/chosen": -390.6642761230469, + "logps/rejected": -357.1390075683594, + "loss": 0.0604, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7402662038803101, + "rewards/margins": 6.469948768615723, + "rewards/rejected": -7.2102155685424805, + "step": 1912 + }, + { + "epoch": 0.4, + "learning_rate": 1.2046218487394958e-05, + "logits/chosen": -2.007319927215576, + "logits/rejected": -1.7263262271881104, + "logps/chosen": -396.69183349609375, + "logps/rejected": -383.7300109863281, + "loss": 0.2062, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8319088220596313, + "rewards/margins": 3.7907354831695557, + "rewards/rejected": -4.622644424438477, + "step": 1913 + }, + { + "epoch": 0.4, + "learning_rate": 1.204201680672269e-05, + "logits/chosen": -2.216395378112793, + "logits/rejected": -1.9293133020401, + "logps/chosen": -277.431396484375, + "logps/rejected": -282.9858703613281, + "loss": 0.2279, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.076395034790039, + "rewards/margins": 4.433126449584961, + "rewards/rejected": -6.509521484375, + "step": 1914 + }, + { + "epoch": 0.4, + "learning_rate": 1.203781512605042e-05, + "logits/chosen": -1.8018733263015747, + "logits/rejected": -2.0765953063964844, + "logps/chosen": -308.0455017089844, + "logps/rejected": -368.81005859375, + "loss": 0.2452, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.542095899581909, + "rewards/margins": 4.802630424499512, + "rewards/rejected": -7.344725608825684, + "step": 1915 + }, + { + "epoch": 0.4, + "learning_rate": 1.2033613445378152e-05, + "logits/chosen": -1.9174859523773193, + "logits/rejected": -1.7762975692749023, + "logps/chosen": -418.926513671875, + "logps/rejected": -354.2291259765625, + "loss": 0.0848, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6412107944488525, + "rewards/margins": 5.765614986419678, + "rewards/rejected": -7.406826019287109, + "step": 1916 + }, + { + "epoch": 0.4, + "learning_rate": 1.2029411764705882e-05, + "logits/chosen": -2.0088179111480713, + "logits/rejected": -1.851726770401001, + "logps/chosen": -344.12451171875, + "logps/rejected": -348.35101318359375, + "loss": 0.591, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.588670492172241, + "rewards/margins": 3.583779811859131, + "rewards/rejected": -6.172450542449951, + "step": 1917 + }, + { + "epoch": 0.4, + "learning_rate": 1.2025210084033614e-05, + "logits/chosen": -1.945975661277771, + "logits/rejected": -1.8420360088348389, + "logps/chosen": -228.79737854003906, + "logps/rejected": -242.62191772460938, + "loss": 0.3266, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.1105878353118896, + "rewards/margins": 2.999886989593506, + "rewards/rejected": -5.110475063323975, + "step": 1918 + }, + { + "epoch": 0.4, + "learning_rate": 1.2021008403361346e-05, + "logits/chosen": -2.0177059173583984, + "logits/rejected": -1.9783623218536377, + "logps/chosen": -383.1611022949219, + "logps/rejected": -450.2672424316406, + "loss": 0.3609, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5593056678771973, + "rewards/margins": 4.635698318481445, + "rewards/rejected": -6.195004463195801, + "step": 1919 + }, + { + "epoch": 0.4, + "learning_rate": 1.2016806722689076e-05, + "logits/chosen": -2.3523528575897217, + "logits/rejected": -2.425680637359619, + "logps/chosen": -428.9132080078125, + "logps/rejected": -518.2432250976562, + "loss": 0.3788, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.144625186920166, + "rewards/margins": 3.4645891189575195, + "rewards/rejected": -5.609213829040527, + "step": 1920 + }, + { + "epoch": 0.4, + "learning_rate": 1.2012605042016808e-05, + "logits/chosen": -2.0367918014526367, + "logits/rejected": -2.1302733421325684, + "logps/chosen": -296.88134765625, + "logps/rejected": -338.58477783203125, + "loss": 0.5792, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.708970069885254, + "rewards/margins": 3.3739919662475586, + "rewards/rejected": -6.0829620361328125, + "step": 1921 + }, + { + "epoch": 0.4, + "learning_rate": 1.2008403361344538e-05, + "logits/chosen": -2.3181698322296143, + "logits/rejected": -1.9248433113098145, + "logps/chosen": -428.5474548339844, + "logps/rejected": -337.886474609375, + "loss": 0.3008, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7446939945220947, + "rewards/margins": 3.9507875442504883, + "rewards/rejected": -5.695481777191162, + "step": 1922 + }, + { + "epoch": 0.4, + "learning_rate": 1.200420168067227e-05, + "logits/chosen": -2.095181465148926, + "logits/rejected": -1.7831504344940186, + "logps/chosen": -401.6945495605469, + "logps/rejected": -413.27313232421875, + "loss": 0.5541, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.7652153968811035, + "rewards/margins": 3.8777570724487305, + "rewards/rejected": -6.642971992492676, + "step": 1923 + }, + { + "epoch": 0.4, + "learning_rate": 1.2e-05, + "logits/chosen": -2.323911190032959, + "logits/rejected": -2.004767417907715, + "logps/chosen": -362.94989013671875, + "logps/rejected": -304.14801025390625, + "loss": 0.3896, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.179198741912842, + "rewards/margins": 2.648852825164795, + "rewards/rejected": -5.828051567077637, + "step": 1924 + }, + { + "epoch": 0.4, + "learning_rate": 1.1995798319327732e-05, + "logits/chosen": -1.9243475198745728, + "logits/rejected": -2.0698821544647217, + "logps/chosen": -276.5430908203125, + "logps/rejected": -395.3537292480469, + "loss": 0.11, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.197965383529663, + "rewards/margins": 5.2543487548828125, + "rewards/rejected": -6.452314376831055, + "step": 1925 + }, + { + "epoch": 0.4, + "learning_rate": 1.1991596638655462e-05, + "logits/chosen": -1.8655226230621338, + "logits/rejected": -1.7618821859359741, + "logps/chosen": -218.9773406982422, + "logps/rejected": -318.39959716796875, + "loss": 0.366, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.919647216796875, + "rewards/margins": 2.703221082687378, + "rewards/rejected": -5.622868061065674, + "step": 1926 + }, + { + "epoch": 0.4, + "learning_rate": 1.1987394957983194e-05, + "logits/chosen": -2.188274621963501, + "logits/rejected": -2.0795950889587402, + "logps/chosen": -339.4609069824219, + "logps/rejected": -515.133544921875, + "loss": 0.5193, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6383622884750366, + "rewards/margins": 4.2760515213012695, + "rewards/rejected": -5.914413928985596, + "step": 1927 + }, + { + "epoch": 0.4, + "learning_rate": 1.1983193277310925e-05, + "logits/chosen": -2.1272454261779785, + "logits/rejected": -1.6379809379577637, + "logps/chosen": -296.60980224609375, + "logps/rejected": -249.27235412597656, + "loss": 0.6232, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.6637842655181885, + "rewards/margins": 3.8892006874084473, + "rewards/rejected": -6.552984714508057, + "step": 1928 + }, + { + "epoch": 0.4, + "learning_rate": 1.1978991596638656e-05, + "logits/chosen": -2.289217472076416, + "logits/rejected": -2.0336754322052, + "logps/chosen": -430.8250732421875, + "logps/rejected": -327.38641357421875, + "loss": 0.3064, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0705857276916504, + "rewards/margins": 3.771559476852417, + "rewards/rejected": -4.842144966125488, + "step": 1929 + }, + { + "epoch": 0.4, + "learning_rate": 1.1974789915966387e-05, + "logits/chosen": -2.120927572250366, + "logits/rejected": -1.6716057062149048, + "logps/chosen": -346.63397216796875, + "logps/rejected": -313.1705322265625, + "loss": 0.2674, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.612196922302246, + "rewards/margins": 3.1841797828674316, + "rewards/rejected": -4.7963762283325195, + "step": 1930 + }, + { + "epoch": 0.4, + "learning_rate": 1.1970588235294119e-05, + "logits/chosen": -2.410149574279785, + "logits/rejected": -1.9183396100997925, + "logps/chosen": -411.2257385253906, + "logps/rejected": -329.4946594238281, + "loss": 0.3493, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0278282165527344, + "rewards/margins": 2.2552437782287598, + "rewards/rejected": -3.283071756362915, + "step": 1931 + }, + { + "epoch": 0.4, + "learning_rate": 1.1966386554621849e-05, + "logits/chosen": -1.3189826011657715, + "logits/rejected": -1.9186497926712036, + "logps/chosen": -193.09617614746094, + "logps/rejected": -335.5715637207031, + "loss": 0.372, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.0491819381713867, + "rewards/margins": 4.139082908630371, + "rewards/rejected": -6.188264846801758, + "step": 1932 + }, + { + "epoch": 0.4, + "learning_rate": 1.196218487394958e-05, + "logits/chosen": -2.284926414489746, + "logits/rejected": -1.7808797359466553, + "logps/chosen": -335.343017578125, + "logps/rejected": -321.68231201171875, + "loss": 0.2559, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4183425903320312, + "rewards/margins": 4.103165149688721, + "rewards/rejected": -5.52150821685791, + "step": 1933 + }, + { + "epoch": 0.4, + "learning_rate": 1.1957983193277311e-05, + "logits/chosen": -2.2952332496643066, + "logits/rejected": -1.4111660718917847, + "logps/chosen": -424.5174560546875, + "logps/rejected": -392.2184143066406, + "loss": 0.4404, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.0654237270355225, + "rewards/margins": 3.078819751739502, + "rewards/rejected": -5.1442437171936035, + "step": 1934 + }, + { + "epoch": 0.4, + "learning_rate": 1.1953781512605043e-05, + "logits/chosen": -2.0491909980773926, + "logits/rejected": -2.1064460277557373, + "logps/chosen": -193.53875732421875, + "logps/rejected": -303.26409912109375, + "loss": 0.1201, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.657051682472229, + "rewards/margins": 4.737081527709961, + "rewards/rejected": -6.3941330909729, + "step": 1935 + }, + { + "epoch": 0.41, + "learning_rate": 1.1949579831932773e-05, + "logits/chosen": -2.1119136810302734, + "logits/rejected": -2.1079163551330566, + "logps/chosen": -265.5458068847656, + "logps/rejected": -367.0487060546875, + "loss": 0.3151, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.417665958404541, + "rewards/margins": 3.9206326007843018, + "rewards/rejected": -6.338298320770264, + "step": 1936 + }, + { + "epoch": 0.41, + "learning_rate": 1.1945378151260505e-05, + "logits/chosen": -2.200597047805786, + "logits/rejected": -1.979295015335083, + "logps/chosen": -415.5914306640625, + "logps/rejected": -437.1701354980469, + "loss": 0.2908, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3065394163131714, + "rewards/margins": 3.6270809173583984, + "rewards/rejected": -4.933620452880859, + "step": 1937 + }, + { + "epoch": 0.41, + "learning_rate": 1.1941176470588235e-05, + "logits/chosen": -1.958099126815796, + "logits/rejected": -1.7996703386306763, + "logps/chosen": -329.54888916015625, + "logps/rejected": -380.7408142089844, + "loss": 0.1874, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8060333132743835, + "rewards/margins": 2.933206081390381, + "rewards/rejected": -3.739239454269409, + "step": 1938 + }, + { + "epoch": 0.41, + "learning_rate": 1.1936974789915967e-05, + "logits/chosen": -1.9268754720687866, + "logits/rejected": -2.0633370876312256, + "logps/chosen": -254.12820434570312, + "logps/rejected": -280.59991455078125, + "loss": 0.1357, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9287095069885254, + "rewards/margins": 3.3217313289642334, + "rewards/rejected": -5.25044059753418, + "step": 1939 + }, + { + "epoch": 0.41, + "learning_rate": 1.1932773109243699e-05, + "logits/chosen": -2.182180166244507, + "logits/rejected": -1.7079010009765625, + "logps/chosen": -423.8414306640625, + "logps/rejected": -335.9015197753906, + "loss": 0.4899, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6346639394760132, + "rewards/margins": 4.65989351272583, + "rewards/rejected": -6.294557571411133, + "step": 1940 + }, + { + "epoch": 0.41, + "learning_rate": 1.192857142857143e-05, + "logits/chosen": -2.022777557373047, + "logits/rejected": -2.0109386444091797, + "logps/chosen": -307.6396484375, + "logps/rejected": -387.5233154296875, + "loss": 0.6461, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.8571763038635254, + "rewards/margins": 3.773679733276367, + "rewards/rejected": -6.630856037139893, + "step": 1941 + }, + { + "epoch": 0.41, + "learning_rate": 1.1924369747899161e-05, + "logits/chosen": -2.147752285003662, + "logits/rejected": -1.888809084892273, + "logps/chosen": -259.9430236816406, + "logps/rejected": -292.3148193359375, + "loss": 0.1, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3286447525024414, + "rewards/margins": 5.61132287979126, + "rewards/rejected": -6.939967632293701, + "step": 1942 + }, + { + "epoch": 0.41, + "learning_rate": 1.1920168067226891e-05, + "logits/chosen": -2.0400469303131104, + "logits/rejected": -1.971203088760376, + "logps/chosen": -339.0711669921875, + "logps/rejected": -396.9685363769531, + "loss": 0.2274, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.1302807331085205, + "rewards/margins": 5.530862331390381, + "rewards/rejected": -7.6611433029174805, + "step": 1943 + }, + { + "epoch": 0.41, + "learning_rate": 1.1915966386554623e-05, + "logits/chosen": -2.0932838916778564, + "logits/rejected": -2.1938164234161377, + "logps/chosen": -280.6672058105469, + "logps/rejected": -329.2666931152344, + "loss": 0.2732, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8697800636291504, + "rewards/margins": 3.28078556060791, + "rewards/rejected": -5.150566101074219, + "step": 1944 + }, + { + "epoch": 0.41, + "learning_rate": 1.1911764705882354e-05, + "logits/chosen": -2.123795986175537, + "logits/rejected": -1.7978296279907227, + "logps/chosen": -181.64398193359375, + "logps/rejected": -172.8799285888672, + "loss": 0.2655, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8274486064910889, + "rewards/margins": 3.6413755416870117, + "rewards/rejected": -5.46882438659668, + "step": 1945 + }, + { + "epoch": 0.41, + "learning_rate": 1.1907563025210086e-05, + "logits/chosen": -2.057217597961426, + "logits/rejected": -1.7277305126190186, + "logps/chosen": -345.43341064453125, + "logps/rejected": -342.9769287109375, + "loss": 0.4275, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9175856113433838, + "rewards/margins": 4.27393102645874, + "rewards/rejected": -6.191516876220703, + "step": 1946 + }, + { + "epoch": 0.41, + "learning_rate": 1.1903361344537816e-05, + "logits/chosen": -2.2791125774383545, + "logits/rejected": -2.1852200031280518, + "logps/chosen": -447.7438659667969, + "logps/rejected": -433.920166015625, + "loss": 0.4509, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8635163307189941, + "rewards/margins": 3.2856340408325195, + "rewards/rejected": -5.149150371551514, + "step": 1947 + }, + { + "epoch": 0.41, + "learning_rate": 1.1899159663865548e-05, + "logits/chosen": -2.1836092472076416, + "logits/rejected": -1.7901263236999512, + "logps/chosen": -236.86294555664062, + "logps/rejected": -298.5499267578125, + "loss": 0.239, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6819275617599487, + "rewards/margins": 4.898286819458008, + "rewards/rejected": -6.580214500427246, + "step": 1948 + }, + { + "epoch": 0.41, + "learning_rate": 1.1894957983193278e-05, + "logits/chosen": -1.8687278032302856, + "logits/rejected": -1.7982490062713623, + "logps/chosen": -411.6961975097656, + "logps/rejected": -383.32257080078125, + "loss": 0.464, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.2086563110351562, + "rewards/margins": 3.933460235595703, + "rewards/rejected": -6.142116069793701, + "step": 1949 + }, + { + "epoch": 0.41, + "learning_rate": 1.189075630252101e-05, + "logits/chosen": -2.2042768001556396, + "logits/rejected": -1.9281728267669678, + "logps/chosen": -375.22100830078125, + "logps/rejected": -343.9200439453125, + "loss": 0.4534, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.310399293899536, + "rewards/margins": 3.9504218101501465, + "rewards/rejected": -6.260821342468262, + "step": 1950 + }, + { + "epoch": 0.41, + "learning_rate": 1.188655462184874e-05, + "logits/chosen": -1.6523381471633911, + "logits/rejected": -1.7892065048217773, + "logps/chosen": -291.7080078125, + "logps/rejected": -394.178466796875, + "loss": 0.3157, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.075544834136963, + "rewards/margins": 3.866364002227783, + "rewards/rejected": -5.941908836364746, + "step": 1951 + }, + { + "epoch": 0.41, + "learning_rate": 1.1882352941176472e-05, + "logits/chosen": -1.688197135925293, + "logits/rejected": -1.9073342084884644, + "logps/chosen": -182.81512451171875, + "logps/rejected": -294.6404113769531, + "loss": 0.1558, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.539276361465454, + "rewards/margins": 4.741386890411377, + "rewards/rejected": -6.280663013458252, + "step": 1952 + }, + { + "epoch": 0.41, + "learning_rate": 1.1878151260504202e-05, + "logits/chosen": -2.3379714488983154, + "logits/rejected": -1.7211012840270996, + "logps/chosen": -554.7611083984375, + "logps/rejected": -393.25482177734375, + "loss": 0.4411, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9788715839385986, + "rewards/margins": 3.947096109390259, + "rewards/rejected": -5.925967693328857, + "step": 1953 + }, + { + "epoch": 0.41, + "learning_rate": 1.1873949579831934e-05, + "logits/chosen": -2.1366307735443115, + "logits/rejected": -2.145113468170166, + "logps/chosen": -354.5496826171875, + "logps/rejected": -384.859619140625, + "loss": 0.2317, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.0397887229919434, + "rewards/margins": 4.053105354309082, + "rewards/rejected": -6.092894554138184, + "step": 1954 + }, + { + "epoch": 0.41, + "learning_rate": 1.1869747899159664e-05, + "logits/chosen": -2.2341721057891846, + "logits/rejected": -1.98061203956604, + "logps/chosen": -338.8604736328125, + "logps/rejected": -304.5364990234375, + "loss": 0.4437, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.230681896209717, + "rewards/margins": 3.2862181663513184, + "rewards/rejected": -5.516900539398193, + "step": 1955 + }, + { + "epoch": 0.41, + "learning_rate": 1.1865546218487396e-05, + "logits/chosen": -1.887587308883667, + "logits/rejected": -1.7623660564422607, + "logps/chosen": -306.985595703125, + "logps/rejected": -350.65142822265625, + "loss": 0.4348, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.3750853538513184, + "rewards/margins": 1.4892942905426025, + "rewards/rejected": -4.8643798828125, + "step": 1956 + }, + { + "epoch": 0.41, + "learning_rate": 1.1861344537815126e-05, + "logits/chosen": -2.128178119659424, + "logits/rejected": -1.4239622354507446, + "logps/chosen": -389.30242919921875, + "logps/rejected": -272.2572021484375, + "loss": 0.3527, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3870751857757568, + "rewards/margins": 4.557292938232422, + "rewards/rejected": -5.9443678855896, + "step": 1957 + }, + { + "epoch": 0.41, + "learning_rate": 1.1857142857142858e-05, + "logits/chosen": -2.092729330062866, + "logits/rejected": -2.0663514137268066, + "logps/chosen": -248.1607666015625, + "logps/rejected": -274.74151611328125, + "loss": 0.4519, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9701026678085327, + "rewards/margins": 2.9491186141967773, + "rewards/rejected": -4.919220924377441, + "step": 1958 + }, + { + "epoch": 0.41, + "learning_rate": 1.1852941176470589e-05, + "logits/chosen": -1.8019726276397705, + "logits/rejected": -1.7665395736694336, + "logps/chosen": -251.14222717285156, + "logps/rejected": -362.126708984375, + "loss": 0.079, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9113917350769043, + "rewards/margins": 4.477956771850586, + "rewards/rejected": -6.389348030090332, + "step": 1959 + }, + { + "epoch": 0.41, + "learning_rate": 1.184873949579832e-05, + "logits/chosen": -1.7693349123001099, + "logits/rejected": -1.6798259019851685, + "logps/chosen": -330.01531982421875, + "logps/rejected": -336.0985107421875, + "loss": 0.1821, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.097743272781372, + "rewards/margins": 3.837233543395996, + "rewards/rejected": -5.934976577758789, + "step": 1960 + }, + { + "epoch": 0.41, + "learning_rate": 1.184453781512605e-05, + "logits/chosen": -1.6896579265594482, + "logits/rejected": -1.8994159698486328, + "logps/chosen": -373.4425048828125, + "logps/rejected": -480.84454345703125, + "loss": 0.6525, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.2558233737945557, + "rewards/margins": 2.759458065032959, + "rewards/rejected": -6.0152812004089355, + "step": 1961 + }, + { + "epoch": 0.41, + "learning_rate": 1.1840336134453783e-05, + "logits/chosen": -2.0631000995635986, + "logits/rejected": -1.8703677654266357, + "logps/chosen": -342.1109619140625, + "logps/rejected": -299.40411376953125, + "loss": 0.4489, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.010925531387329, + "rewards/margins": 3.1662964820861816, + "rewards/rejected": -5.177222728729248, + "step": 1962 + }, + { + "epoch": 0.41, + "learning_rate": 1.1836134453781515e-05, + "logits/chosen": -2.199766159057617, + "logits/rejected": -2.1669607162475586, + "logps/chosen": -330.13623046875, + "logps/rejected": -331.3076477050781, + "loss": 0.5619, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.482279062271118, + "rewards/margins": 4.212305068969727, + "rewards/rejected": -6.694583892822266, + "step": 1963 + }, + { + "epoch": 0.41, + "learning_rate": 1.1831932773109245e-05, + "logits/chosen": -2.0139265060424805, + "logits/rejected": -1.939925193786621, + "logps/chosen": -277.1054992675781, + "logps/rejected": -270.07830810546875, + "loss": 0.603, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.6427929401397705, + "rewards/margins": 2.453946590423584, + "rewards/rejected": -5.096739768981934, + "step": 1964 + }, + { + "epoch": 0.41, + "learning_rate": 1.1827731092436977e-05, + "logits/chosen": -2.074214458465576, + "logits/rejected": -1.5690877437591553, + "logps/chosen": -382.2853698730469, + "logps/rejected": -397.92059326171875, + "loss": 0.15, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.129748582839966, + "rewards/margins": 4.783637523651123, + "rewards/rejected": -6.91338586807251, + "step": 1965 + }, + { + "epoch": 0.41, + "learning_rate": 1.1823529411764707e-05, + "logits/chosen": -1.486264705657959, + "logits/rejected": -1.6873584985733032, + "logps/chosen": -313.6159362792969, + "logps/rejected": -409.6959228515625, + "loss": 0.2055, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.4844980239868164, + "rewards/margins": 3.7648491859436035, + "rewards/rejected": -6.249347686767578, + "step": 1966 + }, + { + "epoch": 0.41, + "learning_rate": 1.1819327731092439e-05, + "logits/chosen": -2.1372106075286865, + "logits/rejected": -1.5233962535858154, + "logps/chosen": -303.3284606933594, + "logps/rejected": -292.6336364746094, + "loss": 0.2732, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.993044137954712, + "rewards/margins": 4.036615371704102, + "rewards/rejected": -6.029659748077393, + "step": 1967 + }, + { + "epoch": 0.41, + "learning_rate": 1.1815126050420169e-05, + "logits/chosen": -1.8164833784103394, + "logits/rejected": -1.5731630325317383, + "logps/chosen": -268.54144287109375, + "logps/rejected": -219.06790161132812, + "loss": 0.5545, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.839538812637329, + "rewards/margins": 2.256260633468628, + "rewards/rejected": -5.095799446105957, + "step": 1968 + }, + { + "epoch": 0.41, + "learning_rate": 1.1810924369747901e-05, + "logits/chosen": -1.9257806539535522, + "logits/rejected": -1.5365087985992432, + "logps/chosen": -392.1906433105469, + "logps/rejected": -377.35791015625, + "loss": 0.2562, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.231613874435425, + "rewards/margins": 4.251580238342285, + "rewards/rejected": -6.483194351196289, + "step": 1969 + }, + { + "epoch": 0.41, + "learning_rate": 1.1806722689075631e-05, + "logits/chosen": -2.223924398422241, + "logits/rejected": -2.3198938369750977, + "logps/chosen": -455.7251281738281, + "logps/rejected": -424.00750732421875, + "loss": 0.4177, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4913262128829956, + "rewards/margins": 2.4450111389160156, + "rewards/rejected": -3.9363372325897217, + "step": 1970 + }, + { + "epoch": 0.41, + "learning_rate": 1.1802521008403363e-05, + "logits/chosen": -2.1450693607330322, + "logits/rejected": -2.015467405319214, + "logps/chosen": -551.3428344726562, + "logps/rejected": -442.72283935546875, + "loss": 0.1146, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1253373622894287, + "rewards/margins": 4.3501081466674805, + "rewards/rejected": -5.475445747375488, + "step": 1971 + }, + { + "epoch": 0.41, + "learning_rate": 1.1798319327731093e-05, + "logits/chosen": -1.9290738105773926, + "logits/rejected": -1.543068289756775, + "logps/chosen": -406.9100036621094, + "logps/rejected": -354.5848083496094, + "loss": 0.3529, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.880568504333496, + "rewards/margins": 3.7765603065490723, + "rewards/rejected": -5.65712833404541, + "step": 1972 + }, + { + "epoch": 0.41, + "learning_rate": 1.1794117647058825e-05, + "logits/chosen": -2.038806676864624, + "logits/rejected": -2.015094757080078, + "logps/chosen": -350.6685485839844, + "logps/rejected": -344.201416015625, + "loss": 0.3596, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.4985814094543457, + "rewards/margins": 4.5968017578125, + "rewards/rejected": -7.095383644104004, + "step": 1973 + }, + { + "epoch": 0.41, + "learning_rate": 1.1789915966386555e-05, + "logits/chosen": -1.9172277450561523, + "logits/rejected": -2.393588066101074, + "logps/chosen": -273.54412841796875, + "logps/rejected": -362.3873596191406, + "loss": 0.1462, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1986079216003418, + "rewards/margins": 5.007539749145508, + "rewards/rejected": -6.206147193908691, + "step": 1974 + }, + { + "epoch": 0.41, + "learning_rate": 1.1785714285714287e-05, + "logits/chosen": -2.0969595909118652, + "logits/rejected": -1.7913565635681152, + "logps/chosen": -397.072509765625, + "logps/rejected": -355.4930114746094, + "loss": 0.3659, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.811758041381836, + "rewards/margins": 4.794989585876465, + "rewards/rejected": -6.606747627258301, + "step": 1975 + }, + { + "epoch": 0.41, + "learning_rate": 1.1781512605042018e-05, + "logits/chosen": -2.117095947265625, + "logits/rejected": -1.6759452819824219, + "logps/chosen": -429.91754150390625, + "logps/rejected": -372.3650207519531, + "loss": 0.2085, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8025895357131958, + "rewards/margins": 3.76780104637146, + "rewards/rejected": -5.570390701293945, + "step": 1976 + }, + { + "epoch": 0.41, + "learning_rate": 1.177731092436975e-05, + "logits/chosen": -2.346609354019165, + "logits/rejected": -2.261892080307007, + "logps/chosen": -257.985107421875, + "logps/rejected": -361.9310607910156, + "loss": 0.0974, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8649860620498657, + "rewards/margins": 3.897449016571045, + "rewards/rejected": -5.762434959411621, + "step": 1977 + }, + { + "epoch": 0.41, + "learning_rate": 1.177310924369748e-05, + "logits/chosen": -2.1725616455078125, + "logits/rejected": -1.791488766670227, + "logps/chosen": -330.4352111816406, + "logps/rejected": -270.24859619140625, + "loss": 0.2262, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.191359758377075, + "rewards/margins": 3.675600051879883, + "rewards/rejected": -5.866960048675537, + "step": 1978 + }, + { + "epoch": 0.41, + "learning_rate": 1.1768907563025212e-05, + "logits/chosen": -2.2235729694366455, + "logits/rejected": -1.9197649955749512, + "logps/chosen": -341.85516357421875, + "logps/rejected": -293.85821533203125, + "loss": 0.1469, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0742182731628418, + "rewards/margins": 3.1900198459625244, + "rewards/rejected": -4.264238357543945, + "step": 1979 + }, + { + "epoch": 0.41, + "learning_rate": 1.1764705882352942e-05, + "logits/chosen": -2.2083446979522705, + "logits/rejected": -2.09649920463562, + "logps/chosen": -358.7520751953125, + "logps/rejected": -356.1342468261719, + "loss": 0.0713, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.455860137939453, + "rewards/margins": 3.830432176589966, + "rewards/rejected": -6.28629207611084, + "step": 1980 + }, + { + "epoch": 0.41, + "learning_rate": 1.1760504201680674e-05, + "logits/chosen": -2.045534133911133, + "logits/rejected": -2.037423610687256, + "logps/chosen": -334.7827453613281, + "logps/rejected": -393.2997741699219, + "loss": 0.2295, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.02451753616333, + "rewards/margins": 3.3219265937805176, + "rewards/rejected": -5.346444129943848, + "step": 1981 + }, + { + "epoch": 0.41, + "learning_rate": 1.1756302521008404e-05, + "logits/chosen": -1.9969927072525024, + "logits/rejected": -2.108992099761963, + "logps/chosen": -300.56158447265625, + "logps/rejected": -276.80181884765625, + "loss": 0.2839, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.421304225921631, + "rewards/margins": 3.123438596725464, + "rewards/rejected": -5.544742584228516, + "step": 1982 + }, + { + "epoch": 0.41, + "learning_rate": 1.1752100840336136e-05, + "logits/chosen": -2.492192029953003, + "logits/rejected": -1.85874605178833, + "logps/chosen": -424.85296630859375, + "logps/rejected": -295.4246826171875, + "loss": 0.2656, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.951184630393982, + "rewards/margins": 3.984459400177002, + "rewards/rejected": -5.935644149780273, + "step": 1983 + }, + { + "epoch": 0.42, + "learning_rate": 1.1747899159663866e-05, + "logits/chosen": -2.1601483821868896, + "logits/rejected": -2.0993869304656982, + "logps/chosen": -359.14013671875, + "logps/rejected": -285.28759765625, + "loss": 0.2903, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.616579294204712, + "rewards/margins": 2.754587173461914, + "rewards/rejected": -5.371166229248047, + "step": 1984 + }, + { + "epoch": 0.42, + "learning_rate": 1.1743697478991598e-05, + "logits/chosen": -2.106593370437622, + "logits/rejected": -1.70912504196167, + "logps/chosen": -435.725830078125, + "logps/rejected": -388.1470642089844, + "loss": 0.1443, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3147165775299072, + "rewards/margins": 5.514959812164307, + "rewards/rejected": -6.829676628112793, + "step": 1985 + }, + { + "epoch": 0.42, + "learning_rate": 1.173949579831933e-05, + "logits/chosen": -2.2300922870635986, + "logits/rejected": -1.7195117473602295, + "logps/chosen": -255.38682556152344, + "logps/rejected": -241.39251708984375, + "loss": 0.2197, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.2693943977355957, + "rewards/margins": 3.3821494579315186, + "rewards/rejected": -5.651543617248535, + "step": 1986 + }, + { + "epoch": 0.42, + "learning_rate": 1.173529411764706e-05, + "logits/chosen": -2.233675479888916, + "logits/rejected": -1.6387104988098145, + "logps/chosen": -288.41064453125, + "logps/rejected": -294.75640869140625, + "loss": 0.301, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.454026699066162, + "rewards/margins": 3.922327995300293, + "rewards/rejected": -6.376354694366455, + "step": 1987 + }, + { + "epoch": 0.42, + "learning_rate": 1.1731092436974792e-05, + "logits/chosen": -1.9402329921722412, + "logits/rejected": -1.9833506345748901, + "logps/chosen": -223.27432250976562, + "logps/rejected": -307.5994873046875, + "loss": 0.3893, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.535097599029541, + "rewards/margins": 4.090473175048828, + "rewards/rejected": -6.625570297241211, + "step": 1988 + }, + { + "epoch": 0.42, + "learning_rate": 1.1726890756302522e-05, + "logits/chosen": -1.8817272186279297, + "logits/rejected": -1.883370041847229, + "logps/chosen": -295.766357421875, + "logps/rejected": -317.646240234375, + "loss": 0.5354, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.3192965984344482, + "rewards/margins": 3.371514320373535, + "rewards/rejected": -5.6908111572265625, + "step": 1989 + }, + { + "epoch": 0.42, + "learning_rate": 1.1722689075630254e-05, + "logits/chosen": -2.5206491947174072, + "logits/rejected": -2.1297101974487305, + "logps/chosen": -335.68402099609375, + "logps/rejected": -323.77972412109375, + "loss": 0.1752, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9038801193237305, + "rewards/margins": 3.6221609115600586, + "rewards/rejected": -5.526040554046631, + "step": 1990 + }, + { + "epoch": 0.42, + "learning_rate": 1.1718487394957984e-05, + "logits/chosen": -2.2206225395202637, + "logits/rejected": -1.6062289476394653, + "logps/chosen": -365.5782470703125, + "logps/rejected": -308.41986083984375, + "loss": 0.5274, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.1508214473724365, + "rewards/margins": 3.382694959640503, + "rewards/rejected": -5.533515930175781, + "step": 1991 + }, + { + "epoch": 0.42, + "learning_rate": 1.1714285714285716e-05, + "logits/chosen": -1.9700405597686768, + "logits/rejected": -1.9780497550964355, + "logps/chosen": -342.9854736328125, + "logps/rejected": -407.82861328125, + "loss": 0.3352, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.7476754188537598, + "rewards/margins": 3.291015625, + "rewards/rejected": -6.038690567016602, + "step": 1992 + }, + { + "epoch": 0.42, + "learning_rate": 1.1710084033613447e-05, + "logits/chosen": -2.1418800354003906, + "logits/rejected": -1.5256165266036987, + "logps/chosen": -338.65557861328125, + "logps/rejected": -274.3718566894531, + "loss": 0.4076, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.764272928237915, + "rewards/margins": 3.890021324157715, + "rewards/rejected": -6.654294967651367, + "step": 1993 + }, + { + "epoch": 0.42, + "learning_rate": 1.1705882352941178e-05, + "logits/chosen": -2.095531463623047, + "logits/rejected": -1.8915833234786987, + "logps/chosen": -250.8558349609375, + "logps/rejected": -297.199462890625, + "loss": 0.2868, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9347187280654907, + "rewards/margins": 3.844944477081299, + "rewards/rejected": -5.7796630859375, + "step": 1994 + }, + { + "epoch": 0.42, + "learning_rate": 1.1701680672268909e-05, + "logits/chosen": -2.112004041671753, + "logits/rejected": -1.9657294750213623, + "logps/chosen": -353.8191223144531, + "logps/rejected": -346.2870788574219, + "loss": 0.4182, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.6809327602386475, + "rewards/margins": 2.743990421295166, + "rewards/rejected": -5.424922943115234, + "step": 1995 + }, + { + "epoch": 0.42, + "learning_rate": 1.169747899159664e-05, + "logits/chosen": -2.249419689178467, + "logits/rejected": -2.2966601848602295, + "logps/chosen": -411.1002502441406, + "logps/rejected": -422.5325927734375, + "loss": 0.4391, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.4471569061279297, + "rewards/margins": 3.2927207946777344, + "rewards/rejected": -5.739877700805664, + "step": 1996 + }, + { + "epoch": 0.42, + "learning_rate": 1.1693277310924371e-05, + "logits/chosen": -2.2589008808135986, + "logits/rejected": -2.1911535263061523, + "logps/chosen": -347.05303955078125, + "logps/rejected": -376.7115478515625, + "loss": 0.6299, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.8977396488189697, + "rewards/margins": 3.968108654022217, + "rewards/rejected": -6.865848541259766, + "step": 1997 + }, + { + "epoch": 0.42, + "learning_rate": 1.1689075630252103e-05, + "logits/chosen": -2.438709259033203, + "logits/rejected": -2.026749849319458, + "logps/chosen": -372.5621337890625, + "logps/rejected": -345.9267578125, + "loss": 0.4279, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9480586051940918, + "rewards/margins": 3.7759501934051514, + "rewards/rejected": -5.724008560180664, + "step": 1998 + }, + { + "epoch": 0.42, + "learning_rate": 1.1684873949579833e-05, + "logits/chosen": -2.129971504211426, + "logits/rejected": -2.127574920654297, + "logps/chosen": -368.34283447265625, + "logps/rejected": -411.705322265625, + "loss": 0.3494, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.4318723678588867, + "rewards/margins": 3.5859169960021973, + "rewards/rejected": -6.017788887023926, + "step": 1999 + }, + { + "epoch": 0.42, + "learning_rate": 1.1680672268907565e-05, + "logits/chosen": -2.024904727935791, + "logits/rejected": -1.8718531131744385, + "logps/chosen": -199.9012451171875, + "logps/rejected": -270.2091979980469, + "loss": 0.256, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.6000401973724365, + "rewards/margins": 3.5901875495910645, + "rewards/rejected": -6.190227031707764, + "step": 2000 + }, + { + "epoch": 0.42, + "learning_rate": 1.1676470588235295e-05, + "logits/chosen": -2.079197406768799, + "logits/rejected": -1.696530818939209, + "logps/chosen": -283.9945983886719, + "logps/rejected": -273.9361572265625, + "loss": 0.7153, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.8693621158599854, + "rewards/margins": 2.799614191055298, + "rewards/rejected": -5.668976783752441, + "step": 2001 + }, + { + "epoch": 0.42, + "learning_rate": 1.1672268907563027e-05, + "logits/chosen": -2.105051040649414, + "logits/rejected": -2.029066562652588, + "logps/chosen": -357.68682861328125, + "logps/rejected": -345.736572265625, + "loss": 0.264, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7269809246063232, + "rewards/margins": 3.248204231262207, + "rewards/rejected": -4.975184440612793, + "step": 2002 + }, + { + "epoch": 0.42, + "learning_rate": 1.1668067226890757e-05, + "logits/chosen": -1.910569429397583, + "logits/rejected": -1.6828595399856567, + "logps/chosen": -331.3682861328125, + "logps/rejected": -296.7809753417969, + "loss": 0.2823, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.4297759532928467, + "rewards/margins": 3.8586106300354004, + "rewards/rejected": -6.288386821746826, + "step": 2003 + }, + { + "epoch": 0.42, + "learning_rate": 1.166386554621849e-05, + "logits/chosen": -2.074186325073242, + "logits/rejected": -1.9179534912109375, + "logps/chosen": -240.02157592773438, + "logps/rejected": -334.829833984375, + "loss": 0.159, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.493454933166504, + "rewards/margins": 3.1100752353668213, + "rewards/rejected": -5.603529930114746, + "step": 2004 + }, + { + "epoch": 0.42, + "learning_rate": 1.165966386554622e-05, + "logits/chosen": -2.3214378356933594, + "logits/rejected": -2.194861888885498, + "logps/chosen": -294.208740234375, + "logps/rejected": -304.8033447265625, + "loss": 0.3174, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.840536117553711, + "rewards/margins": 3.5510988235473633, + "rewards/rejected": -5.391634464263916, + "step": 2005 + }, + { + "epoch": 0.42, + "learning_rate": 1.1655462184873951e-05, + "logits/chosen": -2.4050040245056152, + "logits/rejected": -1.8576184511184692, + "logps/chosen": -396.2915344238281, + "logps/rejected": -283.24090576171875, + "loss": 0.0586, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8042290210723877, + "rewards/margins": 5.0325493812561035, + "rewards/rejected": -6.83677864074707, + "step": 2006 + }, + { + "epoch": 0.42, + "learning_rate": 1.1651260504201683e-05, + "logits/chosen": -1.7313112020492554, + "logits/rejected": -1.535030484199524, + "logps/chosen": -278.0242004394531, + "logps/rejected": -248.71359252929688, + "loss": 0.4917, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.7757339477539062, + "rewards/margins": 2.6418771743774414, + "rewards/rejected": -5.417611122131348, + "step": 2007 + }, + { + "epoch": 0.42, + "learning_rate": 1.1647058823529413e-05, + "logits/chosen": -1.8838014602661133, + "logits/rejected": -2.0262277126312256, + "logps/chosen": -330.3498229980469, + "logps/rejected": -304.90771484375, + "loss": 0.588, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.2751355171203613, + "rewards/margins": 2.4011154174804688, + "rewards/rejected": -4.676250457763672, + "step": 2008 + }, + { + "epoch": 0.42, + "learning_rate": 1.1642857142857145e-05, + "logits/chosen": -2.2164459228515625, + "logits/rejected": -1.8665568828582764, + "logps/chosen": -349.1014404296875, + "logps/rejected": -224.43362426757812, + "loss": 0.1701, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1316754817962646, + "rewards/margins": 4.139419078826904, + "rewards/rejected": -6.271094799041748, + "step": 2009 + }, + { + "epoch": 0.42, + "learning_rate": 1.1638655462184876e-05, + "logits/chosen": -2.43519926071167, + "logits/rejected": -1.8341882228851318, + "logps/chosen": -335.29107666015625, + "logps/rejected": -237.13446044921875, + "loss": 0.1021, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.127246379852295, + "rewards/margins": 4.704779148101807, + "rewards/rejected": -6.832025527954102, + "step": 2010 + }, + { + "epoch": 0.42, + "learning_rate": 1.1634453781512608e-05, + "logits/chosen": -2.3218064308166504, + "logits/rejected": -2.132877826690674, + "logps/chosen": -412.05755615234375, + "logps/rejected": -369.7252197265625, + "loss": 0.3225, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.2949604988098145, + "rewards/margins": 3.5905070304870605, + "rewards/rejected": -5.885467529296875, + "step": 2011 + }, + { + "epoch": 0.42, + "learning_rate": 1.1630252100840338e-05, + "logits/chosen": -2.15934681892395, + "logits/rejected": -2.282438039779663, + "logps/chosen": -388.4600830078125, + "logps/rejected": -455.7470397949219, + "loss": 0.2872, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.475032091140747, + "rewards/margins": 3.115109920501709, + "rewards/rejected": -5.590141773223877, + "step": 2012 + }, + { + "epoch": 0.42, + "learning_rate": 1.162605042016807e-05, + "logits/chosen": -2.0802018642425537, + "logits/rejected": -1.5434036254882812, + "logps/chosen": -299.9101867675781, + "logps/rejected": -278.4464111328125, + "loss": 0.1651, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.55178165435791, + "rewards/margins": 3.4816830158233643, + "rewards/rejected": -6.033464431762695, + "step": 2013 + }, + { + "epoch": 0.42, + "learning_rate": 1.16218487394958e-05, + "logits/chosen": -1.8953118324279785, + "logits/rejected": -2.399675130844116, + "logps/chosen": -242.6870880126953, + "logps/rejected": -345.7003173828125, + "loss": 0.8016, + "rewards/accuracies": 0.5625, + "rewards/chosen": -3.0013341903686523, + "rewards/margins": 1.4316017627716064, + "rewards/rejected": -4.43293571472168, + "step": 2014 + }, + { + "epoch": 0.42, + "learning_rate": 1.1617647058823532e-05, + "logits/chosen": -2.285087823867798, + "logits/rejected": -1.8152977228164673, + "logps/chosen": -422.3471984863281, + "logps/rejected": -356.77178955078125, + "loss": 0.3376, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9482060670852661, + "rewards/margins": 3.8550796508789062, + "rewards/rejected": -5.803285598754883, + "step": 2015 + }, + { + "epoch": 0.42, + "learning_rate": 1.1613445378151262e-05, + "logits/chosen": -2.2776033878326416, + "logits/rejected": -2.055415391921997, + "logps/chosen": -353.1641540527344, + "logps/rejected": -294.0531005859375, + "loss": 0.2981, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.145308494567871, + "rewards/margins": 3.6759276390075684, + "rewards/rejected": -5.8212361335754395, + "step": 2016 + }, + { + "epoch": 0.42, + "learning_rate": 1.1609243697478994e-05, + "logits/chosen": -2.163743734359741, + "logits/rejected": -1.9288283586502075, + "logps/chosen": -383.09661865234375, + "logps/rejected": -379.734619140625, + "loss": 0.155, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4958137273788452, + "rewards/margins": 4.644062042236328, + "rewards/rejected": -6.139875411987305, + "step": 2017 + }, + { + "epoch": 0.42, + "learning_rate": 1.1605042016806724e-05, + "logits/chosen": -2.2524187564849854, + "logits/rejected": -1.9637539386749268, + "logps/chosen": -381.77923583984375, + "logps/rejected": -385.3320007324219, + "loss": 0.3817, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3869824409484863, + "rewards/margins": 3.772944927215576, + "rewards/rejected": -5.159926891326904, + "step": 2018 + }, + { + "epoch": 0.42, + "learning_rate": 1.1600840336134456e-05, + "logits/chosen": -2.2648332118988037, + "logits/rejected": -2.156357765197754, + "logps/chosen": -339.6280517578125, + "logps/rejected": -316.3348693847656, + "loss": 0.2546, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.4685208797454834, + "rewards/margins": 4.188183307647705, + "rewards/rejected": -6.656703948974609, + "step": 2019 + }, + { + "epoch": 0.42, + "learning_rate": 1.1596638655462186e-05, + "logits/chosen": -2.208322048187256, + "logits/rejected": -1.9311819076538086, + "logps/chosen": -281.691650390625, + "logps/rejected": -298.8689270019531, + "loss": 0.3471, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.3511407375335693, + "rewards/margins": 4.380074977874756, + "rewards/rejected": -6.731215476989746, + "step": 2020 + }, + { + "epoch": 0.42, + "learning_rate": 1.1592436974789918e-05, + "logits/chosen": -2.2703824043273926, + "logits/rejected": -1.9259319305419922, + "logps/chosen": -483.55999755859375, + "logps/rejected": -356.62371826171875, + "loss": 0.2327, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9399688243865967, + "rewards/margins": 2.9370155334472656, + "rewards/rejected": -4.876984119415283, + "step": 2021 + }, + { + "epoch": 0.42, + "learning_rate": 1.1588235294117648e-05, + "logits/chosen": -2.048274040222168, + "logits/rejected": -2.2095186710357666, + "logps/chosen": -287.12921142578125, + "logps/rejected": -294.0669860839844, + "loss": 0.5658, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.329941749572754, + "rewards/margins": 2.1715383529663086, + "rewards/rejected": -4.5014801025390625, + "step": 2022 + }, + { + "epoch": 0.42, + "learning_rate": 1.158403361344538e-05, + "logits/chosen": -2.269408702850342, + "logits/rejected": -2.075428009033203, + "logps/chosen": -402.91607666015625, + "logps/rejected": -356.7300720214844, + "loss": 0.2597, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4409866333007812, + "rewards/margins": 4.622939586639404, + "rewards/rejected": -6.0639262199401855, + "step": 2023 + }, + { + "epoch": 0.42, + "learning_rate": 1.157983193277311e-05, + "logits/chosen": -2.1971328258514404, + "logits/rejected": -2.0909547805786133, + "logps/chosen": -359.8511962890625, + "logps/rejected": -321.28973388671875, + "loss": 0.573, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.780590534210205, + "rewards/margins": 2.45827054977417, + "rewards/rejected": -6.238861083984375, + "step": 2024 + }, + { + "epoch": 0.42, + "learning_rate": 1.1575630252100842e-05, + "logits/chosen": -1.920001745223999, + "logits/rejected": -2.0093905925750732, + "logps/chosen": -373.7569580078125, + "logps/rejected": -357.22515869140625, + "loss": 0.2532, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3780170679092407, + "rewards/margins": 2.417965888977051, + "rewards/rejected": -3.795982599258423, + "step": 2025 + }, + { + "epoch": 0.42, + "learning_rate": 1.1571428571428573e-05, + "logits/chosen": -2.2176761627197266, + "logits/rejected": -1.9830904006958008, + "logps/chosen": -329.3486328125, + "logps/rejected": -342.1224060058594, + "loss": 0.182, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.7171614170074463, + "rewards/margins": 4.0453410148620605, + "rewards/rejected": -6.762502670288086, + "step": 2026 + }, + { + "epoch": 0.42, + "learning_rate": 1.1567226890756305e-05, + "logits/chosen": -2.058716297149658, + "logits/rejected": -1.8591029644012451, + "logps/chosen": -383.6545104980469, + "logps/rejected": -344.69195556640625, + "loss": 0.6941, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.4524264335632324, + "rewards/margins": 2.642935276031494, + "rewards/rejected": -5.095361709594727, + "step": 2027 + }, + { + "epoch": 0.42, + "learning_rate": 1.1563025210084035e-05, + "logits/chosen": -2.2357912063598633, + "logits/rejected": -1.981215238571167, + "logps/chosen": -372.9283142089844, + "logps/rejected": -298.0931701660156, + "loss": 0.1775, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.313389539718628, + "rewards/margins": 4.310908317565918, + "rewards/rejected": -5.624298572540283, + "step": 2028 + }, + { + "epoch": 0.42, + "learning_rate": 1.1558823529411765e-05, + "logits/chosen": -1.8320471048355103, + "logits/rejected": -1.7739046812057495, + "logps/chosen": -259.5126953125, + "logps/rejected": -312.6507568359375, + "loss": 0.2925, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.192497730255127, + "rewards/margins": 2.910689353942871, + "rewards/rejected": -5.103187084197998, + "step": 2029 + }, + { + "epoch": 0.42, + "learning_rate": 1.1554621848739495e-05, + "logits/chosen": -2.0798726081848145, + "logits/rejected": -2.0783588886260986, + "logps/chosen": -304.9694519042969, + "logps/rejected": -366.6223449707031, + "loss": 0.1726, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.851839303970337, + "rewards/margins": 3.8220221996307373, + "rewards/rejected": -5.673861503601074, + "step": 2030 + }, + { + "epoch": 0.42, + "learning_rate": 1.1550420168067227e-05, + "logits/chosen": -2.346567153930664, + "logits/rejected": -2.2844364643096924, + "logps/chosen": -296.7462158203125, + "logps/rejected": -325.58624267578125, + "loss": 0.3088, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9146637916564941, + "rewards/margins": 3.0064821243286133, + "rewards/rejected": -4.921145915985107, + "step": 2031 + }, + { + "epoch": 0.43, + "learning_rate": 1.1546218487394957e-05, + "logits/chosen": -2.3003039360046387, + "logits/rejected": -1.7692413330078125, + "logps/chosen": -389.7088928222656, + "logps/rejected": -356.91424560546875, + "loss": 0.1603, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9001538753509521, + "rewards/margins": 4.908247470855713, + "rewards/rejected": -6.808401584625244, + "step": 2032 + }, + { + "epoch": 0.43, + "learning_rate": 1.154201680672269e-05, + "logits/chosen": -1.9862852096557617, + "logits/rejected": -2.068901538848877, + "logps/chosen": -370.74969482421875, + "logps/rejected": -373.4486083984375, + "loss": 1.1811, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.051518201828003, + "rewards/margins": 2.2835683822631836, + "rewards/rejected": -5.335086822509766, + "step": 2033 + }, + { + "epoch": 0.43, + "learning_rate": 1.153781512605042e-05, + "logits/chosen": -2.0426383018493652, + "logits/rejected": -1.8522021770477295, + "logps/chosen": -236.17050170898438, + "logps/rejected": -281.18817138671875, + "loss": 0.3454, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.7310116291046143, + "rewards/margins": 3.253507375717163, + "rewards/rejected": -5.984519004821777, + "step": 2034 + }, + { + "epoch": 0.43, + "learning_rate": 1.1533613445378151e-05, + "logits/chosen": -2.3632569313049316, + "logits/rejected": -2.0204625129699707, + "logps/chosen": -285.57525634765625, + "logps/rejected": -314.8445129394531, + "loss": 0.2469, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7121021747589111, + "rewards/margins": 3.498154401779175, + "rewards/rejected": -5.210256576538086, + "step": 2035 + }, + { + "epoch": 0.43, + "learning_rate": 1.1529411764705882e-05, + "logits/chosen": -2.1731150150299072, + "logits/rejected": -1.7058427333831787, + "logps/chosen": -302.15435791015625, + "logps/rejected": -299.70880126953125, + "loss": 0.3348, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.2195417881011963, + "rewards/margins": 3.850309371948242, + "rewards/rejected": -6.069850921630859, + "step": 2036 + }, + { + "epoch": 0.43, + "learning_rate": 1.1525210084033614e-05, + "logits/chosen": -2.0464119911193848, + "logits/rejected": -2.1896140575408936, + "logps/chosen": -249.23846435546875, + "logps/rejected": -341.70574951171875, + "loss": 0.2352, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.360543966293335, + "rewards/margins": 3.9387807846069336, + "rewards/rejected": -6.299324989318848, + "step": 2037 + }, + { + "epoch": 0.43, + "learning_rate": 1.1521008403361344e-05, + "logits/chosen": -2.086638927459717, + "logits/rejected": -1.889849305152893, + "logps/chosen": -251.52032470703125, + "logps/rejected": -281.9322814941406, + "loss": 0.203, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.70516037940979, + "rewards/margins": 4.880710601806641, + "rewards/rejected": -6.585871696472168, + "step": 2038 + }, + { + "epoch": 0.43, + "learning_rate": 1.1516806722689076e-05, + "logits/chosen": -1.9713646173477173, + "logits/rejected": -1.8880610466003418, + "logps/chosen": -320.4168395996094, + "logps/rejected": -385.7452392578125, + "loss": 0.3768, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7439255714416504, + "rewards/margins": 3.454660415649414, + "rewards/rejected": -5.1985859870910645, + "step": 2039 + }, + { + "epoch": 0.43, + "learning_rate": 1.1512605042016806e-05, + "logits/chosen": -2.108473300933838, + "logits/rejected": -1.8333741426467896, + "logps/chosen": -347.6883544921875, + "logps/rejected": -345.1535949707031, + "loss": 0.4493, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.845118522644043, + "rewards/margins": 3.5066580772399902, + "rewards/rejected": -6.351777076721191, + "step": 2040 + }, + { + "epoch": 0.43, + "learning_rate": 1.1508403361344538e-05, + "logits/chosen": -2.1279444694519043, + "logits/rejected": -2.014584541320801, + "logps/chosen": -361.067138671875, + "logps/rejected": -425.7370910644531, + "loss": 0.1364, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5322329998016357, + "rewards/margins": 4.137571811676025, + "rewards/rejected": -5.66980504989624, + "step": 2041 + }, + { + "epoch": 0.43, + "learning_rate": 1.150420168067227e-05, + "logits/chosen": -2.379103899002075, + "logits/rejected": -2.2502808570861816, + "logps/chosen": -423.4561767578125, + "logps/rejected": -397.1187744140625, + "loss": 0.3148, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7356139421463013, + "rewards/margins": 3.093418598175049, + "rewards/rejected": -4.8290324211120605, + "step": 2042 + }, + { + "epoch": 0.43, + "learning_rate": 1.15e-05, + "logits/chosen": -2.2598257064819336, + "logits/rejected": -1.6651756763458252, + "logps/chosen": -386.95611572265625, + "logps/rejected": -287.5246887207031, + "loss": 0.2514, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.4715769290924072, + "rewards/margins": 3.0825605392456055, + "rewards/rejected": -5.554137229919434, + "step": 2043 + }, + { + "epoch": 0.43, + "learning_rate": 1.1495798319327732e-05, + "logits/chosen": -2.1495726108551025, + "logits/rejected": -1.9794659614562988, + "logps/chosen": -388.66326904296875, + "logps/rejected": -351.50335693359375, + "loss": 0.1272, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5405120849609375, + "rewards/margins": 3.1952106952667236, + "rewards/rejected": -5.735722541809082, + "step": 2044 + }, + { + "epoch": 0.43, + "learning_rate": 1.1491596638655462e-05, + "logits/chosen": -2.0491394996643066, + "logits/rejected": -1.9735050201416016, + "logps/chosen": -263.5440673828125, + "logps/rejected": -291.84661865234375, + "loss": 0.3734, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.613436698913574, + "rewards/margins": 2.9057528972625732, + "rewards/rejected": -5.51918888092041, + "step": 2045 + }, + { + "epoch": 0.43, + "learning_rate": 1.1487394957983194e-05, + "logits/chosen": -2.035627841949463, + "logits/rejected": -2.0443267822265625, + "logps/chosen": -329.2646484375, + "logps/rejected": -437.09130859375, + "loss": 0.1607, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.0102715492248535, + "rewards/margins": 3.5766985416412354, + "rewards/rejected": -5.586970329284668, + "step": 2046 + }, + { + "epoch": 0.43, + "learning_rate": 1.1483193277310924e-05, + "logits/chosen": -1.938293695449829, + "logits/rejected": -2.2565715312957764, + "logps/chosen": -251.23255920410156, + "logps/rejected": -378.9906311035156, + "loss": 0.5835, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.1010117530822754, + "rewards/margins": 2.8104801177978516, + "rewards/rejected": -5.911492347717285, + "step": 2047 + }, + { + "epoch": 0.43, + "learning_rate": 1.1478991596638656e-05, + "logits/chosen": -1.8902827501296997, + "logits/rejected": -2.066514730453491, + "logps/chosen": -278.21295166015625, + "logps/rejected": -309.6601867675781, + "loss": 0.1273, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.574343204498291, + "rewards/margins": 4.671230792999268, + "rewards/rejected": -7.245573997497559, + "step": 2048 + }, + { + "epoch": 0.43, + "learning_rate": 1.1474789915966386e-05, + "logits/chosen": -2.1475770473480225, + "logits/rejected": -1.978366732597351, + "logps/chosen": -324.9825439453125, + "logps/rejected": -328.5615234375, + "loss": 0.5464, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.5895750522613525, + "rewards/margins": 2.695735216140747, + "rewards/rejected": -5.2853102684021, + "step": 2049 + }, + { + "epoch": 0.43, + "learning_rate": 1.1470588235294118e-05, + "logits/chosen": -2.171236991882324, + "logits/rejected": -2.039135694503784, + "logps/chosen": -287.06365966796875, + "logps/rejected": -313.1839599609375, + "loss": 0.2787, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.255889415740967, + "rewards/margins": 4.388477325439453, + "rewards/rejected": -6.644366264343262, + "step": 2050 + }, + { + "epoch": 0.43, + "learning_rate": 1.1466386554621849e-05, + "logits/chosen": -2.3846218585968018, + "logits/rejected": -2.0171148777008057, + "logps/chosen": -430.9451904296875, + "logps/rejected": -428.7223815917969, + "loss": 0.4191, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.113434076309204, + "rewards/margins": 2.795745849609375, + "rewards/rejected": -4.9091796875, + "step": 2051 + }, + { + "epoch": 0.43, + "learning_rate": 1.146218487394958e-05, + "logits/chosen": -2.164930820465088, + "logits/rejected": -1.7924004793167114, + "logps/chosen": -345.96697998046875, + "logps/rejected": -243.38168334960938, + "loss": 0.2555, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.1616978645324707, + "rewards/margins": 3.265957832336426, + "rewards/rejected": -5.4276556968688965, + "step": 2052 + }, + { + "epoch": 0.43, + "learning_rate": 1.145798319327731e-05, + "logits/chosen": -1.9983105659484863, + "logits/rejected": -2.1830124855041504, + "logps/chosen": -300.07928466796875, + "logps/rejected": -378.98406982421875, + "loss": 0.2549, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.333488941192627, + "rewards/margins": 3.7451555728912354, + "rewards/rejected": -6.078644275665283, + "step": 2053 + }, + { + "epoch": 0.43, + "learning_rate": 1.1453781512605043e-05, + "logits/chosen": -1.919573187828064, + "logits/rejected": -2.1717214584350586, + "logps/chosen": -226.31471252441406, + "logps/rejected": -268.5181884765625, + "loss": 0.6907, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.896489381790161, + "rewards/margins": 1.5752509832382202, + "rewards/rejected": -4.47174072265625, + "step": 2054 + }, + { + "epoch": 0.43, + "learning_rate": 1.1449579831932773e-05, + "logits/chosen": -2.1140739917755127, + "logits/rejected": -2.202899932861328, + "logps/chosen": -365.08599853515625, + "logps/rejected": -420.6261291503906, + "loss": 0.3417, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.3386406898498535, + "rewards/margins": 2.8463761806488037, + "rewards/rejected": -6.185017108917236, + "step": 2055 + }, + { + "epoch": 0.43, + "learning_rate": 1.1445378151260505e-05, + "logits/chosen": -1.8200733661651611, + "logits/rejected": -2.191620111465454, + "logps/chosen": -282.4912109375, + "logps/rejected": -390.8856201171875, + "loss": 0.3245, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.0334620475769043, + "rewards/margins": 3.3059329986572266, + "rewards/rejected": -6.339395046234131, + "step": 2056 + }, + { + "epoch": 0.43, + "learning_rate": 1.1441176470588235e-05, + "logits/chosen": -2.20247483253479, + "logits/rejected": -2.1901540756225586, + "logps/chosen": -292.2968444824219, + "logps/rejected": -309.931884765625, + "loss": 0.2491, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9867068529129028, + "rewards/margins": 3.555753231048584, + "rewards/rejected": -5.542459964752197, + "step": 2057 + }, + { + "epoch": 0.43, + "learning_rate": 1.1436974789915967e-05, + "logits/chosen": -2.0039258003234863, + "logits/rejected": -1.9227045774459839, + "logps/chosen": -250.51931762695312, + "logps/rejected": -339.1800537109375, + "loss": 0.2171, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.2139534950256348, + "rewards/margins": 3.0042152404785156, + "rewards/rejected": -6.218169212341309, + "step": 2058 + }, + { + "epoch": 0.43, + "learning_rate": 1.1432773109243697e-05, + "logits/chosen": -1.9491963386535645, + "logits/rejected": -2.0885331630706787, + "logps/chosen": -247.37596130371094, + "logps/rejected": -319.37261962890625, + "loss": 0.404, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.2546679973602295, + "rewards/margins": 3.4546799659729004, + "rewards/rejected": -5.709348201751709, + "step": 2059 + }, + { + "epoch": 0.43, + "learning_rate": 1.1428571428571429e-05, + "logits/chosen": -2.337691068649292, + "logits/rejected": -1.9971985816955566, + "logps/chosen": -377.20257568359375, + "logps/rejected": -356.19903564453125, + "loss": 0.2222, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.640131950378418, + "rewards/margins": 3.9922690391540527, + "rewards/rejected": -6.632400989532471, + "step": 2060 + }, + { + "epoch": 0.43, + "learning_rate": 1.142436974789916e-05, + "logits/chosen": -1.8078107833862305, + "logits/rejected": -1.9563109874725342, + "logps/chosen": -343.4014892578125, + "logps/rejected": -369.037109375, + "loss": 0.4549, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.637943983078003, + "rewards/margins": 2.3987245559692383, + "rewards/rejected": -5.03666877746582, + "step": 2061 + }, + { + "epoch": 0.43, + "learning_rate": 1.1420168067226891e-05, + "logits/chosen": -2.0936930179595947, + "logits/rejected": -1.9929474592208862, + "logps/chosen": -422.263671875, + "logps/rejected": -383.9462890625, + "loss": 0.324, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1107659339904785, + "rewards/margins": 3.9146995544433594, + "rewards/rejected": -6.025465488433838, + "step": 2062 + }, + { + "epoch": 0.43, + "learning_rate": 1.1415966386554621e-05, + "logits/chosen": -2.2708230018615723, + "logits/rejected": -1.8042092323303223, + "logps/chosen": -339.3675842285156, + "logps/rejected": -272.2579345703125, + "loss": 0.3843, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.3816967010498047, + "rewards/margins": 4.644870281219482, + "rewards/rejected": -7.026566982269287, + "step": 2063 + }, + { + "epoch": 0.43, + "learning_rate": 1.1411764705882353e-05, + "logits/chosen": -2.100377082824707, + "logits/rejected": -1.805079460144043, + "logps/chosen": -436.5498046875, + "logps/rejected": -280.3676452636719, + "loss": 0.233, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8288683891296387, + "rewards/margins": 3.222707986831665, + "rewards/rejected": -5.051576614379883, + "step": 2064 + }, + { + "epoch": 0.43, + "learning_rate": 1.1407563025210085e-05, + "logits/chosen": -2.2598533630371094, + "logits/rejected": -1.8235297203063965, + "logps/chosen": -359.3331298828125, + "logps/rejected": -332.5981750488281, + "loss": 0.1902, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.383780002593994, + "rewards/margins": 4.61159610748291, + "rewards/rejected": -6.9953765869140625, + "step": 2065 + }, + { + "epoch": 0.43, + "learning_rate": 1.1403361344537815e-05, + "logits/chosen": -2.011625051498413, + "logits/rejected": -2.0912132263183594, + "logps/chosen": -266.5169372558594, + "logps/rejected": -274.8509216308594, + "loss": 0.1585, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.510806083679199, + "rewards/margins": 3.6269052028656006, + "rewards/rejected": -6.137711524963379, + "step": 2066 + }, + { + "epoch": 0.43, + "learning_rate": 1.1399159663865547e-05, + "logits/chosen": -2.01057767868042, + "logits/rejected": -1.691219687461853, + "logps/chosen": -380.3301696777344, + "logps/rejected": -364.795166015625, + "loss": 0.0959, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.495618224143982, + "rewards/margins": 4.659554481506348, + "rewards/rejected": -6.155173301696777, + "step": 2067 + }, + { + "epoch": 0.43, + "learning_rate": 1.1394957983193278e-05, + "logits/chosen": -2.1961684226989746, + "logits/rejected": -2.151221513748169, + "logps/chosen": -290.4610290527344, + "logps/rejected": -376.06927490234375, + "loss": 0.6426, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.5207271575927734, + "rewards/margins": 2.2443108558654785, + "rewards/rejected": -5.765038013458252, + "step": 2068 + }, + { + "epoch": 0.43, + "learning_rate": 1.139075630252101e-05, + "logits/chosen": -1.9307758808135986, + "logits/rejected": -1.6932154893875122, + "logps/chosen": -250.8085174560547, + "logps/rejected": -266.97637939453125, + "loss": 0.4998, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.792975902557373, + "rewards/margins": 3.3512892723083496, + "rewards/rejected": -6.144266128540039, + "step": 2069 + }, + { + "epoch": 0.43, + "learning_rate": 1.138655462184874e-05, + "logits/chosen": -2.2497262954711914, + "logits/rejected": -2.2222139835357666, + "logps/chosen": -398.31011962890625, + "logps/rejected": -472.5359191894531, + "loss": 0.3235, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.7597455978393555, + "rewards/margins": 4.6423258781433105, + "rewards/rejected": -7.402071952819824, + "step": 2070 + }, + { + "epoch": 0.43, + "learning_rate": 1.1382352941176472e-05, + "logits/chosen": -1.9763514995574951, + "logits/rejected": -1.9554386138916016, + "logps/chosen": -344.38006591796875, + "logps/rejected": -343.58050537109375, + "loss": 0.2937, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.546339273452759, + "rewards/margins": 3.6334803104400635, + "rewards/rejected": -6.1798200607299805, + "step": 2071 + }, + { + "epoch": 0.43, + "learning_rate": 1.1378151260504202e-05, + "logits/chosen": -2.1349334716796875, + "logits/rejected": -2.1206350326538086, + "logps/chosen": -313.98760986328125, + "logps/rejected": -341.19451904296875, + "loss": 0.37, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.2789769172668457, + "rewards/margins": 4.54546594619751, + "rewards/rejected": -6.8244428634643555, + "step": 2072 + }, + { + "epoch": 0.43, + "learning_rate": 1.1373949579831934e-05, + "logits/chosen": -2.3179574012756348, + "logits/rejected": -1.8872590065002441, + "logps/chosen": -340.9142150878906, + "logps/rejected": -356.84735107421875, + "loss": 0.6791, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.6295087337493896, + "rewards/margins": 2.722949743270874, + "rewards/rejected": -5.352458953857422, + "step": 2073 + }, + { + "epoch": 0.43, + "learning_rate": 1.1369747899159664e-05, + "logits/chosen": -2.2450451850891113, + "logits/rejected": -1.8917794227600098, + "logps/chosen": -260.1798095703125, + "logps/rejected": -357.92303466796875, + "loss": 0.1589, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1378841400146484, + "rewards/margins": 5.381170272827148, + "rewards/rejected": -7.519054412841797, + "step": 2074 + }, + { + "epoch": 0.43, + "learning_rate": 1.1365546218487396e-05, + "logits/chosen": -2.0596959590911865, + "logits/rejected": -1.9514442682266235, + "logps/chosen": -301.93731689453125, + "logps/rejected": -374.26031494140625, + "loss": 0.228, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.668936014175415, + "rewards/margins": 4.4316558837890625, + "rewards/rejected": -7.100592613220215, + "step": 2075 + }, + { + "epoch": 0.43, + "learning_rate": 1.1361344537815126e-05, + "logits/chosen": -1.8287434577941895, + "logits/rejected": -1.4872750043869019, + "logps/chosen": -307.71392822265625, + "logps/rejected": -246.12623596191406, + "loss": 1.5487, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.520806074142456, + "rewards/margins": 1.386871576309204, + "rewards/rejected": -4.90767765045166, + "step": 2076 + }, + { + "epoch": 0.43, + "learning_rate": 1.1357142857142858e-05, + "logits/chosen": -2.097133159637451, + "logits/rejected": -2.046030044555664, + "logps/chosen": -286.571044921875, + "logps/rejected": -301.0132141113281, + "loss": 0.3351, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.337228775024414, + "rewards/margins": 3.833244800567627, + "rewards/rejected": -6.170473575592041, + "step": 2077 + }, + { + "epoch": 0.43, + "learning_rate": 1.1352941176470588e-05, + "logits/chosen": -2.148763656616211, + "logits/rejected": -2.0681514739990234, + "logps/chosen": -237.3824462890625, + "logps/rejected": -325.4210510253906, + "loss": 0.3731, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.7078304290771484, + "rewards/margins": 4.451881408691406, + "rewards/rejected": -7.159711837768555, + "step": 2078 + }, + { + "epoch": 0.43, + "learning_rate": 1.134873949579832e-05, + "logits/chosen": -1.891608715057373, + "logits/rejected": -1.807741641998291, + "logps/chosen": -314.13287353515625, + "logps/rejected": -319.89892578125, + "loss": 0.4133, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.19529390335083, + "rewards/margins": 2.7128686904907227, + "rewards/rejected": -5.9081621170043945, + "step": 2079 + }, + { + "epoch": 0.44, + "learning_rate": 1.134453781512605e-05, + "logits/chosen": -2.2419192790985107, + "logits/rejected": -1.522952675819397, + "logps/chosen": -299.9716796875, + "logps/rejected": -298.77581787109375, + "loss": 0.5774, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.1659088134765625, + "rewards/margins": 2.2226810455322266, + "rewards/rejected": -5.388589859008789, + "step": 2080 + }, + { + "epoch": 0.44, + "learning_rate": 1.1340336134453782e-05, + "logits/chosen": -2.11307954788208, + "logits/rejected": -1.7963145971298218, + "logps/chosen": -409.2318420410156, + "logps/rejected": -387.35028076171875, + "loss": 0.7589, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.7387428283691406, + "rewards/margins": 2.3906798362731934, + "rewards/rejected": -5.129422187805176, + "step": 2081 + }, + { + "epoch": 0.44, + "learning_rate": 1.1336134453781513e-05, + "logits/chosen": -2.056333065032959, + "logits/rejected": -1.7021454572677612, + "logps/chosen": -266.5394287109375, + "logps/rejected": -356.16705322265625, + "loss": 0.3234, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.8668999671936035, + "rewards/margins": 3.9565298557281494, + "rewards/rejected": -6.823429107666016, + "step": 2082 + }, + { + "epoch": 0.44, + "learning_rate": 1.1331932773109244e-05, + "logits/chosen": -2.128643035888672, + "logits/rejected": -2.211228847503662, + "logps/chosen": -265.26055908203125, + "logps/rejected": -286.2271423339844, + "loss": 0.4343, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.1598892211914062, + "rewards/margins": 3.7516732215881348, + "rewards/rejected": -6.911562919616699, + "step": 2083 + }, + { + "epoch": 0.44, + "learning_rate": 1.1327731092436975e-05, + "logits/chosen": -1.9741413593292236, + "logits/rejected": -1.753514289855957, + "logps/chosen": -282.8174743652344, + "logps/rejected": -285.58099365234375, + "loss": 0.2032, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6921117305755615, + "rewards/margins": 4.4303131103515625, + "rewards/rejected": -6.122424602508545, + "step": 2084 + }, + { + "epoch": 0.44, + "learning_rate": 1.1323529411764707e-05, + "logits/chosen": -1.9598090648651123, + "logits/rejected": -1.7841306924819946, + "logps/chosen": -267.2647399902344, + "logps/rejected": -262.5185852050781, + "loss": 0.2939, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.8312489986419678, + "rewards/margins": 3.528136968612671, + "rewards/rejected": -6.359385967254639, + "step": 2085 + }, + { + "epoch": 0.44, + "learning_rate": 1.1319327731092439e-05, + "logits/chosen": -2.2115390300750732, + "logits/rejected": -2.172428846359253, + "logps/chosen": -329.2820129394531, + "logps/rejected": -415.5566711425781, + "loss": 0.3283, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.2381389141082764, + "rewards/margins": 3.159836769104004, + "rewards/rejected": -5.397975921630859, + "step": 2086 + }, + { + "epoch": 0.44, + "learning_rate": 1.1315126050420169e-05, + "logits/chosen": -2.237715244293213, + "logits/rejected": -2.0016987323760986, + "logps/chosen": -263.94195556640625, + "logps/rejected": -311.2554931640625, + "loss": 0.289, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.5013856887817383, + "rewards/margins": 5.073391914367676, + "rewards/rejected": -7.574777126312256, + "step": 2087 + }, + { + "epoch": 0.44, + "learning_rate": 1.13109243697479e-05, + "logits/chosen": -2.374457359313965, + "logits/rejected": -1.9431941509246826, + "logps/chosen": -539.1976318359375, + "logps/rejected": -495.11175537109375, + "loss": 0.0688, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0853111743927002, + "rewards/margins": 5.777039051055908, + "rewards/rejected": -6.862349987030029, + "step": 2088 + }, + { + "epoch": 0.44, + "learning_rate": 1.1306722689075631e-05, + "logits/chosen": -1.7959283590316772, + "logits/rejected": -1.8263540267944336, + "logps/chosen": -394.26483154296875, + "logps/rejected": -279.94378662109375, + "loss": 0.6829, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.368159294128418, + "rewards/margins": 2.0748109817504883, + "rewards/rejected": -5.442970275878906, + "step": 2089 + }, + { + "epoch": 0.44, + "learning_rate": 1.1302521008403363e-05, + "logits/chosen": -2.4007978439331055, + "logits/rejected": -1.8503947257995605, + "logps/chosen": -430.58837890625, + "logps/rejected": -389.11370849609375, + "loss": 0.1725, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.239142417907715, + "rewards/margins": 3.3423779010772705, + "rewards/rejected": -5.581520080566406, + "step": 2090 + }, + { + "epoch": 0.44, + "learning_rate": 1.1298319327731093e-05, + "logits/chosen": -2.0770785808563232, + "logits/rejected": -2.238295555114746, + "logps/chosen": -210.31890869140625, + "logps/rejected": -340.9302673339844, + "loss": 0.4006, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.704918384552002, + "rewards/margins": 3.581852912902832, + "rewards/rejected": -6.286771774291992, + "step": 2091 + }, + { + "epoch": 0.44, + "learning_rate": 1.1294117647058825e-05, + "logits/chosen": -1.7465485334396362, + "logits/rejected": -2.1249797344207764, + "logps/chosen": -197.56185913085938, + "logps/rejected": -380.1848449707031, + "loss": 0.1264, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.786294937133789, + "rewards/margins": 4.378789901733398, + "rewards/rejected": -7.165083885192871, + "step": 2092 + }, + { + "epoch": 0.44, + "learning_rate": 1.1289915966386555e-05, + "logits/chosen": -1.9650239944458008, + "logits/rejected": -1.7887494564056396, + "logps/chosen": -327.19775390625, + "logps/rejected": -273.880615234375, + "loss": 0.6284, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.466172218322754, + "rewards/margins": 2.816950798034668, + "rewards/rejected": -5.283123016357422, + "step": 2093 + }, + { + "epoch": 0.44, + "learning_rate": 1.1285714285714287e-05, + "logits/chosen": -2.035176992416382, + "logits/rejected": -1.6705471277236938, + "logps/chosen": -230.17788696289062, + "logps/rejected": -295.9740295410156, + "loss": 0.1913, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.7242836952209473, + "rewards/margins": 3.9833855628967285, + "rewards/rejected": -6.707669734954834, + "step": 2094 + }, + { + "epoch": 0.44, + "learning_rate": 1.1281512605042017e-05, + "logits/chosen": -2.362687349319458, + "logits/rejected": -2.008629322052002, + "logps/chosen": -354.7229309082031, + "logps/rejected": -328.02349853515625, + "loss": 0.6579, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.88460111618042, + "rewards/margins": 3.4469523429870605, + "rewards/rejected": -6.3315534591674805, + "step": 2095 + }, + { + "epoch": 0.44, + "learning_rate": 1.127731092436975e-05, + "logits/chosen": -1.960052251815796, + "logits/rejected": -1.850200891494751, + "logps/chosen": -277.55389404296875, + "logps/rejected": -382.36468505859375, + "loss": 0.1642, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.284581184387207, + "rewards/margins": 3.7295641899108887, + "rewards/rejected": -6.014145374298096, + "step": 2096 + }, + { + "epoch": 0.44, + "learning_rate": 1.127310924369748e-05, + "logits/chosen": -2.088284492492676, + "logits/rejected": -1.6026382446289062, + "logps/chosen": -403.4964904785156, + "logps/rejected": -332.2255859375, + "loss": 0.1538, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.50394868850708, + "rewards/margins": 4.2063798904418945, + "rewards/rejected": -6.710328578948975, + "step": 2097 + }, + { + "epoch": 0.44, + "learning_rate": 1.1268907563025211e-05, + "logits/chosen": -2.0808637142181396, + "logits/rejected": -2.164285182952881, + "logps/chosen": -346.36553955078125, + "logps/rejected": -343.06097412109375, + "loss": 0.1462, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.0551834106445312, + "rewards/margins": 3.9173378944396973, + "rewards/rejected": -5.97252082824707, + "step": 2098 + }, + { + "epoch": 0.44, + "learning_rate": 1.1264705882352942e-05, + "logits/chosen": -2.0195608139038086, + "logits/rejected": -1.7941190004348755, + "logps/chosen": -264.0512390136719, + "logps/rejected": -266.022216796875, + "loss": 0.3801, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.468585968017578, + "rewards/margins": 3.57623291015625, + "rewards/rejected": -6.044818878173828, + "step": 2099 + }, + { + "epoch": 0.44, + "learning_rate": 1.1260504201680673e-05, + "logits/chosen": -2.32964825630188, + "logits/rejected": -1.603450894355774, + "logps/chosen": -410.2392578125, + "logps/rejected": -377.18988037109375, + "loss": 0.371, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9095793962478638, + "rewards/margins": 4.100839138031006, + "rewards/rejected": -6.010417938232422, + "step": 2100 + }, + { + "epoch": 0.44, + "learning_rate": 1.1256302521008404e-05, + "logits/chosen": -2.290189743041992, + "logits/rejected": -2.090519905090332, + "logps/chosen": -273.4089660644531, + "logps/rejected": -293.29266357421875, + "loss": 0.1521, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7961888313293457, + "rewards/margins": 4.478764057159424, + "rewards/rejected": -6.2749528884887695, + "step": 2101 + }, + { + "epoch": 0.44, + "learning_rate": 1.1252100840336136e-05, + "logits/chosen": -2.324431896209717, + "logits/rejected": -2.2318320274353027, + "logps/chosen": -268.22857666015625, + "logps/rejected": -277.08648681640625, + "loss": 0.2325, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.608604073524475, + "rewards/margins": 3.6065711975097656, + "rewards/rejected": -5.215175151824951, + "step": 2102 + }, + { + "epoch": 0.44, + "learning_rate": 1.1247899159663866e-05, + "logits/chosen": -1.8101016283035278, + "logits/rejected": -1.7072787284851074, + "logps/chosen": -216.84495544433594, + "logps/rejected": -259.4885559082031, + "loss": 0.3058, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8696796894073486, + "rewards/margins": 4.031265735626221, + "rewards/rejected": -5.900945663452148, + "step": 2103 + }, + { + "epoch": 0.44, + "learning_rate": 1.1243697478991598e-05, + "logits/chosen": -1.866039752960205, + "logits/rejected": -2.0375428199768066, + "logps/chosen": -332.8404235839844, + "logps/rejected": -310.45416259765625, + "loss": 0.4658, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.1105406284332275, + "rewards/margins": 4.041823387145996, + "rewards/rejected": -6.152364253997803, + "step": 2104 + }, + { + "epoch": 0.44, + "learning_rate": 1.1239495798319328e-05, + "logits/chosen": -2.2641921043395996, + "logits/rejected": -2.3262248039245605, + "logps/chosen": -336.8399353027344, + "logps/rejected": -428.47216796875, + "loss": 0.2122, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.3530216217041016, + "rewards/margins": 4.220526695251465, + "rewards/rejected": -6.573548316955566, + "step": 2105 + }, + { + "epoch": 0.44, + "learning_rate": 1.123529411764706e-05, + "logits/chosen": -2.3589253425598145, + "logits/rejected": -1.8773760795593262, + "logps/chosen": -250.96490478515625, + "logps/rejected": -235.406982421875, + "loss": 0.3002, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.2207822799682617, + "rewards/margins": 4.55903959274292, + "rewards/rejected": -6.779821872711182, + "step": 2106 + }, + { + "epoch": 0.44, + "learning_rate": 1.123109243697479e-05, + "logits/chosen": -1.6518869400024414, + "logits/rejected": -1.560832142829895, + "logps/chosen": -392.82037353515625, + "logps/rejected": -301.0335388183594, + "loss": 0.1628, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.157667875289917, + "rewards/margins": 4.188111782073975, + "rewards/rejected": -6.3457794189453125, + "step": 2107 + }, + { + "epoch": 0.44, + "learning_rate": 1.1226890756302522e-05, + "logits/chosen": -1.927270770072937, + "logits/rejected": -2.0700478553771973, + "logps/chosen": -218.9384002685547, + "logps/rejected": -491.71588134765625, + "loss": 0.2734, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5728435516357422, + "rewards/margins": 4.213078022003174, + "rewards/rejected": -5.785921573638916, + "step": 2108 + }, + { + "epoch": 0.44, + "learning_rate": 1.1222689075630254e-05, + "logits/chosen": -2.2030696868896484, + "logits/rejected": -2.2147703170776367, + "logps/chosen": -336.65985107421875, + "logps/rejected": -443.0171203613281, + "loss": 0.8827, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.138679027557373, + "rewards/margins": 1.965569019317627, + "rewards/rejected": -4.104248046875, + "step": 2109 + }, + { + "epoch": 0.44, + "learning_rate": 1.1218487394957984e-05, + "logits/chosen": -2.1268224716186523, + "logits/rejected": -2.117046594619751, + "logps/chosen": -243.7693328857422, + "logps/rejected": -322.23895263671875, + "loss": 0.3461, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.841224193572998, + "rewards/margins": 4.569146156311035, + "rewards/rejected": -6.410369873046875, + "step": 2110 + }, + { + "epoch": 0.44, + "learning_rate": 1.1214285714285716e-05, + "logits/chosen": -1.9013772010803223, + "logits/rejected": -1.8576560020446777, + "logps/chosen": -400.218994140625, + "logps/rejected": -378.9980163574219, + "loss": 0.3474, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.2907309532165527, + "rewards/margins": 2.286616802215576, + "rewards/rejected": -4.577347755432129, + "step": 2111 + }, + { + "epoch": 0.44, + "learning_rate": 1.1210084033613446e-05, + "logits/chosen": -2.185497760772705, + "logits/rejected": -1.4457454681396484, + "logps/chosen": -395.5505676269531, + "logps/rejected": -306.52978515625, + "loss": 0.2547, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6021113395690918, + "rewards/margins": 3.7774672508239746, + "rewards/rejected": -5.379578590393066, + "step": 2112 + }, + { + "epoch": 0.44, + "learning_rate": 1.1205882352941178e-05, + "logits/chosen": -2.0680925846099854, + "logits/rejected": -2.034123420715332, + "logps/chosen": -317.09588623046875, + "logps/rejected": -293.72576904296875, + "loss": 0.3705, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2642791271209717, + "rewards/margins": 3.343371868133545, + "rewards/rejected": -4.6076507568359375, + "step": 2113 + }, + { + "epoch": 0.44, + "learning_rate": 1.1201680672268908e-05, + "logits/chosen": -2.0487797260284424, + "logits/rejected": -1.839284896850586, + "logps/chosen": -399.13446044921875, + "logps/rejected": -408.6397705078125, + "loss": 0.1207, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1105506420135498, + "rewards/margins": 4.683767318725586, + "rewards/rejected": -5.794317245483398, + "step": 2114 + }, + { + "epoch": 0.44, + "learning_rate": 1.119747899159664e-05, + "logits/chosen": -2.248800039291382, + "logits/rejected": -1.820990800857544, + "logps/chosen": -327.5166015625, + "logps/rejected": -275.72821044921875, + "loss": 0.3841, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.0566530227661133, + "rewards/margins": 3.134023666381836, + "rewards/rejected": -5.190676689147949, + "step": 2115 + }, + { + "epoch": 0.44, + "learning_rate": 1.119327731092437e-05, + "logits/chosen": -2.3500242233276367, + "logits/rejected": -2.351048469543457, + "logps/chosen": -288.04998779296875, + "logps/rejected": -322.3257141113281, + "loss": 0.6152, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.28229022026062, + "rewards/margins": 2.140395402908325, + "rewards/rejected": -4.422685623168945, + "step": 2116 + }, + { + "epoch": 0.44, + "learning_rate": 1.1189075630252102e-05, + "logits/chosen": -2.2064483165740967, + "logits/rejected": -2.0624213218688965, + "logps/chosen": -224.48138427734375, + "logps/rejected": -266.416748046875, + "loss": 0.2117, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9427436590194702, + "rewards/margins": 4.758347511291504, + "rewards/rejected": -6.7010908126831055, + "step": 2117 + }, + { + "epoch": 0.44, + "learning_rate": 1.1184873949579833e-05, + "logits/chosen": -1.919853925704956, + "logits/rejected": -2.0803310871124268, + "logps/chosen": -405.0791931152344, + "logps/rejected": -391.9491882324219, + "loss": 0.2733, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6223969459533691, + "rewards/margins": 3.929147720336914, + "rewards/rejected": -5.551544189453125, + "step": 2118 + }, + { + "epoch": 0.44, + "learning_rate": 1.1180672268907565e-05, + "logits/chosen": -2.158562660217285, + "logits/rejected": -2.0884718894958496, + "logps/chosen": -245.0634002685547, + "logps/rejected": -270.0154113769531, + "loss": 0.6614, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.7728772163391113, + "rewards/margins": 1.5978729724884033, + "rewards/rejected": -4.3707499504089355, + "step": 2119 + }, + { + "epoch": 0.44, + "learning_rate": 1.1176470588235295e-05, + "logits/chosen": -2.3036646842956543, + "logits/rejected": -1.7940211296081543, + "logps/chosen": -395.62701416015625, + "logps/rejected": -282.34710693359375, + "loss": 0.3757, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5422285795211792, + "rewards/margins": 2.4022321701049805, + "rewards/rejected": -3.944460868835449, + "step": 2120 + }, + { + "epoch": 0.44, + "learning_rate": 1.1172268907563027e-05, + "logits/chosen": -2.3628814220428467, + "logits/rejected": -1.8830676078796387, + "logps/chosen": -296.4643249511719, + "logps/rejected": -305.5382385253906, + "loss": 0.2552, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4281736612319946, + "rewards/margins": 3.2428548336029053, + "rewards/rejected": -4.6710286140441895, + "step": 2121 + }, + { + "epoch": 0.44, + "learning_rate": 1.1168067226890757e-05, + "logits/chosen": -2.156634569168091, + "logits/rejected": -2.038304328918457, + "logps/chosen": -284.01422119140625, + "logps/rejected": -334.09405517578125, + "loss": 0.6048, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.1160361766815186, + "rewards/margins": 3.0303492546081543, + "rewards/rejected": -5.146385192871094, + "step": 2122 + }, + { + "epoch": 0.44, + "learning_rate": 1.1163865546218489e-05, + "logits/chosen": -2.6387832164764404, + "logits/rejected": -2.0389580726623535, + "logps/chosen": -356.20269775390625, + "logps/rejected": -249.40695190429688, + "loss": 0.5059, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8262940645217896, + "rewards/margins": 2.527844190597534, + "rewards/rejected": -4.354138374328613, + "step": 2123 + }, + { + "epoch": 0.44, + "learning_rate": 1.1159663865546219e-05, + "logits/chosen": -2.007159948348999, + "logits/rejected": -2.019857883453369, + "logps/chosen": -324.50885009765625, + "logps/rejected": -446.41827392578125, + "loss": 0.2346, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0337823629379272, + "rewards/margins": 4.590088367462158, + "rewards/rejected": -5.623870849609375, + "step": 2124 + }, + { + "epoch": 0.44, + "learning_rate": 1.1155462184873951e-05, + "logits/chosen": -2.3467788696289062, + "logits/rejected": -1.712422251701355, + "logps/chosen": -376.94354248046875, + "logps/rejected": -292.44635009765625, + "loss": 0.118, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9205886125564575, + "rewards/margins": 5.446073532104492, + "rewards/rejected": -6.366662502288818, + "step": 2125 + }, + { + "epoch": 0.44, + "learning_rate": 1.1151260504201681e-05, + "logits/chosen": -2.0627143383026123, + "logits/rejected": -2.0900228023529053, + "logps/chosen": -253.41395568847656, + "logps/rejected": -315.8117980957031, + "loss": 0.3165, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6969473361968994, + "rewards/margins": 3.3166635036468506, + "rewards/rejected": -5.01361083984375, + "step": 2126 + }, + { + "epoch": 0.44, + "learning_rate": 1.1147058823529413e-05, + "logits/chosen": -2.214221239089966, + "logits/rejected": -2.2162487506866455, + "logps/chosen": -315.79638671875, + "logps/rejected": -335.0633544921875, + "loss": 0.2653, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.503174066543579, + "rewards/margins": 3.8282437324523926, + "rewards/rejected": -5.331417083740234, + "step": 2127 + }, + { + "epoch": 0.45, + "learning_rate": 1.1142857142857143e-05, + "logits/chosen": -2.1454780101776123, + "logits/rejected": -2.429685115814209, + "logps/chosen": -271.5894470214844, + "logps/rejected": -331.1427001953125, + "loss": 0.2079, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8967690467834473, + "rewards/margins": 4.363072395324707, + "rewards/rejected": -6.259840488433838, + "step": 2128 + }, + { + "epoch": 0.45, + "learning_rate": 1.1138655462184875e-05, + "logits/chosen": -2.0567209720611572, + "logits/rejected": -1.8903017044067383, + "logps/chosen": -221.18536376953125, + "logps/rejected": -244.03857421875, + "loss": 0.2921, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.0792124271392822, + "rewards/margins": 3.994507312774658, + "rewards/rejected": -6.0737199783325195, + "step": 2129 + }, + { + "epoch": 0.45, + "learning_rate": 1.1134453781512606e-05, + "logits/chosen": -2.1531128883361816, + "logits/rejected": -1.928863286972046, + "logps/chosen": -351.0466613769531, + "logps/rejected": -335.0560607910156, + "loss": 0.3611, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4166717529296875, + "rewards/margins": 2.5792300701141357, + "rewards/rejected": -3.9959020614624023, + "step": 2130 + }, + { + "epoch": 0.45, + "learning_rate": 1.1130252100840337e-05, + "logits/chosen": -2.2557971477508545, + "logits/rejected": -2.076328992843628, + "logps/chosen": -354.7710266113281, + "logps/rejected": -359.93536376953125, + "loss": 0.1661, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.266421318054199, + "rewards/margins": 3.2916550636291504, + "rewards/rejected": -5.55807638168335, + "step": 2131 + }, + { + "epoch": 0.45, + "learning_rate": 1.112605042016807e-05, + "logits/chosen": -2.0067272186279297, + "logits/rejected": -1.946378231048584, + "logps/chosen": -340.98028564453125, + "logps/rejected": -387.01654052734375, + "loss": 0.5698, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6974260807037354, + "rewards/margins": 2.0739426612854004, + "rewards/rejected": -3.771368980407715, + "step": 2132 + }, + { + "epoch": 0.45, + "learning_rate": 1.11218487394958e-05, + "logits/chosen": -2.0942511558532715, + "logits/rejected": -1.7318954467773438, + "logps/chosen": -278.7694396972656, + "logps/rejected": -230.17576599121094, + "loss": 0.2503, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.154672384262085, + "rewards/margins": 3.4053964614868164, + "rewards/rejected": -5.5600690841674805, + "step": 2133 + }, + { + "epoch": 0.45, + "learning_rate": 1.1117647058823531e-05, + "logits/chosen": -2.133626937866211, + "logits/rejected": -1.4854366779327393, + "logps/chosen": -320.07464599609375, + "logps/rejected": -260.7422790527344, + "loss": 0.2469, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6537978649139404, + "rewards/margins": 2.7697830200195312, + "rewards/rejected": -4.423581123352051, + "step": 2134 + }, + { + "epoch": 0.45, + "learning_rate": 1.1113445378151262e-05, + "logits/chosen": -2.3014333248138428, + "logits/rejected": -1.8766543865203857, + "logps/chosen": -327.1843566894531, + "logps/rejected": -240.4751434326172, + "loss": 0.4028, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9308364391326904, + "rewards/margins": 2.4663476943969727, + "rewards/rejected": -4.397184371948242, + "step": 2135 + }, + { + "epoch": 0.45, + "learning_rate": 1.1109243697478994e-05, + "logits/chosen": -2.3082189559936523, + "logits/rejected": -2.0221683979034424, + "logps/chosen": -433.845703125, + "logps/rejected": -477.98236083984375, + "loss": 0.1283, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2177271842956543, + "rewards/margins": 4.435600280761719, + "rewards/rejected": -5.653327465057373, + "step": 2136 + }, + { + "epoch": 0.45, + "learning_rate": 1.1105042016806724e-05, + "logits/chosen": -2.1207849979400635, + "logits/rejected": -2.1162607669830322, + "logps/chosen": -477.9659423828125, + "logps/rejected": -376.99041748046875, + "loss": 0.5491, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.758960485458374, + "rewards/margins": 3.101200580596924, + "rewards/rejected": -4.860160827636719, + "step": 2137 + }, + { + "epoch": 0.45, + "learning_rate": 1.1100840336134456e-05, + "logits/chosen": -2.0040762424468994, + "logits/rejected": -1.7547401189804077, + "logps/chosen": -222.41940307617188, + "logps/rejected": -244.71957397460938, + "loss": 0.2128, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7893508672714233, + "rewards/margins": 3.536910057067871, + "rewards/rejected": -5.326261043548584, + "step": 2138 + }, + { + "epoch": 0.45, + "learning_rate": 1.1096638655462186e-05, + "logits/chosen": -2.021898031234741, + "logits/rejected": -2.126206398010254, + "logps/chosen": -281.96881103515625, + "logps/rejected": -422.16717529296875, + "loss": 0.1347, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1842364072799683, + "rewards/margins": 3.9649136066436768, + "rewards/rejected": -5.149150371551514, + "step": 2139 + }, + { + "epoch": 0.45, + "learning_rate": 1.1092436974789918e-05, + "logits/chosen": -1.9356483221054077, + "logits/rejected": -1.9482295513153076, + "logps/chosen": -348.6847229003906, + "logps/rejected": -339.896240234375, + "loss": 0.4361, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6491668224334717, + "rewards/margins": 3.3715784549713135, + "rewards/rejected": -5.020745754241943, + "step": 2140 + }, + { + "epoch": 0.45, + "learning_rate": 1.1088235294117648e-05, + "logits/chosen": -1.9045846462249756, + "logits/rejected": -1.9601390361785889, + "logps/chosen": -254.6430206298828, + "logps/rejected": -256.7317810058594, + "loss": 0.4325, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.2346580028533936, + "rewards/margins": 1.3620989322662354, + "rewards/rejected": -3.596756935119629, + "step": 2141 + }, + { + "epoch": 0.45, + "learning_rate": 1.108403361344538e-05, + "logits/chosen": -2.0899438858032227, + "logits/rejected": -1.9953680038452148, + "logps/chosen": -341.23724365234375, + "logps/rejected": -350.5480651855469, + "loss": 0.2085, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4544007778167725, + "rewards/margins": 3.8809330463409424, + "rewards/rejected": -5.335333824157715, + "step": 2142 + }, + { + "epoch": 0.45, + "learning_rate": 1.107983193277311e-05, + "logits/chosen": -1.807093620300293, + "logits/rejected": -2.1782169342041016, + "logps/chosen": -284.1681213378906, + "logps/rejected": -350.2537536621094, + "loss": 0.2749, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8590219020843506, + "rewards/margins": 4.617184162139893, + "rewards/rejected": -6.476205348968506, + "step": 2143 + }, + { + "epoch": 0.45, + "learning_rate": 1.1075630252100842e-05, + "logits/chosen": -1.9736236333847046, + "logits/rejected": -1.7161903381347656, + "logps/chosen": -329.498291015625, + "logps/rejected": -256.70166015625, + "loss": 0.1526, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.075540065765381, + "rewards/margins": 4.444458484649658, + "rewards/rejected": -6.519998550415039, + "step": 2144 + }, + { + "epoch": 0.45, + "learning_rate": 1.1071428571428572e-05, + "logits/chosen": -2.0499110221862793, + "logits/rejected": -1.6978867053985596, + "logps/chosen": -438.6824035644531, + "logps/rejected": -420.48138427734375, + "loss": 0.1331, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.453322410583496, + "rewards/margins": 4.202869415283203, + "rewards/rejected": -5.656192302703857, + "step": 2145 + }, + { + "epoch": 0.45, + "learning_rate": 1.1067226890756304e-05, + "logits/chosen": -2.0777339935302734, + "logits/rejected": -1.7196691036224365, + "logps/chosen": -406.0032958984375, + "logps/rejected": -299.55535888671875, + "loss": 0.4558, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.3479602336883545, + "rewards/margins": 2.849907398223877, + "rewards/rejected": -5.197867393493652, + "step": 2146 + }, + { + "epoch": 0.45, + "learning_rate": 1.1063025210084035e-05, + "logits/chosen": -1.9819612503051758, + "logits/rejected": -1.8589093685150146, + "logps/chosen": -380.8150634765625, + "logps/rejected": -316.04998779296875, + "loss": 0.4447, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2182402610778809, + "rewards/margins": 2.380608081817627, + "rewards/rejected": -3.598848342895508, + "step": 2147 + }, + { + "epoch": 0.45, + "learning_rate": 1.1058823529411766e-05, + "logits/chosen": -2.1406993865966797, + "logits/rejected": -1.9022096395492554, + "logps/chosen": -255.685546875, + "logps/rejected": -284.3907775878906, + "loss": 0.2504, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.062098503112793, + "rewards/margins": 3.99426007270813, + "rewards/rejected": -6.056358337402344, + "step": 2148 + }, + { + "epoch": 0.45, + "learning_rate": 1.1054621848739497e-05, + "logits/chosen": -2.0744566917419434, + "logits/rejected": -1.6592695713043213, + "logps/chosen": -414.8008117675781, + "logps/rejected": -374.22576904296875, + "loss": 0.541, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8700759410858154, + "rewards/margins": 2.453132390975952, + "rewards/rejected": -4.323208332061768, + "step": 2149 + }, + { + "epoch": 0.45, + "learning_rate": 1.1050420168067229e-05, + "logits/chosen": -2.2994801998138428, + "logits/rejected": -2.1311941146850586, + "logps/chosen": -373.1037902832031, + "logps/rejected": -370.64605712890625, + "loss": 0.44, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.4412970542907715, + "rewards/margins": 2.494722843170166, + "rewards/rejected": -4.936020374298096, + "step": 2150 + }, + { + "epoch": 0.45, + "learning_rate": 1.1046218487394959e-05, + "logits/chosen": -2.1855316162109375, + "logits/rejected": -1.809584617614746, + "logps/chosen": -353.017578125, + "logps/rejected": -376.0660400390625, + "loss": 0.2531, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.504176616668701, + "rewards/margins": 3.7505970001220703, + "rewards/rejected": -6.2547736167907715, + "step": 2151 + }, + { + "epoch": 0.45, + "learning_rate": 1.104201680672269e-05, + "logits/chosen": -2.1064038276672363, + "logits/rejected": -2.291184425354004, + "logps/chosen": -277.2584228515625, + "logps/rejected": -320.9295654296875, + "loss": 0.2134, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.748108983039856, + "rewards/margins": 5.320732593536377, + "rewards/rejected": -7.06884241104126, + "step": 2152 + }, + { + "epoch": 0.45, + "learning_rate": 1.1037815126050423e-05, + "logits/chosen": -1.3586311340332031, + "logits/rejected": -2.143803358078003, + "logps/chosen": -141.74215698242188, + "logps/rejected": -416.4621887207031, + "loss": 0.1852, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.5869956016540527, + "rewards/margins": 5.488154411315918, + "rewards/rejected": -8.075149536132812, + "step": 2153 + }, + { + "epoch": 0.45, + "learning_rate": 1.1033613445378153e-05, + "logits/chosen": -2.063504934310913, + "logits/rejected": -2.1399271488189697, + "logps/chosen": -403.47430419921875, + "logps/rejected": -408.6955871582031, + "loss": 0.4129, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.31896710395813, + "rewards/margins": 2.337878704071045, + "rewards/rejected": -5.656845569610596, + "step": 2154 + }, + { + "epoch": 0.45, + "learning_rate": 1.1029411764705885e-05, + "logits/chosen": -2.310912847518921, + "logits/rejected": -1.835289716720581, + "logps/chosen": -273.4323425292969, + "logps/rejected": -296.7527770996094, + "loss": 0.2315, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.002804756164551, + "rewards/margins": 3.6010003089904785, + "rewards/rejected": -5.603805065155029, + "step": 2155 + }, + { + "epoch": 0.45, + "learning_rate": 1.1025210084033615e-05, + "logits/chosen": -1.9482005834579468, + "logits/rejected": -2.0161092281341553, + "logps/chosen": -231.42431640625, + "logps/rejected": -336.76190185546875, + "loss": 0.2457, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.353275775909424, + "rewards/margins": 4.855534553527832, + "rewards/rejected": -7.208811283111572, + "step": 2156 + }, + { + "epoch": 0.45, + "learning_rate": 1.1021008403361347e-05, + "logits/chosen": -2.066718339920044, + "logits/rejected": -2.2583022117614746, + "logps/chosen": -256.9452819824219, + "logps/rejected": -453.9226989746094, + "loss": 0.3597, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.7138071060180664, + "rewards/margins": 3.7090206146240234, + "rewards/rejected": -6.422828197479248, + "step": 2157 + }, + { + "epoch": 0.45, + "learning_rate": 1.1016806722689077e-05, + "logits/chosen": -2.4758973121643066, + "logits/rejected": -1.9720137119293213, + "logps/chosen": -396.96002197265625, + "logps/rejected": -369.3337097167969, + "loss": 0.0442, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8153385519981384, + "rewards/margins": 5.693082332611084, + "rewards/rejected": -6.508420467376709, + "step": 2158 + }, + { + "epoch": 0.45, + "learning_rate": 1.1012605042016809e-05, + "logits/chosen": -1.9146405458450317, + "logits/rejected": -1.9971437454223633, + "logps/chosen": -275.4366760253906, + "logps/rejected": -303.45977783203125, + "loss": 0.7363, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.3198189735412598, + "rewards/margins": 2.8516457080841064, + "rewards/rejected": -5.171464920043945, + "step": 2159 + }, + { + "epoch": 0.45, + "learning_rate": 1.100840336134454e-05, + "logits/chosen": -2.0020523071289062, + "logits/rejected": -2.1226553916931152, + "logps/chosen": -315.7077941894531, + "logps/rejected": -361.6723937988281, + "loss": 0.6878, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.637564182281494, + "rewards/margins": 2.7419075965881348, + "rewards/rejected": -5.379471778869629, + "step": 2160 + }, + { + "epoch": 0.45, + "learning_rate": 1.1004201680672271e-05, + "logits/chosen": -2.408409595489502, + "logits/rejected": -2.198552131652832, + "logps/chosen": -246.95294189453125, + "logps/rejected": -264.199462890625, + "loss": 0.871, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.5186996459960938, + "rewards/margins": 1.7888046503067017, + "rewards/rejected": -4.307504177093506, + "step": 2161 + }, + { + "epoch": 0.45, + "learning_rate": 1.1000000000000001e-05, + "logits/chosen": -1.996659755706787, + "logits/rejected": -1.8724391460418701, + "logps/chosen": -332.5257263183594, + "logps/rejected": -390.8209228515625, + "loss": 0.2259, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.4656529426574707, + "rewards/margins": 5.166693687438965, + "rewards/rejected": -7.6323466300964355, + "step": 2162 + }, + { + "epoch": 0.45, + "learning_rate": 1.0995798319327733e-05, + "logits/chosen": -2.014070987701416, + "logits/rejected": -1.9697198867797852, + "logps/chosen": -340.2963562011719, + "logps/rejected": -403.85308837890625, + "loss": 0.3293, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.7613236904144287, + "rewards/margins": 4.447287559509277, + "rewards/rejected": -7.208611488342285, + "step": 2163 + }, + { + "epoch": 0.45, + "learning_rate": 1.0991596638655464e-05, + "logits/chosen": -2.1590576171875, + "logits/rejected": -1.6582505702972412, + "logps/chosen": -426.586669921875, + "logps/rejected": -315.9244689941406, + "loss": 0.1607, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.891146659851074, + "rewards/margins": 3.6502685546875, + "rewards/rejected": -6.541415214538574, + "step": 2164 + }, + { + "epoch": 0.45, + "learning_rate": 1.0987394957983195e-05, + "logits/chosen": -2.239654541015625, + "logits/rejected": -2.051011800765991, + "logps/chosen": -333.54583740234375, + "logps/rejected": -374.7533264160156, + "loss": 0.4656, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.3286242485046387, + "rewards/margins": 3.1175670623779297, + "rewards/rejected": -5.446191787719727, + "step": 2165 + }, + { + "epoch": 0.45, + "learning_rate": 1.0983193277310926e-05, + "logits/chosen": -2.227440595626831, + "logits/rejected": -1.6257731914520264, + "logps/chosen": -324.452392578125, + "logps/rejected": -350.2322082519531, + "loss": 0.4629, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.6320247650146484, + "rewards/margins": 4.669007301330566, + "rewards/rejected": -7.301032066345215, + "step": 2166 + }, + { + "epoch": 0.45, + "learning_rate": 1.0978991596638658e-05, + "logits/chosen": -1.8249037265777588, + "logits/rejected": -1.996896743774414, + "logps/chosen": -430.4873046875, + "logps/rejected": -423.3940734863281, + "loss": 0.4376, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.5360426902770996, + "rewards/margins": 3.874124050140381, + "rewards/rejected": -6.410167694091797, + "step": 2167 + }, + { + "epoch": 0.45, + "learning_rate": 1.0974789915966388e-05, + "logits/chosen": -2.0975301265716553, + "logits/rejected": -2.2377991676330566, + "logps/chosen": -372.0230712890625, + "logps/rejected": -362.9530334472656, + "loss": 0.1716, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1585159301757812, + "rewards/margins": 4.064459800720215, + "rewards/rejected": -6.222976207733154, + "step": 2168 + }, + { + "epoch": 0.45, + "learning_rate": 1.097058823529412e-05, + "logits/chosen": -2.486316680908203, + "logits/rejected": -1.9096156358718872, + "logps/chosen": -457.6482849121094, + "logps/rejected": -450.875, + "loss": 0.0471, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2479711771011353, + "rewards/margins": 4.646535873413086, + "rewards/rejected": -5.894506454467773, + "step": 2169 + }, + { + "epoch": 0.45, + "learning_rate": 1.096638655462185e-05, + "logits/chosen": -1.9506534337997437, + "logits/rejected": -1.620651125907898, + "logps/chosen": -432.6158752441406, + "logps/rejected": -335.71527099609375, + "loss": 0.3216, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.5584423542022705, + "rewards/margins": 4.7099928855896, + "rewards/rejected": -7.268435478210449, + "step": 2170 + }, + { + "epoch": 0.45, + "learning_rate": 1.0962184873949582e-05, + "logits/chosen": -1.963405728340149, + "logits/rejected": -2.0942749977111816, + "logps/chosen": -272.369140625, + "logps/rejected": -335.7466735839844, + "loss": 0.3217, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.3475117683410645, + "rewards/margins": 4.016371726989746, + "rewards/rejected": -6.363883972167969, + "step": 2171 + }, + { + "epoch": 0.45, + "learning_rate": 1.0957983193277312e-05, + "logits/chosen": -2.163416862487793, + "logits/rejected": -1.9712603092193604, + "logps/chosen": -314.0606384277344, + "logps/rejected": -329.432861328125, + "loss": 0.2107, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.70294451713562, + "rewards/margins": 4.212249279022217, + "rewards/rejected": -6.915193557739258, + "step": 2172 + }, + { + "epoch": 0.45, + "learning_rate": 1.0953781512605044e-05, + "logits/chosen": -2.2412123680114746, + "logits/rejected": -1.7747416496276855, + "logps/chosen": -468.77557373046875, + "logps/rejected": -384.94287109375, + "loss": 0.0783, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.805687427520752, + "rewards/margins": 3.7529850006103516, + "rewards/rejected": -5.558672904968262, + "step": 2173 + }, + { + "epoch": 0.45, + "learning_rate": 1.0949579831932774e-05, + "logits/chosen": -2.0782248973846436, + "logits/rejected": -2.045067071914673, + "logps/chosen": -295.7433166503906, + "logps/rejected": -291.02227783203125, + "loss": 0.3914, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.424102783203125, + "rewards/margins": 3.0287539958953857, + "rewards/rejected": -5.452856063842773, + "step": 2174 + }, + { + "epoch": 0.46, + "learning_rate": 1.0945378151260506e-05, + "logits/chosen": -1.7476472854614258, + "logits/rejected": -2.089772939682007, + "logps/chosen": -222.85621643066406, + "logps/rejected": -342.553955078125, + "loss": 0.2662, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.483558177947998, + "rewards/margins": 4.208912372589111, + "rewards/rejected": -6.692470550537109, + "step": 2175 + }, + { + "epoch": 0.46, + "learning_rate": 1.0941176470588238e-05, + "logits/chosen": -2.0619561672210693, + "logits/rejected": -1.5173184871673584, + "logps/chosen": -403.90960693359375, + "logps/rejected": -370.9187316894531, + "loss": 0.0938, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6350886821746826, + "rewards/margins": 4.738796234130859, + "rewards/rejected": -6.373884677886963, + "step": 2176 + }, + { + "epoch": 0.46, + "learning_rate": 1.0936974789915967e-05, + "logits/chosen": -2.050675868988037, + "logits/rejected": -1.9390783309936523, + "logps/chosen": -376.11114501953125, + "logps/rejected": -429.7190246582031, + "loss": 0.2869, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.5263442993164062, + "rewards/margins": 2.934098482131958, + "rewards/rejected": -5.460442543029785, + "step": 2177 + }, + { + "epoch": 0.46, + "learning_rate": 1.0932773109243697e-05, + "logits/chosen": -1.8516062498092651, + "logits/rejected": -1.8539493083953857, + "logps/chosen": -314.53460693359375, + "logps/rejected": -318.81060791015625, + "loss": 0.3125, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.8330845832824707, + "rewards/margins": 2.078017473220825, + "rewards/rejected": -4.911102294921875, + "step": 2178 + }, + { + "epoch": 0.46, + "learning_rate": 1.0928571428571429e-05, + "logits/chosen": -2.3023195266723633, + "logits/rejected": -2.301882266998291, + "logps/chosen": -412.1047058105469, + "logps/rejected": -418.8988037109375, + "loss": 0.5978, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.1915111541748047, + "rewards/margins": 4.310257434844971, + "rewards/rejected": -6.501768112182617, + "step": 2179 + }, + { + "epoch": 0.46, + "learning_rate": 1.0924369747899159e-05, + "logits/chosen": -2.0407650470733643, + "logits/rejected": -2.271683931350708, + "logps/chosen": -255.7179718017578, + "logps/rejected": -339.0557556152344, + "loss": 0.0507, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5781748294830322, + "rewards/margins": 4.977086067199707, + "rewards/rejected": -7.555261135101318, + "step": 2180 + }, + { + "epoch": 0.46, + "learning_rate": 1.0920168067226891e-05, + "logits/chosen": -2.2148516178131104, + "logits/rejected": -2.2547411918640137, + "logps/chosen": -436.52734375, + "logps/rejected": -474.769287109375, + "loss": 0.1716, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4925627708435059, + "rewards/margins": 4.246408939361572, + "rewards/rejected": -5.738971710205078, + "step": 2181 + }, + { + "epoch": 0.46, + "learning_rate": 1.0915966386554621e-05, + "logits/chosen": -2.2408289909362793, + "logits/rejected": -2.023141860961914, + "logps/chosen": -403.99871826171875, + "logps/rejected": -399.5506591796875, + "loss": 0.586, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.7053794860839844, + "rewards/margins": 2.3182213306427, + "rewards/rejected": -5.023601055145264, + "step": 2182 + }, + { + "epoch": 0.46, + "learning_rate": 1.0911764705882353e-05, + "logits/chosen": -1.6399402618408203, + "logits/rejected": -1.7357593774795532, + "logps/chosen": -507.09234619140625, + "logps/rejected": -425.30145263671875, + "loss": 0.24, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.3757715225219727, + "rewards/margins": 3.9001646041870117, + "rewards/rejected": -6.275936126708984, + "step": 2183 + }, + { + "epoch": 0.46, + "learning_rate": 1.0907563025210083e-05, + "logits/chosen": -2.0427703857421875, + "logits/rejected": -2.1670761108398438, + "logps/chosen": -355.68426513671875, + "logps/rejected": -440.0216064453125, + "loss": 0.5933, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.7694621086120605, + "rewards/margins": 3.3427374362945557, + "rewards/rejected": -6.112199783325195, + "step": 2184 + }, + { + "epoch": 0.46, + "learning_rate": 1.0903361344537815e-05, + "logits/chosen": -2.351576328277588, + "logits/rejected": -2.1522438526153564, + "logps/chosen": -336.2664794921875, + "logps/rejected": -357.8309631347656, + "loss": 0.2405, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.058230400085449, + "rewards/margins": 3.932281970977783, + "rewards/rejected": -5.990512371063232, + "step": 2185 + }, + { + "epoch": 0.46, + "learning_rate": 1.0899159663865545e-05, + "logits/chosen": -2.090019941329956, + "logits/rejected": -2.0676915645599365, + "logps/chosen": -335.3820495605469, + "logps/rejected": -304.6964416503906, + "loss": 0.6636, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.871441602706909, + "rewards/margins": 2.626312255859375, + "rewards/rejected": -6.497754096984863, + "step": 2186 + }, + { + "epoch": 0.46, + "learning_rate": 1.0894957983193277e-05, + "logits/chosen": -2.189368724822998, + "logits/rejected": -1.9917242527008057, + "logps/chosen": -325.16314697265625, + "logps/rejected": -284.92034912109375, + "loss": 0.2305, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.945838212966919, + "rewards/margins": 5.222498416900635, + "rewards/rejected": -7.168336868286133, + "step": 2187 + }, + { + "epoch": 0.46, + "learning_rate": 1.089075630252101e-05, + "logits/chosen": -2.0255091190338135, + "logits/rejected": -1.7928221225738525, + "logps/chosen": -273.7083435058594, + "logps/rejected": -320.1885070800781, + "loss": 0.2606, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.7845582962036133, + "rewards/margins": 4.29451322555542, + "rewards/rejected": -7.079071998596191, + "step": 2188 + }, + { + "epoch": 0.46, + "learning_rate": 1.088655462184874e-05, + "logits/chosen": -1.6586649417877197, + "logits/rejected": -1.6128149032592773, + "logps/chosen": -319.1055908203125, + "logps/rejected": -313.76007080078125, + "loss": 0.1692, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.4722862243652344, + "rewards/margins": 3.634136199951172, + "rewards/rejected": -6.106421947479248, + "step": 2189 + }, + { + "epoch": 0.46, + "learning_rate": 1.0882352941176471e-05, + "logits/chosen": -2.049567937850952, + "logits/rejected": -1.8410358428955078, + "logps/chosen": -372.403076171875, + "logps/rejected": -391.11279296875, + "loss": 0.2081, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.9479005336761475, + "rewards/margins": 4.206435680389404, + "rewards/rejected": -7.154335975646973, + "step": 2190 + }, + { + "epoch": 0.46, + "learning_rate": 1.0878151260504202e-05, + "logits/chosen": -2.0285184383392334, + "logits/rejected": -2.119401454925537, + "logps/chosen": -291.40618896484375, + "logps/rejected": -292.7174987792969, + "loss": 0.1861, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1796467304229736, + "rewards/margins": 4.327023029327393, + "rewards/rejected": -6.506669521331787, + "step": 2191 + }, + { + "epoch": 0.46, + "learning_rate": 1.0873949579831933e-05, + "logits/chosen": -2.241891622543335, + "logits/rejected": -2.0366880893707275, + "logps/chosen": -371.3621826171875, + "logps/rejected": -313.64239501953125, + "loss": 0.3594, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.0455706119537354, + "rewards/margins": 2.867155075073242, + "rewards/rejected": -4.912725448608398, + "step": 2192 + }, + { + "epoch": 0.46, + "learning_rate": 1.0869747899159664e-05, + "logits/chosen": -1.8171122074127197, + "logits/rejected": -1.9362175464630127, + "logps/chosen": -256.2909851074219, + "logps/rejected": -359.748779296875, + "loss": 0.0634, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8400087356567383, + "rewards/margins": 4.862631797790527, + "rewards/rejected": -7.702641010284424, + "step": 2193 + }, + { + "epoch": 0.46, + "learning_rate": 1.0865546218487396e-05, + "logits/chosen": -2.0894508361816406, + "logits/rejected": -1.9648733139038086, + "logps/chosen": -206.87841796875, + "logps/rejected": -241.1136474609375, + "loss": 0.3472, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.9194865226745605, + "rewards/margins": 3.1334617137908936, + "rewards/rejected": -6.052947998046875, + "step": 2194 + }, + { + "epoch": 0.46, + "learning_rate": 1.0861344537815126e-05, + "logits/chosen": -2.4774932861328125, + "logits/rejected": -1.8363265991210938, + "logps/chosen": -311.701171875, + "logps/rejected": -288.99200439453125, + "loss": 0.4517, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.968862771987915, + "rewards/margins": 3.147794008255005, + "rewards/rejected": -5.11665678024292, + "step": 2195 + }, + { + "epoch": 0.46, + "learning_rate": 1.0857142857142858e-05, + "logits/chosen": -2.261884927749634, + "logits/rejected": -2.079307794570923, + "logps/chosen": -354.0386657714844, + "logps/rejected": -398.28839111328125, + "loss": 0.5102, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.148468017578125, + "rewards/margins": 3.487614631652832, + "rewards/rejected": -5.636082649230957, + "step": 2196 + }, + { + "epoch": 0.46, + "learning_rate": 1.0852941176470588e-05, + "logits/chosen": -1.9121036529541016, + "logits/rejected": -1.739865779876709, + "logps/chosen": -323.443603515625, + "logps/rejected": -320.7878723144531, + "loss": 0.2256, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.251253366470337, + "rewards/margins": 3.9752492904663086, + "rewards/rejected": -6.226502418518066, + "step": 2197 + }, + { + "epoch": 0.46, + "learning_rate": 1.084873949579832e-05, + "logits/chosen": -2.10477876663208, + "logits/rejected": -1.7474911212921143, + "logps/chosen": -327.603759765625, + "logps/rejected": -298.4808349609375, + "loss": 0.5288, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.091388702392578, + "rewards/margins": 2.6992855072021484, + "rewards/rejected": -5.790674209594727, + "step": 2198 + }, + { + "epoch": 0.46, + "learning_rate": 1.084453781512605e-05, + "logits/chosen": -2.3742029666900635, + "logits/rejected": -2.1379635334014893, + "logps/chosen": -365.9757385253906, + "logps/rejected": -375.118408203125, + "loss": 0.3775, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.7214303016662598, + "rewards/margins": 3.7734272480010986, + "rewards/rejected": -6.4948577880859375, + "step": 2199 + }, + { + "epoch": 0.46, + "learning_rate": 1.0840336134453782e-05, + "logits/chosen": -2.286151885986328, + "logits/rejected": -1.999983787536621, + "logps/chosen": -374.40264892578125, + "logps/rejected": -364.20947265625, + "loss": 0.2444, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.7902493476867676, + "rewards/margins": 4.513697147369385, + "rewards/rejected": -7.3039469718933105, + "step": 2200 + }, + { + "epoch": 0.46, + "learning_rate": 1.0836134453781512e-05, + "logits/chosen": -2.6414976119995117, + "logits/rejected": -2.2949018478393555, + "logps/chosen": -361.7509765625, + "logps/rejected": -294.181396484375, + "loss": 0.239, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.162288188934326, + "rewards/margins": 5.092597007751465, + "rewards/rejected": -7.254885196685791, + "step": 2201 + }, + { + "epoch": 0.46, + "learning_rate": 1.0831932773109244e-05, + "logits/chosen": -2.229208469390869, + "logits/rejected": -2.1986241340637207, + "logps/chosen": -391.3697509765625, + "logps/rejected": -414.3483581542969, + "loss": 0.2296, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.487478733062744, + "rewards/margins": 5.798541069030762, + "rewards/rejected": -8.286020278930664, + "step": 2202 + }, + { + "epoch": 0.46, + "learning_rate": 1.0827731092436974e-05, + "logits/chosen": -1.7265875339508057, + "logits/rejected": -1.5976886749267578, + "logps/chosen": -319.3828125, + "logps/rejected": -346.0447692871094, + "loss": 0.2522, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9269204139709473, + "rewards/margins": 4.979476451873779, + "rewards/rejected": -6.906396865844727, + "step": 2203 + }, + { + "epoch": 0.46, + "learning_rate": 1.0823529411764706e-05, + "logits/chosen": -2.1532180309295654, + "logits/rejected": -1.8384546041488647, + "logps/chosen": -264.1273498535156, + "logps/rejected": -322.74090576171875, + "loss": 0.54, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.7662711143493652, + "rewards/margins": 2.821484327316284, + "rewards/rejected": -5.58775520324707, + "step": 2204 + }, + { + "epoch": 0.46, + "learning_rate": 1.0819327731092437e-05, + "logits/chosen": -2.2824459075927734, + "logits/rejected": -1.5332520008087158, + "logps/chosen": -346.97906494140625, + "logps/rejected": -406.9164733886719, + "loss": 0.2554, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.8781042098999023, + "rewards/margins": 4.692074298858643, + "rewards/rejected": -7.570178985595703, + "step": 2205 + }, + { + "epoch": 0.46, + "learning_rate": 1.0815126050420168e-05, + "logits/chosen": -2.170711040496826, + "logits/rejected": -1.7903850078582764, + "logps/chosen": -383.2043151855469, + "logps/rejected": -300.8239440917969, + "loss": 0.0836, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8433815240859985, + "rewards/margins": 5.589659690856934, + "rewards/rejected": -7.433040618896484, + "step": 2206 + }, + { + "epoch": 0.46, + "learning_rate": 1.0810924369747899e-05, + "logits/chosen": -2.308537721633911, + "logits/rejected": -2.3312244415283203, + "logps/chosen": -395.81304931640625, + "logps/rejected": -406.4383239746094, + "loss": 0.6183, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.0024824142456055, + "rewards/margins": 3.527528762817383, + "rewards/rejected": -6.530011177062988, + "step": 2207 + }, + { + "epoch": 0.46, + "learning_rate": 1.080672268907563e-05, + "logits/chosen": -2.2998664379119873, + "logits/rejected": -2.1448166370391846, + "logps/chosen": -339.9493713378906, + "logps/rejected": -372.6708679199219, + "loss": 0.358, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.8555407524108887, + "rewards/margins": 4.349848747253418, + "rewards/rejected": -7.205389022827148, + "step": 2208 + }, + { + "epoch": 0.46, + "learning_rate": 1.080252100840336e-05, + "logits/chosen": -1.6480600833892822, + "logits/rejected": -1.9596545696258545, + "logps/chosen": -192.30313110351562, + "logps/rejected": -284.8935852050781, + "loss": 0.209, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.6758458614349365, + "rewards/margins": 5.204977512359619, + "rewards/rejected": -7.880823135375977, + "step": 2209 + }, + { + "epoch": 0.46, + "learning_rate": 1.0798319327731093e-05, + "logits/chosen": -2.2920610904693604, + "logits/rejected": -2.1652326583862305, + "logps/chosen": -325.49786376953125, + "logps/rejected": -266.3533630371094, + "loss": 0.7449, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.4675817489624023, + "rewards/margins": 1.6641329526901245, + "rewards/rejected": -5.131714820861816, + "step": 2210 + }, + { + "epoch": 0.46, + "learning_rate": 1.0794117647058825e-05, + "logits/chosen": -1.9324177503585815, + "logits/rejected": -2.0953521728515625, + "logps/chosen": -338.3372497558594, + "logps/rejected": -379.05084228515625, + "loss": 0.0628, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.318756341934204, + "rewards/margins": 6.595605850219727, + "rewards/rejected": -8.914361953735352, + "step": 2211 + }, + { + "epoch": 0.46, + "learning_rate": 1.0789915966386555e-05, + "logits/chosen": -2.408193826675415, + "logits/rejected": -1.6430341005325317, + "logps/chosen": -445.908447265625, + "logps/rejected": -345.45330810546875, + "loss": 0.1399, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8667926788330078, + "rewards/margins": 4.218425750732422, + "rewards/rejected": -6.0852179527282715, + "step": 2212 + }, + { + "epoch": 0.46, + "learning_rate": 1.0785714285714287e-05, + "logits/chosen": -1.7721405029296875, + "logits/rejected": -1.875141978263855, + "logps/chosen": -325.4342956542969, + "logps/rejected": -332.0131530761719, + "loss": 0.5432, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.4487504959106445, + "rewards/margins": 2.412158966064453, + "rewards/rejected": -4.860909461975098, + "step": 2213 + }, + { + "epoch": 0.46, + "learning_rate": 1.0781512605042017e-05, + "logits/chosen": -2.221919059753418, + "logits/rejected": -2.1236467361450195, + "logps/chosen": -304.822021484375, + "logps/rejected": -395.0116271972656, + "loss": 0.2476, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.7468748092651367, + "rewards/margins": 4.721800804138184, + "rewards/rejected": -7.4686760902404785, + "step": 2214 + }, + { + "epoch": 0.46, + "learning_rate": 1.0777310924369749e-05, + "logits/chosen": -1.943882942199707, + "logits/rejected": -2.1289587020874023, + "logps/chosen": -234.31790161132812, + "logps/rejected": -312.1425476074219, + "loss": 0.1867, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.5907859802246094, + "rewards/margins": 5.211655139923096, + "rewards/rejected": -7.802441596984863, + "step": 2215 + }, + { + "epoch": 0.46, + "learning_rate": 1.0773109243697479e-05, + "logits/chosen": -2.232961893081665, + "logits/rejected": -1.516900658607483, + "logps/chosen": -334.7171630859375, + "logps/rejected": -315.154296875, + "loss": 0.1667, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.770059823989868, + "rewards/margins": 5.6397857666015625, + "rewards/rejected": -8.409846305847168, + "step": 2216 + }, + { + "epoch": 0.46, + "learning_rate": 1.0768907563025211e-05, + "logits/chosen": -2.2113990783691406, + "logits/rejected": -2.358501434326172, + "logps/chosen": -217.718017578125, + "logps/rejected": -260.5225830078125, + "loss": 0.2355, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.661038875579834, + "rewards/margins": 3.5129575729370117, + "rewards/rejected": -6.1739959716796875, + "step": 2217 + }, + { + "epoch": 0.46, + "learning_rate": 1.0764705882352941e-05, + "logits/chosen": -2.2868616580963135, + "logits/rejected": -1.994567632675171, + "logps/chosen": -285.7127990722656, + "logps/rejected": -286.7519226074219, + "loss": 0.7166, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.269653558731079, + "rewards/margins": 2.8677520751953125, + "rewards/rejected": -5.1374053955078125, + "step": 2218 + }, + { + "epoch": 0.46, + "learning_rate": 1.0760504201680673e-05, + "logits/chosen": -2.0874366760253906, + "logits/rejected": -1.9938488006591797, + "logps/chosen": -299.6470031738281, + "logps/rejected": -344.7825012207031, + "loss": 0.1614, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.7262797355651855, + "rewards/margins": 4.680932998657227, + "rewards/rejected": -7.407212257385254, + "step": 2219 + }, + { + "epoch": 0.46, + "learning_rate": 1.0756302521008403e-05, + "logits/chosen": -2.219188690185547, + "logits/rejected": -1.85585618019104, + "logps/chosen": -277.2038269042969, + "logps/rejected": -242.1912841796875, + "loss": 0.3288, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.494776487350464, + "rewards/margins": 3.3394877910614014, + "rewards/rejected": -5.834264278411865, + "step": 2220 + }, + { + "epoch": 0.46, + "learning_rate": 1.0752100840336135e-05, + "logits/chosen": -2.302727460861206, + "logits/rejected": -2.219780921936035, + "logps/chosen": -341.3525695800781, + "logps/rejected": -291.35687255859375, + "loss": 0.2023, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.839379072189331, + "rewards/margins": 4.455135822296143, + "rewards/rejected": -6.2945146560668945, + "step": 2221 + }, + { + "epoch": 0.46, + "learning_rate": 1.0747899159663866e-05, + "logits/chosen": -2.44423246383667, + "logits/rejected": -2.164736032485962, + "logps/chosen": -274.3758239746094, + "logps/rejected": -284.9472351074219, + "loss": 0.185, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9392091035842896, + "rewards/margins": 3.931859016418457, + "rewards/rejected": -5.871068477630615, + "step": 2222 + }, + { + "epoch": 0.47, + "learning_rate": 1.0743697478991597e-05, + "logits/chosen": -1.8586549758911133, + "logits/rejected": -2.2672882080078125, + "logps/chosen": -270.50628662109375, + "logps/rejected": -377.39288330078125, + "loss": 0.182, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.9168264865875244, + "rewards/margins": 4.387651443481445, + "rewards/rejected": -7.304478645324707, + "step": 2223 + }, + { + "epoch": 0.47, + "learning_rate": 1.0739495798319328e-05, + "logits/chosen": -2.2375826835632324, + "logits/rejected": -2.4133353233337402, + "logps/chosen": -284.8385925292969, + "logps/rejected": -383.3919982910156, + "loss": 0.1722, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.103823661804199, + "rewards/margins": 5.040059566497803, + "rewards/rejected": -7.143883228302002, + "step": 2224 + }, + { + "epoch": 0.47, + "learning_rate": 1.073529411764706e-05, + "logits/chosen": -2.255084276199341, + "logits/rejected": -1.869938611984253, + "logps/chosen": -390.86376953125, + "logps/rejected": -377.3240051269531, + "loss": 0.702, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.9561069011688232, + "rewards/margins": 3.594660997390747, + "rewards/rejected": -6.55076789855957, + "step": 2225 + }, + { + "epoch": 0.47, + "learning_rate": 1.073109243697479e-05, + "logits/chosen": -2.032472610473633, + "logits/rejected": -2.1681413650512695, + "logps/chosen": -328.66876220703125, + "logps/rejected": -387.1632080078125, + "loss": 0.5122, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.000619649887085, + "rewards/margins": 3.8319714069366455, + "rewards/rejected": -5.8325910568237305, + "step": 2226 + }, + { + "epoch": 0.47, + "learning_rate": 1.0726890756302522e-05, + "logits/chosen": -2.299386978149414, + "logits/rejected": -2.2176566123962402, + "logps/chosen": -296.2229919433594, + "logps/rejected": -400.525146484375, + "loss": 0.1183, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6341265439987183, + "rewards/margins": 5.189969539642334, + "rewards/rejected": -6.824095726013184, + "step": 2227 + }, + { + "epoch": 0.47, + "learning_rate": 1.0722689075630252e-05, + "logits/chosen": -2.344362735748291, + "logits/rejected": -2.1972527503967285, + "logps/chosen": -254.92074584960938, + "logps/rejected": -317.0208435058594, + "loss": 0.3232, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.403212547302246, + "rewards/margins": 3.340275287628174, + "rewards/rejected": -5.743488311767578, + "step": 2228 + }, + { + "epoch": 0.47, + "learning_rate": 1.0718487394957984e-05, + "logits/chosen": -2.115281581878662, + "logits/rejected": -2.0778818130493164, + "logps/chosen": -308.91973876953125, + "logps/rejected": -348.39447021484375, + "loss": 0.3515, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.801435947418213, + "rewards/margins": 4.555243968963623, + "rewards/rejected": -7.356679916381836, + "step": 2229 + }, + { + "epoch": 0.47, + "learning_rate": 1.0714285714285714e-05, + "logits/chosen": -1.9922337532043457, + "logits/rejected": -1.5101429224014282, + "logps/chosen": -339.193115234375, + "logps/rejected": -348.9505310058594, + "loss": 0.1001, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8807196617126465, + "rewards/margins": 6.552649974822998, + "rewards/rejected": -8.433370590209961, + "step": 2230 + }, + { + "epoch": 0.47, + "learning_rate": 1.0710084033613446e-05, + "logits/chosen": -2.1856231689453125, + "logits/rejected": -1.764656662940979, + "logps/chosen": -402.35906982421875, + "logps/rejected": -446.05694580078125, + "loss": 0.1637, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4766392707824707, + "rewards/margins": 5.052073955535889, + "rewards/rejected": -6.528713226318359, + "step": 2231 + }, + { + "epoch": 0.47, + "learning_rate": 1.0705882352941178e-05, + "logits/chosen": -1.666250228881836, + "logits/rejected": -1.4801517724990845, + "logps/chosen": -264.9478759765625, + "logps/rejected": -324.7733154296875, + "loss": 0.4313, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.266223430633545, + "rewards/margins": 3.1230216026306152, + "rewards/rejected": -5.389245510101318, + "step": 2232 + }, + { + "epoch": 0.47, + "learning_rate": 1.0701680672268908e-05, + "logits/chosen": -2.4650676250457764, + "logits/rejected": -2.1606171131134033, + "logps/chosen": -390.0478210449219, + "logps/rejected": -376.9784240722656, + "loss": 0.4598, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.0779905319213867, + "rewards/margins": 3.78542423248291, + "rewards/rejected": -5.863414764404297, + "step": 2233 + }, + { + "epoch": 0.47, + "learning_rate": 1.069747899159664e-05, + "logits/chosen": -2.3174917697906494, + "logits/rejected": -1.8342410326004028, + "logps/chosen": -340.0020446777344, + "logps/rejected": -307.1953125, + "loss": 0.2601, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5838121175765991, + "rewards/margins": 3.620408296585083, + "rewards/rejected": -5.204220771789551, + "step": 2234 + }, + { + "epoch": 0.47, + "learning_rate": 1.069327731092437e-05, + "logits/chosen": -2.17441463470459, + "logits/rejected": -1.7299838066101074, + "logps/chosen": -337.74639892578125, + "logps/rejected": -295.0901794433594, + "loss": 0.1725, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.877786636352539, + "rewards/margins": 3.0202202796936035, + "rewards/rejected": -4.898006916046143, + "step": 2235 + }, + { + "epoch": 0.47, + "learning_rate": 1.0689075630252102e-05, + "logits/chosen": -2.270581007003784, + "logits/rejected": -1.8749492168426514, + "logps/chosen": -261.9243469238281, + "logps/rejected": -203.61996459960938, + "loss": 0.3716, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.4220809936523438, + "rewards/margins": 2.819855213165283, + "rewards/rejected": -5.241936206817627, + "step": 2236 + }, + { + "epoch": 0.47, + "learning_rate": 1.0684873949579832e-05, + "logits/chosen": -2.2787320613861084, + "logits/rejected": -1.8370237350463867, + "logps/chosen": -217.92852783203125, + "logps/rejected": -242.4041748046875, + "loss": 0.3508, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.757615089416504, + "rewards/margins": 4.874331951141357, + "rewards/rejected": -6.631947040557861, + "step": 2237 + }, + { + "epoch": 0.47, + "learning_rate": 1.0680672268907564e-05, + "logits/chosen": -2.1033692359924316, + "logits/rejected": -2.1304478645324707, + "logps/chosen": -297.3065185546875, + "logps/rejected": -304.0707092285156, + "loss": 0.3296, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.0246024131774902, + "rewards/margins": 5.272213459014893, + "rewards/rejected": -7.296815395355225, + "step": 2238 + }, + { + "epoch": 0.47, + "learning_rate": 1.0676470588235295e-05, + "logits/chosen": -2.493102550506592, + "logits/rejected": -1.6055775880813599, + "logps/chosen": -343.25311279296875, + "logps/rejected": -275.83184814453125, + "loss": 0.5858, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.303953170776367, + "rewards/margins": 2.8446543216705322, + "rewards/rejected": -6.148608207702637, + "step": 2239 + }, + { + "epoch": 0.47, + "learning_rate": 1.0672268907563026e-05, + "logits/chosen": -2.3732991218566895, + "logits/rejected": -1.3028373718261719, + "logps/chosen": -336.52301025390625, + "logps/rejected": -308.91583251953125, + "loss": 0.1113, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7011866569519043, + "rewards/margins": 3.7374966144561768, + "rewards/rejected": -5.43868350982666, + "step": 2240 + }, + { + "epoch": 0.47, + "learning_rate": 1.0668067226890757e-05, + "logits/chosen": -2.541382312774658, + "logits/rejected": -2.0083346366882324, + "logps/chosen": -408.62957763671875, + "logps/rejected": -319.6160888671875, + "loss": 0.1543, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7813868522644043, + "rewards/margins": 4.453334808349609, + "rewards/rejected": -6.234721660614014, + "step": 2241 + }, + { + "epoch": 0.47, + "learning_rate": 1.0663865546218489e-05, + "logits/chosen": -2.182600498199463, + "logits/rejected": -1.753218412399292, + "logps/chosen": -276.21453857421875, + "logps/rejected": -280.6688537597656, + "loss": 0.1858, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.0654513835906982, + "rewards/margins": 5.518108367919922, + "rewards/rejected": -7.583559989929199, + "step": 2242 + }, + { + "epoch": 0.47, + "learning_rate": 1.0659663865546219e-05, + "logits/chosen": -2.244385242462158, + "logits/rejected": -1.3462674617767334, + "logps/chosen": -372.945068359375, + "logps/rejected": -268.8367004394531, + "loss": 0.3019, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.858535885810852, + "rewards/margins": 4.242053985595703, + "rewards/rejected": -6.100589752197266, + "step": 2243 + }, + { + "epoch": 0.47, + "learning_rate": 1.065546218487395e-05, + "logits/chosen": -1.829626441001892, + "logits/rejected": -1.9983892440795898, + "logps/chosen": -348.000732421875, + "logps/rejected": -434.7879638671875, + "loss": 0.3185, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.6664745807647705, + "rewards/margins": 4.671359062194824, + "rewards/rejected": -7.337833881378174, + "step": 2244 + }, + { + "epoch": 0.47, + "learning_rate": 1.0651260504201681e-05, + "logits/chosen": -2.2860045433044434, + "logits/rejected": -2.216571569442749, + "logps/chosen": -295.8656005859375, + "logps/rejected": -349.29376220703125, + "loss": 0.1342, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.580683708190918, + "rewards/margins": 5.6064300537109375, + "rewards/rejected": -7.187114238739014, + "step": 2245 + }, + { + "epoch": 0.47, + "learning_rate": 1.0647058823529413e-05, + "logits/chosen": -1.977545976638794, + "logits/rejected": -1.8383845090866089, + "logps/chosen": -332.2418212890625, + "logps/rejected": -304.9915771484375, + "loss": 0.1582, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0358564853668213, + "rewards/margins": 4.698163986206055, + "rewards/rejected": -5.734020709991455, + "step": 2246 + }, + { + "epoch": 0.47, + "learning_rate": 1.0642857142857143e-05, + "logits/chosen": -2.205399990081787, + "logits/rejected": -1.9745917320251465, + "logps/chosen": -348.6900634765625, + "logps/rejected": -434.18096923828125, + "loss": 0.4658, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.9008054733276367, + "rewards/margins": 3.648080587387085, + "rewards/rejected": -6.548886299133301, + "step": 2247 + }, + { + "epoch": 0.47, + "learning_rate": 1.0638655462184875e-05, + "logits/chosen": -2.161336660385132, + "logits/rejected": -1.6996766328811646, + "logps/chosen": -307.3624267578125, + "logps/rejected": -200.87753295898438, + "loss": 0.3624, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.090386390686035, + "rewards/margins": 1.5949962139129639, + "rewards/rejected": -4.685382843017578, + "step": 2248 + }, + { + "epoch": 0.47, + "learning_rate": 1.0634453781512605e-05, + "logits/chosen": -2.21938419342041, + "logits/rejected": -2.160126209259033, + "logps/chosen": -458.0859375, + "logps/rejected": -423.2908935546875, + "loss": 0.4247, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.2430121898651123, + "rewards/margins": 3.230743885040283, + "rewards/rejected": -5.473756313323975, + "step": 2249 + }, + { + "epoch": 0.47, + "learning_rate": 1.0630252100840337e-05, + "logits/chosen": -2.311984062194824, + "logits/rejected": -2.030043840408325, + "logps/chosen": -437.20074462890625, + "logps/rejected": -395.2602233886719, + "loss": 0.0833, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.775640845298767, + "rewards/margins": 5.377265453338623, + "rewards/rejected": -7.15290641784668, + "step": 2250 + }, + { + "epoch": 0.47, + "learning_rate": 1.0626050420168067e-05, + "logits/chosen": -2.534961223602295, + "logits/rejected": -2.0440006256103516, + "logps/chosen": -432.7363586425781, + "logps/rejected": -331.427001953125, + "loss": 0.2031, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.3942008018493652, + "rewards/margins": 4.759222984313965, + "rewards/rejected": -7.153424263000488, + "step": 2251 + }, + { + "epoch": 0.47, + "learning_rate": 1.06218487394958e-05, + "logits/chosen": -2.186769962310791, + "logits/rejected": -1.577028512954712, + "logps/chosen": -380.36651611328125, + "logps/rejected": -367.24798583984375, + "loss": 0.3819, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.320749282836914, + "rewards/margins": 3.0393753051757812, + "rewards/rejected": -5.360124588012695, + "step": 2252 + }, + { + "epoch": 0.47, + "learning_rate": 1.061764705882353e-05, + "logits/chosen": -2.464982032775879, + "logits/rejected": -2.277682304382324, + "logps/chosen": -355.7694091796875, + "logps/rejected": -355.5269775390625, + "loss": 0.1861, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.0717246532440186, + "rewards/margins": 4.6460161209106445, + "rewards/rejected": -6.717741012573242, + "step": 2253 + }, + { + "epoch": 0.47, + "learning_rate": 1.0613445378151261e-05, + "logits/chosen": -2.132046937942505, + "logits/rejected": -2.1645894050598145, + "logps/chosen": -279.29534912109375, + "logps/rejected": -296.971435546875, + "loss": 0.1427, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.611100435256958, + "rewards/margins": 5.796579837799072, + "rewards/rejected": -7.407679557800293, + "step": 2254 + }, + { + "epoch": 0.47, + "learning_rate": 1.0609243697478993e-05, + "logits/chosen": -2.2670211791992188, + "logits/rejected": -1.1424469947814941, + "logps/chosen": -276.5291748046875, + "logps/rejected": -190.7577362060547, + "loss": 0.2272, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.65633487701416, + "rewards/margins": 3.2629189491271973, + "rewards/rejected": -5.919253349304199, + "step": 2255 + }, + { + "epoch": 0.47, + "learning_rate": 1.0605042016806724e-05, + "logits/chosen": -1.8606351613998413, + "logits/rejected": -2.001354455947876, + "logps/chosen": -291.7681884765625, + "logps/rejected": -429.3544616699219, + "loss": 0.0872, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9967080354690552, + "rewards/margins": 6.338170528411865, + "rewards/rejected": -8.334878921508789, + "step": 2256 + }, + { + "epoch": 0.47, + "learning_rate": 1.0600840336134455e-05, + "logits/chosen": -2.2156407833099365, + "logits/rejected": -1.8583240509033203, + "logps/chosen": -324.5140686035156, + "logps/rejected": -345.49224853515625, + "loss": 0.1138, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9272854328155518, + "rewards/margins": 5.892843246459961, + "rewards/rejected": -7.820127964019775, + "step": 2257 + }, + { + "epoch": 0.47, + "learning_rate": 1.0596638655462186e-05, + "logits/chosen": -2.350205898284912, + "logits/rejected": -2.100562572479248, + "logps/chosen": -430.767822265625, + "logps/rejected": -432.44012451171875, + "loss": 0.1411, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.129133462905884, + "rewards/margins": 3.9867444038391113, + "rewards/rejected": -6.115878105163574, + "step": 2258 + }, + { + "epoch": 0.47, + "learning_rate": 1.0592436974789918e-05, + "logits/chosen": -2.123753070831299, + "logits/rejected": -2.0963852405548096, + "logps/chosen": -235.05751037597656, + "logps/rejected": -232.68536376953125, + "loss": 0.7072, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.6822681427001953, + "rewards/margins": 2.7440764904022217, + "rewards/rejected": -5.426344871520996, + "step": 2259 + }, + { + "epoch": 0.47, + "learning_rate": 1.0588235294117648e-05, + "logits/chosen": -2.380059003829956, + "logits/rejected": -1.995006799697876, + "logps/chosen": -442.45294189453125, + "logps/rejected": -366.01123046875, + "loss": 0.3146, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6446397304534912, + "rewards/margins": 4.297112464904785, + "rewards/rejected": -5.9417524337768555, + "step": 2260 + }, + { + "epoch": 0.47, + "learning_rate": 1.058403361344538e-05, + "logits/chosen": -2.071579933166504, + "logits/rejected": -1.9076157808303833, + "logps/chosen": -343.28558349609375, + "logps/rejected": -337.1347961425781, + "loss": 0.4269, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.4804975986480713, + "rewards/margins": 4.389636516571045, + "rewards/rejected": -6.870134353637695, + "step": 2261 + }, + { + "epoch": 0.47, + "learning_rate": 1.057983193277311e-05, + "logits/chosen": -2.2891056537628174, + "logits/rejected": -2.11515736579895, + "logps/chosen": -401.09228515625, + "logps/rejected": -378.7855224609375, + "loss": 0.535, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.5840132236480713, + "rewards/margins": 3.7881226539611816, + "rewards/rejected": -6.372136116027832, + "step": 2262 + }, + { + "epoch": 0.47, + "learning_rate": 1.0575630252100842e-05, + "logits/chosen": -2.134396553039551, + "logits/rejected": -2.165761709213257, + "logps/chosen": -330.08551025390625, + "logps/rejected": -335.84393310546875, + "loss": 0.5337, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.6672980785369873, + "rewards/margins": 2.9391837120056152, + "rewards/rejected": -5.606481552124023, + "step": 2263 + }, + { + "epoch": 0.47, + "learning_rate": 1.0571428571428572e-05, + "logits/chosen": -2.1118838787078857, + "logits/rejected": -1.8191330432891846, + "logps/chosen": -303.418701171875, + "logps/rejected": -301.62945556640625, + "loss": 0.252, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.4390218257904053, + "rewards/margins": 3.3422956466674805, + "rewards/rejected": -5.781317234039307, + "step": 2264 + }, + { + "epoch": 0.47, + "learning_rate": 1.0567226890756304e-05, + "logits/chosen": -2.3714094161987305, + "logits/rejected": -2.09450364112854, + "logps/chosen": -351.881591796875, + "logps/rejected": -346.2973937988281, + "loss": 0.3416, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.331536054611206, + "rewards/margins": 3.77156138420105, + "rewards/rejected": -6.103097438812256, + "step": 2265 + }, + { + "epoch": 0.47, + "learning_rate": 1.0563025210084034e-05, + "logits/chosen": -2.073634386062622, + "logits/rejected": -2.2854981422424316, + "logps/chosen": -390.7427978515625, + "logps/rejected": -487.54473876953125, + "loss": 0.4898, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.2430243492126465, + "rewards/margins": 1.8762409687042236, + "rewards/rejected": -4.119265556335449, + "step": 2266 + }, + { + "epoch": 0.47, + "learning_rate": 1.0558823529411766e-05, + "logits/chosen": -2.2952065467834473, + "logits/rejected": -2.1446497440338135, + "logps/chosen": -439.07672119140625, + "logps/rejected": -476.9337463378906, + "loss": 0.1455, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1975785493850708, + "rewards/margins": 4.0154242515563965, + "rewards/rejected": -5.213002681732178, + "step": 2267 + }, + { + "epoch": 0.47, + "learning_rate": 1.0554621848739496e-05, + "logits/chosen": -2.3860528469085693, + "logits/rejected": -2.342975616455078, + "logps/chosen": -361.2107238769531, + "logps/rejected": -340.8471374511719, + "loss": 0.2526, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7229912281036377, + "rewards/margins": 4.171393394470215, + "rewards/rejected": -5.894384384155273, + "step": 2268 + }, + { + "epoch": 0.47, + "learning_rate": 1.0550420168067228e-05, + "logits/chosen": -2.1936278343200684, + "logits/rejected": -1.8721768856048584, + "logps/chosen": -404.3787536621094, + "logps/rejected": -323.47528076171875, + "loss": 0.2161, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8592255115509033, + "rewards/margins": 3.1862571239471436, + "rewards/rejected": -5.045482635498047, + "step": 2269 + }, + { + "epoch": 0.47, + "learning_rate": 1.0546218487394959e-05, + "logits/chosen": -1.8568305969238281, + "logits/rejected": -1.7172417640686035, + "logps/chosen": -340.1731872558594, + "logps/rejected": -378.71990966796875, + "loss": 0.5118, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.8410212993621826, + "rewards/margins": 4.397149562835693, + "rewards/rejected": -7.238170146942139, + "step": 2270 + }, + { + "epoch": 0.48, + "learning_rate": 1.054201680672269e-05, + "logits/chosen": -2.167987108230591, + "logits/rejected": -2.2638425827026367, + "logps/chosen": -234.21136474609375, + "logps/rejected": -247.3729705810547, + "loss": 0.7625, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.6195812225341797, + "rewards/margins": 2.501633882522583, + "rewards/rejected": -5.121215343475342, + "step": 2271 + }, + { + "epoch": 0.48, + "learning_rate": 1.053781512605042e-05, + "logits/chosen": -2.143850803375244, + "logits/rejected": -2.1145806312561035, + "logps/chosen": -284.2950744628906, + "logps/rejected": -351.94415283203125, + "loss": 0.3488, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.3442676067352295, + "rewards/margins": 2.7296066284179688, + "rewards/rejected": -5.073873996734619, + "step": 2272 + }, + { + "epoch": 0.48, + "learning_rate": 1.0533613445378153e-05, + "logits/chosen": -2.2186970710754395, + "logits/rejected": -1.824692964553833, + "logps/chosen": -403.21417236328125, + "logps/rejected": -364.8682861328125, + "loss": 0.17, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1549181938171387, + "rewards/margins": 3.983293294906616, + "rewards/rejected": -6.138211250305176, + "step": 2273 + }, + { + "epoch": 0.48, + "learning_rate": 1.0529411764705883e-05, + "logits/chosen": -2.2263786792755127, + "logits/rejected": -1.9309293031692505, + "logps/chosen": -377.0666198730469, + "logps/rejected": -345.8014221191406, + "loss": 0.2844, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.4748728275299072, + "rewards/margins": 2.6632609367370605, + "rewards/rejected": -5.138134002685547, + "step": 2274 + }, + { + "epoch": 0.48, + "learning_rate": 1.0525210084033615e-05, + "logits/chosen": -2.2081804275512695, + "logits/rejected": -2.1690778732299805, + "logps/chosen": -405.8522033691406, + "logps/rejected": -438.5234069824219, + "loss": 0.0946, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0199661254882812, + "rewards/margins": 4.896577835083008, + "rewards/rejected": -6.916543006896973, + "step": 2275 + }, + { + "epoch": 0.48, + "learning_rate": 1.0521008403361345e-05, + "logits/chosen": -2.109779119491577, + "logits/rejected": -1.8916078805923462, + "logps/chosen": -412.5704040527344, + "logps/rejected": -336.9224853515625, + "loss": 0.273, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.3690977096557617, + "rewards/margins": 3.8368191719055176, + "rewards/rejected": -7.2059173583984375, + "step": 2276 + }, + { + "epoch": 0.48, + "learning_rate": 1.0516806722689077e-05, + "logits/chosen": -2.049389600753784, + "logits/rejected": -2.2946488857269287, + "logps/chosen": -244.0751953125, + "logps/rejected": -334.02740478515625, + "loss": 0.1811, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.221323251724243, + "rewards/margins": 3.872422456741333, + "rewards/rejected": -6.093745231628418, + "step": 2277 + }, + { + "epoch": 0.48, + "learning_rate": 1.0512605042016809e-05, + "logits/chosen": -1.6871998310089111, + "logits/rejected": -2.065037727355957, + "logps/chosen": -261.9024658203125, + "logps/rejected": -354.5792236328125, + "loss": 0.2743, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8833125829696655, + "rewards/margins": 5.362640380859375, + "rewards/rejected": -7.24595308303833, + "step": 2278 + }, + { + "epoch": 0.48, + "learning_rate": 1.0508403361344539e-05, + "logits/chosen": -2.1301095485687256, + "logits/rejected": -1.8175932168960571, + "logps/chosen": -403.0058288574219, + "logps/rejected": -353.320068359375, + "loss": 0.2916, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5765385627746582, + "rewards/margins": 3.7885303497314453, + "rewards/rejected": -5.3650689125061035, + "step": 2279 + }, + { + "epoch": 0.48, + "learning_rate": 1.0504201680672271e-05, + "logits/chosen": -2.170663833618164, + "logits/rejected": -2.0763487815856934, + "logps/chosen": -333.7818603515625, + "logps/rejected": -419.4892883300781, + "loss": 0.5893, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.268490791320801, + "rewards/margins": 3.5740537643432617, + "rewards/rejected": -5.8425445556640625, + "step": 2280 + }, + { + "epoch": 0.48, + "learning_rate": 1.0500000000000001e-05, + "logits/chosen": -2.1273372173309326, + "logits/rejected": -1.657572627067566, + "logps/chosen": -311.7430725097656, + "logps/rejected": -291.7752990722656, + "loss": 0.2873, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.247150421142578, + "rewards/margins": 3.382223606109619, + "rewards/rejected": -5.629374027252197, + "step": 2281 + }, + { + "epoch": 0.48, + "learning_rate": 1.0495798319327733e-05, + "logits/chosen": -2.058392286300659, + "logits/rejected": -1.9028072357177734, + "logps/chosen": -284.7234802246094, + "logps/rejected": -387.623046875, + "loss": 0.1222, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0908210277557373, + "rewards/margins": 4.051909446716309, + "rewards/rejected": -6.142730712890625, + "step": 2282 + }, + { + "epoch": 0.48, + "learning_rate": 1.0491596638655463e-05, + "logits/chosen": -2.1329362392425537, + "logits/rejected": -1.8955204486846924, + "logps/chosen": -351.31396484375, + "logps/rejected": -315.35736083984375, + "loss": 0.3912, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.5695321559906006, + "rewards/margins": 3.1083269119262695, + "rewards/rejected": -5.677859306335449, + "step": 2283 + }, + { + "epoch": 0.48, + "learning_rate": 1.0487394957983195e-05, + "logits/chosen": -2.144345998764038, + "logits/rejected": -2.027606964111328, + "logps/chosen": -379.93804931640625, + "logps/rejected": -350.4498291015625, + "loss": 0.4272, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.058117389678955, + "rewards/margins": 2.9958508014678955, + "rewards/rejected": -5.05396842956543, + "step": 2284 + }, + { + "epoch": 0.48, + "learning_rate": 1.0483193277310925e-05, + "logits/chosen": -1.8589396476745605, + "logits/rejected": -2.1651878356933594, + "logps/chosen": -287.7435302734375, + "logps/rejected": -346.69329833984375, + "loss": 0.3065, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.6636154651641846, + "rewards/margins": 4.229780197143555, + "rewards/rejected": -6.893395900726318, + "step": 2285 + }, + { + "epoch": 0.48, + "learning_rate": 1.0478991596638657e-05, + "logits/chosen": -2.327234983444214, + "logits/rejected": -1.6147817373275757, + "logps/chosen": -371.5098876953125, + "logps/rejected": -308.7040100097656, + "loss": 0.7319, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.9379379749298096, + "rewards/margins": 2.8196470737457275, + "rewards/rejected": -4.757585048675537, + "step": 2286 + }, + { + "epoch": 0.48, + "learning_rate": 1.0474789915966388e-05, + "logits/chosen": -2.1534690856933594, + "logits/rejected": -1.8361022472381592, + "logps/chosen": -385.3453063964844, + "logps/rejected": -327.05804443359375, + "loss": 0.2885, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.141204833984375, + "rewards/margins": 2.60811710357666, + "rewards/rejected": -4.749321937561035, + "step": 2287 + }, + { + "epoch": 0.48, + "learning_rate": 1.047058823529412e-05, + "logits/chosen": -2.4462528228759766, + "logits/rejected": -1.8881711959838867, + "logps/chosen": -389.78082275390625, + "logps/rejected": -297.2864074707031, + "loss": 0.06, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5844292640686035, + "rewards/margins": 4.194100379943848, + "rewards/rejected": -5.778530120849609, + "step": 2288 + }, + { + "epoch": 0.48, + "learning_rate": 1.046638655462185e-05, + "logits/chosen": -2.297259569168091, + "logits/rejected": -2.227626085281372, + "logps/chosen": -306.6974792480469, + "logps/rejected": -374.4692077636719, + "loss": 0.5069, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.2834997177124023, + "rewards/margins": 3.1043922901153564, + "rewards/rejected": -5.38789176940918, + "step": 2289 + }, + { + "epoch": 0.48, + "learning_rate": 1.0462184873949582e-05, + "logits/chosen": -2.3599748611450195, + "logits/rejected": -1.8623325824737549, + "logps/chosen": -448.1184387207031, + "logps/rejected": -416.02880859375, + "loss": 0.1388, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6405494213104248, + "rewards/margins": 3.9979772567749023, + "rewards/rejected": -5.638526916503906, + "step": 2290 + }, + { + "epoch": 0.48, + "learning_rate": 1.0457983193277312e-05, + "logits/chosen": -2.436807632446289, + "logits/rejected": -1.9875867366790771, + "logps/chosen": -398.48077392578125, + "logps/rejected": -370.5784606933594, + "loss": 0.1847, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.903550148010254, + "rewards/margins": 5.235564231872559, + "rewards/rejected": -7.1391143798828125, + "step": 2291 + }, + { + "epoch": 0.48, + "learning_rate": 1.0453781512605044e-05, + "logits/chosen": -2.027940511703491, + "logits/rejected": -2.1264381408691406, + "logps/chosen": -255.39329528808594, + "logps/rejected": -269.36065673828125, + "loss": 0.3458, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9228711128234863, + "rewards/margins": 4.008758068084717, + "rewards/rejected": -5.931629180908203, + "step": 2292 + }, + { + "epoch": 0.48, + "learning_rate": 1.0449579831932774e-05, + "logits/chosen": -2.075376272201538, + "logits/rejected": -1.755548119544983, + "logps/chosen": -257.39459228515625, + "logps/rejected": -324.1755065917969, + "loss": 0.5122, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.1123883724212646, + "rewards/margins": 3.1925406455993652, + "rewards/rejected": -5.304928779602051, + "step": 2293 + }, + { + "epoch": 0.48, + "learning_rate": 1.0445378151260506e-05, + "logits/chosen": -2.155927896499634, + "logits/rejected": -1.897313117980957, + "logps/chosen": -472.8931579589844, + "logps/rejected": -356.0142517089844, + "loss": 0.4165, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.771646022796631, + "rewards/margins": 3.050260066986084, + "rewards/rejected": -5.821905612945557, + "step": 2294 + }, + { + "epoch": 0.48, + "learning_rate": 1.0441176470588236e-05, + "logits/chosen": -2.031141757965088, + "logits/rejected": -1.7571543455123901, + "logps/chosen": -416.4038391113281, + "logps/rejected": -377.3183288574219, + "loss": 0.6872, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.065910577774048, + "rewards/margins": 2.4119224548339844, + "rewards/rejected": -5.477832794189453, + "step": 2295 + }, + { + "epoch": 0.48, + "learning_rate": 1.0436974789915968e-05, + "logits/chosen": -1.9982634782791138, + "logits/rejected": -2.2248196601867676, + "logps/chosen": -394.5055847167969, + "logps/rejected": -366.26068115234375, + "loss": 0.8573, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.1969375610351562, + "rewards/margins": 1.4121136665344238, + "rewards/rejected": -3.609051465988159, + "step": 2296 + }, + { + "epoch": 0.48, + "learning_rate": 1.0432773109243698e-05, + "logits/chosen": -2.087254524230957, + "logits/rejected": -2.105206251144409, + "logps/chosen": -257.530517578125, + "logps/rejected": -392.151123046875, + "loss": 0.5559, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.3972482681274414, + "rewards/margins": 3.5129787921905518, + "rewards/rejected": -5.910226821899414, + "step": 2297 + }, + { + "epoch": 0.48, + "learning_rate": 1.042857142857143e-05, + "logits/chosen": -1.8594083786010742, + "logits/rejected": -1.8784559965133667, + "logps/chosen": -216.34341430664062, + "logps/rejected": -239.94049072265625, + "loss": 0.4017, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.8850667476654053, + "rewards/margins": 2.38800311088562, + "rewards/rejected": -5.273069858551025, + "step": 2298 + }, + { + "epoch": 0.48, + "learning_rate": 1.0424369747899162e-05, + "logits/chosen": -2.3612325191497803, + "logits/rejected": -1.8503270149230957, + "logps/chosen": -351.280029296875, + "logps/rejected": -323.8396301269531, + "loss": 0.1901, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.224998950958252, + "rewards/margins": 4.165541648864746, + "rewards/rejected": -6.390540599822998, + "step": 2299 + }, + { + "epoch": 0.48, + "learning_rate": 1.0420168067226892e-05, + "logits/chosen": -2.148956060409546, + "logits/rejected": -2.082418918609619, + "logps/chosen": -266.72198486328125, + "logps/rejected": -284.8290100097656, + "loss": 0.6362, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.545670986175537, + "rewards/margins": 2.190056800842285, + "rewards/rejected": -4.735727310180664, + "step": 2300 + }, + { + "epoch": 0.48, + "learning_rate": 1.0415966386554624e-05, + "logits/chosen": -1.9479386806488037, + "logits/rejected": -1.770588755607605, + "logps/chosen": -258.39300537109375, + "logps/rejected": -289.5184326171875, + "loss": 0.3128, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.543332576751709, + "rewards/margins": 4.3417229652404785, + "rewards/rejected": -6.885054588317871, + "step": 2301 + }, + { + "epoch": 0.48, + "learning_rate": 1.0411764705882354e-05, + "logits/chosen": -2.016252279281616, + "logits/rejected": -1.8760995864868164, + "logps/chosen": -285.0805969238281, + "logps/rejected": -336.7353515625, + "loss": 0.622, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.7174577713012695, + "rewards/margins": 4.859967231750488, + "rewards/rejected": -7.577424049377441, + "step": 2302 + }, + { + "epoch": 0.48, + "learning_rate": 1.0407563025210086e-05, + "logits/chosen": -2.206784248352051, + "logits/rejected": -2.0356056690216064, + "logps/chosen": -277.21630859375, + "logps/rejected": -303.509521484375, + "loss": 0.2285, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6532747745513916, + "rewards/margins": 4.461917877197266, + "rewards/rejected": -6.115192413330078, + "step": 2303 + }, + { + "epoch": 0.48, + "learning_rate": 1.0403361344537817e-05, + "logits/chosen": -1.9960033893585205, + "logits/rejected": -2.025233745574951, + "logps/chosen": -308.00921630859375, + "logps/rejected": -373.82086181640625, + "loss": 0.2264, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.995784044265747, + "rewards/margins": 4.461019992828369, + "rewards/rejected": -6.456804275512695, + "step": 2304 + }, + { + "epoch": 0.48, + "learning_rate": 1.0399159663865548e-05, + "logits/chosen": -2.2696568965911865, + "logits/rejected": -1.901287317276001, + "logps/chosen": -333.8496398925781, + "logps/rejected": -328.36859130859375, + "loss": 0.5239, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.70107102394104, + "rewards/margins": 3.6228084564208984, + "rewards/rejected": -7.323879241943359, + "step": 2305 + }, + { + "epoch": 0.48, + "learning_rate": 1.0394957983193279e-05, + "logits/chosen": -2.204970598220825, + "logits/rejected": -1.9253250360488892, + "logps/chosen": -299.41058349609375, + "logps/rejected": -332.7149963378906, + "loss": 0.2594, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5997343063354492, + "rewards/margins": 4.068821907043457, + "rewards/rejected": -5.668556213378906, + "step": 2306 + }, + { + "epoch": 0.48, + "learning_rate": 1.039075630252101e-05, + "logits/chosen": -2.3570010662078857, + "logits/rejected": -2.1713829040527344, + "logps/chosen": -420.0768127441406, + "logps/rejected": -391.6488342285156, + "loss": 0.5355, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.025796890258789, + "rewards/margins": 3.06980037689209, + "rewards/rejected": -5.095596790313721, + "step": 2307 + }, + { + "epoch": 0.48, + "learning_rate": 1.038655462184874e-05, + "logits/chosen": -2.2435479164123535, + "logits/rejected": -1.6122558116912842, + "logps/chosen": -367.30340576171875, + "logps/rejected": -310.99847412109375, + "loss": 0.1822, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.503770351409912, + "rewards/margins": 5.666388511657715, + "rewards/rejected": -8.170159339904785, + "step": 2308 + }, + { + "epoch": 0.48, + "learning_rate": 1.0382352941176473e-05, + "logits/chosen": -2.30468487739563, + "logits/rejected": -1.940298318862915, + "logps/chosen": -279.4072570800781, + "logps/rejected": -334.99066162109375, + "loss": 0.3656, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.3309428691864014, + "rewards/margins": 2.286120653152466, + "rewards/rejected": -4.617063522338867, + "step": 2309 + }, + { + "epoch": 0.48, + "learning_rate": 1.0378151260504203e-05, + "logits/chosen": -2.3532700538635254, + "logits/rejected": -1.9994531869888306, + "logps/chosen": -397.7611999511719, + "logps/rejected": -353.3001708984375, + "loss": 0.5001, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.361623764038086, + "rewards/margins": 3.895838975906372, + "rewards/rejected": -6.257462978363037, + "step": 2310 + }, + { + "epoch": 0.48, + "learning_rate": 1.0373949579831935e-05, + "logits/chosen": -1.7401371002197266, + "logits/rejected": -1.6821308135986328, + "logps/chosen": -262.25726318359375, + "logps/rejected": -362.7687072753906, + "loss": 0.5544, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.5420703887939453, + "rewards/margins": 2.6672356128692627, + "rewards/rejected": -6.209306240081787, + "step": 2311 + }, + { + "epoch": 0.48, + "learning_rate": 1.0369747899159665e-05, + "logits/chosen": -2.186805009841919, + "logits/rejected": -1.8599371910095215, + "logps/chosen": -311.45953369140625, + "logps/rejected": -243.93914794921875, + "loss": 0.0586, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5152578353881836, + "rewards/margins": 4.713910102844238, + "rewards/rejected": -6.229167938232422, + "step": 2312 + }, + { + "epoch": 0.48, + "learning_rate": 1.0365546218487397e-05, + "logits/chosen": -2.1323211193084717, + "logits/rejected": -1.7854714393615723, + "logps/chosen": -274.37762451171875, + "logps/rejected": -263.7120361328125, + "loss": 0.5824, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.362654209136963, + "rewards/margins": 2.8587679862976074, + "rewards/rejected": -5.22142219543457, + "step": 2313 + }, + { + "epoch": 0.48, + "learning_rate": 1.0361344537815127e-05, + "logits/chosen": -2.2310009002685547, + "logits/rejected": -1.94380521774292, + "logps/chosen": -296.125, + "logps/rejected": -303.37823486328125, + "loss": 0.1717, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1863696575164795, + "rewards/margins": 4.0852813720703125, + "rewards/rejected": -6.271651268005371, + "step": 2314 + }, + { + "epoch": 0.48, + "learning_rate": 1.0357142857142859e-05, + "logits/chosen": -2.3897182941436768, + "logits/rejected": -2.312070369720459, + "logps/chosen": -285.4508056640625, + "logps/rejected": -342.91778564453125, + "loss": 0.4388, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.359724521636963, + "rewards/margins": 2.9359967708587646, + "rewards/rejected": -5.295721530914307, + "step": 2315 + }, + { + "epoch": 0.48, + "learning_rate": 1.035294117647059e-05, + "logits/chosen": -2.118828058242798, + "logits/rejected": -1.9076753854751587, + "logps/chosen": -442.66815185546875, + "logps/rejected": -374.44476318359375, + "loss": 0.4159, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5280684232711792, + "rewards/margins": 4.460971832275391, + "rewards/rejected": -5.989039897918701, + "step": 2316 + }, + { + "epoch": 0.48, + "learning_rate": 1.0348739495798321e-05, + "logits/chosen": -2.4222981929779053, + "logits/rejected": -1.9140920639038086, + "logps/chosen": -398.29254150390625, + "logps/rejected": -323.85986328125, + "loss": 0.214, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.780364990234375, + "rewards/margins": 4.140762805938721, + "rewards/rejected": -5.921127796173096, + "step": 2317 + }, + { + "epoch": 0.48, + "learning_rate": 1.0344537815126051e-05, + "logits/chosen": -2.24556827545166, + "logits/rejected": -2.1300268173217773, + "logps/chosen": -282.3939208984375, + "logps/rejected": -274.1551513671875, + "loss": 0.7544, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.9049735069274902, + "rewards/margins": 1.473496437072754, + "rewards/rejected": -4.378469467163086, + "step": 2318 + }, + { + "epoch": 0.49, + "learning_rate": 1.0340336134453783e-05, + "logits/chosen": -1.9689371585845947, + "logits/rejected": -2.0177853107452393, + "logps/chosen": -363.2492980957031, + "logps/rejected": -509.2076416015625, + "loss": 0.1934, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.043694019317627, + "rewards/margins": 5.898947238922119, + "rewards/rejected": -7.942641258239746, + "step": 2319 + }, + { + "epoch": 0.49, + "learning_rate": 1.0336134453781514e-05, + "logits/chosen": -2.285996913909912, + "logits/rejected": -2.084526300430298, + "logps/chosen": -283.24053955078125, + "logps/rejected": -276.7225036621094, + "loss": 0.3824, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.2471442222595215, + "rewards/margins": 3.2939157485961914, + "rewards/rejected": -5.541059494018555, + "step": 2320 + }, + { + "epoch": 0.49, + "learning_rate": 1.0331932773109246e-05, + "logits/chosen": -2.0119547843933105, + "logits/rejected": -2.0724170207977295, + "logps/chosen": -323.9012145996094, + "logps/rejected": -377.12176513671875, + "loss": 0.1381, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7072409391403198, + "rewards/margins": 5.127521991729736, + "rewards/rejected": -6.834763050079346, + "step": 2321 + }, + { + "epoch": 0.49, + "learning_rate": 1.0327731092436977e-05, + "logits/chosen": -1.9772799015045166, + "logits/rejected": -2.087378978729248, + "logps/chosen": -306.2851257324219, + "logps/rejected": -336.35595703125, + "loss": 0.1832, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.317422866821289, + "rewards/margins": 5.268842697143555, + "rewards/rejected": -7.586265563964844, + "step": 2322 + }, + { + "epoch": 0.49, + "learning_rate": 1.0323529411764708e-05, + "logits/chosen": -2.3413643836975098, + "logits/rejected": -2.0181195735931396, + "logps/chosen": -473.9027099609375, + "logps/rejected": -438.0572814941406, + "loss": 0.1525, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5069773197174072, + "rewards/margins": 3.889072895050049, + "rewards/rejected": -6.396050453186035, + "step": 2323 + }, + { + "epoch": 0.49, + "learning_rate": 1.031932773109244e-05, + "logits/chosen": -2.1342389583587646, + "logits/rejected": -1.8100658655166626, + "logps/chosen": -340.0462341308594, + "logps/rejected": -333.6764831542969, + "loss": 0.2793, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.2039098739624023, + "rewards/margins": 4.550719738006592, + "rewards/rejected": -6.754630088806152, + "step": 2324 + }, + { + "epoch": 0.49, + "learning_rate": 1.031512605042017e-05, + "logits/chosen": -2.097410202026367, + "logits/rejected": -1.6221643686294556, + "logps/chosen": -324.7098388671875, + "logps/rejected": -355.258056640625, + "loss": 0.1827, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.1750595569610596, + "rewards/margins": 3.7759718894958496, + "rewards/rejected": -5.951031684875488, + "step": 2325 + }, + { + "epoch": 0.49, + "learning_rate": 1.0310924369747898e-05, + "logits/chosen": -2.276090145111084, + "logits/rejected": -1.5418732166290283, + "logps/chosen": -333.2148742675781, + "logps/rejected": -309.28741455078125, + "loss": 0.5162, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.39286470413208, + "rewards/margins": 3.407494068145752, + "rewards/rejected": -5.80035924911499, + "step": 2326 + }, + { + "epoch": 0.49, + "learning_rate": 1.030672268907563e-05, + "logits/chosen": -1.9552195072174072, + "logits/rejected": -2.0105459690093994, + "logps/chosen": -247.01927185058594, + "logps/rejected": -273.3602294921875, + "loss": 0.1229, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.407646417617798, + "rewards/margins": 3.6120429039001465, + "rewards/rejected": -6.019689083099365, + "step": 2327 + }, + { + "epoch": 0.49, + "learning_rate": 1.030252100840336e-05, + "logits/chosen": -2.1548757553100586, + "logits/rejected": -1.6838428974151611, + "logps/chosen": -279.60491943359375, + "logps/rejected": -316.785400390625, + "loss": 0.1947, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.594866991043091, + "rewards/margins": 4.489614009857178, + "rewards/rejected": -7.0844807624816895, + "step": 2328 + }, + { + "epoch": 0.49, + "learning_rate": 1.0298319327731092e-05, + "logits/chosen": -2.328587532043457, + "logits/rejected": -1.908862829208374, + "logps/chosen": -465.23431396484375, + "logps/rejected": -378.6368408203125, + "loss": 0.2821, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7628517150878906, + "rewards/margins": 3.3254811763763428, + "rewards/rejected": -5.0883331298828125, + "step": 2329 + }, + { + "epoch": 0.49, + "learning_rate": 1.0294117647058823e-05, + "logits/chosen": -1.8744580745697021, + "logits/rejected": -1.6908175945281982, + "logps/chosen": -259.7429504394531, + "logps/rejected": -254.2803955078125, + "loss": 0.1529, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.686598777770996, + "rewards/margins": 4.597958087921143, + "rewards/rejected": -6.284556865692139, + "step": 2330 + }, + { + "epoch": 0.49, + "learning_rate": 1.0289915966386555e-05, + "logits/chosen": -2.311937093734741, + "logits/rejected": -1.9439003467559814, + "logps/chosen": -519.1214599609375, + "logps/rejected": -501.08642578125, + "loss": 0.2057, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2204725742340088, + "rewards/margins": 5.467904567718506, + "rewards/rejected": -6.688377380371094, + "step": 2331 + }, + { + "epoch": 0.49, + "learning_rate": 1.0285714285714285e-05, + "logits/chosen": -2.255858898162842, + "logits/rejected": -1.896178960800171, + "logps/chosen": -387.4921569824219, + "logps/rejected": -328.13751220703125, + "loss": 0.0929, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8484928607940674, + "rewards/margins": 5.108133316040039, + "rewards/rejected": -6.956626892089844, + "step": 2332 + }, + { + "epoch": 0.49, + "learning_rate": 1.0281512605042017e-05, + "logits/chosen": -2.0751709938049316, + "logits/rejected": -2.004098892211914, + "logps/chosen": -294.5953063964844, + "logps/rejected": -336.47259521484375, + "loss": 0.2154, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.6762757301330566, + "rewards/margins": 5.432332515716553, + "rewards/rejected": -8.10860824584961, + "step": 2333 + }, + { + "epoch": 0.49, + "learning_rate": 1.0277310924369749e-05, + "logits/chosen": -2.1748366355895996, + "logits/rejected": -2.142876148223877, + "logps/chosen": -260.00775146484375, + "logps/rejected": -341.19873046875, + "loss": 0.1294, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1166110038757324, + "rewards/margins": 3.6506690979003906, + "rewards/rejected": -5.767280101776123, + "step": 2334 + }, + { + "epoch": 0.49, + "learning_rate": 1.0273109243697479e-05, + "logits/chosen": -2.229928731918335, + "logits/rejected": -2.0559520721435547, + "logps/chosen": -413.1923828125, + "logps/rejected": -398.8578186035156, + "loss": 0.0944, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.347983479499817, + "rewards/margins": 3.7781519889831543, + "rewards/rejected": -5.126135349273682, + "step": 2335 + }, + { + "epoch": 0.49, + "learning_rate": 1.026890756302521e-05, + "logits/chosen": -2.4388957023620605, + "logits/rejected": -2.0972840785980225, + "logps/chosen": -280.9982604980469, + "logps/rejected": -329.42138671875, + "loss": 0.1405, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.2944209575653076, + "rewards/margins": 4.757044792175293, + "rewards/rejected": -7.05146598815918, + "step": 2336 + }, + { + "epoch": 0.49, + "learning_rate": 1.0264705882352941e-05, + "logits/chosen": -2.203840732574463, + "logits/rejected": -1.7752699851989746, + "logps/chosen": -269.9710998535156, + "logps/rejected": -273.66229248046875, + "loss": 0.212, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.0475072860717773, + "rewards/margins": 4.2801594734191895, + "rewards/rejected": -6.327666759490967, + "step": 2337 + }, + { + "epoch": 0.49, + "learning_rate": 1.0260504201680673e-05, + "logits/chosen": -2.216181755065918, + "logits/rejected": -2.0790791511535645, + "logps/chosen": -254.6668701171875, + "logps/rejected": -300.34228515625, + "loss": 0.1406, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.469614028930664, + "rewards/margins": 4.0897650718688965, + "rewards/rejected": -6.5593791007995605, + "step": 2338 + }, + { + "epoch": 0.49, + "learning_rate": 1.0256302521008403e-05, + "logits/chosen": -2.079301595687866, + "logits/rejected": -1.7495542764663696, + "logps/chosen": -327.1336669921875, + "logps/rejected": -342.5215148925781, + "loss": 0.263, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.7037224769592285, + "rewards/margins": 4.537293910980225, + "rewards/rejected": -7.241016387939453, + "step": 2339 + }, + { + "epoch": 0.49, + "learning_rate": 1.0252100840336135e-05, + "logits/chosen": -1.8507493734359741, + "logits/rejected": -2.2964813709259033, + "logps/chosen": -217.77003479003906, + "logps/rejected": -343.0801086425781, + "loss": 0.383, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.268247604370117, + "rewards/margins": 4.268898010253906, + "rewards/rejected": -7.537145137786865, + "step": 2340 + }, + { + "epoch": 0.49, + "learning_rate": 1.0247899159663865e-05, + "logits/chosen": -2.00114369392395, + "logits/rejected": -2.044537305831909, + "logps/chosen": -411.5748291015625, + "logps/rejected": -463.3763427734375, + "loss": 0.7643, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.153486967086792, + "rewards/margins": 2.3865609169006348, + "rewards/rejected": -5.540047645568848, + "step": 2341 + }, + { + "epoch": 0.49, + "learning_rate": 1.0243697478991597e-05, + "logits/chosen": -2.0533995628356934, + "logits/rejected": -1.9718629121780396, + "logps/chosen": -292.7917785644531, + "logps/rejected": -277.1968994140625, + "loss": 0.1032, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.5260417461395264, + "rewards/margins": 5.376851558685303, + "rewards/rejected": -7.902894020080566, + "step": 2342 + }, + { + "epoch": 0.49, + "learning_rate": 1.0239495798319327e-05, + "logits/chosen": -2.1429636478424072, + "logits/rejected": -1.9796146154403687, + "logps/chosen": -263.4094543457031, + "logps/rejected": -260.13861083984375, + "loss": 0.2047, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.481532096862793, + "rewards/margins": 3.8358161449432373, + "rewards/rejected": -7.317348480224609, + "step": 2343 + }, + { + "epoch": 0.49, + "learning_rate": 1.023529411764706e-05, + "logits/chosen": -2.1428537368774414, + "logits/rejected": -2.1778624057769775, + "logps/chosen": -208.9241943359375, + "logps/rejected": -296.50360107421875, + "loss": 0.213, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.914829730987549, + "rewards/margins": 3.4774906635284424, + "rewards/rejected": -6.392320156097412, + "step": 2344 + }, + { + "epoch": 0.49, + "learning_rate": 1.023109243697479e-05, + "logits/chosen": -2.118650197982788, + "logits/rejected": -1.8937420845031738, + "logps/chosen": -427.9129638671875, + "logps/rejected": -392.86444091796875, + "loss": 0.1552, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.171908378601074, + "rewards/margins": 4.405630588531494, + "rewards/rejected": -6.57753849029541, + "step": 2345 + }, + { + "epoch": 0.49, + "learning_rate": 1.0226890756302521e-05, + "logits/chosen": -2.2222931385040283, + "logits/rejected": -2.1161084175109863, + "logps/chosen": -354.2533874511719, + "logps/rejected": -398.53717041015625, + "loss": 0.3485, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.480360984802246, + "rewards/margins": 4.525232315063477, + "rewards/rejected": -7.005593299865723, + "step": 2346 + }, + { + "epoch": 0.49, + "learning_rate": 1.0222689075630252e-05, + "logits/chosen": -2.2023239135742188, + "logits/rejected": -1.9762842655181885, + "logps/chosen": -365.4632873535156, + "logps/rejected": -401.91949462890625, + "loss": 0.2193, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.385115385055542, + "rewards/margins": 5.419905662536621, + "rewards/rejected": -7.805021286010742, + "step": 2347 + }, + { + "epoch": 0.49, + "learning_rate": 1.0218487394957984e-05, + "logits/chosen": -2.197899580001831, + "logits/rejected": -1.7755745649337769, + "logps/chosen": -318.3700256347656, + "logps/rejected": -301.67254638671875, + "loss": 0.2818, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.0880095958709717, + "rewards/margins": 4.677926540374756, + "rewards/rejected": -6.765935897827148, + "step": 2348 + }, + { + "epoch": 0.49, + "learning_rate": 1.0214285714285714e-05, + "logits/chosen": -2.0928783416748047, + "logits/rejected": -2.080960750579834, + "logps/chosen": -324.05609130859375, + "logps/rejected": -317.9915771484375, + "loss": 0.2068, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9530079364776611, + "rewards/margins": 3.729494094848633, + "rewards/rejected": -5.682502269744873, + "step": 2349 + }, + { + "epoch": 0.49, + "learning_rate": 1.0210084033613446e-05, + "logits/chosen": -2.520092487335205, + "logits/rejected": -1.9951176643371582, + "logps/chosen": -390.976806640625, + "logps/rejected": -354.14837646484375, + "loss": 0.2037, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.3681273460388184, + "rewards/margins": 4.0330810546875, + "rewards/rejected": -6.401208877563477, + "step": 2350 + }, + { + "epoch": 0.49, + "learning_rate": 1.0205882352941176e-05, + "logits/chosen": -2.048600673675537, + "logits/rejected": -2.1942248344421387, + "logps/chosen": -257.93670654296875, + "logps/rejected": -339.9472351074219, + "loss": 0.1832, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.5696909427642822, + "rewards/margins": 4.439303398132324, + "rewards/rejected": -7.008994102478027, + "step": 2351 + }, + { + "epoch": 0.49, + "learning_rate": 1.0201680672268908e-05, + "logits/chosen": -2.0361030101776123, + "logits/rejected": -1.8283741474151611, + "logps/chosen": -344.7386474609375, + "logps/rejected": -320.3345642089844, + "loss": 0.3664, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.239349603652954, + "rewards/margins": 3.6690733432769775, + "rewards/rejected": -6.908422470092773, + "step": 2352 + }, + { + "epoch": 0.49, + "learning_rate": 1.0197478991596638e-05, + "logits/chosen": -2.393639087677002, + "logits/rejected": -2.444275140762329, + "logps/chosen": -348.50897216796875, + "logps/rejected": -350.375732421875, + "loss": 0.4295, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.299044370651245, + "rewards/margins": 4.70559549331665, + "rewards/rejected": -7.004640579223633, + "step": 2353 + }, + { + "epoch": 0.49, + "learning_rate": 1.019327731092437e-05, + "logits/chosen": -2.3311803340911865, + "logits/rejected": -2.0447452068328857, + "logps/chosen": -217.54718017578125, + "logps/rejected": -220.82308959960938, + "loss": 0.3582, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.995664119720459, + "rewards/margins": 2.9903500080108643, + "rewards/rejected": -5.986014366149902, + "step": 2354 + }, + { + "epoch": 0.49, + "learning_rate": 1.01890756302521e-05, + "logits/chosen": -2.2401108741760254, + "logits/rejected": -1.6294881105422974, + "logps/chosen": -312.89501953125, + "logps/rejected": -274.7790222167969, + "loss": 0.3417, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.852980375289917, + "rewards/margins": 4.437831401824951, + "rewards/rejected": -7.2908124923706055, + "step": 2355 + }, + { + "epoch": 0.49, + "learning_rate": 1.0184873949579832e-05, + "logits/chosen": -2.128582239151001, + "logits/rejected": -1.9758517742156982, + "logps/chosen": -340.9329528808594, + "logps/rejected": -414.89337158203125, + "loss": 0.1756, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.3178863525390625, + "rewards/margins": 4.384424209594727, + "rewards/rejected": -7.702310562133789, + "step": 2356 + }, + { + "epoch": 0.49, + "learning_rate": 1.0180672268907564e-05, + "logits/chosen": -2.375614643096924, + "logits/rejected": -1.9644136428833008, + "logps/chosen": -381.69610595703125, + "logps/rejected": -390.6536865234375, + "loss": 0.2455, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.174661159515381, + "rewards/margins": 5.049365520477295, + "rewards/rejected": -7.224026679992676, + "step": 2357 + }, + { + "epoch": 0.49, + "learning_rate": 1.0176470588235294e-05, + "logits/chosen": -2.0989181995391846, + "logits/rejected": -2.069923162460327, + "logps/chosen": -377.942138671875, + "logps/rejected": -343.534423828125, + "loss": 0.4286, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.2267632484436035, + "rewards/margins": 3.6504411697387695, + "rewards/rejected": -6.877203941345215, + "step": 2358 + }, + { + "epoch": 0.49, + "learning_rate": 1.0172268907563026e-05, + "logits/chosen": -2.230153799057007, + "logits/rejected": -1.7753021717071533, + "logps/chosen": -451.0483093261719, + "logps/rejected": -409.6219482421875, + "loss": 0.2368, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.4202916622161865, + "rewards/margins": 4.739964485168457, + "rewards/rejected": -7.160256385803223, + "step": 2359 + }, + { + "epoch": 0.49, + "learning_rate": 1.0168067226890756e-05, + "logits/chosen": -2.0516109466552734, + "logits/rejected": -2.4862051010131836, + "logps/chosen": -180.43333435058594, + "logps/rejected": -306.3184814453125, + "loss": 0.1284, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.2275238037109375, + "rewards/margins": 6.064441680908203, + "rewards/rejected": -9.29196548461914, + "step": 2360 + }, + { + "epoch": 0.49, + "learning_rate": 1.0163865546218488e-05, + "logits/chosen": -2.2121572494506836, + "logits/rejected": -2.1386008262634277, + "logps/chosen": -429.6708984375, + "logps/rejected": -409.91845703125, + "loss": 0.1757, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.672609329223633, + "rewards/margins": 2.929409980773926, + "rewards/rejected": -5.602019309997559, + "step": 2361 + }, + { + "epoch": 0.49, + "learning_rate": 1.0159663865546219e-05, + "logits/chosen": -2.1723341941833496, + "logits/rejected": -1.3779733180999756, + "logps/chosen": -353.0877685546875, + "logps/rejected": -269.4619445800781, + "loss": 0.2653, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.0441975593566895, + "rewards/margins": 2.9195477962493896, + "rewards/rejected": -5.9637451171875, + "step": 2362 + }, + { + "epoch": 0.49, + "learning_rate": 1.015546218487395e-05, + "logits/chosen": -1.7376939058303833, + "logits/rejected": -1.6737254858016968, + "logps/chosen": -310.61724853515625, + "logps/rejected": -336.97760009765625, + "loss": 0.5457, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.106523513793945, + "rewards/margins": 3.6526708602905273, + "rewards/rejected": -7.759194850921631, + "step": 2363 + }, + { + "epoch": 0.49, + "learning_rate": 1.015126050420168e-05, + "logits/chosen": -2.2258591651916504, + "logits/rejected": -1.9771018028259277, + "logps/chosen": -266.7381591796875, + "logps/rejected": -343.21588134765625, + "loss": 0.4157, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.445730209350586, + "rewards/margins": 4.628357887268066, + "rewards/rejected": -8.074089050292969, + "step": 2364 + }, + { + "epoch": 0.49, + "learning_rate": 1.0147058823529413e-05, + "logits/chosen": -1.917832851409912, + "logits/rejected": -1.7805628776550293, + "logps/chosen": -422.8464660644531, + "logps/rejected": -349.28375244140625, + "loss": 0.1705, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.3043696880340576, + "rewards/margins": 3.801083564758301, + "rewards/rejected": -6.1054534912109375, + "step": 2365 + }, + { + "epoch": 0.49, + "learning_rate": 1.0142857142857143e-05, + "logits/chosen": -1.8564013242721558, + "logits/rejected": -2.225888729095459, + "logps/chosen": -391.9482421875, + "logps/rejected": -558.7158813476562, + "loss": 0.1541, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.211973190307617, + "rewards/margins": 4.239522933959961, + "rewards/rejected": -6.451496124267578, + "step": 2366 + }, + { + "epoch": 0.5, + "learning_rate": 1.0138655462184875e-05, + "logits/chosen": -1.974426031112671, + "logits/rejected": -2.100161552429199, + "logps/chosen": -262.3293151855469, + "logps/rejected": -331.90679931640625, + "loss": 0.6789, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.795732259750366, + "rewards/margins": 2.055051326751709, + "rewards/rejected": -5.850783348083496, + "step": 2367 + }, + { + "epoch": 0.5, + "learning_rate": 1.0134453781512605e-05, + "logits/chosen": -2.2483503818511963, + "logits/rejected": -2.062711000442505, + "logps/chosen": -377.90899658203125, + "logps/rejected": -348.4140625, + "loss": 0.2545, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.9506871700286865, + "rewards/margins": 4.971905708312988, + "rewards/rejected": -7.922593116760254, + "step": 2368 + }, + { + "epoch": 0.5, + "learning_rate": 1.0130252100840337e-05, + "logits/chosen": -2.264641284942627, + "logits/rejected": -1.7713286876678467, + "logps/chosen": -318.753662109375, + "logps/rejected": -387.6985168457031, + "loss": 0.2859, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.798366069793701, + "rewards/margins": 3.808821201324463, + "rewards/rejected": -7.607187271118164, + "step": 2369 + }, + { + "epoch": 0.5, + "learning_rate": 1.0126050420168067e-05, + "logits/chosen": -2.0879833698272705, + "logits/rejected": -2.022408962249756, + "logps/chosen": -465.34576416015625, + "logps/rejected": -435.58941650390625, + "loss": 0.3542, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.2982735633850098, + "rewards/margins": 4.443699836730957, + "rewards/rejected": -6.74197244644165, + "step": 2370 + }, + { + "epoch": 0.5, + "learning_rate": 1.0121848739495799e-05, + "logits/chosen": -2.0189285278320312, + "logits/rejected": -1.9815878868103027, + "logps/chosen": -272.43585205078125, + "logps/rejected": -390.76904296875, + "loss": 0.7259, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.639523506164551, + "rewards/margins": 3.0126352310180664, + "rewards/rejected": -6.652159214019775, + "step": 2371 + }, + { + "epoch": 0.5, + "learning_rate": 1.011764705882353e-05, + "logits/chosen": -2.331763744354248, + "logits/rejected": -2.1646156311035156, + "logps/chosen": -442.7057800292969, + "logps/rejected": -377.1627502441406, + "loss": 0.1682, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.6277990341186523, + "rewards/margins": 4.27970027923584, + "rewards/rejected": -6.907499313354492, + "step": 2372 + }, + { + "epoch": 0.5, + "learning_rate": 1.0113445378151261e-05, + "logits/chosen": -2.412088394165039, + "logits/rejected": -1.8948307037353516, + "logps/chosen": -382.93829345703125, + "logps/rejected": -317.02227783203125, + "loss": 0.3936, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.954725742340088, + "rewards/margins": 2.828511953353882, + "rewards/rejected": -5.783237457275391, + "step": 2373 + }, + { + "epoch": 0.5, + "learning_rate": 1.0109243697478991e-05, + "logits/chosen": -2.207195281982422, + "logits/rejected": -1.9755818843841553, + "logps/chosen": -375.7664794921875, + "logps/rejected": -380.745849609375, + "loss": 0.2122, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.8473095893859863, + "rewards/margins": 4.760699272155762, + "rewards/rejected": -7.60800838470459, + "step": 2374 + }, + { + "epoch": 0.5, + "learning_rate": 1.0105042016806723e-05, + "logits/chosen": -2.118767023086548, + "logits/rejected": -1.8216348886489868, + "logps/chosen": -339.97943115234375, + "logps/rejected": -304.74688720703125, + "loss": 0.1393, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.7438158988952637, + "rewards/margins": 4.913661479949951, + "rewards/rejected": -7.657476425170898, + "step": 2375 + }, + { + "epoch": 0.5, + "learning_rate": 1.0100840336134453e-05, + "logits/chosen": -2.537324905395508, + "logits/rejected": -2.022660732269287, + "logps/chosen": -383.07586669921875, + "logps/rejected": -332.684814453125, + "loss": 0.1091, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3256542682647705, + "rewards/margins": 4.027649879455566, + "rewards/rejected": -7.353304386138916, + "step": 2376 + }, + { + "epoch": 0.5, + "learning_rate": 1.0096638655462185e-05, + "logits/chosen": -2.3195064067840576, + "logits/rejected": -2.185446262359619, + "logps/chosen": -424.9019775390625, + "logps/rejected": -502.19598388671875, + "loss": 0.3324, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.273162841796875, + "rewards/margins": 3.496093511581421, + "rewards/rejected": -6.769256591796875, + "step": 2377 + }, + { + "epoch": 0.5, + "learning_rate": 1.0092436974789917e-05, + "logits/chosen": -2.441375970840454, + "logits/rejected": -1.9123210906982422, + "logps/chosen": -344.749755859375, + "logps/rejected": -268.8642883300781, + "loss": 0.0874, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3219001293182373, + "rewards/margins": 4.846903324127197, + "rewards/rejected": -7.168804168701172, + "step": 2378 + }, + { + "epoch": 0.5, + "learning_rate": 1.0088235294117648e-05, + "logits/chosen": -1.9964544773101807, + "logits/rejected": -1.843702793121338, + "logps/chosen": -328.4271545410156, + "logps/rejected": -388.16998291015625, + "loss": 0.3413, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.2889223098754883, + "rewards/margins": 4.009631156921387, + "rewards/rejected": -7.298553466796875, + "step": 2379 + }, + { + "epoch": 0.5, + "learning_rate": 1.008403361344538e-05, + "logits/chosen": -1.9196803569793701, + "logits/rejected": -1.5918848514556885, + "logps/chosen": -411.455322265625, + "logps/rejected": -426.3360595703125, + "loss": 0.1504, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.9419288635253906, + "rewards/margins": 4.285278797149658, + "rewards/rejected": -7.227207660675049, + "step": 2380 + }, + { + "epoch": 0.5, + "learning_rate": 1.007983193277311e-05, + "logits/chosen": -2.0860230922698975, + "logits/rejected": -2.2899765968322754, + "logps/chosen": -255.71780395507812, + "logps/rejected": -313.4264831542969, + "loss": 0.2851, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.593266248703003, + "rewards/margins": 4.350472450256348, + "rewards/rejected": -6.943737983703613, + "step": 2381 + }, + { + "epoch": 0.5, + "learning_rate": 1.0075630252100842e-05, + "logits/chosen": -2.2322916984558105, + "logits/rejected": -1.991105079650879, + "logps/chosen": -347.2756652832031, + "logps/rejected": -334.885498046875, + "loss": 0.3435, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.1120400428771973, + "rewards/margins": 3.6266214847564697, + "rewards/rejected": -6.738661766052246, + "step": 2382 + }, + { + "epoch": 0.5, + "learning_rate": 1.0071428571428572e-05, + "logits/chosen": -2.3625547885894775, + "logits/rejected": -2.2327795028686523, + "logps/chosen": -297.80084228515625, + "logps/rejected": -400.97515869140625, + "loss": 0.3024, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.8820149898529053, + "rewards/margins": 4.671796798706055, + "rewards/rejected": -7.553812026977539, + "step": 2383 + }, + { + "epoch": 0.5, + "learning_rate": 1.0067226890756304e-05, + "logits/chosen": -2.1526286602020264, + "logits/rejected": -2.2628276348114014, + "logps/chosen": -390.17755126953125, + "logps/rejected": -359.5980224609375, + "loss": 0.1104, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8839187622070312, + "rewards/margins": 3.8925013542175293, + "rewards/rejected": -6.776420593261719, + "step": 2384 + }, + { + "epoch": 0.5, + "learning_rate": 1.0063025210084034e-05, + "logits/chosen": -1.9154771566390991, + "logits/rejected": -2.020354747772217, + "logps/chosen": -423.9452209472656, + "logps/rejected": -391.2864074707031, + "loss": 0.2739, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.87423038482666, + "rewards/margins": 3.822540760040283, + "rewards/rejected": -6.696771144866943, + "step": 2385 + }, + { + "epoch": 0.5, + "learning_rate": 1.0058823529411766e-05, + "logits/chosen": -2.248162269592285, + "logits/rejected": -2.29250431060791, + "logps/chosen": -445.37939453125, + "logps/rejected": -330.8786315917969, + "loss": 0.3405, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.6439990997314453, + "rewards/margins": 2.292757272720337, + "rewards/rejected": -4.936756134033203, + "step": 2386 + }, + { + "epoch": 0.5, + "learning_rate": 1.0054621848739496e-05, + "logits/chosen": -2.0872318744659424, + "logits/rejected": -1.9483592510223389, + "logps/chosen": -286.3751525878906, + "logps/rejected": -371.61944580078125, + "loss": 0.5192, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.1115846633911133, + "rewards/margins": 2.079066276550293, + "rewards/rejected": -5.190650939941406, + "step": 2387 + }, + { + "epoch": 0.5, + "learning_rate": 1.0050420168067228e-05, + "logits/chosen": -2.1926615238189697, + "logits/rejected": -1.941779613494873, + "logps/chosen": -453.847900390625, + "logps/rejected": -412.57366943359375, + "loss": 0.1812, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.1723692417144775, + "rewards/margins": 5.949557304382324, + "rewards/rejected": -8.121927261352539, + "step": 2388 + }, + { + "epoch": 0.5, + "learning_rate": 1.0046218487394958e-05, + "logits/chosen": -2.1235480308532715, + "logits/rejected": -2.3440585136413574, + "logps/chosen": -293.1241455078125, + "logps/rejected": -330.4990234375, + "loss": 0.5469, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.06968355178833, + "rewards/margins": 3.635479211807251, + "rewards/rejected": -6.705162048339844, + "step": 2389 + }, + { + "epoch": 0.5, + "learning_rate": 1.004201680672269e-05, + "logits/chosen": -2.2626047134399414, + "logits/rejected": -2.227721691131592, + "logps/chosen": -328.4931335449219, + "logps/rejected": -292.5007629394531, + "loss": 0.3164, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.4760754108428955, + "rewards/margins": 4.064693450927734, + "rewards/rejected": -6.540769100189209, + "step": 2390 + }, + { + "epoch": 0.5, + "learning_rate": 1.003781512605042e-05, + "logits/chosen": -2.139137029647827, + "logits/rejected": -1.7435845136642456, + "logps/chosen": -321.35418701171875, + "logps/rejected": -360.59027099609375, + "loss": 0.4074, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.110934257507324, + "rewards/margins": 3.720695972442627, + "rewards/rejected": -6.831630229949951, + "step": 2391 + }, + { + "epoch": 0.5, + "learning_rate": 1.0033613445378152e-05, + "logits/chosen": -2.0980777740478516, + "logits/rejected": -1.829169750213623, + "logps/chosen": -381.323486328125, + "logps/rejected": -390.6148681640625, + "loss": 0.3501, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.991558790206909, + "rewards/margins": 4.191066265106201, + "rewards/rejected": -7.182624816894531, + "step": 2392 + }, + { + "epoch": 0.5, + "learning_rate": 1.0029411764705882e-05, + "logits/chosen": -2.022017002105713, + "logits/rejected": -2.1261677742004395, + "logps/chosen": -255.502685546875, + "logps/rejected": -324.99169921875, + "loss": 0.4771, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.4784703254699707, + "rewards/margins": 4.7589263916015625, + "rewards/rejected": -8.237397193908691, + "step": 2393 + }, + { + "epoch": 0.5, + "learning_rate": 1.0025210084033614e-05, + "logits/chosen": -2.0819475650787354, + "logits/rejected": -2.2279903888702393, + "logps/chosen": -365.35235595703125, + "logps/rejected": -371.3768615722656, + "loss": 0.4438, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.9094648361206055, + "rewards/margins": 4.8693647384643555, + "rewards/rejected": -7.778829574584961, + "step": 2394 + }, + { + "epoch": 0.5, + "learning_rate": 1.0021008403361345e-05, + "logits/chosen": -1.7972275018692017, + "logits/rejected": -1.6436253786087036, + "logps/chosen": -291.5543212890625, + "logps/rejected": -303.94696044921875, + "loss": 0.5153, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.981525421142578, + "rewards/margins": 3.397333860397339, + "rewards/rejected": -6.378859519958496, + "step": 2395 + }, + { + "epoch": 0.5, + "learning_rate": 1.0016806722689077e-05, + "logits/chosen": -1.8907406330108643, + "logits/rejected": -2.097783088684082, + "logps/chosen": -311.48687744140625, + "logps/rejected": -380.88336181640625, + "loss": 0.516, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.7880444526672363, + "rewards/margins": 2.915473699569702, + "rewards/rejected": -5.703518390655518, + "step": 2396 + }, + { + "epoch": 0.5, + "learning_rate": 1.0012605042016807e-05, + "logits/chosen": -2.2271320819854736, + "logits/rejected": -1.7776563167572021, + "logps/chosen": -403.1595153808594, + "logps/rejected": -340.1800842285156, + "loss": 0.143, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.421895742416382, + "rewards/margins": 4.355343818664551, + "rewards/rejected": -6.777239799499512, + "step": 2397 + }, + { + "epoch": 0.5, + "learning_rate": 1.0008403361344539e-05, + "logits/chosen": -2.1426753997802734, + "logits/rejected": -2.140594005584717, + "logps/chosen": -326.5360412597656, + "logps/rejected": -398.43597412109375, + "loss": 0.2813, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.651247024536133, + "rewards/margins": 4.19482421875, + "rewards/rejected": -6.846071720123291, + "step": 2398 + }, + { + "epoch": 0.5, + "learning_rate": 1.0004201680672269e-05, + "logits/chosen": -2.3763651847839355, + "logits/rejected": -2.3359007835388184, + "logps/chosen": -330.80255126953125, + "logps/rejected": -355.82623291015625, + "loss": 0.1755, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.269780158996582, + "rewards/margins": 5.421090126037598, + "rewards/rejected": -7.69087028503418, + "step": 2399 + }, + { + "epoch": 0.5, + "learning_rate": 1e-05, + "logits/chosen": -2.2974443435668945, + "logits/rejected": -2.065579414367676, + "logps/chosen": -308.79461669921875, + "logps/rejected": -320.31781005859375, + "loss": 0.3388, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.2970945835113525, + "rewards/margins": 4.837871074676514, + "rewards/rejected": -7.134965896606445, + "step": 2400 + }, + { + "epoch": 0.5, + "learning_rate": 9.995798319327733e-06, + "logits/chosen": -1.9178187847137451, + "logits/rejected": -2.0462570190429688, + "logps/chosen": -279.6192626953125, + "logps/rejected": -320.6001281738281, + "loss": 0.4874, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.510626792907715, + "rewards/margins": 4.299708366394043, + "rewards/rejected": -6.8103346824646, + "step": 2401 + }, + { + "epoch": 0.5, + "learning_rate": 9.991596638655463e-06, + "logits/chosen": -2.2906880378723145, + "logits/rejected": -2.1577656269073486, + "logps/chosen": -263.5411376953125, + "logps/rejected": -277.16473388671875, + "loss": 0.3356, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.759248971939087, + "rewards/margins": 2.738969564437866, + "rewards/rejected": -5.498218536376953, + "step": 2402 + }, + { + "epoch": 0.5, + "learning_rate": 9.987394957983195e-06, + "logits/chosen": -1.7807637453079224, + "logits/rejected": -1.669394612312317, + "logps/chosen": -473.79345703125, + "logps/rejected": -479.2176208496094, + "loss": 0.3746, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.7221126556396484, + "rewards/margins": 3.74810791015625, + "rewards/rejected": -6.470220565795898, + "step": 2403 + }, + { + "epoch": 0.5, + "learning_rate": 9.983193277310925e-06, + "logits/chosen": -2.2767786979675293, + "logits/rejected": -1.8739500045776367, + "logps/chosen": -317.3830871582031, + "logps/rejected": -330.66082763671875, + "loss": 0.2303, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.860841989517212, + "rewards/margins": 3.7483394145965576, + "rewards/rejected": -6.6091814041137695, + "step": 2404 + }, + { + "epoch": 0.5, + "learning_rate": 9.978991596638657e-06, + "logits/chosen": -2.418147087097168, + "logits/rejected": -1.9186406135559082, + "logps/chosen": -273.658447265625, + "logps/rejected": -317.94830322265625, + "loss": 0.3404, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.732921600341797, + "rewards/margins": 4.473215579986572, + "rewards/rejected": -8.206136703491211, + "step": 2405 + }, + { + "epoch": 0.5, + "learning_rate": 9.974789915966387e-06, + "logits/chosen": -2.348971366882324, + "logits/rejected": -2.004465103149414, + "logps/chosen": -362.5961608886719, + "logps/rejected": -457.1365051269531, + "loss": 0.3792, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.5905508995056152, + "rewards/margins": 3.9804835319519043, + "rewards/rejected": -6.5710344314575195, + "step": 2406 + }, + { + "epoch": 0.5, + "learning_rate": 9.970588235294119e-06, + "logits/chosen": -2.070108413696289, + "logits/rejected": -1.8491915464401245, + "logps/chosen": -252.86036682128906, + "logps/rejected": -273.4906005859375, + "loss": 0.3498, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.429840564727783, + "rewards/margins": 3.3223671913146973, + "rewards/rejected": -5.752207279205322, + "step": 2407 + }, + { + "epoch": 0.5, + "learning_rate": 9.96638655462185e-06, + "logits/chosen": -2.3902580738067627, + "logits/rejected": -2.2644615173339844, + "logps/chosen": -339.4729309082031, + "logps/rejected": -313.0079040527344, + "loss": 0.2017, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.4026505947113037, + "rewards/margins": 3.845741033554077, + "rewards/rejected": -6.248391151428223, + "step": 2408 + }, + { + "epoch": 0.5, + "learning_rate": 9.962184873949581e-06, + "logits/chosen": -2.1466519832611084, + "logits/rejected": -2.0018019676208496, + "logps/chosen": -330.3734130859375, + "logps/rejected": -274.92901611328125, + "loss": 0.2013, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.0059640407562256, + "rewards/margins": 2.92734694480896, + "rewards/rejected": -4.933310508728027, + "step": 2409 + }, + { + "epoch": 0.5, + "learning_rate": 9.957983193277312e-06, + "logits/chosen": -1.5954811573028564, + "logits/rejected": -1.7179388999938965, + "logps/chosen": -322.6182861328125, + "logps/rejected": -384.58233642578125, + "loss": 0.1033, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.76421856880188, + "rewards/margins": 4.421438217163086, + "rewards/rejected": -7.185657024383545, + "step": 2410 + }, + { + "epoch": 0.5, + "learning_rate": 9.953781512605043e-06, + "logits/chosen": -1.8747562170028687, + "logits/rejected": -1.8973026275634766, + "logps/chosen": -237.95907592773438, + "logps/rejected": -290.9625549316406, + "loss": 0.6665, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.492603302001953, + "rewards/margins": 1.7184562683105469, + "rewards/rejected": -5.211059093475342, + "step": 2411 + }, + { + "epoch": 0.5, + "learning_rate": 9.949579831932774e-06, + "logits/chosen": -2.222580909729004, + "logits/rejected": -1.9016811847686768, + "logps/chosen": -459.78436279296875, + "logps/rejected": -373.4814758300781, + "loss": 0.0571, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.050839900970459, + "rewards/margins": 6.049892425537109, + "rewards/rejected": -8.100732803344727, + "step": 2412 + }, + { + "epoch": 0.5, + "learning_rate": 9.945378151260506e-06, + "logits/chosen": -2.470627784729004, + "logits/rejected": -2.0995800495147705, + "logps/chosen": -447.10845947265625, + "logps/rejected": -454.21258544921875, + "loss": 0.2402, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.3339078426361084, + "rewards/margins": 4.221175193786621, + "rewards/rejected": -6.555083274841309, + "step": 2413 + }, + { + "epoch": 0.51, + "learning_rate": 9.941176470588236e-06, + "logits/chosen": -2.3919808864593506, + "logits/rejected": -2.0468430519104004, + "logps/chosen": -371.371826171875, + "logps/rejected": -364.661376953125, + "loss": 0.313, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9488651752471924, + "rewards/margins": 2.873868227005005, + "rewards/rejected": -4.822733402252197, + "step": 2414 + }, + { + "epoch": 0.51, + "learning_rate": 9.936974789915968e-06, + "logits/chosen": -2.2457568645477295, + "logits/rejected": -2.1089138984680176, + "logps/chosen": -354.2779235839844, + "logps/rejected": -371.3050537109375, + "loss": 0.5444, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.6733100414276123, + "rewards/margins": 3.0175583362579346, + "rewards/rejected": -5.690868377685547, + "step": 2415 + }, + { + "epoch": 0.51, + "learning_rate": 9.932773109243698e-06, + "logits/chosen": -2.256075382232666, + "logits/rejected": -2.0453333854675293, + "logps/chosen": -458.46246337890625, + "logps/rejected": -434.7677307128906, + "loss": 0.1436, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.0981826782226562, + "rewards/margins": 4.070631504058838, + "rewards/rejected": -6.168813705444336, + "step": 2416 + }, + { + "epoch": 0.51, + "learning_rate": 9.92857142857143e-06, + "logits/chosen": -1.832474946975708, + "logits/rejected": -1.751023292541504, + "logps/chosen": -319.69476318359375, + "logps/rejected": -306.7323913574219, + "loss": 0.2366, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.0703225135803223, + "rewards/margins": 4.127175807952881, + "rewards/rejected": -6.197497844696045, + "step": 2417 + }, + { + "epoch": 0.51, + "learning_rate": 9.92436974789916e-06, + "logits/chosen": -2.2268459796905518, + "logits/rejected": -1.9217164516448975, + "logps/chosen": -467.34344482421875, + "logps/rejected": -364.5041809082031, + "loss": 0.296, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.536264657974243, + "rewards/margins": 4.76158332824707, + "rewards/rejected": -7.297848224639893, + "step": 2418 + }, + { + "epoch": 0.51, + "learning_rate": 9.920168067226892e-06, + "logits/chosen": -2.0816447734832764, + "logits/rejected": -2.1499948501586914, + "logps/chosen": -249.6778564453125, + "logps/rejected": -323.81109619140625, + "loss": 0.5068, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.7292394638061523, + "rewards/margins": 3.4589426517486572, + "rewards/rejected": -7.1881818771362305, + "step": 2419 + }, + { + "epoch": 0.51, + "learning_rate": 9.915966386554622e-06, + "logits/chosen": -2.0341103076934814, + "logits/rejected": -2.121586799621582, + "logps/chosen": -341.7618408203125, + "logps/rejected": -384.7890625, + "loss": 0.1978, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.6588406562805176, + "rewards/margins": 4.138150691986084, + "rewards/rejected": -6.796991348266602, + "step": 2420 + }, + { + "epoch": 0.51, + "learning_rate": 9.911764705882354e-06, + "logits/chosen": -2.099214792251587, + "logits/rejected": -1.7684375047683716, + "logps/chosen": -242.392822265625, + "logps/rejected": -238.7559051513672, + "loss": 0.4174, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.2388548851013184, + "rewards/margins": 2.9281370639801025, + "rewards/rejected": -6.1669921875, + "step": 2421 + }, + { + "epoch": 0.51, + "learning_rate": 9.907563025210084e-06, + "logits/chosen": -2.173182964324951, + "logits/rejected": -1.9689698219299316, + "logps/chosen": -383.5847473144531, + "logps/rejected": -328.8962707519531, + "loss": 0.4517, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.834446907043457, + "rewards/margins": 3.36008358001709, + "rewards/rejected": -6.194530487060547, + "step": 2422 + }, + { + "epoch": 0.51, + "learning_rate": 9.903361344537816e-06, + "logits/chosen": -2.096619129180908, + "logits/rejected": -2.059051036834717, + "logps/chosen": -283.3281555175781, + "logps/rejected": -340.39385986328125, + "loss": 0.3868, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.169192314147949, + "rewards/margins": 3.4856536388397217, + "rewards/rejected": -6.654845714569092, + "step": 2423 + }, + { + "epoch": 0.51, + "learning_rate": 9.899159663865548e-06, + "logits/chosen": -2.155031681060791, + "logits/rejected": -1.4631612300872803, + "logps/chosen": -359.2854309082031, + "logps/rejected": -303.6133728027344, + "loss": 0.5978, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.564146041870117, + "rewards/margins": 1.9146242141723633, + "rewards/rejected": -5.4787702560424805, + "step": 2424 + }, + { + "epoch": 0.51, + "learning_rate": 9.894957983193278e-06, + "logits/chosen": -2.281752824783325, + "logits/rejected": -2.141045093536377, + "logps/chosen": -218.8125762939453, + "logps/rejected": -289.11956787109375, + "loss": 0.221, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.1416196823120117, + "rewards/margins": 4.813127517700195, + "rewards/rejected": -7.954747200012207, + "step": 2425 + }, + { + "epoch": 0.51, + "learning_rate": 9.89075630252101e-06, + "logits/chosen": -1.8661407232284546, + "logits/rejected": -1.8895190954208374, + "logps/chosen": -270.0294494628906, + "logps/rejected": -388.3438720703125, + "loss": 0.4309, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.1605801582336426, + "rewards/margins": 5.889432907104492, + "rewards/rejected": -8.050013542175293, + "step": 2426 + }, + { + "epoch": 0.51, + "learning_rate": 9.88655462184874e-06, + "logits/chosen": -2.2457845211029053, + "logits/rejected": -1.9798784255981445, + "logps/chosen": -413.43975830078125, + "logps/rejected": -605.51611328125, + "loss": 0.101, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1516494750976562, + "rewards/margins": 4.583868026733398, + "rewards/rejected": -6.735517501831055, + "step": 2427 + }, + { + "epoch": 0.51, + "learning_rate": 9.882352941176472e-06, + "logits/chosen": -2.276337146759033, + "logits/rejected": -2.0764236450195312, + "logps/chosen": -362.8184814453125, + "logps/rejected": -392.36920166015625, + "loss": 0.483, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.7045369148254395, + "rewards/margins": 2.9930124282836914, + "rewards/rejected": -5.697549819946289, + "step": 2428 + }, + { + "epoch": 0.51, + "learning_rate": 9.878151260504203e-06, + "logits/chosen": -2.273655414581299, + "logits/rejected": -2.3905067443847656, + "logps/chosen": -328.6600646972656, + "logps/rejected": -447.10546875, + "loss": 0.3746, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7527495622634888, + "rewards/margins": 3.2043819427490234, + "rewards/rejected": -4.957131385803223, + "step": 2429 + }, + { + "epoch": 0.51, + "learning_rate": 9.873949579831935e-06, + "logits/chosen": -2.25905442237854, + "logits/rejected": -1.72170889377594, + "logps/chosen": -392.6197509765625, + "logps/rejected": -355.248291015625, + "loss": 0.2448, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.880793571472168, + "rewards/margins": 4.852039337158203, + "rewards/rejected": -7.732832908630371, + "step": 2430 + }, + { + "epoch": 0.51, + "learning_rate": 9.869747899159665e-06, + "logits/chosen": -2.402531623840332, + "logits/rejected": -2.236377716064453, + "logps/chosen": -266.3785400390625, + "logps/rejected": -251.65174865722656, + "loss": 0.1707, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.1692585945129395, + "rewards/margins": 3.8515825271606445, + "rewards/rejected": -7.020841598510742, + "step": 2431 + }, + { + "epoch": 0.51, + "learning_rate": 9.865546218487397e-06, + "logits/chosen": -2.138195753097534, + "logits/rejected": -1.9108774662017822, + "logps/chosen": -423.58221435546875, + "logps/rejected": -396.9621276855469, + "loss": 0.352, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.975484848022461, + "rewards/margins": 4.312573432922363, + "rewards/rejected": -7.288058280944824, + "step": 2432 + }, + { + "epoch": 0.51, + "learning_rate": 9.861344537815127e-06, + "logits/chosen": -2.290330648422241, + "logits/rejected": -2.1706066131591797, + "logps/chosen": -307.01837158203125, + "logps/rejected": -342.9692687988281, + "loss": 0.5941, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.3528783321380615, + "rewards/margins": 2.6740097999572754, + "rewards/rejected": -6.026887893676758, + "step": 2433 + }, + { + "epoch": 0.51, + "learning_rate": 9.857142857142859e-06, + "logits/chosen": -2.1459391117095947, + "logits/rejected": -1.9871735572814941, + "logps/chosen": -356.656982421875, + "logps/rejected": -378.06744384765625, + "loss": 0.2507, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.0185086727142334, + "rewards/margins": 3.6278114318847656, + "rewards/rejected": -5.646320343017578, + "step": 2434 + }, + { + "epoch": 0.51, + "learning_rate": 9.852941176470589e-06, + "logits/chosen": -2.2394466400146484, + "logits/rejected": -2.0912163257598877, + "logps/chosen": -408.5823059082031, + "logps/rejected": -428.4944763183594, + "loss": 0.3981, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.22701096534729, + "rewards/margins": 2.8428955078125, + "rewards/rejected": -6.069906234741211, + "step": 2435 + }, + { + "epoch": 0.51, + "learning_rate": 9.848739495798321e-06, + "logits/chosen": -2.0506677627563477, + "logits/rejected": -2.067673683166504, + "logps/chosen": -326.36077880859375, + "logps/rejected": -365.61651611328125, + "loss": 0.7727, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.611337184906006, + "rewards/margins": 3.0270886421203613, + "rewards/rejected": -6.638426303863525, + "step": 2436 + }, + { + "epoch": 0.51, + "learning_rate": 9.844537815126051e-06, + "logits/chosen": -1.8925117254257202, + "logits/rejected": -2.0888848304748535, + "logps/chosen": -271.73089599609375, + "logps/rejected": -324.39678955078125, + "loss": 0.3358, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.850221633911133, + "rewards/margins": 4.2946953773498535, + "rewards/rejected": -7.1449174880981445, + "step": 2437 + }, + { + "epoch": 0.51, + "learning_rate": 9.840336134453781e-06, + "logits/chosen": -1.892627239227295, + "logits/rejected": -1.7988214492797852, + "logps/chosen": -171.96270751953125, + "logps/rejected": -243.96498107910156, + "loss": 0.3838, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.9922971725463867, + "rewards/margins": 3.47005033493042, + "rewards/rejected": -6.462347507476807, + "step": 2438 + }, + { + "epoch": 0.51, + "learning_rate": 9.836134453781513e-06, + "logits/chosen": -2.303834915161133, + "logits/rejected": -1.8925604820251465, + "logps/chosen": -377.1575012207031, + "logps/rejected": -363.5316162109375, + "loss": 0.8032, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.1350483894348145, + "rewards/margins": 2.6547577381134033, + "rewards/rejected": -5.789806365966797, + "step": 2439 + }, + { + "epoch": 0.51, + "learning_rate": 9.831932773109244e-06, + "logits/chosen": -1.9305191040039062, + "logits/rejected": -1.6555087566375732, + "logps/chosen": -329.98370361328125, + "logps/rejected": -361.0148620605469, + "loss": 0.6618, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.3163788318634033, + "rewards/margins": 4.039077281951904, + "rewards/rejected": -6.355456352233887, + "step": 2440 + }, + { + "epoch": 0.51, + "learning_rate": 9.827731092436975e-06, + "logits/chosen": -2.544266700744629, + "logits/rejected": -2.2024903297424316, + "logps/chosen": -339.9169616699219, + "logps/rejected": -298.70294189453125, + "loss": 0.4582, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.0411252975463867, + "rewards/margins": 2.788961887359619, + "rewards/rejected": -5.830087184906006, + "step": 2441 + }, + { + "epoch": 0.51, + "learning_rate": 9.823529411764706e-06, + "logits/chosen": -2.1996612548828125, + "logits/rejected": -1.7889305353164673, + "logps/chosen": -346.45660400390625, + "logps/rejected": -335.2572021484375, + "loss": 0.143, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0869550704956055, + "rewards/margins": 6.196144104003906, + "rewards/rejected": -8.283100128173828, + "step": 2442 + }, + { + "epoch": 0.51, + "learning_rate": 9.819327731092438e-06, + "logits/chosen": -2.2793169021606445, + "logits/rejected": -2.0673391819000244, + "logps/chosen": -366.3721008300781, + "logps/rejected": -389.57647705078125, + "loss": 0.3627, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.7804150581359863, + "rewards/margins": 4.010008811950684, + "rewards/rejected": -6.790424346923828, + "step": 2443 + }, + { + "epoch": 0.51, + "learning_rate": 9.815126050420168e-06, + "logits/chosen": -2.2194337844848633, + "logits/rejected": -1.9418132305145264, + "logps/chosen": -289.8243408203125, + "logps/rejected": -254.2372283935547, + "loss": 0.5406, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.4012985229492188, + "rewards/margins": 2.4547157287597656, + "rewards/rejected": -5.856014251708984, + "step": 2444 + }, + { + "epoch": 0.51, + "learning_rate": 9.8109243697479e-06, + "logits/chosen": -2.2629475593566895, + "logits/rejected": -1.9769794940948486, + "logps/chosen": -294.1553039550781, + "logps/rejected": -311.6893615722656, + "loss": 0.2439, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.181826114654541, + "rewards/margins": 3.803929328918457, + "rewards/rejected": -5.985755920410156, + "step": 2445 + }, + { + "epoch": 0.51, + "learning_rate": 9.80672268907563e-06, + "logits/chosen": -2.369563102722168, + "logits/rejected": -2.0142669677734375, + "logps/chosen": -324.5804443359375, + "logps/rejected": -452.1680908203125, + "loss": 0.268, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.2360963821411133, + "rewards/margins": 3.114483118057251, + "rewards/rejected": -6.350579738616943, + "step": 2446 + }, + { + "epoch": 0.51, + "learning_rate": 9.802521008403362e-06, + "logits/chosen": -1.959319829940796, + "logits/rejected": -2.3647871017456055, + "logps/chosen": -275.794677734375, + "logps/rejected": -394.02490234375, + "loss": 0.4332, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.740354061126709, + "rewards/margins": 3.1907262802124023, + "rewards/rejected": -5.9310808181762695, + "step": 2447 + }, + { + "epoch": 0.51, + "learning_rate": 9.798319327731092e-06, + "logits/chosen": -2.1668601036071777, + "logits/rejected": -2.0067825317382812, + "logps/chosen": -332.92303466796875, + "logps/rejected": -349.88104248046875, + "loss": 0.2853, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.146078109741211, + "rewards/margins": 4.368680000305176, + "rewards/rejected": -6.514758110046387, + "step": 2448 + }, + { + "epoch": 0.51, + "learning_rate": 9.794117647058824e-06, + "logits/chosen": -2.097109079360962, + "logits/rejected": -1.8445606231689453, + "logps/chosen": -287.12579345703125, + "logps/rejected": -224.05560302734375, + "loss": 0.3181, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.3764634132385254, + "rewards/margins": 3.5817036628723145, + "rewards/rejected": -6.958167552947998, + "step": 2449 + }, + { + "epoch": 0.51, + "learning_rate": 9.789915966386554e-06, + "logits/chosen": -1.8995410203933716, + "logits/rejected": -1.6100553274154663, + "logps/chosen": -328.62054443359375, + "logps/rejected": -350.05615234375, + "loss": 0.3648, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.5372843742370605, + "rewards/margins": 2.7280852794647217, + "rewards/rejected": -5.265369415283203, + "step": 2450 + }, + { + "epoch": 0.51, + "learning_rate": 9.785714285714286e-06, + "logits/chosen": -1.7688593864440918, + "logits/rejected": -1.6556869745254517, + "logps/chosen": -329.97637939453125, + "logps/rejected": -363.11309814453125, + "loss": 0.4768, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.1256723403930664, + "rewards/margins": 2.586132764816284, + "rewards/rejected": -5.711804389953613, + "step": 2451 + }, + { + "epoch": 0.51, + "learning_rate": 9.781512605042018e-06, + "logits/chosen": -2.0984439849853516, + "logits/rejected": -2.225480079650879, + "logps/chosen": -303.2366943359375, + "logps/rejected": -337.7622985839844, + "loss": 0.3028, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.2887487411499023, + "rewards/margins": 3.055553913116455, + "rewards/rejected": -6.344302654266357, + "step": 2452 + }, + { + "epoch": 0.51, + "learning_rate": 9.777310924369748e-06, + "logits/chosen": -2.042961597442627, + "logits/rejected": -2.081300735473633, + "logps/chosen": -350.177001953125, + "logps/rejected": -369.05224609375, + "loss": 0.221, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.5011491775512695, + "rewards/margins": 3.2595925331115723, + "rewards/rejected": -5.760741233825684, + "step": 2453 + }, + { + "epoch": 0.51, + "learning_rate": 9.77310924369748e-06, + "logits/chosen": -2.1722545623779297, + "logits/rejected": -1.8652329444885254, + "logps/chosen": -246.10723876953125, + "logps/rejected": -290.38055419921875, + "loss": 0.2364, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.429633617401123, + "rewards/margins": 4.2516632080078125, + "rewards/rejected": -6.6812968254089355, + "step": 2454 + }, + { + "epoch": 0.51, + "learning_rate": 9.76890756302521e-06, + "logits/chosen": -2.1108481884002686, + "logits/rejected": -1.9407432079315186, + "logps/chosen": -301.5434265136719, + "logps/rejected": -341.26556396484375, + "loss": 0.2757, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.6759743690490723, + "rewards/margins": 3.18408465385437, + "rewards/rejected": -5.860058784484863, + "step": 2455 + }, + { + "epoch": 0.51, + "learning_rate": 9.764705882352942e-06, + "logits/chosen": -1.7556816339492798, + "logits/rejected": -1.8322665691375732, + "logps/chosen": -225.53726196289062, + "logps/rejected": -284.1981201171875, + "loss": 0.3382, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.129570245742798, + "rewards/margins": 3.1095287799835205, + "rewards/rejected": -6.239099502563477, + "step": 2456 + }, + { + "epoch": 0.51, + "learning_rate": 9.760504201680673e-06, + "logits/chosen": -2.4053800106048584, + "logits/rejected": -1.751323938369751, + "logps/chosen": -290.16680908203125, + "logps/rejected": -318.2878112792969, + "loss": 0.1977, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.7551960945129395, + "rewards/margins": 3.770480155944824, + "rewards/rejected": -6.525676250457764, + "step": 2457 + }, + { + "epoch": 0.51, + "learning_rate": 9.756302521008404e-06, + "logits/chosen": -2.2911484241485596, + "logits/rejected": -2.147085666656494, + "logps/chosen": -267.47674560546875, + "logps/rejected": -262.7044982910156, + "loss": 0.2244, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8555421829223633, + "rewards/margins": 3.6998133659362793, + "rewards/rejected": -5.555356025695801, + "step": 2458 + }, + { + "epoch": 0.51, + "learning_rate": 9.752100840336135e-06, + "logits/chosen": -1.7450745105743408, + "logits/rejected": -1.8867847919464111, + "logps/chosen": -265.0449523925781, + "logps/rejected": -268.73773193359375, + "loss": 0.2069, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.2022788524627686, + "rewards/margins": 3.4072115421295166, + "rewards/rejected": -6.609490394592285, + "step": 2459 + }, + { + "epoch": 0.51, + "learning_rate": 9.747899159663867e-06, + "logits/chosen": -2.0996358394622803, + "logits/rejected": -2.2437381744384766, + "logps/chosen": -238.03005981445312, + "logps/rejected": -359.89202880859375, + "loss": 0.1595, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8500618934631348, + "rewards/margins": 3.956296443939209, + "rewards/rejected": -5.806358337402344, + "step": 2460 + }, + { + "epoch": 0.51, + "learning_rate": 9.743697478991597e-06, + "logits/chosen": -2.2528650760650635, + "logits/rejected": -2.020031452178955, + "logps/chosen": -397.2727966308594, + "logps/rejected": -303.8723449707031, + "loss": 0.2884, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.974136233329773, + "rewards/margins": 3.6831774711608887, + "rewards/rejected": -5.657313823699951, + "step": 2461 + }, + { + "epoch": 0.52, + "learning_rate": 9.739495798319329e-06, + "logits/chosen": -2.1542177200317383, + "logits/rejected": -2.0130393505096436, + "logps/chosen": -416.8458251953125, + "logps/rejected": -399.8071594238281, + "loss": 0.2241, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.472114086151123, + "rewards/margins": 3.7513413429260254, + "rewards/rejected": -6.223455905914307, + "step": 2462 + }, + { + "epoch": 0.52, + "learning_rate": 9.735294117647059e-06, + "logits/chosen": -2.1159396171569824, + "logits/rejected": -2.096940279006958, + "logps/chosen": -392.9292907714844, + "logps/rejected": -406.60382080078125, + "loss": 0.1088, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2731056213378906, + "rewards/margins": 5.1493120193481445, + "rewards/rejected": -7.422417640686035, + "step": 2463 + }, + { + "epoch": 0.52, + "learning_rate": 9.731092436974791e-06, + "logits/chosen": -2.410989284515381, + "logits/rejected": -2.2994556427001953, + "logps/chosen": -355.16552734375, + "logps/rejected": -338.94647216796875, + "loss": 0.3553, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.0237691402435303, + "rewards/margins": 3.965956211090088, + "rewards/rejected": -6.989725112915039, + "step": 2464 + }, + { + "epoch": 0.52, + "learning_rate": 9.726890756302521e-06, + "logits/chosen": -2.0583953857421875, + "logits/rejected": -1.7986162900924683, + "logps/chosen": -287.96136474609375, + "logps/rejected": -308.815185546875, + "loss": 0.1745, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.40604829788208, + "rewards/margins": 4.625705718994141, + "rewards/rejected": -7.031754016876221, + "step": 2465 + }, + { + "epoch": 0.52, + "learning_rate": 9.722689075630253e-06, + "logits/chosen": -2.1780307292938232, + "logits/rejected": -2.0415542125701904, + "logps/chosen": -256.6803894042969, + "logps/rejected": -253.82778930664062, + "loss": 0.3768, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.1402909755706787, + "rewards/margins": 3.306736946105957, + "rewards/rejected": -6.447027206420898, + "step": 2466 + }, + { + "epoch": 0.52, + "learning_rate": 9.718487394957983e-06, + "logits/chosen": -2.262104034423828, + "logits/rejected": -2.003505229949951, + "logps/chosen": -424.62738037109375, + "logps/rejected": -354.9641418457031, + "loss": 0.8189, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.7076375484466553, + "rewards/margins": 1.9948680400848389, + "rewards/rejected": -5.702506065368652, + "step": 2467 + }, + { + "epoch": 0.52, + "learning_rate": 9.714285714285715e-06, + "logits/chosen": -2.070359468460083, + "logits/rejected": -2.31072735786438, + "logps/chosen": -263.1658935546875, + "logps/rejected": -297.6817321777344, + "loss": 0.251, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.321218729019165, + "rewards/margins": 5.258805751800537, + "rewards/rejected": -7.580024719238281, + "step": 2468 + }, + { + "epoch": 0.52, + "learning_rate": 9.710084033613445e-06, + "logits/chosen": -2.224356174468994, + "logits/rejected": -1.8325464725494385, + "logps/chosen": -322.0057678222656, + "logps/rejected": -346.1938171386719, + "loss": 0.161, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.505855083465576, + "rewards/margins": 4.101947784423828, + "rewards/rejected": -6.6078033447265625, + "step": 2469 + }, + { + "epoch": 0.52, + "learning_rate": 9.705882352941177e-06, + "logits/chosen": -2.0635294914245605, + "logits/rejected": -2.2067527770996094, + "logps/chosen": -216.92633056640625, + "logps/rejected": -264.6098937988281, + "loss": 0.2893, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.2246415615081787, + "rewards/margins": 3.6159889698028564, + "rewards/rejected": -5.840631008148193, + "step": 2470 + }, + { + "epoch": 0.52, + "learning_rate": 9.701680672268908e-06, + "logits/chosen": -2.1970443725585938, + "logits/rejected": -1.5998048782348633, + "logps/chosen": -349.5067138671875, + "logps/rejected": -348.56768798828125, + "loss": 0.2665, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.670179843902588, + "rewards/margins": 3.6221532821655273, + "rewards/rejected": -6.292333126068115, + "step": 2471 + }, + { + "epoch": 0.52, + "learning_rate": 9.69747899159664e-06, + "logits/chosen": -2.314962863922119, + "logits/rejected": -1.5359362363815308, + "logps/chosen": -375.8669128417969, + "logps/rejected": -272.38360595703125, + "loss": 0.2938, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.750781774520874, + "rewards/margins": 3.455026626586914, + "rewards/rejected": -6.205808639526367, + "step": 2472 + }, + { + "epoch": 0.52, + "learning_rate": 9.693277310924371e-06, + "logits/chosen": -1.9978368282318115, + "logits/rejected": -1.9219955205917358, + "logps/chosen": -291.6861877441406, + "logps/rejected": -295.47235107421875, + "loss": 0.5612, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.6381125450134277, + "rewards/margins": 2.6202173233032227, + "rewards/rejected": -5.25832986831665, + "step": 2473 + }, + { + "epoch": 0.52, + "learning_rate": 9.689075630252102e-06, + "logits/chosen": -1.9236409664154053, + "logits/rejected": -1.6505722999572754, + "logps/chosen": -319.92327880859375, + "logps/rejected": -256.6158142089844, + "loss": 0.3467, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.785343885421753, + "rewards/margins": 3.375783920288086, + "rewards/rejected": -6.161128044128418, + "step": 2474 + }, + { + "epoch": 0.52, + "learning_rate": 9.684873949579834e-06, + "logits/chosen": -2.1630935668945312, + "logits/rejected": -1.744858980178833, + "logps/chosen": -374.8528747558594, + "logps/rejected": -353.58740234375, + "loss": 0.6678, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.1705503463745117, + "rewards/margins": 3.0533413887023926, + "rewards/rejected": -6.223891258239746, + "step": 2475 + }, + { + "epoch": 0.52, + "learning_rate": 9.680672268907564e-06, + "logits/chosen": -1.9848287105560303, + "logits/rejected": -1.430161476135254, + "logps/chosen": -381.407958984375, + "logps/rejected": -290.5611267089844, + "loss": 0.1705, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.1242594718933105, + "rewards/margins": 3.6427717208862305, + "rewards/rejected": -6.767031669616699, + "step": 2476 + }, + { + "epoch": 0.52, + "learning_rate": 9.676470588235296e-06, + "logits/chosen": -2.0961620807647705, + "logits/rejected": -1.940819501876831, + "logps/chosen": -348.21270751953125, + "logps/rejected": -337.52410888671875, + "loss": 0.3089, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.33778715133667, + "rewards/margins": 4.036999225616455, + "rewards/rejected": -6.374786376953125, + "step": 2477 + }, + { + "epoch": 0.52, + "learning_rate": 9.672268907563026e-06, + "logits/chosen": -2.1644392013549805, + "logits/rejected": -1.6128807067871094, + "logps/chosen": -461.83074951171875, + "logps/rejected": -383.1968994140625, + "loss": 0.245, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3617360591888428, + "rewards/margins": 2.6947131156921387, + "rewards/rejected": -5.056448936462402, + "step": 2478 + }, + { + "epoch": 0.52, + "learning_rate": 9.668067226890758e-06, + "logits/chosen": -1.943839430809021, + "logits/rejected": -1.903981328010559, + "logps/chosen": -378.90447998046875, + "logps/rejected": -447.855224609375, + "loss": 0.259, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.8508312702178955, + "rewards/margins": 3.9421370029449463, + "rewards/rejected": -6.792968273162842, + "step": 2479 + }, + { + "epoch": 0.52, + "learning_rate": 9.663865546218488e-06, + "logits/chosen": -2.2937865257263184, + "logits/rejected": -2.1340508460998535, + "logps/chosen": -401.56903076171875, + "logps/rejected": -331.7358093261719, + "loss": 0.1174, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.3423702716827393, + "rewards/margins": 4.279393196105957, + "rewards/rejected": -6.621763229370117, + "step": 2480 + }, + { + "epoch": 0.52, + "learning_rate": 9.65966386554622e-06, + "logits/chosen": -2.031053066253662, + "logits/rejected": -1.5821595191955566, + "logps/chosen": -347.1388854980469, + "logps/rejected": -314.3232421875, + "loss": 0.1265, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.601536750793457, + "rewards/margins": 4.141270637512207, + "rewards/rejected": -6.742807388305664, + "step": 2481 + }, + { + "epoch": 0.52, + "learning_rate": 9.65546218487395e-06, + "logits/chosen": -2.2985172271728516, + "logits/rejected": -2.019052028656006, + "logps/chosen": -431.980712890625, + "logps/rejected": -340.51171875, + "loss": 0.1518, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3010196685791016, + "rewards/margins": 3.675325870513916, + "rewards/rejected": -5.976345539093018, + "step": 2482 + }, + { + "epoch": 0.52, + "learning_rate": 9.651260504201682e-06, + "logits/chosen": -1.8462460041046143, + "logits/rejected": -2.1538021564483643, + "logps/chosen": -192.8182830810547, + "logps/rejected": -307.1131591796875, + "loss": 0.2123, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.861116886138916, + "rewards/margins": 3.683250904083252, + "rewards/rejected": -6.544367790222168, + "step": 2483 + }, + { + "epoch": 0.52, + "learning_rate": 9.647058823529412e-06, + "logits/chosen": -2.170926570892334, + "logits/rejected": -2.0191023349761963, + "logps/chosen": -426.21856689453125, + "logps/rejected": -409.97418212890625, + "loss": 0.6966, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.1404061317443848, + "rewards/margins": 2.4268946647644043, + "rewards/rejected": -5.567300796508789, + "step": 2484 + }, + { + "epoch": 0.52, + "learning_rate": 9.642857142857144e-06, + "logits/chosen": -2.3636703491210938, + "logits/rejected": -2.213817834854126, + "logps/chosen": -505.3054504394531, + "logps/rejected": -388.56536865234375, + "loss": 0.2744, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8712806701660156, + "rewards/margins": 3.5582900047302246, + "rewards/rejected": -5.429571151733398, + "step": 2485 + }, + { + "epoch": 0.52, + "learning_rate": 9.638655462184874e-06, + "logits/chosen": -2.345541000366211, + "logits/rejected": -2.1253554821014404, + "logps/chosen": -460.6683044433594, + "logps/rejected": -458.9995422363281, + "loss": 0.5745, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.807734251022339, + "rewards/margins": 1.8625736236572266, + "rewards/rejected": -5.670307636260986, + "step": 2486 + }, + { + "epoch": 0.52, + "learning_rate": 9.634453781512606e-06, + "logits/chosen": -2.132876396179199, + "logits/rejected": -2.0411922931671143, + "logps/chosen": -315.79840087890625, + "logps/rejected": -297.36163330078125, + "loss": 0.1376, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6191868782043457, + "rewards/margins": 3.9339449405670166, + "rewards/rejected": -5.553132057189941, + "step": 2487 + }, + { + "epoch": 0.52, + "learning_rate": 9.630252100840337e-06, + "logits/chosen": -2.327385187149048, + "logits/rejected": -1.8433102369308472, + "logps/chosen": -357.0150146484375, + "logps/rejected": -343.45794677734375, + "loss": 0.2, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.4687323570251465, + "rewards/margins": 3.935394287109375, + "rewards/rejected": -6.40412712097168, + "step": 2488 + }, + { + "epoch": 0.52, + "learning_rate": 9.626050420168068e-06, + "logits/chosen": -2.3436050415039062, + "logits/rejected": -1.8886823654174805, + "logps/chosen": -376.2164306640625, + "logps/rejected": -400.1048583984375, + "loss": 0.1636, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.5092034339904785, + "rewards/margins": 4.286349296569824, + "rewards/rejected": -6.795552730560303, + "step": 2489 + }, + { + "epoch": 0.52, + "learning_rate": 9.621848739495799e-06, + "logits/chosen": -2.188086748123169, + "logits/rejected": -1.9260979890823364, + "logps/chosen": -425.3425598144531, + "logps/rejected": -377.23675537109375, + "loss": 0.4469, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.410393476486206, + "rewards/margins": 3.28798508644104, + "rewards/rejected": -5.698378562927246, + "step": 2490 + }, + { + "epoch": 0.52, + "learning_rate": 9.61764705882353e-06, + "logits/chosen": -2.1028683185577393, + "logits/rejected": -1.5959678888320923, + "logps/chosen": -350.1116027832031, + "logps/rejected": -383.0577392578125, + "loss": 0.2312, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.581228733062744, + "rewards/margins": 4.88942813873291, + "rewards/rejected": -7.470656871795654, + "step": 2491 + }, + { + "epoch": 0.52, + "learning_rate": 9.61344537815126e-06, + "logits/chosen": -2.0289759635925293, + "logits/rejected": -2.2293808460235596, + "logps/chosen": -261.5367736816406, + "logps/rejected": -360.9933776855469, + "loss": 0.1301, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9632079601287842, + "rewards/margins": 5.21551513671875, + "rewards/rejected": -7.178723335266113, + "step": 2492 + }, + { + "epoch": 0.52, + "learning_rate": 9.609243697478993e-06, + "logits/chosen": -2.3193390369415283, + "logits/rejected": -2.1731221675872803, + "logps/chosen": -246.1613311767578, + "logps/rejected": -234.50965881347656, + "loss": 0.4198, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.6315531730651855, + "rewards/margins": 3.3104610443115234, + "rewards/rejected": -5.942013740539551, + "step": 2493 + }, + { + "epoch": 0.52, + "learning_rate": 9.605042016806723e-06, + "logits/chosen": -1.9105511903762817, + "logits/rejected": -2.056098461151123, + "logps/chosen": -377.2398986816406, + "logps/rejected": -371.1231384277344, + "loss": 0.4123, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.841477394104004, + "rewards/margins": 2.861736297607422, + "rewards/rejected": -4.703213691711426, + "step": 2494 + }, + { + "epoch": 0.52, + "learning_rate": 9.600840336134455e-06, + "logits/chosen": -2.2975151538848877, + "logits/rejected": -2.175659418106079, + "logps/chosen": -319.239990234375, + "logps/rejected": -306.5977783203125, + "loss": 0.4597, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.4825851917266846, + "rewards/margins": 2.254281997680664, + "rewards/rejected": -4.7368669509887695, + "step": 2495 + }, + { + "epoch": 0.52, + "learning_rate": 9.596638655462187e-06, + "logits/chosen": -2.369351387023926, + "logits/rejected": -2.026219606399536, + "logps/chosen": -431.4033508300781, + "logps/rejected": -322.7351379394531, + "loss": 0.1122, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7852988243103027, + "rewards/margins": 4.616962432861328, + "rewards/rejected": -6.402260780334473, + "step": 2496 + }, + { + "epoch": 0.52, + "learning_rate": 9.592436974789917e-06, + "logits/chosen": -2.0997745990753174, + "logits/rejected": -1.7317702770233154, + "logps/chosen": -433.49139404296875, + "logps/rejected": -342.0841064453125, + "loss": 0.2304, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.3035593032836914, + "rewards/margins": 3.2437310218811035, + "rewards/rejected": -5.547290325164795, + "step": 2497 + }, + { + "epoch": 0.52, + "learning_rate": 9.588235294117649e-06, + "logits/chosen": -2.0126049518585205, + "logits/rejected": -2.014080047607422, + "logps/chosen": -277.796142578125, + "logps/rejected": -271.4325256347656, + "loss": 0.4414, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.6962857246398926, + "rewards/margins": 3.405818462371826, + "rewards/rejected": -6.102104187011719, + "step": 2498 + }, + { + "epoch": 0.52, + "learning_rate": 9.584033613445379e-06, + "logits/chosen": -2.1265807151794434, + "logits/rejected": -2.022434711456299, + "logps/chosen": -316.7300109863281, + "logps/rejected": -390.12530517578125, + "loss": 0.1293, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.51875638961792, + "rewards/margins": 5.055368900299072, + "rewards/rejected": -7.574125289916992, + "step": 2499 + }, + { + "epoch": 0.52, + "learning_rate": 9.579831932773111e-06, + "logits/chosen": -2.3787965774536133, + "logits/rejected": -1.7455025911331177, + "logps/chosen": -407.04150390625, + "logps/rejected": -344.09625244140625, + "loss": 0.3443, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.2838099002838135, + "rewards/margins": 3.2583203315734863, + "rewards/rejected": -5.542130470275879, + "step": 2500 + }, + { + "epoch": 0.52, + "learning_rate": 9.575630252100841e-06, + "logits/chosen": -2.4119229316711426, + "logits/rejected": -2.168670177459717, + "logps/chosen": -293.9447937011719, + "logps/rejected": -282.108642578125, + "loss": 0.1043, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.0820729732513428, + "rewards/margins": 4.6340250968933105, + "rewards/rejected": -6.716097831726074, + "step": 2501 + }, + { + "epoch": 0.52, + "learning_rate": 9.571428571428573e-06, + "logits/chosen": -2.0610527992248535, + "logits/rejected": -2.028764486312866, + "logps/chosen": -392.11962890625, + "logps/rejected": -350.20660400390625, + "loss": 0.4263, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.5539908409118652, + "rewards/margins": 2.3492860794067383, + "rewards/rejected": -4.9032769203186035, + "step": 2502 + }, + { + "epoch": 0.52, + "learning_rate": 9.567226890756303e-06, + "logits/chosen": -2.3153438568115234, + "logits/rejected": -2.0979604721069336, + "logps/chosen": -268.85833740234375, + "logps/rejected": -279.8310852050781, + "loss": 0.103, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7873706817626953, + "rewards/margins": 5.002776622772217, + "rewards/rejected": -7.79014778137207, + "step": 2503 + }, + { + "epoch": 0.52, + "learning_rate": 9.563025210084035e-06, + "logits/chosen": -1.9433951377868652, + "logits/rejected": -1.827419400215149, + "logps/chosen": -290.7541198730469, + "logps/rejected": -373.6928405761719, + "loss": 0.406, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.4459328651428223, + "rewards/margins": 4.008621692657471, + "rewards/rejected": -7.454555034637451, + "step": 2504 + }, + { + "epoch": 0.52, + "learning_rate": 9.558823529411766e-06, + "logits/chosen": -2.309584140777588, + "logits/rejected": -1.8415096998214722, + "logps/chosen": -335.1734924316406, + "logps/rejected": -389.43701171875, + "loss": 0.3345, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.6883304119110107, + "rewards/margins": 3.4640326499938965, + "rewards/rejected": -6.152363300323486, + "step": 2505 + }, + { + "epoch": 0.52, + "learning_rate": 9.554621848739497e-06, + "logits/chosen": -2.209505558013916, + "logits/rejected": -1.7459771633148193, + "logps/chosen": -386.4450988769531, + "logps/rejected": -244.0200958251953, + "loss": 0.6366, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.361931085586548, + "rewards/margins": 2.834162712097168, + "rewards/rejected": -5.196094036102295, + "step": 2506 + }, + { + "epoch": 0.52, + "learning_rate": 9.550420168067228e-06, + "logits/chosen": -2.232476234436035, + "logits/rejected": -2.1178627014160156, + "logps/chosen": -414.0107421875, + "logps/rejected": -382.9124755859375, + "loss": 0.2299, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.2530810832977295, + "rewards/margins": 4.126534461975098, + "rewards/rejected": -6.379615783691406, + "step": 2507 + }, + { + "epoch": 0.52, + "learning_rate": 9.54621848739496e-06, + "logits/chosen": -2.2799813747406006, + "logits/rejected": -1.9662835597991943, + "logps/chosen": -266.8333435058594, + "logps/rejected": -234.3785400390625, + "loss": 0.2949, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.3078579902648926, + "rewards/margins": 3.4136760234832764, + "rewards/rejected": -5.721534252166748, + "step": 2508 + }, + { + "epoch": 0.52, + "learning_rate": 9.54201680672269e-06, + "logits/chosen": -2.110401153564453, + "logits/rejected": -1.8340482711791992, + "logps/chosen": -352.2585754394531, + "logps/rejected": -404.18145751953125, + "loss": 0.2744, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9760286808013916, + "rewards/margins": 3.059877634048462, + "rewards/rejected": -5.0359063148498535, + "step": 2509 + }, + { + "epoch": 0.53, + "learning_rate": 9.537815126050422e-06, + "logits/chosen": -2.256673574447632, + "logits/rejected": -1.7906372547149658, + "logps/chosen": -450.4818420410156, + "logps/rejected": -350.98626708984375, + "loss": 0.4995, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.6848156452178955, + "rewards/margins": 3.273085594177246, + "rewards/rejected": -5.957901954650879, + "step": 2510 + }, + { + "epoch": 0.53, + "learning_rate": 9.533613445378152e-06, + "logits/chosen": -2.168178081512451, + "logits/rejected": -1.7355785369873047, + "logps/chosen": -388.55828857421875, + "logps/rejected": -332.3291320800781, + "loss": 0.0916, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4317495822906494, + "rewards/margins": 4.800617218017578, + "rewards/rejected": -6.232367038726807, + "step": 2511 + }, + { + "epoch": 0.53, + "learning_rate": 9.529411764705882e-06, + "logits/chosen": -2.5559027194976807, + "logits/rejected": -2.1007933616638184, + "logps/chosen": -310.33428955078125, + "logps/rejected": -284.7672424316406, + "loss": 0.3639, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.927351236343384, + "rewards/margins": 2.579256296157837, + "rewards/rejected": -5.506607532501221, + "step": 2512 + }, + { + "epoch": 0.53, + "learning_rate": 9.525210084033614e-06, + "logits/chosen": -2.0332939624786377, + "logits/rejected": -1.690711259841919, + "logps/chosen": -327.4412536621094, + "logps/rejected": -327.41357421875, + "loss": 0.3181, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.3663837909698486, + "rewards/margins": 3.2649383544921875, + "rewards/rejected": -6.631321907043457, + "step": 2513 + }, + { + "epoch": 0.53, + "learning_rate": 9.521008403361344e-06, + "logits/chosen": -1.9128518104553223, + "logits/rejected": -2.0001280307769775, + "logps/chosen": -430.49176025390625, + "logps/rejected": -585.4301147460938, + "loss": 0.1853, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.566788673400879, + "rewards/margins": 3.837230682373047, + "rewards/rejected": -6.404019355773926, + "step": 2514 + }, + { + "epoch": 0.53, + "learning_rate": 9.516806722689076e-06, + "logits/chosen": -2.2985007762908936, + "logits/rejected": -2.203998565673828, + "logps/chosen": -226.50936889648438, + "logps/rejected": -280.779296875, + "loss": 0.3349, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.4485068321228027, + "rewards/margins": 2.9634742736816406, + "rewards/rejected": -6.411981582641602, + "step": 2515 + }, + { + "epoch": 0.53, + "learning_rate": 9.512605042016806e-06, + "logits/chosen": -1.8921406269073486, + "logits/rejected": -2.1687142848968506, + "logps/chosen": -306.92303466796875, + "logps/rejected": -409.34918212890625, + "loss": 0.2676, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.7105960845947266, + "rewards/margins": 4.616537094116211, + "rewards/rejected": -7.327133655548096, + "step": 2516 + }, + { + "epoch": 0.53, + "learning_rate": 9.508403361344538e-06, + "logits/chosen": -2.0976529121398926, + "logits/rejected": -1.7766838073730469, + "logps/chosen": -280.36126708984375, + "logps/rejected": -257.9566345214844, + "loss": 0.1702, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.2920262813568115, + "rewards/margins": 3.9331674575805664, + "rewards/rejected": -6.225193977355957, + "step": 2517 + }, + { + "epoch": 0.53, + "learning_rate": 9.504201680672269e-06, + "logits/chosen": -2.11857008934021, + "logits/rejected": -1.8373630046844482, + "logps/chosen": -387.91180419921875, + "logps/rejected": -446.870849609375, + "loss": 0.0336, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7399072647094727, + "rewards/margins": 5.450682163238525, + "rewards/rejected": -7.190589904785156, + "step": 2518 + }, + { + "epoch": 0.53, + "learning_rate": 9.5e-06, + "logits/chosen": -1.8230106830596924, + "logits/rejected": -1.8141568899154663, + "logps/chosen": -303.5601806640625, + "logps/rejected": -322.4443359375, + "loss": 0.2569, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.457347869873047, + "rewards/margins": 4.088535785675049, + "rewards/rejected": -6.545883655548096, + "step": 2519 + }, + { + "epoch": 0.53, + "learning_rate": 9.49579831932773e-06, + "logits/chosen": -2.058150291442871, + "logits/rejected": -2.3269007205963135, + "logps/chosen": -196.10536193847656, + "logps/rejected": -246.83120727539062, + "loss": 0.4088, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.1146841049194336, + "rewards/margins": 3.557880401611328, + "rewards/rejected": -6.672564506530762, + "step": 2520 + }, + { + "epoch": 0.53, + "learning_rate": 9.491596638655463e-06, + "logits/chosen": -2.2051827907562256, + "logits/rejected": -1.6986697912216187, + "logps/chosen": -355.34686279296875, + "logps/rejected": -291.80450439453125, + "loss": 0.108, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.5224666595458984, + "rewards/margins": 5.002082347869873, + "rewards/rejected": -7.5245490074157715, + "step": 2521 + }, + { + "epoch": 0.53, + "learning_rate": 9.487394957983193e-06, + "logits/chosen": -2.323888063430786, + "logits/rejected": -2.148203134536743, + "logps/chosen": -284.5354919433594, + "logps/rejected": -283.0790710449219, + "loss": 0.2816, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.6046156883239746, + "rewards/margins": 3.734713077545166, + "rewards/rejected": -6.339328765869141, + "step": 2522 + }, + { + "epoch": 0.53, + "learning_rate": 9.483193277310925e-06, + "logits/chosen": -2.112504720687866, + "logits/rejected": -1.9059298038482666, + "logps/chosen": -387.5921630859375, + "logps/rejected": -358.6956787109375, + "loss": 0.227, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.621278762817383, + "rewards/margins": 4.938642501831055, + "rewards/rejected": -7.5599212646484375, + "step": 2523 + }, + { + "epoch": 0.53, + "learning_rate": 9.478991596638657e-06, + "logits/chosen": -2.171133518218994, + "logits/rejected": -1.8824927806854248, + "logps/chosen": -400.78460693359375, + "logps/rejected": -373.2786865234375, + "loss": 0.1441, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8059349060058594, + "rewards/margins": 4.005174160003662, + "rewards/rejected": -6.81110954284668, + "step": 2524 + }, + { + "epoch": 0.53, + "learning_rate": 9.474789915966387e-06, + "logits/chosen": -2.111607074737549, + "logits/rejected": -1.998233437538147, + "logps/chosen": -319.780029296875, + "logps/rejected": -297.7987976074219, + "loss": 0.4915, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.224074363708496, + "rewards/margins": 3.958573818206787, + "rewards/rejected": -7.182647705078125, + "step": 2525 + }, + { + "epoch": 0.53, + "learning_rate": 9.470588235294119e-06, + "logits/chosen": -2.301401376724243, + "logits/rejected": -2.1560215950012207, + "logps/chosen": -325.9587707519531, + "logps/rejected": -343.59716796875, + "loss": 0.1024, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.68658447265625, + "rewards/margins": 5.390464782714844, + "rewards/rejected": -8.077049255371094, + "step": 2526 + }, + { + "epoch": 0.53, + "learning_rate": 9.466386554621849e-06, + "logits/chosen": -2.146578311920166, + "logits/rejected": -1.9742950201034546, + "logps/chosen": -465.35198974609375, + "logps/rejected": -409.40875244140625, + "loss": 0.232, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.1047110557556152, + "rewards/margins": 3.148139476776123, + "rewards/rejected": -6.252850532531738, + "step": 2527 + }, + { + "epoch": 0.53, + "learning_rate": 9.462184873949581e-06, + "logits/chosen": -2.2264466285705566, + "logits/rejected": -1.9232317209243774, + "logps/chosen": -436.72650146484375, + "logps/rejected": -376.5250244140625, + "loss": 0.3923, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.417790651321411, + "rewards/margins": 3.1051182746887207, + "rewards/rejected": -6.522909164428711, + "step": 2528 + }, + { + "epoch": 0.53, + "learning_rate": 9.457983193277311e-06, + "logits/chosen": -2.1570754051208496, + "logits/rejected": -1.8366494178771973, + "logps/chosen": -342.090576171875, + "logps/rejected": -323.77850341796875, + "loss": 0.4182, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.4511475563049316, + "rewards/margins": 3.0063416957855225, + "rewards/rejected": -6.457489013671875, + "step": 2529 + }, + { + "epoch": 0.53, + "learning_rate": 9.453781512605043e-06, + "logits/chosen": -2.2689995765686035, + "logits/rejected": -2.2033443450927734, + "logps/chosen": -342.376953125, + "logps/rejected": -363.0454406738281, + "loss": 0.3438, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.188932418823242, + "rewards/margins": 4.162549018859863, + "rewards/rejected": -7.3514814376831055, + "step": 2530 + }, + { + "epoch": 0.53, + "learning_rate": 9.449579831932773e-06, + "logits/chosen": -2.2346906661987305, + "logits/rejected": -2.1344492435455322, + "logps/chosen": -438.34271240234375, + "logps/rejected": -363.677734375, + "loss": 0.1747, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.8356618881225586, + "rewards/margins": 3.488903522491455, + "rewards/rejected": -6.324565410614014, + "step": 2531 + }, + { + "epoch": 0.53, + "learning_rate": 9.445378151260505e-06, + "logits/chosen": -2.234792470932007, + "logits/rejected": -2.0919010639190674, + "logps/chosen": -281.65472412109375, + "logps/rejected": -278.4136657714844, + "loss": 0.17, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.791999340057373, + "rewards/margins": 4.423157691955566, + "rewards/rejected": -7.2151570320129395, + "step": 2532 + }, + { + "epoch": 0.53, + "learning_rate": 9.441176470588235e-06, + "logits/chosen": -2.1052494049072266, + "logits/rejected": -2.055840492248535, + "logps/chosen": -285.5604248046875, + "logps/rejected": -355.980224609375, + "loss": 0.1288, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.395705461502075, + "rewards/margins": 4.4319915771484375, + "rewards/rejected": -7.827697277069092, + "step": 2533 + }, + { + "epoch": 0.53, + "learning_rate": 9.436974789915967e-06, + "logits/chosen": -2.3428354263305664, + "logits/rejected": -2.0979807376861572, + "logps/chosen": -394.2915344238281, + "logps/rejected": -431.5267028808594, + "loss": 0.4827, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.2202205657958984, + "rewards/margins": 4.644374370574951, + "rewards/rejected": -7.864595413208008, + "step": 2534 + }, + { + "epoch": 0.53, + "learning_rate": 9.432773109243698e-06, + "logits/chosen": -2.5848116874694824, + "logits/rejected": -2.0564680099487305, + "logps/chosen": -375.3398742675781, + "logps/rejected": -338.7669677734375, + "loss": 0.337, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.3333778381347656, + "rewards/margins": 3.6800408363342285, + "rewards/rejected": -6.013419151306152, + "step": 2535 + }, + { + "epoch": 0.53, + "learning_rate": 9.42857142857143e-06, + "logits/chosen": -2.174773931503296, + "logits/rejected": -1.9295463562011719, + "logps/chosen": -380.8409729003906, + "logps/rejected": -344.0479736328125, + "loss": 0.0953, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2969679832458496, + "rewards/margins": 4.161410331726074, + "rewards/rejected": -6.458378314971924, + "step": 2536 + }, + { + "epoch": 0.53, + "learning_rate": 9.42436974789916e-06, + "logits/chosen": -2.273197889328003, + "logits/rejected": -2.0794014930725098, + "logps/chosen": -382.24139404296875, + "logps/rejected": -367.83453369140625, + "loss": 0.1858, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.1326944828033447, + "rewards/margins": 3.704744338989258, + "rewards/rejected": -6.837438583374023, + "step": 2537 + }, + { + "epoch": 0.53, + "learning_rate": 9.420168067226892e-06, + "logits/chosen": -2.1383862495422363, + "logits/rejected": -1.583884835243225, + "logps/chosen": -423.390380859375, + "logps/rejected": -338.5177917480469, + "loss": 0.647, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.6184678077697754, + "rewards/margins": 3.1192736625671387, + "rewards/rejected": -6.737740993499756, + "step": 2538 + }, + { + "epoch": 0.53, + "learning_rate": 9.415966386554622e-06, + "logits/chosen": -2.1102585792541504, + "logits/rejected": -2.008113384246826, + "logps/chosen": -236.2758331298828, + "logps/rejected": -438.6126708984375, + "loss": 0.1283, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.6426756381988525, + "rewards/margins": 4.893115043640137, + "rewards/rejected": -8.53579044342041, + "step": 2539 + }, + { + "epoch": 0.53, + "learning_rate": 9.411764705882354e-06, + "logits/chosen": -1.9974117279052734, + "logits/rejected": -1.9260965585708618, + "logps/chosen": -332.7476501464844, + "logps/rejected": -316.2168273925781, + "loss": 0.2161, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.078606605529785, + "rewards/margins": 3.4742813110351562, + "rewards/rejected": -6.552887916564941, + "step": 2540 + }, + { + "epoch": 0.53, + "learning_rate": 9.407563025210084e-06, + "logits/chosen": -2.1985411643981934, + "logits/rejected": -2.1689884662628174, + "logps/chosen": -473.10382080078125, + "logps/rejected": -372.552001953125, + "loss": 0.1044, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.685713768005371, + "rewards/margins": 5.397963047027588, + "rewards/rejected": -8.083677291870117, + "step": 2541 + }, + { + "epoch": 0.53, + "learning_rate": 9.403361344537816e-06, + "logits/chosen": -2.0937719345092773, + "logits/rejected": -2.144117593765259, + "logps/chosen": -257.310546875, + "logps/rejected": -354.2978210449219, + "loss": 0.1695, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.1185336112976074, + "rewards/margins": 4.972678184509277, + "rewards/rejected": -8.091211318969727, + "step": 2542 + }, + { + "epoch": 0.53, + "learning_rate": 9.399159663865546e-06, + "logits/chosen": -1.9987530708312988, + "logits/rejected": -2.2044856548309326, + "logps/chosen": -243.2696533203125, + "logps/rejected": -358.2451171875, + "loss": 0.594, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.61677885055542, + "rewards/margins": 3.6245479583740234, + "rewards/rejected": -7.241326332092285, + "step": 2543 + }, + { + "epoch": 0.53, + "learning_rate": 9.394957983193278e-06, + "logits/chosen": -2.6127560138702393, + "logits/rejected": -2.3447189331054688, + "logps/chosen": -400.9631652832031, + "logps/rejected": -371.0506896972656, + "loss": 0.1134, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8211510181427002, + "rewards/margins": 4.983794689178467, + "rewards/rejected": -6.804945945739746, + "step": 2544 + }, + { + "epoch": 0.53, + "learning_rate": 9.390756302521008e-06, + "logits/chosen": -2.37842059135437, + "logits/rejected": -2.18770432472229, + "logps/chosen": -374.48046875, + "logps/rejected": -347.82623291015625, + "loss": 0.2057, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1693285703659058, + "rewards/margins": 5.296117782592773, + "rewards/rejected": -6.465446472167969, + "step": 2545 + }, + { + "epoch": 0.53, + "learning_rate": 9.38655462184874e-06, + "logits/chosen": -2.119645833969116, + "logits/rejected": -2.2463600635528564, + "logps/chosen": -312.02764892578125, + "logps/rejected": -404.00872802734375, + "loss": 0.1784, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.5017592906951904, + "rewards/margins": 5.538944244384766, + "rewards/rejected": -8.040704727172852, + "step": 2546 + }, + { + "epoch": 0.53, + "learning_rate": 9.382352941176472e-06, + "logits/chosen": -2.4114084243774414, + "logits/rejected": -1.9574097394943237, + "logps/chosen": -474.704345703125, + "logps/rejected": -385.8339538574219, + "loss": 0.2403, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.181457996368408, + "rewards/margins": 4.81538200378418, + "rewards/rejected": -6.996840476989746, + "step": 2547 + }, + { + "epoch": 0.53, + "learning_rate": 9.378151260504202e-06, + "logits/chosen": -2.1346471309661865, + "logits/rejected": -1.9008607864379883, + "logps/chosen": -258.4367370605469, + "logps/rejected": -241.75396728515625, + "loss": 0.6987, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.449197769165039, + "rewards/margins": 2.8112971782684326, + "rewards/rejected": -6.260494709014893, + "step": 2548 + }, + { + "epoch": 0.53, + "learning_rate": 9.373949579831934e-06, + "logits/chosen": -2.092656373977661, + "logits/rejected": -1.788853406906128, + "logps/chosen": -433.49664306640625, + "logps/rejected": -292.1119384765625, + "loss": 0.2048, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.957458257675171, + "rewards/margins": 4.3302531242370605, + "rewards/rejected": -6.287711143493652, + "step": 2549 + }, + { + "epoch": 0.53, + "learning_rate": 9.369747899159664e-06, + "logits/chosen": -2.3631653785705566, + "logits/rejected": -1.8547563552856445, + "logps/chosen": -397.82257080078125, + "logps/rejected": -366.6249694824219, + "loss": 0.2385, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.6198830604553223, + "rewards/margins": 3.241133689880371, + "rewards/rejected": -6.861016273498535, + "step": 2550 + }, + { + "epoch": 0.53, + "learning_rate": 9.365546218487396e-06, + "logits/chosen": -2.1993772983551025, + "logits/rejected": -1.9715027809143066, + "logps/chosen": -357.59527587890625, + "logps/rejected": -387.57830810546875, + "loss": 0.7124, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.9526937007904053, + "rewards/margins": 3.0046770572662354, + "rewards/rejected": -5.957370281219482, + "step": 2551 + }, + { + "epoch": 0.53, + "learning_rate": 9.361344537815127e-06, + "logits/chosen": -2.4792256355285645, + "logits/rejected": -2.3154115676879883, + "logps/chosen": -255.52005004882812, + "logps/rejected": -288.7085266113281, + "loss": 0.4241, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.933387279510498, + "rewards/margins": 3.574796438217163, + "rewards/rejected": -6.508183479309082, + "step": 2552 + }, + { + "epoch": 0.53, + "learning_rate": 9.357142857142859e-06, + "logits/chosen": -2.463165760040283, + "logits/rejected": -1.7777312994003296, + "logps/chosen": -375.28375244140625, + "logps/rejected": -367.20050048828125, + "loss": 0.2777, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.9922327995300293, + "rewards/margins": 4.594165802001953, + "rewards/rejected": -7.586398124694824, + "step": 2553 + }, + { + "epoch": 0.53, + "learning_rate": 9.352941176470589e-06, + "logits/chosen": -2.365229368209839, + "logits/rejected": -2.02795147895813, + "logps/chosen": -575.577880859375, + "logps/rejected": -464.572998046875, + "loss": 0.3341, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.211085796356201, + "rewards/margins": 3.429306745529175, + "rewards/rejected": -5.640392780303955, + "step": 2554 + }, + { + "epoch": 0.53, + "learning_rate": 9.34873949579832e-06, + "logits/chosen": -2.081172466278076, + "logits/rejected": -1.6347401142120361, + "logps/chosen": -390.55810546875, + "logps/rejected": -368.6292724609375, + "loss": 0.1993, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.218534231185913, + "rewards/margins": 4.430109024047852, + "rewards/rejected": -7.6486430168151855, + "step": 2555 + }, + { + "epoch": 0.53, + "learning_rate": 9.344537815126051e-06, + "logits/chosen": -2.2170302867889404, + "logits/rejected": -1.7161248922348022, + "logps/chosen": -303.3514404296875, + "logps/rejected": -304.1554260253906, + "loss": 0.3066, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.3877222537994385, + "rewards/margins": 4.722976207733154, + "rewards/rejected": -7.1106977462768555, + "step": 2556 + }, + { + "epoch": 0.53, + "learning_rate": 9.340336134453783e-06, + "logits/chosen": -2.178788661956787, + "logits/rejected": -1.8989672660827637, + "logps/chosen": -298.6821594238281, + "logps/rejected": -332.08087158203125, + "loss": 0.3726, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.8281090259552, + "rewards/margins": 3.697291374206543, + "rewards/rejected": -7.525400161743164, + "step": 2557 + }, + { + "epoch": 0.54, + "learning_rate": 9.336134453781513e-06, + "logits/chosen": -2.3828001022338867, + "logits/rejected": -1.632203459739685, + "logps/chosen": -344.9515686035156, + "logps/rejected": -344.1308898925781, + "loss": 0.387, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9686893224716187, + "rewards/margins": 4.584531784057617, + "rewards/rejected": -6.553220748901367, + "step": 2558 + }, + { + "epoch": 0.54, + "learning_rate": 9.331932773109245e-06, + "logits/chosen": -2.446682929992676, + "logits/rejected": -2.491478443145752, + "logps/chosen": -300.871826171875, + "logps/rejected": -419.3265380859375, + "loss": 0.0867, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8470587730407715, + "rewards/margins": 5.517639636993408, + "rewards/rejected": -8.36469841003418, + "step": 2559 + }, + { + "epoch": 0.54, + "learning_rate": 9.327731092436975e-06, + "logits/chosen": -2.0064337253570557, + "logits/rejected": -2.1416516304016113, + "logps/chosen": -247.630615234375, + "logps/rejected": -385.41314697265625, + "loss": 0.2106, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.85239839553833, + "rewards/margins": 4.437196731567383, + "rewards/rejected": -7.289595603942871, + "step": 2560 + }, + { + "epoch": 0.54, + "learning_rate": 9.323529411764707e-06, + "logits/chosen": -2.3393592834472656, + "logits/rejected": -1.9963867664337158, + "logps/chosen": -242.6816864013672, + "logps/rejected": -270.532470703125, + "loss": 0.1097, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.4430789947509766, + "rewards/margins": 4.7362236976623535, + "rewards/rejected": -7.17930269241333, + "step": 2561 + }, + { + "epoch": 0.54, + "learning_rate": 9.319327731092437e-06, + "logits/chosen": -2.2107598781585693, + "logits/rejected": -1.7756901979446411, + "logps/chosen": -347.0983581542969, + "logps/rejected": -295.2074890136719, + "loss": 0.1596, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.4553799629211426, + "rewards/margins": 4.009431838989258, + "rewards/rejected": -6.464812278747559, + "step": 2562 + }, + { + "epoch": 0.54, + "learning_rate": 9.31512605042017e-06, + "logits/chosen": -2.297487735748291, + "logits/rejected": -1.8098678588867188, + "logps/chosen": -341.30523681640625, + "logps/rejected": -402.459716796875, + "loss": 0.3214, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.6589231491088867, + "rewards/margins": 5.263858795166016, + "rewards/rejected": -7.922781944274902, + "step": 2563 + }, + { + "epoch": 0.54, + "learning_rate": 9.3109243697479e-06, + "logits/chosen": -2.148975372314453, + "logits/rejected": -2.149716854095459, + "logps/chosen": -339.5742492675781, + "logps/rejected": -288.9598693847656, + "loss": 0.3535, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.34553861618042, + "rewards/margins": 4.230324745178223, + "rewards/rejected": -6.575863838195801, + "step": 2564 + }, + { + "epoch": 0.54, + "learning_rate": 9.306722689075631e-06, + "logits/chosen": -2.296535015106201, + "logits/rejected": -1.8774789571762085, + "logps/chosen": -302.58282470703125, + "logps/rejected": -257.0469055175781, + "loss": 0.2817, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.46828556060791, + "rewards/margins": 3.013158082962036, + "rewards/rejected": -5.481443405151367, + "step": 2565 + }, + { + "epoch": 0.54, + "learning_rate": 9.302521008403362e-06, + "logits/chosen": -2.0437076091766357, + "logits/rejected": -1.8062479496002197, + "logps/chosen": -354.3843994140625, + "logps/rejected": -416.3578186035156, + "loss": 0.3051, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.923539876937866, + "rewards/margins": 5.1094160079956055, + "rewards/rejected": -8.032955169677734, + "step": 2566 + }, + { + "epoch": 0.54, + "learning_rate": 9.298319327731094e-06, + "logits/chosen": -2.1177310943603516, + "logits/rejected": -1.8696045875549316, + "logps/chosen": -283.68768310546875, + "logps/rejected": -254.80938720703125, + "loss": 0.1382, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.4072346687316895, + "rewards/margins": 4.758561611175537, + "rewards/rejected": -7.165797233581543, + "step": 2567 + }, + { + "epoch": 0.54, + "learning_rate": 9.294117647058824e-06, + "logits/chosen": -2.1093902587890625, + "logits/rejected": -1.8260833024978638, + "logps/chosen": -341.4385681152344, + "logps/rejected": -343.77423095703125, + "loss": 0.6989, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.6635022163391113, + "rewards/margins": 2.3317060470581055, + "rewards/rejected": -5.995208263397217, + "step": 2568 + }, + { + "epoch": 0.54, + "learning_rate": 9.289915966386556e-06, + "logits/chosen": -2.2569215297698975, + "logits/rejected": -2.148149013519287, + "logps/chosen": -300.59686279296875, + "logps/rejected": -294.5800476074219, + "loss": 0.2444, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.311414957046509, + "rewards/margins": 3.8160738945007324, + "rewards/rejected": -7.12748908996582, + "step": 2569 + }, + { + "epoch": 0.54, + "learning_rate": 9.285714285714288e-06, + "logits/chosen": -2.2656607627868652, + "logits/rejected": -1.7776451110839844, + "logps/chosen": -371.0494384765625, + "logps/rejected": -386.62384033203125, + "loss": 0.119, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.2209994792938232, + "rewards/margins": 5.91341495513916, + "rewards/rejected": -8.134414672851562, + "step": 2570 + }, + { + "epoch": 0.54, + "learning_rate": 9.281512605042018e-06, + "logits/chosen": -2.3822526931762695, + "logits/rejected": -2.024885654449463, + "logps/chosen": -341.25274658203125, + "logps/rejected": -340.01080322265625, + "loss": 0.2822, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.7695624828338623, + "rewards/margins": 4.532607078552246, + "rewards/rejected": -7.302169322967529, + "step": 2571 + }, + { + "epoch": 0.54, + "learning_rate": 9.27731092436975e-06, + "logits/chosen": -2.1967086791992188, + "logits/rejected": -1.936452865600586, + "logps/chosen": -220.71705627441406, + "logps/rejected": -289.6693115234375, + "loss": 0.2589, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.8824918270111084, + "rewards/margins": 4.057369232177734, + "rewards/rejected": -6.939861297607422, + "step": 2572 + }, + { + "epoch": 0.54, + "learning_rate": 9.27310924369748e-06, + "logits/chosen": -2.1816163063049316, + "logits/rejected": -2.0887491703033447, + "logps/chosen": -378.16412353515625, + "logps/rejected": -364.8260498046875, + "loss": 0.4068, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.208500385284424, + "rewards/margins": 3.5802958011627197, + "rewards/rejected": -5.7887959480285645, + "step": 2573 + }, + { + "epoch": 0.54, + "learning_rate": 9.268907563025212e-06, + "logits/chosen": -2.514922618865967, + "logits/rejected": -2.0112464427948, + "logps/chosen": -447.1953125, + "logps/rejected": -442.54864501953125, + "loss": 0.1463, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.279115676879883, + "rewards/margins": 4.6169867515563965, + "rewards/rejected": -6.8961029052734375, + "step": 2574 + }, + { + "epoch": 0.54, + "learning_rate": 9.264705882352942e-06, + "logits/chosen": -2.353874683380127, + "logits/rejected": -2.1691641807556152, + "logps/chosen": -294.71722412109375, + "logps/rejected": -317.8603515625, + "loss": 0.1574, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.7350270748138428, + "rewards/margins": 3.9950876235961914, + "rewards/rejected": -6.730114459991455, + "step": 2575 + }, + { + "epoch": 0.54, + "learning_rate": 9.260504201680674e-06, + "logits/chosen": -2.1018621921539307, + "logits/rejected": -1.7307868003845215, + "logps/chosen": -295.99041748046875, + "logps/rejected": -355.86468505859375, + "loss": 0.2477, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.4273812770843506, + "rewards/margins": 4.7862548828125, + "rewards/rejected": -7.21363639831543, + "step": 2576 + }, + { + "epoch": 0.54, + "learning_rate": 9.256302521008404e-06, + "logits/chosen": -1.9776158332824707, + "logits/rejected": -2.1583383083343506, + "logps/chosen": -424.2509765625, + "logps/rejected": -420.47650146484375, + "loss": 0.1513, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.9167046546936035, + "rewards/margins": 5.897480010986328, + "rewards/rejected": -8.81418514251709, + "step": 2577 + }, + { + "epoch": 0.54, + "learning_rate": 9.252100840336136e-06, + "logits/chosen": -2.0331006050109863, + "logits/rejected": -1.9048519134521484, + "logps/chosen": -388.6837158203125, + "logps/rejected": -312.158203125, + "loss": 0.5198, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.9923412799835205, + "rewards/margins": 2.4050796031951904, + "rewards/rejected": -5.397420883178711, + "step": 2578 + }, + { + "epoch": 0.54, + "learning_rate": 9.247899159663866e-06, + "logits/chosen": -2.337507724761963, + "logits/rejected": -2.2936506271362305, + "logps/chosen": -328.03741455078125, + "logps/rejected": -372.49237060546875, + "loss": 0.1646, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.964887022972107, + "rewards/margins": 5.622317314147949, + "rewards/rejected": -7.5872039794921875, + "step": 2579 + }, + { + "epoch": 0.54, + "learning_rate": 9.243697478991598e-06, + "logits/chosen": -1.9623291492462158, + "logits/rejected": -2.17836332321167, + "logps/chosen": -340.5430908203125, + "logps/rejected": -392.6394958496094, + "loss": 0.1688, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.9706039428710938, + "rewards/margins": 3.4422855377197266, + "rewards/rejected": -6.412889003753662, + "step": 2580 + }, + { + "epoch": 0.54, + "learning_rate": 9.239495798319328e-06, + "logits/chosen": -2.2881360054016113, + "logits/rejected": -1.9698160886764526, + "logps/chosen": -564.8394165039062, + "logps/rejected": -408.98504638671875, + "loss": 0.4924, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.5167877674102783, + "rewards/margins": 3.31068754196167, + "rewards/rejected": -5.827474594116211, + "step": 2581 + }, + { + "epoch": 0.54, + "learning_rate": 9.23529411764706e-06, + "logits/chosen": -2.0503244400024414, + "logits/rejected": -2.069507122039795, + "logps/chosen": -292.56622314453125, + "logps/rejected": -419.79974365234375, + "loss": 0.2276, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.765777826309204, + "rewards/margins": 4.641334533691406, + "rewards/rejected": -7.4071125984191895, + "step": 2582 + }, + { + "epoch": 0.54, + "learning_rate": 9.23109243697479e-06, + "logits/chosen": -2.0481579303741455, + "logits/rejected": -2.0596923828125, + "logps/chosen": -262.9170837402344, + "logps/rejected": -304.87054443359375, + "loss": 0.2464, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.7521820068359375, + "rewards/margins": 3.893430709838867, + "rewards/rejected": -6.645613193511963, + "step": 2583 + }, + { + "epoch": 0.54, + "learning_rate": 9.226890756302523e-06, + "logits/chosen": -2.185044288635254, + "logits/rejected": -1.5094817876815796, + "logps/chosen": -346.2208557128906, + "logps/rejected": -355.1290283203125, + "loss": 0.1542, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.6816561222076416, + "rewards/margins": 4.00274658203125, + "rewards/rejected": -6.6844024658203125, + "step": 2584 + }, + { + "epoch": 0.54, + "learning_rate": 9.222689075630253e-06, + "logits/chosen": -2.153420925140381, + "logits/rejected": -1.7344470024108887, + "logps/chosen": -374.46807861328125, + "logps/rejected": -261.6391296386719, + "loss": 0.5289, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.0606772899627686, + "rewards/margins": 2.821770429611206, + "rewards/rejected": -4.882447719573975, + "step": 2585 + }, + { + "epoch": 0.54, + "learning_rate": 9.218487394957983e-06, + "logits/chosen": -2.0616300106048584, + "logits/rejected": -1.7246735095977783, + "logps/chosen": -324.0895080566406, + "logps/rejected": -334.64532470703125, + "loss": 0.143, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.841618537902832, + "rewards/margins": 5.997417449951172, + "rewards/rejected": -8.839035987854004, + "step": 2586 + }, + { + "epoch": 0.54, + "learning_rate": 9.214285714285715e-06, + "logits/chosen": -2.087186813354492, + "logits/rejected": -1.5766535997390747, + "logps/chosen": -353.9708557128906, + "logps/rejected": -271.87548828125, + "loss": 0.4163, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.5630550384521484, + "rewards/margins": 4.894275188446045, + "rewards/rejected": -7.457330226898193, + "step": 2587 + }, + { + "epoch": 0.54, + "learning_rate": 9.210084033613445e-06, + "logits/chosen": -2.0295844078063965, + "logits/rejected": -2.0846285820007324, + "logps/chosen": -248.7470245361328, + "logps/rejected": -300.9535217285156, + "loss": 0.2358, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.9439549446105957, + "rewards/margins": 4.458446502685547, + "rewards/rejected": -7.402401924133301, + "step": 2588 + }, + { + "epoch": 0.54, + "learning_rate": 9.205882352941177e-06, + "logits/chosen": -2.112820625305176, + "logits/rejected": -2.02443790435791, + "logps/chosen": -345.8756408691406, + "logps/rejected": -354.57257080078125, + "loss": 0.1852, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.617520809173584, + "rewards/margins": 4.981188774108887, + "rewards/rejected": -7.598709583282471, + "step": 2589 + }, + { + "epoch": 0.54, + "learning_rate": 9.201680672268907e-06, + "logits/chosen": -2.3481132984161377, + "logits/rejected": -2.384918451309204, + "logps/chosen": -258.334228515625, + "logps/rejected": -276.2754211425781, + "loss": 0.0383, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7667877674102783, + "rewards/margins": 6.250648498535156, + "rewards/rejected": -8.017436027526855, + "step": 2590 + }, + { + "epoch": 0.54, + "learning_rate": 9.19747899159664e-06, + "logits/chosen": -2.3376073837280273, + "logits/rejected": -1.988825798034668, + "logps/chosen": -393.5895080566406, + "logps/rejected": -325.53167724609375, + "loss": 0.064, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9253807067871094, + "rewards/margins": 5.674491882324219, + "rewards/rejected": -8.599872589111328, + "step": 2591 + }, + { + "epoch": 0.54, + "learning_rate": 9.19327731092437e-06, + "logits/chosen": -2.304267406463623, + "logits/rejected": -1.6741276979446411, + "logps/chosen": -367.6330871582031, + "logps/rejected": -323.0014953613281, + "loss": 0.2516, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7452031373977661, + "rewards/margins": 5.045428276062012, + "rewards/rejected": -6.790631294250488, + "step": 2592 + }, + { + "epoch": 0.54, + "learning_rate": 9.189075630252101e-06, + "logits/chosen": -2.062570571899414, + "logits/rejected": -2.148332118988037, + "logps/chosen": -272.76373291015625, + "logps/rejected": -308.37774658203125, + "loss": 0.3286, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.939138889312744, + "rewards/margins": 3.520430088043213, + "rewards/rejected": -6.459568977355957, + "step": 2593 + }, + { + "epoch": 0.54, + "learning_rate": 9.184873949579832e-06, + "logits/chosen": -2.1087734699249268, + "logits/rejected": -2.1768598556518555, + "logps/chosen": -341.25469970703125, + "logps/rejected": -380.8143310546875, + "loss": 0.4837, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.4032437801361084, + "rewards/margins": 3.6222915649414062, + "rewards/rejected": -6.025535583496094, + "step": 2594 + }, + { + "epoch": 0.54, + "learning_rate": 9.180672268907563e-06, + "logits/chosen": -1.9333209991455078, + "logits/rejected": -2.0346693992614746, + "logps/chosen": -347.0496826171875, + "logps/rejected": -307.85882568359375, + "loss": 0.2877, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.5659470558166504, + "rewards/margins": 3.8817567825317383, + "rewards/rejected": -6.447703838348389, + "step": 2595 + }, + { + "epoch": 0.54, + "learning_rate": 9.176470588235294e-06, + "logits/chosen": -2.4154295921325684, + "logits/rejected": -1.8625118732452393, + "logps/chosen": -350.6398620605469, + "logps/rejected": -308.16326904296875, + "loss": 0.1595, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.232438087463379, + "rewards/margins": 4.106030464172363, + "rewards/rejected": -6.338468551635742, + "step": 2596 + }, + { + "epoch": 0.54, + "learning_rate": 9.172268907563026e-06, + "logits/chosen": -2.2323338985443115, + "logits/rejected": -1.9212353229522705, + "logps/chosen": -318.3753662109375, + "logps/rejected": -365.13555908203125, + "loss": 0.2857, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.0739917755126953, + "rewards/margins": 2.8681516647338867, + "rewards/rejected": -5.942143440246582, + "step": 2597 + }, + { + "epoch": 0.54, + "learning_rate": 9.168067226890757e-06, + "logits/chosen": -1.9061769247055054, + "logits/rejected": -1.8680685758590698, + "logps/chosen": -361.2035827636719, + "logps/rejected": -349.549072265625, + "loss": 0.1701, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1295711994171143, + "rewards/margins": 4.801074981689453, + "rewards/rejected": -6.930645942687988, + "step": 2598 + }, + { + "epoch": 0.54, + "learning_rate": 9.163865546218488e-06, + "logits/chosen": -2.3487040996551514, + "logits/rejected": -1.9796628952026367, + "logps/chosen": -289.9422912597656, + "logps/rejected": -346.2483825683594, + "loss": 0.6777, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.595804214477539, + "rewards/margins": 2.2968692779541016, + "rewards/rejected": -5.892673492431641, + "step": 2599 + }, + { + "epoch": 0.54, + "learning_rate": 9.15966386554622e-06, + "logits/chosen": -2.0097591876983643, + "logits/rejected": -1.485403299331665, + "logps/chosen": -274.05865478515625, + "logps/rejected": -259.80865478515625, + "loss": 0.0951, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.5306198596954346, + "rewards/margins": 4.995075225830078, + "rewards/rejected": -8.525693893432617, + "step": 2600 + }, + { + "epoch": 0.54, + "learning_rate": 9.15546218487395e-06, + "logits/chosen": -2.0895721912384033, + "logits/rejected": -2.200493335723877, + "logps/chosen": -331.234375, + "logps/rejected": -339.5184326171875, + "loss": 0.1062, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4182701110839844, + "rewards/margins": 3.144052028656006, + "rewards/rejected": -5.562321662902832, + "step": 2601 + }, + { + "epoch": 0.54, + "learning_rate": 9.151260504201682e-06, + "logits/chosen": -1.5985027551651, + "logits/rejected": -1.690771222114563, + "logps/chosen": -299.86248779296875, + "logps/rejected": -331.09771728515625, + "loss": 0.2417, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.1488723754882812, + "rewards/margins": 5.041675090789795, + "rewards/rejected": -8.190546989440918, + "step": 2602 + }, + { + "epoch": 0.54, + "learning_rate": 9.147058823529412e-06, + "logits/chosen": -2.359854221343994, + "logits/rejected": -2.2595980167388916, + "logps/chosen": -401.08905029296875, + "logps/rejected": -358.1273193359375, + "loss": 0.3294, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.7206554412841797, + "rewards/margins": 3.2542667388916016, + "rewards/rejected": -5.974922180175781, + "step": 2603 + }, + { + "epoch": 0.54, + "learning_rate": 9.142857142857144e-06, + "logits/chosen": -2.379088878631592, + "logits/rejected": -1.968593955039978, + "logps/chosen": -386.78558349609375, + "logps/rejected": -320.34033203125, + "loss": 0.0766, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.5262081623077393, + "rewards/margins": 6.539731979370117, + "rewards/rejected": -9.065940856933594, + "step": 2604 + }, + { + "epoch": 0.54, + "learning_rate": 9.138655462184874e-06, + "logits/chosen": -1.9846725463867188, + "logits/rejected": -2.355806350708008, + "logps/chosen": -353.7377014160156, + "logps/rejected": -346.3930969238281, + "loss": 0.2074, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.883202314376831, + "rewards/margins": 5.493682384490967, + "rewards/rejected": -7.376884460449219, + "step": 2605 + }, + { + "epoch": 0.55, + "learning_rate": 9.134453781512606e-06, + "logits/chosen": -2.172497510910034, + "logits/rejected": -2.4107048511505127, + "logps/chosen": -319.21929931640625, + "logps/rejected": -395.21673583984375, + "loss": 0.4582, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.57279109954834, + "rewards/margins": 3.936209201812744, + "rewards/rejected": -6.509000778198242, + "step": 2606 + }, + { + "epoch": 0.55, + "learning_rate": 9.130252100840336e-06, + "logits/chosen": -2.122621536254883, + "logits/rejected": -2.1791207790374756, + "logps/chosen": -293.9839782714844, + "logps/rejected": -413.198974609375, + "loss": 0.1166, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.3903279304504395, + "rewards/margins": 5.709153652191162, + "rewards/rejected": -8.099481582641602, + "step": 2607 + }, + { + "epoch": 0.55, + "learning_rate": 9.126050420168068e-06, + "logits/chosen": -2.069037437438965, + "logits/rejected": -2.413374423980713, + "logps/chosen": -286.9517822265625, + "logps/rejected": -393.9789123535156, + "loss": 0.1748, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.3851985931396484, + "rewards/margins": 4.1733717918396, + "rewards/rejected": -6.558570384979248, + "step": 2608 + }, + { + "epoch": 0.55, + "learning_rate": 9.121848739495798e-06, + "logits/chosen": -2.1006903648376465, + "logits/rejected": -1.9250049591064453, + "logps/chosen": -314.93084716796875, + "logps/rejected": -347.17388916015625, + "loss": 0.0984, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.137950897216797, + "rewards/margins": 5.0205488204956055, + "rewards/rejected": -7.158499717712402, + "step": 2609 + }, + { + "epoch": 0.55, + "learning_rate": 9.11764705882353e-06, + "logits/chosen": -1.9699406623840332, + "logits/rejected": -1.966078758239746, + "logps/chosen": -252.5470428466797, + "logps/rejected": -331.76568603515625, + "loss": 0.25, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.402479648590088, + "rewards/margins": 3.463578224182129, + "rewards/rejected": -6.866057395935059, + "step": 2610 + }, + { + "epoch": 0.55, + "learning_rate": 9.11344537815126e-06, + "logits/chosen": -2.2181761264801025, + "logits/rejected": -1.8519394397735596, + "logps/chosen": -367.24224853515625, + "logps/rejected": -299.8636779785156, + "loss": 0.2962, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.5875656604766846, + "rewards/margins": 4.67170524597168, + "rewards/rejected": -7.259271621704102, + "step": 2611 + }, + { + "epoch": 0.55, + "learning_rate": 9.109243697478992e-06, + "logits/chosen": -2.304582118988037, + "logits/rejected": -1.9881919622421265, + "logps/chosen": -297.2028503417969, + "logps/rejected": -350.5053405761719, + "loss": 0.2676, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.0449090003967285, + "rewards/margins": 3.859830856323242, + "rewards/rejected": -6.904739856719971, + "step": 2612 + }, + { + "epoch": 0.55, + "learning_rate": 9.105042016806723e-06, + "logits/chosen": -2.447077989578247, + "logits/rejected": -1.5826685428619385, + "logps/chosen": -364.481689453125, + "logps/rejected": -272.84552001953125, + "loss": 0.2932, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.888453722000122, + "rewards/margins": 4.477534294128418, + "rewards/rejected": -7.365988254547119, + "step": 2613 + }, + { + "epoch": 0.55, + "learning_rate": 9.100840336134455e-06, + "logits/chosen": -2.1794469356536865, + "logits/rejected": -1.8452938795089722, + "logps/chosen": -383.5345153808594, + "logps/rejected": -301.6110534667969, + "loss": 0.2406, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.8869540691375732, + "rewards/margins": 3.0920464992523193, + "rewards/rejected": -5.979000568389893, + "step": 2614 + }, + { + "epoch": 0.55, + "learning_rate": 9.096638655462185e-06, + "logits/chosen": -2.2501351833343506, + "logits/rejected": -1.8008877038955688, + "logps/chosen": -286.72064208984375, + "logps/rejected": -423.5406799316406, + "loss": 0.2278, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.5307388305664062, + "rewards/margins": 4.129040241241455, + "rewards/rejected": -6.659779071807861, + "step": 2615 + }, + { + "epoch": 0.55, + "learning_rate": 9.092436974789917e-06, + "logits/chosen": -2.246356725692749, + "logits/rejected": -2.2131903171539307, + "logps/chosen": -428.2558898925781, + "logps/rejected": -399.03460693359375, + "loss": 0.1047, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9555671215057373, + "rewards/margins": 6.324339866638184, + "rewards/rejected": -8.2799072265625, + "step": 2616 + }, + { + "epoch": 0.55, + "learning_rate": 9.088235294117647e-06, + "logits/chosen": -2.4375953674316406, + "logits/rejected": -2.1075685024261475, + "logps/chosen": -414.30987548828125, + "logps/rejected": -336.76202392578125, + "loss": 0.4661, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.8523030281066895, + "rewards/margins": 3.1354942321777344, + "rewards/rejected": -5.987796783447266, + "step": 2617 + }, + { + "epoch": 0.55, + "learning_rate": 9.084033613445379e-06, + "logits/chosen": -2.1061532497406006, + "logits/rejected": -1.478698492050171, + "logps/chosen": -422.8421630859375, + "logps/rejected": -333.0334167480469, + "loss": 0.0569, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.028029680252075, + "rewards/margins": 6.226371765136719, + "rewards/rejected": -9.254401206970215, + "step": 2618 + }, + { + "epoch": 0.55, + "learning_rate": 9.07983193277311e-06, + "logits/chosen": -2.2607040405273438, + "logits/rejected": -2.214386463165283, + "logps/chosen": -234.00039672851562, + "logps/rejected": -282.59423828125, + "loss": 0.1547, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.2320210933685303, + "rewards/margins": 4.761127948760986, + "rewards/rejected": -7.993149280548096, + "step": 2619 + }, + { + "epoch": 0.55, + "learning_rate": 9.075630252100841e-06, + "logits/chosen": -2.273449420928955, + "logits/rejected": -2.0686428546905518, + "logps/chosen": -211.99765014648438, + "logps/rejected": -293.23638916015625, + "loss": 0.49, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.5950934886932373, + "rewards/margins": 4.220656871795654, + "rewards/rejected": -7.8157501220703125, + "step": 2620 + }, + { + "epoch": 0.55, + "learning_rate": 9.071428571428573e-06, + "logits/chosen": -2.3413877487182617, + "logits/rejected": -1.8345166444778442, + "logps/chosen": -382.54913330078125, + "logps/rejected": -316.766845703125, + "loss": 0.2132, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.405459403991699, + "rewards/margins": 4.8582048416137695, + "rewards/rejected": -7.263664245605469, + "step": 2621 + }, + { + "epoch": 0.55, + "learning_rate": 9.067226890756303e-06, + "logits/chosen": -2.36049747467041, + "logits/rejected": -1.904428482055664, + "logps/chosen": -378.9596862792969, + "logps/rejected": -385.3060302734375, + "loss": 0.2854, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.327695846557617, + "rewards/margins": 3.909191608428955, + "rewards/rejected": -7.2368879318237305, + "step": 2622 + }, + { + "epoch": 0.55, + "learning_rate": 9.063025210084035e-06, + "logits/chosen": -2.44170880317688, + "logits/rejected": -1.9767940044403076, + "logps/chosen": -362.39764404296875, + "logps/rejected": -421.70013427734375, + "loss": 0.2259, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.2055587768554688, + "rewards/margins": 5.011026859283447, + "rewards/rejected": -8.216585159301758, + "step": 2623 + }, + { + "epoch": 0.55, + "learning_rate": 9.058823529411765e-06, + "logits/chosen": -2.313199281692505, + "logits/rejected": -1.9724633693695068, + "logps/chosen": -334.0782470703125, + "logps/rejected": -316.8968200683594, + "loss": 0.1953, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.973860502243042, + "rewards/margins": 4.250874042510986, + "rewards/rejected": -7.224734306335449, + "step": 2624 + }, + { + "epoch": 0.55, + "learning_rate": 9.054621848739497e-06, + "logits/chosen": -2.252244472503662, + "logits/rejected": -2.0120794773101807, + "logps/chosen": -352.8179016113281, + "logps/rejected": -302.3409118652344, + "loss": 0.2013, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.972867250442505, + "rewards/margins": 4.9938530921936035, + "rewards/rejected": -7.966720104217529, + "step": 2625 + }, + { + "epoch": 0.55, + "learning_rate": 9.050420168067227e-06, + "logits/chosen": -1.92987060546875, + "logits/rejected": -2.245865821838379, + "logps/chosen": -223.7242889404297, + "logps/rejected": -353.7231140136719, + "loss": 0.236, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.3670947551727295, + "rewards/margins": 5.663341999053955, + "rewards/rejected": -9.030437469482422, + "step": 2626 + }, + { + "epoch": 0.55, + "learning_rate": 9.04621848739496e-06, + "logits/chosen": -2.368917942047119, + "logits/rejected": -2.198566436767578, + "logps/chosen": -390.9451904296875, + "logps/rejected": -369.2320556640625, + "loss": 0.3857, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.053164482116699, + "rewards/margins": 4.887670516967773, + "rewards/rejected": -7.940835475921631, + "step": 2627 + }, + { + "epoch": 0.55, + "learning_rate": 9.04201680672269e-06, + "logits/chosen": -2.2183849811553955, + "logits/rejected": -1.978151559829712, + "logps/chosen": -481.5416259765625, + "logps/rejected": -407.5372009277344, + "loss": 0.3501, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.5582516193389893, + "rewards/margins": 5.30425500869751, + "rewards/rejected": -7.86250638961792, + "step": 2628 + }, + { + "epoch": 0.55, + "learning_rate": 9.037815126050421e-06, + "logits/chosen": -2.2114574909210205, + "logits/rejected": -2.5505189895629883, + "logps/chosen": -360.9266357421875, + "logps/rejected": -461.0148010253906, + "loss": 0.3772, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.879077911376953, + "rewards/margins": 5.504572868347168, + "rewards/rejected": -9.383650779724121, + "step": 2629 + }, + { + "epoch": 0.55, + "learning_rate": 9.033613445378152e-06, + "logits/chosen": -2.4712681770324707, + "logits/rejected": -2.01299786567688, + "logps/chosen": -306.111083984375, + "logps/rejected": -305.47943115234375, + "loss": 0.1083, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.175967216491699, + "rewards/margins": 4.66514253616333, + "rewards/rejected": -8.841110229492188, + "step": 2630 + }, + { + "epoch": 0.55, + "learning_rate": 9.029411764705884e-06, + "logits/chosen": -2.3037240505218506, + "logits/rejected": -2.084071636199951, + "logps/chosen": -417.254638671875, + "logps/rejected": -482.8475341796875, + "loss": 0.2566, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.2271175384521484, + "rewards/margins": 5.514753818511963, + "rewards/rejected": -8.741870880126953, + "step": 2631 + }, + { + "epoch": 0.55, + "learning_rate": 9.025210084033614e-06, + "logits/chosen": -2.3310635089874268, + "logits/rejected": -2.063809394836426, + "logps/chosen": -335.72723388671875, + "logps/rejected": -323.3340148925781, + "loss": 0.1859, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.1652774810791016, + "rewards/margins": 4.732568264007568, + "rewards/rejected": -7.897845268249512, + "step": 2632 + }, + { + "epoch": 0.55, + "learning_rate": 9.021008403361346e-06, + "logits/chosen": -1.6444947719573975, + "logits/rejected": -1.939252495765686, + "logps/chosen": -251.89300537109375, + "logps/rejected": -350.66424560546875, + "loss": 0.333, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.7179863452911377, + "rewards/margins": 5.00375509262085, + "rewards/rejected": -7.721741676330566, + "step": 2633 + }, + { + "epoch": 0.55, + "learning_rate": 9.016806722689076e-06, + "logits/chosen": -2.365762710571289, + "logits/rejected": -1.971664547920227, + "logps/chosen": -399.0832824707031, + "logps/rejected": -360.23370361328125, + "loss": 0.2766, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.8267745971679688, + "rewards/margins": 3.66393780708313, + "rewards/rejected": -6.4907121658325195, + "step": 2634 + }, + { + "epoch": 0.55, + "learning_rate": 9.012605042016808e-06, + "logits/chosen": -2.3139126300811768, + "logits/rejected": -1.9102966785430908, + "logps/chosen": -369.58184814453125, + "logps/rejected": -499.3059387207031, + "loss": 0.2065, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.1525888442993164, + "rewards/margins": 4.079642295837402, + "rewards/rejected": -7.232231140136719, + "step": 2635 + }, + { + "epoch": 0.55, + "learning_rate": 9.008403361344538e-06, + "logits/chosen": -2.1470606327056885, + "logits/rejected": -2.103541374206543, + "logps/chosen": -521.7222900390625, + "logps/rejected": -414.69244384765625, + "loss": 0.0871, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.5568864345550537, + "rewards/margins": 5.589039325714111, + "rewards/rejected": -9.145925521850586, + "step": 2636 + }, + { + "epoch": 0.55, + "learning_rate": 9.00420168067227e-06, + "logits/chosen": -2.1644845008850098, + "logits/rejected": -1.948533535003662, + "logps/chosen": -335.3471984863281, + "logps/rejected": -331.1089172363281, + "loss": 0.1564, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3042542934417725, + "rewards/margins": 3.4772984981536865, + "rewards/rejected": -5.781552314758301, + "step": 2637 + }, + { + "epoch": 0.55, + "learning_rate": 9e-06, + "logits/chosen": -2.318042278289795, + "logits/rejected": -1.7043120861053467, + "logps/chosen": -385.737060546875, + "logps/rejected": -284.3018798828125, + "loss": 0.6149, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.235351324081421, + "rewards/margins": 4.077353477478027, + "rewards/rejected": -6.312704563140869, + "step": 2638 + }, + { + "epoch": 0.55, + "learning_rate": 8.995798319327732e-06, + "logits/chosen": -2.337407350540161, + "logits/rejected": -1.94884192943573, + "logps/chosen": -359.7461853027344, + "logps/rejected": -378.401611328125, + "loss": 0.5126, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.537168502807617, + "rewards/margins": 4.906581878662109, + "rewards/rejected": -8.443750381469727, + "step": 2639 + }, + { + "epoch": 0.55, + "learning_rate": 8.991596638655462e-06, + "logits/chosen": -2.1896655559539795, + "logits/rejected": -2.149688720703125, + "logps/chosen": -335.66717529296875, + "logps/rejected": -364.55059814453125, + "loss": 0.1801, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.3991432189941406, + "rewards/margins": 5.550799369812012, + "rewards/rejected": -7.949942111968994, + "step": 2640 + }, + { + "epoch": 0.55, + "learning_rate": 8.987394957983194e-06, + "logits/chosen": -1.7884109020233154, + "logits/rejected": -1.832045316696167, + "logps/chosen": -300.0168151855469, + "logps/rejected": -399.049560546875, + "loss": 0.0973, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3079686164855957, + "rewards/margins": 5.28182315826416, + "rewards/rejected": -7.589791297912598, + "step": 2641 + }, + { + "epoch": 0.55, + "learning_rate": 8.983193277310926e-06, + "logits/chosen": -2.1261706352233887, + "logits/rejected": -1.8228116035461426, + "logps/chosen": -335.3315124511719, + "logps/rejected": -375.4422912597656, + "loss": 0.2771, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.366669178009033, + "rewards/margins": 4.015559673309326, + "rewards/rejected": -7.382228851318359, + "step": 2642 + }, + { + "epoch": 0.55, + "learning_rate": 8.978991596638656e-06, + "logits/chosen": -2.3579349517822266, + "logits/rejected": -2.2383649349212646, + "logps/chosen": -353.3088684082031, + "logps/rejected": -275.9657897949219, + "loss": 0.6033, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.352352142333984, + "rewards/margins": 3.948674440383911, + "rewards/rejected": -8.301026344299316, + "step": 2643 + }, + { + "epoch": 0.55, + "learning_rate": 8.974789915966388e-06, + "logits/chosen": -1.7488627433776855, + "logits/rejected": -2.033600091934204, + "logps/chosen": -304.53564453125, + "logps/rejected": -471.5335998535156, + "loss": 0.1309, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4654555320739746, + "rewards/margins": 5.2220869064331055, + "rewards/rejected": -8.687541961669922, + "step": 2644 + }, + { + "epoch": 0.55, + "learning_rate": 8.970588235294119e-06, + "logits/chosen": -1.986555576324463, + "logits/rejected": -1.9408445358276367, + "logps/chosen": -231.56939697265625, + "logps/rejected": -304.6173095703125, + "loss": 0.2685, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.281092643737793, + "rewards/margins": 4.993471622467041, + "rewards/rejected": -8.274564743041992, + "step": 2645 + }, + { + "epoch": 0.55, + "learning_rate": 8.96638655462185e-06, + "logits/chosen": -2.3097894191741943, + "logits/rejected": -1.9431726932525635, + "logps/chosen": -408.22967529296875, + "logps/rejected": -348.1778564453125, + "loss": 0.2839, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.806692600250244, + "rewards/margins": 4.749147891998291, + "rewards/rejected": -7.555840015411377, + "step": 2646 + }, + { + "epoch": 0.55, + "learning_rate": 8.96218487394958e-06, + "logits/chosen": -2.1173343658447266, + "logits/rejected": -2.224689483642578, + "logps/chosen": -310.6734619140625, + "logps/rejected": -407.75958251953125, + "loss": 0.0355, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.568974494934082, + "rewards/margins": 7.258604049682617, + "rewards/rejected": -10.827579498291016, + "step": 2647 + }, + { + "epoch": 0.55, + "learning_rate": 8.957983193277313e-06, + "logits/chosen": -2.2200005054473877, + "logits/rejected": -2.1109769344329834, + "logps/chosen": -321.57135009765625, + "logps/rejected": -402.0304260253906, + "loss": 0.0458, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.22861385345459, + "rewards/margins": 6.363674640655518, + "rewards/rejected": -9.592288970947266, + "step": 2648 + }, + { + "epoch": 0.55, + "learning_rate": 8.953781512605043e-06, + "logits/chosen": -2.362168788909912, + "logits/rejected": -1.8759689331054688, + "logps/chosen": -409.53448486328125, + "logps/rejected": -445.63525390625, + "loss": 0.2921, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.8342442512512207, + "rewards/margins": 4.363339424133301, + "rewards/rejected": -8.197583198547363, + "step": 2649 + }, + { + "epoch": 0.55, + "learning_rate": 8.949579831932775e-06, + "logits/chosen": -1.9577956199645996, + "logits/rejected": -2.1056935787200928, + "logps/chosen": -158.4167938232422, + "logps/rejected": -298.6907958984375, + "loss": 0.29, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.2304322719573975, + "rewards/margins": 5.726587295532227, + "rewards/rejected": -8.957019805908203, + "step": 2650 + }, + { + "epoch": 0.55, + "learning_rate": 8.945378151260505e-06, + "logits/chosen": -2.257197141647339, + "logits/rejected": -1.9540551900863647, + "logps/chosen": -348.2672424316406, + "logps/rejected": -284.5853271484375, + "loss": 0.5147, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.159351825714111, + "rewards/margins": 2.5745716094970703, + "rewards/rejected": -6.733923435211182, + "step": 2651 + }, + { + "epoch": 0.55, + "learning_rate": 8.941176470588237e-06, + "logits/chosen": -2.260843276977539, + "logits/rejected": -1.7365663051605225, + "logps/chosen": -298.7296447753906, + "logps/rejected": -313.97467041015625, + "loss": 0.1762, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.4675726890563965, + "rewards/margins": 6.697518825531006, + "rewards/rejected": -10.165090560913086, + "step": 2652 + }, + { + "epoch": 0.56, + "learning_rate": 8.936974789915967e-06, + "logits/chosen": -2.260566473007202, + "logits/rejected": -1.9898874759674072, + "logps/chosen": -311.6046142578125, + "logps/rejected": -348.43505859375, + "loss": 0.3002, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.771434783935547, + "rewards/margins": 5.387360572814941, + "rewards/rejected": -9.158795356750488, + "step": 2653 + }, + { + "epoch": 0.56, + "learning_rate": 8.932773109243699e-06, + "logits/chosen": -2.0622031688690186, + "logits/rejected": -2.251230239868164, + "logps/chosen": -517.6419677734375, + "logps/rejected": -503.8721923828125, + "loss": 0.2465, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.4424264430999756, + "rewards/margins": 4.305473804473877, + "rewards/rejected": -6.747900485992432, + "step": 2654 + }, + { + "epoch": 0.56, + "learning_rate": 8.92857142857143e-06, + "logits/chosen": -2.1060009002685547, + "logits/rejected": -1.7831798791885376, + "logps/chosen": -273.451904296875, + "logps/rejected": -281.1597900390625, + "loss": 0.1156, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.651923894882202, + "rewards/margins": 3.244041919708252, + "rewards/rejected": -5.895966053009033, + "step": 2655 + }, + { + "epoch": 0.56, + "learning_rate": 8.924369747899161e-06, + "logits/chosen": -1.8955503702163696, + "logits/rejected": -1.9887290000915527, + "logps/chosen": -321.331298828125, + "logps/rejected": -392.4033508300781, + "loss": 0.1404, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.3340537548065186, + "rewards/margins": 3.8122332096099854, + "rewards/rejected": -6.146286964416504, + "step": 2656 + }, + { + "epoch": 0.56, + "learning_rate": 8.920168067226891e-06, + "logits/chosen": -2.2199277877807617, + "logits/rejected": -2.080019474029541, + "logps/chosen": -297.56146240234375, + "logps/rejected": -305.3303527832031, + "loss": 0.5039, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.6488399505615234, + "rewards/margins": 4.500877380371094, + "rewards/rejected": -8.149718284606934, + "step": 2657 + }, + { + "epoch": 0.56, + "learning_rate": 8.915966386554623e-06, + "logits/chosen": -2.250577211380005, + "logits/rejected": -1.9426429271697998, + "logps/chosen": -370.91778564453125, + "logps/rejected": -377.5274658203125, + "loss": 0.1163, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.5085253715515137, + "rewards/margins": 5.800785064697266, + "rewards/rejected": -9.309310913085938, + "step": 2658 + }, + { + "epoch": 0.56, + "learning_rate": 8.911764705882354e-06, + "logits/chosen": -2.1390366554260254, + "logits/rejected": -2.1604878902435303, + "logps/chosen": -326.6769714355469, + "logps/rejected": -413.05755615234375, + "loss": 0.3347, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.2052788734436035, + "rewards/margins": 4.893599510192871, + "rewards/rejected": -8.098877906799316, + "step": 2659 + }, + { + "epoch": 0.56, + "learning_rate": 8.907563025210085e-06, + "logits/chosen": -2.420534610748291, + "logits/rejected": -2.1544504165649414, + "logps/chosen": -322.2715759277344, + "logps/rejected": -289.072021484375, + "loss": 0.4826, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.048967123031616, + "rewards/margins": 4.112607002258301, + "rewards/rejected": -6.161574363708496, + "step": 2660 + }, + { + "epoch": 0.56, + "learning_rate": 8.903361344537816e-06, + "logits/chosen": -1.980233907699585, + "logits/rejected": -2.0479214191436768, + "logps/chosen": -385.18798828125, + "logps/rejected": -383.0858154296875, + "loss": 0.6965, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.68750262260437, + "rewards/margins": 1.5842576026916504, + "rewards/rejected": -5.271759986877441, + "step": 2661 + }, + { + "epoch": 0.56, + "learning_rate": 8.899159663865546e-06, + "logits/chosen": -2.0453672409057617, + "logits/rejected": -1.9170441627502441, + "logps/chosen": -273.72271728515625, + "logps/rejected": -364.24658203125, + "loss": 0.1084, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.7582130432128906, + "rewards/margins": 5.073298454284668, + "rewards/rejected": -8.831510543823242, + "step": 2662 + }, + { + "epoch": 0.56, + "learning_rate": 8.894957983193278e-06, + "logits/chosen": -2.3412487506866455, + "logits/rejected": -1.7636182308197021, + "logps/chosen": -358.69549560546875, + "logps/rejected": -302.4886169433594, + "loss": 0.6744, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.442138195037842, + "rewards/margins": 2.9562692642211914, + "rewards/rejected": -6.398407936096191, + "step": 2663 + }, + { + "epoch": 0.56, + "learning_rate": 8.890756302521008e-06, + "logits/chosen": -1.9801467657089233, + "logits/rejected": -2.060098171234131, + "logps/chosen": -329.5774230957031, + "logps/rejected": -370.871826171875, + "loss": 0.6701, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.5938949584960938, + "rewards/margins": 3.338050365447998, + "rewards/rejected": -6.93194580078125, + "step": 2664 + }, + { + "epoch": 0.56, + "learning_rate": 8.88655462184874e-06, + "logits/chosen": -2.0661187171936035, + "logits/rejected": -2.1971991062164307, + "logps/chosen": -285.5542297363281, + "logps/rejected": -372.9203186035156, + "loss": 0.2297, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.010709762573242, + "rewards/margins": 4.790306568145752, + "rewards/rejected": -7.801015853881836, + "step": 2665 + }, + { + "epoch": 0.56, + "learning_rate": 8.88235294117647e-06, + "logits/chosen": -2.4135351181030273, + "logits/rejected": -2.27453351020813, + "logps/chosen": -408.83770751953125, + "logps/rejected": -433.5548095703125, + "loss": 0.1567, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2829365730285645, + "rewards/margins": 4.203428268432617, + "rewards/rejected": -6.486364841461182, + "step": 2666 + }, + { + "epoch": 0.56, + "learning_rate": 8.878151260504202e-06, + "logits/chosen": -2.2961039543151855, + "logits/rejected": -2.1111371517181396, + "logps/chosen": -308.9424133300781, + "logps/rejected": -275.480712890625, + "loss": 0.0982, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.5258851051330566, + "rewards/margins": 4.584489822387695, + "rewards/rejected": -7.11037540435791, + "step": 2667 + }, + { + "epoch": 0.56, + "learning_rate": 8.873949579831932e-06, + "logits/chosen": -2.15207576751709, + "logits/rejected": -2.0082287788391113, + "logps/chosen": -256.6492004394531, + "logps/rejected": -236.60647583007812, + "loss": 0.2286, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.5860178470611572, + "rewards/margins": 4.407907009124756, + "rewards/rejected": -6.993925094604492, + "step": 2668 + }, + { + "epoch": 0.56, + "learning_rate": 8.869747899159664e-06, + "logits/chosen": -1.8155827522277832, + "logits/rejected": -2.276298999786377, + "logps/chosen": -274.17694091796875, + "logps/rejected": -461.59674072265625, + "loss": 0.3218, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.1462178230285645, + "rewards/margins": 6.446110248565674, + "rewards/rejected": -9.592327117919922, + "step": 2669 + }, + { + "epoch": 0.56, + "learning_rate": 8.865546218487396e-06, + "logits/chosen": -2.102386474609375, + "logits/rejected": -1.9784517288208008, + "logps/chosen": -319.59912109375, + "logps/rejected": -332.0497741699219, + "loss": 0.2989, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.7539267539978027, + "rewards/margins": 4.721604347229004, + "rewards/rejected": -8.475530624389648, + "step": 2670 + }, + { + "epoch": 0.56, + "learning_rate": 8.861344537815126e-06, + "logits/chosen": -2.0956966876983643, + "logits/rejected": -2.079134464263916, + "logps/chosen": -254.76153564453125, + "logps/rejected": -328.09149169921875, + "loss": 0.179, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.662783622741699, + "rewards/margins": 3.861410617828369, + "rewards/rejected": -6.524194240570068, + "step": 2671 + }, + { + "epoch": 0.56, + "learning_rate": 8.857142857142858e-06, + "logits/chosen": -2.310729503631592, + "logits/rejected": -1.8544069528579712, + "logps/chosen": -323.09881591796875, + "logps/rejected": -307.9286804199219, + "loss": 0.1296, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8603925704956055, + "rewards/margins": 5.2406206130981445, + "rewards/rejected": -8.10101318359375, + "step": 2672 + }, + { + "epoch": 0.56, + "learning_rate": 8.852941176470588e-06, + "logits/chosen": -2.1763594150543213, + "logits/rejected": -1.8199400901794434, + "logps/chosen": -289.87786865234375, + "logps/rejected": -282.49969482421875, + "loss": 0.344, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.309572219848633, + "rewards/margins": 2.929065465927124, + "rewards/rejected": -6.238637447357178, + "step": 2673 + }, + { + "epoch": 0.56, + "learning_rate": 8.84873949579832e-06, + "logits/chosen": -1.9148685932159424, + "logits/rejected": -1.9912877082824707, + "logps/chosen": -233.59605407714844, + "logps/rejected": -317.5933532714844, + "loss": 0.1539, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.150411605834961, + "rewards/margins": 4.769400596618652, + "rewards/rejected": -7.919811725616455, + "step": 2674 + }, + { + "epoch": 0.56, + "learning_rate": 8.84453781512605e-06, + "logits/chosen": -1.9021685123443604, + "logits/rejected": -1.7856111526489258, + "logps/chosen": -354.42327880859375, + "logps/rejected": -425.9520263671875, + "loss": 0.3194, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.467655658721924, + "rewards/margins": 5.39851188659668, + "rewards/rejected": -8.866167068481445, + "step": 2675 + }, + { + "epoch": 0.56, + "learning_rate": 8.840336134453783e-06, + "logits/chosen": -2.054248809814453, + "logits/rejected": -2.304570436477661, + "logps/chosen": -395.2113342285156, + "logps/rejected": -464.2568054199219, + "loss": 0.3341, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.316387176513672, + "rewards/margins": 3.7610421180725098, + "rewards/rejected": -7.07742977142334, + "step": 2676 + }, + { + "epoch": 0.56, + "learning_rate": 8.836134453781513e-06, + "logits/chosen": -2.1528868675231934, + "logits/rejected": -1.893542766571045, + "logps/chosen": -310.95538330078125, + "logps/rejected": -369.63934326171875, + "loss": 0.4258, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.431771755218506, + "rewards/margins": 5.848884582519531, + "rewards/rejected": -9.280654907226562, + "step": 2677 + }, + { + "epoch": 0.56, + "learning_rate": 8.831932773109245e-06, + "logits/chosen": -2.4424333572387695, + "logits/rejected": -2.3958985805511475, + "logps/chosen": -207.1778564453125, + "logps/rejected": -278.8995361328125, + "loss": 0.147, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.0163660049438477, + "rewards/margins": 6.170021057128906, + "rewards/rejected": -9.18638801574707, + "step": 2678 + }, + { + "epoch": 0.56, + "learning_rate": 8.827731092436975e-06, + "logits/chosen": -2.1252641677856445, + "logits/rejected": -2.199077844619751, + "logps/chosen": -450.2701721191406, + "logps/rejected": -519.8621826171875, + "loss": 0.3131, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.8092446327209473, + "rewards/margins": 5.226586818695068, + "rewards/rejected": -8.035831451416016, + "step": 2679 + }, + { + "epoch": 0.56, + "learning_rate": 8.823529411764707e-06, + "logits/chosen": -2.3932228088378906, + "logits/rejected": -2.2165069580078125, + "logps/chosen": -275.21795654296875, + "logps/rejected": -358.21356201171875, + "loss": 0.1564, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.8486104011535645, + "rewards/margins": 6.026585578918457, + "rewards/rejected": -8.87519645690918, + "step": 2680 + }, + { + "epoch": 0.56, + "learning_rate": 8.819327731092437e-06, + "logits/chosen": -2.3068652153015137, + "logits/rejected": -1.6249210834503174, + "logps/chosen": -505.64593505859375, + "logps/rejected": -427.8704833984375, + "loss": 0.2642, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.2640533447265625, + "rewards/margins": 6.481842994689941, + "rewards/rejected": -9.745896339416504, + "step": 2681 + }, + { + "epoch": 0.56, + "learning_rate": 8.815126050420169e-06, + "logits/chosen": -2.0739762783050537, + "logits/rejected": -1.9446161985397339, + "logps/chosen": -355.6173095703125, + "logps/rejected": -353.77044677734375, + "loss": 0.5856, + "rewards/accuracies": 0.6875, + "rewards/chosen": -5.1183013916015625, + "rewards/margins": 2.5441982746124268, + "rewards/rejected": -7.662499904632568, + "step": 2682 + }, + { + "epoch": 0.56, + "learning_rate": 8.8109243697479e-06, + "logits/chosen": -2.202012300491333, + "logits/rejected": -2.1079869270324707, + "logps/chosen": -231.66783142089844, + "logps/rejected": -316.7498779296875, + "loss": 0.1875, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.097196340560913, + "rewards/margins": 4.253522872924805, + "rewards/rejected": -7.350719451904297, + "step": 2683 + }, + { + "epoch": 0.56, + "learning_rate": 8.806722689075631e-06, + "logits/chosen": -1.9491394758224487, + "logits/rejected": -1.900090217590332, + "logps/chosen": -284.14215087890625, + "logps/rejected": -302.74249267578125, + "loss": 0.2771, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.058411121368408, + "rewards/margins": 3.4801127910614014, + "rewards/rejected": -7.5385236740112305, + "step": 2684 + }, + { + "epoch": 0.56, + "learning_rate": 8.802521008403361e-06, + "logits/chosen": -2.123595952987671, + "logits/rejected": -1.8167352676391602, + "logps/chosen": -213.9258575439453, + "logps/rejected": -290.43975830078125, + "loss": 0.1643, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.259025812149048, + "rewards/margins": 5.2606940269470215, + "rewards/rejected": -8.519720077514648, + "step": 2685 + }, + { + "epoch": 0.56, + "learning_rate": 8.798319327731093e-06, + "logits/chosen": -2.1901299953460693, + "logits/rejected": -2.0241165161132812, + "logps/chosen": -453.7032775878906, + "logps/rejected": -394.6129150390625, + "loss": 0.2249, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.29465389251709, + "rewards/margins": 4.586889266967773, + "rewards/rejected": -7.8815436363220215, + "step": 2686 + }, + { + "epoch": 0.56, + "learning_rate": 8.794117647058823e-06, + "logits/chosen": -2.125352144241333, + "logits/rejected": -1.8213437795639038, + "logps/chosen": -277.3586730957031, + "logps/rejected": -249.25454711914062, + "loss": 0.4034, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.321885585784912, + "rewards/margins": 2.524087429046631, + "rewards/rejected": -6.845973014831543, + "step": 2687 + }, + { + "epoch": 0.56, + "learning_rate": 8.789915966386555e-06, + "logits/chosen": -2.2461905479431152, + "logits/rejected": -1.791140079498291, + "logps/chosen": -343.4769287109375, + "logps/rejected": -279.8666076660156, + "loss": 0.1656, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.513667583465576, + "rewards/margins": 4.670999050140381, + "rewards/rejected": -7.184666633605957, + "step": 2688 + }, + { + "epoch": 0.56, + "learning_rate": 8.785714285714286e-06, + "logits/chosen": -2.114018440246582, + "logits/rejected": -2.2642788887023926, + "logps/chosen": -352.65643310546875, + "logps/rejected": -360.87921142578125, + "loss": 0.2898, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.881795883178711, + "rewards/margins": 4.008090019226074, + "rewards/rejected": -7.889885902404785, + "step": 2689 + }, + { + "epoch": 0.56, + "learning_rate": 8.781512605042017e-06, + "logits/chosen": -2.1060616970062256, + "logits/rejected": -2.007534980773926, + "logps/chosen": -327.9744567871094, + "logps/rejected": -353.4940185546875, + "loss": 0.5127, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.328692436218262, + "rewards/margins": 4.129589557647705, + "rewards/rejected": -8.458281517028809, + "step": 2690 + }, + { + "epoch": 0.56, + "learning_rate": 8.777310924369748e-06, + "logits/chosen": -2.3290610313415527, + "logits/rejected": -1.9435094594955444, + "logps/chosen": -471.1000671386719, + "logps/rejected": -440.1175231933594, + "loss": 0.5653, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.4207262992858887, + "rewards/margins": 3.469817638397217, + "rewards/rejected": -6.8905439376831055, + "step": 2691 + }, + { + "epoch": 0.56, + "learning_rate": 8.77310924369748e-06, + "logits/chosen": -2.036985397338867, + "logits/rejected": -1.8986417055130005, + "logps/chosen": -339.95147705078125, + "logps/rejected": -376.7081604003906, + "loss": 0.4592, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.4301857948303223, + "rewards/margins": 3.176327705383301, + "rewards/rejected": -5.606513500213623, + "step": 2692 + }, + { + "epoch": 0.56, + "learning_rate": 8.768907563025212e-06, + "logits/chosen": -2.476844072341919, + "logits/rejected": -2.2569897174835205, + "logps/chosen": -510.7056884765625, + "logps/rejected": -439.92913818359375, + "loss": 0.3004, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.7965469360351562, + "rewards/margins": 3.9088680744171143, + "rewards/rejected": -6.705414772033691, + "step": 2693 + }, + { + "epoch": 0.56, + "learning_rate": 8.764705882352942e-06, + "logits/chosen": -2.5093770027160645, + "logits/rejected": -2.2896218299865723, + "logps/chosen": -411.2294616699219, + "logps/rejected": -480.22589111328125, + "loss": 0.131, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8385963439941406, + "rewards/margins": 5.305680751800537, + "rewards/rejected": -7.144277572631836, + "step": 2694 + }, + { + "epoch": 0.56, + "learning_rate": 8.760504201680674e-06, + "logits/chosen": -2.0847718715667725, + "logits/rejected": -2.381751298904419, + "logps/chosen": -343.35723876953125, + "logps/rejected": -401.82916259765625, + "loss": 0.339, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.9296274185180664, + "rewards/margins": 3.793790340423584, + "rewards/rejected": -7.723417282104492, + "step": 2695 + }, + { + "epoch": 0.56, + "learning_rate": 8.756302521008404e-06, + "logits/chosen": -2.2546093463897705, + "logits/rejected": -2.187814474105835, + "logps/chosen": -249.77774047851562, + "logps/rejected": -447.87176513671875, + "loss": 0.1137, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3721938133239746, + "rewards/margins": 6.034940719604492, + "rewards/rejected": -9.407135009765625, + "step": 2696 + }, + { + "epoch": 0.56, + "learning_rate": 8.752100840336136e-06, + "logits/chosen": -2.1390864849090576, + "logits/rejected": -1.9791996479034424, + "logps/chosen": -319.9798278808594, + "logps/rejected": -274.62310791015625, + "loss": 0.1254, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.61348032951355, + "rewards/margins": 4.879994869232178, + "rewards/rejected": -8.493474960327148, + "step": 2697 + }, + { + "epoch": 0.56, + "learning_rate": 8.747899159663866e-06, + "logits/chosen": -2.5001142024993896, + "logits/rejected": -2.1573007106781006, + "logps/chosen": -332.5780029296875, + "logps/rejected": -344.8141784667969, + "loss": 0.3649, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9403786659240723, + "rewards/margins": 3.254493236541748, + "rewards/rejected": -5.19487190246582, + "step": 2698 + }, + { + "epoch": 0.56, + "learning_rate": 8.743697478991598e-06, + "logits/chosen": -2.1604132652282715, + "logits/rejected": -2.089139461517334, + "logps/chosen": -366.3182067871094, + "logps/rejected": -417.32781982421875, + "loss": 0.1699, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.9634768962860107, + "rewards/margins": 4.155878067016602, + "rewards/rejected": -7.119354724884033, + "step": 2699 + }, + { + "epoch": 0.56, + "learning_rate": 8.739495798319328e-06, + "logits/chosen": -2.213563919067383, + "logits/rejected": -1.9549405574798584, + "logps/chosen": -258.0982666015625, + "logps/rejected": -328.23876953125, + "loss": 0.2053, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.3350718021392822, + "rewards/margins": 4.298139572143555, + "rewards/rejected": -6.633211135864258, + "step": 2700 + }, + { + "epoch": 0.57, + "learning_rate": 8.73529411764706e-06, + "logits/chosen": -2.1656322479248047, + "logits/rejected": -1.6679805517196655, + "logps/chosen": -399.82763671875, + "logps/rejected": -326.840576171875, + "loss": 0.1248, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.259933948516846, + "rewards/margins": 4.265561580657959, + "rewards/rejected": -8.525495529174805, + "step": 2701 + }, + { + "epoch": 0.57, + "learning_rate": 8.73109243697479e-06, + "logits/chosen": -2.6672751903533936, + "logits/rejected": -2.022127866744995, + "logps/chosen": -510.03265380859375, + "logps/rejected": -349.44293212890625, + "loss": 0.3671, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.8153295516967773, + "rewards/margins": 5.115483283996582, + "rewards/rejected": -7.930813312530518, + "step": 2702 + }, + { + "epoch": 0.57, + "learning_rate": 8.726890756302522e-06, + "logits/chosen": -1.8718903064727783, + "logits/rejected": -2.108630657196045, + "logps/chosen": -231.6609344482422, + "logps/rejected": -288.3909606933594, + "loss": 0.1025, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.45017409324646, + "rewards/margins": 5.107176780700684, + "rewards/rejected": -8.557351112365723, + "step": 2703 + }, + { + "epoch": 0.57, + "learning_rate": 8.722689075630252e-06, + "logits/chosen": -2.074998617172241, + "logits/rejected": -1.9197508096694946, + "logps/chosen": -310.8554992675781, + "logps/rejected": -363.63629150390625, + "loss": 0.7156, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.458477973937988, + "rewards/margins": 3.201475143432617, + "rewards/rejected": -7.6599531173706055, + "step": 2704 + }, + { + "epoch": 0.57, + "learning_rate": 8.718487394957984e-06, + "logits/chosen": -2.3683180809020996, + "logits/rejected": -1.646662712097168, + "logps/chosen": -263.2167053222656, + "logps/rejected": -266.9175720214844, + "loss": 0.2083, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.8014109134674072, + "rewards/margins": 3.9933462142944336, + "rewards/rejected": -7.794756889343262, + "step": 2705 + }, + { + "epoch": 0.57, + "learning_rate": 8.714285714285715e-06, + "logits/chosen": -2.214357852935791, + "logits/rejected": -2.115215539932251, + "logps/chosen": -327.6528015136719, + "logps/rejected": -311.1213684082031, + "loss": 0.2508, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.3208508491516113, + "rewards/margins": 4.3119659423828125, + "rewards/rejected": -6.632816314697266, + "step": 2706 + }, + { + "epoch": 0.57, + "learning_rate": 8.710084033613447e-06, + "logits/chosen": -2.1791768074035645, + "logits/rejected": -2.12751841545105, + "logps/chosen": -301.55316162109375, + "logps/rejected": -236.4023895263672, + "loss": 0.3097, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.9142112731933594, + "rewards/margins": 4.173439979553223, + "rewards/rejected": -7.08765172958374, + "step": 2707 + }, + { + "epoch": 0.57, + "learning_rate": 8.705882352941177e-06, + "logits/chosen": -2.231198310852051, + "logits/rejected": -1.9991087913513184, + "logps/chosen": -508.19085693359375, + "logps/rejected": -461.7760009765625, + "loss": 0.0307, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.078423500061035, + "rewards/margins": 6.187633991241455, + "rewards/rejected": -8.266057014465332, + "step": 2708 + }, + { + "epoch": 0.57, + "learning_rate": 8.701680672268909e-06, + "logits/chosen": -2.195098400115967, + "logits/rejected": -2.091689348220825, + "logps/chosen": -366.6156921386719, + "logps/rejected": -364.11907958984375, + "loss": 0.2665, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.5850656032562256, + "rewards/margins": 3.8540866374969482, + "rewards/rejected": -7.439152717590332, + "step": 2709 + }, + { + "epoch": 0.57, + "learning_rate": 8.697478991596639e-06, + "logits/chosen": -2.0250988006591797, + "logits/rejected": -1.936099648475647, + "logps/chosen": -395.562744140625, + "logps/rejected": -501.33209228515625, + "loss": 0.1058, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.6543171405792236, + "rewards/margins": 6.141094207763672, + "rewards/rejected": -8.795411109924316, + "step": 2710 + }, + { + "epoch": 0.57, + "learning_rate": 8.69327731092437e-06, + "logits/chosen": -2.071742534637451, + "logits/rejected": -1.9866383075714111, + "logps/chosen": -284.0357360839844, + "logps/rejected": -364.4813232421875, + "loss": 0.1637, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.164823532104492, + "rewards/margins": 5.1248273849487305, + "rewards/rejected": -9.289650917053223, + "step": 2711 + }, + { + "epoch": 0.57, + "learning_rate": 8.689075630252101e-06, + "logits/chosen": -2.241704225540161, + "logits/rejected": -1.9620360136032104, + "logps/chosen": -336.57037353515625, + "logps/rejected": -279.80908203125, + "loss": 0.2048, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.747136116027832, + "rewards/margins": 4.1562395095825195, + "rewards/rejected": -6.903375625610352, + "step": 2712 + }, + { + "epoch": 0.57, + "learning_rate": 8.684873949579833e-06, + "logits/chosen": -2.1555747985839844, + "logits/rejected": -1.872838020324707, + "logps/chosen": -371.10052490234375, + "logps/rejected": -383.5567932128906, + "loss": 0.2636, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.825657367706299, + "rewards/margins": 4.172976493835449, + "rewards/rejected": -6.998633861541748, + "step": 2713 + }, + { + "epoch": 0.57, + "learning_rate": 8.680672268907563e-06, + "logits/chosen": -2.2234668731689453, + "logits/rejected": -2.1341452598571777, + "logps/chosen": -491.4859619140625, + "logps/rejected": -395.1780700683594, + "loss": 0.2124, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.460757255554199, + "rewards/margins": 5.390682697296143, + "rewards/rejected": -7.851439952850342, + "step": 2714 + }, + { + "epoch": 0.57, + "learning_rate": 8.676470588235295e-06, + "logits/chosen": -1.9256107807159424, + "logits/rejected": -2.047905683517456, + "logps/chosen": -236.48114013671875, + "logps/rejected": -267.57220458984375, + "loss": 0.3138, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.3201799392700195, + "rewards/margins": 3.5454423427581787, + "rewards/rejected": -6.865622520446777, + "step": 2715 + }, + { + "epoch": 0.57, + "learning_rate": 8.672268907563027e-06, + "logits/chosen": -2.3635599613189697, + "logits/rejected": -2.1418421268463135, + "logps/chosen": -384.3231201171875, + "logps/rejected": -424.45611572265625, + "loss": 0.339, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.120706081390381, + "rewards/margins": 4.210129261016846, + "rewards/rejected": -7.330835342407227, + "step": 2716 + }, + { + "epoch": 0.57, + "learning_rate": 8.668067226890757e-06, + "logits/chosen": -2.3286256790161133, + "logits/rejected": -2.213660478591919, + "logps/chosen": -289.77691650390625, + "logps/rejected": -288.75628662109375, + "loss": 0.4781, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.372096061706543, + "rewards/margins": 2.8783774375915527, + "rewards/rejected": -7.250473499298096, + "step": 2717 + }, + { + "epoch": 0.57, + "learning_rate": 8.663865546218489e-06, + "logits/chosen": -2.212244749069214, + "logits/rejected": -2.0899930000305176, + "logps/chosen": -295.803955078125, + "logps/rejected": -289.9457702636719, + "loss": 0.7864, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.9074177742004395, + "rewards/margins": 2.805502414703369, + "rewards/rejected": -6.712920665740967, + "step": 2718 + }, + { + "epoch": 0.57, + "learning_rate": 8.65966386554622e-06, + "logits/chosen": -1.9873700141906738, + "logits/rejected": -1.6415438652038574, + "logps/chosen": -233.9061737060547, + "logps/rejected": -268.8019714355469, + "loss": 0.2341, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.150973081588745, + "rewards/margins": 4.107327461242676, + "rewards/rejected": -7.25830078125, + "step": 2719 + }, + { + "epoch": 0.57, + "learning_rate": 8.655462184873951e-06, + "logits/chosen": -1.7120524644851685, + "logits/rejected": -2.188736915588379, + "logps/chosen": -310.07208251953125, + "logps/rejected": -449.23699951171875, + "loss": 0.9419, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.6013145446777344, + "rewards/margins": 5.047693252563477, + "rewards/rejected": -8.649007797241211, + "step": 2720 + }, + { + "epoch": 0.57, + "learning_rate": 8.651260504201681e-06, + "logits/chosen": -2.1267619132995605, + "logits/rejected": -1.9256224632263184, + "logps/chosen": -321.3891296386719, + "logps/rejected": -300.41473388671875, + "loss": 0.2765, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.4399349689483643, + "rewards/margins": 4.2923173904418945, + "rewards/rejected": -7.73225212097168, + "step": 2721 + }, + { + "epoch": 0.57, + "learning_rate": 8.647058823529413e-06, + "logits/chosen": -2.4357833862304688, + "logits/rejected": -2.2987172603607178, + "logps/chosen": -339.218017578125, + "logps/rejected": -404.8612060546875, + "loss": 0.1113, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.725738048553467, + "rewards/margins": 7.057034015655518, + "rewards/rejected": -10.7827730178833, + "step": 2722 + }, + { + "epoch": 0.57, + "learning_rate": 8.642857142857144e-06, + "logits/chosen": -1.9419734477996826, + "logits/rejected": -1.9946268796920776, + "logps/chosen": -342.5497741699219, + "logps/rejected": -334.2364501953125, + "loss": 0.339, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.621908187866211, + "rewards/margins": 3.022494316101074, + "rewards/rejected": -7.644402503967285, + "step": 2723 + }, + { + "epoch": 0.57, + "learning_rate": 8.638655462184876e-06, + "logits/chosen": -2.0321571826934814, + "logits/rejected": -1.9260424375534058, + "logps/chosen": -256.74462890625, + "logps/rejected": -317.1990051269531, + "loss": 0.4921, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.3484292030334473, + "rewards/margins": 4.982367515563965, + "rewards/rejected": -8.33079719543457, + "step": 2724 + }, + { + "epoch": 0.57, + "learning_rate": 8.634453781512606e-06, + "logits/chosen": -2.0247995853424072, + "logits/rejected": -2.14101505279541, + "logps/chosen": -271.5347900390625, + "logps/rejected": -359.88275146484375, + "loss": 0.4362, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.442434787750244, + "rewards/margins": 3.3044795989990234, + "rewards/rejected": -7.746914386749268, + "step": 2725 + }, + { + "epoch": 0.57, + "learning_rate": 8.630252100840338e-06, + "logits/chosen": -1.8769214153289795, + "logits/rejected": -1.9412651062011719, + "logps/chosen": -312.9871520996094, + "logps/rejected": -355.88446044921875, + "loss": 0.7236, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.403740644454956, + "rewards/margins": 3.570530891418457, + "rewards/rejected": -6.974271774291992, + "step": 2726 + }, + { + "epoch": 0.57, + "learning_rate": 8.626050420168068e-06, + "logits/chosen": -2.022946357727051, + "logits/rejected": -1.8456951379776, + "logps/chosen": -437.06903076171875, + "logps/rejected": -440.7738037109375, + "loss": 0.5504, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.5313549041748047, + "rewards/margins": 3.5492324829101562, + "rewards/rejected": -7.080587863922119, + "step": 2727 + }, + { + "epoch": 0.57, + "learning_rate": 8.6218487394958e-06, + "logits/chosen": -2.2058606147766113, + "logits/rejected": -1.8517531156539917, + "logps/chosen": -394.8498840332031, + "logps/rejected": -420.2686767578125, + "loss": 0.5733, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.546185255050659, + "rewards/margins": 4.552314758300781, + "rewards/rejected": -8.09850025177002, + "step": 2728 + }, + { + "epoch": 0.57, + "learning_rate": 8.61764705882353e-06, + "logits/chosen": -2.1698315143585205, + "logits/rejected": -2.3035449981689453, + "logps/chosen": -243.48663330078125, + "logps/rejected": -301.6219482421875, + "loss": 0.1097, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.19541597366333, + "rewards/margins": 4.900232315063477, + "rewards/rejected": -9.095647811889648, + "step": 2729 + }, + { + "epoch": 0.57, + "learning_rate": 8.613445378151262e-06, + "logits/chosen": -2.1246962547302246, + "logits/rejected": -1.770324468612671, + "logps/chosen": -389.90789794921875, + "logps/rejected": -335.2018127441406, + "loss": 0.3047, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.2084178924560547, + "rewards/margins": 4.77372932434082, + "rewards/rejected": -7.982147216796875, + "step": 2730 + }, + { + "epoch": 0.57, + "learning_rate": 8.609243697478992e-06, + "logits/chosen": -2.10770845413208, + "logits/rejected": -2.1845364570617676, + "logps/chosen": -279.52001953125, + "logps/rejected": -329.3961486816406, + "loss": 0.375, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.1655659675598145, + "rewards/margins": 2.0775585174560547, + "rewards/rejected": -5.243124008178711, + "step": 2731 + }, + { + "epoch": 0.57, + "learning_rate": 8.605042016806724e-06, + "logits/chosen": -2.3080530166625977, + "logits/rejected": -1.9859060049057007, + "logps/chosen": -358.9238586425781, + "logps/rejected": -353.3697204589844, + "loss": 0.4408, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.16860294342041, + "rewards/margins": 5.312753677368164, + "rewards/rejected": -8.481356620788574, + "step": 2732 + }, + { + "epoch": 0.57, + "learning_rate": 8.600840336134454e-06, + "logits/chosen": -2.1886751651763916, + "logits/rejected": -2.3670477867126465, + "logps/chosen": -291.016357421875, + "logps/rejected": -432.73663330078125, + "loss": 0.2036, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.796029567718506, + "rewards/margins": 4.382058620452881, + "rewards/rejected": -8.178088188171387, + "step": 2733 + }, + { + "epoch": 0.57, + "learning_rate": 8.596638655462186e-06, + "logits/chosen": -1.9103906154632568, + "logits/rejected": -2.021730661392212, + "logps/chosen": -295.30938720703125, + "logps/rejected": -338.095703125, + "loss": 0.4215, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.5600361824035645, + "rewards/margins": 3.3560800552368164, + "rewards/rejected": -6.916116237640381, + "step": 2734 + }, + { + "epoch": 0.57, + "learning_rate": 8.592436974789916e-06, + "logits/chosen": -1.9595222473144531, + "logits/rejected": -1.8773701190948486, + "logps/chosen": -354.2728271484375, + "logps/rejected": -421.49627685546875, + "loss": 0.9587, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.4703714847564697, + "rewards/margins": 3.893065929412842, + "rewards/rejected": -7.363436698913574, + "step": 2735 + }, + { + "epoch": 0.57, + "learning_rate": 8.588235294117647e-06, + "logits/chosen": -1.9225609302520752, + "logits/rejected": -1.8159414529800415, + "logps/chosen": -255.03848266601562, + "logps/rejected": -283.30792236328125, + "loss": 0.3654, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.1242165565490723, + "rewards/margins": 3.513643264770508, + "rewards/rejected": -6.63785982131958, + "step": 2736 + }, + { + "epoch": 0.57, + "learning_rate": 8.584033613445379e-06, + "logits/chosen": -2.1396279335021973, + "logits/rejected": -1.8366109132766724, + "logps/chosen": -353.5950622558594, + "logps/rejected": -317.05364990234375, + "loss": 0.3519, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.370506763458252, + "rewards/margins": 2.6922452449798584, + "rewards/rejected": -5.062751770019531, + "step": 2737 + }, + { + "epoch": 0.57, + "learning_rate": 8.579831932773109e-06, + "logits/chosen": -2.2322375774383545, + "logits/rejected": -2.0567448139190674, + "logps/chosen": -382.1094970703125, + "logps/rejected": -453.89105224609375, + "loss": 0.2687, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.6861536502838135, + "rewards/margins": 4.810391902923584, + "rewards/rejected": -7.496545314788818, + "step": 2738 + }, + { + "epoch": 0.57, + "learning_rate": 8.57563025210084e-06, + "logits/chosen": -1.9626046419143677, + "logits/rejected": -1.8405790328979492, + "logps/chosen": -257.1316833496094, + "logps/rejected": -334.7016296386719, + "loss": 0.2199, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.153595447540283, + "rewards/margins": 5.155060768127441, + "rewards/rejected": -8.308655738830566, + "step": 2739 + }, + { + "epoch": 0.57, + "learning_rate": 8.571428571428571e-06, + "logits/chosen": -2.47385835647583, + "logits/rejected": -2.13322377204895, + "logps/chosen": -443.5631103515625, + "logps/rejected": -359.7532653808594, + "loss": 0.1944, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.4087367057800293, + "rewards/margins": 4.697868347167969, + "rewards/rejected": -7.106605529785156, + "step": 2740 + }, + { + "epoch": 0.57, + "learning_rate": 8.567226890756303e-06, + "logits/chosen": -1.9346528053283691, + "logits/rejected": -1.9264346361160278, + "logps/chosen": -241.3891143798828, + "logps/rejected": -295.87591552734375, + "loss": 0.2242, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.3510055541992188, + "rewards/margins": 4.411670684814453, + "rewards/rejected": -6.762675762176514, + "step": 2741 + }, + { + "epoch": 0.57, + "learning_rate": 8.563025210084033e-06, + "logits/chosen": -2.4686498641967773, + "logits/rejected": -1.513934850692749, + "logps/chosen": -389.2949523925781, + "logps/rejected": -313.2209777832031, + "loss": 0.1133, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.3460185527801514, + "rewards/margins": 6.069815635681152, + "rewards/rejected": -8.415834426879883, + "step": 2742 + }, + { + "epoch": 0.57, + "learning_rate": 8.558823529411765e-06, + "logits/chosen": -1.976921558380127, + "logits/rejected": -2.0476112365722656, + "logps/chosen": -295.4115295410156, + "logps/rejected": -344.71636962890625, + "loss": 0.1671, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.9183807373046875, + "rewards/margins": 4.755885601043701, + "rewards/rejected": -7.6742658615112305, + "step": 2743 + }, + { + "epoch": 0.57, + "learning_rate": 8.554621848739497e-06, + "logits/chosen": -2.357006072998047, + "logits/rejected": -2.1295089721679688, + "logps/chosen": -336.525634765625, + "logps/rejected": -323.97760009765625, + "loss": 0.2236, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.4714977741241455, + "rewards/margins": 2.4537720680236816, + "rewards/rejected": -5.925270080566406, + "step": 2744 + }, + { + "epoch": 0.57, + "learning_rate": 8.550420168067227e-06, + "logits/chosen": -2.479200839996338, + "logits/rejected": -2.314918279647827, + "logps/chosen": -357.25653076171875, + "logps/rejected": -326.67742919921875, + "loss": 0.2253, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.705740451812744, + "rewards/margins": 3.5112805366516113, + "rewards/rejected": -6.2170209884643555, + "step": 2745 + }, + { + "epoch": 0.57, + "learning_rate": 8.546218487394959e-06, + "logits/chosen": -2.3939208984375, + "logits/rejected": -1.787131905555725, + "logps/chosen": -472.98162841796875, + "logps/rejected": -378.8699645996094, + "loss": 0.0715, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.249552011489868, + "rewards/margins": 4.788482666015625, + "rewards/rejected": -8.038034439086914, + "step": 2746 + }, + { + "epoch": 0.57, + "learning_rate": 8.54201680672269e-06, + "logits/chosen": -2.2116236686706543, + "logits/rejected": -1.9275844097137451, + "logps/chosen": -394.430908203125, + "logps/rejected": -310.01678466796875, + "loss": 0.3442, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.295100212097168, + "rewards/margins": 4.601902008056641, + "rewards/rejected": -7.897002696990967, + "step": 2747 + }, + { + "epoch": 0.57, + "learning_rate": 8.537815126050421e-06, + "logits/chosen": -2.2263567447662354, + "logits/rejected": -1.7479074001312256, + "logps/chosen": -272.418701171875, + "logps/rejected": -248.99655151367188, + "loss": 0.2152, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.277068614959717, + "rewards/margins": 3.5347352027893066, + "rewards/rejected": -6.811803817749023, + "step": 2748 + }, + { + "epoch": 0.58, + "learning_rate": 8.533613445378151e-06, + "logits/chosen": -2.09126615524292, + "logits/rejected": -1.7565053701400757, + "logps/chosen": -376.72015380859375, + "logps/rejected": -318.0309143066406, + "loss": 0.6299, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.174788475036621, + "rewards/margins": 3.727644920349121, + "rewards/rejected": -7.9024338722229, + "step": 2749 + }, + { + "epoch": 0.58, + "learning_rate": 8.529411764705883e-06, + "logits/chosen": -1.9707107543945312, + "logits/rejected": -2.1534948348999023, + "logps/chosen": -389.84368896484375, + "logps/rejected": -344.63653564453125, + "loss": 0.249, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.834862470626831, + "rewards/margins": 3.604830741882324, + "rewards/rejected": -6.439692974090576, + "step": 2750 + }, + { + "epoch": 0.58, + "learning_rate": 8.525210084033614e-06, + "logits/chosen": -1.9963254928588867, + "logits/rejected": -1.783385992050171, + "logps/chosen": -272.95709228515625, + "logps/rejected": -267.8773193359375, + "loss": 0.2217, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.2382700443267822, + "rewards/margins": 3.3290963172912598, + "rewards/rejected": -6.567366600036621, + "step": 2751 + }, + { + "epoch": 0.58, + "learning_rate": 8.521008403361345e-06, + "logits/chosen": -1.9712328910827637, + "logits/rejected": -2.0587267875671387, + "logps/chosen": -331.9048767089844, + "logps/rejected": -351.9537658691406, + "loss": 0.481, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.469090938568115, + "rewards/margins": 3.4372684955596924, + "rewards/rejected": -7.9063591957092285, + "step": 2752 + }, + { + "epoch": 0.58, + "learning_rate": 8.516806722689076e-06, + "logits/chosen": -1.8107143640518188, + "logits/rejected": -2.1633617877960205, + "logps/chosen": -339.5399475097656, + "logps/rejected": -420.3126220703125, + "loss": 0.3497, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.945976734161377, + "rewards/margins": 3.6860883235931396, + "rewards/rejected": -6.6320648193359375, + "step": 2753 + }, + { + "epoch": 0.58, + "learning_rate": 8.512605042016808e-06, + "logits/chosen": -2.3672072887420654, + "logits/rejected": -2.229079246520996, + "logps/chosen": -367.57208251953125, + "logps/rejected": -360.6598815917969, + "loss": 0.4667, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.000546932220459, + "rewards/margins": 4.037282943725586, + "rewards/rejected": -7.037830352783203, + "step": 2754 + }, + { + "epoch": 0.58, + "learning_rate": 8.508403361344538e-06, + "logits/chosen": -2.0224218368530273, + "logits/rejected": -2.164910316467285, + "logps/chosen": -363.3284606933594, + "logps/rejected": -406.784912109375, + "loss": 0.4544, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.017745018005371, + "rewards/margins": 3.6307172775268555, + "rewards/rejected": -7.648462295532227, + "step": 2755 + }, + { + "epoch": 0.58, + "learning_rate": 8.50420168067227e-06, + "logits/chosen": -2.3570475578308105, + "logits/rejected": -2.0096800327301025, + "logps/chosen": -294.29901123046875, + "logps/rejected": -287.4659118652344, + "loss": 0.2954, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.8642473220825195, + "rewards/margins": 4.2596588134765625, + "rewards/rejected": -8.123906135559082, + "step": 2756 + }, + { + "epoch": 0.58, + "learning_rate": 8.5e-06, + "logits/chosen": -2.246244430541992, + "logits/rejected": -1.7577433586120605, + "logps/chosen": -354.1215515136719, + "logps/rejected": -286.30718994140625, + "loss": 0.4151, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.069730281829834, + "rewards/margins": 2.696025848388672, + "rewards/rejected": -6.765755653381348, + "step": 2757 + }, + { + "epoch": 0.58, + "learning_rate": 8.495798319327732e-06, + "logits/chosen": -2.0918283462524414, + "logits/rejected": -1.9773999452590942, + "logps/chosen": -325.2257080078125, + "logps/rejected": -681.0278930664062, + "loss": 0.1745, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.468827486038208, + "rewards/margins": 5.041810035705566, + "rewards/rejected": -7.510637283325195, + "step": 2758 + }, + { + "epoch": 0.58, + "learning_rate": 8.491596638655462e-06, + "logits/chosen": -2.1199092864990234, + "logits/rejected": -2.0753183364868164, + "logps/chosen": -317.63995361328125, + "logps/rejected": -368.48016357421875, + "loss": 0.7816, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.605165719985962, + "rewards/margins": 3.129185438156128, + "rewards/rejected": -6.73435115814209, + "step": 2759 + }, + { + "epoch": 0.58, + "learning_rate": 8.487394957983194e-06, + "logits/chosen": -2.4660263061523438, + "logits/rejected": -2.0746824741363525, + "logps/chosen": -500.5414123535156, + "logps/rejected": -409.3662109375, + "loss": 0.3254, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.141390323638916, + "rewards/margins": 3.743461847305298, + "rewards/rejected": -6.884852409362793, + "step": 2760 + }, + { + "epoch": 0.58, + "learning_rate": 8.483193277310924e-06, + "logits/chosen": -2.048506736755371, + "logits/rejected": -2.1086485385894775, + "logps/chosen": -244.1903076171875, + "logps/rejected": -255.09521484375, + "loss": 0.2566, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.9720146656036377, + "rewards/margins": 3.8307254314422607, + "rewards/rejected": -6.80273962020874, + "step": 2761 + }, + { + "epoch": 0.58, + "learning_rate": 8.478991596638656e-06, + "logits/chosen": -1.9258618354797363, + "logits/rejected": -1.8111095428466797, + "logps/chosen": -289.5670166015625, + "logps/rejected": -267.3599853515625, + "loss": 0.2838, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.1344733238220215, + "rewards/margins": 3.522833824157715, + "rewards/rejected": -6.6573076248168945, + "step": 2762 + }, + { + "epoch": 0.58, + "learning_rate": 8.474789915966386e-06, + "logits/chosen": -1.9609168767929077, + "logits/rejected": -1.9910008907318115, + "logps/chosen": -412.28765869140625, + "logps/rejected": -428.9263610839844, + "loss": 0.1521, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.8766653537750244, + "rewards/margins": 5.185330390930176, + "rewards/rejected": -8.061995506286621, + "step": 2763 + }, + { + "epoch": 0.58, + "learning_rate": 8.470588235294118e-06, + "logits/chosen": -2.2419185638427734, + "logits/rejected": -2.4218924045562744, + "logps/chosen": -278.5075988769531, + "logps/rejected": -370.38385009765625, + "loss": 0.4257, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.6476404666900635, + "rewards/margins": 3.3917269706726074, + "rewards/rejected": -7.03936767578125, + "step": 2764 + }, + { + "epoch": 0.58, + "learning_rate": 8.46638655462185e-06, + "logits/chosen": -2.202385425567627, + "logits/rejected": -1.91288161277771, + "logps/chosen": -350.34429931640625, + "logps/rejected": -325.995849609375, + "loss": 0.2618, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.996974468231201, + "rewards/margins": 3.909031867980957, + "rewards/rejected": -6.906006336212158, + "step": 2765 + }, + { + "epoch": 0.58, + "learning_rate": 8.46218487394958e-06, + "logits/chosen": -1.9805129766464233, + "logits/rejected": -1.8966350555419922, + "logps/chosen": -328.09906005859375, + "logps/rejected": -473.0970153808594, + "loss": 0.52, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.1002235412597656, + "rewards/margins": 2.876615524291992, + "rewards/rejected": -5.976839065551758, + "step": 2766 + }, + { + "epoch": 0.58, + "learning_rate": 8.457983193277312e-06, + "logits/chosen": -2.1060500144958496, + "logits/rejected": -1.880358338356018, + "logps/chosen": -238.4542236328125, + "logps/rejected": -262.7034912109375, + "loss": 0.3534, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.021040439605713, + "rewards/margins": 2.067019462585449, + "rewards/rejected": -5.088059902191162, + "step": 2767 + }, + { + "epoch": 0.58, + "learning_rate": 8.453781512605043e-06, + "logits/chosen": -2.1656126976013184, + "logits/rejected": -1.620773434638977, + "logps/chosen": -450.619873046875, + "logps/rejected": -300.18896484375, + "loss": 0.1291, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.9355082511901855, + "rewards/margins": 4.130982875823975, + "rewards/rejected": -7.06649112701416, + "step": 2768 + }, + { + "epoch": 0.58, + "learning_rate": 8.449579831932774e-06, + "logits/chosen": -1.9953278303146362, + "logits/rejected": -1.7384717464447021, + "logps/chosen": -340.85662841796875, + "logps/rejected": -329.06097412109375, + "loss": 0.3799, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.7691805362701416, + "rewards/margins": 3.792186737060547, + "rewards/rejected": -6.561367034912109, + "step": 2769 + }, + { + "epoch": 0.58, + "learning_rate": 8.445378151260505e-06, + "logits/chosen": -1.913532018661499, + "logits/rejected": -2.2707715034484863, + "logps/chosen": -455.59881591796875, + "logps/rejected": -611.9102783203125, + "loss": 0.6188, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.793298721313477, + "rewards/margins": 3.1364054679870605, + "rewards/rejected": -7.929703712463379, + "step": 2770 + }, + { + "epoch": 0.58, + "learning_rate": 8.441176470588237e-06, + "logits/chosen": -2.1847782135009766, + "logits/rejected": -2.245988130569458, + "logps/chosen": -286.3011779785156, + "logps/rejected": -327.3034362792969, + "loss": 0.2174, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.532351493835449, + "rewards/margins": 4.090540885925293, + "rewards/rejected": -7.622891902923584, + "step": 2771 + }, + { + "epoch": 0.58, + "learning_rate": 8.436974789915967e-06, + "logits/chosen": -2.3693184852600098, + "logits/rejected": -1.832909345626831, + "logps/chosen": -307.59466552734375, + "logps/rejected": -270.8913269042969, + "loss": 0.307, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.4480719566345215, + "rewards/margins": 2.7676219940185547, + "rewards/rejected": -6.215693473815918, + "step": 2772 + }, + { + "epoch": 0.58, + "learning_rate": 8.432773109243699e-06, + "logits/chosen": -2.3170132637023926, + "logits/rejected": -2.1645610332489014, + "logps/chosen": -313.81915283203125, + "logps/rejected": -330.9603271484375, + "loss": 0.173, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.2745020389556885, + "rewards/margins": 4.4955902099609375, + "rewards/rejected": -7.770092010498047, + "step": 2773 + }, + { + "epoch": 0.58, + "learning_rate": 8.428571428571429e-06, + "logits/chosen": -1.7949676513671875, + "logits/rejected": -1.7654428482055664, + "logps/chosen": -313.2247314453125, + "logps/rejected": -284.2032165527344, + "loss": 0.2479, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.2744646072387695, + "rewards/margins": 3.678178310394287, + "rewards/rejected": -6.952643394470215, + "step": 2774 + }, + { + "epoch": 0.58, + "learning_rate": 8.424369747899161e-06, + "logits/chosen": -2.18754243850708, + "logits/rejected": -2.008413791656494, + "logps/chosen": -362.43621826171875, + "logps/rejected": -312.5030517578125, + "loss": 0.486, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.667646646499634, + "rewards/margins": 2.5805516242980957, + "rewards/rejected": -6.248198509216309, + "step": 2775 + }, + { + "epoch": 0.58, + "learning_rate": 8.420168067226891e-06, + "logits/chosen": -1.9346718788146973, + "logits/rejected": -1.8294777870178223, + "logps/chosen": -355.0703125, + "logps/rejected": -350.0106506347656, + "loss": 0.4653, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.0981059074401855, + "rewards/margins": 2.373842239379883, + "rewards/rejected": -6.471948146820068, + "step": 2776 + }, + { + "epoch": 0.58, + "learning_rate": 8.415966386554623e-06, + "logits/chosen": -2.3585987091064453, + "logits/rejected": -1.9688363075256348, + "logps/chosen": -418.8023681640625, + "logps/rejected": -376.44287109375, + "loss": 0.6314, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.7327287197113037, + "rewards/margins": 4.166337966918945, + "rewards/rejected": -6.899066925048828, + "step": 2777 + }, + { + "epoch": 0.58, + "learning_rate": 8.411764705882353e-06, + "logits/chosen": -2.085252523422241, + "logits/rejected": -1.9837133884429932, + "logps/chosen": -307.84088134765625, + "logps/rejected": -310.72540283203125, + "loss": 0.0782, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.6739141941070557, + "rewards/margins": 4.90252161026001, + "rewards/rejected": -7.5764360427856445, + "step": 2778 + }, + { + "epoch": 0.58, + "learning_rate": 8.407563025210085e-06, + "logits/chosen": -2.0152535438537598, + "logits/rejected": -2.121246576309204, + "logps/chosen": -305.7744445800781, + "logps/rejected": -425.6885986328125, + "loss": 0.1232, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6054189205169678, + "rewards/margins": 4.896279811859131, + "rewards/rejected": -7.501698970794678, + "step": 2779 + }, + { + "epoch": 0.58, + "learning_rate": 8.403361344537815e-06, + "logits/chosen": -2.1084814071655273, + "logits/rejected": -1.8833231925964355, + "logps/chosen": -373.2188720703125, + "logps/rejected": -368.16717529296875, + "loss": 0.4532, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.181241989135742, + "rewards/margins": 3.458427906036377, + "rewards/rejected": -7.639669895172119, + "step": 2780 + }, + { + "epoch": 0.58, + "learning_rate": 8.399159663865547e-06, + "logits/chosen": -2.082890272140503, + "logits/rejected": -1.941098690032959, + "logps/chosen": -294.1342468261719, + "logps/rejected": -416.9605407714844, + "loss": 0.2295, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.9211888313293457, + "rewards/margins": 3.5427868366241455, + "rewards/rejected": -7.46397590637207, + "step": 2781 + }, + { + "epoch": 0.58, + "learning_rate": 8.394957983193277e-06, + "logits/chosen": -2.0943055152893066, + "logits/rejected": -1.982130527496338, + "logps/chosen": -296.4791564941406, + "logps/rejected": -322.4605712890625, + "loss": 0.5699, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.8016984462738037, + "rewards/margins": 2.4455103874206543, + "rewards/rejected": -6.247209072113037, + "step": 2782 + }, + { + "epoch": 0.58, + "learning_rate": 8.39075630252101e-06, + "logits/chosen": -2.210841417312622, + "logits/rejected": -1.247216820716858, + "logps/chosen": -362.2924499511719, + "logps/rejected": -358.727783203125, + "loss": 0.1903, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.836737632751465, + "rewards/margins": 5.2190260887146, + "rewards/rejected": -9.055764198303223, + "step": 2783 + }, + { + "epoch": 0.58, + "learning_rate": 8.38655462184874e-06, + "logits/chosen": -2.2934329509735107, + "logits/rejected": -1.888594150543213, + "logps/chosen": -351.5435791015625, + "logps/rejected": -293.0517883300781, + "loss": 0.621, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.551410436630249, + "rewards/margins": 2.662245273590088, + "rewards/rejected": -6.213655471801758, + "step": 2784 + }, + { + "epoch": 0.58, + "learning_rate": 8.382352941176472e-06, + "logits/chosen": -2.1756348609924316, + "logits/rejected": -2.043821334838867, + "logps/chosen": -360.9361572265625, + "logps/rejected": -348.6968688964844, + "loss": 0.5248, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.6509859561920166, + "rewards/margins": 2.452268123626709, + "rewards/rejected": -6.103254318237305, + "step": 2785 + }, + { + "epoch": 0.58, + "learning_rate": 8.378151260504202e-06, + "logits/chosen": -1.7733654975891113, + "logits/rejected": -2.1611368656158447, + "logps/chosen": -322.57891845703125, + "logps/rejected": -406.50506591796875, + "loss": 0.3779, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.5862035751342773, + "rewards/margins": 3.7182374000549316, + "rewards/rejected": -7.304441452026367, + "step": 2786 + }, + { + "epoch": 0.58, + "learning_rate": 8.373949579831934e-06, + "logits/chosen": -1.7838860750198364, + "logits/rejected": -1.7842795848846436, + "logps/chosen": -322.0689697265625, + "logps/rejected": -374.1538391113281, + "loss": 0.1184, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.7560086250305176, + "rewards/margins": 4.20697021484375, + "rewards/rejected": -6.962978839874268, + "step": 2787 + }, + { + "epoch": 0.58, + "learning_rate": 8.369747899159666e-06, + "logits/chosen": -1.9061880111694336, + "logits/rejected": -1.9838675260543823, + "logps/chosen": -279.1058349609375, + "logps/rejected": -332.3639221191406, + "loss": 0.4248, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.7480697631835938, + "rewards/margins": 2.9088857173919678, + "rewards/rejected": -6.656955242156982, + "step": 2788 + }, + { + "epoch": 0.58, + "learning_rate": 8.365546218487396e-06, + "logits/chosen": -1.9424360990524292, + "logits/rejected": -2.121950626373291, + "logps/chosen": -441.90869140625, + "logps/rejected": -400.1189270019531, + "loss": 0.1411, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.360245704650879, + "rewards/margins": 4.127459526062012, + "rewards/rejected": -6.487705707550049, + "step": 2789 + }, + { + "epoch": 0.58, + "learning_rate": 8.361344537815128e-06, + "logits/chosen": -2.2728476524353027, + "logits/rejected": -1.8819782733917236, + "logps/chosen": -302.7781677246094, + "logps/rejected": -253.2996063232422, + "loss": 0.146, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.595346689224243, + "rewards/margins": 4.623626232147217, + "rewards/rejected": -7.218973159790039, + "step": 2790 + }, + { + "epoch": 0.58, + "learning_rate": 8.357142857142858e-06, + "logits/chosen": -2.2668657302856445, + "logits/rejected": -1.9130792617797852, + "logps/chosen": -461.9660949707031, + "logps/rejected": -356.5831604003906, + "loss": 0.1988, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.409268617630005, + "rewards/margins": 3.6587791442871094, + "rewards/rejected": -6.068047523498535, + "step": 2791 + }, + { + "epoch": 0.58, + "learning_rate": 8.35294117647059e-06, + "logits/chosen": -2.04266619682312, + "logits/rejected": -1.8873331546783447, + "logps/chosen": -328.31317138671875, + "logps/rejected": -318.4830017089844, + "loss": 0.6483, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.6163482666015625, + "rewards/margins": 3.330782890319824, + "rewards/rejected": -6.94713020324707, + "step": 2792 + }, + { + "epoch": 0.58, + "learning_rate": 8.34873949579832e-06, + "logits/chosen": -2.2646713256835938, + "logits/rejected": -1.931402564048767, + "logps/chosen": -283.7231140136719, + "logps/rejected": -335.3396911621094, + "loss": 0.3156, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.831061601638794, + "rewards/margins": 3.654247283935547, + "rewards/rejected": -7.485308647155762, + "step": 2793 + }, + { + "epoch": 0.58, + "learning_rate": 8.344537815126052e-06, + "logits/chosen": -2.2812507152557373, + "logits/rejected": -2.142279863357544, + "logps/chosen": -301.21826171875, + "logps/rejected": -331.44244384765625, + "loss": 0.4129, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.496366024017334, + "rewards/margins": 3.1071934700012207, + "rewards/rejected": -6.603559494018555, + "step": 2794 + }, + { + "epoch": 0.58, + "learning_rate": 8.340336134453782e-06, + "logits/chosen": -2.236924648284912, + "logits/rejected": -1.9672925472259521, + "logps/chosen": -393.2303466796875, + "logps/rejected": -373.8767395019531, + "loss": 0.3419, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.8130574226379395, + "rewards/margins": 4.237860679626465, + "rewards/rejected": -7.0509185791015625, + "step": 2795 + }, + { + "epoch": 0.58, + "learning_rate": 8.336134453781514e-06, + "logits/chosen": -2.3719825744628906, + "logits/rejected": -2.116333246231079, + "logps/chosen": -364.54010009765625, + "logps/rejected": -399.49456787109375, + "loss": 0.6433, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.476327896118164, + "rewards/margins": 2.731562852859497, + "rewards/rejected": -7.207890510559082, + "step": 2796 + }, + { + "epoch": 0.59, + "learning_rate": 8.331932773109244e-06, + "logits/chosen": -2.025804042816162, + "logits/rejected": -2.1762540340423584, + "logps/chosen": -201.6788330078125, + "logps/rejected": -288.7137451171875, + "loss": 0.2266, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.5049009323120117, + "rewards/margins": 3.50118088722229, + "rewards/rejected": -7.0060811042785645, + "step": 2797 + }, + { + "epoch": 0.59, + "learning_rate": 8.327731092436976e-06, + "logits/chosen": -2.3107378482818604, + "logits/rejected": -1.8063175678253174, + "logps/chosen": -309.6197814941406, + "logps/rejected": -310.44390869140625, + "loss": 0.0703, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.897059917449951, + "rewards/margins": 5.263006210327148, + "rewards/rejected": -8.160066604614258, + "step": 2798 + }, + { + "epoch": 0.59, + "learning_rate": 8.323529411764707e-06, + "logits/chosen": -2.088073492050171, + "logits/rejected": -1.984992265701294, + "logps/chosen": -298.46868896484375, + "logps/rejected": -299.87664794921875, + "loss": 0.2191, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.6454598903656006, + "rewards/margins": 3.8441081047058105, + "rewards/rejected": -7.48956823348999, + "step": 2799 + }, + { + "epoch": 0.59, + "learning_rate": 8.319327731092438e-06, + "logits/chosen": -1.8999435901641846, + "logits/rejected": -1.975258708000183, + "logps/chosen": -233.81503295898438, + "logps/rejected": -267.8039245605469, + "loss": 0.423, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.6107382774353027, + "rewards/margins": 2.935202121734619, + "rewards/rejected": -6.545940399169922, + "step": 2800 + }, + { + "epoch": 0.59, + "learning_rate": 8.315126050420169e-06, + "logits/chosen": -2.0748801231384277, + "logits/rejected": -1.9033212661743164, + "logps/chosen": -376.46484375, + "logps/rejected": -376.0827941894531, + "loss": 0.0856, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.930233955383301, + "rewards/margins": 5.441582202911377, + "rewards/rejected": -8.37181568145752, + "step": 2801 + }, + { + "epoch": 0.59, + "learning_rate": 8.3109243697479e-06, + "logits/chosen": -2.065798759460449, + "logits/rejected": -2.2086679935455322, + "logps/chosen": -277.2217712402344, + "logps/rejected": -395.1743469238281, + "loss": 0.4226, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.7898495197296143, + "rewards/margins": 2.7232718467712402, + "rewards/rejected": -5.513121128082275, + "step": 2802 + }, + { + "epoch": 0.59, + "learning_rate": 8.30672268907563e-06, + "logits/chosen": -2.1602094173431396, + "logits/rejected": -1.749508023262024, + "logps/chosen": -346.4497375488281, + "logps/rejected": -289.76220703125, + "loss": 0.2109, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.653831720352173, + "rewards/margins": 4.648664474487305, + "rewards/rejected": -7.302495956420898, + "step": 2803 + }, + { + "epoch": 0.59, + "learning_rate": 8.302521008403363e-06, + "logits/chosen": -2.107135534286499, + "logits/rejected": -1.8716074228286743, + "logps/chosen": -365.9205322265625, + "logps/rejected": -354.135498046875, + "loss": 0.0703, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.993846893310547, + "rewards/margins": 4.114795207977295, + "rewards/rejected": -7.108642578125, + "step": 2804 + }, + { + "epoch": 0.59, + "learning_rate": 8.298319327731093e-06, + "logits/chosen": -2.174389362335205, + "logits/rejected": -1.9666194915771484, + "logps/chosen": -326.303466796875, + "logps/rejected": -291.7364807128906, + "loss": 0.2614, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.902897596359253, + "rewards/margins": 4.314565658569336, + "rewards/rejected": -8.217463493347168, + "step": 2805 + }, + { + "epoch": 0.59, + "learning_rate": 8.294117647058825e-06, + "logits/chosen": -2.284073829650879, + "logits/rejected": -2.0911784172058105, + "logps/chosen": -450.76458740234375, + "logps/rejected": -396.35565185546875, + "loss": 0.3316, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.68430495262146, + "rewards/margins": 3.2674429416656494, + "rewards/rejected": -6.951747894287109, + "step": 2806 + }, + { + "epoch": 0.59, + "learning_rate": 8.289915966386555e-06, + "logits/chosen": -2.363044500350952, + "logits/rejected": -1.9413715600967407, + "logps/chosen": -490.34033203125, + "logps/rejected": -455.1481628417969, + "loss": 0.1363, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.6697349548339844, + "rewards/margins": 5.12384557723999, + "rewards/rejected": -7.793581008911133, + "step": 2807 + }, + { + "epoch": 0.59, + "learning_rate": 8.285714285714287e-06, + "logits/chosen": -2.07851505279541, + "logits/rejected": -1.738268256187439, + "logps/chosen": -317.51806640625, + "logps/rejected": -324.86822509765625, + "loss": 0.297, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.6573843955993652, + "rewards/margins": 3.86836576461792, + "rewards/rejected": -7.525750160217285, + "step": 2808 + }, + { + "epoch": 0.59, + "learning_rate": 8.281512605042017e-06, + "logits/chosen": -1.879288911819458, + "logits/rejected": -1.9739577770233154, + "logps/chosen": -393.1387023925781, + "logps/rejected": -398.0206604003906, + "loss": 0.209, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.2824854850769043, + "rewards/margins": 2.966374397277832, + "rewards/rejected": -6.248859405517578, + "step": 2809 + }, + { + "epoch": 0.59, + "learning_rate": 8.277310924369747e-06, + "logits/chosen": -1.7616772651672363, + "logits/rejected": -2.0976154804229736, + "logps/chosen": -215.555908203125, + "logps/rejected": -275.7451477050781, + "loss": 0.176, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.925398349761963, + "rewards/margins": 5.10221529006958, + "rewards/rejected": -9.027612686157227, + "step": 2810 + }, + { + "epoch": 0.59, + "learning_rate": 8.27310924369748e-06, + "logits/chosen": -2.186342716217041, + "logits/rejected": -1.6078993082046509, + "logps/chosen": -374.6521301269531, + "logps/rejected": -321.64398193359375, + "loss": 0.2071, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.4683961868286133, + "rewards/margins": 3.606710195541382, + "rewards/rejected": -7.075106620788574, + "step": 2811 + }, + { + "epoch": 0.59, + "learning_rate": 8.26890756302521e-06, + "logits/chosen": -2.2170135974884033, + "logits/rejected": -2.0392019748687744, + "logps/chosen": -412.68218994140625, + "logps/rejected": -439.02056884765625, + "loss": 0.3074, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.8754336833953857, + "rewards/margins": 4.200932025909424, + "rewards/rejected": -7.0763654708862305, + "step": 2812 + }, + { + "epoch": 0.59, + "learning_rate": 8.264705882352941e-06, + "logits/chosen": -1.7978732585906982, + "logits/rejected": -2.045762538909912, + "logps/chosen": -255.79513549804688, + "logps/rejected": -351.8050231933594, + "loss": 0.4799, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.6061131954193115, + "rewards/margins": 4.833450794219971, + "rewards/rejected": -8.439563751220703, + "step": 2813 + }, + { + "epoch": 0.59, + "learning_rate": 8.260504201680672e-06, + "logits/chosen": -2.3886878490448, + "logits/rejected": -1.910915732383728, + "logps/chosen": -370.0023193359375, + "logps/rejected": -311.5486145019531, + "loss": 0.3669, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.9370834827423096, + "rewards/margins": 3.734679698944092, + "rewards/rejected": -6.6717634201049805, + "step": 2814 + }, + { + "epoch": 0.59, + "learning_rate": 8.256302521008404e-06, + "logits/chosen": -2.293048143386841, + "logits/rejected": -1.751259446144104, + "logps/chosen": -380.3153076171875, + "logps/rejected": -323.15106201171875, + "loss": 0.2363, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.8120436668395996, + "rewards/margins": 3.2259268760681152, + "rewards/rejected": -6.037970542907715, + "step": 2815 + }, + { + "epoch": 0.59, + "learning_rate": 8.252100840336136e-06, + "logits/chosen": -2.1402781009674072, + "logits/rejected": -2.1231849193573, + "logps/chosen": -219.87908935546875, + "logps/rejected": -263.8876647949219, + "loss": 0.2605, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.0515546798706055, + "rewards/margins": 3.040764093399048, + "rewards/rejected": -6.092319011688232, + "step": 2816 + }, + { + "epoch": 0.59, + "learning_rate": 8.247899159663866e-06, + "logits/chosen": -2.347778797149658, + "logits/rejected": -2.0777573585510254, + "logps/chosen": -337.0054931640625, + "logps/rejected": -352.59539794921875, + "loss": 0.1095, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.0044875144958496, + "rewards/margins": 5.388765335083008, + "rewards/rejected": -8.393253326416016, + "step": 2817 + }, + { + "epoch": 0.59, + "learning_rate": 8.243697478991598e-06, + "logits/chosen": -2.123249053955078, + "logits/rejected": -1.9542696475982666, + "logps/chosen": -357.5397644042969, + "logps/rejected": -544.1553344726562, + "loss": 1.2184, + "rewards/accuracies": 0.4375, + "rewards/chosen": -3.882512092590332, + "rewards/margins": 0.9422541856765747, + "rewards/rejected": -4.824766635894775, + "step": 2818 + }, + { + "epoch": 0.59, + "learning_rate": 8.239495798319328e-06, + "logits/chosen": -1.9939534664154053, + "logits/rejected": -2.119971752166748, + "logps/chosen": -284.2933654785156, + "logps/rejected": -383.06951904296875, + "loss": 0.6059, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.894986152648926, + "rewards/margins": 3.1070070266723633, + "rewards/rejected": -7.001993179321289, + "step": 2819 + }, + { + "epoch": 0.59, + "learning_rate": 8.23529411764706e-06, + "logits/chosen": -2.3356690406799316, + "logits/rejected": -2.06516695022583, + "logps/chosen": -335.189697265625, + "logps/rejected": -333.42047119140625, + "loss": 0.2821, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.6549527645111084, + "rewards/margins": 2.70896053314209, + "rewards/rejected": -6.363913536071777, + "step": 2820 + }, + { + "epoch": 0.59, + "learning_rate": 8.23109243697479e-06, + "logits/chosen": -2.038480758666992, + "logits/rejected": -1.8009753227233887, + "logps/chosen": -331.6080322265625, + "logps/rejected": -333.0943298339844, + "loss": 0.0706, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7101237773895264, + "rewards/margins": 5.231769561767578, + "rewards/rejected": -7.941892623901367, + "step": 2821 + }, + { + "epoch": 0.59, + "learning_rate": 8.226890756302522e-06, + "logits/chosen": -2.0843725204467773, + "logits/rejected": -1.7788591384887695, + "logps/chosen": -337.2210693359375, + "logps/rejected": -372.18585205078125, + "loss": 0.337, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.717348098754883, + "rewards/margins": 3.6554651260375977, + "rewards/rejected": -7.372813701629639, + "step": 2822 + }, + { + "epoch": 0.59, + "learning_rate": 8.222689075630252e-06, + "logits/chosen": -1.8485604524612427, + "logits/rejected": -2.3213250637054443, + "logps/chosen": -224.17706298828125, + "logps/rejected": -452.4322204589844, + "loss": 0.2053, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.2848198413848877, + "rewards/margins": 5.009344100952148, + "rewards/rejected": -8.294164657592773, + "step": 2823 + }, + { + "epoch": 0.59, + "learning_rate": 8.218487394957984e-06, + "logits/chosen": -2.095917224884033, + "logits/rejected": -1.7116236686706543, + "logps/chosen": -355.4200744628906, + "logps/rejected": -320.6032409667969, + "loss": 0.2091, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.125521421432495, + "rewards/margins": 4.299933433532715, + "rewards/rejected": -6.425454139709473, + "step": 2824 + }, + { + "epoch": 0.59, + "learning_rate": 8.214285714285714e-06, + "logits/chosen": -2.322499990463257, + "logits/rejected": -2.0416998863220215, + "logps/chosen": -421.985595703125, + "logps/rejected": -385.819580078125, + "loss": 0.2117, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.3511154651641846, + "rewards/margins": 4.265036582946777, + "rewards/rejected": -7.616151809692383, + "step": 2825 + }, + { + "epoch": 0.59, + "learning_rate": 8.210084033613446e-06, + "logits/chosen": -2.109950065612793, + "logits/rejected": -1.7112679481506348, + "logps/chosen": -342.0282287597656, + "logps/rejected": -386.3279113769531, + "loss": 0.5215, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.720414161682129, + "rewards/margins": 2.8072731494903564, + "rewards/rejected": -7.527687072753906, + "step": 2826 + }, + { + "epoch": 0.59, + "learning_rate": 8.205882352941176e-06, + "logits/chosen": -2.043193817138672, + "logits/rejected": -1.8053600788116455, + "logps/chosen": -340.2882995605469, + "logps/rejected": -283.55450439453125, + "loss": 0.4156, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.3082804679870605, + "rewards/margins": 4.165438175201416, + "rewards/rejected": -8.473718643188477, + "step": 2827 + }, + { + "epoch": 0.59, + "learning_rate": 8.201680672268908e-06, + "logits/chosen": -2.2017405033111572, + "logits/rejected": -1.9761722087860107, + "logps/chosen": -320.93365478515625, + "logps/rejected": -292.62908935546875, + "loss": 0.3305, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.529244899749756, + "rewards/margins": 4.6572675704956055, + "rewards/rejected": -8.18651294708252, + "step": 2828 + }, + { + "epoch": 0.59, + "learning_rate": 8.197478991596639e-06, + "logits/chosen": -1.9891941547393799, + "logits/rejected": -2.301286220550537, + "logps/chosen": -287.46368408203125, + "logps/rejected": -434.3602294921875, + "loss": 0.2346, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.409396171569824, + "rewards/margins": 6.277019500732422, + "rewards/rejected": -9.686415672302246, + "step": 2829 + }, + { + "epoch": 0.59, + "learning_rate": 8.19327731092437e-06, + "logits/chosen": -1.7858808040618896, + "logits/rejected": -1.7814605236053467, + "logps/chosen": -296.35528564453125, + "logps/rejected": -334.67218017578125, + "loss": 0.2394, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.1191277503967285, + "rewards/margins": 4.2347588539123535, + "rewards/rejected": -7.353886604309082, + "step": 2830 + }, + { + "epoch": 0.59, + "learning_rate": 8.1890756302521e-06, + "logits/chosen": -2.403583526611328, + "logits/rejected": -2.051482915878296, + "logps/chosen": -344.85809326171875, + "logps/rejected": -307.8115234375, + "loss": 0.2398, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.3161840438842773, + "rewards/margins": 3.2084336280822754, + "rewards/rejected": -6.524618148803711, + "step": 2831 + }, + { + "epoch": 0.59, + "learning_rate": 8.184873949579833e-06, + "logits/chosen": -2.1043310165405273, + "logits/rejected": -2.0739669799804688, + "logps/chosen": -309.0305480957031, + "logps/rejected": -405.19696044921875, + "loss": 0.167, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.588003158569336, + "rewards/margins": 4.840431213378906, + "rewards/rejected": -8.428434371948242, + "step": 2832 + }, + { + "epoch": 0.59, + "learning_rate": 8.180672268907563e-06, + "logits/chosen": -2.2396187782287598, + "logits/rejected": -2.1531238555908203, + "logps/chosen": -277.57025146484375, + "logps/rejected": -323.8175048828125, + "loss": 0.2929, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.723935127258301, + "rewards/margins": 2.4785430431365967, + "rewards/rejected": -6.202478408813477, + "step": 2833 + }, + { + "epoch": 0.59, + "learning_rate": 8.176470588235295e-06, + "logits/chosen": -2.009207248687744, + "logits/rejected": -2.1428966522216797, + "logps/chosen": -272.13909912109375, + "logps/rejected": -355.3739013671875, + "loss": 0.1675, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.269925594329834, + "rewards/margins": 4.545327186584473, + "rewards/rejected": -8.815254211425781, + "step": 2834 + }, + { + "epoch": 0.59, + "learning_rate": 8.172268907563025e-06, + "logits/chosen": -2.255329132080078, + "logits/rejected": -2.042604923248291, + "logps/chosen": -457.68280029296875, + "logps/rejected": -343.29901123046875, + "loss": 0.3561, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.5013070106506348, + "rewards/margins": 3.7814860343933105, + "rewards/rejected": -7.2827935218811035, + "step": 2835 + }, + { + "epoch": 0.59, + "learning_rate": 8.168067226890757e-06, + "logits/chosen": -1.8427557945251465, + "logits/rejected": -2.123448610305786, + "logps/chosen": -319.8478088378906, + "logps/rejected": -425.02716064453125, + "loss": 0.2068, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.94172739982605, + "rewards/margins": 6.207667827606201, + "rewards/rejected": -9.149395942687988, + "step": 2836 + }, + { + "epoch": 0.59, + "learning_rate": 8.163865546218487e-06, + "logits/chosen": -2.0624234676361084, + "logits/rejected": -2.131868600845337, + "logps/chosen": -328.3984069824219, + "logps/rejected": -394.0900573730469, + "loss": 0.504, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.672734498977661, + "rewards/margins": 2.772357225418091, + "rewards/rejected": -6.44509220123291, + "step": 2837 + }, + { + "epoch": 0.59, + "learning_rate": 8.159663865546219e-06, + "logits/chosen": -2.002519369125366, + "logits/rejected": -2.3567581176757812, + "logps/chosen": -263.5201416015625, + "logps/rejected": -352.77752685546875, + "loss": 0.5185, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.1634368896484375, + "rewards/margins": 3.137320041656494, + "rewards/rejected": -7.300756454467773, + "step": 2838 + }, + { + "epoch": 0.59, + "learning_rate": 8.155462184873951e-06, + "logits/chosen": -2.0792806148529053, + "logits/rejected": -2.0239672660827637, + "logps/chosen": -348.33917236328125, + "logps/rejected": -391.73919677734375, + "loss": 0.2867, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.483395576477051, + "rewards/margins": 4.1585822105407715, + "rewards/rejected": -8.64197826385498, + "step": 2839 + }, + { + "epoch": 0.59, + "learning_rate": 8.151260504201681e-06, + "logits/chosen": -2.3560848236083984, + "logits/rejected": -1.748203992843628, + "logps/chosen": -428.44305419921875, + "logps/rejected": -355.63946533203125, + "loss": 0.8137, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.6552109718322754, + "rewards/margins": 3.1481997966766357, + "rewards/rejected": -6.80341100692749, + "step": 2840 + }, + { + "epoch": 0.59, + "learning_rate": 8.147058823529413e-06, + "logits/chosen": -2.2991445064544678, + "logits/rejected": -1.9024101495742798, + "logps/chosen": -349.4647216796875, + "logps/rejected": -412.1666564941406, + "loss": 0.1101, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.6878700256347656, + "rewards/margins": 6.302396774291992, + "rewards/rejected": -8.990266799926758, + "step": 2841 + }, + { + "epoch": 0.59, + "learning_rate": 8.142857142857143e-06, + "logits/chosen": -2.1942005157470703, + "logits/rejected": -1.7929238080978394, + "logps/chosen": -416.61358642578125, + "logps/rejected": -389.9256896972656, + "loss": 0.2395, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.389235496520996, + "rewards/margins": 5.791066646575928, + "rewards/rejected": -10.180302619934082, + "step": 2842 + }, + { + "epoch": 0.59, + "learning_rate": 8.138655462184875e-06, + "logits/chosen": -2.36810302734375, + "logits/rejected": -1.8364918231964111, + "logps/chosen": -401.8931884765625, + "logps/rejected": -354.80712890625, + "loss": 0.3152, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.2738542556762695, + "rewards/margins": 3.8468637466430664, + "rewards/rejected": -8.120718002319336, + "step": 2843 + }, + { + "epoch": 0.59, + "learning_rate": 8.134453781512605e-06, + "logits/chosen": -2.1607508659362793, + "logits/rejected": -1.9271211624145508, + "logps/chosen": -321.31622314453125, + "logps/rejected": -308.65155029296875, + "loss": 0.4544, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.333409309387207, + "rewards/margins": 3.616272449493408, + "rewards/rejected": -6.949681758880615, + "step": 2844 + }, + { + "epoch": 0.6, + "learning_rate": 8.130252100840337e-06, + "logits/chosen": -2.1910910606384277, + "logits/rejected": -2.0567409992218018, + "logps/chosen": -388.2835693359375, + "logps/rejected": -397.63775634765625, + "loss": 0.4485, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.4273123741149902, + "rewards/margins": 4.08199405670166, + "rewards/rejected": -7.50930643081665, + "step": 2845 + }, + { + "epoch": 0.6, + "learning_rate": 8.126050420168068e-06, + "logits/chosen": -1.9302513599395752, + "logits/rejected": -2.0104446411132812, + "logps/chosen": -408.349609375, + "logps/rejected": -366.58447265625, + "loss": 0.1361, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.827929973602295, + "rewards/margins": 4.033010482788086, + "rewards/rejected": -6.860940933227539, + "step": 2846 + }, + { + "epoch": 0.6, + "learning_rate": 8.1218487394958e-06, + "logits/chosen": -2.2164080142974854, + "logits/rejected": -1.3722795248031616, + "logps/chosen": -313.64495849609375, + "logps/rejected": -263.4909973144531, + "loss": 0.4127, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.483514308929443, + "rewards/margins": 4.103476524353027, + "rewards/rejected": -8.586990356445312, + "step": 2847 + }, + { + "epoch": 0.6, + "learning_rate": 8.11764705882353e-06, + "logits/chosen": -2.17451548576355, + "logits/rejected": -2.138374090194702, + "logps/chosen": -292.60052490234375, + "logps/rejected": -374.3209533691406, + "loss": 0.21, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.0047714710235596, + "rewards/margins": 5.142330169677734, + "rewards/rejected": -8.147100448608398, + "step": 2848 + }, + { + "epoch": 0.6, + "learning_rate": 8.113445378151262e-06, + "logits/chosen": -2.0436458587646484, + "logits/rejected": -1.9778953790664673, + "logps/chosen": -301.82952880859375, + "logps/rejected": -397.66802978515625, + "loss": 0.0724, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.147308349609375, + "rewards/margins": 5.640937328338623, + "rewards/rejected": -8.78824520111084, + "step": 2849 + }, + { + "epoch": 0.6, + "learning_rate": 8.109243697478992e-06, + "logits/chosen": -2.169551134109497, + "logits/rejected": -1.8833082914352417, + "logps/chosen": -252.54771423339844, + "logps/rejected": -292.64251708984375, + "loss": 0.5495, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.90415620803833, + "rewards/margins": 2.9267563819885254, + "rewards/rejected": -6.8309125900268555, + "step": 2850 + }, + { + "epoch": 0.6, + "learning_rate": 8.105042016806724e-06, + "logits/chosen": -2.1114001274108887, + "logits/rejected": -1.8983662128448486, + "logps/chosen": -345.64544677734375, + "logps/rejected": -396.6995544433594, + "loss": 0.2116, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.0202040672302246, + "rewards/margins": 4.6315717697143555, + "rewards/rejected": -7.651776313781738, + "step": 2851 + }, + { + "epoch": 0.6, + "learning_rate": 8.100840336134454e-06, + "logits/chosen": -2.4691951274871826, + "logits/rejected": -2.1260340213775635, + "logps/chosen": -461.36627197265625, + "logps/rejected": -373.4756164550781, + "loss": 0.512, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.6879405975341797, + "rewards/margins": 3.469712972640991, + "rewards/rejected": -6.157653331756592, + "step": 2852 + }, + { + "epoch": 0.6, + "learning_rate": 8.096638655462186e-06, + "logits/chosen": -2.1128549575805664, + "logits/rejected": -1.9955192804336548, + "logps/chosen": -289.69439697265625, + "logps/rejected": -373.2474670410156, + "loss": 0.6819, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.7963194847106934, + "rewards/margins": 3.129281997680664, + "rewards/rejected": -6.925601482391357, + "step": 2853 + }, + { + "epoch": 0.6, + "learning_rate": 8.092436974789916e-06, + "logits/chosen": -1.9095611572265625, + "logits/rejected": -1.647695779800415, + "logps/chosen": -350.608642578125, + "logps/rejected": -318.40875244140625, + "loss": 0.1933, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8464856147766113, + "rewards/margins": 3.5230777263641357, + "rewards/rejected": -6.369563579559326, + "step": 2854 + }, + { + "epoch": 0.6, + "learning_rate": 8.088235294117648e-06, + "logits/chosen": -1.9426568746566772, + "logits/rejected": -1.8198564052581787, + "logps/chosen": -313.92413330078125, + "logps/rejected": -332.2087097167969, + "loss": 0.1479, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.27368688583374, + "rewards/margins": 4.817465782165527, + "rewards/rejected": -9.09115219116211, + "step": 2855 + }, + { + "epoch": 0.6, + "learning_rate": 8.084033613445378e-06, + "logits/chosen": -2.2103123664855957, + "logits/rejected": -2.2698264122009277, + "logps/chosen": -235.57872009277344, + "logps/rejected": -256.6566162109375, + "loss": 0.2125, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.961265802383423, + "rewards/margins": 4.453272819519043, + "rewards/rejected": -7.414538860321045, + "step": 2856 + }, + { + "epoch": 0.6, + "learning_rate": 8.07983193277311e-06, + "logits/chosen": -2.2029149532318115, + "logits/rejected": -1.9300525188446045, + "logps/chosen": -364.18182373046875, + "logps/rejected": -378.244873046875, + "loss": 0.3083, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.002892971038818, + "rewards/margins": 4.774847507476807, + "rewards/rejected": -8.777740478515625, + "step": 2857 + }, + { + "epoch": 0.6, + "learning_rate": 8.07563025210084e-06, + "logits/chosen": -2.1541645526885986, + "logits/rejected": -2.269373893737793, + "logps/chosen": -371.02301025390625, + "logps/rejected": -420.470458984375, + "loss": 0.4484, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.891386032104492, + "rewards/margins": 2.578678607940674, + "rewards/rejected": -6.470065116882324, + "step": 2858 + }, + { + "epoch": 0.6, + "learning_rate": 8.071428571428572e-06, + "logits/chosen": -2.264519214630127, + "logits/rejected": -2.0965452194213867, + "logps/chosen": -304.95111083984375, + "logps/rejected": -332.13262939453125, + "loss": 0.1327, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.532072067260742, + "rewards/margins": 4.769775390625, + "rewards/rejected": -7.301847457885742, + "step": 2859 + }, + { + "epoch": 0.6, + "learning_rate": 8.067226890756303e-06, + "logits/chosen": -2.408257484436035, + "logits/rejected": -2.1373329162597656, + "logps/chosen": -444.46441650390625, + "logps/rejected": -405.84423828125, + "loss": 0.2064, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.1656885147094727, + "rewards/margins": 4.396812438964844, + "rewards/rejected": -7.562500953674316, + "step": 2860 + }, + { + "epoch": 0.6, + "learning_rate": 8.063025210084034e-06, + "logits/chosen": -2.187272071838379, + "logits/rejected": -2.0420517921447754, + "logps/chosen": -386.6808166503906, + "logps/rejected": -349.65887451171875, + "loss": 0.2506, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.7949306964874268, + "rewards/margins": 5.878454208374023, + "rewards/rejected": -8.673385620117188, + "step": 2861 + }, + { + "epoch": 0.6, + "learning_rate": 8.058823529411766e-06, + "logits/chosen": -2.3455960750579834, + "logits/rejected": -1.427271842956543, + "logps/chosen": -345.93450927734375, + "logps/rejected": -242.88400268554688, + "loss": 0.4899, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.987334728240967, + "rewards/margins": 3.650768280029297, + "rewards/rejected": -6.6381025314331055, + "step": 2862 + }, + { + "epoch": 0.6, + "learning_rate": 8.054621848739497e-06, + "logits/chosen": -2.0288524627685547, + "logits/rejected": -1.914783000946045, + "logps/chosen": -314.9799499511719, + "logps/rejected": -385.12982177734375, + "loss": 0.2354, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.895512342453003, + "rewards/margins": 4.287503719329834, + "rewards/rejected": -8.183015823364258, + "step": 2863 + }, + { + "epoch": 0.6, + "learning_rate": 8.050420168067229e-06, + "logits/chosen": -2.2847816944122314, + "logits/rejected": -2.0739388465881348, + "logps/chosen": -270.5831298828125, + "logps/rejected": -328.23907470703125, + "loss": 0.2799, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.534893751144409, + "rewards/margins": 5.640796661376953, + "rewards/rejected": -9.175689697265625, + "step": 2864 + }, + { + "epoch": 0.6, + "learning_rate": 8.046218487394959e-06, + "logits/chosen": -1.9350030422210693, + "logits/rejected": -1.6919705867767334, + "logps/chosen": -382.2035827636719, + "logps/rejected": -390.9158935546875, + "loss": 0.3389, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.023379802703857, + "rewards/margins": 4.018260478973389, + "rewards/rejected": -8.041640281677246, + "step": 2865 + }, + { + "epoch": 0.6, + "learning_rate": 8.04201680672269e-06, + "logits/chosen": -2.129018783569336, + "logits/rejected": -2.176754951477051, + "logps/chosen": -396.83563232421875, + "logps/rejected": -400.2115478515625, + "loss": 0.2302, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.7818636894226074, + "rewards/margins": 3.6553544998168945, + "rewards/rejected": -6.437218189239502, + "step": 2866 + }, + { + "epoch": 0.6, + "learning_rate": 8.037815126050421e-06, + "logits/chosen": -2.068474769592285, + "logits/rejected": -1.9121875762939453, + "logps/chosen": -424.01397705078125, + "logps/rejected": -347.99053955078125, + "loss": 0.4682, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.936293840408325, + "rewards/margins": 2.923370122909546, + "rewards/rejected": -6.859663963317871, + "step": 2867 + }, + { + "epoch": 0.6, + "learning_rate": 8.033613445378153e-06, + "logits/chosen": -2.3719639778137207, + "logits/rejected": -1.8226454257965088, + "logps/chosen": -439.0179138183594, + "logps/rejected": -372.7168273925781, + "loss": 0.3606, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.1236109733581543, + "rewards/margins": 4.458876132965088, + "rewards/rejected": -7.582486629486084, + "step": 2868 + } + ], + "logging_steps": 1, + "max_steps": 4780, + "num_train_epochs": 1, + "save_steps": 239, + "total_flos": 0.0, + "trial_name": null, + "trial_params": null +}