diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,9685 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 100, + "global_step": 5804, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00034458993797381116, + "grad_norm": 1.5268679857254028, + "learning_rate": 8.605851979345955e-11, + "logits/chosen": -3.024087429046631, + "logits/rejected": -2.988196611404419, + "logps/chosen": -47.308799743652344, + "logps/rejected": -44.131954193115234, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0034458993797381117, + "grad_norm": 1.6066739559173584, + "learning_rate": 8.605851979345954e-10, + "logits/chosen": -3.0891270637512207, + "logits/rejected": -3.071465015411377, + "logps/chosen": -51.676212310791016, + "logps/rejected": -51.65631103515625, + "loss": 0.6931, + "rewards/accuracies": 0.4409722089767456, + "rewards/chosen": 1.6248530300799757e-05, + "rewards/margins": 8.904636342776939e-05, + "rewards/rejected": -7.279782585101202e-05, + "step": 10 + }, + { + "epoch": 0.006891798759476223, + "grad_norm": 1.4660340547561646, + "learning_rate": 1.7211703958691908e-09, + "logits/chosen": -3.098437786102295, + "logits/rejected": -3.0707993507385254, + "logps/chosen": -56.029632568359375, + "logps/rejected": -54.557579040527344, + "loss": 0.6931, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": 4.5064934965921566e-05, + "rewards/margins": 8.188869287550915e-06, + "rewards/rejected": 3.687605931190774e-05, + "step": 20 + }, + { + "epoch": 0.010337698139214336, + "grad_norm": 1.8557682037353516, + "learning_rate": 2.5817555938037863e-09, + "logits/chosen": -3.127624988555908, + "logits/rejected": -3.104484796524048, + "logps/chosen": -55.351043701171875, + "logps/rejected": -52.83721923828125, + "loss": 0.6932, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 4.745673322759103e-06, + "rewards/margins": -3.1675353966420516e-05, + "rewards/rejected": 3.6421035474631935e-05, + "step": 30 + }, + { + "epoch": 0.013783597518952447, + "grad_norm": 1.5628290176391602, + "learning_rate": 3.4423407917383816e-09, + "logits/chosen": -3.1053009033203125, + "logits/rejected": -3.078575611114502, + "logps/chosen": -56.41291427612305, + "logps/rejected": -53.65864181518555, + "loss": 0.6931, + "rewards/accuracies": 0.546875, + "rewards/chosen": 0.00019920275371987373, + "rewards/margins": 0.00018488746718503535, + "rewards/rejected": 1.4315284715848975e-05, + "step": 40 + }, + { + "epoch": 0.01722949689869056, + "grad_norm": 1.7454065084457397, + "learning_rate": 4.302925989672977e-09, + "logits/chosen": -3.0816972255706787, + "logits/rejected": -3.0432658195495605, + "logps/chosen": -54.70732879638672, + "logps/rejected": -51.237525939941406, + "loss": 0.6931, + "rewards/accuracies": 0.528124988079071, + "rewards/chosen": 6.539397872984409e-06, + "rewards/margins": 2.3190634237835184e-05, + "rewards/rejected": -1.6651232726871967e-05, + "step": 50 + }, + { + "epoch": 0.02067539627842867, + "grad_norm": 1.8053064346313477, + "learning_rate": 5.163511187607573e-09, + "logits/chosen": -3.093721866607666, + "logits/rejected": -3.074064254760742, + "logps/chosen": -54.40106964111328, + "logps/rejected": -53.95695877075195, + "loss": 0.6932, + "rewards/accuracies": 0.46875, + "rewards/chosen": -0.00021476794790942222, + "rewards/margins": -0.00016086632967926562, + "rewards/rejected": -5.390158548834734e-05, + "step": 60 + }, + { + "epoch": 0.024121295658166782, + "grad_norm": 1.7542425394058228, + "learning_rate": 6.024096385542168e-09, + "logits/chosen": -3.111924648284912, + "logits/rejected": -3.09572172164917, + "logps/chosen": -54.690452575683594, + "logps/rejected": -53.768272399902344, + "loss": 0.6931, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -6.438524906116072e-06, + "rewards/margins": 8.780825010035187e-05, + "rewards/rejected": -9.424677409697324e-05, + "step": 70 + }, + { + "epoch": 0.027567195037904894, + "grad_norm": 1.5708752870559692, + "learning_rate": 6.884681583476763e-09, + "logits/chosen": -3.0582454204559326, + "logits/rejected": -3.0389389991760254, + "logps/chosen": -53.05244064331055, + "logps/rejected": -53.5573844909668, + "loss": 0.6931, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 4.5851076720282435e-05, + "rewards/margins": 9.506057540420443e-05, + "rewards/rejected": -4.92095023219008e-05, + "step": 80 + }, + { + "epoch": 0.031013094417643005, + "grad_norm": 1.7699593305587769, + "learning_rate": 7.745266781411359e-09, + "logits/chosen": -3.0694618225097656, + "logits/rejected": -3.050314426422119, + "logps/chosen": -56.33974075317383, + "logps/rejected": -52.55852127075195, + "loss": 0.6931, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 5.394589970819652e-05, + "rewards/margins": 0.0001279563148273155, + "rewards/rejected": -7.40104223950766e-05, + "step": 90 + }, + { + "epoch": 0.03445899379738112, + "grad_norm": 1.757240891456604, + "learning_rate": 8.605851979345954e-09, + "logits/chosen": -3.0916152000427246, + "logits/rejected": -3.0621144771575928, + "logps/chosen": -56.1510124206543, + "logps/rejected": -53.62493896484375, + "loss": 0.6932, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": 2.1132344045327045e-05, + "rewards/margins": -2.9667915441677906e-05, + "rewards/rejected": 5.0800241297110915e-05, + "step": 100 + }, + { + "epoch": 0.03445899379738112, + "eval_logits/chosen": -3.1631598472595215, + "eval_logits/rejected": -3.1574864387512207, + "eval_logps/chosen": -58.709110260009766, + "eval_logps/rejected": -63.17157745361328, + "eval_loss": 0.6931766271591187, + "eval_rewards/accuracies": 0.4804832637310028, + "eval_rewards/chosen": 2.7851780032506213e-05, + "eval_rewards/margins": -5.7606255722930655e-05, + "eval_rewards/rejected": 8.545803575543687e-05, + "eval_runtime": 384.4303, + "eval_samples_per_second": 11.196, + "eval_steps_per_second": 1.399, + "step": 100 + }, + { + "epoch": 0.03790489317711923, + "grad_norm": 1.6156939268112183, + "learning_rate": 9.46643717728055e-09, + "logits/chosen": -3.032536745071411, + "logits/rejected": -3.0125365257263184, + "logps/chosen": -52.728843688964844, + "logps/rejected": -54.300689697265625, + "loss": 0.6931, + "rewards/accuracies": 0.4906249940395355, + "rewards/chosen": 2.9080998501740396e-05, + "rewards/margins": 2.5842193736025365e-06, + "rewards/rejected": 2.6496752980165184e-05, + "step": 110 + }, + { + "epoch": 0.04135079255685734, + "grad_norm": 1.5372369289398193, + "learning_rate": 1.0327022375215145e-08, + "logits/chosen": -3.0377848148345947, + "logits/rejected": -3.007380962371826, + "logps/chosen": -52.42706298828125, + "logps/rejected": -51.04817581176758, + "loss": 0.6931, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": 4.8243564378935844e-05, + "rewards/margins": 3.2927131542237476e-05, + "rewards/rejected": 1.5316421922761947e-05, + "step": 120 + }, + { + "epoch": 0.044796691936595454, + "grad_norm": 1.7136356830596924, + "learning_rate": 1.1187607573149742e-08, + "logits/chosen": -3.1061851978302, + "logits/rejected": -3.089826822280884, + "logps/chosen": -53.56927490234375, + "logps/rejected": -53.8600959777832, + "loss": 0.6931, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 5.350602441467345e-05, + "rewards/margins": 8.011364843696356e-05, + "rewards/rejected": -2.6607634936226532e-05, + "step": 130 + }, + { + "epoch": 0.048242591316333565, + "grad_norm": 1.8630205392837524, + "learning_rate": 1.2048192771084337e-08, + "logits/chosen": -3.0788228511810303, + "logits/rejected": -3.0526034832000732, + "logps/chosen": -55.3593864440918, + "logps/rejected": -53.862342834472656, + "loss": 0.6931, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": 6.969690730329603e-05, + "rewards/margins": 9.481948654865846e-05, + "rewards/rejected": -2.5122590159298852e-05, + "step": 140 + }, + { + "epoch": 0.051688490696071676, + "grad_norm": 1.626455545425415, + "learning_rate": 1.2908777969018933e-08, + "logits/chosen": -3.025050640106201, + "logits/rejected": -3.0135657787323, + "logps/chosen": -54.11144256591797, + "logps/rejected": -54.102447509765625, + "loss": 0.6932, + "rewards/accuracies": 0.46562498807907104, + "rewards/chosen": -2.0424617105163634e-05, + "rewards/margins": -0.0001254867820534855, + "rewards/rejected": 0.00010506215039640665, + "step": 150 + }, + { + "epoch": 0.05513439007580979, + "grad_norm": 1.6855826377868652, + "learning_rate": 1.3769363166953526e-08, + "logits/chosen": -3.044309616088867, + "logits/rejected": -3.02839732170105, + "logps/chosen": -54.01361083984375, + "logps/rejected": -51.27421188354492, + "loss": 0.6931, + "rewards/accuracies": 0.515625, + "rewards/chosen": 8.679249958731816e-07, + "rewards/margins": 6.541889160871506e-05, + "rewards/rejected": -6.455096445279196e-05, + "step": 160 + }, + { + "epoch": 0.0585802894555479, + "grad_norm": 1.666648507118225, + "learning_rate": 1.4629948364888123e-08, + "logits/chosen": -3.044278621673584, + "logits/rejected": -3.0216360092163086, + "logps/chosen": -53.77402877807617, + "logps/rejected": -52.072715759277344, + "loss": 0.6931, + "rewards/accuracies": 0.5093749761581421, + "rewards/chosen": 5.8081197494175285e-05, + "rewards/margins": 5.6167882576119155e-05, + "rewards/rejected": 1.913309006340569e-06, + "step": 170 + }, + { + "epoch": 0.06202618883528601, + "grad_norm": 1.6648340225219727, + "learning_rate": 1.5490533562822718e-08, + "logits/chosen": -3.053169012069702, + "logits/rejected": -3.0219600200653076, + "logps/chosen": -55.412353515625, + "logps/rejected": -52.04193115234375, + "loss": 0.6932, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.00013867543020751327, + "rewards/margins": -7.246668974403292e-05, + "rewards/rejected": -6.620875501539558e-05, + "step": 180 + }, + { + "epoch": 0.06547208821502412, + "grad_norm": 1.527038812637329, + "learning_rate": 1.6351118760757314e-08, + "logits/chosen": -3.153803586959839, + "logits/rejected": -3.126765489578247, + "logps/chosen": -52.947410583496094, + "logps/rejected": -51.77288055419922, + "loss": 0.6931, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -2.9722734325332567e-05, + "rewards/margins": 4.115318733965978e-05, + "rewards/rejected": -7.087593985488638e-05, + "step": 190 + }, + { + "epoch": 0.06891798759476224, + "grad_norm": 1.733069896697998, + "learning_rate": 1.7211703958691908e-08, + "logits/chosen": -3.0855889320373535, + "logits/rejected": -3.064948081970215, + "logps/chosen": -54.34423828125, + "logps/rejected": -53.98308181762695, + "loss": 0.6931, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 8.129735942929983e-05, + "rewards/margins": 3.768491296796128e-05, + "rewards/rejected": 4.361245737527497e-05, + "step": 200 + }, + { + "epoch": 0.06891798759476224, + "eval_logits/chosen": -3.163177728652954, + "eval_logits/rejected": -3.1575300693511963, + "eval_logps/chosen": -58.711936950683594, + "eval_logps/rejected": -63.17683410644531, + "eval_loss": 0.6931644678115845, + "eval_rewards/accuracies": 0.4862918257713318, + "eval_rewards/chosen": -4.3645013647619635e-07, + "eval_rewards/margins": -3.331492916913703e-05, + "eval_rewards/rejected": 3.287848085165024e-05, + "eval_runtime": 384.7152, + "eval_samples_per_second": 11.187, + "eval_steps_per_second": 1.398, + "step": 200 + }, + { + "epoch": 0.07236388697450034, + "grad_norm": 1.7262409925460815, + "learning_rate": 1.8072289156626504e-08, + "logits/chosen": -3.0673792362213135, + "logits/rejected": -3.0613677501678467, + "logps/chosen": -52.46643829345703, + "logps/rejected": -54.71671676635742, + "loss": 0.6932, + "rewards/accuracies": 0.47187501192092896, + "rewards/chosen": -0.00010696313984226435, + "rewards/margins": -0.00014883882249705493, + "rewards/rejected": 4.187567901681177e-05, + "step": 210 + }, + { + "epoch": 0.07580978635423846, + "grad_norm": 1.730053186416626, + "learning_rate": 1.89328743545611e-08, + "logits/chosen": -3.110975742340088, + "logits/rejected": -3.086411237716675, + "logps/chosen": -53.7027702331543, + "logps/rejected": -53.711639404296875, + "loss": 0.6931, + "rewards/accuracies": 0.503125011920929, + "rewards/chosen": -4.8385940317530185e-05, + "rewards/margins": 2.1820562324137427e-05, + "rewards/rejected": -7.020650082267821e-05, + "step": 220 + }, + { + "epoch": 0.07925568573397657, + "grad_norm": 1.6545181274414062, + "learning_rate": 1.9793459552495694e-08, + "logits/chosen": -3.0402209758758545, + "logits/rejected": -3.014207363128662, + "logps/chosen": -56.14927291870117, + "logps/rejected": -53.79438400268555, + "loss": 0.6932, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -1.179082664748421e-05, + "rewards/margins": -1.239746325154556e-05, + "rewards/rejected": 6.066344440114335e-07, + "step": 230 + }, + { + "epoch": 0.08270158511371468, + "grad_norm": 1.8088655471801758, + "learning_rate": 2.065404475043029e-08, + "logits/chosen": -3.0466361045837402, + "logits/rejected": -3.0282115936279297, + "logps/chosen": -53.76200485229492, + "logps/rejected": -55.18682861328125, + "loss": 0.6932, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -9.619267075322568e-05, + "rewards/margins": -2.319897794222925e-05, + "rewards/rejected": -7.2993672802113e-05, + "step": 240 + }, + { + "epoch": 0.08614748449345279, + "grad_norm": 1.5655380487442017, + "learning_rate": 2.1514629948364887e-08, + "logits/chosen": -2.986396551132202, + "logits/rejected": -2.9472384452819824, + "logps/chosen": -57.795745849609375, + "logps/rejected": -51.48908615112305, + "loss": 0.6931, + "rewards/accuracies": 0.528124988079071, + "rewards/chosen": -0.0001305347977904603, + "rewards/margins": 0.00012307525321375579, + "rewards/rejected": -0.0002536100219003856, + "step": 250 + }, + { + "epoch": 0.08959338387319091, + "grad_norm": 1.5828660726547241, + "learning_rate": 2.2375215146299484e-08, + "logits/chosen": -3.03403902053833, + "logits/rejected": -3.010531187057495, + "logps/chosen": -57.34452438354492, + "logps/rejected": -51.8224983215332, + "loss": 0.6931, + "rewards/accuracies": 0.5218750238418579, + "rewards/chosen": -3.587424725992605e-05, + "rewards/margins": 6.571992707904428e-05, + "rewards/rejected": -0.00010159417433897033, + "step": 260 + }, + { + "epoch": 0.09303928325292901, + "grad_norm": 1.5742228031158447, + "learning_rate": 2.3235800344234077e-08, + "logits/chosen": -3.047518491744995, + "logits/rejected": -3.018275499343872, + "logps/chosen": -54.280174255371094, + "logps/rejected": -52.072166442871094, + "loss": 0.6931, + "rewards/accuracies": 0.515625, + "rewards/chosen": -4.243188232067041e-05, + "rewards/margins": 0.00016101889195851982, + "rewards/rejected": -0.00020345079246908426, + "step": 270 + }, + { + "epoch": 0.09648518263266713, + "grad_norm": 1.7620676755905151, + "learning_rate": 2.4096385542168673e-08, + "logits/chosen": -3.0869364738464355, + "logits/rejected": -3.073918581008911, + "logps/chosen": -52.85234451293945, + "logps/rejected": -53.48882293701172, + "loss": 0.6931, + "rewards/accuracies": 0.5218750238418579, + "rewards/chosen": -0.00011127178004244342, + "rewards/margins": 0.00011203387839486822, + "rewards/rejected": -0.0002233056875411421, + "step": 280 + }, + { + "epoch": 0.09993108201240523, + "grad_norm": 1.469874620437622, + "learning_rate": 2.495697074010327e-08, + "logits/chosen": -3.0485854148864746, + "logits/rejected": -3.041419267654419, + "logps/chosen": -51.24589920043945, + "logps/rejected": -53.544586181640625, + "loss": 0.6931, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": -0.00010780607408378273, + "rewards/margins": 1.8475009710527956e-05, + "rewards/rejected": -0.00012628106924239546, + "step": 290 + }, + { + "epoch": 0.10337698139214335, + "grad_norm": 1.7838884592056274, + "learning_rate": 2.5817555938037866e-08, + "logits/chosen": -3.0377209186553955, + "logits/rejected": -3.0146260261535645, + "logps/chosen": -54.3782844543457, + "logps/rejected": -55.74077224731445, + "loss": 0.6931, + "rewards/accuracies": 0.5218750238418579, + "rewards/chosen": 3.057599315070547e-05, + "rewards/margins": 0.00016555582988075912, + "rewards/rejected": -0.00013497984036803246, + "step": 300 + }, + { + "epoch": 0.10337698139214335, + "eval_logits/chosen": -3.1631579399108887, + "eval_logits/rejected": -3.157500743865967, + "eval_logps/chosen": -58.70083236694336, + "eval_logps/rejected": -63.1627197265625, + "eval_loss": 0.6931794285774231, + "eval_rewards/accuracies": 0.4756040871143341, + "eval_rewards/chosen": 0.00011061962868552655, + "eval_rewards/margins": -6.344748544506729e-05, + "eval_rewards/rejected": 0.00017406711413059384, + "eval_runtime": 384.668, + "eval_samples_per_second": 11.189, + "eval_steps_per_second": 1.399, + "step": 300 + }, + { + "epoch": 0.10682288077188146, + "grad_norm": 1.6819980144500732, + "learning_rate": 2.667814113597246e-08, + "logits/chosen": -3.069870710372925, + "logits/rejected": -3.0552239418029785, + "logps/chosen": -53.801971435546875, + "logps/rejected": -53.27043914794922, + "loss": 0.6932, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.00010162549733649939, + "rewards/margins": -3.66856183973141e-05, + "rewards/rejected": -6.49398862151429e-05, + "step": 310 + }, + { + "epoch": 0.11026878015161957, + "grad_norm": 1.8077443838119507, + "learning_rate": 2.7538726333907053e-08, + "logits/chosen": -3.116912841796875, + "logits/rejected": -3.097919225692749, + "logps/chosen": -53.39704513549805, + "logps/rejected": -52.60417556762695, + "loss": 0.6931, + "rewards/accuracies": 0.4906249940395355, + "rewards/chosen": -8.59591382322833e-05, + "rewards/margins": 5.404234252637252e-05, + "rewards/rejected": -0.00014000148803461343, + "step": 320 + }, + { + "epoch": 0.11371467953135768, + "grad_norm": 1.6062852144241333, + "learning_rate": 2.8399311531841653e-08, + "logits/chosen": -3.0585761070251465, + "logits/rejected": -3.056246042251587, + "logps/chosen": -53.057029724121094, + "logps/rejected": -53.562095642089844, + "loss": 0.6931, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -5.562259320868179e-05, + "rewards/margins": 0.00014037203800398856, + "rewards/rejected": -0.0001959946530405432, + "step": 330 + }, + { + "epoch": 0.1171605789110958, + "grad_norm": 1.6745755672454834, + "learning_rate": 2.9259896729776246e-08, + "logits/chosen": -3.0025081634521484, + "logits/rejected": -2.9875540733337402, + "logps/chosen": -53.48546600341797, + "logps/rejected": -54.20134735107422, + "loss": 0.6932, + "rewards/accuracies": 0.46875, + "rewards/chosen": -0.00016055100422818214, + "rewards/margins": -7.695326348766685e-05, + "rewards/rejected": -8.359774074051529e-05, + "step": 340 + }, + { + "epoch": 0.1206064782908339, + "grad_norm": 1.6029772758483887, + "learning_rate": 3.012048192771084e-08, + "logits/chosen": -3.1061463356018066, + "logits/rejected": -3.0769925117492676, + "logps/chosen": -57.422828674316406, + "logps/rejected": -51.77228927612305, + "loss": 0.6932, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.00017459427181165665, + "rewards/margins": -6.433665112126619e-05, + "rewards/rejected": -0.00011025762796634808, + "step": 350 + }, + { + "epoch": 0.12405237767057202, + "grad_norm": 1.6956379413604736, + "learning_rate": 3.0981067125645436e-08, + "logits/chosen": -3.044156551361084, + "logits/rejected": -3.027600049972534, + "logps/chosen": -53.99070358276367, + "logps/rejected": -54.564674377441406, + "loss": 0.6931, + "rewards/accuracies": 0.546875, + "rewards/chosen": -7.801742322044447e-05, + "rewards/margins": 9.001044963952154e-05, + "rewards/rejected": -0.00016802789468783885, + "step": 360 + }, + { + "epoch": 0.12749827705031014, + "grad_norm": 1.6766176223754883, + "learning_rate": 3.184165232358003e-08, + "logits/chosen": -3.0830063819885254, + "logits/rejected": -3.0575692653656006, + "logps/chosen": -55.62574005126953, + "logps/rejected": -53.142921447753906, + "loss": 0.693, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.1775503480748739e-05, + "rewards/margins": 0.00031634545302949846, + "rewards/rejected": -0.0003281210083514452, + "step": 370 + }, + { + "epoch": 0.13094417643004824, + "grad_norm": 1.7956558465957642, + "learning_rate": 3.270223752151463e-08, + "logits/chosen": -3.122870922088623, + "logits/rejected": -3.089454412460327, + "logps/chosen": -55.22282791137695, + "logps/rejected": -51.79097366333008, + "loss": 0.693, + "rewards/accuracies": 0.559374988079071, + "rewards/chosen": 1.655972118896898e-05, + "rewards/margins": 0.0003105142677668482, + "rewards/rejected": -0.00029395456658676267, + "step": 380 + }, + { + "epoch": 0.13439007580978635, + "grad_norm": 1.6882342100143433, + "learning_rate": 3.356282271944922e-08, + "logits/chosen": -3.0961623191833496, + "logits/rejected": -3.068988084793091, + "logps/chosen": -53.05096435546875, + "logps/rejected": -51.719261169433594, + "loss": 0.6931, + "rewards/accuracies": 0.5093749761581421, + "rewards/chosen": -0.00021331440075300634, + "rewards/margins": 8.027394505916163e-05, + "rewards/rejected": -0.0002935883530881256, + "step": 390 + }, + { + "epoch": 0.13783597518952448, + "grad_norm": 1.7367823123931885, + "learning_rate": 3.4423407917383815e-08, + "logits/chosen": -3.04445743560791, + "logits/rejected": -3.0147831439971924, + "logps/chosen": -54.4653205871582, + "logps/rejected": -54.02545166015625, + "loss": 0.693, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.00013222855341155082, + "rewards/margins": 0.00030294861062429845, + "rewards/rejected": -0.00043517714948393404, + "step": 400 + }, + { + "epoch": 0.13783597518952448, + "eval_logits/chosen": -3.1628847122192383, + "eval_logits/rejected": -3.1572272777557373, + "eval_logps/chosen": -58.69401550292969, + "eval_logps/rejected": -63.163734436035156, + "eval_loss": 0.6931403875350952, + "eval_rewards/accuracies": 0.5006970167160034, + "eval_rewards/chosen": 0.00017883002874441445, + "eval_rewards/margins": 1.4981026652094442e-05, + "eval_rewards/rejected": 0.00016384897753596306, + "eval_runtime": 384.3938, + "eval_samples_per_second": 11.197, + "eval_steps_per_second": 1.4, + "step": 400 + }, + { + "epoch": 0.14128187456926258, + "grad_norm": 1.5201988220214844, + "learning_rate": 3.5283993115318415e-08, + "logits/chosen": -3.0868654251098633, + "logits/rejected": -3.059985637664795, + "logps/chosen": -54.123985290527344, + "logps/rejected": -53.197288513183594, + "loss": 0.693, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.00031447256333194673, + "rewards/margins": 0.00020825877436436713, + "rewards/rejected": -0.000522731221280992, + "step": 410 + }, + { + "epoch": 0.1447277739490007, + "grad_norm": 1.7084550857543945, + "learning_rate": 3.614457831325301e-08, + "logits/chosen": -3.091403007507324, + "logits/rejected": -3.070308208465576, + "logps/chosen": -54.363189697265625, + "logps/rejected": -51.84135055541992, + "loss": 0.6929, + "rewards/accuracies": 0.5718749761581421, + "rewards/chosen": -0.0002584571484476328, + "rewards/margins": 0.0005626108613796532, + "rewards/rejected": -0.0008210679516196251, + "step": 420 + }, + { + "epoch": 0.1481736733287388, + "grad_norm": 1.7150694131851196, + "learning_rate": 3.70051635111876e-08, + "logits/chosen": -3.031503200531006, + "logits/rejected": -3.0161654949188232, + "logps/chosen": -51.86822509765625, + "logps/rejected": -53.66876220703125, + "loss": 0.693, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.0002805610129144043, + "rewards/margins": 0.00019659681129269302, + "rewards/rejected": -0.00047715791151858866, + "step": 430 + }, + { + "epoch": 0.15161957270847692, + "grad_norm": 1.5232292413711548, + "learning_rate": 3.78657487091222e-08, + "logits/chosen": -3.075045108795166, + "logits/rejected": -3.061749219894409, + "logps/chosen": -51.54164505004883, + "logps/rejected": -52.585777282714844, + "loss": 0.6929, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.00025903843925334513, + "rewards/margins": 0.00041876602335833013, + "rewards/rejected": -0.0006778044044040143, + "step": 440 + }, + { + "epoch": 0.15506547208821503, + "grad_norm": 1.5611236095428467, + "learning_rate": 3.8726333907056795e-08, + "logits/chosen": -3.070613145828247, + "logits/rejected": -3.0467374324798584, + "logps/chosen": -56.33530807495117, + "logps/rejected": -53.32719802856445, + "loss": 0.693, + "rewards/accuracies": 0.565625011920929, + "rewards/chosen": -0.0002340213832212612, + "rewards/margins": 0.0003091735125053674, + "rewards/rejected": -0.0005431949393823743, + "step": 450 + }, + { + "epoch": 0.15851137146795313, + "grad_norm": 1.6240078210830688, + "learning_rate": 3.958691910499139e-08, + "logits/chosen": -3.0725510120391846, + "logits/rejected": -3.0467612743377686, + "logps/chosen": -52.70389938354492, + "logps/rejected": -50.810211181640625, + "loss": 0.693, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0002929639595095068, + "rewards/margins": 0.00025620352244004607, + "rewards/rejected": -0.0005491675110533834, + "step": 460 + }, + { + "epoch": 0.16195727084769124, + "grad_norm": 1.5414916276931763, + "learning_rate": 4.044750430292599e-08, + "logits/chosen": -3.1445844173431396, + "logits/rejected": -3.1178226470947266, + "logps/chosen": -56.37430953979492, + "logps/rejected": -54.76704788208008, + "loss": 0.6929, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.00029713130788877606, + "rewards/margins": 0.0004932652227580547, + "rewards/rejected": -0.0007903965306468308, + "step": 470 + }, + { + "epoch": 0.16540317022742937, + "grad_norm": 1.7192236185073853, + "learning_rate": 4.130808950086058e-08, + "logits/chosen": -2.9115607738494873, + "logits/rejected": -2.9044690132141113, + "logps/chosen": -53.07261276245117, + "logps/rejected": -55.70893478393555, + "loss": 0.693, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.0006627115653827786, + "rewards/margins": 0.00027101318119093776, + "rewards/rejected": -0.0009337246301583946, + "step": 480 + }, + { + "epoch": 0.16884906960716747, + "grad_norm": 1.8793227672576904, + "learning_rate": 4.216867469879518e-08, + "logits/chosen": -3.1179113388061523, + "logits/rejected": -3.0901989936828613, + "logps/chosen": -58.39832305908203, + "logps/rejected": -53.71763229370117, + "loss": 0.693, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.0004218885151203722, + "rewards/margins": 0.000330808776197955, + "rewards/rejected": -0.000752697407733649, + "step": 490 + }, + { + "epoch": 0.17229496898690558, + "grad_norm": 1.5864213705062866, + "learning_rate": 4.3029259896729774e-08, + "logits/chosen": -3.0117759704589844, + "logits/rejected": -2.9860451221466064, + "logps/chosen": -55.77704620361328, + "logps/rejected": -52.258628845214844, + "loss": 0.6931, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.0005830780719406903, + "rewards/margins": 0.00019635938224382699, + "rewards/rejected": -0.0007794374832883477, + "step": 500 + }, + { + "epoch": 0.17229496898690558, + "eval_logits/chosen": -3.16249942779541, + "eval_logits/rejected": -3.1568548679351807, + "eval_logps/chosen": -58.682533264160156, + "eval_logps/rejected": -63.158958435058594, + "eval_loss": 0.6931070685386658, + "eval_rewards/accuracies": 0.494191437959671, + "eval_rewards/chosen": 0.0002936015371233225, + "eval_rewards/margins": 8.19716660771519e-05, + "eval_rewards/rejected": 0.0002116298710461706, + "eval_runtime": 384.608, + "eval_samples_per_second": 11.191, + "eval_steps_per_second": 1.399, + "step": 500 + }, + { + "epoch": 0.17574086836664368, + "grad_norm": 1.7202733755111694, + "learning_rate": 4.388984509466437e-08, + "logits/chosen": -3.015113353729248, + "logits/rejected": -3.002572536468506, + "logps/chosen": -55.490325927734375, + "logps/rejected": -56.18473434448242, + "loss": 0.6928, + "rewards/accuracies": 0.5718749761581421, + "rewards/chosen": -0.0006010312354192138, + "rewards/margins": 0.0006601332570426166, + "rewards/rejected": -0.0012611645506694913, + "step": 510 + }, + { + "epoch": 0.17918676774638181, + "grad_norm": 1.5302149057388306, + "learning_rate": 4.475043029259897e-08, + "logits/chosen": -3.113670825958252, + "logits/rejected": -3.0949208736419678, + "logps/chosen": -53.60992431640625, + "logps/rejected": -53.83058547973633, + "loss": 0.6929, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.0005134848761372268, + "rewards/margins": 0.0005421391688287258, + "rewards/rejected": -0.0010556241031736135, + "step": 520 + }, + { + "epoch": 0.18263266712611992, + "grad_norm": 1.6602168083190918, + "learning_rate": 4.561101549053356e-08, + "logits/chosen": -2.997028112411499, + "logits/rejected": -2.9696555137634277, + "logps/chosen": -56.8779296875, + "logps/rejected": -52.88819122314453, + "loss": 0.6929, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.0005577536066994071, + "rewards/margins": 0.0005765163223259151, + "rewards/rejected": -0.001134270103648305, + "step": 530 + }, + { + "epoch": 0.18607856650585802, + "grad_norm": 1.5818272829055786, + "learning_rate": 4.6471600688468154e-08, + "logits/chosen": -3.134260892868042, + "logits/rejected": -3.105567455291748, + "logps/chosen": -55.92725372314453, + "logps/rejected": -52.12944412231445, + "loss": 0.6928, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.0006873260135762393, + "rewards/margins": 0.0007128252182155848, + "rewards/rejected": -0.0014001511735841632, + "step": 540 + }, + { + "epoch": 0.18952446588559613, + "grad_norm": 1.5802710056304932, + "learning_rate": 4.7332185886402753e-08, + "logits/chosen": -3.032979965209961, + "logits/rejected": -3.027398109436035, + "logps/chosen": -51.641571044921875, + "logps/rejected": -53.729759216308594, + "loss": 0.693, + "rewards/accuracies": 0.5218750238418579, + "rewards/chosen": -0.0007201815606094897, + "rewards/margins": 0.0003699318622238934, + "rewards/rejected": -0.001090113422833383, + "step": 550 + }, + { + "epoch": 0.19297036526533426, + "grad_norm": 1.6604018211364746, + "learning_rate": 4.8192771084337347e-08, + "logits/chosen": -3.0785210132598877, + "logits/rejected": -3.0737929344177246, + "logps/chosen": -54.71602249145508, + "logps/rejected": -55.244483947753906, + "loss": 0.6928, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.0008259072201326489, + "rewards/margins": 0.0006385392043739557, + "rewards/rejected": -0.0014644463080912828, + "step": 560 + }, + { + "epoch": 0.19641626464507236, + "grad_norm": 1.7432787418365479, + "learning_rate": 4.905335628227194e-08, + "logits/chosen": -3.1137382984161377, + "logits/rejected": -3.084575891494751, + "logps/chosen": -54.5408821105957, + "logps/rejected": -53.760047912597656, + "loss": 0.6927, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.0008401373634114861, + "rewards/margins": 0.0008745190571062267, + "rewards/rejected": -0.0017146564787253737, + "step": 570 + }, + { + "epoch": 0.19986216402481047, + "grad_norm": 1.535184383392334, + "learning_rate": 4.991394148020654e-08, + "logits/chosen": -3.0551018714904785, + "logits/rejected": -3.041159152984619, + "logps/chosen": -53.745262145996094, + "logps/rejected": -54.66472625732422, + "loss": 0.6927, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0005425583804026246, + "rewards/margins": 0.0009367944439873099, + "rewards/rejected": -0.0014793528243899345, + "step": 580 + }, + { + "epoch": 0.2033080634045486, + "grad_norm": 1.7447534799575806, + "learning_rate": 4.9999633685875244e-08, + "logits/chosen": -2.9614148139953613, + "logits/rejected": -2.9393579959869385, + "logps/chosen": -52.52238845825195, + "logps/rejected": -53.02446365356445, + "loss": 0.6926, + "rewards/accuracies": 0.6031249761581421, + "rewards/chosen": -0.0008184127509593964, + "rewards/margins": 0.0010490169515833259, + "rewards/rejected": -0.0018674297025427222, + "step": 590 + }, + { + "epoch": 0.2067539627842867, + "grad_norm": 1.4818027019500732, + "learning_rate": 4.9998367428608654e-08, + "logits/chosen": -3.0609617233276367, + "logits/rejected": -3.0357613563537598, + "logps/chosen": -56.0200309753418, + "logps/rejected": -50.999908447265625, + "loss": 0.6928, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.0008805854013189673, + "rewards/margins": 0.0007603298290632665, + "rewards/rejected": -0.0016409152885898948, + "step": 600 + }, + { + "epoch": 0.2067539627842867, + "eval_logits/chosen": -3.1612586975097656, + "eval_logits/rejected": -3.155609369277954, + "eval_logps/chosen": -58.64760208129883, + "eval_logps/rejected": -63.13199234008789, + "eval_loss": 0.693067729473114, + "eval_rewards/accuracies": 0.5023234486579895, + "eval_rewards/chosen": 0.0006429245695471764, + "eval_rewards/margins": 0.00016164187400136143, + "eval_rewards/rejected": 0.0004812826809938997, + "eval_runtime": 384.5126, + "eval_samples_per_second": 11.193, + "eval_steps_per_second": 1.399, + "step": 600 + }, + { + "epoch": 0.2101998621640248, + "grad_norm": 1.5632553100585938, + "learning_rate": 4.999619675160485e-08, + "logits/chosen": -3.081904649734497, + "logits/rejected": -3.0513620376586914, + "logps/chosen": -53.667579650878906, + "logps/rejected": -52.85166549682617, + "loss": 0.6926, + "rewards/accuracies": 0.5718749761581421, + "rewards/chosen": -0.000827993091661483, + "rewards/margins": 0.00119282235391438, + "rewards/rejected": -0.002020815387368202, + "step": 610 + }, + { + "epoch": 0.2136457615437629, + "grad_norm": 1.629331111907959, + "learning_rate": 4.999312173339707e-08, + "logits/chosen": -3.0883796215057373, + "logits/rejected": -3.0587565898895264, + "logps/chosen": -54.1837272644043, + "logps/rejected": -52.726890563964844, + "loss": 0.6926, + "rewards/accuracies": 0.621874988079071, + "rewards/chosen": -0.0008975326200015843, + "rewards/margins": 0.0011468585580587387, + "rewards/rejected": -0.0020443913526833057, + "step": 620 + }, + { + "epoch": 0.21709166092350105, + "grad_norm": 1.5568369626998901, + "learning_rate": 4.998914248523688e-08, + "logits/chosen": -3.063368558883667, + "logits/rejected": -3.0294599533081055, + "logps/chosen": -53.4654655456543, + "logps/rejected": -50.96184539794922, + "loss": 0.6926, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.0012578107416629791, + "rewards/margins": 0.0010019788751378655, + "rewards/rejected": -0.002259789500385523, + "step": 630 + }, + { + "epoch": 0.22053756030323915, + "grad_norm": 1.6668413877487183, + "learning_rate": 4.998425915109009e-08, + "logits/chosen": -3.084184169769287, + "logits/rejected": -3.084564685821533, + "logps/chosen": -51.63975143432617, + "logps/rejected": -57.51726531982422, + "loss": 0.6927, + "rewards/accuracies": 0.596875011920929, + "rewards/chosen": -0.0011606714688241482, + "rewards/margins": 0.0009209074196405709, + "rewards/rejected": -0.002081578830257058, + "step": 640 + }, + { + "epoch": 0.22398345968297725, + "grad_norm": 1.5113067626953125, + "learning_rate": 4.9978471907631604e-08, + "logits/chosen": -3.0601143836975098, + "logits/rejected": -3.0378782749176025, + "logps/chosen": -52.5632438659668, + "logps/rejected": -52.316322326660156, + "loss": 0.6927, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.0015532078687101603, + "rewards/margins": 0.0008993824012577534, + "rewards/rejected": -0.00245259003713727, + "step": 650 + }, + { + "epoch": 0.22742935906271536, + "grad_norm": 1.772717833518982, + "learning_rate": 4.9971780964238976e-08, + "logits/chosen": -3.086937427520752, + "logits/rejected": -3.0564024448394775, + "logps/chosen": -54.497314453125, + "logps/rejected": -50.39165496826172, + "loss": 0.6922, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.0011715441942214966, + "rewards/margins": 0.00196392135694623, + "rewards/rejected": -0.0031354655511677265, + "step": 660 + }, + { + "epoch": 0.2308752584424535, + "grad_norm": 1.6012787818908691, + "learning_rate": 4.996418656298486e-08, + "logits/chosen": -3.0736892223358154, + "logits/rejected": -3.0475335121154785, + "logps/chosen": -55.251243591308594, + "logps/rejected": -51.75853729248047, + "loss": 0.6922, + "rewards/accuracies": 0.621874988079071, + "rewards/chosen": -0.0007274464005604386, + "rewards/margins": 0.0018245524261146784, + "rewards/rejected": -0.002551998710259795, + "step": 670 + }, + { + "epoch": 0.2343211578221916, + "grad_norm": 1.6301146745681763, + "learning_rate": 4.995568897862825e-08, + "logits/chosen": -3.0388023853302, + "logits/rejected": -3.020338773727417, + "logps/chosen": -54.80192184448242, + "logps/rejected": -54.8926887512207, + "loss": 0.6926, + "rewards/accuracies": 0.5531250238418579, + "rewards/chosen": -0.001500314916484058, + "rewards/margins": 0.0011322436621412635, + "rewards/rejected": -0.002632558811455965, + "step": 680 + }, + { + "epoch": 0.2377670572019297, + "grad_norm": 1.5935677289962769, + "learning_rate": 4.994628851860456e-08, + "logits/chosen": -3.0772476196289062, + "logits/rejected": -3.058328866958618, + "logps/chosen": -53.629234313964844, + "logps/rejected": -52.86656951904297, + "loss": 0.6922, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.001254276023246348, + "rewards/margins": 0.0018880158895626664, + "rewards/rejected": -0.003142292145639658, + "step": 690 + }, + { + "epoch": 0.2412129565816678, + "grad_norm": 1.5870788097381592, + "learning_rate": 4.993598552301446e-08, + "logits/chosen": -3.0830237865448, + "logits/rejected": -3.056472063064575, + "logps/chosen": -56.46472930908203, + "logps/rejected": -53.47216033935547, + "loss": 0.692, + "rewards/accuracies": 0.621874988079071, + "rewards/chosen": -0.0011341057252138853, + "rewards/margins": 0.0022793817333877087, + "rewards/rejected": -0.0034134879242628813, + "step": 700 + }, + { + "epoch": 0.2412129565816678, + "eval_logits/chosen": -3.1599061489105225, + "eval_logits/rejected": -3.1542632579803467, + "eval_logps/chosen": -58.6091423034668, + "eval_logps/rejected": -63.11530685424805, + "eval_loss": 0.6929602026939392, + "eval_rewards/accuracies": 0.5413568615913391, + "eval_rewards/chosen": 0.0010274943197146058, + "eval_rewards/margins": 0.00037930175312794745, + "eval_rewards/rejected": 0.0006481926538981497, + "eval_runtime": 384.5168, + "eval_samples_per_second": 11.193, + "eval_steps_per_second": 1.399, + "step": 700 + }, + { + "epoch": 0.24465885596140594, + "grad_norm": 1.4943526983261108, + "learning_rate": 4.9924780364611574e-08, + "logits/chosen": -3.0165517330169678, + "logits/rejected": -3.013462781906128, + "logps/chosen": -52.749977111816406, + "logps/rejected": -54.24238204956055, + "loss": 0.6926, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.0018606961239129305, + "rewards/margins": 0.0011449294397607446, + "rewards/rejected": -0.0030056254472583532, + "step": 710 + }, + { + "epoch": 0.24810475534114404, + "grad_norm": 1.5908006429672241, + "learning_rate": 4.9912673448789055e-08, + "logits/chosen": -3.0585460662841797, + "logits/rejected": -3.0370562076568604, + "logps/chosen": -52.21095657348633, + "logps/rejected": -52.79679489135742, + "loss": 0.6922, + "rewards/accuracies": 0.596875011920929, + "rewards/chosen": -0.0018738650251179934, + "rewards/margins": 0.001822577090933919, + "rewards/rejected": -0.003696442348882556, + "step": 720 + }, + { + "epoch": 0.25155065472088217, + "grad_norm": 1.6703251600265503, + "learning_rate": 4.989966521356484e-08, + "logits/chosen": -3.0375118255615234, + "logits/rejected": -3.00249981880188, + "logps/chosen": -53.77020263671875, + "logps/rejected": -51.606651306152344, + "loss": 0.692, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.0018283795798197389, + "rewards/margins": 0.0023119805846363306, + "rewards/rejected": -0.004140359815210104, + "step": 730 + }, + { + "epoch": 0.2549965541006203, + "grad_norm": 1.5016419887542725, + "learning_rate": 4.9885756129565855e-08, + "logits/chosen": -3.187788963317871, + "logits/rejected": -3.1516430377960205, + "logps/chosen": -54.3302116394043, + "logps/rejected": -53.6005859375, + "loss": 0.692, + "rewards/accuracies": 0.6468750238418579, + "rewards/chosen": -0.0016302086878567934, + "rewards/margins": 0.0022917932365089655, + "rewards/rejected": -0.003922001924365759, + "step": 740 + }, + { + "epoch": 0.2584424534803584, + "grad_norm": 1.652618169784546, + "learning_rate": 4.9870946700010963e-08, + "logits/chosen": -3.0573348999023438, + "logits/rejected": -3.043586254119873, + "logps/chosen": -53.8933219909668, + "logps/rejected": -53.5998649597168, + "loss": 0.692, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.002071264898404479, + "rewards/margins": 0.002245786366984248, + "rewards/rejected": -0.0043170517310500145, + "step": 750 + }, + { + "epoch": 0.2618883528600965, + "grad_norm": 1.5802987813949585, + "learning_rate": 4.985523746069277e-08, + "logits/chosen": -3.0178141593933105, + "logits/rejected": -2.99599289894104, + "logps/chosen": -55.6571159362793, + "logps/rejected": -52.568031311035156, + "loss": 0.6923, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.002288275398313999, + "rewards/margins": 0.001769710099324584, + "rewards/rejected": -0.004057985730469227, + "step": 760 + }, + { + "epoch": 0.2653342522398346, + "grad_norm": 1.6327557563781738, + "learning_rate": 4.9838628979958226e-08, + "logits/chosen": -3.025636911392212, + "logits/rejected": -3.001251697540283, + "logps/chosen": -54.04003143310547, + "logps/rejected": -51.364707946777344, + "loss": 0.6919, + "rewards/accuracies": 0.621874988079071, + "rewards/chosen": -0.002429876709356904, + "rewards/margins": 0.0024292119778692722, + "rewards/rejected": -0.00485908892005682, + "step": 770 + }, + { + "epoch": 0.2687801516195727, + "grad_norm": 1.6433818340301514, + "learning_rate": 4.982112185868809e-08, + "logits/chosen": -3.016303300857544, + "logits/rejected": -2.9975686073303223, + "logps/chosen": -52.39423751831055, + "logps/rejected": -51.11736297607422, + "loss": 0.692, + "rewards/accuracies": 0.6156250238418579, + "rewards/chosen": -0.002856833627447486, + "rewards/margins": 0.002312553348019719, + "rewards/rejected": -0.005169386975467205, + "step": 780 + }, + { + "epoch": 0.2722260509993108, + "grad_norm": 1.7514872550964355, + "learning_rate": 4.980271673027517e-08, + "logits/chosen": -3.042994737625122, + "logits/rejected": -3.038914442062378, + "logps/chosen": -52.71642303466797, + "logps/rejected": -55.53397750854492, + "loss": 0.6923, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.0030075139366090298, + "rewards/margins": 0.0017282769549638033, + "rewards/rejected": -0.004735790658742189, + "step": 790 + }, + { + "epoch": 0.27567195037904896, + "grad_norm": 1.5455923080444336, + "learning_rate": 4.9783414260601395e-08, + "logits/chosen": -3.052412509918213, + "logits/rejected": -3.0258965492248535, + "logps/chosen": -53.40227127075195, + "logps/rejected": -53.219749450683594, + "loss": 0.6923, + "rewards/accuracies": 0.628125011920929, + "rewards/chosen": -0.0024742651730775833, + "rewards/margins": 0.00176246277987957, + "rewards/rejected": -0.004236727952957153, + "step": 800 + }, + { + "epoch": 0.27567195037904896, + "eval_logits/chosen": -3.158536672592163, + "eval_logits/rejected": -3.1529228687286377, + "eval_logps/chosen": -58.58611297607422, + "eval_logps/rejected": -63.12191390991211, + "eval_loss": 0.6928143501281738, + "eval_rewards/accuracies": 0.5587825179100037, + "eval_rewards/chosen": 0.0012578194728121161, + "eval_rewards/margins": 0.000675736868288368, + "eval_rewards/rejected": 0.0005820823716931045, + "eval_runtime": 384.4564, + "eval_samples_per_second": 11.195, + "eval_steps_per_second": 1.399, + "step": 800 + }, + { + "epoch": 0.27911784975878706, + "grad_norm": 1.6539554595947266, + "learning_rate": 4.976321514801376e-08, + "logits/chosen": -3.0484116077423096, + "logits/rejected": -3.0248422622680664, + "logps/chosen": -53.9824333190918, + "logps/rejected": -56.396575927734375, + "loss": 0.6922, + "rewards/accuracies": 0.6031249761581421, + "rewards/chosen": -0.0031535557936877012, + "rewards/margins": 0.001993196550756693, + "rewards/rejected": -0.0051467521116137505, + "step": 810 + }, + { + "epoch": 0.28256374913852517, + "grad_norm": 1.732709288597107, + "learning_rate": 4.974212012329902e-08, + "logits/chosen": -3.1013197898864746, + "logits/rejected": -3.0694527626037598, + "logps/chosen": -56.43525314331055, + "logps/rejected": -52.12022018432617, + "loss": 0.6913, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.0020319526083767414, + "rewards/margins": 0.003823560895398259, + "rewards/rejected": -0.005855513270944357, + "step": 820 + }, + { + "epoch": 0.28600964851826327, + "grad_norm": 1.6947790384292603, + "learning_rate": 4.97201299496573e-08, + "logits/chosen": -3.1101901531219482, + "logits/rejected": -3.0727791786193848, + "logps/chosen": -56.2694206237793, + "logps/rejected": -52.88818359375, + "loss": 0.6913, + "rewards/accuracies": 0.671875, + "rewards/chosen": -0.0016542660305276513, + "rewards/margins": 0.0037212413735687733, + "rewards/rejected": -0.005375507287681103, + "step": 830 + }, + { + "epoch": 0.2894555478980014, + "grad_norm": 1.650404453277588, + "learning_rate": 4.969724542267442e-08, + "logits/chosen": -3.0947813987731934, + "logits/rejected": -3.0706396102905273, + "logps/chosen": -55.293067932128906, + "logps/rejected": -55.382408142089844, + "loss": 0.6914, + "rewards/accuracies": 0.6468750238418579, + "rewards/chosen": -0.002315860940143466, + "rewards/margins": 0.003439708147197962, + "rewards/rejected": -0.005755568854510784, + "step": 840 + }, + { + "epoch": 0.2929014472777395, + "grad_norm": 1.7584972381591797, + "learning_rate": 4.967346737029316e-08, + "logits/chosen": -3.0069823265075684, + "logits/rejected": -3.001641273498535, + "logps/chosen": -52.9250602722168, + "logps/rejected": -54.31984329223633, + "loss": 0.6924, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.003708853619173169, + "rewards/margins": 0.001558844349347055, + "rewards/rejected": -0.005267697852104902, + "step": 850 + }, + { + "epoch": 0.2963473466574776, + "grad_norm": 1.7942557334899902, + "learning_rate": 4.964879665278331e-08, + "logits/chosen": -3.0983834266662598, + "logits/rejected": -3.0633091926574707, + "logps/chosen": -57.68050003051758, + "logps/rejected": -53.3126106262207, + "loss": 0.6918, + "rewards/accuracies": 0.5718749761581421, + "rewards/chosen": -0.00262661837041378, + "rewards/margins": 0.0028027433436363935, + "rewards/rejected": -0.005429361946880817, + "step": 860 + }, + { + "epoch": 0.2997932460372157, + "grad_norm": 1.6487774848937988, + "learning_rate": 4.9623234162710505e-08, + "logits/chosen": -3.072366952896118, + "logits/rejected": -3.0587127208709717, + "logps/chosen": -53.41727828979492, + "logps/rejected": -53.83111572265625, + "loss": 0.6924, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.003994829952716827, + "rewards/margins": 0.001481076586060226, + "rewards/rejected": -0.005475906189531088, + "step": 870 + }, + { + "epoch": 0.30323914541695385, + "grad_norm": 1.6196177005767822, + "learning_rate": 4.959678082490396e-08, + "logits/chosen": -3.0850799083709717, + "logits/rejected": -3.0610485076904297, + "logps/chosen": -55.864410400390625, + "logps/rejected": -55.08472442626953, + "loss": 0.6915, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.0025982023216784, + "rewards/margins": 0.0033730953000485897, + "rewards/rejected": -0.00597129762172699, + "step": 880 + }, + { + "epoch": 0.30668504479669195, + "grad_norm": 1.7501091957092285, + "learning_rate": 4.9569437596423006e-08, + "logits/chosen": -3.0634050369262695, + "logits/rejected": -3.0458738803863525, + "logps/chosen": -55.1644401550293, + "logps/rejected": -54.405372619628906, + "loss": 0.6917, + "rewards/accuracies": 0.590624988079071, + "rewards/chosen": -0.004143073223531246, + "rewards/margins": 0.002887632232159376, + "rewards/rejected": -0.007030704524368048, + "step": 890 + }, + { + "epoch": 0.31013094417643006, + "grad_norm": 1.6706045866012573, + "learning_rate": 4.954120546652246e-08, + "logits/chosen": -3.1494853496551514, + "logits/rejected": -3.1262238025665283, + "logps/chosen": -52.21024703979492, + "logps/rejected": -52.95808792114258, + "loss": 0.6912, + "rewards/accuracies": 0.6156250238418579, + "rewards/chosen": -0.0031147089321166277, + "rewards/margins": 0.0038592598866671324, + "rewards/rejected": -0.006973968353122473, + "step": 900 + }, + { + "epoch": 0.31013094417643006, + "eval_logits/chosen": -3.1557774543762207, + "eval_logits/rejected": -3.1501119136810303, + "eval_logps/chosen": -58.54638671875, + "eval_logps/rejected": -63.11029815673828, + "eval_loss": 0.6926776766777039, + "eval_rewards/accuracies": 0.5659851431846619, + "eval_rewards/chosen": 0.0016550758155062795, + "eval_rewards/margins": 0.0009568364475853741, + "eval_rewards/rejected": 0.0006982393097132444, + "eval_runtime": 384.6781, + "eval_samples_per_second": 11.189, + "eval_steps_per_second": 1.399, + "step": 900 + }, + { + "epoch": 0.31357684355616816, + "grad_norm": 1.5960333347320557, + "learning_rate": 4.9512085456616845e-08, + "logits/chosen": -3.119786024093628, + "logits/rejected": -3.0845987796783447, + "logps/chosen": -56.18413162231445, + "logps/rejected": -52.79380416870117, + "loss": 0.6916, + "rewards/accuracies": 0.6031249761581421, + "rewards/chosen": -0.0029451637528836727, + "rewards/margins": 0.003129506018012762, + "rewards/rejected": -0.006074669770896435, + "step": 910 + }, + { + "epoch": 0.31702274293590627, + "grad_norm": 1.6668741703033447, + "learning_rate": 4.948207862024345e-08, + "logits/chosen": -3.104740619659424, + "logits/rejected": -3.0957703590393066, + "logps/chosen": -55.60581588745117, + "logps/rejected": -55.4895133972168, + "loss": 0.6923, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.004241111688315868, + "rewards/margins": 0.0017586927860975266, + "rewards/rejected": -0.005999804940074682, + "step": 920 + }, + { + "epoch": 0.32046864231564437, + "grad_norm": 1.724545955657959, + "learning_rate": 4.9451186043024136e-08, + "logits/chosen": -3.041184902191162, + "logits/rejected": -3.02308988571167, + "logps/chosen": -55.5092887878418, + "logps/rejected": -54.970558166503906, + "loss": 0.6907, + "rewards/accuracies": 0.6656249761581421, + "rewards/chosen": -0.0025846255011856556, + "rewards/margins": 0.004906138870865107, + "rewards/rejected": -0.007490763906389475, + "step": 930 + }, + { + "epoch": 0.3239145416953825, + "grad_norm": 1.652547001838684, + "learning_rate": 4.941940884262618e-08, + "logits/chosen": -3.0771613121032715, + "logits/rejected": -3.045358896255493, + "logps/chosen": -54.669090270996094, + "logps/rejected": -53.556488037109375, + "loss": 0.6902, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.0033632400445640087, + "rewards/margins": 0.0059061916545033455, + "rewards/rejected": -0.009269431233406067, + "step": 940 + }, + { + "epoch": 0.32736044107512063, + "grad_norm": 1.7784395217895508, + "learning_rate": 4.938674816872173e-08, + "logits/chosen": -3.0897176265716553, + "logits/rejected": -3.070265054702759, + "logps/chosen": -55.5160026550293, + "logps/rejected": -54.05512237548828, + "loss": 0.691, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.00267231953330338, + "rewards/margins": 0.00433064391836524, + "rewards/rejected": -0.007002964615821838, + "step": 950 + }, + { + "epoch": 0.33080634045485874, + "grad_norm": 1.64098060131073, + "learning_rate": 4.935320520294628e-08, + "logits/chosen": -3.0121991634368896, + "logits/rejected": -2.980740785598755, + "logps/chosen": -55.34636306762695, + "logps/rejected": -54.396728515625, + "loss": 0.6908, + "rewards/accuracies": 0.6156250238418579, + "rewards/chosen": -0.004281845409423113, + "rewards/margins": 0.004687915090471506, + "rewards/rejected": -0.008969759568572044, + "step": 960 + }, + { + "epoch": 0.33425223983459684, + "grad_norm": 1.5851314067840576, + "learning_rate": 4.931878115885591e-08, + "logits/chosen": -3.029776096343994, + "logits/rejected": -3.0019984245300293, + "logps/chosen": -52.888648986816406, + "logps/rejected": -52.870269775390625, + "loss": 0.6909, + "rewards/accuracies": 0.621874988079071, + "rewards/chosen": -0.004734398797154427, + "rewards/margins": 0.004596917890012264, + "rewards/rejected": -0.009331315755844116, + "step": 970 + }, + { + "epoch": 0.33769813921433495, + "grad_norm": 1.506502389907837, + "learning_rate": 4.9283477281883315e-08, + "logits/chosen": -3.05110764503479, + "logits/rejected": -3.0407040119171143, + "logps/chosen": -53.92938232421875, + "logps/rejected": -55.473533630371094, + "loss": 0.6915, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.0049580661579966545, + "rewards/margins": 0.00340564688667655, + "rewards/rejected": -0.008363713510334492, + "step": 980 + }, + { + "epoch": 0.34114403859407305, + "grad_norm": 1.5995771884918213, + "learning_rate": 4.9247294849292856e-08, + "logits/chosen": -3.052551507949829, + "logits/rejected": -3.035020589828491, + "logps/chosen": -56.33484649658203, + "logps/rejected": -53.77924346923828, + "loss": 0.6913, + "rewards/accuracies": 0.6031249761581421, + "rewards/chosen": -0.004676403012126684, + "rewards/margins": 0.0036912120413035154, + "rewards/rejected": -0.008367614820599556, + "step": 990 + }, + { + "epoch": 0.34458993797381116, + "grad_norm": 1.676594614982605, + "learning_rate": 4.9210235170134244e-08, + "logits/chosen": -3.0952134132385254, + "logits/rejected": -3.080845832824707, + "logps/chosen": -50.99755096435547, + "logps/rejected": -53.95912551879883, + "loss": 0.6909, + "rewards/accuracies": 0.6468750238418579, + "rewards/chosen": -0.005998981185257435, + "rewards/margins": 0.004682415165007114, + "rewards/rejected": -0.01068139635026455, + "step": 1000 + }, + { + "epoch": 0.34458993797381116, + "eval_logits/chosen": -3.1538045406341553, + "eval_logits/rejected": -3.148146629333496, + "eval_logps/chosen": -58.5271110534668, + "eval_logps/rejected": -63.12847900390625, + "eval_loss": 0.6924968361854553, + "eval_rewards/accuracies": 0.5645910501480103, + "eval_rewards/chosen": 0.0018478184938430786, + "eval_rewards/margins": 0.00133140804246068, + "eval_rewards/rejected": 0.0005164103349670768, + "eval_runtime": 384.1578, + "eval_samples_per_second": 11.204, + "eval_steps_per_second": 1.4, + "step": 1000 + }, + { + "epoch": 0.34803583735354926, + "grad_norm": 1.7784764766693115, + "learning_rate": 4.917229958519526e-08, + "logits/chosen": -3.0560386180877686, + "logits/rejected": -3.0308563709259033, + "logps/chosen": -53.219703674316406, + "logps/rejected": -54.31676483154297, + "loss": 0.6903, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.005041136406362057, + "rewards/margins": 0.005721528083086014, + "rewards/rejected": -0.010762663558125496, + "step": 1010 + }, + { + "epoch": 0.35148173673328736, + "grad_norm": 1.6811140775680542, + "learning_rate": 4.9133489466953204e-08, + "logits/chosen": -3.0694663524627686, + "logits/rejected": -3.0522894859313965, + "logps/chosen": -55.4501838684082, + "logps/rejected": -55.52228546142578, + "loss": 0.6913, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.005660675000399351, + "rewards/margins": 0.0038871937431395054, + "rewards/rejected": -0.009547867812216282, + "step": 1020 + }, + { + "epoch": 0.3549276361130255, + "grad_norm": 1.588110089302063, + "learning_rate": 4.909380621952524e-08, + "logits/chosen": -3.112725257873535, + "logits/rejected": -3.081007957458496, + "logps/chosen": -53.6580696105957, + "logps/rejected": -53.864463806152344, + "loss": 0.6908, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.0038233071099966764, + "rewards/margins": 0.004857801832258701, + "rewards/rejected": -0.008681108243763447, + "step": 1030 + }, + { + "epoch": 0.35837353549276363, + "grad_norm": 1.6203489303588867, + "learning_rate": 4.9053251278617604e-08, + "logits/chosen": -3.0942468643188477, + "logits/rejected": -3.0676419734954834, + "logps/chosen": -54.168235778808594, + "logps/rejected": -53.65093231201172, + "loss": 0.6914, + "rewards/accuracies": 0.559374988079071, + "rewards/chosen": -0.00587734580039978, + "rewards/margins": 0.0035653274971991777, + "rewards/rejected": -0.00944267213344574, + "step": 1040 + }, + { + "epoch": 0.36181943487250173, + "grad_norm": 1.779880166053772, + "learning_rate": 4.9011826111473685e-08, + "logits/chosen": -3.063671112060547, + "logits/rejected": -3.050913095474243, + "logps/chosen": -55.40948486328125, + "logps/rejected": -53.851905822753906, + "loss": 0.6914, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0053887562826275826, + "rewards/margins": 0.003592419670894742, + "rewards/rejected": -0.008981176652014256, + "step": 1050 + }, + { + "epoch": 0.36526533425223984, + "grad_norm": 1.7278673648834229, + "learning_rate": 4.89695322168209e-08, + "logits/chosen": -3.0266060829162598, + "logits/rejected": -3.0144662857055664, + "logps/chosen": -51.79481887817383, + "logps/rejected": -54.04804611206055, + "loss": 0.691, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.005571847315877676, + "rewards/margins": 0.004314024467021227, + "rewards/rejected": -0.009885871782898903, + "step": 1060 + }, + { + "epoch": 0.36871123363197794, + "grad_norm": 1.6478562355041504, + "learning_rate": 4.89263711248165e-08, + "logits/chosen": -2.9854063987731934, + "logits/rejected": -2.97200608253479, + "logps/chosen": -52.4482307434082, + "logps/rejected": -54.30084991455078, + "loss": 0.6914, + "rewards/accuracies": 0.5531250238418579, + "rewards/chosen": -0.008854800835251808, + "rewards/margins": 0.0037074810825288296, + "rewards/rejected": -0.012562280520796776, + "step": 1070 + }, + { + "epoch": 0.37215713301171605, + "grad_norm": 1.499592900276184, + "learning_rate": 4.8882344396992184e-08, + "logits/chosen": -3.009160041809082, + "logits/rejected": -2.978325843811035, + "logps/chosen": -55.19956588745117, + "logps/rejected": -49.916481018066406, + "loss": 0.6899, + "rewards/accuracies": 0.6468750238418579, + "rewards/chosen": -0.004767565988004208, + "rewards/margins": 0.006583952344954014, + "rewards/rejected": -0.011351518332958221, + "step": 1080 + }, + { + "epoch": 0.37560303239145415, + "grad_norm": 1.8961305618286133, + "learning_rate": 4.883745362619765e-08, + "logits/chosen": -3.1398892402648926, + "logits/rejected": -3.112936496734619, + "logps/chosen": -56.82250213623047, + "logps/rejected": -53.331634521484375, + "loss": 0.6903, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.006448267959058285, + "rewards/margins": 0.0057860021479427814, + "rewards/rejected": -0.012234269641339779, + "step": 1090 + }, + { + "epoch": 0.37904893177119225, + "grad_norm": 1.9023057222366333, + "learning_rate": 4.8791700436542915e-08, + "logits/chosen": -3.1427078247070312, + "logits/rejected": -3.127922773361206, + "logps/chosen": -53.18560791015625, + "logps/rejected": -55.44195556640625, + "loss": 0.6907, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.006103273946791887, + "rewards/margins": 0.005071353167295456, + "rewards/rejected": -0.011174628511071205, + "step": 1100 + }, + { + "epoch": 0.37904893177119225, + "eval_logits/chosen": -3.151279926300049, + "eval_logits/rejected": -3.145657777786255, + "eval_logps/chosen": -58.5153694152832, + "eval_logps/rejected": -63.14689636230469, + "eval_loss": 0.6923530697822571, + "eval_rewards/accuracies": 0.5604089498519897, + "eval_rewards/chosen": 0.0019652547780424356, + "eval_rewards/margins": 0.0016330406069755554, + "eval_rewards/rejected": 0.0003322141710668802, + "eval_runtime": 384.3912, + "eval_samples_per_second": 11.197, + "eval_steps_per_second": 1.4, + "step": 1100 + }, + { + "epoch": 0.3824948311509304, + "grad_norm": 1.788101315498352, + "learning_rate": 4.874508648333959e-08, + "logits/chosen": -3.028311252593994, + "logits/rejected": -3.0164096355438232, + "logps/chosen": -54.771568298339844, + "logps/rejected": -54.829750061035156, + "loss": 0.6903, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.004693931899964809, + "rewards/margins": 0.005819953512400389, + "rewards/rejected": -0.01051388494670391, + "step": 1110 + }, + { + "epoch": 0.3859407305306685, + "grad_norm": 1.7864511013031006, + "learning_rate": 4.8697613453040974e-08, + "logits/chosen": -3.0762057304382324, + "logits/rejected": -3.0420901775360107, + "logps/chosen": -55.974891662597656, + "logps/rejected": -53.59397506713867, + "loss": 0.6898, + "rewards/accuracies": 0.6031249761581421, + "rewards/chosen": -0.00731278071179986, + "rewards/margins": 0.0069493455812335014, + "rewards/rejected": -0.014262126758694649, + "step": 1120 + }, + { + "epoch": 0.3893866299104066, + "grad_norm": 1.6350274085998535, + "learning_rate": 4.864928306318104e-08, + "logits/chosen": -2.9895944595336914, + "logits/rejected": -2.9592795372009277, + "logps/chosen": -58.717308044433594, + "logps/rejected": -56.6348876953125, + "loss": 0.69, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.006385552231222391, + "rewards/margins": 0.006410741712898016, + "rewards/rejected": -0.012796293012797832, + "step": 1130 + }, + { + "epoch": 0.3928325292901447, + "grad_norm": 1.6556651592254639, + "learning_rate": 4.860009706231234e-08, + "logits/chosen": -2.992837429046631, + "logits/rejected": -2.971184730529785, + "logps/chosen": -54.53647994995117, + "logps/rejected": -54.9399528503418, + "loss": 0.6906, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.007840116508305073, + "rewards/margins": 0.005326352082192898, + "rewards/rejected": -0.013166469521820545, + "step": 1140 + }, + { + "epoch": 0.39627842866988283, + "grad_norm": 1.8056648969650269, + "learning_rate": 4.8550057229942654e-08, + "logits/chosen": -3.019346237182617, + "logits/rejected": -2.987921953201294, + "logps/chosen": -54.99890899658203, + "logps/rejected": -55.09080123901367, + "loss": 0.6892, + "rewards/accuracies": 0.653124988079071, + "rewards/chosen": -0.005849289242178202, + "rewards/margins": 0.007977871224284172, + "rewards/rejected": -0.013827161863446236, + "step": 1150 + }, + { + "epoch": 0.39972432804962094, + "grad_norm": 1.6521730422973633, + "learning_rate": 4.849916537647071e-08, + "logits/chosen": -3.0615134239196777, + "logits/rejected": -3.032876491546631, + "logps/chosen": -55.03501510620117, + "logps/rejected": -52.776710510253906, + "loss": 0.6897, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.007752572186291218, + "rewards/margins": 0.007105639670044184, + "rewards/rejected": -0.014858213253319263, + "step": 1160 + }, + { + "epoch": 0.40317022742935904, + "grad_norm": 1.550277829170227, + "learning_rate": 4.844742334312059e-08, + "logits/chosen": -3.0416417121887207, + "logits/rejected": -3.0246546268463135, + "logps/chosen": -54.85407257080078, + "logps/rejected": -55.321006774902344, + "loss": 0.6897, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.007986031472682953, + "rewards/margins": 0.007059283554553986, + "rewards/rejected": -0.015045315027236938, + "step": 1170 + }, + { + "epoch": 0.4066161268090972, + "grad_norm": 1.6416267156600952, + "learning_rate": 4.8394833001875206e-08, + "logits/chosen": -3.0568747520446777, + "logits/rejected": -3.0389015674591064, + "logps/chosen": -55.069847106933594, + "logps/rejected": -54.663665771484375, + "loss": 0.6898, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.007581622339785099, + "rewards/margins": 0.0068257213570177555, + "rewards/rejected": -0.014407342299818993, + "step": 1180 + }, + { + "epoch": 0.4100620261888353, + "grad_norm": 1.6217989921569824, + "learning_rate": 4.834139625540851e-08, + "logits/chosen": -3.044999599456787, + "logits/rejected": -3.029223918914795, + "logps/chosen": -54.880348205566406, + "logps/rejected": -54.82160568237305, + "loss": 0.6901, + "rewards/accuracies": 0.640625, + "rewards/chosen": -0.008282794617116451, + "rewards/margins": 0.00628671795129776, + "rewards/rejected": -0.014569511637091637, + "step": 1190 + }, + { + "epoch": 0.4135079255685734, + "grad_norm": 1.6303709745407104, + "learning_rate": 4.828711503701667e-08, + "logits/chosen": -3.1172478199005127, + "logits/rejected": -3.093177556991577, + "logps/chosen": -55.1821174621582, + "logps/rejected": -54.7901725769043, + "loss": 0.6898, + "rewards/accuracies": 0.596875011920929, + "rewards/chosen": -0.009148449636995792, + "rewards/margins": 0.0069651217199862, + "rewards/rejected": -0.01611356995999813, + "step": 1200 + }, + { + "epoch": 0.4135079255685734, + "eval_logits/chosen": -3.1480495929718018, + "eval_logits/rejected": -3.14241886138916, + "eval_logps/chosen": -58.530601501464844, + "eval_logps/rejected": -63.214256286621094, + "eval_loss": 0.6921030282974243, + "eval_rewards/accuracies": 0.5743494629859924, + "eval_rewards/chosen": 0.0018129091477021575, + "eval_rewards/margins": 0.0021542287431657314, + "eval_rewards/rejected": -0.0003413195663597435, + "eval_runtime": 384.364, + "eval_samples_per_second": 11.198, + "eval_steps_per_second": 1.4, + "step": 1200 + }, + { + "epoch": 0.4169538249483115, + "grad_norm": 1.654517412185669, + "learning_rate": 4.823199131054816e-08, + "logits/chosen": -3.113901138305664, + "logits/rejected": -3.082550525665283, + "logps/chosen": -55.60883331298828, + "logps/rejected": -53.55535888671875, + "loss": 0.6889, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.005926042329519987, + "rewards/margins": 0.00877379346638918, + "rewards/rejected": -0.014699837192893028, + "step": 1210 + }, + { + "epoch": 0.4203997243280496, + "grad_norm": 1.735965371131897, + "learning_rate": 4.8176027070332646e-08, + "logits/chosen": -3.0720534324645996, + "logits/rejected": -3.05411958694458, + "logps/chosen": -55.67919921875, + "logps/rejected": -54.55861282348633, + "loss": 0.6897, + "rewards/accuracies": 0.6031249761581421, + "rewards/chosen": -0.007292713038623333, + "rewards/margins": 0.007126192562282085, + "rewards/rejected": -0.014418904669582844, + "step": 1220 + }, + { + "epoch": 0.4238456237077877, + "grad_norm": 1.8626728057861328, + "learning_rate": 4.811922434110889e-08, + "logits/chosen": -3.0178306102752686, + "logits/rejected": -2.9898033142089844, + "logps/chosen": -55.622398376464844, + "logps/rejected": -54.017578125, + "loss": 0.6903, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.01022510789334774, + "rewards/margins": 0.005996005143970251, + "rewards/rejected": -0.01622111350297928, + "step": 1230 + }, + { + "epoch": 0.4272915230875258, + "grad_norm": 1.7841885089874268, + "learning_rate": 4.806158517795148e-08, + "logits/chosen": -3.11910343170166, + "logits/rejected": -3.0926966667175293, + "logps/chosen": -55.5859375, + "logps/rejected": -53.11517333984375, + "loss": 0.6899, + "rewards/accuracies": 0.640625, + "rewards/chosen": -0.008747449144721031, + "rewards/margins": 0.006734578870236874, + "rewards/rejected": -0.015482030808925629, + "step": 1240 + }, + { + "epoch": 0.43073742246726393, + "grad_norm": 1.685684084892273, + "learning_rate": 4.800311166619646e-08, + "logits/chosen": -3.0900144577026367, + "logits/rejected": -3.073540210723877, + "logps/chosen": -54.59074783325195, + "logps/rejected": -55.54296875, + "loss": 0.6916, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.011259722523391247, + "rewards/margins": 0.003347150282934308, + "rewards/rejected": -0.014606873504817486, + "step": 1250 + }, + { + "epoch": 0.4341833218470021, + "grad_norm": 1.6296279430389404, + "learning_rate": 4.794380592136591e-08, + "logits/chosen": -2.9728333950042725, + "logits/rejected": -2.9540724754333496, + "logps/chosen": -53.40140914916992, + "logps/rejected": -52.111854553222656, + "loss": 0.69, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.009744682349264622, + "rewards/margins": 0.006456801202148199, + "rewards/rejected": -0.016201484948396683, + "step": 1260 + }, + { + "epoch": 0.4376292212267402, + "grad_norm": 1.6278390884399414, + "learning_rate": 4.788367008909139e-08, + "logits/chosen": -3.076103448867798, + "logits/rejected": -3.0685179233551025, + "logps/chosen": -53.551544189453125, + "logps/rejected": -55.704368591308594, + "loss": 0.6907, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.01196536235511303, + "rewards/margins": 0.005077089183032513, + "rewards/rejected": -0.017042452469468117, + "step": 1270 + }, + { + "epoch": 0.4410751206064783, + "grad_norm": 1.77932870388031, + "learning_rate": 4.782270634503631e-08, + "logits/chosen": -3.0844674110412598, + "logits/rejected": -3.0556795597076416, + "logps/chosen": -57.6111946105957, + "logps/rejected": -55.6341667175293, + "loss": 0.6883, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.006394694559276104, + "rewards/margins": 0.009988361969590187, + "rewards/rejected": -0.016383057460188866, + "step": 1280 + }, + { + "epoch": 0.4445210199862164, + "grad_norm": 1.7250643968582153, + "learning_rate": 4.776091689481725e-08, + "logits/chosen": -3.075246810913086, + "logits/rejected": -3.056575059890747, + "logps/chosen": -57.056922912597656, + "logps/rejected": -56.40248489379883, + "loss": 0.6908, + "rewards/accuracies": 0.5718749761581421, + "rewards/chosen": -0.01063154824078083, + "rewards/margins": 0.004961362108588219, + "rewards/rejected": -0.015592910349369049, + "step": 1290 + }, + { + "epoch": 0.4479669193659545, + "grad_norm": 1.6703568696975708, + "learning_rate": 4.7698303973924136e-08, + "logits/chosen": -3.051967144012451, + "logits/rejected": -3.009127140045166, + "logps/chosen": -57.8825798034668, + "logps/rejected": -51.91571807861328, + "loss": 0.688, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.009872758761048317, + "rewards/margins": 0.010731091722846031, + "rewards/rejected": -0.0206038486212492, + "step": 1300 + }, + { + "epoch": 0.4479669193659545, + "eval_logits/chosen": -3.144803047180176, + "eval_logits/rejected": -3.139164924621582, + "eval_logps/chosen": -58.535133361816406, + "eval_logps/rejected": -63.26057052612305, + "eval_loss": 0.6919035315513611, + "eval_rewards/accuracies": 0.574117124080658, + "eval_rewards/chosen": 0.0017676005372777581, + "eval_rewards/margins": 0.0025721341371536255, + "eval_rewards/rejected": -0.0008045334252528846, + "eval_runtime": 384.3396, + "eval_samples_per_second": 11.198, + "eval_steps_per_second": 1.4, + "step": 1300 + }, + { + "epoch": 0.4514128187456926, + "grad_norm": 1.5745819807052612, + "learning_rate": 4.7634869847639334e-08, + "logits/chosen": -3.062584638595581, + "logits/rejected": -3.0274033546447754, + "logps/chosen": -55.93726348876953, + "logps/rejected": -53.12489700317383, + "loss": 0.6885, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.011147616431117058, + "rewards/margins": 0.009501030668616295, + "rewards/rejected": -0.020648647099733353, + "step": 1310 + }, + { + "epoch": 0.4548587181254307, + "grad_norm": 1.6925148963928223, + "learning_rate": 4.757061681095577e-08, + "logits/chosen": -3.001038074493408, + "logits/rejected": -2.9740278720855713, + "logps/chosen": -53.52008056640625, + "logps/rejected": -53.030731201171875, + "loss": 0.6888, + "rewards/accuracies": 0.621874988079071, + "rewards/chosen": -0.010656071826815605, + "rewards/margins": 0.008936294354498386, + "rewards/rejected": -0.019592367112636566, + "step": 1320 + }, + { + "epoch": 0.4583046175051689, + "grad_norm": 1.7766765356063843, + "learning_rate": 4.750554718849381e-08, + "logits/chosen": -2.998490810394287, + "logits/rejected": -2.9684324264526367, + "logps/chosen": -56.33980178833008, + "logps/rejected": -55.1796875, + "loss": 0.6892, + "rewards/accuracies": 0.596875011920929, + "rewards/chosen": -0.009794693440198898, + "rewards/margins": 0.008208018727600574, + "rewards/rejected": -0.018002711236476898, + "step": 1330 + }, + { + "epoch": 0.461750516884907, + "grad_norm": 1.8487436771392822, + "learning_rate": 4.743966333441723e-08, + "logits/chosen": -3.0082459449768066, + "logits/rejected": -2.980231761932373, + "logps/chosen": -57.111976623535156, + "logps/rejected": -54.838539123535156, + "loss": 0.6885, + "rewards/accuracies": 0.640625, + "rewards/chosen": -0.010152112692594528, + "rewards/margins": 0.009611548855900764, + "rewards/rejected": -0.019763659685850143, + "step": 1340 + }, + { + "epoch": 0.4651964162646451, + "grad_norm": 1.653855800628662, + "learning_rate": 4.7372967632348016e-08, + "logits/chosen": -3.0194742679595947, + "logits/rejected": -2.9947919845581055, + "logps/chosen": -53.08203887939453, + "logps/rejected": -53.925636291503906, + "loss": 0.6878, + "rewards/accuracies": 0.6656249761581421, + "rewards/chosen": -0.011461066082119942, + "rewards/margins": 0.010980509221553802, + "rewards/rejected": -0.022441575303673744, + "step": 1350 + }, + { + "epoch": 0.4686423156443832, + "grad_norm": 1.7544745206832886, + "learning_rate": 4.7305462495280103e-08, + "logits/chosen": -3.0522308349609375, + "logits/rejected": -3.044433832168579, + "logps/chosen": -56.90116500854492, + "logps/rejected": -56.19316482543945, + "loss": 0.6909, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.011542152613401413, + "rewards/margins": 0.004821972921490669, + "rewards/rejected": -0.016364123672246933, + "step": 1360 + }, + { + "epoch": 0.4720882150241213, + "grad_norm": 1.8341665267944336, + "learning_rate": 4.723715036549211e-08, + "logits/chosen": -3.041679620742798, + "logits/rejected": -3.020946979522705, + "logps/chosen": -56.77973556518555, + "logps/rejected": -54.12712478637695, + "loss": 0.6895, + "rewards/accuracies": 0.590624988079071, + "rewards/chosen": -0.010667492635548115, + "rewards/margins": 0.00756409764289856, + "rewards/rejected": -0.0182315893471241, + "step": 1370 + }, + { + "epoch": 0.4755341144038594, + "grad_norm": 1.7911666631698608, + "learning_rate": 4.7168033714458986e-08, + "logits/chosen": -2.9922127723693848, + "logits/rejected": -2.9809184074401855, + "logps/chosen": -53.072364807128906, + "logps/rejected": -56.233436584472656, + "loss": 0.6892, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.013029935769736767, + "rewards/margins": 0.00814075767993927, + "rewards/rejected": -0.021170692518353462, + "step": 1380 + }, + { + "epoch": 0.4789800137835975, + "grad_norm": 1.6858317852020264, + "learning_rate": 4.7098115042762554e-08, + "logits/chosen": -3.0591578483581543, + "logits/rejected": -3.028982400894165, + "logps/chosen": -55.264068603515625, + "logps/rejected": -53.70880126953125, + "loss": 0.6901, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.012422731146216393, + "rewards/margins": 0.006396573968231678, + "rewards/rejected": -0.018819306045770645, + "step": 1390 + }, + { + "epoch": 0.4824259131633356, + "grad_norm": 1.8360016345977783, + "learning_rate": 4.702739688000106e-08, + "logits/chosen": -3.07702898979187, + "logits/rejected": -3.0475409030914307, + "logps/chosen": -56.95623779296875, + "logps/rejected": -55.4343376159668, + "loss": 0.6888, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.010582750663161278, + "rewards/margins": 0.009008489549160004, + "rewards/rejected": -0.019591238349676132, + "step": 1400 + }, + { + "epoch": 0.4824259131633356, + "eval_logits/chosen": -3.142038106918335, + "eval_logits/rejected": -3.136406183242798, + "eval_logps/chosen": -58.60540771484375, + "eval_logps/rejected": -63.37487030029297, + "eval_loss": 0.6916956305503845, + "eval_rewards/accuracies": 0.5722583532333374, + "eval_rewards/chosen": 0.0010648738825693727, + "eval_rewards/margins": 0.003012324683368206, + "eval_rewards/rejected": -0.001947450335137546, + "eval_runtime": 384.5538, + "eval_samples_per_second": 11.192, + "eval_steps_per_second": 1.399, + "step": 1400 + }, + { + "epoch": 0.48587181254307377, + "grad_norm": 1.7492179870605469, + "learning_rate": 4.695588178469768e-08, + "logits/chosen": -3.0327694416046143, + "logits/rejected": -3.014371871948242, + "logps/chosen": -56.01494598388672, + "logps/rejected": -56.41218948364258, + "loss": 0.6891, + "rewards/accuracies": 0.628125011920929, + "rewards/chosen": -0.011834423989057541, + "rewards/margins": 0.00837056152522564, + "rewards/rejected": -0.02020498737692833, + "step": 1410 + }, + { + "epoch": 0.48931771192281187, + "grad_norm": 1.6037969589233398, + "learning_rate": 4.688357234420793e-08, + "logits/chosen": -2.9960224628448486, + "logits/rejected": -2.9826016426086426, + "logps/chosen": -55.37086868286133, + "logps/rejected": -55.3111686706543, + "loss": 0.6891, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.013024615123867989, + "rewards/margins": 0.008364452980458736, + "rewards/rejected": -0.0213890690356493, + "step": 1420 + }, + { + "epoch": 0.49276361130255, + "grad_norm": 1.633384346961975, + "learning_rate": 4.681047117462605e-08, + "logits/chosen": -3.034104824066162, + "logits/rejected": -3.0122103691101074, + "logps/chosen": -54.78871536254883, + "logps/rejected": -55.884490966796875, + "loss": 0.6869, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.011832429096102715, + "rewards/margins": 0.012881122529506683, + "rewards/rejected": -0.02471354976296425, + "step": 1430 + }, + { + "epoch": 0.4962095106822881, + "grad_norm": 1.8843494653701782, + "learning_rate": 4.673658092069036e-08, + "logits/chosen": -3.103280544281006, + "logits/rejected": -3.0781211853027344, + "logps/chosen": -56.96147537231445, + "logps/rejected": -54.37610626220703, + "loss": 0.6876, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.012295748107135296, + "rewards/margins": 0.011648855172097683, + "rewards/rejected": -0.023944605141878128, + "step": 1440 + }, + { + "epoch": 0.4996554100620262, + "grad_norm": 1.6782125234603882, + "learning_rate": 4.666190425568761e-08, + "logits/chosen": -3.1093602180480957, + "logits/rejected": -3.1019256114959717, + "logps/chosen": -53.99622344970703, + "logps/rejected": -53.916046142578125, + "loss": 0.6902, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.01273355446755886, + "rewards/margins": 0.006157218478620052, + "rewards/rejected": -0.018890773877501488, + "step": 1450 + }, + { + "epoch": 0.5031013094417643, + "grad_norm": 1.6588455438613892, + "learning_rate": 4.658644388135622e-08, + "logits/chosen": -3.080059289932251, + "logits/rejected": -3.072068691253662, + "logps/chosen": -56.12352752685547, + "logps/rejected": -57.99721145629883, + "loss": 0.6883, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.01284876000136137, + "rewards/margins": 0.01009051688015461, + "rewards/rejected": -0.022939275950193405, + "step": 1460 + }, + { + "epoch": 0.5065472088215024, + "grad_norm": 1.5984660387039185, + "learning_rate": 4.651020252778855e-08, + "logits/chosen": -3.040616035461426, + "logits/rejected": -3.007890224456787, + "logps/chosen": -53.751983642578125, + "logps/rejected": -53.546485900878906, + "loss": 0.6882, + "rewards/accuracies": 0.659375011920929, + "rewards/chosen": -0.012961966916918755, + "rewards/margins": 0.010237214155495167, + "rewards/rejected": -0.023199182003736496, + "step": 1470 + }, + { + "epoch": 0.5099931082012406, + "grad_norm": 1.9550174474716187, + "learning_rate": 4.6433182953332116e-08, + "logits/chosen": -3.0700085163116455, + "logits/rejected": -3.047051429748535, + "logps/chosen": -55.44309616088867, + "logps/rejected": -55.55956268310547, + "loss": 0.6882, + "rewards/accuracies": 0.6031249761581421, + "rewards/chosen": -0.013085572049021721, + "rewards/margins": 0.010364504531025887, + "rewards/rejected": -0.023450080305337906, + "step": 1480 + }, + { + "epoch": 0.5134390075809786, + "grad_norm": 1.6405853033065796, + "learning_rate": 4.635538794448982e-08, + "logits/chosen": -2.9804530143737793, + "logits/rejected": -2.9533791542053223, + "logps/chosen": -55.882049560546875, + "logps/rejected": -54.833106994628906, + "loss": 0.6874, + "rewards/accuracies": 0.6343749761581421, + "rewards/chosen": -0.012936905026435852, + "rewards/margins": 0.011920436285436153, + "rewards/rejected": -0.02485733851790428, + "step": 1490 + }, + { + "epoch": 0.5168849069607168, + "grad_norm": 1.5811128616333008, + "learning_rate": 4.627682031581913e-08, + "logits/chosen": -3.031785726547241, + "logits/rejected": -3.015012264251709, + "logps/chosen": -55.61000442504883, + "logps/rejected": -56.501609802246094, + "loss": 0.6886, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.012989061884582043, + "rewards/margins": 0.009481636807322502, + "rewards/rejected": -0.02247069776058197, + "step": 1500 + }, + { + "epoch": 0.5168849069607168, + "eval_logits/chosen": -3.1381611824035645, + "eval_logits/rejected": -3.1325039863586426, + "eval_logps/chosen": -58.68781661987305, + "eval_logps/rejected": -63.50574493408203, + "eval_loss": 0.6914681792259216, + "eval_rewards/accuracies": 0.5736523866653442, + "eval_rewards/chosen": 0.00024078537535388023, + "eval_rewards/margins": 0.003497007070109248, + "eval_rewards/rejected": -0.0032562220003455877, + "eval_runtime": 384.6728, + "eval_samples_per_second": 11.189, + "eval_steps_per_second": 1.399, + "step": 1500 + }, + { + "epoch": 0.5203308063404548, + "grad_norm": 1.8088315725326538, + "learning_rate": 4.619748290983022e-08, + "logits/chosen": -3.1065077781677246, + "logits/rejected": -3.0788774490356445, + "logps/chosen": -56.224525451660156, + "logps/rejected": -54.217689514160156, + "loss": 0.6884, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.015114299952983856, + "rewards/margins": 0.01001181174069643, + "rewards/rejected": -0.025126110762357712, + "step": 1510 + }, + { + "epoch": 0.523776705720193, + "grad_norm": 1.5406330823898315, + "learning_rate": 4.611737859688317e-08, + "logits/chosen": -3.102351188659668, + "logits/rejected": -3.0882105827331543, + "logps/chosen": -55.27317428588867, + "logps/rejected": -57.054290771484375, + "loss": 0.6903, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.018418293446302414, + "rewards/margins": 0.006099226884543896, + "rewards/rejected": -0.024517521262168884, + "step": 1520 + }, + { + "epoch": 0.5272226050999311, + "grad_norm": 1.724234700202942, + "learning_rate": 4.6036510275084114e-08, + "logits/chosen": -3.031350612640381, + "logits/rejected": -3.015336036682129, + "logps/chosen": -56.156715393066406, + "logps/rejected": -56.938873291015625, + "loss": 0.6907, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.015397797338664532, + "rewards/margins": 0.0053660026751458645, + "rewards/rejected": -0.02076379954814911, + "step": 1530 + }, + { + "epoch": 0.5306685044796692, + "grad_norm": 1.558053970336914, + "learning_rate": 4.5954880870180344e-08, + "logits/chosen": -2.9651525020599365, + "logits/rejected": -2.9437994956970215, + "logps/chosen": -56.369361877441406, + "logps/rejected": -57.072608947753906, + "loss": 0.6897, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.019688406959176064, + "rewards/margins": 0.007340868003666401, + "rewards/rejected": -0.02702927589416504, + "step": 1540 + }, + { + "epoch": 0.5341144038594073, + "grad_norm": 1.822608470916748, + "learning_rate": 4.587249333545453e-08, + "logits/chosen": -3.0255002975463867, + "logits/rejected": -3.0009472370147705, + "logps/chosen": -55.275184631347656, + "logps/rejected": -55.24955368041992, + "loss": 0.6878, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.015089382417500019, + "rewards/margins": 0.011031006462872028, + "rewards/rejected": -0.026120388880372047, + "step": 1550 + }, + { + "epoch": 0.5375603032391454, + "grad_norm": 1.7990676164627075, + "learning_rate": 4.578935065161782e-08, + "logits/chosen": -3.008293867111206, + "logits/rejected": -3.001120090484619, + "logps/chosen": -55.151153564453125, + "logps/rejected": -57.88622283935547, + "loss": 0.69, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.018395179882645607, + "rewards/margins": 0.006701651960611343, + "rewards/rejected": -0.0250968299806118, + "step": 1560 + }, + { + "epoch": 0.5410062026188835, + "grad_norm": 1.6503915786743164, + "learning_rate": 4.570545582670201e-08, + "logits/chosen": -3.008349657058716, + "logits/rejected": -2.9964206218719482, + "logps/chosen": -53.5858039855957, + "logps/rejected": -55.97298049926758, + "loss": 0.6897, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.01696760021150112, + "rewards/margins": 0.007355398032814264, + "rewards/rejected": -0.0243229977786541, + "step": 1570 + }, + { + "epoch": 0.5444521019986216, + "grad_norm": 1.6711068153381348, + "learning_rate": 4.5620811895950746e-08, + "logits/chosen": -3.0385825634002686, + "logits/rejected": -3.009280204772949, + "logps/chosen": -55.50677490234375, + "logps/rejected": -55.899436950683594, + "loss": 0.6858, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.012491394765675068, + "rewards/margins": 0.015325082466006279, + "rewards/rejected": -0.02781647816300392, + "step": 1580 + }, + { + "epoch": 0.5478980013783598, + "grad_norm": 1.53829824924469, + "learning_rate": 4.553542192170966e-08, + "logits/chosen": -3.060349225997925, + "logits/rejected": -3.025090217590332, + "logps/chosen": -56.067955017089844, + "logps/rejected": -53.33306884765625, + "loss": 0.6865, + "rewards/accuracies": 0.684374988079071, + "rewards/chosen": -0.014950567856431007, + "rewards/margins": 0.013809828087687492, + "rewards/rejected": -0.02876039408147335, + "step": 1590 + }, + { + "epoch": 0.5513439007580979, + "grad_norm": 1.8090956211090088, + "learning_rate": 4.5449288993315615e-08, + "logits/chosen": -3.047544002532959, + "logits/rejected": -3.0376791954040527, + "logps/chosen": -55.40331268310547, + "logps/rejected": -56.957740783691406, + "loss": 0.6885, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.01590132713317871, + "rewards/margins": 0.00968841277062893, + "rewards/rejected": -0.02558973990380764, + "step": 1600 + }, + { + "epoch": 0.5513439007580979, + "eval_logits/chosen": -3.1351428031921387, + "eval_logits/rejected": -3.1294989585876465, + "eval_logps/chosen": -58.74068832397461, + "eval_logps/rejected": -63.60573196411133, + "eval_loss": 0.6912448406219482, + "eval_rewards/accuracies": 0.5769051909446716, + "eval_rewards/chosen": -0.00028791907243430614, + "eval_rewards/margins": 0.0039681848138570786, + "eval_rewards/rejected": -0.004256103653460741, + "eval_runtime": 384.8288, + "eval_samples_per_second": 11.184, + "eval_steps_per_second": 1.398, + "step": 1600 + }, + { + "epoch": 0.554789800137836, + "grad_norm": 1.7352491617202759, + "learning_rate": 4.536241622698493e-08, + "logits/chosen": -2.9448189735412598, + "logits/rejected": -2.929994821548462, + "logps/chosen": -53.825408935546875, + "logps/rejected": -55.268638610839844, + "loss": 0.6889, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.016777951270341873, + "rewards/margins": 0.009007781744003296, + "rewards/rejected": -0.02578573301434517, + "step": 1610 + }, + { + "epoch": 0.5582356995175741, + "grad_norm": 1.6469370126724243, + "learning_rate": 4.5274806765700636e-08, + "logits/chosen": -3.078242540359497, + "logits/rejected": -3.0581135749816895, + "logps/chosen": -56.996124267578125, + "logps/rejected": -57.18330764770508, + "loss": 0.6888, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.01799107901751995, + "rewards/margins": 0.009230229072272778, + "rewards/rejected": -0.027221307158470154, + "step": 1620 + }, + { + "epoch": 0.5616815988973122, + "grad_norm": 1.6323329210281372, + "learning_rate": 4.518646377909875e-08, + "logits/chosen": -3.038658857345581, + "logits/rejected": -3.0193114280700684, + "logps/chosen": -53.77470779418945, + "logps/rejected": -56.057960510253906, + "loss": 0.6868, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.017207052558660507, + "rewards/margins": 0.013134879060089588, + "rewards/rejected": -0.03034193441271782, + "step": 1630 + }, + { + "epoch": 0.5651274982770503, + "grad_norm": 1.6922211647033691, + "learning_rate": 4.5097390463353626e-08, + "logits/chosen": -3.10493540763855, + "logits/rejected": -3.097712516784668, + "logps/chosen": -54.40106201171875, + "logps/rejected": -58.211830139160156, + "loss": 0.6887, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.0212591253221035, + "rewards/margins": 0.009430269710719585, + "rewards/rejected": -0.03068939410150051, + "step": 1640 + }, + { + "epoch": 0.5685733976567884, + "grad_norm": 1.8530436754226685, + "learning_rate": 4.5007590041062295e-08, + "logits/chosen": -3.092582941055298, + "logits/rejected": -3.0659327507019043, + "logps/chosen": -55.72516632080078, + "logps/rejected": -56.46184539794922, + "loss": 0.6867, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.016838040202856064, + "rewards/margins": 0.013281027786433697, + "rewards/rejected": -0.030119070783257484, + "step": 1650 + }, + { + "epoch": 0.5720192970365265, + "grad_norm": 1.7546192407608032, + "learning_rate": 4.4917065761127907e-08, + "logits/chosen": -3.00036358833313, + "logits/rejected": -2.968268871307373, + "logps/chosen": -55.17334747314453, + "logps/rejected": -53.06787109375, + "loss": 0.6861, + "rewards/accuracies": 0.6468750238418579, + "rewards/chosen": -0.017701324075460434, + "rewards/margins": 0.014755235984921455, + "rewards/rejected": -0.03245655819773674, + "step": 1660 + }, + { + "epoch": 0.5754651964162646, + "grad_norm": 1.7550996541976929, + "learning_rate": 4.482582089864214e-08, + "logits/chosen": -3.0978662967681885, + "logits/rejected": -3.072986602783203, + "logps/chosen": -56.618186950683594, + "logps/rejected": -56.91802978515625, + "loss": 0.6858, + "rewards/accuracies": 0.6156250238418579, + "rewards/chosen": -0.015549172647297382, + "rewards/margins": 0.015354210510849953, + "rewards/rejected": -0.03090338036417961, + "step": 1670 + }, + { + "epoch": 0.5789110957960028, + "grad_norm": 1.7752835750579834, + "learning_rate": 4.473385875476675e-08, + "logits/chosen": -3.021265745162964, + "logits/rejected": -2.9964027404785156, + "logps/chosen": -57.2200813293457, + "logps/rejected": -58.078155517578125, + "loss": 0.6866, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.01923130638897419, + "rewards/margins": 0.013771514408290386, + "rewards/rejected": -0.033002819865942, + "step": 1680 + }, + { + "epoch": 0.5823569951757409, + "grad_norm": 1.647120714187622, + "learning_rate": 4.464118265661414e-08, + "logits/chosen": -3.063469648361206, + "logits/rejected": -3.0422072410583496, + "logps/chosen": -54.968994140625, + "logps/rejected": -55.223297119140625, + "loss": 0.6865, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.017609132453799248, + "rewards/margins": 0.013836865313351154, + "rewards/rejected": -0.031446002423763275, + "step": 1690 + }, + { + "epoch": 0.585802894555479, + "grad_norm": 1.740160584449768, + "learning_rate": 4.454779595712694e-08, + "logits/chosen": -3.06133770942688, + "logits/rejected": -3.0284037590026855, + "logps/chosen": -55.181007385253906, + "logps/rejected": -54.75300979614258, + "loss": 0.6861, + "rewards/accuracies": 0.6656249761581421, + "rewards/chosen": -0.01888805255293846, + "rewards/margins": 0.014551205560564995, + "rewards/rejected": -0.03343925625085831, + "step": 1700 + }, + { + "epoch": 0.585802894555479, + "eval_logits/chosen": -3.1310038566589355, + "eval_logits/rejected": -3.125328540802002, + "eval_logps/chosen": -58.872894287109375, + "eval_logps/rejected": -63.80035400390625, + "eval_loss": 0.6909549236297607, + "eval_rewards/accuracies": 0.5745818018913269, + "eval_rewards/chosen": -0.0016099718632176518, + "eval_rewards/margins": 0.0045923274010419846, + "eval_rewards/rejected": -0.006202299147844315, + "eval_runtime": 384.7457, + "eval_samples_per_second": 11.187, + "eval_steps_per_second": 1.398, + "step": 1700 + }, + { + "epoch": 0.5892487939352171, + "grad_norm": 1.9542814493179321, + "learning_rate": 4.4453702034956785e-08, + "logits/chosen": -2.9857747554779053, + "logits/rejected": -2.959059000015259, + "logps/chosen": -53.07960891723633, + "logps/rejected": -53.60251998901367, + "loss": 0.6859, + "rewards/accuracies": 0.640625, + "rewards/chosen": -0.019093209877610207, + "rewards/margins": 0.015096555463969707, + "rewards/rejected": -0.03418976441025734, + "step": 1710 + }, + { + "epoch": 0.5926946933149552, + "grad_norm": 1.7722985744476318, + "learning_rate": 4.435890429434197e-08, + "logits/chosen": -3.0411362648010254, + "logits/rejected": -3.0229859352111816, + "logps/chosen": -54.2125129699707, + "logps/rejected": -57.09526824951172, + "loss": 0.6878, + "rewards/accuracies": 0.590624988079071, + "rewards/chosen": -0.022556353360414505, + "rewards/margins": 0.011233055964112282, + "rewards/rejected": -0.033789411187171936, + "step": 1720 + }, + { + "epoch": 0.5961405926946933, + "grad_norm": 1.6309845447540283, + "learning_rate": 4.426340616498437e-08, + "logits/chosen": -3.139572858810425, + "logits/rejected": -3.112715005874634, + "logps/chosen": -57.57489013671875, + "logps/rejected": -57.2770881652832, + "loss": 0.6874, + "rewards/accuracies": 0.6031249761581421, + "rewards/chosen": -0.017954563722014427, + "rewards/margins": 0.012080615386366844, + "rewards/rejected": -0.03003517910838127, + "step": 1730 + }, + { + "epoch": 0.5995864920744314, + "grad_norm": 1.7834956645965576, + "learning_rate": 4.416721110192535e-08, + "logits/chosen": -3.0707812309265137, + "logits/rejected": -3.0469467639923096, + "logps/chosen": -57.28666305541992, + "logps/rejected": -56.38450241088867, + "loss": 0.6862, + "rewards/accuracies": 0.621874988079071, + "rewards/chosen": -0.018242638558149338, + "rewards/margins": 0.014507819898426533, + "rewards/rejected": -0.032750457525253296, + "step": 1740 + }, + { + "epoch": 0.6030323914541695, + "grad_norm": 1.556868314743042, + "learning_rate": 4.407032258542071e-08, + "logits/chosen": -3.015918731689453, + "logits/rejected": -2.9973530769348145, + "logps/chosen": -55.209632873535156, + "logps/rejected": -58.076988220214844, + "loss": 0.6864, + "rewards/accuracies": 0.628125011920929, + "rewards/chosen": -0.020735980942845345, + "rewards/margins": 0.0141109898686409, + "rewards/rejected": -0.03484697267413139, + "step": 1750 + }, + { + "epoch": 0.6064782908339077, + "grad_norm": 1.7012434005737305, + "learning_rate": 4.3972744120814834e-08, + "logits/chosen": -2.9489564895629883, + "logits/rejected": -2.926241874694824, + "logps/chosen": -56.891990661621094, + "logps/rejected": -55.18810272216797, + "loss": 0.6877, + "rewards/accuracies": 0.6031249761581421, + "rewards/chosen": -0.024412229657173157, + "rewards/margins": 0.011419257149100304, + "rewards/rejected": -0.03583148494362831, + "step": 1760 + }, + { + "epoch": 0.6099241902136457, + "grad_norm": 1.8544518947601318, + "learning_rate": 4.387447923841383e-08, + "logits/chosen": -3.044332504272461, + "logits/rejected": -3.0102765560150146, + "logps/chosen": -56.91345977783203, + "logps/rejected": -55.81806182861328, + "loss": 0.6855, + "rewards/accuracies": 0.6156250238418579, + "rewards/chosen": -0.019783537834882736, + "rewards/margins": 0.015991825610399246, + "rewards/rejected": -0.03577536344528198, + "step": 1770 + }, + { + "epoch": 0.6133700895933839, + "grad_norm": 1.6657090187072754, + "learning_rate": 4.377553149335783e-08, + "logits/chosen": -3.0116004943847656, + "logits/rejected": -2.992893934249878, + "logps/chosen": -56.234764099121094, + "logps/rejected": -56.17246627807617, + "loss": 0.6879, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.025032171979546547, + "rewards/margins": 0.01119538675993681, + "rewards/rejected": -0.03622755408287048, + "step": 1780 + }, + { + "epoch": 0.616815988973122, + "grad_norm": 1.7814066410064697, + "learning_rate": 4.367590446549234e-08, + "logits/chosen": -3.0905067920684814, + "logits/rejected": -3.0614664554595947, + "logps/chosen": -59.580650329589844, + "logps/rejected": -59.031532287597656, + "loss": 0.6853, + "rewards/accuracies": 0.628125011920929, + "rewards/chosen": -0.020469004288315773, + "rewards/margins": 0.016377722844481468, + "rewards/rejected": -0.03684672713279724, + "step": 1790 + }, + { + "epoch": 0.6202618883528601, + "grad_norm": 1.8220956325531006, + "learning_rate": 4.357560175923876e-08, + "logits/chosen": -3.0348381996154785, + "logits/rejected": -3.0145459175109863, + "logps/chosen": -57.11140060424805, + "logps/rejected": -56.62044143676758, + "loss": 0.6872, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.025753721594810486, + "rewards/margins": 0.012716737575829029, + "rewards/rejected": -0.03847045823931694, + "step": 1800 + }, + { + "epoch": 0.6202618883528601, + "eval_logits/chosen": -3.126986503601074, + "eval_logits/rejected": -3.1213557720184326, + "eval_logps/chosen": -59.06035232543945, + "eval_logps/rejected": -64.03250122070312, + "eval_loss": 0.6907546520233154, + "eval_rewards/accuracies": 0.5838754773139954, + "eval_rewards/chosen": -0.0034845659974962473, + "eval_rewards/margins": 0.005039151292294264, + "eval_rewards/rejected": -0.00852371659129858, + "eval_runtime": 384.947, + "eval_samples_per_second": 11.181, + "eval_steps_per_second": 1.398, + "step": 1800 + }, + { + "epoch": 0.6237077877325982, + "grad_norm": 1.7491681575775146, + "learning_rate": 4.347462700346395e-08, + "logits/chosen": -2.9711337089538574, + "logits/rejected": -2.9505839347839355, + "logps/chosen": -55.86274337768555, + "logps/rejected": -55.249229431152344, + "loss": 0.6867, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.023622069507837296, + "rewards/margins": 0.01367507316172123, + "rewards/rejected": -0.037297140806913376, + "step": 1810 + }, + { + "epoch": 0.6271536871123363, + "grad_norm": 1.7659626007080078, + "learning_rate": 4.337298385134896e-08, + "logits/chosen": -3.0822091102600098, + "logits/rejected": -3.0714335441589355, + "logps/chosen": -55.31257247924805, + "logps/rejected": -59.754554748535156, + "loss": 0.6855, + "rewards/accuracies": 0.6468750238418579, + "rewards/chosen": -0.02220662496984005, + "rewards/margins": 0.016015606001019478, + "rewards/rejected": -0.03822223097085953, + "step": 1820 + }, + { + "epoch": 0.6305995864920745, + "grad_norm": 1.7817442417144775, + "learning_rate": 4.327067598025686e-08, + "logits/chosen": -3.037313938140869, + "logits/rejected": -3.025007963180542, + "logps/chosen": -54.705711364746094, + "logps/rejected": -57.494056701660156, + "loss": 0.6874, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.022355765104293823, + "rewards/margins": 0.012223458848893642, + "rewards/rejected": -0.03457922488451004, + "step": 1830 + }, + { + "epoch": 0.6340454858718125, + "grad_norm": 1.644735336303711, + "learning_rate": 4.316770709159966e-08, + "logits/chosen": -3.0322089195251465, + "logits/rejected": -2.9977164268493652, + "logps/chosen": -56.084678649902344, + "logps/rejected": -55.208106994628906, + "loss": 0.6857, + "rewards/accuracies": 0.6031249761581421, + "rewards/chosen": -0.023211777210235596, + "rewards/margins": 0.015626708045601845, + "rewards/rejected": -0.03883848711848259, + "step": 1840 + }, + { + "epoch": 0.6374913852515507, + "grad_norm": 1.698492169380188, + "learning_rate": 4.306408091070445e-08, + "logits/chosen": -2.997825860977173, + "logits/rejected": -2.988842010498047, + "logps/chosen": -57.037139892578125, + "logps/rejected": -63.068214416503906, + "loss": 0.6854, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.021915119141340256, + "rewards/margins": 0.016361277550458908, + "rewards/rejected": -0.038276396691799164, + "step": 1850 + }, + { + "epoch": 0.6409372846312887, + "grad_norm": 1.78380286693573, + "learning_rate": 4.29598011866786e-08, + "logits/chosen": -3.060973882675171, + "logits/rejected": -3.0355865955352783, + "logps/chosen": -55.15557861328125, + "logps/rejected": -56.25408172607422, + "loss": 0.6847, + "rewards/accuracies": 0.659375011920929, + "rewards/chosen": -0.02272849902510643, + "rewards/margins": 0.01771937869489193, + "rewards/rejected": -0.04044787958264351, + "step": 1860 + }, + { + "epoch": 0.6443831840110269, + "grad_norm": 1.8728792667388916, + "learning_rate": 4.285487169227408e-08, + "logits/chosen": -3.0278170108795166, + "logits/rejected": -3.004077911376953, + "logps/chosen": -56.48118209838867, + "logps/rejected": -57.5565299987793, + "loss": 0.6861, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.025048470124602318, + "rewards/margins": 0.014941403642296791, + "rewards/rejected": -0.03998987004160881, + "step": 1870 + }, + { + "epoch": 0.647829083390765, + "grad_norm": 1.72579026222229, + "learning_rate": 4.2749296223751055e-08, + "logits/chosen": -3.076904535293579, + "logits/rejected": -3.061049699783325, + "logps/chosen": -57.126136779785156, + "logps/rejected": -58.63282012939453, + "loss": 0.6859, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.025299783796072006, + "rewards/margins": 0.015334163792431355, + "rewards/rejected": -0.040633946657180786, + "step": 1880 + }, + { + "epoch": 0.6512749827705031, + "grad_norm": 1.6683820486068726, + "learning_rate": 4.264307860074045e-08, + "logits/chosen": -3.045011281967163, + "logits/rejected": -3.015040159225464, + "logps/chosen": -56.37493896484375, + "logps/rejected": -55.854759216308594, + "loss": 0.6869, + "rewards/accuracies": 0.596875011920929, + "rewards/chosen": -0.025647884234786034, + "rewards/margins": 0.013144433498382568, + "rewards/rejected": -0.03879231959581375, + "step": 1890 + }, + { + "epoch": 0.6547208821502413, + "grad_norm": 2.010503053665161, + "learning_rate": 4.253622266610579e-08, + "logits/chosen": -3.0306594371795654, + "logits/rejected": -2.999948501586914, + "logps/chosen": -58.246925354003906, + "logps/rejected": -55.802650451660156, + "loss": 0.6862, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.024062279611825943, + "rewards/margins": 0.01467285118997097, + "rewards/rejected": -0.03873513266444206, + "step": 1900 + }, + { + "epoch": 0.6547208821502413, + "eval_logits/chosen": -3.121394157409668, + "eval_logits/rejected": -3.1157350540161133, + "eval_logps/chosen": -59.24892807006836, + "eval_logps/rejected": -64.2826156616211, + "eval_loss": 0.6904721260070801, + "eval_rewards/accuracies": 0.580157995223999, + "eval_rewards/chosen": -0.005370323546230793, + "eval_rewards/margins": 0.005654662381857634, + "eval_rewards/rejected": -0.011024984531104565, + "eval_runtime": 384.523, + "eval_samples_per_second": 11.193, + "eval_steps_per_second": 1.399, + "step": 1900 + }, + { + "epoch": 0.6581667815299793, + "grad_norm": 1.7444661855697632, + "learning_rate": 4.24287322858042e-08, + "logits/chosen": -3.049037456512451, + "logits/rejected": -3.018188953399658, + "logps/chosen": -58.074134826660156, + "logps/rejected": -56.562660217285156, + "loss": 0.6851, + "rewards/accuracies": 0.640625, + "rewards/chosen": -0.022909775376319885, + "rewards/margins": 0.016958903521299362, + "rewards/rejected": -0.03986867889761925, + "step": 1910 + }, + { + "epoch": 0.6616126809097175, + "grad_norm": 1.6985474824905396, + "learning_rate": 4.2320611348746484e-08, + "logits/chosen": -3.040519952774048, + "logits/rejected": -3.0109729766845703, + "logps/chosen": -58.26678466796875, + "logps/rejected": -59.101173400878906, + "loss": 0.6835, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.01885036565363407, + "rewards/margins": 0.020201385021209717, + "rewards/rejected": -0.03905175253748894, + "step": 1920 + }, + { + "epoch": 0.6650585802894555, + "grad_norm": 1.9158495664596558, + "learning_rate": 4.221186376665648e-08, + "logits/chosen": -3.1348938941955566, + "logits/rejected": -3.1139190196990967, + "logps/chosen": -58.493896484375, + "logps/rejected": -57.275177001953125, + "loss": 0.6878, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.027531281113624573, + "rewards/margins": 0.011677147820591927, + "rewards/rejected": -0.03920843079686165, + "step": 1930 + }, + { + "epoch": 0.6685044796691937, + "grad_norm": 1.7860701084136963, + "learning_rate": 4.210249347392949e-08, + "logits/chosen": -3.0448033809661865, + "logits/rejected": -3.034738063812256, + "logps/chosen": -57.56203079223633, + "logps/rejected": -58.446510314941406, + "loss": 0.6877, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.026961099356412888, + "rewards/margins": 0.011725891381502151, + "rewards/rejected": -0.03868699073791504, + "step": 1940 + }, + { + "epoch": 0.6719503790489317, + "grad_norm": 1.7874598503112793, + "learning_rate": 4.199250442748998e-08, + "logits/chosen": -3.0932488441467285, + "logits/rejected": -3.051062822341919, + "logps/chosen": -57.90545654296875, + "logps/rejected": -55.56101608276367, + "loss": 0.6838, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.02397969365119934, + "rewards/margins": 0.01958349347114563, + "rewards/rejected": -0.04356318712234497, + "step": 1950 + }, + { + "epoch": 0.6753962784286699, + "grad_norm": 1.737123966217041, + "learning_rate": 4.188190060664839e-08, + "logits/chosen": -3.0911624431610107, + "logits/rejected": -3.0598087310791016, + "logps/chosen": -59.625099182128906, + "logps/rejected": -57.037322998046875, + "loss": 0.6835, + "rewards/accuracies": 0.659375011920929, + "rewards/chosen": -0.02469709701836109, + "rewards/margins": 0.02040681801736355, + "rewards/rejected": -0.04510391876101494, + "step": 1960 + }, + { + "epoch": 0.6788421778084079, + "grad_norm": 1.9258623123168945, + "learning_rate": 4.1770686012957165e-08, + "logits/chosen": -3.038626194000244, + "logits/rejected": -3.026581287384033, + "logps/chosen": -54.969688415527344, + "logps/rejected": -57.46106719970703, + "loss": 0.6855, + "rewards/accuracies": 0.6031249761581421, + "rewards/chosen": -0.025907838717103004, + "rewards/margins": 0.016301840543746948, + "rewards/rejected": -0.0422096848487854, + "step": 1970 + }, + { + "epoch": 0.6822880771881461, + "grad_norm": 1.9813158512115479, + "learning_rate": 4.1658864670066e-08, + "logits/chosen": -3.0048956871032715, + "logits/rejected": -2.9818878173828125, + "logps/chosen": -56.708648681640625, + "logps/rejected": -59.611244201660156, + "loss": 0.6845, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.026203909888863564, + "rewards/margins": 0.018200259655714035, + "rewards/rejected": -0.04440417140722275, + "step": 1980 + }, + { + "epoch": 0.6857339765678843, + "grad_norm": 1.7088649272918701, + "learning_rate": 4.154644062357629e-08, + "logits/chosen": -3.008385419845581, + "logits/rejected": -2.9863734245300293, + "logps/chosen": -56.71128463745117, + "logps/rejected": -57.5838737487793, + "loss": 0.6862, + "rewards/accuracies": 0.6343749761581421, + "rewards/chosen": -0.030050843954086304, + "rewards/margins": 0.014781134203076363, + "rewards/rejected": -0.044831980019807816, + "step": 1990 + }, + { + "epoch": 0.6891798759476223, + "grad_norm": 1.862519383430481, + "learning_rate": 4.143341794089469e-08, + "logits/chosen": -3.094057559967041, + "logits/rejected": -3.0693211555480957, + "logps/chosen": -58.01648712158203, + "logps/rejected": -58.354042053222656, + "loss": 0.6859, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.028060251846909523, + "rewards/margins": 0.015334056690335274, + "rewards/rejected": -0.0433943085372448, + "step": 2000 + }, + { + "epoch": 0.6891798759476223, + "eval_logits/chosen": -3.117588758468628, + "eval_logits/rejected": -3.111891746520996, + "eval_logps/chosen": -59.51368713378906, + "eval_logps/rejected": -64.59818267822266, + "eval_loss": 0.6902511119842529, + "eval_rewards/accuracies": 0.5868958830833435, + "eval_rewards/chosen": -0.008017915301024914, + "eval_rewards/margins": 0.006162704434245825, + "eval_rewards/rejected": -0.014180620200932026, + "eval_runtime": 385.0437, + "eval_samples_per_second": 11.178, + "eval_steps_per_second": 1.397, + "step": 2000 + }, + { + "epoch": 0.6926257753273605, + "grad_norm": 1.9090850353240967, + "learning_rate": 4.1319800711086036e-08, + "logits/chosen": -3.015761375427246, + "logits/rejected": -3.00416898727417, + "logps/chosen": -53.12788772583008, + "logps/rejected": -57.91374588012695, + "loss": 0.6845, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.03025425598025322, + "rewards/margins": 0.018167123198509216, + "rewards/rejected": -0.048421382904052734, + "step": 2010 + }, + { + "epoch": 0.6960716747070985, + "grad_norm": 1.8756160736083984, + "learning_rate": 4.120559304472536e-08, + "logits/chosen": -3.0625293254852295, + "logits/rejected": -3.0455217361450195, + "logps/chosen": -58.8310546875, + "logps/rejected": -59.229408264160156, + "loss": 0.6856, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.02707960642874241, + "rewards/margins": 0.01600963994860649, + "rewards/rejected": -0.04308924451470375, + "step": 2020 + }, + { + "epoch": 0.6995175740868367, + "grad_norm": 1.7497738599777222, + "learning_rate": 4.10907990737492e-08, + "logits/chosen": -3.0139236450195312, + "logits/rejected": -2.9887921810150146, + "logps/chosen": -56.769683837890625, + "logps/rejected": -57.542869567871094, + "loss": 0.6849, + "rewards/accuracies": 0.6156250238418579, + "rewards/chosen": -0.03307128697633743, + "rewards/margins": 0.017539430409669876, + "rewards/rejected": -0.05061071366071701, + "step": 2030 + }, + { + "epoch": 0.7029634734665747, + "grad_norm": 1.9670817852020264, + "learning_rate": 4.0975422951306095e-08, + "logits/chosen": -3.007533311843872, + "logits/rejected": -2.992617607116699, + "logps/chosen": -56.2066535949707, + "logps/rejected": -58.416419982910156, + "loss": 0.6856, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.03340793401002884, + "rewards/margins": 0.01625545509159565, + "rewards/rejected": -0.04966338723897934, + "step": 2040 + }, + { + "epoch": 0.7064093728463129, + "grad_norm": 1.6379413604736328, + "learning_rate": 4.08594688516063e-08, + "logits/chosen": -3.0516562461853027, + "logits/rejected": -3.021713972091675, + "logps/chosen": -56.81854248046875, + "logps/rejected": -56.809295654296875, + "loss": 0.6824, + "rewards/accuracies": 0.6468750238418579, + "rewards/chosen": -0.026581842452287674, + "rewards/margins": 0.02265850640833378, + "rewards/rejected": -0.0492403544485569, + "step": 2050 + }, + { + "epoch": 0.709855272226051, + "grad_norm": 1.8194429874420166, + "learning_rate": 4.0742940969770864e-08, + "logits/chosen": -2.977799892425537, + "logits/rejected": -2.9734408855438232, + "logps/chosen": -56.75453567504883, + "logps/rejected": -58.05543899536133, + "loss": 0.6878, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.03740880638360977, + "rewards/margins": 0.011692319065332413, + "rewards/rejected": -0.049101125448942184, + "step": 2060 + }, + { + "epoch": 0.7133011716057891, + "grad_norm": 1.7173527479171753, + "learning_rate": 4.062584352167971e-08, + "logits/chosen": -3.059565782546997, + "logits/rejected": -3.033684492111206, + "logps/chosen": -57.686187744140625, + "logps/rejected": -56.419921875, + "loss": 0.6843, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.029393959790468216, + "rewards/margins": 0.018624670803546906, + "rewards/rejected": -0.04801863431930542, + "step": 2070 + }, + { + "epoch": 0.7167470709855273, + "grad_norm": 1.6900782585144043, + "learning_rate": 4.0508180743819255e-08, + "logits/chosen": -3.028625965118408, + "logits/rejected": -2.9993066787719727, + "logps/chosen": -58.17266082763672, + "logps/rejected": -55.96406173706055, + "loss": 0.6833, + "rewards/accuracies": 0.6343749761581421, + "rewards/chosen": -0.03412729874253273, + "rewards/margins": 0.020834842696785927, + "rewards/rejected": -0.05496213957667351, + "step": 2080 + }, + { + "epoch": 0.7201929703652653, + "grad_norm": 1.733019471168518, + "learning_rate": 4.038995689312901e-08, + "logits/chosen": -3.0275211334228516, + "logits/rejected": -3.0161385536193848, + "logps/chosen": -56.534019470214844, + "logps/rejected": -59.515838623046875, + "loss": 0.6866, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.0316321887075901, + "rewards/margins": 0.014154776930809021, + "rewards/rejected": -0.04578696936368942, + "step": 2090 + }, + { + "epoch": 0.7236388697450035, + "grad_norm": 1.9503872394561768, + "learning_rate": 4.027117624684765e-08, + "logits/chosen": -3.033446788787842, + "logits/rejected": -3.018099784851074, + "logps/chosen": -56.406578063964844, + "logps/rejected": -57.39359664916992, + "loss": 0.6846, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.030192747712135315, + "rewards/margins": 0.018196851015090942, + "rewards/rejected": -0.048389606177806854, + "step": 2100 + }, + { + "epoch": 0.7236388697450035, + "eval_logits/chosen": -3.11161208152771, + "eval_logits/rejected": -3.1059489250183105, + "eval_logps/chosen": -59.78423309326172, + "eval_logps/rejected": -64.94279479980469, + "eval_loss": 0.6899173259735107, + "eval_rewards/accuracies": 0.5829461216926575, + "eval_rewards/chosen": -0.01072339154779911, + "eval_rewards/margins": 0.006903324741870165, + "eval_rewards/rejected": -0.017626715824007988, + "eval_runtime": 384.4459, + "eval_samples_per_second": 11.195, + "eval_steps_per_second": 1.399, + "step": 2100 + }, + { + "epoch": 0.7270847691247415, + "grad_norm": 1.7966265678405762, + "learning_rate": 4.0151843102358255e-08, + "logits/chosen": -2.9671010971069336, + "logits/rejected": -2.9459662437438965, + "logps/chosen": -57.2274284362793, + "logps/rejected": -58.16614532470703, + "loss": 0.6839, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.03469712659716606, + "rewards/margins": 0.019563738256692886, + "rewards/rejected": -0.05426086112856865, + "step": 2110 + }, + { + "epoch": 0.7305306685044797, + "grad_norm": 1.7504278421401978, + "learning_rate": 4.0031961777032796e-08, + "logits/chosen": -3.030264377593994, + "logits/rejected": -3.007645606994629, + "logps/chosen": -58.34819412231445, + "logps/rejected": -60.5435905456543, + "loss": 0.6828, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.03323102742433548, + "rewards/margins": 0.021936681121587753, + "rewards/rejected": -0.055167704820632935, + "step": 2120 + }, + { + "epoch": 0.7339765678842178, + "grad_norm": 1.8038078546524048, + "learning_rate": 3.991153660807599e-08, + "logits/chosen": -2.9835686683654785, + "logits/rejected": -2.9676289558410645, + "logps/chosen": -58.33893966674805, + "logps/rejected": -58.84006881713867, + "loss": 0.6862, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.03550464287400246, + "rewards/margins": 0.01528511755168438, + "rewards/rejected": -0.050789762288331985, + "step": 2130 + }, + { + "epoch": 0.7374224672639559, + "grad_norm": 1.7348674535751343, + "learning_rate": 3.979057195236834e-08, + "logits/chosen": -2.9713988304138184, + "logits/rejected": -2.945173740386963, + "logps/chosen": -54.82487869262695, + "logps/rejected": -56.58161544799805, + "loss": 0.6831, + "rewards/accuracies": 0.640625, + "rewards/chosen": -0.03161891549825668, + "rewards/margins": 0.021184608340263367, + "rewards/rejected": -0.05280352383852005, + "step": 2140 + }, + { + "epoch": 0.740868366643694, + "grad_norm": 1.775078296661377, + "learning_rate": 3.9669072186308496e-08, + "logits/chosen": -3.017688035964966, + "logits/rejected": -3.000037670135498, + "logps/chosen": -57.452980041503906, + "logps/rejected": -57.556190490722656, + "loss": 0.6853, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.0355488583445549, + "rewards/margins": 0.016615424305200577, + "rewards/rejected": -0.05216427892446518, + "step": 2150 + }, + { + "epoch": 0.7443142660234321, + "grad_norm": 1.6424776315689087, + "learning_rate": 3.9547041705655e-08, + "logits/chosen": -3.0127129554748535, + "logits/rejected": -2.988957643508911, + "logps/chosen": -59.63279342651367, + "logps/rejected": -58.4204216003418, + "loss": 0.6836, + "rewards/accuracies": 0.6343749761581421, + "rewards/chosen": -0.0328582227230072, + "rewards/margins": 0.020023513585329056, + "rewards/rejected": -0.05288173630833626, + "step": 2160 + }, + { + "epoch": 0.7477601654031703, + "grad_norm": 1.6106654405593872, + "learning_rate": 3.942448492536717e-08, + "logits/chosen": -2.959467649459839, + "logits/rejected": -2.934079647064209, + "logps/chosen": -56.54290771484375, + "logps/rejected": -56.23845672607422, + "loss": 0.6848, + "rewards/accuracies": 0.6156250238418579, + "rewards/chosen": -0.04299347847700119, + "rewards/margins": 0.017936604097485542, + "rewards/rejected": -0.06093008443713188, + "step": 2170 + }, + { + "epoch": 0.7512060647829083, + "grad_norm": 1.8046953678131104, + "learning_rate": 3.930140627944539e-08, + "logits/chosen": -2.9991118907928467, + "logits/rejected": -2.9810397624969482, + "logps/chosen": -55.923851013183594, + "logps/rejected": -57.81981658935547, + "loss": 0.6846, + "rewards/accuracies": 0.628125011920929, + "rewards/chosen": -0.03583117201924324, + "rewards/margins": 0.01813691481947899, + "rewards/rejected": -0.05396808311343193, + "step": 2180 + }, + { + "epoch": 0.7546519641626465, + "grad_norm": 1.6673351526260376, + "learning_rate": 3.9177810220770714e-08, + "logits/chosen": -3.047650098800659, + "logits/rejected": -3.028332233428955, + "logps/chosen": -58.14948272705078, + "logps/rejected": -58.61583709716797, + "loss": 0.6856, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.035184066742658615, + "rewards/margins": 0.016083333641290665, + "rewards/rejected": -0.05126740410923958, + "step": 2190 + }, + { + "epoch": 0.7580978635423845, + "grad_norm": 1.6753615140914917, + "learning_rate": 3.905370122094375e-08, + "logits/chosen": -2.993964672088623, + "logits/rejected": -2.978240966796875, + "logps/chosen": -58.799041748046875, + "logps/rejected": -59.7036247253418, + "loss": 0.6861, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.03728087246417999, + "rewards/margins": 0.015164054930210114, + "rewards/rejected": -0.0524449348449707, + "step": 2200 + }, + { + "epoch": 0.7580978635423845, + "eval_logits/chosen": -3.108147144317627, + "eval_logits/rejected": -3.102468252182007, + "eval_logps/chosen": -60.04545974731445, + "eval_logps/rejected": -65.24906158447266, + "eval_loss": 0.6897270083427429, + "eval_rewards/accuracies": 0.5868958830833435, + "eval_rewards/chosen": -0.013335632160305977, + "eval_rewards/margins": 0.007353761233389378, + "eval_rewards/rejected": -0.02068939432501793, + "eval_runtime": 384.2736, + "eval_samples_per_second": 11.2, + "eval_steps_per_second": 1.4, + "step": 2200 + }, + { + "epoch": 0.7615437629221227, + "grad_norm": 1.6932401657104492, + "learning_rate": 3.892908377012286e-08, + "logits/chosen": -3.028898000717163, + "logits/rejected": -2.9983444213867188, + "logps/chosen": -58.44950485229492, + "logps/rejected": -58.597312927246094, + "loss": 0.6826, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.03595678508281708, + "rewards/margins": 0.02251359447836876, + "rewards/rejected": -0.05847037956118584, + "step": 2210 + }, + { + "epoch": 0.7649896623018608, + "grad_norm": 1.7982257604599, + "learning_rate": 3.8803962376861776e-08, + "logits/chosen": -3.013913869857788, + "logits/rejected": -2.982335329055786, + "logps/chosen": -60.489967346191406, + "logps/rejected": -58.84846115112305, + "loss": 0.6848, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.038976140320301056, + "rewards/margins": 0.017788967117667198, + "rewards/rejected": -0.0567651093006134, + "step": 2220 + }, + { + "epoch": 0.7684355616815989, + "grad_norm": 1.8083417415618896, + "learning_rate": 3.86783415679464e-08, + "logits/chosen": -2.9765384197235107, + "logits/rejected": -2.966383457183838, + "logps/chosen": -58.33076858520508, + "logps/rejected": -59.06047439575195, + "loss": 0.6861, + "rewards/accuracies": 0.590624988079071, + "rewards/chosen": -0.041071441024541855, + "rewards/margins": 0.015299441292881966, + "rewards/rejected": -0.05637087672948837, + "step": 2230 + }, + { + "epoch": 0.771881461061337, + "grad_norm": 1.6782116889953613, + "learning_rate": 3.8552225888231084e-08, + "logits/chosen": -3.083024740219116, + "logits/rejected": -3.064627170562744, + "logps/chosen": -57.59839630126953, + "logps/rejected": -57.254180908203125, + "loss": 0.6843, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.041320037096738815, + "rewards/margins": 0.018795475363731384, + "rewards/rejected": -0.0601155161857605, + "step": 2240 + }, + { + "epoch": 0.7753273604410751, + "grad_norm": 1.9075140953063965, + "learning_rate": 3.842561990047419e-08, + "logits/chosen": -2.994036912918091, + "logits/rejected": -2.97294282913208, + "logps/chosen": -57.764801025390625, + "logps/rejected": -57.16424560546875, + "loss": 0.685, + "rewards/accuracies": 0.590624988079071, + "rewards/chosen": -0.04026733711361885, + "rewards/margins": 0.017371635884046555, + "rewards/rejected": -0.05763896554708481, + "step": 2250 + }, + { + "epoch": 0.7787732598208132, + "grad_norm": 1.7869060039520264, + "learning_rate": 3.829852818517301e-08, + "logits/chosen": -3.080289363861084, + "logits/rejected": -3.0555710792541504, + "logps/chosen": -57.22008514404297, + "logps/rejected": -58.78937530517578, + "loss": 0.685, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.04204849153757095, + "rewards/margins": 0.017353584989905357, + "rewards/rejected": -0.05940207093954086, + "step": 2260 + }, + { + "epoch": 0.7822191592005513, + "grad_norm": 1.9243491888046265, + "learning_rate": 3.8170955340398024e-08, + "logits/chosen": -2.994227886199951, + "logits/rejected": -2.9837756156921387, + "logps/chosen": -57.66072463989258, + "logps/rejected": -58.144371032714844, + "loss": 0.6861, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.04407629370689392, + "rewards/margins": 0.015283094719052315, + "rewards/rejected": -0.05935938283801079, + "step": 2270 + }, + { + "epoch": 0.7856650585802895, + "grad_norm": 1.8666582107543945, + "learning_rate": 3.804290598162661e-08, + "logits/chosen": -3.0024993419647217, + "logits/rejected": -2.971998691558838, + "logps/chosen": -59.71482467651367, + "logps/rejected": -59.344825744628906, + "loss": 0.6804, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.03427065163850784, + "rewards/margins": 0.02677471563220024, + "rewards/rejected": -0.06104537099599838, + "step": 2280 + }, + { + "epoch": 0.7891109579600276, + "grad_norm": 1.7960370779037476, + "learning_rate": 3.7914384741575963e-08, + "logits/chosen": -2.9649546146392822, + "logits/rejected": -2.9389636516571045, + "logps/chosen": -56.56310272216797, + "logps/rejected": -57.679107666015625, + "loss": 0.6835, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.03888117894530296, + "rewards/margins": 0.020335400477051735, + "rewards/rejected": -0.05921658128499985, + "step": 2290 + }, + { + "epoch": 0.7925568573397657, + "grad_norm": 1.7282792329788208, + "learning_rate": 3.778539627003561e-08, + "logits/chosen": -2.988661289215088, + "logits/rejected": -2.9646308422088623, + "logps/chosen": -58.56208038330078, + "logps/rejected": -59.764549255371094, + "loss": 0.6836, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.044373877346515656, + "rewards/margins": 0.02048449032008648, + "rewards/rejected": -0.06485836952924728, + "step": 2300 + }, + { + "epoch": 0.7925568573397657, + "eval_logits/chosen": -3.1044154167175293, + "eval_logits/rejected": -3.098724603652954, + "eval_logps/chosen": -60.39040756225586, + "eval_logps/rejected": -65.65296936035156, + "eval_loss": 0.6894731521606445, + "eval_rewards/accuracies": 0.5922397971153259, + "eval_rewards/chosen": -0.016785062849521637, + "eval_rewards/margins": 0.007943346165120602, + "eval_rewards/rejected": -0.024728409945964813, + "eval_runtime": 384.4947, + "eval_samples_per_second": 11.194, + "eval_steps_per_second": 1.399, + "step": 2300 + }, + { + "epoch": 0.7960027567195038, + "grad_norm": 1.946254849433899, + "learning_rate": 3.7655945233699046e-08, + "logits/chosen": -2.975891590118408, + "logits/rejected": -2.958909749984741, + "logps/chosen": -57.987144470214844, + "logps/rejected": -59.5714111328125, + "loss": 0.6849, + "rewards/accuracies": 0.590624988079071, + "rewards/chosen": -0.045588064938783646, + "rewards/margins": 0.017828360199928284, + "rewards/rejected": -0.06341642886400223, + "step": 2310 + }, + { + "epoch": 0.7994486560992419, + "grad_norm": 1.8851513862609863, + "learning_rate": 3.7526036315995024e-08, + "logits/chosen": -3.035238742828369, + "logits/rejected": -3.0107016563415527, + "logps/chosen": -60.47370147705078, + "logps/rejected": -62.362586975097656, + "loss": 0.6846, + "rewards/accuracies": 0.6343749761581421, + "rewards/chosen": -0.04454559087753296, + "rewards/margins": 0.018349364399909973, + "rewards/rejected": -0.06289495527744293, + "step": 2320 + }, + { + "epoch": 0.80289455547898, + "grad_norm": 1.8436781167984009, + "learning_rate": 3.739567421691803e-08, + "logits/chosen": -3.0294172763824463, + "logits/rejected": -3.0034923553466797, + "logps/chosen": -58.335426330566406, + "logps/rejected": -58.9324836730957, + "loss": 0.6823, + "rewards/accuracies": 0.653124988079071, + "rewards/chosen": -0.04106391221284866, + "rewards/margins": 0.02297310158610344, + "rewards/rejected": -0.0640370100736618, + "step": 2330 + }, + { + "epoch": 0.8063404548587181, + "grad_norm": 1.6702615022659302, + "learning_rate": 3.726486365285828e-08, + "logits/chosen": -3.010782241821289, + "logits/rejected": -2.9824962615966797, + "logps/chosen": -56.69477462768555, + "logps/rejected": -56.83466339111328, + "loss": 0.6828, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.04235810041427612, + "rewards/margins": 0.02205110900104046, + "rewards/rejected": -0.06440921127796173, + "step": 2340 + }, + { + "epoch": 0.8097863542384562, + "grad_norm": 1.8457934856414795, + "learning_rate": 3.713360935643105e-08, + "logits/chosen": -3.0336556434631348, + "logits/rejected": -3.0187900066375732, + "logps/chosen": -58.77387237548828, + "logps/rejected": -61.80967330932617, + "loss": 0.6849, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.050251416862010956, + "rewards/margins": 0.017895471304655075, + "rewards/rejected": -0.06814688444137573, + "step": 2350 + }, + { + "epoch": 0.8132322536181944, + "grad_norm": 1.8453093767166138, + "learning_rate": 3.7001916076305515e-08, + "logits/chosen": -3.008915662765503, + "logits/rejected": -2.974138021469116, + "logps/chosen": -61.52025604248047, + "logps/rejected": -59.27776336669922, + "loss": 0.6804, + "rewards/accuracies": 0.621874988079071, + "rewards/chosen": -0.04145478457212448, + "rewards/margins": 0.027239182963967323, + "rewards/rejected": -0.06869396567344666, + "step": 2360 + }, + { + "epoch": 0.8166781529979324, + "grad_norm": 2.0200343132019043, + "learning_rate": 3.686978857703287e-08, + "logits/chosen": -2.981760263442993, + "logits/rejected": -2.9633803367614746, + "logps/chosen": -58.65894317626953, + "logps/rejected": -58.9062385559082, + "loss": 0.6848, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.04565044492483139, + "rewards/margins": 0.0179426372051239, + "rewards/rejected": -0.06359308958053589, + "step": 2370 + }, + { + "epoch": 0.8201240523776706, + "grad_norm": 2.0217244625091553, + "learning_rate": 3.6737231638874e-08, + "logits/chosen": -2.9946258068084717, + "logits/rejected": -2.9741220474243164, + "logps/chosen": -58.064208984375, + "logps/rejected": -59.4661979675293, + "loss": 0.6825, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.041321393102407455, + "rewards/margins": 0.022583045065402985, + "rewards/rejected": -0.06390444189310074, + "step": 2380 + }, + { + "epoch": 0.8235699517574087, + "grad_norm": 1.9615968465805054, + "learning_rate": 3.660425005762656e-08, + "logits/chosen": -2.9946742057800293, + "logits/rejected": -2.9747567176818848, + "logps/chosen": -60.14471435546875, + "logps/rejected": -61.221946716308594, + "loss": 0.6836, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.04614607244729996, + "rewards/margins": 0.020358974114060402, + "rewards/rejected": -0.0665050521492958, + "step": 2390 + }, + { + "epoch": 0.8270158511371468, + "grad_norm": 1.947713017463684, + "learning_rate": 3.647084864445137e-08, + "logits/chosen": -2.9955036640167236, + "logits/rejected": -2.97472882270813, + "logps/chosen": -59.67714309692383, + "logps/rejected": -58.160728454589844, + "loss": 0.6847, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0478370264172554, + "rewards/margins": 0.018191467970609665, + "rewards/rejected": -0.06602849066257477, + "step": 2400 + }, + { + "epoch": 0.8270158511371468, + "eval_logits/chosen": -3.100661516189575, + "eval_logits/rejected": -3.0949013233184814, + "eval_logps/chosen": -60.806888580322266, + "eval_logps/rejected": -66.14020538330078, + "eval_loss": 0.6891672611236572, + "eval_rewards/accuracies": 0.5868958830833435, + "eval_rewards/chosen": -0.020949942991137505, + "eval_rewards/margins": 0.00865084771066904, + "eval_rewards/rejected": -0.02960078790783882, + "eval_runtime": 384.3481, + "eval_samples_per_second": 11.198, + "eval_steps_per_second": 1.4, + "step": 2400 + }, + { + "epoch": 0.8304617505168849, + "grad_norm": 1.8975228071212769, + "learning_rate": 3.633703222569846e-08, + "logits/chosen": -2.9866557121276855, + "logits/rejected": -2.9641973972320557, + "logps/chosen": -58.975730895996094, + "logps/rejected": -59.30794143676758, + "loss": 0.6819, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.047889094799757004, + "rewards/margins": 0.02423194609582424, + "rewards/rejected": -0.0721210390329361, + "step": 2410 + }, + { + "epoch": 0.833907649896623, + "grad_norm": 2.121293783187866, + "learning_rate": 3.620280564273241e-08, + "logits/chosen": -2.993213176727295, + "logits/rejected": -2.966607093811035, + "logps/chosen": -59.87786865234375, + "logps/rejected": -59.63630294799805, + "loss": 0.6834, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.05004245787858963, + "rewards/margins": 0.021116720512509346, + "rewards/rejected": -0.07115916907787323, + "step": 2420 + }, + { + "epoch": 0.8373535492763611, + "grad_norm": 1.939035415649414, + "learning_rate": 3.606817375175716e-08, + "logits/chosen": -3.0517094135284424, + "logits/rejected": -3.0259761810302734, + "logps/chosen": -61.3172492980957, + "logps/rejected": -59.26446533203125, + "loss": 0.6831, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.05021647736430168, + "rewards/margins": 0.0214863158762455, + "rewards/rejected": -0.07170280069112778, + "step": 2430 + }, + { + "epoch": 0.8407994486560992, + "grad_norm": 1.7287660837173462, + "learning_rate": 3.5933141423640376e-08, + "logits/chosen": -3.0139081478118896, + "logits/rejected": -2.9831302165985107, + "logps/chosen": -60.192726135253906, + "logps/rejected": -58.50376510620117, + "loss": 0.681, + "rewards/accuracies": 0.640625, + "rewards/chosen": -0.04867525398731232, + "rewards/margins": 0.02579745091497898, + "rewards/rejected": -0.07447270303964615, + "step": 2440 + }, + { + "epoch": 0.8442453480358374, + "grad_norm": 1.7438111305236816, + "learning_rate": 3.579771354373721e-08, + "logits/chosen": -2.996495008468628, + "logits/rejected": -2.968963861465454, + "logps/chosen": -59.237281799316406, + "logps/rejected": -59.39753341674805, + "loss": 0.6812, + "rewards/accuracies": 0.6343749761581421, + "rewards/chosen": -0.04925781860947609, + "rewards/margins": 0.025830138474702835, + "rewards/rejected": -0.07508794963359833, + "step": 2450 + }, + { + "epoch": 0.8476912474155754, + "grad_norm": 1.944701910018921, + "learning_rate": 3.5661895011713494e-08, + "logits/chosen": -3.036695718765259, + "logits/rejected": -3.0117411613464355, + "logps/chosen": -60.16267013549805, + "logps/rejected": -60.866302490234375, + "loss": 0.6798, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.05073127895593643, + "rewards/margins": 0.02843235433101654, + "rewards/rejected": -0.07916363328695297, + "step": 2460 + }, + { + "epoch": 0.8511371467953136, + "grad_norm": 2.0044193267822266, + "learning_rate": 3.552569074136858e-08, + "logits/chosen": -3.09183931350708, + "logits/rejected": -3.071075916290283, + "logps/chosen": -60.33824920654297, + "logps/rejected": -63.61500930786133, + "loss": 0.6829, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.058057595044374466, + "rewards/margins": 0.021942198276519775, + "rewards/rejected": -0.07999978959560394, + "step": 2470 + }, + { + "epoch": 0.8545830461750517, + "grad_norm": 1.7847065925598145, + "learning_rate": 3.5389105660457474e-08, + "logits/chosen": -2.915327787399292, + "logits/rejected": -2.9085865020751953, + "logps/chosen": -58.845794677734375, + "logps/rejected": -61.94459915161133, + "loss": 0.687, + "rewards/accuracies": 0.590624988079071, + "rewards/chosen": -0.053052984178066254, + "rewards/margins": 0.013705862686038017, + "rewards/rejected": -0.06675885617733002, + "step": 2480 + }, + { + "epoch": 0.8580289455547898, + "grad_norm": 1.920035719871521, + "learning_rate": 3.525214471051258e-08, + "logits/chosen": -2.9986464977264404, + "logits/rejected": -2.990365982055664, + "logps/chosen": -57.812095642089844, + "logps/rejected": -60.44374465942383, + "loss": 0.687, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.05696609616279602, + "rewards/margins": 0.013766164891421795, + "rewards/rejected": -0.07073226571083069, + "step": 2490 + }, + { + "epoch": 0.8614748449345279, + "grad_norm": 1.8735101222991943, + "learning_rate": 3.511481284666496e-08, + "logits/chosen": -2.9600300788879395, + "logits/rejected": -2.9459688663482666, + "logps/chosen": -58.956642150878906, + "logps/rejected": -60.98919677734375, + "loss": 0.6838, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.051624685525894165, + "rewards/margins": 0.020304836332798004, + "rewards/rejected": -0.07192952930927277, + "step": 2500 + }, + { + "epoch": 0.8614748449345279, + "eval_logits/chosen": -3.0967652797698975, + "eval_logits/rejected": -3.0910022258758545, + "eval_logps/chosen": -61.215702056884766, + "eval_logps/rejected": -66.6113052368164, + "eval_loss": 0.6889049410820007, + "eval_rewards/accuracies": 0.5903810262680054, + "eval_rewards/chosen": -0.025038031861186028, + "eval_rewards/margins": 0.009273835457861423, + "eval_rewards/rejected": -0.03431186452507973, + "eval_runtime": 384.2297, + "eval_samples_per_second": 11.202, + "eval_steps_per_second": 1.4, + "step": 2500 + }, + { + "epoch": 0.864920744314266, + "grad_norm": 1.9118610620498657, + "learning_rate": 3.4977115037464985e-08, + "logits/chosen": -2.9711413383483887, + "logits/rejected": -2.944815158843994, + "logps/chosen": -58.86348342895508, + "logps/rejected": -59.43571090698242, + "loss": 0.6827, + "rewards/accuracies": 0.6031249761581421, + "rewards/chosen": -0.05588380619883537, + "rewards/margins": 0.022921234369277954, + "rewards/rejected": -0.07880503684282303, + "step": 2510 + }, + { + "epoch": 0.8683666436940042, + "grad_norm": 1.8457341194152832, + "learning_rate": 3.483905626470265e-08, + "logits/chosen": -3.032449960708618, + "logits/rejected": -3.0067577362060547, + "logps/chosen": -60.454444885253906, + "logps/rejected": -59.48230743408203, + "loss": 0.6854, + "rewards/accuracies": 0.5718749761581421, + "rewards/chosen": -0.05567127466201782, + "rewards/margins": 0.017124414443969727, + "rewards/rejected": -0.07279568165540695, + "step": 2520 + }, + { + "epoch": 0.8718125430737422, + "grad_norm": 1.744009256362915, + "learning_rate": 3.470064152322728e-08, + "logits/chosen": -2.9465372562408447, + "logits/rejected": -2.9316067695617676, + "logps/chosen": -58.74321746826172, + "logps/rejected": -61.12950897216797, + "loss": 0.6837, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.05705111101269722, + "rewards/margins": 0.020534086972475052, + "rewards/rejected": -0.07758519053459167, + "step": 2530 + }, + { + "epoch": 0.8752584424534804, + "grad_norm": 1.977340579032898, + "learning_rate": 3.4561875820766864e-08, + "logits/chosen": -3.083641290664673, + "logits/rejected": -3.051438808441162, + "logps/chosen": -60.03351593017578, + "logps/rejected": -57.312896728515625, + "loss": 0.6795, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.04907160997390747, + "rewards/margins": 0.028961068019270897, + "rewards/rejected": -0.07803267240524292, + "step": 2540 + }, + { + "epoch": 0.8787043418332184, + "grad_norm": 1.8939472436904907, + "learning_rate": 3.442276417774684e-08, + "logits/chosen": -2.963914394378662, + "logits/rejected": -2.930088520050049, + "logps/chosen": -60.17865753173828, + "logps/rejected": -60.11580276489258, + "loss": 0.6806, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.054998576641082764, + "rewards/margins": 0.027044588699936867, + "rewards/rejected": -0.08204315602779388, + "step": 2550 + }, + { + "epoch": 0.8821502412129566, + "grad_norm": 1.9534904956817627, + "learning_rate": 3.4283311627108525e-08, + "logits/chosen": -3.0542969703674316, + "logits/rejected": -3.020608901977539, + "logps/chosen": -61.627174377441406, + "logps/rejected": -59.780677795410156, + "loss": 0.6841, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.05754048749804497, + "rewards/margins": 0.019636686891317368, + "rewards/rejected": -0.07717718183994293, + "step": 2560 + }, + { + "epoch": 0.8855961405926946, + "grad_norm": 1.8603425025939941, + "learning_rate": 3.4143523214126946e-08, + "logits/chosen": -3.0562100410461426, + "logits/rejected": -3.0208911895751953, + "logps/chosen": -60.71092987060547, + "logps/rejected": -59.271934509277344, + "loss": 0.6801, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.05018734186887741, + "rewards/margins": 0.02779127098619938, + "rewards/rejected": -0.07797860354185104, + "step": 2570 + }, + { + "epoch": 0.8890420399724328, + "grad_norm": 1.9357118606567383, + "learning_rate": 3.4003403996228354e-08, + "logits/chosen": -3.037440538406372, + "logits/rejected": -3.014925718307495, + "logps/chosen": -58.04810333251953, + "logps/rejected": -59.7235107421875, + "loss": 0.6814, + "rewards/accuracies": 0.640625, + "rewards/chosen": -0.05774794891476631, + "rewards/margins": 0.02505926415324211, + "rewards/rejected": -0.08280721306800842, + "step": 2580 + }, + { + "epoch": 0.892487939352171, + "grad_norm": 1.9867465496063232, + "learning_rate": 3.386295904280725e-08, + "logits/chosen": -3.059483289718628, + "logits/rejected": -3.0427792072296143, + "logps/chosen": -60.69512176513672, + "logps/rejected": -60.93449783325195, + "loss": 0.682, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.053453318774700165, + "rewards/margins": 0.024111952632665634, + "rewards/rejected": -0.0775652676820755, + "step": 2590 + }, + { + "epoch": 0.895933838731909, + "grad_norm": 1.870638132095337, + "learning_rate": 3.3722193435042965e-08, + "logits/chosen": -3.014085054397583, + "logits/rejected": -2.9972643852233887, + "logps/chosen": -58.396324157714844, + "logps/rejected": -61.89160919189453, + "loss": 0.6841, + "rewards/accuracies": 0.6031249761581421, + "rewards/chosen": -0.05936922878026962, + "rewards/margins": 0.01977996714413166, + "rewards/rejected": -0.07914920151233673, + "step": 2600 + }, + { + "epoch": 0.895933838731909, + "eval_logits/chosen": -3.0933353900909424, + "eval_logits/rejected": -3.087651014328003, + "eval_logps/chosen": -61.549625396728516, + "eval_logps/rejected": -67.02259826660156, + "eval_loss": 0.6885599493980408, + "eval_rewards/accuracies": 0.5954925417900085, + "eval_rewards/chosen": -0.028377274051308632, + "eval_rewards/margins": 0.010047496296465397, + "eval_rewards/rejected": -0.038424767553806305, + "eval_runtime": 384.4537, + "eval_samples_per_second": 11.195, + "eval_steps_per_second": 1.399, + "step": 2600 + }, + { + "epoch": 0.8993797381116472, + "grad_norm": 2.0609378814697266, + "learning_rate": 3.358111226571583e-08, + "logits/chosen": -3.0247185230255127, + "logits/rejected": -3.0110678672790527, + "logps/chosen": -57.71208572387695, + "logps/rejected": -60.53589630126953, + "loss": 0.6841, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.055869292467832565, + "rewards/margins": 0.019581874832510948, + "rewards/rejected": -0.07545118033885956, + "step": 2610 + }, + { + "epoch": 0.9028256374913852, + "grad_norm": 2.041883945465088, + "learning_rate": 3.3439720639022914e-08, + "logits/chosen": -2.97845458984375, + "logits/rejected": -2.961709499359131, + "logps/chosen": -59.412628173828125, + "logps/rejected": -63.06536865234375, + "loss": 0.6794, + "rewards/accuracies": 0.6343749761581421, + "rewards/chosen": -0.05394721031188965, + "rewards/margins": 0.029407376423478127, + "rewards/rejected": -0.08335459232330322, + "step": 2620 + }, + { + "epoch": 0.9062715368711234, + "grad_norm": 2.0693163871765137, + "learning_rate": 3.32980236703934e-08, + "logits/chosen": -2.970226764678955, + "logits/rejected": -2.9422554969787598, + "logps/chosen": -59.864227294921875, + "logps/rejected": -59.469871520996094, + "loss": 0.6825, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.059826843440532684, + "rewards/margins": 0.02292069047689438, + "rewards/rejected": -0.08274753391742706, + "step": 2630 + }, + { + "epoch": 0.9097174362508614, + "grad_norm": 1.9728206396102905, + "learning_rate": 3.3156026486303463e-08, + "logits/chosen": -3.065502882003784, + "logits/rejected": -3.0424623489379883, + "logps/chosen": -59.48903274536133, + "logps/rejected": -59.970909118652344, + "loss": 0.6814, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.06527196615934372, + "rewards/margins": 0.02535632625222206, + "rewards/rejected": -0.09062829613685608, + "step": 2640 + }, + { + "epoch": 0.9131633356305996, + "grad_norm": 1.9187884330749512, + "learning_rate": 3.301373422409082e-08, + "logits/chosen": -2.965153217315674, + "logits/rejected": -2.9357123374938965, + "logps/chosen": -60.7358283996582, + "logps/rejected": -61.5562744140625, + "loss": 0.6806, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.05258868262171745, + "rewards/margins": 0.02703956887125969, + "rewards/rejected": -0.07962825149297714, + "step": 2650 + }, + { + "epoch": 0.9166092350103378, + "grad_norm": 1.8104689121246338, + "learning_rate": 3.287115203176887e-08, + "logits/chosen": -3.0565733909606934, + "logits/rejected": -3.023632287979126, + "logps/chosen": -63.6373176574707, + "logps/rejected": -59.726165771484375, + "loss": 0.681, + "rewards/accuracies": 0.6343749761581421, + "rewards/chosen": -0.05793512985110283, + "rewards/margins": 0.02604956366121769, + "rewards/rejected": -0.08398470282554626, + "step": 2660 + }, + { + "epoch": 0.9200551343900758, + "grad_norm": 2.0160820484161377, + "learning_rate": 3.2728285067840426e-08, + "logits/chosen": -2.970146417617798, + "logits/rejected": -2.945462942123413, + "logps/chosen": -59.9456901550293, + "logps/rejected": -60.949256896972656, + "loss": 0.683, + "rewards/accuracies": 0.590624988079071, + "rewards/chosen": -0.06337870657444, + "rewards/margins": 0.022111540660262108, + "rewards/rejected": -0.08549024909734726, + "step": 2670 + }, + { + "epoch": 0.923501033769814, + "grad_norm": 1.902436375617981, + "learning_rate": 3.258513850111112e-08, + "logits/chosen": -3.000199556350708, + "logits/rejected": -2.9784107208251953, + "logps/chosen": -59.2768669128418, + "logps/rejected": -62.97795867919922, + "loss": 0.6821, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.06348595023155212, + "rewards/margins": 0.024028928950428963, + "rewards/rejected": -0.08751488476991653, + "step": 2680 + }, + { + "epoch": 0.926946933149552, + "grad_norm": 1.916286826133728, + "learning_rate": 3.244171751050235e-08, + "logits/chosen": -3.0135703086853027, + "logits/rejected": -2.984884023666382, + "logps/chosen": -60.307334899902344, + "logps/rejected": -60.89008712768555, + "loss": 0.6804, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.056459106504917145, + "rewards/margins": 0.027269680052995682, + "rewards/rejected": -0.08372878283262253, + "step": 2690 + }, + { + "epoch": 0.9303928325292902, + "grad_norm": 1.8760229349136353, + "learning_rate": 3.229802728486395e-08, + "logits/chosen": -2.998236894607544, + "logits/rejected": -2.9683120250701904, + "logps/chosen": -61.063201904296875, + "logps/rejected": -61.303009033203125, + "loss": 0.6824, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.06226380914449692, + "rewards/margins": 0.023240935057401657, + "rewards/rejected": -0.08550475537776947, + "step": 2700 + }, + { + "epoch": 0.9303928325292902, + "eval_logits/chosen": -3.089698076248169, + "eval_logits/rejected": -3.0839478969573975, + "eval_logps/chosen": -61.91862869262695, + "eval_logps/rejected": -67.45933532714844, + "eval_loss": 0.6882630586624146, + "eval_rewards/accuracies": 0.5855018496513367, + "eval_rewards/chosen": -0.032067302614450455, + "eval_rewards/margins": 0.010724782943725586, + "eval_rewards/rejected": -0.04279208183288574, + "eval_runtime": 384.3637, + "eval_samples_per_second": 11.198, + "eval_steps_per_second": 1.4, + "step": 2700 + }, + { + "epoch": 0.9338387319090282, + "grad_norm": 1.9357118606567383, + "learning_rate": 3.215407302278644e-08, + "logits/chosen": -3.050286054611206, + "logits/rejected": -3.033205032348633, + "logps/chosen": -61.737022399902344, + "logps/rejected": -62.07782745361328, + "loss": 0.686, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.06558212637901306, + "rewards/margins": 0.015997232869267464, + "rewards/rejected": -0.08157936483621597, + "step": 2710 + }, + { + "epoch": 0.9372846312887664, + "grad_norm": 1.9162743091583252, + "learning_rate": 3.200985993241298e-08, + "logits/chosen": -3.0029079914093018, + "logits/rejected": -2.9888548851013184, + "logps/chosen": -61.6423454284668, + "logps/rejected": -62.909019470214844, + "loss": 0.6807, + "rewards/accuracies": 0.621874988079071, + "rewards/chosen": -0.06742598116397858, + "rewards/margins": 0.026688963174819946, + "rewards/rejected": -0.09411494433879852, + "step": 2720 + }, + { + "epoch": 0.9407305306685044, + "grad_norm": 1.9952343702316284, + "learning_rate": 3.1865393231250884e-08, + "logits/chosen": -2.979914426803589, + "logits/rejected": -2.9662184715270996, + "logps/chosen": -61.29094314575195, + "logps/rejected": -61.263267517089844, + "loss": 0.6844, + "rewards/accuracies": 0.590624988079071, + "rewards/chosen": -0.06579684466123581, + "rewards/margins": 0.019393552094697952, + "rewards/rejected": -0.08519040048122406, + "step": 2730 + }, + { + "epoch": 0.9441764300482426, + "grad_norm": 2.008897304534912, + "learning_rate": 3.172067814598291e-08, + "logits/chosen": -2.989284038543701, + "logits/rejected": -2.966811180114746, + "logps/chosen": -58.96021270751953, + "logps/rejected": -60.29884719848633, + "loss": 0.682, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.06809289008378983, + "rewards/margins": 0.024256303906440735, + "rewards/rejected": -0.09234919399023056, + "step": 2740 + }, + { + "epoch": 0.9476223294279807, + "grad_norm": 1.9547452926635742, + "learning_rate": 3.1575719912278146e-08, + "logits/chosen": -3.0216493606567383, + "logits/rejected": -2.99959659576416, + "logps/chosen": -61.20268630981445, + "logps/rejected": -62.02368927001953, + "loss": 0.6837, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.06514692306518555, + "rewards/margins": 0.020700866356492043, + "rewards/rejected": -0.08584778755903244, + "step": 2750 + }, + { + "epoch": 0.9510682288077188, + "grad_norm": 2.0816023349761963, + "learning_rate": 3.143052377460257e-08, + "logits/chosen": -2.964078903198242, + "logits/rejected": -2.9428696632385254, + "logps/chosen": -60.8036994934082, + "logps/rejected": -60.589134216308594, + "loss": 0.6832, + "rewards/accuracies": 0.628125011920929, + "rewards/chosen": -0.06277766823768616, + "rewards/margins": 0.021733291447162628, + "rewards/rejected": -0.08451096713542938, + "step": 2760 + }, + { + "epoch": 0.954514128187457, + "grad_norm": 1.8665035963058472, + "learning_rate": 3.128509498602933e-08, + "logits/chosen": -3.0003015995025635, + "logits/rejected": -2.9855659008026123, + "logps/chosen": -62.09807586669922, + "logps/rejected": -63.236656188964844, + "loss": 0.6822, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.05971125513315201, + "rewards/margins": 0.023781852796673775, + "rewards/rejected": -0.08349311351776123, + "step": 2770 + }, + { + "epoch": 0.957960027567195, + "grad_norm": 2.0942790508270264, + "learning_rate": 3.113943880804867e-08, + "logits/chosen": -2.926558017730713, + "logits/rejected": -2.9052395820617676, + "logps/chosen": -59.053306579589844, + "logps/rejected": -60.78388214111328, + "loss": 0.6833, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.06801018118858337, + "rewards/margins": 0.02199479192495346, + "rewards/rejected": -0.09000497311353683, + "step": 2780 + }, + { + "epoch": 0.9614059269469332, + "grad_norm": 1.8221739530563354, + "learning_rate": 3.0993560510377636e-08, + "logits/chosen": -2.9689197540283203, + "logits/rejected": -2.9574332237243652, + "logps/chosen": -58.8261833190918, + "logps/rejected": -64.0365982055664, + "loss": 0.6853, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.06904148310422897, + "rewards/margins": 0.017710790038108826, + "rewards/rejected": -0.0867522731423378, + "step": 2790 + }, + { + "epoch": 0.9648518263266712, + "grad_norm": 1.8329358100891113, + "learning_rate": 3.084746537076932e-08, + "logits/chosen": -3.058472156524658, + "logits/rejected": -3.035177707672119, + "logps/chosen": -60.95489501953125, + "logps/rejected": -64.85383605957031, + "loss": 0.6824, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.06287367641925812, + "rewards/margins": 0.023363064974546432, + "rewards/rejected": -0.08623673766851425, + "step": 2800 + }, + { + "epoch": 0.9648518263266712, + "eval_logits/chosen": -3.0867879390716553, + "eval_logits/rejected": -3.081066608428955, + "eval_logps/chosen": -62.05662536621094, + "eval_logps/rejected": -67.6514663696289, + "eval_loss": 0.688012421131134, + "eval_rewards/accuracies": 0.5929368138313293, + "eval_rewards/chosen": -0.03344728797674179, + "eval_rewards/margins": 0.0112661337479949, + "eval_rewards/rejected": -0.044713422656059265, + "eval_runtime": 384.6351, + "eval_samples_per_second": 11.19, + "eval_steps_per_second": 1.399, + "step": 2800 + }, + { + "epoch": 0.9682977257064094, + "grad_norm": 2.0032522678375244, + "learning_rate": 3.070115867482202e-08, + "logits/chosen": -3.013986825942993, + "logits/rejected": -2.993983507156372, + "logps/chosen": -60.495140075683594, + "logps/rejected": -62.9716796875, + "loss": 0.6808, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.06785044074058533, + "rewards/margins": 0.026449289172887802, + "rewards/rejected": -0.09429973363876343, + "step": 2810 + }, + { + "epoch": 0.9717436250861475, + "grad_norm": 2.1149818897247314, + "learning_rate": 3.0554645715787926e-08, + "logits/chosen": -3.0158185958862305, + "logits/rejected": -2.9941751956939697, + "logps/chosen": -60.9375, + "logps/rejected": -61.80029296875, + "loss": 0.6832, + "rewards/accuracies": 0.628125011920929, + "rewards/chosen": -0.06010693311691284, + "rewards/margins": 0.021654192358255386, + "rewards/rejected": -0.08176112920045853, + "step": 2820 + }, + { + "epoch": 0.9751895244658856, + "grad_norm": 1.9543670415878296, + "learning_rate": 3.040793179438167e-08, + "logits/chosen": -2.915527582168579, + "logits/rejected": -2.910946846008301, + "logps/chosen": -58.91196823120117, + "logps/rejected": -60.60822677612305, + "loss": 0.6839, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.06974777579307556, + "rewards/margins": 0.020312650129199028, + "rewards/rejected": -0.09006042778491974, + "step": 2830 + }, + { + "epoch": 0.9786354238456237, + "grad_norm": 1.9782330989837646, + "learning_rate": 3.026102221858853e-08, + "logits/chosen": -2.9936022758483887, + "logits/rejected": -2.96232271194458, + "logps/chosen": -60.83803176879883, + "logps/rejected": -61.680091857910156, + "loss": 0.6801, + "rewards/accuracies": 0.653124988079071, + "rewards/chosen": -0.0662757009267807, + "rewards/margins": 0.028109584003686905, + "rewards/rejected": -0.09438528120517731, + "step": 2840 + }, + { + "epoch": 0.9820813232253618, + "grad_norm": 1.9992231130599976, + "learning_rate": 3.0113922303472386e-08, + "logits/chosen": -2.9486582279205322, + "logits/rejected": -2.924018383026123, + "logps/chosen": -64.09626770019531, + "logps/rejected": -61.55986404418945, + "loss": 0.6797, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.062040697783231735, + "rewards/margins": 0.028691541403532028, + "rewards/rejected": -0.09073223918676376, + "step": 2850 + }, + { + "epoch": 0.9855272226051, + "grad_norm": 2.026111125946045, + "learning_rate": 2.9966637370983444e-08, + "logits/chosen": -3.0231735706329346, + "logits/rejected": -2.996793031692505, + "logps/chosen": -60.3083381652832, + "logps/rejected": -61.34482955932617, + "loss": 0.6773, + "rewards/accuracies": 0.640625, + "rewards/chosen": -0.06745734810829163, + "rewards/margins": 0.03389859199523926, + "rewards/rejected": -0.10135593265295029, + "step": 2860 + }, + { + "epoch": 0.988973121984838, + "grad_norm": 1.9893444776535034, + "learning_rate": 2.981917274976568e-08, + "logits/chosen": -3.01432728767395, + "logits/rejected": -2.9927713871002197, + "logps/chosen": -62.2581787109375, + "logps/rejected": -62.59688186645508, + "loss": 0.6808, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.06791607290506363, + "rewards/margins": 0.026418615132570267, + "rewards/rejected": -0.094334676861763, + "step": 2870 + }, + { + "epoch": 0.9924190213645762, + "grad_norm": 2.100264072418213, + "learning_rate": 2.967153377496405e-08, + "logits/chosen": -3.0361812114715576, + "logits/rejected": -3.0079872608184814, + "logps/chosen": -63.25773239135742, + "logps/rejected": -62.428985595703125, + "loss": 0.6802, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.06720717996358871, + "rewards/margins": 0.027882525697350502, + "rewards/rejected": -0.09508970379829407, + "step": 2880 + }, + { + "epoch": 0.9958649207443143, + "grad_norm": 2.0951766967773438, + "learning_rate": 2.9523725788031473e-08, + "logits/chosen": -2.951953649520874, + "logits/rejected": -2.9324944019317627, + "logps/chosen": -58.438323974609375, + "logps/rejected": -62.59917068481445, + "loss": 0.6824, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.06279212236404419, + "rewards/margins": 0.023123985156416893, + "rewards/rejected": -0.08591610938310623, + "step": 2890 + }, + { + "epoch": 0.9993108201240524, + "grad_norm": 1.9077363014221191, + "learning_rate": 2.9375754136535602e-08, + "logits/chosen": -2.9489340782165527, + "logits/rejected": -2.917557954788208, + "logps/chosen": -57.6536979675293, + "logps/rejected": -62.20427703857422, + "loss": 0.6812, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.06541205942630768, + "rewards/margins": 0.026028072461485863, + "rewards/rejected": -0.09144014120101929, + "step": 2900 + }, + { + "epoch": 0.9993108201240524, + "eval_logits/chosen": -3.0832176208496094, + "eval_logits/rejected": -3.077537775039673, + "eval_logps/chosen": -62.34251022338867, + "eval_logps/rejected": -67.98897552490234, + "eval_loss": 0.6877877116203308, + "eval_rewards/accuracies": 0.5906133651733398, + "eval_rewards/chosen": -0.03630611672997475, + "eval_rewards/margins": 0.01178241241723299, + "eval_rewards/rejected": -0.04808852821588516, + "eval_runtime": 384.1302, + "eval_samples_per_second": 11.205, + "eval_steps_per_second": 1.401, + "step": 2900 + }, + { + "epoch": 1.0027567195037905, + "grad_norm": 1.9044243097305298, + "learning_rate": 2.922762417396531e-08, + "logits/chosen": -3.0532944202423096, + "logits/rejected": -3.0282721519470215, + "logps/chosen": -60.56848907470703, + "logps/rejected": -61.425506591796875, + "loss": 0.6794, + "rewards/accuracies": 0.640625, + "rewards/chosen": -0.0701994001865387, + "rewards/margins": 0.029416104778647423, + "rewards/rejected": -0.09961550682783127, + "step": 2910 + }, + { + "epoch": 1.0062026188835287, + "grad_norm": 1.9171464443206787, + "learning_rate": 2.9079341259537044e-08, + "logits/chosen": -2.955644130706787, + "logits/rejected": -2.9289050102233887, + "logps/chosen": -59.532318115234375, + "logps/rejected": -63.27251434326172, + "loss": 0.6794, + "rewards/accuracies": 0.621874988079071, + "rewards/chosen": -0.06590018421411514, + "rewards/margins": 0.029649043455719948, + "rewards/rejected": -0.09554923325777054, + "step": 2920 + }, + { + "epoch": 1.0096485182632666, + "grad_norm": 2.3012123107910156, + "learning_rate": 2.893091075800092e-08, + "logits/chosen": -2.9719414710998535, + "logits/rejected": -2.956596851348877, + "logps/chosen": -58.37969207763672, + "logps/rejected": -63.62909698486328, + "loss": 0.6805, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.07035626471042633, + "rewards/margins": 0.027267420664429665, + "rewards/rejected": -0.09762369096279144, + "step": 2930 + }, + { + "epoch": 1.0130944176430048, + "grad_norm": 1.9921698570251465, + "learning_rate": 2.878233803944663e-08, + "logits/chosen": -2.975639820098877, + "logits/rejected": -2.9587063789367676, + "logps/chosen": -61.81868362426758, + "logps/rejected": -63.246192932128906, + "loss": 0.6802, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.06357287615537643, + "rewards/margins": 0.02810371294617653, + "rewards/rejected": -0.09167659282684326, + "step": 2940 + }, + { + "epoch": 1.016540317022743, + "grad_norm": 1.9301649332046509, + "learning_rate": 2.863362847910914e-08, + "logits/chosen": -2.9926259517669678, + "logits/rejected": -2.975491762161255, + "logps/chosen": -61.97704315185547, + "logps/rejected": -65.73519897460938, + "loss": 0.6805, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.06561344116926193, + "rewards/margins": 0.027345973998308182, + "rewards/rejected": -0.09295941889286041, + "step": 2950 + }, + { + "epoch": 1.019986216402481, + "grad_norm": 2.173574209213257, + "learning_rate": 2.8484787457174276e-08, + "logits/chosen": -2.967440128326416, + "logits/rejected": -2.9601893424987793, + "logps/chosen": -58.916534423828125, + "logps/rejected": -64.43171691894531, + "loss": 0.6847, + "rewards/accuracies": 0.590624988079071, + "rewards/chosen": -0.06918451935052872, + "rewards/margins": 0.01901223696768284, + "rewards/rejected": -0.0881967544555664, + "step": 2960 + }, + { + "epoch": 1.0234321157822193, + "grad_norm": 1.9122604131698608, + "learning_rate": 2.833582035858399e-08, + "logits/chosen": -2.9992122650146484, + "logits/rejected": -2.9755702018737793, + "logps/chosen": -59.261016845703125, + "logps/rejected": -62.033790588378906, + "loss": 0.6796, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.07028323411941528, + "rewards/margins": 0.0288868248462677, + "rewards/rejected": -0.09917005896568298, + "step": 2970 + }, + { + "epoch": 1.0268780151619572, + "grad_norm": 1.9919573068618774, + "learning_rate": 2.81867325728416e-08, + "logits/chosen": -2.8979835510253906, + "logits/rejected": -2.870373487472534, + "logps/chosen": -62.300819396972656, + "logps/rejected": -61.94474411010742, + "loss": 0.6802, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.06286562979221344, + "rewards/margins": 0.027720922604203224, + "rewards/rejected": -0.09058655053377151, + "step": 2980 + }, + { + "epoch": 1.0303239145416954, + "grad_norm": 1.9404138326644897, + "learning_rate": 2.8037529493816785e-08, + "logits/chosen": -2.9238524436950684, + "logits/rejected": -2.914903402328491, + "logps/chosen": -59.53535842895508, + "logps/rejected": -62.575408935546875, + "loss": 0.6838, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.07301677763462067, + "rewards/margins": 0.02059059776365757, + "rewards/rejected": -0.09360738098621368, + "step": 2990 + }, + { + "epoch": 1.0337698139214335, + "grad_norm": 1.7792117595672607, + "learning_rate": 2.788821651955044e-08, + "logits/chosen": -2.9866509437561035, + "logits/rejected": -2.9607677459716797, + "logps/chosen": -60.698631286621094, + "logps/rejected": -61.67456817626953, + "loss": 0.6819, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.07110203802585602, + "rewards/margins": 0.024299288168549538, + "rewards/rejected": -0.09540131688117981, + "step": 3000 + }, + { + "epoch": 1.0337698139214335, + "eval_logits/chosen": -3.0797338485717773, + "eval_logits/rejected": -3.074019193649292, + "eval_logps/chosen": -62.44396209716797, + "eval_logps/rejected": -68.11664581298828, + "eval_loss": 0.6876720786094666, + "eval_rewards/accuracies": 0.5931691527366638, + "eval_rewards/chosen": -0.03732062131166458, + "eval_rewards/margins": 0.012044590897858143, + "eval_rewards/rejected": -0.04936521500349045, + "eval_runtime": 384.3196, + "eval_samples_per_second": 11.199, + "eval_steps_per_second": 1.4, + "step": 3000 + }, + { + "epoch": 1.0372157133011717, + "grad_norm": 1.8993059396743774, + "learning_rate": 2.773879905205936e-08, + "logits/chosen": -3.061885356903076, + "logits/rejected": -3.0365428924560547, + "logps/chosen": -60.7781867980957, + "logps/rejected": -60.112342834472656, + "loss": 0.6823, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.07066932320594788, + "rewards/margins": 0.023926690220832825, + "rewards/rejected": -0.0945960059762001, + "step": 3010 + }, + { + "epoch": 1.0406616126809096, + "grad_norm": 1.7303686141967773, + "learning_rate": 2.7589282497140826e-08, + "logits/chosen": -2.90051531791687, + "logits/rejected": -2.8854193687438965, + "logps/chosen": -59.404823303222656, + "logps/rejected": -60.72951126098633, + "loss": 0.6837, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.07631709426641464, + "rewards/margins": 0.021014804020524025, + "rewards/rejected": -0.09733189642429352, + "step": 3020 + }, + { + "epoch": 1.0441075120606478, + "grad_norm": 1.7988367080688477, + "learning_rate": 2.7439672264177017e-08, + "logits/chosen": -2.9338347911834717, + "logits/rejected": -2.9088456630706787, + "logps/chosen": -63.385589599609375, + "logps/rejected": -62.660133361816406, + "loss": 0.6798, + "rewards/accuracies": 0.653124988079071, + "rewards/chosen": -0.06967137008905411, + "rewards/margins": 0.02861020527780056, + "rewards/rejected": -0.09828157722949982, + "step": 3030 + }, + { + "epoch": 1.047553411440386, + "grad_norm": 2.00886869430542, + "learning_rate": 2.7289973765939316e-08, + "logits/chosen": -2.961655378341675, + "logits/rejected": -2.9402718544006348, + "logps/chosen": -63.45885467529297, + "logps/rejected": -62.679222106933594, + "loss": 0.6788, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.06949891895055771, + "rewards/margins": 0.030794551596045494, + "rewards/rejected": -0.10029347240924835, + "step": 3040 + }, + { + "epoch": 1.050999310820124, + "grad_norm": 1.9969176054000854, + "learning_rate": 2.7140192418392456e-08, + "logits/chosen": -2.998516082763672, + "logits/rejected": -2.9880592823028564, + "logps/chosen": -60.910118103027344, + "logps/rejected": -62.53899002075195, + "loss": 0.6787, + "rewards/accuracies": 0.640625, + "rewards/chosen": -0.06814457476139069, + "rewards/margins": 0.031134705990552902, + "rewards/rejected": -0.09927927702665329, + "step": 3050 + }, + { + "epoch": 1.0544452101998623, + "grad_norm": 2.0115458965301514, + "learning_rate": 2.699033364049858e-08, + "logits/chosen": -3.0406007766723633, + "logits/rejected": -3.008848190307617, + "logps/chosen": -59.562950134277344, + "logps/rejected": -63.31853103637695, + "loss": 0.6779, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.06552625447511673, + "rewards/margins": 0.03282874822616577, + "rewards/rejected": -0.0983550101518631, + "step": 3060 + }, + { + "epoch": 1.0578911095796002, + "grad_norm": 2.114454746246338, + "learning_rate": 2.684040285402122e-08, + "logits/chosen": -3.008307933807373, + "logits/rejected": -2.9894957542419434, + "logps/chosen": -61.14850997924805, + "logps/rejected": -62.975914001464844, + "loss": 0.6824, + "rewards/accuracies": 0.590624988079071, + "rewards/chosen": -0.07413209974765778, + "rewards/margins": 0.023501945659518242, + "rewards/rejected": -0.09763404726982117, + "step": 3070 + }, + { + "epoch": 1.0613370089593384, + "grad_norm": 1.9892925024032593, + "learning_rate": 2.6690405483329103e-08, + "logits/chosen": -2.9078526496887207, + "logits/rejected": -2.895010471343994, + "logps/chosen": -59.090423583984375, + "logps/rejected": -60.913917541503906, + "loss": 0.6813, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07098571211099625, + "rewards/margins": 0.025464143604040146, + "rewards/rejected": -0.09644986689090729, + "step": 3080 + }, + { + "epoch": 1.0647829083390765, + "grad_norm": 2.013815402984619, + "learning_rate": 2.6540346955199894e-08, + "logits/chosen": -3.019172191619873, + "logits/rejected": -2.998669385910034, + "logps/chosen": -65.81825256347656, + "logps/rejected": -64.49800109863281, + "loss": 0.6805, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.07010366022586823, + "rewards/margins": 0.027619188651442528, + "rewards/rejected": -0.0977228507399559, + "step": 3090 + }, + { + "epoch": 1.0682288077188147, + "grad_norm": 1.989579200744629, + "learning_rate": 2.6390232698623925e-08, + "logits/chosen": -2.934821605682373, + "logits/rejected": -2.9127309322357178, + "logps/chosen": -63.19129180908203, + "logps/rejected": -63.9019775390625, + "loss": 0.6796, + "rewards/accuracies": 0.6031249761581421, + "rewards/chosen": -0.06846065074205399, + "rewards/margins": 0.029360279440879822, + "rewards/rejected": -0.09782092273235321, + "step": 3100 + }, + { + "epoch": 1.0682288077188147, + "eval_logits/chosen": -3.075859546661377, + "eval_logits/rejected": -3.070146322250366, + "eval_logps/chosen": -62.62958908081055, + "eval_logps/rejected": -68.35604858398438, + "eval_loss": 0.6874265074729919, + "eval_rewards/accuracies": 0.5987453460693359, + "eval_rewards/chosen": -0.039176926016807556, + "eval_rewards/margins": 0.012582373805344105, + "eval_rewards/rejected": -0.051759302616119385, + "eval_runtime": 384.4384, + "eval_samples_per_second": 11.196, + "eval_steps_per_second": 1.399, + "step": 3100 + }, + { + "epoch": 1.0716747070985528, + "grad_norm": 1.7961196899414062, + "learning_rate": 2.624006814460772e-08, + "logits/chosen": -2.987920045852661, + "logits/rejected": -2.9627695083618164, + "logps/chosen": -60.823204040527344, + "logps/rejected": -62.9400520324707, + "loss": 0.6798, + "rewards/accuracies": 0.6468750238418579, + "rewards/chosen": -0.07341060042381287, + "rewards/margins": 0.02877485193312168, + "rewards/rejected": -0.10218546539545059, + "step": 3110 + }, + { + "epoch": 1.0751206064782908, + "grad_norm": 2.1381261348724365, + "learning_rate": 2.608985872597749e-08, + "logits/chosen": -3.026332139968872, + "logits/rejected": -3.0009701251983643, + "logps/chosen": -64.1605224609375, + "logps/rejected": -65.11061096191406, + "loss": 0.6773, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.06550760567188263, + "rewards/margins": 0.034324102103710175, + "rewards/rejected": -0.09983170032501221, + "step": 3120 + }, + { + "epoch": 1.078566505858029, + "grad_norm": 2.180638313293457, + "learning_rate": 2.5939609877182672e-08, + "logits/chosen": -2.9584217071533203, + "logits/rejected": -2.9367153644561768, + "logps/chosen": -62.10315704345703, + "logps/rejected": -62.0599250793457, + "loss": 0.6811, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.07277052104473114, + "rewards/margins": 0.02645127847790718, + "rewards/rejected": -0.09922181069850922, + "step": 3130 + }, + { + "epoch": 1.082012405237767, + "grad_norm": 2.0120506286621094, + "learning_rate": 2.5789327034099196e-08, + "logits/chosen": -3.004655599594116, + "logits/rejected": -2.991770029067993, + "logps/chosen": -60.61780548095703, + "logps/rejected": -63.46196746826172, + "loss": 0.6799, + "rewards/accuracies": 0.6468750238418579, + "rewards/chosen": -0.06961344182491302, + "rewards/margins": 0.02859930694103241, + "rewards/rejected": -0.09821274131536484, + "step": 3140 + }, + { + "epoch": 1.0854583046175053, + "grad_norm": 1.911136269569397, + "learning_rate": 2.5639015633832895e-08, + "logits/chosen": -2.9630255699157715, + "logits/rejected": -2.9336097240448, + "logps/chosen": -59.48429489135742, + "logps/rejected": -62.6728630065918, + "loss": 0.6797, + "rewards/accuracies": 0.653124988079071, + "rewards/chosen": -0.07576960325241089, + "rewards/margins": 0.028852378949522972, + "rewards/rejected": -0.10462198406457901, + "step": 3150 + }, + { + "epoch": 1.0889042039972432, + "grad_norm": 1.9514598846435547, + "learning_rate": 2.548868111452281e-08, + "logits/chosen": -2.9592223167419434, + "logits/rejected": -2.932917833328247, + "logps/chosen": -60.18004608154297, + "logps/rejected": -62.1012077331543, + "loss": 0.6837, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.07590612769126892, + "rewards/margins": 0.020728668197989464, + "rewards/rejected": -0.09663479030132294, + "step": 3160 + }, + { + "epoch": 1.0923501033769814, + "grad_norm": 2.163417339324951, + "learning_rate": 2.5338328915144336e-08, + "logits/chosen": -2.948763132095337, + "logits/rejected": -2.9275569915771484, + "logps/chosen": -61.674354553222656, + "logps/rejected": -63.544090270996094, + "loss": 0.6789, + "rewards/accuracies": 0.590624988079071, + "rewards/chosen": -0.07235665619373322, + "rewards/margins": 0.030740192160010338, + "rewards/rejected": -0.1030968576669693, + "step": 3170 + }, + { + "epoch": 1.0957960027567195, + "grad_norm": 1.9518697261810303, + "learning_rate": 2.5187964475312597e-08, + "logits/chosen": -2.948805570602417, + "logits/rejected": -2.944000482559204, + "logps/chosen": -61.725982666015625, + "logps/rejected": -65.04237365722656, + "loss": 0.6823, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.07234276086091995, + "rewards/margins": 0.023657139390707016, + "rewards/rejected": -0.09599989652633667, + "step": 3180 + }, + { + "epoch": 1.0992419021364577, + "grad_norm": 2.125180721282959, + "learning_rate": 2.503759323508552e-08, + "logits/chosen": -3.0046706199645996, + "logits/rejected": -2.9987471103668213, + "logps/chosen": -60.51375198364258, + "logps/rejected": -64.95499420166016, + "loss": 0.6847, + "rewards/accuracies": 0.565625011920929, + "rewards/chosen": -0.07574325799942017, + "rewards/margins": 0.0189945288002491, + "rewards/rejected": -0.09473778307437897, + "step": 3190 + }, + { + "epoch": 1.1026878015161956, + "grad_norm": 2.1917178630828857, + "learning_rate": 2.4887220634767067e-08, + "logits/chosen": -3.024988889694214, + "logits/rejected": -2.9950709342956543, + "logps/chosen": -63.498199462890625, + "logps/rejected": -63.7032356262207, + "loss": 0.6776, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.06609619408845901, + "rewards/margins": 0.033227093517780304, + "rewards/rejected": -0.09932328760623932, + "step": 3200 + }, + { + "epoch": 1.1026878015161956, + "eval_logits/chosen": -3.0731794834136963, + "eval_logits/rejected": -3.067418098449707, + "eval_logps/chosen": -62.80425262451172, + "eval_logps/rejected": -68.5819091796875, + "eval_loss": 0.6871995329856873, + "eval_rewards/accuracies": 0.5906133651733398, + "eval_rewards/chosen": -0.04092356562614441, + "eval_rewards/margins": 0.013094342313706875, + "eval_rewards/rejected": -0.05401790514588356, + "eval_runtime": 384.4612, + "eval_samples_per_second": 11.195, + "eval_steps_per_second": 1.399, + "step": 3200 + }, + { + "epoch": 1.1061337008959338, + "grad_norm": 1.9683195352554321, + "learning_rate": 2.4736852114710417e-08, + "logits/chosen": -3.0620484352111816, + "logits/rejected": -3.0399696826934814, + "logps/chosen": -61.793212890625, + "logps/rejected": -62.917564392089844, + "loss": 0.6799, + "rewards/accuracies": 0.6156250238418579, + "rewards/chosen": -0.07215774804353714, + "rewards/margins": 0.028695005923509598, + "rewards/rejected": -0.10085275024175644, + "step": 3210 + }, + { + "epoch": 1.109579600275672, + "grad_norm": 2.0262603759765625, + "learning_rate": 2.458649311512114e-08, + "logits/chosen": -2.94061541557312, + "logits/rejected": -2.924363851547241, + "logps/chosen": -59.32355499267578, + "logps/rejected": -60.30609130859375, + "loss": 0.6839, + "rewards/accuracies": 0.5718749761581421, + "rewards/chosen": -0.07498239725828171, + "rewards/margins": 0.020939212292432785, + "rewards/rejected": -0.09592162072658539, + "step": 3220 + }, + { + "epoch": 1.11302549965541, + "grad_norm": 2.0922205448150635, + "learning_rate": 2.443614907586034e-08, + "logits/chosen": -2.944875717163086, + "logits/rejected": -2.9296464920043945, + "logps/chosen": -61.308441162109375, + "logps/rejected": -65.41650390625, + "loss": 0.6785, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.0720820352435112, + "rewards/margins": 0.03132672980427742, + "rewards/rejected": -0.10340876877307892, + "step": 3230 + }, + { + "epoch": 1.1164713990351482, + "grad_norm": 2.039001941680908, + "learning_rate": 2.4285825436247875e-08, + "logits/chosen": -2.9310574531555176, + "logits/rejected": -2.901254415512085, + "logps/chosen": -61.543861389160156, + "logps/rejected": -61.45790481567383, + "loss": 0.6769, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.06877841800451279, + "rewards/margins": 0.03471370413899422, + "rewards/rejected": -0.1034921258687973, + "step": 3240 + }, + { + "epoch": 1.1199172984148862, + "grad_norm": 2.0376031398773193, + "learning_rate": 2.413552763486558e-08, + "logits/chosen": -3.0727105140686035, + "logits/rejected": -3.0629258155822754, + "logps/chosen": -63.042259216308594, + "logps/rejected": -63.58977508544922, + "loss": 0.684, + "rewards/accuracies": 0.5718749761581421, + "rewards/chosen": -0.08128751814365387, + "rewards/margins": 0.0204035434871912, + "rewards/rejected": -0.10169105231761932, + "step": 3250 + }, + { + "epoch": 1.1233631977946243, + "grad_norm": 2.0699033737182617, + "learning_rate": 2.3985261109360457e-08, + "logits/chosen": -2.9842300415039062, + "logits/rejected": -2.948054313659668, + "logps/chosen": -61.926849365234375, + "logps/rejected": -62.5003662109375, + "loss": 0.6765, + "rewards/accuracies": 0.653124988079071, + "rewards/chosen": -0.0672898143529892, + "rewards/margins": 0.035381607711315155, + "rewards/rejected": -0.10267140716314316, + "step": 3260 + }, + { + "epoch": 1.1268090971743625, + "grad_norm": 2.1659767627716064, + "learning_rate": 2.3835031296247988e-08, + "logits/chosen": -2.9566397666931152, + "logits/rejected": -2.9284496307373047, + "logps/chosen": -62.831298828125, + "logps/rejected": -64.27127838134766, + "loss": 0.6789, + "rewards/accuracies": 0.628125011920929, + "rewards/chosen": -0.0733214020729065, + "rewards/margins": 0.03064587712287903, + "rewards/rejected": -0.10396728664636612, + "step": 3270 + }, + { + "epoch": 1.1302549965541007, + "grad_norm": 1.9470397233963013, + "learning_rate": 2.3684843630715446e-08, + "logits/chosen": -2.9487035274505615, + "logits/rejected": -2.9159140586853027, + "logps/chosen": -61.108551025390625, + "logps/rejected": -62.888671875, + "loss": 0.6753, + "rewards/accuracies": 0.684374988079071, + "rewards/chosen": -0.07333475351333618, + "rewards/margins": 0.037748340517282486, + "rewards/rejected": -0.11108310520648956, + "step": 3280 + }, + { + "epoch": 1.1337008959338388, + "grad_norm": 1.8661203384399414, + "learning_rate": 2.3534703546425203e-08, + "logits/chosen": -3.003861904144287, + "logits/rejected": -2.9693593978881836, + "logps/chosen": -61.7313346862793, + "logps/rejected": -61.66279220581055, + "loss": 0.6798, + "rewards/accuracies": 0.596875011920929, + "rewards/chosen": -0.07435169070959091, + "rewards/margins": 0.028780505061149597, + "rewards/rejected": -0.1031322032213211, + "step": 3290 + }, + { + "epoch": 1.1371467953135768, + "grad_norm": 1.8614228963851929, + "learning_rate": 2.338461647531821e-08, + "logits/chosen": -3.0054256916046143, + "logits/rejected": -2.986172914505005, + "logps/chosen": -62.00334548950195, + "logps/rejected": -63.677406311035156, + "loss": 0.6824, + "rewards/accuracies": 0.590624988079071, + "rewards/chosen": -0.07767204940319061, + "rewards/margins": 0.023757968097925186, + "rewards/rejected": -0.1014300212264061, + "step": 3300 + }, + { + "epoch": 1.1371467953135768, + "eval_logits/chosen": -3.0700747966766357, + "eval_logits/rejected": -3.064316749572754, + "eval_logps/chosen": -63.07503128051758, + "eval_logps/rejected": -68.88993835449219, + "eval_loss": 0.6870441436767578, + "eval_rewards/accuracies": 0.5945631861686707, + "eval_rewards/chosen": -0.04363138601183891, + "eval_rewards/margins": 0.013466770760715008, + "eval_rewards/rejected": -0.057098157703876495, + "eval_runtime": 384.3926, + "eval_samples_per_second": 11.197, + "eval_steps_per_second": 1.4, + "step": 3300 + }, + { + "epoch": 1.140592694693315, + "grad_norm": 2.364335536956787, + "learning_rate": 2.3234587847417447e-08, + "logits/chosen": -2.9901270866394043, + "logits/rejected": -2.969078540802002, + "logps/chosen": -60.3624382019043, + "logps/rejected": -62.73461151123047, + "loss": 0.6811, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.08183901011943817, + "rewards/margins": 0.025942673906683922, + "rewards/rejected": -0.10778167098760605, + "step": 3310 + }, + { + "epoch": 1.144038594073053, + "grad_norm": 1.817582607269287, + "learning_rate": 2.3084623090631447e-08, + "logits/chosen": -2.979372501373291, + "logits/rejected": -2.9465384483337402, + "logps/chosen": -61.04265213012695, + "logps/rejected": -60.45512008666992, + "loss": 0.6781, + "rewards/accuracies": 0.6468750238418579, + "rewards/chosen": -0.07833977043628693, + "rewards/margins": 0.032259080559015274, + "rewards/rejected": -0.1105988472700119, + "step": 3320 + }, + { + "epoch": 1.1474844934527912, + "grad_norm": 2.0053155422210693, + "learning_rate": 2.2934727630557967e-08, + "logits/chosen": -3.0550856590270996, + "logits/rejected": -3.0343222618103027, + "logps/chosen": -59.455810546875, + "logps/rejected": -64.58208465576172, + "loss": 0.6778, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.07518685609102249, + "rewards/margins": 0.03304674103856087, + "rewards/rejected": -0.10823359340429306, + "step": 3330 + }, + { + "epoch": 1.1509303928325294, + "grad_norm": 2.0891268253326416, + "learning_rate": 2.278490689028765e-08, + "logits/chosen": -2.9639458656311035, + "logits/rejected": -2.9505555629730225, + "logps/chosen": -59.85667037963867, + "logps/rejected": -64.63966369628906, + "loss": 0.6794, + "rewards/accuracies": 0.628125011920929, + "rewards/chosen": -0.07990622520446777, + "rewards/margins": 0.029703548178076744, + "rewards/rejected": -0.10960976779460907, + "step": 3340 + }, + { + "epoch": 1.1543762922122673, + "grad_norm": 2.108522891998291, + "learning_rate": 2.263516629020784e-08, + "logits/chosen": -2.969435930252075, + "logits/rejected": -2.953042507171631, + "logps/chosen": -63.971824645996094, + "logps/rejected": -66.6336441040039, + "loss": 0.6772, + "rewards/accuracies": 0.659375011920929, + "rewards/chosen": -0.07327866554260254, + "rewards/margins": 0.034395571798086166, + "rewards/rejected": -0.107674241065979, + "step": 3350 + }, + { + "epoch": 1.1578221915920055, + "grad_norm": 1.9754632711410522, + "learning_rate": 2.2485511247806493e-08, + "logits/chosen": -2.967040538787842, + "logits/rejected": -2.9482016563415527, + "logps/chosen": -61.31055450439453, + "logps/rejected": -63.2735595703125, + "loss": 0.6766, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.07140516489744186, + "rewards/margins": 0.03584301844239235, + "rewards/rejected": -0.10724818706512451, + "step": 3360 + }, + { + "epoch": 1.1612680909717437, + "grad_norm": 1.840278148651123, + "learning_rate": 2.233594717747614e-08, + "logits/chosen": -2.9520742893218994, + "logits/rejected": -2.9398889541625977, + "logps/chosen": -63.01189041137695, + "logps/rejected": -65.7177505493164, + "loss": 0.6813, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.07444876432418823, + "rewards/margins": 0.025889653712511063, + "rewards/rejected": -0.100338414311409, + "step": 3370 + }, + { + "epoch": 1.1647139903514818, + "grad_norm": 1.9508111476898193, + "learning_rate": 2.2186479490318026e-08, + "logits/chosen": -3.0001091957092285, + "logits/rejected": -2.9788658618927, + "logps/chosen": -60.84368896484375, + "logps/rejected": -62.41579055786133, + "loss": 0.6812, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.08587892353534698, + "rewards/margins": 0.0261564739048481, + "rewards/rejected": -0.11203539371490479, + "step": 3380 + }, + { + "epoch": 1.1681598897312198, + "grad_norm": 2.1116995811462402, + "learning_rate": 2.203711359394635e-08, + "logits/chosen": -2.984722852706909, + "logits/rejected": -2.966047763824463, + "logps/chosen": -62.917694091796875, + "logps/rejected": -65.20925903320312, + "loss": 0.6794, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07749287784099579, + "rewards/margins": 0.029815923422574997, + "rewards/rejected": -0.10730880498886108, + "step": 3390 + }, + { + "epoch": 1.171605789110958, + "grad_norm": 2.0296571254730225, + "learning_rate": 2.1887854892292585e-08, + "logits/chosen": -2.9498519897460938, + "logits/rejected": -2.9285740852355957, + "logps/chosen": -59.8641242980957, + "logps/rejected": -63.11505126953125, + "loss": 0.6787, + "rewards/accuracies": 0.6656249761581421, + "rewards/chosen": -0.08392973244190216, + "rewards/margins": 0.031144190579652786, + "rewards/rejected": -0.11507391929626465, + "step": 3400 + }, + { + "epoch": 1.171605789110958, + "eval_logits/chosen": -3.0668275356292725, + "eval_logits/rejected": -3.061082363128662, + "eval_logps/chosen": -63.29131317138672, + "eval_logps/rejected": -69.14148712158203, + "eval_loss": 0.6868980526924133, + "eval_rewards/accuracies": 0.5940985083580017, + "eval_rewards/chosen": -0.04579411447048187, + "eval_rewards/margins": 0.013819512911140919, + "eval_rewards/rejected": -0.059613630175590515, + "eval_runtime": 384.4849, + "eval_samples_per_second": 11.194, + "eval_steps_per_second": 1.399, + "step": 3400 + }, + { + "epoch": 1.175051688490696, + "grad_norm": 2.0198545455932617, + "learning_rate": 2.1738708785409993e-08, + "logits/chosen": -3.011244058609009, + "logits/rejected": -2.9873194694519043, + "logps/chosen": -62.25665283203125, + "logps/rejected": -63.576087951660156, + "loss": 0.6729, + "rewards/accuracies": 0.684374988079071, + "rewards/chosen": -0.06893068552017212, + "rewards/margins": 0.04300907254219055, + "rewards/rejected": -0.11193976551294327, + "step": 3410 + }, + { + "epoch": 1.1784975878704342, + "grad_norm": 2.206679105758667, + "learning_rate": 2.1589680669278273e-08, + "logits/chosen": -3.0483782291412354, + "logits/rejected": -3.0321924686431885, + "logps/chosen": -63.279327392578125, + "logps/rejected": -64.15853881835938, + "loss": 0.6839, + "rewards/accuracies": 0.6156250238418579, + "rewards/chosen": -0.07862303406000137, + "rewards/margins": 0.020904963836073875, + "rewards/rejected": -0.0995279997587204, + "step": 3420 + }, + { + "epoch": 1.1819434872501722, + "grad_norm": 1.9603242874145508, + "learning_rate": 2.14407759356083e-08, + "logits/chosen": -2.9316813945770264, + "logits/rejected": -2.8970885276794434, + "logps/chosen": -64.16649627685547, + "logps/rejected": -64.34681701660156, + "loss": 0.6753, + "rewards/accuracies": 0.659375011920929, + "rewards/chosen": -0.07526973634958267, + "rewards/margins": 0.038385529071092606, + "rewards/rejected": -0.11365525424480438, + "step": 3430 + }, + { + "epoch": 1.1853893866299103, + "grad_norm": 2.079928159713745, + "learning_rate": 2.1291999971647077e-08, + "logits/chosen": -2.967278003692627, + "logits/rejected": -2.946516752243042, + "logps/chosen": -61.39111328125, + "logps/rejected": -62.85634231567383, + "loss": 0.681, + "rewards/accuracies": 0.590624988079071, + "rewards/chosen": -0.087169349193573, + "rewards/margins": 0.026739057153463364, + "rewards/rejected": -0.11390841007232666, + "step": 3440 + }, + { + "epoch": 1.1888352860096485, + "grad_norm": 2.0616533756256104, + "learning_rate": 2.1143358159982836e-08, + "logits/chosen": -2.967345952987671, + "logits/rejected": -2.9432525634765625, + "logps/chosen": -60.566123962402344, + "logps/rejected": -63.50170135498047, + "loss": 0.6796, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07633674144744873, + "rewards/margins": 0.029550742357969284, + "rewards/rejected": -0.10588748753070831, + "step": 3450 + }, + { + "epoch": 1.1922811853893867, + "grad_norm": 2.107825756072998, + "learning_rate": 2.0994855878350274e-08, + "logits/chosen": -3.0657877922058105, + "logits/rejected": -3.0379040241241455, + "logps/chosen": -64.24806213378906, + "logps/rejected": -65.42948913574219, + "loss": 0.6783, + "rewards/accuracies": 0.628125011920929, + "rewards/chosen": -0.07753048092126846, + "rewards/margins": 0.03188103809952736, + "rewards/rejected": -0.10941150039434433, + "step": 3460 + }, + { + "epoch": 1.1957270847691248, + "grad_norm": 2.052567720413208, + "learning_rate": 2.084649849943604e-08, + "logits/chosen": -2.89976167678833, + "logits/rejected": -2.862915277481079, + "logps/chosen": -64.68438720703125, + "logps/rejected": -63.40746307373047, + "loss": 0.6739, + "rewards/accuracies": 0.653124988079071, + "rewards/chosen": -0.06885982304811478, + "rewards/margins": 0.04099997133016586, + "rewards/rejected": -0.10985980182886124, + "step": 3470 + }, + { + "epoch": 1.1991729841488628, + "grad_norm": 1.953604817390442, + "learning_rate": 2.0698291390684307e-08, + "logits/chosen": -3.04626727104187, + "logits/rejected": -3.0169880390167236, + "logps/chosen": -62.52994918823242, + "logps/rejected": -61.918418884277344, + "loss": 0.6763, + "rewards/accuracies": 0.6343749761581421, + "rewards/chosen": -0.07942342013120651, + "rewards/margins": 0.03616594150662422, + "rewards/rejected": -0.11558938026428223, + "step": 3480 + }, + { + "epoch": 1.202618883528601, + "grad_norm": 1.9615064859390259, + "learning_rate": 2.0550239914102593e-08, + "logits/chosen": -2.9919002056121826, + "logits/rejected": -2.959578275680542, + "logps/chosen": -61.7932243347168, + "logps/rejected": -63.043975830078125, + "loss": 0.6747, + "rewards/accuracies": 0.653124988079071, + "rewards/chosen": -0.07409648597240448, + "rewards/margins": 0.03935806080698967, + "rewards/rejected": -0.11345455795526505, + "step": 3490 + }, + { + "epoch": 1.206064782908339, + "grad_norm": 2.0523669719696045, + "learning_rate": 2.0402349426067798e-08, + "logits/chosen": -3.0102508068084717, + "logits/rejected": -2.99145770072937, + "logps/chosen": -65.25118255615234, + "logps/rejected": -65.96293640136719, + "loss": 0.6801, + "rewards/accuracies": 0.6156250238418579, + "rewards/chosen": -0.0861230120062828, + "rewards/margins": 0.028741363435983658, + "rewards/rejected": -0.11486438661813736, + "step": 3500 + }, + { + "epoch": 1.206064782908339, + "eval_logits/chosen": -3.0645828247070312, + "eval_logits/rejected": -3.058849573135376, + "eval_logps/chosen": -63.53173065185547, + "eval_logps/rejected": -69.41845703125, + "eval_loss": 0.6867420673370361, + "eval_rewards/accuracies": 0.5929368138313293, + "eval_rewards/chosen": -0.04819829761981964, + "eval_rewards/margins": 0.014185106381773949, + "eval_rewards/rejected": -0.06238340213894844, + "eval_runtime": 384.3982, + "eval_samples_per_second": 11.197, + "eval_steps_per_second": 1.4, + "step": 3500 + }, + { + "epoch": 1.2095106822880772, + "grad_norm": 2.0280165672302246, + "learning_rate": 2.0254625277132383e-08, + "logits/chosen": -2.9636659622192383, + "logits/rejected": -2.9369940757751465, + "logps/chosen": -61.99406051635742, + "logps/rejected": -64.51708984375, + "loss": 0.6755, + "rewards/accuracies": 0.628125011920929, + "rewards/chosen": -0.08114029467105865, + "rewards/margins": 0.03792557865381241, + "rewards/rejected": -0.11906588077545166, + "step": 3510 + }, + { + "epoch": 1.2129565816678154, + "grad_norm": 2.152580976486206, + "learning_rate": 2.0107072811830786e-08, + "logits/chosen": -2.9667038917541504, + "logits/rejected": -2.9484071731567383, + "logps/chosen": -62.68111038208008, + "logps/rejected": -66.8736343383789, + "loss": 0.6795, + "rewards/accuracies": 0.6156250238418579, + "rewards/chosen": -0.08092640340328217, + "rewards/margins": 0.029753107577562332, + "rewards/rejected": -0.1106795221567154, + "step": 3520 + }, + { + "epoch": 1.2164024810475533, + "grad_norm": 2.1510519981384277, + "learning_rate": 1.9959697368486107e-08, + "logits/chosen": -2.968773365020752, + "logits/rejected": -2.9507293701171875, + "logps/chosen": -61.249794006347656, + "logps/rejected": -65.181396484375, + "loss": 0.6787, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.08916781842708588, + "rewards/margins": 0.031191542744636536, + "rewards/rejected": -0.1203593760728836, + "step": 3530 + }, + { + "epoch": 1.2198483804272915, + "grad_norm": 2.2037718296051025, + "learning_rate": 1.9812504279016915e-08, + "logits/chosen": -3.019176483154297, + "logits/rejected": -2.990429162979126, + "logps/chosen": -62.22697830200195, + "logps/rejected": -64.77278137207031, + "loss": 0.6794, + "rewards/accuracies": 0.6156250238418579, + "rewards/chosen": -0.08191390335559845, + "rewards/margins": 0.029749279841780663, + "rewards/rejected": -0.11166318506002426, + "step": 3540 + }, + { + "epoch": 1.2232942798070296, + "grad_norm": 2.0587522983551025, + "learning_rate": 1.9665498868744378e-08, + "logits/chosen": -3.0102896690368652, + "logits/rejected": -2.994324207305908, + "logps/chosen": -63.87359619140625, + "logps/rejected": -66.18248748779297, + "loss": 0.679, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.08578020334243774, + "rewards/margins": 0.03075292706489563, + "rewards/rejected": -0.11653313785791397, + "step": 3550 + }, + { + "epoch": 1.2267401791867678, + "grad_norm": 2.123263120651245, + "learning_rate": 1.95186864561996e-08, + "logits/chosen": -2.9897990226745605, + "logits/rejected": -2.949155330657959, + "logps/chosen": -65.28316497802734, + "logps/rejected": -62.66881561279297, + "loss": 0.6774, + "rewards/accuracies": 0.628125011920929, + "rewards/chosen": -0.075536347925663, + "rewards/margins": 0.03343961387872696, + "rewards/rejected": -0.10897596180438995, + "step": 3560 + }, + { + "epoch": 1.230186078566506, + "grad_norm": 1.9812653064727783, + "learning_rate": 1.9372072352931186e-08, + "logits/chosen": -2.9194540977478027, + "logits/rejected": -2.9036872386932373, + "logps/chosen": -61.8432502746582, + "logps/rejected": -63.653343200683594, + "loss": 0.6791, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.0801379531621933, + "rewards/margins": 0.030780458822846413, + "rewards/rejected": -0.11091840267181396, + "step": 3570 + }, + { + "epoch": 1.233631977946244, + "grad_norm": 2.041963577270508, + "learning_rate": 1.9225661863313063e-08, + "logits/chosen": -3.0060529708862305, + "logits/rejected": -2.9751877784729004, + "logps/chosen": -63.72297286987305, + "logps/rejected": -66.69009399414062, + "loss": 0.6775, + "rewards/accuracies": 0.6343749761581421, + "rewards/chosen": -0.0869833379983902, + "rewards/margins": 0.034079745411872864, + "rewards/rejected": -0.12106309086084366, + "step": 3580 + }, + { + "epoch": 1.237077877325982, + "grad_norm": 2.1056067943573, + "learning_rate": 1.9079460284352616e-08, + "logits/chosen": -2.989348888397217, + "logits/rejected": -2.968977689743042, + "logps/chosen": -64.03874206542969, + "logps/rejected": -65.50846862792969, + "loss": 0.6805, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.08604481816291809, + "rewards/margins": 0.027634482830762863, + "rewards/rejected": -0.11367928981781006, + "step": 3590 + }, + { + "epoch": 1.2405237767057202, + "grad_norm": 2.2646119594573975, + "learning_rate": 1.893347290549901e-08, + "logits/chosen": -2.9118189811706543, + "logits/rejected": -2.8909945487976074, + "logps/chosen": -61.814781188964844, + "logps/rejected": -66.79518127441406, + "loss": 0.6797, + "rewards/accuracies": 0.6156250238418579, + "rewards/chosen": -0.08930834382772446, + "rewards/margins": 0.029664453119039536, + "rewards/rejected": -0.11897280067205429, + "step": 3600 + }, + { + "epoch": 1.2405237767057202, + "eval_logits/chosen": -3.0616257190704346, + "eval_logits/rejected": -3.0558876991271973, + "eval_logps/chosen": -63.699798583984375, + "eval_logps/rejected": -69.62061309814453, + "eval_loss": 0.6865953207015991, + "eval_rewards/accuracies": 0.5915427803993225, + "eval_rewards/chosen": -0.049879107624292374, + "eval_rewards/margins": 0.014525760896503925, + "eval_rewards/rejected": -0.06440486758947372, + "eval_runtime": 384.6476, + "eval_samples_per_second": 11.189, + "eval_steps_per_second": 1.399, + "step": 3600 + }, + { + "epoch": 1.2439696760854584, + "grad_norm": 2.077509641647339, + "learning_rate": 1.878770500845181e-08, + "logits/chosen": -2.971928119659424, + "logits/rejected": -2.9434304237365723, + "logps/chosen": -64.08241271972656, + "logps/rejected": -63.58278274536133, + "loss": 0.6798, + "rewards/accuracies": 0.628125011920929, + "rewards/chosen": -0.08303068578243256, + "rewards/margins": 0.02932748757302761, + "rewards/rejected": -0.11235816776752472, + "step": 3610 + }, + { + "epoch": 1.2474155754651963, + "grad_norm": 2.0165388584136963, + "learning_rate": 1.8642161866969946e-08, + "logits/chosen": -2.955901622772217, + "logits/rejected": -2.940150260925293, + "logps/chosen": -64.76017761230469, + "logps/rejected": -65.11692810058594, + "loss": 0.6827, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.08926106989383698, + "rewards/margins": 0.023149941116571426, + "rewards/rejected": -0.1124110072851181, + "step": 3620 + }, + { + "epoch": 1.2508614748449345, + "grad_norm": 2.059694528579712, + "learning_rate": 1.8496848746680856e-08, + "logits/chosen": -3.020172119140625, + "logits/rejected": -2.988006114959717, + "logps/chosen": -62.92469024658203, + "logps/rejected": -64.46615600585938, + "loss": 0.679, + "rewards/accuracies": 0.6156250238418579, + "rewards/chosen": -0.08797968924045563, + "rewards/margins": 0.03054668940603733, + "rewards/rejected": -0.11852637678384781, + "step": 3630 + }, + { + "epoch": 1.2543073742246726, + "grad_norm": 2.0419082641601562, + "learning_rate": 1.8351770904890036e-08, + "logits/chosen": -3.047367572784424, + "logits/rejected": -3.0146524906158447, + "logps/chosen": -63.642982482910156, + "logps/rejected": -64.89458465576172, + "loss": 0.6747, + "rewards/accuracies": 0.6968749761581421, + "rewards/chosen": -0.08200275897979736, + "rewards/margins": 0.039216045290231705, + "rewards/rejected": -0.12121880054473877, + "step": 3640 + }, + { + "epoch": 1.2577532736044108, + "grad_norm": 2.025785207748413, + "learning_rate": 1.8206933590390786e-08, + "logits/chosen": -2.825869560241699, + "logits/rejected": -2.8109097480773926, + "logps/chosen": -61.419586181640625, + "logps/rejected": -64.72754669189453, + "loss": 0.6847, + "rewards/accuracies": 0.596875011920929, + "rewards/chosen": -0.0889354795217514, + "rewards/margins": 0.018987303599715233, + "rewards/rejected": -0.10792279243469238, + "step": 3650 + }, + { + "epoch": 1.2611991729841487, + "grad_norm": 2.176628351211548, + "learning_rate": 1.8062342043274324e-08, + "logits/chosen": -3.015015125274658, + "logits/rejected": -2.9903564453125, + "logps/chosen": -63.53308868408203, + "logps/rejected": -63.09611892700195, + "loss": 0.681, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.08826801180839539, + "rewards/margins": 0.027484869584441185, + "rewards/rejected": -0.11575287580490112, + "step": 3660 + }, + { + "epoch": 1.264645072363887, + "grad_norm": 2.1196792125701904, + "learning_rate": 1.7918001494740237e-08, + "logits/chosen": -2.9659061431884766, + "logits/rejected": -2.9388701915740967, + "logps/chosen": -62.460182189941406, + "logps/rejected": -64.67069244384766, + "loss": 0.6761, + "rewards/accuracies": 0.628125011920929, + "rewards/chosen": -0.08238350600004196, + "rewards/margins": 0.03688802570104599, + "rewards/rejected": -0.11927153170108795, + "step": 3670 + }, + { + "epoch": 1.268090971743625, + "grad_norm": 2.1168670654296875, + "learning_rate": 1.777391716690718e-08, + "logits/chosen": -2.9998278617858887, + "logits/rejected": -2.9749624729156494, + "logps/chosen": -64.06501770019531, + "logps/rejected": -63.614013671875, + "loss": 0.6785, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.08456394076347351, + "rewards/margins": 0.031805459409952164, + "rewards/rejected": -0.11636941134929657, + "step": 3680 + }, + { + "epoch": 1.2715368711233632, + "grad_norm": 2.04862380027771, + "learning_rate": 1.7630094272623956e-08, + "logits/chosen": -2.8991315364837646, + "logits/rejected": -2.8779456615448, + "logps/chosen": -61.9876708984375, + "logps/rejected": -63.97594451904297, + "loss": 0.6819, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.09000525623559952, + "rewards/margins": 0.025113940238952637, + "rewards/rejected": -0.11511919647455215, + "step": 3690 + }, + { + "epoch": 1.2749827705031014, + "grad_norm": 2.112943410873413, + "learning_rate": 1.748653801528095e-08, + "logits/chosen": -2.9221792221069336, + "logits/rejected": -2.8926749229431152, + "logps/chosen": -62.65099334716797, + "logps/rejected": -63.81853103637695, + "loss": 0.6783, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.08818589150905609, + "rewards/margins": 0.03205002099275589, + "rewards/rejected": -0.12023589760065079, + "step": 3700 + }, + { + "epoch": 1.2749827705031014, + "eval_logits/chosen": -3.0599453449249268, + "eval_logits/rejected": -3.05422043800354, + "eval_logps/chosen": -63.817176818847656, + "eval_logps/rejected": -69.7728042602539, + "eval_loss": 0.686441957950592, + "eval_rewards/accuracies": 0.5903810262680054, + "eval_rewards/chosen": -0.051052823662757874, + "eval_rewards/margins": 0.014873947948217392, + "eval_rewards/rejected": -0.06592677533626556, + "eval_runtime": 384.3427, + "eval_samples_per_second": 11.198, + "eval_steps_per_second": 1.4, + "step": 3700 + }, + { + "epoch": 1.2784286698828393, + "grad_norm": 2.092205047607422, + "learning_rate": 1.734325358862181e-08, + "logits/chosen": -2.863043785095215, + "logits/rejected": -2.8345303535461426, + "logps/chosen": -62.89555740356445, + "logps/rejected": -64.21585845947266, + "loss": 0.6783, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.07933399826288223, + "rewards/margins": 0.03237777203321457, + "rewards/rejected": -0.1117117628455162, + "step": 3710 + }, + { + "epoch": 1.2818745692625775, + "grad_norm": 2.1363396644592285, + "learning_rate": 1.7200246176555605e-08, + "logits/chosen": -2.9469122886657715, + "logits/rejected": -2.9180445671081543, + "logps/chosen": -64.09722900390625, + "logps/rejected": -65.64142608642578, + "loss": 0.6761, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.09392695873975754, + "rewards/margins": 0.036279790103435516, + "rewards/rejected": -0.13020673394203186, + "step": 3720 + }, + { + "epoch": 1.2853204686423156, + "grad_norm": 1.9918975830078125, + "learning_rate": 1.7057520952969256e-08, + "logits/chosen": -2.9698033332824707, + "logits/rejected": -2.957746982574463, + "logps/chosen": -61.49199676513672, + "logps/rejected": -64.32593536376953, + "loss": 0.679, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08447978645563126, + "rewards/margins": 0.030800744891166687, + "rewards/rejected": -0.11528053134679794, + "step": 3730 + }, + { + "epoch": 1.2887663680220538, + "grad_norm": 1.9824419021606445, + "learning_rate": 1.6915083081540328e-08, + "logits/chosen": -2.9427554607391357, + "logits/rejected": -2.9203765392303467, + "logps/chosen": -63.301551818847656, + "logps/rejected": -61.42414093017578, + "loss": 0.6819, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.0865807980298996, + "rewards/margins": 0.02510489523410797, + "rewards/rejected": -0.11168569326400757, + "step": 3740 + }, + { + "epoch": 1.292212267401792, + "grad_norm": 1.8477709293365479, + "learning_rate": 1.6772937715550234e-08, + "logits/chosen": -2.9018781185150146, + "logits/rejected": -2.882781505584717, + "logps/chosen": -61.00310134887695, + "logps/rejected": -64.47721099853516, + "loss": 0.6796, + "rewards/accuracies": 0.6156250238418579, + "rewards/chosen": -0.08985722810029984, + "rewards/margins": 0.029285427182912827, + "rewards/rejected": -0.11914266645908356, + "step": 3750 + }, + { + "epoch": 1.29565816678153, + "grad_norm": 2.3149027824401855, + "learning_rate": 1.6631089997697788e-08, + "logits/chosen": -2.9336066246032715, + "logits/rejected": -2.9068706035614014, + "logps/chosen": -63.48087692260742, + "logps/rejected": -64.29015350341797, + "loss": 0.6777, + "rewards/accuracies": 0.640625, + "rewards/chosen": -0.08386462181806564, + "rewards/margins": 0.0336444191634655, + "rewards/rejected": -0.11750902980566025, + "step": 3760 + }, + { + "epoch": 1.299104066161268, + "grad_norm": 1.8848634958267212, + "learning_rate": 1.648954505991315e-08, + "logits/chosen": -2.9510250091552734, + "logits/rejected": -2.946815013885498, + "logps/chosen": -60.505516052246094, + "logps/rejected": -64.12696838378906, + "loss": 0.6821, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.08626588433980942, + "rewards/margins": 0.024447450414299965, + "rewards/rejected": -0.11071332544088364, + "step": 3770 + }, + { + "epoch": 1.3025499655410062, + "grad_norm": 2.1947124004364014, + "learning_rate": 1.634830802317215e-08, + "logits/chosen": -2.992312431335449, + "logits/rejected": -2.9723546504974365, + "logps/chosen": -59.96136474609375, + "logps/rejected": -65.40067291259766, + "loss": 0.6743, + "rewards/accuracies": 0.659375011920929, + "rewards/chosen": -0.07778512686491013, + "rewards/margins": 0.04034097120165825, + "rewards/rejected": -0.11812610924243927, + "step": 3780 + }, + { + "epoch": 1.3059958649207444, + "grad_norm": 2.0879478454589844, + "learning_rate": 1.6207383997311025e-08, + "logits/chosen": -2.9999008178710938, + "logits/rejected": -2.9816055297851562, + "logps/chosen": -64.22468566894531, + "logps/rejected": -65.10344696044922, + "loss": 0.6753, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.07819289714097977, + "rewards/margins": 0.03828134760260582, + "rewards/rejected": -0.11647425591945648, + "step": 3790 + }, + { + "epoch": 1.3094417643004825, + "grad_norm": 1.9533721208572388, + "learning_rate": 1.6066778080841532e-08, + "logits/chosen": -2.996748208999634, + "logits/rejected": -2.9594829082489014, + "logps/chosen": -64.72180938720703, + "logps/rejected": -62.791358947753906, + "loss": 0.6771, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.09291788935661316, + "rewards/margins": 0.034953463822603226, + "rewards/rejected": -0.1278713345527649, + "step": 3800 + }, + { + "epoch": 1.3094417643004825, + "eval_logits/chosen": -3.0579733848571777, + "eval_logits/rejected": -3.052189826965332, + "eval_logps/chosen": -63.923500061035156, + "eval_logps/rejected": -69.89812469482422, + "eval_loss": 0.6863577365875244, + "eval_rewards/accuracies": 0.5920074582099915, + "eval_rewards/chosen": -0.052116088569164276, + "eval_rewards/margins": 0.01506392378360033, + "eval_rewards/rejected": -0.06718001514673233, + "eval_runtime": 384.3289, + "eval_samples_per_second": 11.199, + "eval_steps_per_second": 1.4, + "step": 3800 + }, + { + "epoch": 1.3128876636802205, + "grad_norm": 2.1521520614624023, + "learning_rate": 1.5926495360766518e-08, + "logits/chosen": -2.952903985977173, + "logits/rejected": -2.9230117797851562, + "logps/chosen": -63.550682067871094, + "logps/rejected": -63.614044189453125, + "loss": 0.678, + "rewards/accuracies": 0.6343749761581421, + "rewards/chosen": -0.08338330686092377, + "rewards/margins": 0.03267858177423477, + "rewards/rejected": -0.11606190353631973, + "step": 3810 + }, + { + "epoch": 1.3163335630599586, + "grad_norm": 2.238865852355957, + "learning_rate": 1.5786540912395846e-08, + "logits/chosen": -2.890934467315674, + "logits/rejected": -2.885941982269287, + "logps/chosen": -62.68060302734375, + "logps/rejected": -64.84013366699219, + "loss": 0.6803, + "rewards/accuracies": 0.596875011920929, + "rewards/chosen": -0.08805284649133682, + "rewards/margins": 0.028121262788772583, + "rewards/rejected": -0.11617410182952881, + "step": 3820 + }, + { + "epoch": 1.3197794624396968, + "grad_norm": 2.5020928382873535, + "learning_rate": 1.564691979916278e-08, + "logits/chosen": -2.9928812980651855, + "logits/rejected": -2.960798740386963, + "logps/chosen": -66.76985931396484, + "logps/rejected": -67.78041076660156, + "loss": 0.6774, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.09056535363197327, + "rewards/margins": 0.03447294607758522, + "rewards/rejected": -0.1250382959842682, + "step": 3830 + }, + { + "epoch": 1.323225361819435, + "grad_norm": 2.00618839263916, + "learning_rate": 1.5507637072440824e-08, + "logits/chosen": -2.9697744846343994, + "logits/rejected": -2.9465317726135254, + "logps/chosen": -62.504920959472656, + "logps/rejected": -63.22511672973633, + "loss": 0.677, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.08414576202630997, + "rewards/margins": 0.034877367317676544, + "rewards/rejected": -0.11902312934398651, + "step": 3840 + }, + { + "epoch": 1.3266712611991731, + "grad_norm": 2.2420315742492676, + "learning_rate": 1.5368697771360922e-08, + "logits/chosen": -3.0211071968078613, + "logits/rejected": -2.982313632965088, + "logps/chosen": -62.169822692871094, + "logps/rejected": -62.871063232421875, + "loss": 0.6762, + "rewards/accuracies": 0.640625, + "rewards/chosen": -0.0782238095998764, + "rewards/margins": 0.03669989854097366, + "rewards/rejected": -0.11492369323968887, + "step": 3850 + }, + { + "epoch": 1.330117160578911, + "grad_norm": 2.1969306468963623, + "learning_rate": 1.523010692262918e-08, + "logits/chosen": -2.981954336166382, + "logits/rejected": -2.956352710723877, + "logps/chosen": -63.6324462890625, + "logps/rejected": -63.42945098876953, + "loss": 0.6806, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09774903953075409, + "rewards/margins": 0.027855467051267624, + "rewards/rejected": -0.125604510307312, + "step": 3860 + }, + { + "epoch": 1.3335630599586492, + "grad_norm": 2.0029098987579346, + "learning_rate": 1.5091869540345003e-08, + "logits/chosen": -2.9232115745544434, + "logits/rejected": -2.903977155685425, + "logps/chosen": -61.975440979003906, + "logps/rejected": -65.58573913574219, + "loss": 0.6774, + "rewards/accuracies": 0.640625, + "rewards/chosen": -0.09161805361509323, + "rewards/margins": 0.03398921340703964, + "rewards/rejected": -0.12560728192329407, + "step": 3870 + }, + { + "epoch": 1.3370089593383874, + "grad_norm": 2.2118442058563232, + "learning_rate": 1.495399062581966e-08, + "logits/chosen": -2.8885598182678223, + "logits/rejected": -2.8633341789245605, + "logps/chosen": -60.8160400390625, + "logps/rejected": -64.75898742675781, + "loss": 0.6735, + "rewards/accuracies": 0.6781250238418579, + "rewards/chosen": -0.08591288328170776, + "rewards/margins": 0.04224963113665581, + "rewards/rejected": -0.12816253304481506, + "step": 3880 + }, + { + "epoch": 1.3404548587181253, + "grad_norm": 2.047248601913452, + "learning_rate": 1.481647516739537e-08, + "logits/chosen": -2.9682374000549316, + "logits/rejected": -2.947425127029419, + "logps/chosen": -61.350563049316406, + "logps/rejected": -65.50421905517578, + "loss": 0.6806, + "rewards/accuracies": 0.6156250238418579, + "rewards/chosen": -0.09522966295480728, + "rewards/margins": 0.027817973867058754, + "rewards/rejected": -0.12304762750864029, + "step": 3890 + }, + { + "epoch": 1.3439007580978635, + "grad_norm": 2.1648757457733154, + "learning_rate": 1.4679328140264815e-08, + "logits/chosen": -2.9958930015563965, + "logits/rejected": -2.985562801361084, + "logps/chosen": -62.2169189453125, + "logps/rejected": -65.93193054199219, + "loss": 0.6785, + "rewards/accuracies": 0.621874988079071, + "rewards/chosen": -0.09441477060317993, + "rewards/margins": 0.03176296874880791, + "rewards/rejected": -0.12617774307727814, + "step": 3900 + }, + { + "epoch": 1.3439007580978635, + "eval_logits/chosen": -3.055612087249756, + "eval_logits/rejected": -3.0498664379119873, + "eval_logps/chosen": -64.0693359375, + "eval_logps/rejected": -70.08139038085938, + "eval_loss": 0.6861928701400757, + "eval_rewards/accuracies": 0.5922397971153259, + "eval_rewards/chosen": -0.053574394434690475, + "eval_rewards/margins": 0.015438344329595566, + "eval_rewards/rejected": -0.06901273876428604, + "eval_runtime": 384.3531, + "eval_samples_per_second": 11.198, + "eval_steps_per_second": 1.4, + "step": 3900 + }, + { + "epoch": 1.3473466574776016, + "grad_norm": 1.999382734298706, + "learning_rate": 1.4542554506291169e-08, + "logits/chosen": -3.0110673904418945, + "logits/rejected": -2.9811036586761475, + "logps/chosen": -65.58875274658203, + "logps/rejected": -66.58174133300781, + "loss": 0.6762, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.07897213101387024, + "rewards/margins": 0.036501772701740265, + "rewards/rejected": -0.11547388881444931, + "step": 3910 + }, + { + "epoch": 1.3507925568573398, + "grad_norm": 1.908639907836914, + "learning_rate": 1.4406159213828506e-08, + "logits/chosen": -2.8948254585266113, + "logits/rejected": -2.8793833255767822, + "logps/chosen": -61.128509521484375, + "logps/rejected": -66.39567565917969, + "loss": 0.6788, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.09444743394851685, + "rewards/margins": 0.031206313520669937, + "rewards/rejected": -0.12565374374389648, + "step": 3920 + }, + { + "epoch": 1.354238456237078, + "grad_norm": 2.048689365386963, + "learning_rate": 1.427014719754287e-08, + "logits/chosen": -2.902984142303467, + "logits/rejected": -2.86833119392395, + "logps/chosen": -63.13886642456055, + "logps/rejected": -63.15281295776367, + "loss": 0.6758, + "rewards/accuracies": 0.6468750238418579, + "rewards/chosen": -0.08902109414339066, + "rewards/margins": 0.037360358983278275, + "rewards/rejected": -0.12638147175312042, + "step": 3930 + }, + { + "epoch": 1.3576843556168159, + "grad_norm": 2.2913434505462646, + "learning_rate": 1.4134523378233698e-08, + "logits/chosen": -2.973330497741699, + "logits/rejected": -2.956834316253662, + "logps/chosen": -63.40961456298828, + "logps/rejected": -65.80083465576172, + "loss": 0.6777, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.0901583805680275, + "rewards/margins": 0.03346914425492287, + "rewards/rejected": -0.12362752109766006, + "step": 3940 + }, + { + "epoch": 1.361130254996554, + "grad_norm": 2.275186538696289, + "learning_rate": 1.3999292662655754e-08, + "logits/chosen": -2.937798261642456, + "logits/rejected": -2.9266557693481445, + "logps/chosen": -62.40935516357422, + "logps/rejected": -66.2726821899414, + "loss": 0.6811, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.09133242070674896, + "rewards/margins": 0.026894664391875267, + "rewards/rejected": -0.11822707951068878, + "step": 3950 + }, + { + "epoch": 1.3645761543762922, + "grad_norm": 2.071085214614868, + "learning_rate": 1.3864459943341675e-08, + "logits/chosen": -2.936525583267212, + "logits/rejected": -2.912968158721924, + "logps/chosen": -64.78622436523438, + "logps/rejected": -64.55671691894531, + "loss": 0.6792, + "rewards/accuracies": 0.6468750238418579, + "rewards/chosen": -0.08964154869318008, + "rewards/margins": 0.030369237065315247, + "rewards/rejected": -0.12001077830791473, + "step": 3960 + }, + { + "epoch": 1.3680220537560304, + "grad_norm": 2.226252794265747, + "learning_rate": 1.3730030098424927e-08, + "logits/chosen": -2.943465232849121, + "logits/rejected": -2.92138671875, + "logps/chosen": -67.17789459228516, + "logps/rejected": -66.60552978515625, + "loss": 0.6744, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.08730453252792358, + "rewards/margins": 0.04065509885549545, + "rewards/rejected": -0.12795962393283844, + "step": 3970 + }, + { + "epoch": 1.3714679531357685, + "grad_norm": 2.1076691150665283, + "learning_rate": 1.3596007991463298e-08, + "logits/chosen": -2.858703851699829, + "logits/rejected": -2.8402962684631348, + "logps/chosen": -60.17496871948242, + "logps/rejected": -64.92707061767578, + "loss": 0.6801, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.09182435274124146, + "rewards/margins": 0.02867903746664524, + "rewards/rejected": -0.12050338089466095, + "step": 3980 + }, + { + "epoch": 1.3749138525155065, + "grad_norm": 2.15260910987854, + "learning_rate": 1.3462398471262992e-08, + "logits/chosen": -2.9901604652404785, + "logits/rejected": -2.9676501750946045, + "logps/chosen": -65.43225860595703, + "logps/rejected": -66.92817687988281, + "loss": 0.6769, + "rewards/accuracies": 0.6156250238418579, + "rewards/chosen": -0.09293768554925919, + "rewards/margins": 0.03545621410012245, + "rewards/rejected": -0.12839388847351074, + "step": 3990 + }, + { + "epoch": 1.3783597518952446, + "grad_norm": 2.315973997116089, + "learning_rate": 1.3329206371703166e-08, + "logits/chosen": -2.9690933227539062, + "logits/rejected": -2.9580492973327637, + "logps/chosen": -61.68376541137695, + "logps/rejected": -65.7838134765625, + "loss": 0.6807, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.0909140557050705, + "rewards/margins": 0.027417322620749474, + "rewards/rejected": -0.11833137273788452, + "step": 4000 + }, + { + "epoch": 1.3783597518952446, + "eval_logits/chosen": -3.054121732711792, + "eval_logits/rejected": -3.048370838165283, + "eval_logps/chosen": -64.221435546875, + "eval_logps/rejected": -70.25933074951172, + "eval_loss": 0.6860847473144531, + "eval_rewards/accuracies": 0.5908457040786743, + "eval_rewards/chosen": -0.05509539321064949, + "eval_rewards/margins": 0.01569669507443905, + "eval_rewards/rejected": -0.07079208642244339, + "eval_runtime": 384.3005, + "eval_samples_per_second": 11.2, + "eval_steps_per_second": 1.4, + "step": 4000 + }, + { + "epoch": 1.3818056512749828, + "grad_norm": 2.1167848110198975, + "learning_rate": 1.3196436511561027e-08, + "logits/chosen": -2.933992385864258, + "logits/rejected": -2.903357982635498, + "logps/chosen": -67.85502624511719, + "logps/rejected": -65.80936431884766, + "loss": 0.6792, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0915619507431984, + "rewards/margins": 0.030700866132974625, + "rewards/rejected": -0.12226282060146332, + "step": 4010 + }, + { + "epoch": 1.385251550654721, + "grad_norm": 2.022864818572998, + "learning_rate": 1.3064093694337552e-08, + "logits/chosen": -2.918813705444336, + "logits/rejected": -2.895181655883789, + "logps/chosen": -62.11616134643555, + "logps/rejected": -65.38654327392578, + "loss": 0.6779, + "rewards/accuracies": 0.590624988079071, + "rewards/chosen": -0.09331174194812775, + "rewards/margins": 0.03332715108990669, + "rewards/rejected": -0.12663887441158295, + "step": 4020 + }, + { + "epoch": 1.388697450034459, + "grad_norm": 2.205150842666626, + "learning_rate": 1.2932182708083659e-08, + "logits/chosen": -2.9874258041381836, + "logits/rejected": -2.959113836288452, + "logps/chosen": -63.654685974121094, + "logps/rejected": -65.35508728027344, + "loss": 0.6748, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.08431627601385117, + "rewards/margins": 0.039567284286022186, + "rewards/rejected": -0.12388356775045395, + "step": 4030 + }, + { + "epoch": 1.392143349414197, + "grad_norm": 2.237994909286499, + "learning_rate": 1.2800708325226967e-08, + "logits/chosen": -2.9087979793548584, + "logits/rejected": -2.8909714221954346, + "logps/chosen": -62.17333221435547, + "logps/rejected": -65.90696716308594, + "loss": 0.6795, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08950966596603394, + "rewards/margins": 0.029924685135483742, + "rewards/rejected": -0.11943434178829193, + "step": 4040 + }, + { + "epoch": 1.3955892487939352, + "grad_norm": 2.220797538757324, + "learning_rate": 1.2669675302399174e-08, + "logits/chosen": -2.9189624786376953, + "logits/rejected": -2.901270866394043, + "logps/chosen": -62.6547737121582, + "logps/rejected": -66.87588500976562, + "loss": 0.6791, + "rewards/accuracies": 0.640625, + "rewards/chosen": -0.09054653346538544, + "rewards/margins": 0.030481979250907898, + "rewards/rejected": -0.12102852761745453, + "step": 4050 + }, + { + "epoch": 1.3990351481736734, + "grad_norm": 2.227811813354492, + "learning_rate": 1.2539088380263958e-08, + "logits/chosen": -2.9427545070648193, + "logits/rejected": -2.9144515991210938, + "logps/chosen": -64.08809661865234, + "logps/rejected": -64.04742431640625, + "loss": 0.6791, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0900612324476242, + "rewards/margins": 0.030776774510741234, + "rewards/rejected": -0.12083800882101059, + "step": 4060 + }, + { + "epoch": 1.4024810475534115, + "grad_norm": 2.1904659271240234, + "learning_rate": 1.240895228334542e-08, + "logits/chosen": -2.9165213108062744, + "logits/rejected": -2.894753932952881, + "logps/chosen": -62.841758728027344, + "logps/rejected": -63.72822189331055, + "loss": 0.6797, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.09068237990140915, + "rewards/margins": 0.029550915583968163, + "rewards/rejected": -0.12023331224918365, + "step": 4070 + }, + { + "epoch": 1.4059269469331497, + "grad_norm": 2.2701804637908936, + "learning_rate": 1.2279271719857196e-08, + "logits/chosen": -2.946728229522705, + "logits/rejected": -2.9265058040618896, + "logps/chosen": -63.28468704223633, + "logps/rejected": -65.9094009399414, + "loss": 0.6745, + "rewards/accuracies": 0.659375011920929, + "rewards/chosen": -0.09204259514808655, + "rewards/margins": 0.04024175554513931, + "rewards/rejected": -0.13228434324264526, + "step": 4080 + }, + { + "epoch": 1.4093728463128876, + "grad_norm": 2.4241137504577637, + "learning_rate": 1.2150051381532137e-08, + "logits/chosen": -2.972846746444702, + "logits/rejected": -2.9496474266052246, + "logps/chosen": -66.7400894165039, + "logps/rejected": -65.05125427246094, + "loss": 0.6803, + "rewards/accuracies": 0.5718749761581421, + "rewards/chosen": -0.08738545328378677, + "rewards/margins": 0.028310012072324753, + "rewards/rejected": -0.11569547653198242, + "step": 4090 + }, + { + "epoch": 1.4128187456926258, + "grad_norm": 2.0823371410369873, + "learning_rate": 1.2021295943452495e-08, + "logits/chosen": -2.9438533782958984, + "logits/rejected": -2.9151291847229004, + "logps/chosen": -65.56358337402344, + "logps/rejected": -65.81645202636719, + "loss": 0.6769, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.090767040848732, + "rewards/margins": 0.03532954305410385, + "rewards/rejected": -0.12609657645225525, + "step": 4100 + }, + { + "epoch": 1.4128187456926258, + "eval_logits/chosen": -3.052527904510498, + "eval_logits/rejected": -3.046738386154175, + "eval_logps/chosen": -64.33756256103516, + "eval_logps/rejected": -70.39879608154297, + "eval_loss": 0.6859830021858215, + "eval_rewards/accuracies": 0.5929368138313293, + "eval_rewards/chosen": -0.05625665932893753, + "eval_rewards/margins": 0.01593007706105709, + "eval_rewards/rejected": -0.07218674570322037, + "eval_runtime": 384.5631, + "eval_samples_per_second": 11.192, + "eval_steps_per_second": 1.399, + "step": 4100 + }, + { + "epoch": 1.416264645072364, + "grad_norm": 2.0488412380218506, + "learning_rate": 1.1893010063880853e-08, + "logits/chosen": -2.9012386798858643, + "logits/rejected": -2.879481792449951, + "logps/chosen": -63.4229621887207, + "logps/rejected": -66.8333969116211, + "loss": 0.6799, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.09323953092098236, + "rewards/margins": 0.029085148125886917, + "rewards/rejected": -0.12232469022274017, + "step": 4110 + }, + { + "epoch": 1.4197105444521019, + "grad_norm": 2.0539770126342773, + "learning_rate": 1.1765198384091577e-08, + "logits/chosen": -2.9786791801452637, + "logits/rejected": -2.954512357711792, + "logps/chosen": -64.86912536621094, + "logps/rejected": -64.17298126220703, + "loss": 0.6774, + "rewards/accuracies": 0.659375011920929, + "rewards/chosen": -0.09158007800579071, + "rewards/margins": 0.03424326702952385, + "rewards/rejected": -0.12582334876060486, + "step": 4120 + }, + { + "epoch": 1.42315644383184, + "grad_norm": 2.2448647022247314, + "learning_rate": 1.1637865528202845e-08, + "logits/chosen": -3.007713794708252, + "logits/rejected": -2.9852776527404785, + "logps/chosen": -65.87146759033203, + "logps/rejected": -67.21484375, + "loss": 0.6768, + "rewards/accuracies": 0.6156250238418579, + "rewards/chosen": -0.08839482069015503, + "rewards/margins": 0.03572607785463333, + "rewards/rejected": -0.12412089109420776, + "step": 4130 + }, + { + "epoch": 1.4266023432115782, + "grad_norm": 2.146552085876465, + "learning_rate": 1.1511016103009425e-08, + "logits/chosen": -2.973470449447632, + "logits/rejected": -2.962482452392578, + "logps/chosen": -63.569091796875, + "logps/rejected": -66.65575408935547, + "loss": 0.6852, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.106827512383461, + "rewards/margins": 0.0185137577354908, + "rewards/rejected": -0.1253412663936615, + "step": 4140 + }, + { + "epoch": 1.4300482425913164, + "grad_norm": 2.1673996448516846, + "learning_rate": 1.1384654697815973e-08, + "logits/chosen": -2.997055768966675, + "logits/rejected": -2.9608774185180664, + "logps/chosen": -67.5733642578125, + "logps/rejected": -66.72795104980469, + "loss": 0.6768, + "rewards/accuracies": 0.659375011920929, + "rewards/chosen": -0.08815548568964005, + "rewards/margins": 0.035506755113601685, + "rewards/rejected": -0.12366221845149994, + "step": 4150 + }, + { + "epoch": 1.4334941419710545, + "grad_norm": 1.9435546398162842, + "learning_rate": 1.1258785884270972e-08, + "logits/chosen": -2.899432420730591, + "logits/rejected": -2.8756940364837646, + "logps/chosen": -62.95705032348633, + "logps/rejected": -64.1615219116211, + "loss": 0.6794, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.08565764874219894, + "rewards/margins": 0.029919400811195374, + "rewards/rejected": -0.11557704210281372, + "step": 4160 + }, + { + "epoch": 1.4369400413507925, + "grad_norm": 2.0451626777648926, + "learning_rate": 1.1133414216201372e-08, + "logits/chosen": -2.969944715499878, + "logits/rejected": -2.9574761390686035, + "logps/chosen": -61.43464279174805, + "logps/rejected": -65.87757873535156, + "loss": 0.6786, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.09905441105365753, + "rewards/margins": 0.03209648281335831, + "rewards/rejected": -0.13115090131759644, + "step": 4170 + }, + { + "epoch": 1.4403859407305306, + "grad_norm": 2.095008134841919, + "learning_rate": 1.1008544229447836e-08, + "logits/chosen": -2.8462460041046143, + "logits/rejected": -2.8191661834716797, + "logps/chosen": -63.70579147338867, + "logps/rejected": -65.39628601074219, + "loss": 0.678, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.0984322726726532, + "rewards/margins": 0.03305213525891304, + "rewards/rejected": -0.13148441910743713, + "step": 4180 + }, + { + "epoch": 1.4438318401102688, + "grad_norm": 1.9745500087738037, + "learning_rate": 1.0884180441700588e-08, + "logits/chosen": -2.981600761413574, + "logits/rejected": -2.9631736278533936, + "logps/chosen": -60.92009353637695, + "logps/rejected": -66.35347747802734, + "loss": 0.6809, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.09771338850259781, + "rewards/margins": 0.02702244557440281, + "rewards/rejected": -0.12473583221435547, + "step": 4190 + }, + { + "epoch": 1.447277739490007, + "grad_norm": 2.2227063179016113, + "learning_rate": 1.0760327352336024e-08, + "logits/chosen": -2.9796640872955322, + "logits/rejected": -2.9522905349731445, + "logps/chosen": -62.57625198364258, + "logps/rejected": -66.95372009277344, + "loss": 0.6722, + "rewards/accuracies": 0.6968749761581421, + "rewards/chosen": -0.09075101464986801, + "rewards/margins": 0.04433668404817581, + "rewards/rejected": -0.13508769869804382, + "step": 4200 + }, + { + "epoch": 1.447277739490007, + "eval_logits/chosen": -3.051344633102417, + "eval_logits/rejected": -3.045560359954834, + "eval_logps/chosen": -64.48454284667969, + "eval_logps/rejected": -70.56287384033203, + "eval_loss": 0.6859112977981567, + "eval_rewards/accuracies": 0.5945631861686707, + "eval_rewards/chosen": -0.057726457715034485, + "eval_rewards/margins": 0.01610107533633709, + "eval_rewards/rejected": -0.07382753491401672, + "eval_runtime": 384.4021, + "eval_samples_per_second": 11.197, + "eval_steps_per_second": 1.4, + "step": 4200 + }, + { + "epoch": 1.450723638869745, + "grad_norm": 2.0472159385681152, + "learning_rate": 1.0636989442253914e-08, + "logits/chosen": -2.865917682647705, + "logits/rejected": -2.8423261642456055, + "logps/chosen": -64.19918060302734, + "logps/rejected": -65.27023315429688, + "loss": 0.6737, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.0954306572675705, + "rewards/margins": 0.041881926357746124, + "rewards/rejected": -0.13731257617473602, + "step": 4210 + }, + { + "epoch": 1.454169538249483, + "grad_norm": 2.3937063217163086, + "learning_rate": 1.0514171173715245e-08, + "logits/chosen": -2.9541773796081543, + "logits/rejected": -2.937562942504883, + "logps/chosen": -63.5893440246582, + "logps/rejected": -66.68025207519531, + "loss": 0.6788, + "rewards/accuracies": 0.621874988079071, + "rewards/chosen": -0.09447337687015533, + "rewards/margins": 0.03139979764819145, + "rewards/rejected": -0.12587317824363708, + "step": 4220 + }, + { + "epoch": 1.4576154376292212, + "grad_norm": 2.123077392578125, + "learning_rate": 1.039187699018085e-08, + "logits/chosen": -2.9126861095428467, + "logits/rejected": -2.8963119983673096, + "logps/chosen": -59.836151123046875, + "logps/rejected": -66.63883972167969, + "loss": 0.675, + "rewards/accuracies": 0.653124988079071, + "rewards/chosen": -0.09426899254322052, + "rewards/margins": 0.038940686732530594, + "rewards/rejected": -0.13320967555046082, + "step": 4230 + }, + { + "epoch": 1.4610613370089593, + "grad_norm": 2.055708408355713, + "learning_rate": 1.0270111316150585e-08, + "logits/chosen": -2.9423460960388184, + "logits/rejected": -2.9130282402038574, + "logps/chosen": -64.48773956298828, + "logps/rejected": -65.76631927490234, + "loss": 0.6773, + "rewards/accuracies": 0.6468750238418579, + "rewards/chosen": -0.09935905039310455, + "rewards/margins": 0.0346347838640213, + "rewards/rejected": -0.13399383425712585, + "step": 4240 + }, + { + "epoch": 1.4645072363886975, + "grad_norm": 2.029475688934326, + "learning_rate": 1.0148878557003299e-08, + "logits/chosen": -2.924720048904419, + "logits/rejected": -2.9136366844177246, + "logps/chosen": -64.05335998535156, + "logps/rejected": -68.36885070800781, + "loss": 0.6771, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.09529970586299896, + "rewards/margins": 0.03515272215008736, + "rewards/rejected": -0.13045242428779602, + "step": 4250 + }, + { + "epoch": 1.4679531357684357, + "grad_norm": 2.112820625305176, + "learning_rate": 1.0028183098837409e-08, + "logits/chosen": -2.9278299808502197, + "logits/rejected": -2.892338275909424, + "logps/chosen": -65.2867202758789, + "logps/rejected": -63.962562561035156, + "loss": 0.6734, + "rewards/accuracies": 0.640625, + "rewards/chosen": -0.09577113389968872, + "rewards/margins": 0.04274382442235947, + "rewards/rejected": -0.13851496577262878, + "step": 4260 + }, + { + "epoch": 1.4713990351481736, + "grad_norm": 2.1950058937072754, + "learning_rate": 9.908029308312266e-09, + "logits/chosen": -2.973106861114502, + "logits/rejected": -2.9561305046081543, + "logps/chosen": -63.97013473510742, + "logps/rejected": -65.02579498291016, + "loss": 0.6825, + "rewards/accuracies": 0.590624988079071, + "rewards/chosen": -0.09679555147886276, + "rewards/margins": 0.024157661944627762, + "rewards/rejected": -0.12095322459936142, + "step": 4270 + }, + { + "epoch": 1.4748449345279118, + "grad_norm": 2.039232015609741, + "learning_rate": 9.788421532490134e-09, + "logits/chosen": -3.002267360687256, + "logits/rejected": -2.986990451812744, + "logps/chosen": -63.63147735595703, + "logps/rejected": -66.7364501953125, + "loss": 0.6781, + "rewards/accuracies": 0.6156250238418579, + "rewards/chosen": -0.09608881175518036, + "rewards/margins": 0.03295501321554184, + "rewards/rejected": -0.1290438175201416, + "step": 4280 + }, + { + "epoch": 1.47829083390765, + "grad_norm": 2.1652369499206543, + "learning_rate": 9.669364098678912e-09, + "logits/chosen": -2.946402072906494, + "logits/rejected": -2.9230525493621826, + "logps/chosen": -63.1927604675293, + "logps/rejected": -66.66388702392578, + "loss": 0.6724, + "rewards/accuracies": 0.671875, + "rewards/chosen": -0.08946090936660767, + "rewards/margins": 0.044275470077991486, + "rewards/rejected": -0.13373637199401855, + "step": 4290 + }, + { + "epoch": 1.481736733287388, + "grad_norm": 2.233903646469116, + "learning_rate": 9.550861314275613e-09, + "logits/chosen": -2.9582359790802, + "logits/rejected": -2.9278035163879395, + "logps/chosen": -63.89642333984375, + "logps/rejected": -64.79170227050781, + "loss": 0.6769, + "rewards/accuracies": 0.6156250238418579, + "rewards/chosen": -0.08680012077093124, + "rewards/margins": 0.03546776622533798, + "rewards/rejected": -0.12226790189743042, + "step": 4300 + }, + { + "epoch": 1.481736733287388, + "eval_logits/chosen": -3.049945592880249, + "eval_logits/rejected": -3.0441927909851074, + "eval_logps/chosen": -64.53496551513672, + "eval_logps/rejected": -70.63489532470703, + "eval_loss": 0.6858123540878296, + "eval_rewards/accuracies": 0.5938661694526672, + "eval_rewards/chosen": -0.05823073908686638, + "eval_rewards/margins": 0.01631702482700348, + "eval_rewards/rejected": -0.07454776763916016, + "eval_runtime": 384.2598, + "eval_samples_per_second": 11.201, + "eval_steps_per_second": 1.4, + "step": 4300 + }, + { + "epoch": 1.4851826326671262, + "grad_norm": 2.15106201171875, + "learning_rate": 9.432917466610505e-09, + "logits/chosen": -2.9038984775543213, + "logits/rejected": -2.8834316730499268, + "logps/chosen": -64.21763610839844, + "logps/rejected": -63.90519332885742, + "loss": 0.6784, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.0975811555981636, + "rewards/margins": 0.032436296343803406, + "rewards/rejected": -0.1300174444913864, + "step": 4310 + }, + { + "epoch": 1.4886285320468642, + "grad_norm": 2.1343038082122803, + "learning_rate": 9.315536822791976e-09, + "logits/chosen": -2.915823459625244, + "logits/rejected": -2.8945200443267822, + "logps/chosen": -62.840476989746094, + "logps/rejected": -64.214111328125, + "loss": 0.6811, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.09117156267166138, + "rewards/margins": 0.02677854336798191, + "rewards/rejected": -0.11795011907815933, + "step": 4320 + }, + { + "epoch": 1.4920744314266023, + "grad_norm": 2.0389392375946045, + "learning_rate": 9.198723629552205e-09, + "logits/chosen": -2.90847110748291, + "logits/rejected": -2.8905720710754395, + "logps/chosen": -63.195213317871094, + "logps/rejected": -65.94998168945312, + "loss": 0.6747, + "rewards/accuracies": 0.653124988079071, + "rewards/chosen": -0.0903761237859726, + "rewards/margins": 0.040145523846149445, + "rewards/rejected": -0.13052165508270264, + "step": 4330 + }, + { + "epoch": 1.4955203308063405, + "grad_norm": 2.3100173473358154, + "learning_rate": 9.08248211309346e-09, + "logits/chosen": -2.9821550846099854, + "logits/rejected": -2.9700443744659424, + "logps/chosen": -61.75278854370117, + "logps/rejected": -66.24479675292969, + "loss": 0.6793, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.10415707528591156, + "rewards/margins": 0.03043574094772339, + "rewards/rejected": -0.13459283113479614, + "step": 4340 + }, + { + "epoch": 1.4989662301860784, + "grad_norm": 2.2711362838745117, + "learning_rate": 8.966816478935255e-09, + "logits/chosen": -2.9720687866210938, + "logits/rejected": -2.9405291080474854, + "logps/chosen": -65.44647979736328, + "logps/rejected": -63.351318359375, + "loss": 0.6772, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.09429587423801422, + "rewards/margins": 0.03446224331855774, + "rewards/rejected": -0.12875810265541077, + "step": 4350 + }, + { + "epoch": 1.5024121295658168, + "grad_norm": 2.1748154163360596, + "learning_rate": 8.851730911762168e-09, + "logits/chosen": -2.936713695526123, + "logits/rejected": -2.917022943496704, + "logps/chosen": -63.976097106933594, + "logps/rejected": -66.04635620117188, + "loss": 0.6801, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.09384501725435257, + "rewards/margins": 0.02860867977142334, + "rewards/rejected": -0.12245368957519531, + "step": 4360 + }, + { + "epoch": 1.5058580289455548, + "grad_norm": 2.08954119682312, + "learning_rate": 8.73722957527242e-09, + "logits/chosen": -2.9445550441741943, + "logits/rejected": -2.9229681491851807, + "logps/chosen": -62.3499870300293, + "logps/rejected": -65.8035888671875, + "loss": 0.6801, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.10512308776378632, + "rewards/margins": 0.028672099113464355, + "rewards/rejected": -0.13379518687725067, + "step": 4370 + }, + { + "epoch": 1.509303928325293, + "grad_norm": 2.039855718612671, + "learning_rate": 8.623316612027284e-09, + "logits/chosen": -2.9334137439727783, + "logits/rejected": -2.9160149097442627, + "logps/chosen": -61.61011505126953, + "logps/rejected": -67.03166198730469, + "loss": 0.6754, + "rewards/accuracies": 0.671875, + "rewards/chosen": -0.09012607485055923, + "rewards/margins": 0.03832302242517471, + "rewards/rejected": -0.12844911217689514, + "step": 4380 + }, + { + "epoch": 1.512749827705031, + "grad_norm": 2.3842740058898926, + "learning_rate": 8.509996143301196e-09, + "logits/chosen": -2.967993974685669, + "logits/rejected": -2.9537787437438965, + "logps/chosen": -61.7550163269043, + "logps/rejected": -65.1753921508789, + "loss": 0.6751, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09670481830835342, + "rewards/margins": 0.03900107368826866, + "rewards/rejected": -0.1357058882713318, + "step": 4390 + }, + { + "epoch": 1.516195727084769, + "grad_norm": 2.1551573276519775, + "learning_rate": 8.397272268932618e-09, + "logits/chosen": -2.9331369400024414, + "logits/rejected": -2.90215802192688, + "logps/chosen": -64.6319580078125, + "logps/rejected": -65.17381286621094, + "loss": 0.6785, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0967051237821579, + "rewards/margins": 0.03241132199764252, + "rewards/rejected": -0.12911644577980042, + "step": 4400 + }, + { + "epoch": 1.516195727084769, + "eval_logits/chosen": -3.049013376235962, + "eval_logits/rejected": -3.0432231426239014, + "eval_logps/chosen": -64.5702896118164, + "eval_logps/rejected": -70.67756652832031, + "eval_loss": 0.6857818365097046, + "eval_rewards/accuracies": 0.5954925417900085, + "eval_rewards/chosen": -0.05858391523361206, + "eval_rewards/margins": 0.016390599310398102, + "eval_rewards/rejected": -0.07497451454401016, + "eval_runtime": 384.4066, + "eval_samples_per_second": 11.196, + "eval_steps_per_second": 1.4, + "step": 4400 + }, + { + "epoch": 1.5196416264645074, + "grad_norm": 2.1354825496673584, + "learning_rate": 8.285149067175734e-09, + "logits/chosen": -2.999840259552002, + "logits/rejected": -2.978172779083252, + "logps/chosen": -60.5115852355957, + "logps/rejected": -64.41769409179688, + "loss": 0.6777, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.0981929823756218, + "rewards/margins": 0.03404136374592781, + "rewards/rejected": -0.1322343647480011, + "step": 4410 + }, + { + "epoch": 1.5230875258442453, + "grad_norm": 2.2247259616851807, + "learning_rate": 8.173630594552924e-09, + "logits/chosen": -2.8461086750030518, + "logits/rejected": -2.8300206661224365, + "logps/chosen": -62.6235237121582, + "logps/rejected": -65.7718734741211, + "loss": 0.6777, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.0915134847164154, + "rewards/margins": 0.03351100534200668, + "rewards/rejected": -0.1250244826078415, + "step": 4420 + }, + { + "epoch": 1.5265334252239835, + "grad_norm": 2.1211180686950684, + "learning_rate": 8.062720885707983e-09, + "logits/chosen": -3.0012969970703125, + "logits/rejected": -2.981078624725342, + "logps/chosen": -61.487266540527344, + "logps/rejected": -65.78907775878906, + "loss": 0.6772, + "rewards/accuracies": 0.628125011920929, + "rewards/chosen": -0.08771520853042603, + "rewards/margins": 0.0344039723277092, + "rewards/rejected": -0.12211918830871582, + "step": 4430 + }, + { + "epoch": 1.5299793246037217, + "grad_norm": 2.0191056728363037, + "learning_rate": 7.95242395326011e-09, + "logits/chosen": -2.9936842918395996, + "logits/rejected": -2.9634318351745605, + "logps/chosen": -65.0488510131836, + "logps/rejected": -66.46080017089844, + "loss": 0.6784, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.09938286244869232, + "rewards/margins": 0.03235219046473503, + "rewards/rejected": -0.13173505663871765, + "step": 4440 + }, + { + "epoch": 1.5334252239834596, + "grad_norm": 2.232675075531006, + "learning_rate": 7.842743787658812e-09, + "logits/chosen": -2.952040910720825, + "logits/rejected": -2.923306941986084, + "logps/chosen": -64.1955337524414, + "logps/rejected": -64.81131744384766, + "loss": 0.6773, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.09505288302898407, + "rewards/margins": 0.03423549234867096, + "rewards/rejected": -0.12928839027881622, + "step": 4450 + }, + { + "epoch": 1.5368711233631978, + "grad_norm": 2.273517370223999, + "learning_rate": 7.733684357039492e-09, + "logits/chosen": -2.9811654090881348, + "logits/rejected": -2.9586715698242188, + "logps/chosen": -66.3079833984375, + "logps/rejected": -66.52787017822266, + "loss": 0.6748, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.09947341680526733, + "rewards/margins": 0.039953988045454025, + "rewards/rejected": -0.13942742347717285, + "step": 4460 + }, + { + "epoch": 1.540317022742936, + "grad_norm": 2.2406179904937744, + "learning_rate": 7.62524960707986e-09, + "logits/chosen": -2.9634737968444824, + "logits/rejected": -2.938974618911743, + "logps/chosen": -64.69285583496094, + "logps/rejected": -65.86080169677734, + "loss": 0.6789, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.09530860185623169, + "rewards/margins": 0.03149518743157387, + "rewards/rejected": -0.12680378556251526, + "step": 4470 + }, + { + "epoch": 1.5437629221226739, + "grad_norm": 2.190739870071411, + "learning_rate": 7.517443460857229e-09, + "logits/chosen": -2.923884391784668, + "logits/rejected": -2.9112801551818848, + "logps/chosen": -64.11064147949219, + "logps/rejected": -67.6056137084961, + "loss": 0.6783, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.0985846072435379, + "rewards/margins": 0.03296193480491638, + "rewards/rejected": -0.1315465271472931, + "step": 4480 + }, + { + "epoch": 1.5472088215024122, + "grad_norm": 2.0800647735595703, + "learning_rate": 7.410269818706574e-09, + "logits/chosen": -2.979024648666382, + "logits/rejected": -2.949497938156128, + "logps/chosen": -63.89508819580078, + "logps/rejected": -64.94587707519531, + "loss": 0.6794, + "rewards/accuracies": 0.640625, + "rewards/chosen": -0.10012755542993546, + "rewards/margins": 0.03033982776105404, + "rewards/rejected": -0.13046738505363464, + "step": 4490 + }, + { + "epoch": 1.5506547208821502, + "grad_norm": 2.15670108795166, + "learning_rate": 7.303732558079379e-09, + "logits/chosen": -2.946776866912842, + "logits/rejected": -2.923444986343384, + "logps/chosen": -64.97744750976562, + "logps/rejected": -66.2874984741211, + "loss": 0.6735, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.08795378357172012, + "rewards/margins": 0.04191059246659279, + "rewards/rejected": -0.12986436486244202, + "step": 4500 + }, + { + "epoch": 1.5506547208821502, + "eval_logits/chosen": -3.04790997505188, + "eval_logits/rejected": -3.042128801345825, + "eval_logps/chosen": -64.68529510498047, + "eval_logps/rejected": -70.79721069335938, + "eval_loss": 0.6857719421386719, + "eval_rewards/accuracies": 0.5920074582099915, + "eval_rewards/chosen": -0.05973397567868233, + "eval_rewards/margins": 0.016436897218227386, + "eval_rewards/rejected": -0.07617087662220001, + "eval_runtime": 384.4992, + "eval_samples_per_second": 11.194, + "eval_steps_per_second": 1.399, + "step": 4500 + }, + { + "epoch": 1.5541006202618883, + "grad_norm": 2.1693248748779297, + "learning_rate": 7.197835533403404e-09, + "logits/chosen": -2.910331964492798, + "logits/rejected": -2.889179229736328, + "logps/chosen": -64.47348022460938, + "logps/rejected": -65.3521957397461, + "loss": 0.6756, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.09691820293664932, + "rewards/margins": 0.037731070071458817, + "rewards/rejected": -0.13464927673339844, + "step": 4510 + }, + { + "epoch": 1.5575465196416265, + "grad_norm": 2.151700496673584, + "learning_rate": 7.092582575943218e-09, + "logits/chosen": -2.9018807411193848, + "logits/rejected": -2.8939027786254883, + "logps/chosen": -61.126686096191406, + "logps/rejected": -65.8957290649414, + "loss": 0.6794, + "rewards/accuracies": 0.621874988079071, + "rewards/chosen": -0.09966224431991577, + "rewards/margins": 0.03021586500108242, + "rewards/rejected": -0.12987811863422394, + "step": 4520 + }, + { + "epoch": 1.5609924190213644, + "grad_norm": 2.032938241958618, + "learning_rate": 6.9879774936615645e-09, + "logits/chosen": -2.9603374004364014, + "logits/rejected": -2.9376699924468994, + "logps/chosen": -64.51271057128906, + "logps/rejected": -65.9655990600586, + "loss": 0.6801, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.10277440398931503, + "rewards/margins": 0.0286547988653183, + "rewards/rejected": -0.13142919540405273, + "step": 4530 + }, + { + "epoch": 1.5644383184011028, + "grad_norm": 2.025911569595337, + "learning_rate": 6.884024071081632e-09, + "logits/chosen": -2.9223618507385254, + "logits/rejected": -2.91190767288208, + "logps/chosen": -62.47723388671875, + "logps/rejected": -67.90995025634766, + "loss": 0.6767, + "rewards/accuracies": 0.628125011920929, + "rewards/chosen": -0.09204110503196716, + "rewards/margins": 0.03594468906521797, + "rewards/rejected": -0.12798579037189484, + "step": 4540 + }, + { + "epoch": 1.5678842177808407, + "grad_norm": 2.2641587257385254, + "learning_rate": 6.7807260691501196e-09, + "logits/chosen": -2.928924560546875, + "logits/rejected": -2.8977210521698, + "logps/chosen": -65.50735473632812, + "logps/rejected": -65.04016876220703, + "loss": 0.6763, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09474433958530426, + "rewards/margins": 0.03602954000234604, + "rewards/rejected": -0.1307738721370697, + "step": 4550 + }, + { + "epoch": 1.571330117160579, + "grad_norm": 2.3552682399749756, + "learning_rate": 6.67808722510112e-09, + "logits/chosen": -2.9768292903900146, + "logits/rejected": -2.945518732070923, + "logps/chosen": -66.51522064208984, + "logps/rejected": -66.87415313720703, + "loss": 0.6733, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.08753177523612976, + "rewards/margins": 0.042732808738946915, + "rewards/rejected": -0.13026458024978638, + "step": 4560 + }, + { + "epoch": 1.574776016540317, + "grad_norm": 2.2089123725891113, + "learning_rate": 6.576111252321001e-09, + "logits/chosen": -2.938915729522705, + "logits/rejected": -2.9010097980499268, + "logps/chosen": -64.55790710449219, + "logps/rejected": -64.71064758300781, + "loss": 0.6716, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.08407676219940186, + "rewards/margins": 0.04665817320346832, + "rewards/rejected": -0.13073492050170898, + "step": 4570 + }, + { + "epoch": 1.578221915920055, + "grad_norm": 1.9244182109832764, + "learning_rate": 6.474801840213995e-09, + "logits/chosen": -2.9675724506378174, + "logits/rejected": -2.9516897201538086, + "logps/chosen": -63.53330612182617, + "logps/rejected": -67.43972778320312, + "loss": 0.6753, + "rewards/accuracies": 0.6468750238418579, + "rewards/chosen": -0.09795354306697845, + "rewards/margins": 0.0386352464556694, + "rewards/rejected": -0.13658878207206726, + "step": 4580 + }, + { + "epoch": 1.5816678152997934, + "grad_norm": 2.2728357315063477, + "learning_rate": 6.3741626540687156e-09, + "logits/chosen": -2.9654381275177, + "logits/rejected": -2.938734531402588, + "logps/chosen": -65.29521179199219, + "logps/rejected": -64.78874206542969, + "loss": 0.6772, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.1050899475812912, + "rewards/margins": 0.03461449593305588, + "rewards/rejected": -0.13970443606376648, + "step": 4590 + }, + { + "epoch": 1.5851137146795313, + "grad_norm": 2.1909267902374268, + "learning_rate": 6.274197334925596e-09, + "logits/chosen": -3.022188186645508, + "logits/rejected": -3.008340358734131, + "logps/chosen": -63.90410614013672, + "logps/rejected": -67.8543930053711, + "loss": 0.6786, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.10399514436721802, + "rewards/margins": 0.03185725957155228, + "rewards/rejected": -0.13585242629051208, + "step": 4600 + }, + { + "epoch": 1.5851137146795313, + "eval_logits/chosen": -3.0471458435058594, + "eval_logits/rejected": -3.0413613319396973, + "eval_logps/chosen": -64.74617004394531, + "eval_logps/rejected": -70.86981964111328, + "eval_loss": 0.6857200264930725, + "eval_rewards/accuracies": 0.5966542959213257, + "eval_rewards/chosen": -0.06034281104803085, + "eval_rewards/margins": 0.016554230824112892, + "eval_rewards/rejected": -0.076897032558918, + "eval_runtime": 383.2954, + "eval_samples_per_second": 11.229, + "eval_steps_per_second": 1.404, + "step": 4600 + }, + { + "epoch": 1.5885596140592695, + "grad_norm": 2.1440834999084473, + "learning_rate": 6.174909499445125e-09, + "logits/chosen": -2.898419141769409, + "logits/rejected": -2.873417377471924, + "logps/chosen": -63.03126907348633, + "logps/rejected": -64.13099670410156, + "loss": 0.6778, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.10101036727428436, + "rewards/margins": 0.03360176831483841, + "rewards/rejected": -0.13461214303970337, + "step": 4610 + }, + { + "epoch": 1.5920055134390076, + "grad_norm": 2.136821746826172, + "learning_rate": 6.07630273977699e-09, + "logits/chosen": -2.8824410438537598, + "logits/rejected": -2.8729777336120605, + "logps/chosen": -62.408958435058594, + "logps/rejected": -66.50130462646484, + "loss": 0.6791, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.096805140376091, + "rewards/margins": 0.03096623346209526, + "rewards/rejected": -0.12777137756347656, + "step": 4620 + }, + { + "epoch": 1.5954514128187456, + "grad_norm": 2.1429455280303955, + "learning_rate": 5.978380623430152e-09, + "logits/chosen": -2.9070467948913574, + "logits/rejected": -2.8850066661834717, + "logps/chosen": -61.453765869140625, + "logps/rejected": -64.62207794189453, + "loss": 0.6805, + "rewards/accuracies": 0.621874988079071, + "rewards/chosen": -0.10029391199350357, + "rewards/margins": 0.027933398261666298, + "rewards/rejected": -0.12822730839252472, + "step": 4630 + }, + { + "epoch": 1.598897312198484, + "grad_norm": 2.195216417312622, + "learning_rate": 5.8811466931437624e-09, + "logits/chosen": -2.99377703666687, + "logits/rejected": -2.9689157009124756, + "logps/chosen": -64.21118927001953, + "logps/rejected": -66.28094482421875, + "loss": 0.6773, + "rewards/accuracies": 0.621874988079071, + "rewards/chosen": -0.09979419410228729, + "rewards/margins": 0.034975916147232056, + "rewards/rejected": -0.13477011024951935, + "step": 4640 + }, + { + "epoch": 1.602343211578222, + "grad_norm": 2.260923385620117, + "learning_rate": 5.784604466758955e-09, + "logits/chosen": -2.8885650634765625, + "logits/rejected": -2.8749794960021973, + "logps/chosen": -64.46866607666016, + "logps/rejected": -66.56623840332031, + "loss": 0.68, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.10299519449472427, + "rewards/margins": 0.028997087851166725, + "rewards/rejected": -0.13199228048324585, + "step": 4650 + }, + { + "epoch": 1.60578911095796, + "grad_norm": 2.0964856147766113, + "learning_rate": 5.688757437091632e-09, + "logits/chosen": -2.9830551147460938, + "logits/rejected": -2.958256244659424, + "logps/chosen": -62.82770538330078, + "logps/rejected": -66.51475524902344, + "loss": 0.6765, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.10538817942142487, + "rewards/margins": 0.03581491857767105, + "rewards/rejected": -0.14120309054851532, + "step": 4660 + }, + { + "epoch": 1.6092350103376982, + "grad_norm": 2.319704294204712, + "learning_rate": 5.593609071806061e-09, + "logits/chosen": -2.9287524223327637, + "logits/rejected": -2.9094161987304688, + "logps/chosen": -64.73773956298828, + "logps/rejected": -67.68860626220703, + "loss": 0.6757, + "rewards/accuracies": 0.6343749761581421, + "rewards/chosen": -0.10544122755527496, + "rewards/margins": 0.03820051625370979, + "rewards/rejected": -0.14364174008369446, + "step": 4670 + }, + { + "epoch": 1.6126809097174362, + "grad_norm": 1.9816330671310425, + "learning_rate": 5.499162813289407e-09, + "logits/chosen": -2.9848568439483643, + "logits/rejected": -2.9498484134674072, + "logps/chosen": -64.04865264892578, + "logps/rejected": -66.0094223022461, + "loss": 0.6731, + "rewards/accuracies": 0.653124988079071, + "rewards/chosen": -0.09432440996170044, + "rewards/margins": 0.04329352080821991, + "rewards/rejected": -0.13761794567108154, + "step": 4680 + }, + { + "epoch": 1.6161268090971743, + "grad_norm": 2.197380781173706, + "learning_rate": 5.405422078527233e-09, + "logits/chosen": -3.0338993072509766, + "logits/rejected": -3.0085062980651855, + "logps/chosen": -63.707061767578125, + "logps/rejected": -64.739013671875, + "loss": 0.6816, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0958305075764656, + "rewards/margins": 0.02577478252351284, + "rewards/rejected": -0.1216052919626236, + "step": 4690 + }, + { + "epoch": 1.6195727084769125, + "grad_norm": 2.2032158374786377, + "learning_rate": 5.312390258979841e-09, + "logits/chosen": -2.8527743816375732, + "logits/rejected": -2.8366029262542725, + "logps/chosen": -63.841766357421875, + "logps/rejected": -67.6465072631836, + "loss": 0.6803, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.10257796943187714, + "rewards/margins": 0.02846847102046013, + "rewards/rejected": -0.13104644417762756, + "step": 4700 + }, + { + "epoch": 1.6195727084769125, + "eval_logits/chosen": -3.0465500354766846, + "eval_logits/rejected": -3.040802240371704, + "eval_logps/chosen": -64.74346160888672, + "eval_logps/rejected": -70.8780746459961, + "eval_loss": 0.6856712698936462, + "eval_rewards/accuracies": 0.597815990447998, + "eval_rewards/chosen": -0.06031567603349686, + "eval_rewards/margins": 0.01666383258998394, + "eval_rewards/rejected": -0.07697951048612595, + "eval_runtime": 382.8006, + "eval_samples_per_second": 11.243, + "eval_steps_per_second": 1.405, + "step": 4700 + }, + { + "epoch": 1.6230186078566504, + "grad_norm": 2.3515686988830566, + "learning_rate": 5.220070720459571e-09, + "logits/chosen": -2.943565845489502, + "logits/rejected": -2.9256954193115234, + "logps/chosen": -64.6860122680664, + "logps/rejected": -65.74381256103516, + "loss": 0.6777, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.1022968515753746, + "rewards/margins": 0.03354020044207573, + "rewards/rejected": -0.13583704829216003, + "step": 4710 + }, + { + "epoch": 1.6264645072363888, + "grad_norm": 2.104708671569824, + "learning_rate": 5.1284668030090485e-09, + "logits/chosen": -2.9260170459747314, + "logits/rejected": -2.917184829711914, + "logps/chosen": -61.97522735595703, + "logps/rejected": -64.06890869140625, + "loss": 0.6824, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.1094181165099144, + "rewards/margins": 0.02429693005979061, + "rewards/rejected": -0.13371506333351135, + "step": 4720 + }, + { + "epoch": 1.6299104066161267, + "grad_norm": 2.6643316745758057, + "learning_rate": 5.037581820780335e-09, + "logits/chosen": -2.986668348312378, + "logits/rejected": -2.9803264141082764, + "logps/chosen": -63.790992736816406, + "logps/rejected": -66.82393646240234, + "loss": 0.6772, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.1097106784582138, + "rewards/margins": 0.035210516303777695, + "rewards/rejected": -0.1449211984872818, + "step": 4730 + }, + { + "epoch": 1.633356305995865, + "grad_norm": 2.132471799850464, + "learning_rate": 4.947419061915037e-09, + "logits/chosen": -2.8456203937530518, + "logits/rejected": -2.8134427070617676, + "logps/chosen": -64.08192443847656, + "logps/rejected": -65.60581970214844, + "loss": 0.6782, + "rewards/accuracies": 0.6156250238418579, + "rewards/chosen": -0.10763104259967804, + "rewards/margins": 0.03280221298336983, + "rewards/rejected": -0.14043326675891876, + "step": 4740 + }, + { + "epoch": 1.636802205375603, + "grad_norm": 1.9625790119171143, + "learning_rate": 4.857981788425305e-09, + "logits/chosen": -2.9289822578430176, + "logits/rejected": -2.9128329753875732, + "logps/chosen": -63.73872756958008, + "logps/rejected": -66.92320251464844, + "loss": 0.6775, + "rewards/accuracies": 0.6656249761581421, + "rewards/chosen": -0.09859279543161392, + "rewards/margins": 0.03389236703515053, + "rewards/rejected": -0.13248515129089355, + "step": 4750 + }, + { + "epoch": 1.640248104755341, + "grad_norm": 2.143996000289917, + "learning_rate": 4.7692732360758634e-09, + "logits/chosen": -2.9664969444274902, + "logits/rejected": -2.951070785522461, + "logps/chosen": -61.64174270629883, + "logps/rejected": -69.74330139160156, + "loss": 0.6756, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.10031326115131378, + "rewards/margins": 0.03815789893269539, + "rewards/rejected": -0.13847115635871887, + "step": 4760 + }, + { + "epoch": 1.6436940041350794, + "grad_norm": 2.208103656768799, + "learning_rate": 4.68129661426693e-09, + "logits/chosen": -2.9453184604644775, + "logits/rejected": -2.9222664833068848, + "logps/chosen": -64.02022552490234, + "logps/rejected": -66.49871826171875, + "loss": 0.6756, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.10512950271368027, + "rewards/margins": 0.03840099275112152, + "rewards/rejected": -0.14353050291538239, + "step": 4770 + }, + { + "epoch": 1.6471399035148173, + "grad_norm": 2.6903138160705566, + "learning_rate": 4.594055105918071e-09, + "logits/chosen": -2.9889702796936035, + "logits/rejected": -2.9868006706237793, + "logps/chosen": -63.49650955200195, + "logps/rejected": -68.11885070800781, + "loss": 0.679, + "rewards/accuracies": 0.6031249761581421, + "rewards/chosen": -0.10073445737361908, + "rewards/margins": 0.031192371621727943, + "rewards/rejected": -0.13192683458328247, + "step": 4780 + }, + { + "epoch": 1.6505858028945555, + "grad_norm": 2.081115245819092, + "learning_rate": 4.507551867353093e-09, + "logits/chosen": -2.967514991760254, + "logits/rejected": -2.9510347843170166, + "logps/chosen": -62.506919860839844, + "logps/rejected": -67.48614501953125, + "loss": 0.6767, + "rewards/accuracies": 0.653124988079071, + "rewards/chosen": -0.1013697013258934, + "rewards/margins": 0.03546713665127754, + "rewards/rejected": -0.13683685660362244, + "step": 4790 + }, + { + "epoch": 1.6540317022742936, + "grad_norm": 2.1103594303131104, + "learning_rate": 4.4217900281858236e-09, + "logits/chosen": -2.9971890449523926, + "logits/rejected": -2.9710536003112793, + "logps/chosen": -65.31468963623047, + "logps/rejected": -67.2812728881836, + "loss": 0.6789, + "rewards/accuracies": 0.621874988079071, + "rewards/chosen": -0.09736104309558868, + "rewards/margins": 0.03208557516336441, + "rewards/rejected": -0.1294466108083725, + "step": 4800 + }, + { + "epoch": 1.6540317022742936, + "eval_logits/chosen": -3.045680284500122, + "eval_logits/rejected": -3.039926052093506, + "eval_logps/chosen": -64.78036499023438, + "eval_logps/rejected": -70.9262924194336, + "eval_loss": 0.6856197118759155, + "eval_rewards/accuracies": 0.5929368138313293, + "eval_rewards/chosen": -0.060684628784656525, + "eval_rewards/margins": 0.016777031123638153, + "eval_rewards/rejected": -0.07746166735887527, + "eval_runtime": 383.212, + "eval_samples_per_second": 11.231, + "eval_steps_per_second": 1.404, + "step": 4800 + }, + { + "epoch": 1.6574776016540316, + "grad_norm": 2.2263193130493164, + "learning_rate": 4.336772691206877e-09, + "logits/chosen": -2.993753433227539, + "logits/rejected": -2.97983455657959, + "logps/chosen": -64.71400451660156, + "logps/rejected": -67.22502136230469, + "loss": 0.6806, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.09266643226146698, + "rewards/margins": 0.02778668701648712, + "rewards/rejected": -0.1204531341791153, + "step": 4810 + }, + { + "epoch": 1.66092350103377, + "grad_norm": 2.1050963401794434, + "learning_rate": 4.252502932271423e-09, + "logits/chosen": -2.9959557056427, + "logits/rejected": -2.966916561126709, + "logps/chosen": -66.26362609863281, + "logps/rejected": -65.68085479736328, + "loss": 0.6776, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.10536062717437744, + "rewards/margins": 0.03362419083714485, + "rewards/rejected": -0.1389847993850708, + "step": 4820 + }, + { + "epoch": 1.664369400413508, + "grad_norm": 2.623863697052002, + "learning_rate": 4.168983800187892e-09, + "logits/chosen": -2.9033761024475098, + "logits/rejected": -2.8900322914123535, + "logps/chosen": -63.26140594482422, + "logps/rejected": -66.57955169677734, + "loss": 0.6819, + "rewards/accuracies": 0.565625011920929, + "rewards/chosen": -0.10230235755443573, + "rewards/margins": 0.025283504277467728, + "rewards/rejected": -0.12758587300777435, + "step": 4830 + }, + { + "epoch": 1.667815299793246, + "grad_norm": 2.2002789974212646, + "learning_rate": 4.086218316607654e-09, + "logits/chosen": -2.985736608505249, + "logits/rejected": -2.9631614685058594, + "logps/chosen": -64.74356079101562, + "logps/rejected": -65.73597717285156, + "loss": 0.6772, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.09450788795948029, + "rewards/margins": 0.035158026963472366, + "rewards/rejected": -0.12966591119766235, + "step": 4840 + }, + { + "epoch": 1.6712611991729842, + "grad_norm": 2.155360460281372, + "learning_rate": 4.004209475915732e-09, + "logits/chosen": -2.8897435665130615, + "logits/rejected": -2.8641912937164307, + "logps/chosen": -64.5123519897461, + "logps/rejected": -67.00016784667969, + "loss": 0.6782, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.10245025157928467, + "rewards/margins": 0.033127255737781525, + "rewards/rejected": -0.1355775147676468, + "step": 4850 + }, + { + "epoch": 1.6747070985527222, + "grad_norm": 2.3200953006744385, + "learning_rate": 3.9229602451224554e-09, + "logits/chosen": -2.875943183898926, + "logits/rejected": -2.8616209030151367, + "logps/chosen": -63.52336502075195, + "logps/rejected": -64.73592376708984, + "loss": 0.6793, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.10029806196689606, + "rewards/margins": 0.030711418017745018, + "rewards/rejected": -0.13100948929786682, + "step": 4860 + }, + { + "epoch": 1.6781529979324605, + "grad_norm": 1.9992374181747437, + "learning_rate": 3.8424735637560965e-09, + "logits/chosen": -2.949007034301758, + "logits/rejected": -2.9328529834747314, + "logps/chosen": -64.01113891601562, + "logps/rejected": -66.80305480957031, + "loss": 0.6794, + "rewards/accuracies": 0.628125011920929, + "rewards/chosen": -0.09905761480331421, + "rewards/margins": 0.030456161126494408, + "rewards/rejected": -0.12951377034187317, + "step": 4870 + }, + { + "epoch": 1.6815988973121985, + "grad_norm": 2.017805576324463, + "learning_rate": 3.762752343756531e-09, + "logits/chosen": -2.971914768218994, + "logits/rejected": -2.9384541511535645, + "logps/chosen": -63.82556915283203, + "logps/rejected": -64.0127182006836, + "loss": 0.6745, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.09669949859380722, + "rewards/margins": 0.04025216028094292, + "rewards/rejected": -0.13695165514945984, + "step": 4880 + }, + { + "epoch": 1.6850447966919366, + "grad_norm": 2.0400807857513428, + "learning_rate": 3.683799469369919e-09, + "logits/chosen": -2.929330825805664, + "logits/rejected": -2.895925521850586, + "logps/chosen": -67.0200424194336, + "logps/rejected": -66.51060485839844, + "loss": 0.6738, + "rewards/accuracies": 0.671875, + "rewards/chosen": -0.09524741768836975, + "rewards/margins": 0.042062025517225266, + "rewards/rejected": -0.13730944693088531, + "step": 4890 + }, + { + "epoch": 1.6884906960716748, + "grad_norm": 2.2557101249694824, + "learning_rate": 3.6056177970442995e-09, + "logits/chosen": -2.9533286094665527, + "logits/rejected": -2.935368061065674, + "logps/chosen": -64.12738037109375, + "logps/rejected": -69.1882553100586, + "loss": 0.6723, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.09020443260669708, + "rewards/margins": 0.04525913670659065, + "rewards/rejected": -0.13546356558799744, + "step": 4900 + }, + { + "epoch": 1.6884906960716748, + "eval_logits/chosen": -3.0447659492492676, + "eval_logits/rejected": -3.0389816761016846, + "eval_logps/chosen": -64.8213119506836, + "eval_logps/rejected": -70.97413635253906, + "eval_loss": 0.6855877041816711, + "eval_rewards/accuracies": 0.5985130071640015, + "eval_rewards/chosen": -0.06109423562884331, + "eval_rewards/margins": 0.016845842823386192, + "eval_rewards/rejected": -0.07794006913900375, + "eval_runtime": 382.7336, + "eval_samples_per_second": 11.245, + "eval_steps_per_second": 1.406, + "step": 4900 + }, + { + "epoch": 1.6919365954514127, + "grad_norm": 2.2069523334503174, + "learning_rate": 3.528210155326289e-09, + "logits/chosen": -2.9314873218536377, + "logits/rejected": -2.9017434120178223, + "logps/chosen": -64.22460174560547, + "logps/rejected": -68.1082992553711, + "loss": 0.6711, + "rewards/accuracies": 0.684374988079071, + "rewards/chosen": -0.09454147517681122, + "rewards/margins": 0.04726605862379074, + "rewards/rejected": -0.14180754125118256, + "step": 4910 + }, + { + "epoch": 1.6953824948311509, + "grad_norm": 2.172011613845825, + "learning_rate": 3.4515793447587342e-09, + "logits/chosen": -2.960524082183838, + "logits/rejected": -2.9422361850738525, + "logps/chosen": -67.4477310180664, + "logps/rejected": -68.8906478881836, + "loss": 0.6813, + "rewards/accuracies": 0.5718749761581421, + "rewards/chosen": -0.10514837503433228, + "rewards/margins": 0.026843750849366188, + "rewards/rejected": -0.1319921314716339, + "step": 4920 + }, + { + "epoch": 1.698828394210889, + "grad_norm": 2.2750051021575928, + "learning_rate": 3.3757281377793793e-09, + "logits/chosen": -2.9778566360473633, + "logits/rejected": -2.9670004844665527, + "logps/chosen": -65.68384552001953, + "logps/rejected": -65.3903579711914, + "loss": 0.6826, + "rewards/accuracies": 0.565625011920929, + "rewards/chosen": -0.10219858586788177, + "rewards/margins": 0.023961419239640236, + "rewards/rejected": -0.12615999579429626, + "step": 4930 + }, + { + "epoch": 1.7022742935906272, + "grad_norm": 2.0987696647644043, + "learning_rate": 3.3006592786205793e-09, + "logits/chosen": -2.9296352863311768, + "logits/rejected": -2.914144992828369, + "logps/chosen": -62.513702392578125, + "logps/rejected": -66.65951538085938, + "loss": 0.6776, + "rewards/accuracies": 0.6031249761581421, + "rewards/chosen": -0.1011534184217453, + "rewards/margins": 0.0337558388710022, + "rewards/rejected": -0.1349092572927475, + "step": 4940 + }, + { + "epoch": 1.7057201929703654, + "grad_norm": 2.069197416305542, + "learning_rate": 3.226375483210017e-09, + "logits/chosen": -2.890801191329956, + "logits/rejected": -2.8730523586273193, + "logps/chosen": -63.73029327392578, + "logps/rejected": -65.79854583740234, + "loss": 0.6735, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.09668199717998505, + "rewards/margins": 0.04293070361018181, + "rewards/rejected": -0.13961270451545715, + "step": 4950 + }, + { + "epoch": 1.7091660923501033, + "grad_norm": 2.062432050704956, + "learning_rate": 3.152879439072409e-09, + "logits/chosen": -3.0091025829315186, + "logits/rejected": -2.9726009368896484, + "logps/chosen": -64.92745208740234, + "logps/rejected": -63.91938400268555, + "loss": 0.674, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.09798876196146011, + "rewards/margins": 0.04130322486162186, + "rewards/rejected": -0.13929200172424316, + "step": 4960 + }, + { + "epoch": 1.7126119917298415, + "grad_norm": 2.0526952743530273, + "learning_rate": 3.0801738052323224e-09, + "logits/chosen": -2.986570358276367, + "logits/rejected": -2.982990264892578, + "logps/chosen": -63.114768981933594, + "logps/rejected": -70.09123229980469, + "loss": 0.6813, + "rewards/accuracies": 0.6156250238418579, + "rewards/chosen": -0.09630588442087173, + "rewards/margins": 0.026602035388350487, + "rewards/rejected": -0.12290791422128677, + "step": 4970 + }, + { + "epoch": 1.7160578911095796, + "grad_norm": 2.1260287761688232, + "learning_rate": 3.0082612121179434e-09, + "logits/chosen": -3.0010061264038086, + "logits/rejected": -2.9745213985443115, + "logps/chosen": -64.26847076416016, + "logps/rejected": -66.21635437011719, + "loss": 0.6814, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.10567764937877655, + "rewards/margins": 0.026792770251631737, + "rewards/rejected": -0.13247041404247284, + "step": 4980 + }, + { + "epoch": 1.7195037904893176, + "grad_norm": 2.227865695953369, + "learning_rate": 2.9371442614659096e-09, + "logits/chosen": -2.9317967891693115, + "logits/rejected": -2.915430784225464, + "logps/chosen": -62.3828125, + "logps/rejected": -66.34077453613281, + "loss": 0.6767, + "rewards/accuracies": 0.640625, + "rewards/chosen": -0.10304798930883408, + "rewards/margins": 0.035633284598588943, + "rewards/rejected": -0.13868127763271332, + "step": 4990 + }, + { + "epoch": 1.722949689869056, + "grad_norm": 2.139073610305786, + "learning_rate": 2.8668255262271985e-09, + "logits/chosen": -2.99995493888855, + "logits/rejected": -2.965244770050049, + "logps/chosen": -64.31925201416016, + "logps/rejected": -66.57722473144531, + "loss": 0.6767, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.09578624367713928, + "rewards/margins": 0.03556639701128006, + "rewards/rejected": -0.13135263323783875, + "step": 5000 + }, + { + "epoch": 1.722949689869056, + "eval_logits/chosen": -3.0446012020111084, + "eval_logits/rejected": -3.0388095378875732, + "eval_logps/chosen": -64.83766174316406, + "eval_logps/rejected": -70.99246215820312, + "eval_loss": 0.6855791211128235, + "eval_rewards/accuracies": 0.5959572196006775, + "eval_rewards/chosen": -0.06125764176249504, + "eval_rewards/margins": 0.016865791752934456, + "eval_rewards/rejected": -0.07812343537807465, + "eval_runtime": 383.0259, + "eval_samples_per_second": 11.237, + "eval_steps_per_second": 1.405, + "step": 5000 + }, + { + "epoch": 1.7263955892487939, + "grad_norm": 2.236734390258789, + "learning_rate": 2.7973075504740317e-09, + "logits/chosen": -2.9484639167785645, + "logits/rejected": -2.921968460083008, + "logps/chosen": -63.20332717895508, + "logps/rejected": -65.99695587158203, + "loss": 0.6749, + "rewards/accuracies": 0.653124988079071, + "rewards/chosen": -0.10177341848611832, + "rewards/margins": 0.039411745965480804, + "rewards/rejected": -0.14118514955043793, + "step": 5010 + }, + { + "epoch": 1.729841488628532, + "grad_norm": 2.2744734287261963, + "learning_rate": 2.7285928493078174e-09, + "logits/chosen": -3.028642177581787, + "logits/rejected": -3.005147933959961, + "logps/chosen": -65.42494201660156, + "logps/rejected": -65.94140625, + "loss": 0.6777, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09308835119009018, + "rewards/margins": 0.033491719514131546, + "rewards/rejected": -0.12658005952835083, + "step": 5020 + }, + { + "epoch": 1.7332873880082702, + "grad_norm": 1.9225391149520874, + "learning_rate": 2.660683908768191e-09, + "logits/chosen": -2.914492130279541, + "logits/rejected": -2.909972667694092, + "logps/chosen": -62.579566955566406, + "logps/rejected": -66.6541976928711, + "loss": 0.6889, + "rewards/accuracies": 0.5218750238418579, + "rewards/chosen": -0.11225111782550812, + "rewards/margins": 0.011178082786500454, + "rewards/rejected": -0.12342919409275055, + "step": 5030 + }, + { + "epoch": 1.7367332873880081, + "grad_norm": 2.368408679962158, + "learning_rate": 2.5935831857430283e-09, + "logits/chosen": -2.9320456981658936, + "logits/rejected": -2.91937255859375, + "logps/chosen": -64.72118377685547, + "logps/rejected": -67.03709411621094, + "loss": 0.6795, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.10224765539169312, + "rewards/margins": 0.030294686555862427, + "rewards/rejected": -0.13254234194755554, + "step": 5040 + }, + { + "epoch": 1.7401791867677465, + "grad_norm": 2.43100905418396, + "learning_rate": 2.527293107879602e-09, + "logits/chosen": -3.0009281635284424, + "logits/rejected": -2.9862048625946045, + "logps/chosen": -64.345458984375, + "logps/rejected": -69.44126892089844, + "loss": 0.6762, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.0966586247086525, + "rewards/margins": 0.03664679080247879, + "rewards/rejected": -0.13330543041229248, + "step": 5050 + }, + { + "epoch": 1.7436250861474845, + "grad_norm": 2.1548216342926025, + "learning_rate": 2.4618160734967168e-09, + "logits/chosen": -2.9960198402404785, + "logits/rejected": -2.989203453063965, + "logps/chosen": -62.73298263549805, + "logps/rejected": -66.20845794677734, + "loss": 0.6796, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.10584567487239838, + "rewards/margins": 0.02997753582894802, + "rewards/rejected": -0.13582322001457214, + "step": 5060 + }, + { + "epoch": 1.7470709855272226, + "grad_norm": 2.455150604248047, + "learning_rate": 2.397154451497957e-09, + "logits/chosen": -2.993448257446289, + "logits/rejected": -2.961956024169922, + "logps/chosen": -65.48920440673828, + "logps/rejected": -66.84199523925781, + "loss": 0.6727, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.08603125810623169, + "rewards/margins": 0.044304654002189636, + "rewards/rejected": -0.13033589720726013, + "step": 5070 + }, + { + "epoch": 1.7505168849069608, + "grad_norm": 2.25144362449646, + "learning_rate": 2.333310581285988e-09, + "logits/chosen": -2.9321632385253906, + "logits/rejected": -2.9152073860168457, + "logps/chosen": -64.05274963378906, + "logps/rejected": -67.70162963867188, + "loss": 0.677, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.10330088436603546, + "rewards/margins": 0.03567297011613846, + "rewards/rejected": -0.13897386193275452, + "step": 5080 + }, + { + "epoch": 1.7539627842866987, + "grad_norm": 2.1542866230010986, + "learning_rate": 2.27028677267789e-09, + "logits/chosen": -2.918034553527832, + "logits/rejected": -2.898874282836914, + "logps/chosen": -63.1301155090332, + "logps/rejected": -66.78999328613281, + "loss": 0.6772, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.09454191476106644, + "rewards/margins": 0.03478521481156349, + "rewards/rejected": -0.12932713329792023, + "step": 5090 + }, + { + "epoch": 1.757408683666437, + "grad_norm": 2.031991720199585, + "learning_rate": 2.2080853058216274e-09, + "logits/chosen": -2.960707426071167, + "logits/rejected": -2.938732385635376, + "logps/chosen": -61.773902893066406, + "logps/rejected": -64.69305419921875, + "loss": 0.6774, + "rewards/accuracies": 0.6468750238418579, + "rewards/chosen": -0.10084456205368042, + "rewards/margins": 0.03442123904824257, + "rewards/rejected": -0.1352657973766327, + "step": 5100 + }, + { + "epoch": 1.757408683666437, + "eval_logits/chosen": -3.0444910526275635, + "eval_logits/rejected": -3.0386948585510254, + "eval_logps/chosen": -64.86609649658203, + "eval_logps/rejected": -71.0175552368164, + "eval_loss": 0.685597836971283, + "eval_rewards/accuracies": 0.5938661694526672, + "eval_rewards/chosen": -0.0615420788526535, + "eval_rewards/margins": 0.01683231070637703, + "eval_rewards/rejected": -0.07837438583374023, + "eval_runtime": 383.0373, + "eval_samples_per_second": 11.237, + "eval_steps_per_second": 1.405, + "step": 5100 + }, + { + "epoch": 1.760854583046175, + "grad_norm": 2.0644540786743164, + "learning_rate": 2.1467084311135226e-09, + "logits/chosen": -2.9693267345428467, + "logits/rejected": -2.9648232460021973, + "logps/chosen": -61.736297607421875, + "logps/rejected": -67.93791198730469, + "loss": 0.6782, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.10537086427211761, + "rewards/margins": 0.032763343304395676, + "rewards/rejected": -0.1381342113018036, + "step": 5110 + }, + { + "epoch": 1.7643004824259132, + "grad_norm": 2.284945011138916, + "learning_rate": 2.0861583691168637e-09, + "logits/chosen": -2.8928122520446777, + "logits/rejected": -2.866619110107422, + "logps/chosen": -62.84453582763672, + "logps/rejected": -66.07553100585938, + "loss": 0.6769, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.10395550727844238, + "rewards/margins": 0.035675663501024246, + "rewards/rejected": -0.13963118195533752, + "step": 5120 + }, + { + "epoch": 1.7677463818056514, + "grad_norm": 2.1467080116271973, + "learning_rate": 2.0264373104815602e-09, + "logits/chosen": -2.9610791206359863, + "logits/rejected": -2.9423394203186035, + "logps/chosen": -65.15470123291016, + "logps/rejected": -66.92606353759766, + "loss": 0.6769, + "rewards/accuracies": 0.640625, + "rewards/chosen": -0.10134553909301758, + "rewards/margins": 0.035156022757291794, + "rewards/rejected": -0.13650156557559967, + "step": 5130 + }, + { + "epoch": 1.7711922811853893, + "grad_norm": 2.283031463623047, + "learning_rate": 1.967547415864862e-09, + "logits/chosen": -2.924994945526123, + "logits/rejected": -2.9106478691101074, + "logps/chosen": -65.27745819091797, + "logps/rejected": -68.17292785644531, + "loss": 0.6774, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.10397525876760483, + "rewards/margins": 0.03433610126376152, + "rewards/rejected": -0.13831135630607605, + "step": 5140 + }, + { + "epoch": 1.7746381805651275, + "grad_norm": 2.338268756866455, + "learning_rate": 1.909490815853232e-09, + "logits/chosen": -2.9108078479766846, + "logits/rejected": -2.9008777141571045, + "logps/chosen": -63.09922409057617, + "logps/rejected": -68.67888641357422, + "loss": 0.6767, + "rewards/accuracies": 0.6343749761581421, + "rewards/chosen": -0.10057171434164047, + "rewards/margins": 0.03636406734585762, + "rewards/rejected": -0.1369357705116272, + "step": 5150 + }, + { + "epoch": 1.7780840799448656, + "grad_norm": 2.083338499069214, + "learning_rate": 1.8522696108852348e-09, + "logits/chosen": -2.9106225967407227, + "logits/rejected": -2.8816580772399902, + "logps/chosen": -66.49388122558594, + "logps/rejected": -66.64405059814453, + "loss": 0.6738, + "rewards/accuracies": 0.659375011920929, + "rewards/chosen": -0.09981705248355865, + "rewards/margins": 0.041750866919755936, + "rewards/rejected": -0.1415679156780243, + "step": 5160 + }, + { + "epoch": 1.7815299793246038, + "grad_norm": 2.3757216930389404, + "learning_rate": 1.795885871175537e-09, + "logits/chosen": -2.967358112335205, + "logits/rejected": -2.9483819007873535, + "logps/chosen": -64.157958984375, + "logps/rejected": -68.16368865966797, + "loss": 0.6767, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.10042033344507217, + "rewards/margins": 0.03579426556825638, + "rewards/rejected": -0.13621459901332855, + "step": 5170 + }, + { + "epoch": 1.784975878704342, + "grad_norm": 2.0755326747894287, + "learning_rate": 1.7403416366400385e-09, + "logits/chosen": -2.9863715171813965, + "logits/rejected": -2.9560065269470215, + "logps/chosen": -64.21257781982422, + "logps/rejected": -64.93058776855469, + "loss": 0.6755, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.0919819101691246, + "rewards/margins": 0.03802384436130524, + "rewards/rejected": -0.13000576198101044, + "step": 5180 + }, + { + "epoch": 1.7884217780840799, + "grad_norm": 2.1907148361206055, + "learning_rate": 1.6856389168220547e-09, + "logits/chosen": -2.8582282066345215, + "logits/rejected": -2.836745262145996, + "logps/chosen": -62.414825439453125, + "logps/rejected": -66.34150695800781, + "loss": 0.6783, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.10486672073602676, + "rewards/margins": 0.0326237753033638, + "rewards/rejected": -0.13749048113822937, + "step": 5190 + }, + { + "epoch": 1.791867677463818, + "grad_norm": 2.2690272331237793, + "learning_rate": 1.6317796908195985e-09, + "logits/chosen": -2.9531779289245605, + "logits/rejected": -2.9294018745422363, + "logps/chosen": -64.02655792236328, + "logps/rejected": -68.70619201660156, + "loss": 0.6748, + "rewards/accuracies": 0.653124988079071, + "rewards/chosen": -0.10051832348108292, + "rewards/margins": 0.03954003006219864, + "rewards/rejected": -0.14005833864212036, + "step": 5200 + }, + { + "epoch": 1.791867677463818, + "eval_logits/chosen": -3.0440940856933594, + "eval_logits/rejected": -3.038301467895508, + "eval_logps/chosen": -64.87361145019531, + "eval_logps/rejected": -71.03768920898438, + "eval_loss": 0.6855441331863403, + "eval_rewards/accuracies": 0.5938661694526672, + "eval_rewards/chosen": -0.06161721795797348, + "eval_rewards/margins": 0.016958480700850487, + "eval_rewards/rejected": -0.07857570797204971, + "eval_runtime": 382.8884, + "eval_samples_per_second": 11.241, + "eval_steps_per_second": 1.405, + "step": 5200 + }, + { + "epoch": 1.7953135768435562, + "grad_norm": 2.0647106170654297, + "learning_rate": 1.5787659072137944e-09, + "logits/chosen": -2.946464776992798, + "logits/rejected": -2.913212299346924, + "logps/chosen": -64.98263549804688, + "logps/rejected": -67.13221740722656, + "loss": 0.6751, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.09663587808609009, + "rewards/margins": 0.03867806866765022, + "rewards/rejected": -0.13531394302845, + "step": 5210 + }, + { + "epoch": 1.7987594762232941, + "grad_norm": 2.2280561923980713, + "learning_rate": 1.5265994839983893e-09, + "logits/chosen": -3.0170464515686035, + "logits/rejected": -3.0034279823303223, + "logps/chosen": -66.4332504272461, + "logps/rejected": -68.14265441894531, + "loss": 0.68, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.1006295308470726, + "rewards/margins": 0.02905629202723503, + "rewards/rejected": -0.12968583405017853, + "step": 5220 + }, + { + "epoch": 1.8022053756030325, + "grad_norm": 2.3320565223693848, + "learning_rate": 1.4752823085103476e-09, + "logits/chosen": -2.971269130706787, + "logits/rejected": -2.9558329582214355, + "logps/chosen": -63.98261642456055, + "logps/rejected": -68.88826751708984, + "loss": 0.6762, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.09884113073348999, + "rewards/margins": 0.03671208769083023, + "rewards/rejected": -0.13555321097373962, + "step": 5230 + }, + { + "epoch": 1.8056512749827704, + "grad_norm": 2.208251953125, + "learning_rate": 1.4248162373615536e-09, + "logits/chosen": -2.8929576873779297, + "logits/rejected": -2.862753391265869, + "logps/chosen": -63.76152801513672, + "logps/rejected": -65.0456771850586, + "loss": 0.6765, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.0982288122177124, + "rewards/margins": 0.03631594032049179, + "rewards/rejected": -0.1345447450876236, + "step": 5240 + }, + { + "epoch": 1.8090971743625086, + "grad_norm": 2.184211015701294, + "learning_rate": 1.37520309637168e-09, + "logits/chosen": -2.9574813842773438, + "logits/rejected": -2.9355812072753906, + "logps/chosen": -62.548095703125, + "logps/rejected": -67.11660766601562, + "loss": 0.6749, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.09293146431446075, + "rewards/margins": 0.03991398215293884, + "rewards/rejected": -0.1328454464673996, + "step": 5250 + }, + { + "epoch": 1.8125430737422468, + "grad_norm": 2.1589388847351074, + "learning_rate": 1.326444680502098e-09, + "logits/chosen": -2.974087715148926, + "logits/rejected": -2.9640023708343506, + "logps/chosen": -63.1976203918457, + "logps/rejected": -67.73504638671875, + "loss": 0.6759, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.10160807520151138, + "rewards/margins": 0.03716858848929405, + "rewards/rejected": -0.13877665996551514, + "step": 5260 + }, + { + "epoch": 1.8159889731219847, + "grad_norm": 2.1708314418792725, + "learning_rate": 1.2785427537909481e-09, + "logits/chosen": -2.9062724113464355, + "logits/rejected": -2.882266044616699, + "logps/chosen": -63.059059143066406, + "logps/rejected": -64.22677612304688, + "loss": 0.6784, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.10223253071308136, + "rewards/margins": 0.03243548795580864, + "rewards/rejected": -0.1346680223941803, + "step": 5270 + }, + { + "epoch": 1.819434872501723, + "grad_norm": 2.188985586166382, + "learning_rate": 1.2314990492893278e-09, + "logits/chosen": -2.9831230640411377, + "logits/rejected": -2.9730265140533447, + "logps/chosen": -65.0916976928711, + "logps/rejected": -68.92658996582031, + "loss": 0.6781, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.10400126874446869, + "rewards/margins": 0.033252011984586716, + "rewards/rejected": -0.1372532844543457, + "step": 5280 + }, + { + "epoch": 1.822880771881461, + "grad_norm": 2.0532279014587402, + "learning_rate": 1.185315268998574e-09, + "logits/chosen": -2.9915385246276855, + "logits/rejected": -2.9743523597717285, + "logps/chosen": -64.66096496582031, + "logps/rejected": -68.3445053100586, + "loss": 0.6768, + "rewards/accuracies": 0.671875, + "rewards/chosen": -0.10006566345691681, + "rewards/margins": 0.035489775240421295, + "rewards/rejected": -0.1355554461479187, + "step": 5290 + }, + { + "epoch": 1.8263266712611992, + "grad_norm": 2.076235055923462, + "learning_rate": 1.1399930838086962e-09, + "logits/chosen": -2.983145236968994, + "logits/rejected": -2.957339286804199, + "logps/chosen": -64.48441314697266, + "logps/rejected": -66.11888122558594, + "loss": 0.6761, + "rewards/accuracies": 0.6343749761581421, + "rewards/chosen": -0.10090861469507217, + "rewards/margins": 0.03716624155640602, + "rewards/rejected": -0.1380748599767685, + "step": 5300 + }, + { + "epoch": 1.8263266712611992, + "eval_logits/chosen": -3.043860912322998, + "eval_logits/rejected": -3.038022518157959, + "eval_logps/chosen": -64.87781524658203, + "eval_logps/rejected": -71.046875, + "eval_loss": 0.685516357421875, + "eval_rewards/accuracies": 0.5950278639793396, + "eval_rewards/chosen": -0.06165924295783043, + "eval_rewards/margins": 0.017008250579237938, + "eval_rewards/rejected": -0.07866749167442322, + "eval_runtime": 382.9671, + "eval_samples_per_second": 11.239, + "eval_steps_per_second": 1.405, + "step": 5300 + }, + { + "epoch": 1.8297725706409373, + "grad_norm": 2.3067526817321777, + "learning_rate": 1.095534133437928e-09, + "logits/chosen": -2.9042887687683105, + "logits/rejected": -2.887679100036621, + "logps/chosen": -62.7168083190918, + "logps/rejected": -65.52967834472656, + "loss": 0.6788, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.10282865911722183, + "rewards/margins": 0.03149380534887314, + "rewards/rejected": -0.13432244956493378, + "step": 5310 + }, + { + "epoch": 1.8332184700206753, + "grad_norm": 2.2261345386505127, + "learning_rate": 1.051940026373399e-09, + "logits/chosen": -2.9474563598632812, + "logits/rejected": -2.9266719818115234, + "logps/chosen": -64.75906372070312, + "logps/rejected": -67.07856750488281, + "loss": 0.6768, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.10096816718578339, + "rewards/margins": 0.03607845678925514, + "rewards/rejected": -0.13704662024974823, + "step": 5320 + }, + { + "epoch": 1.8366643694004137, + "grad_norm": 2.0321414470672607, + "learning_rate": 1.0092123398129343e-09, + "logits/chosen": -2.9617254734039307, + "logits/rejected": -2.944420337677002, + "logps/chosen": -64.52486419677734, + "logps/rejected": -68.56999206542969, + "loss": 0.6761, + "rewards/accuracies": 0.628125011920929, + "rewards/chosen": -0.10365060716867447, + "rewards/margins": 0.0370609350502491, + "rewards/rejected": -0.14071156084537506, + "step": 5330 + }, + { + "epoch": 1.8401102687801516, + "grad_norm": 2.1324450969696045, + "learning_rate": 9.673526196080029e-10, + "logits/chosen": -2.952425003051758, + "logits/rejected": -2.9208550453186035, + "logps/chosen": -64.861328125, + "logps/rejected": -65.89351654052734, + "loss": 0.6745, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.10647694021463394, + "rewards/margins": 0.04073434695601463, + "rewards/rejected": -0.14721128344535828, + "step": 5340 + }, + { + "epoch": 1.8435561681598898, + "grad_norm": 1.9503605365753174, + "learning_rate": 9.263623802078014e-10, + "logits/chosen": -2.9817001819610596, + "logits/rejected": -2.958287000656128, + "logps/chosen": -64.78105926513672, + "logps/rejected": -66.6141357421875, + "loss": 0.6775, + "rewards/accuracies": 0.653124988079071, + "rewards/chosen": -0.09822291880846024, + "rewards/margins": 0.034339554607868195, + "rewards/rejected": -0.13256247341632843, + "step": 5350 + }, + { + "epoch": 1.847002067539628, + "grad_norm": 2.1737399101257324, + "learning_rate": 8.862431046044172e-10, + "logits/chosen": -2.934706449508667, + "logits/rejected": -2.904364824295044, + "logps/chosen": -63.380645751953125, + "logps/rejected": -64.81143951416016, + "loss": 0.6767, + "rewards/accuracies": 0.621874988079071, + "rewards/chosen": -0.10364832729101181, + "rewards/margins": 0.0358717143535614, + "rewards/rejected": -0.13952001929283142, + "step": 5360 + }, + { + "epoch": 1.8504479669193659, + "grad_norm": 2.612893581390381, + "learning_rate": 8.469962442792355e-10, + "logits/chosen": -2.9706287384033203, + "logits/rejected": -2.947335720062256, + "logps/chosen": -64.75955200195312, + "logps/rejected": -66.10575866699219, + "loss": 0.6788, + "rewards/accuracies": 0.590624988079071, + "rewards/chosen": -0.10434827953577042, + "rewards/margins": 0.03156623989343643, + "rewards/rejected": -0.13591453433036804, + "step": 5370 + }, + { + "epoch": 1.853893866299104, + "grad_norm": 2.0904970169067383, + "learning_rate": 8.086232191503839e-10, + "logits/chosen": -2.9142327308654785, + "logits/rejected": -2.8894944190979004, + "logps/chosen": -62.5820198059082, + "logps/rejected": -66.3860855102539, + "loss": 0.6748, + "rewards/accuracies": 0.6468750238418579, + "rewards/chosen": -0.09712487459182739, + "rewards/margins": 0.039482034742832184, + "rewards/rejected": -0.13660690188407898, + "step": 5380 + }, + { + "epoch": 1.8573397656788422, + "grad_norm": 2.241687297821045, + "learning_rate": 7.711254175213705e-10, + "logits/chosen": -2.968531847000122, + "logits/rejected": -2.951683759689331, + "logps/chosen": -64.6425552368164, + "logps/rejected": -67.84783935546875, + "loss": 0.6772, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.09521899372339249, + "rewards/margins": 0.03488728031516075, + "rewards/rejected": -0.13010628521442413, + "step": 5390 + }, + { + "epoch": 1.8607856650585803, + "grad_norm": 2.245168685913086, + "learning_rate": 7.345041960308663e-10, + "logits/chosen": -3.0399162769317627, + "logits/rejected": -3.0135927200317383, + "logps/chosen": -61.74034881591797, + "logps/rejected": -67.44328308105469, + "loss": 0.6738, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.09590132534503937, + "rewards/margins": 0.041902266442775726, + "rewards/rejected": -0.1378035843372345, + "step": 5400 + }, + { + "epoch": 1.8607856650585803, + "eval_logits/chosen": -3.043799638748169, + "eval_logits/rejected": -3.038034200668335, + "eval_logps/chosen": -64.88846588134766, + "eval_logps/rejected": -71.0632553100586, + "eval_loss": 0.6854896545410156, + "eval_rewards/accuracies": 0.5985130071640015, + "eval_rewards/chosen": -0.06176569312810898, + "eval_rewards/margins": 0.017065657302737236, + "eval_rewards/rejected": -0.07883134484291077, + "eval_runtime": 383.0482, + "eval_samples_per_second": 11.236, + "eval_steps_per_second": 1.405, + "step": 5400 + }, + { + "epoch": 1.8642315644383185, + "grad_norm": 1.990185022354126, + "learning_rate": 6.987608796036132e-10, + "logits/chosen": -2.9385836124420166, + "logits/rejected": -2.9050357341766357, + "logps/chosen": -64.62165832519531, + "logps/rejected": -64.18122863769531, + "loss": 0.6744, + "rewards/accuracies": 0.653124988079071, + "rewards/chosen": -0.09384270012378693, + "rewards/margins": 0.0403972789645195, + "rewards/rejected": -0.13424000144004822, + "step": 5410 + }, + { + "epoch": 1.8676774638180564, + "grad_norm": 2.0287582874298096, + "learning_rate": 6.638967614024937e-10, + "logits/chosen": -2.953791856765747, + "logits/rejected": -2.924999713897705, + "logps/chosen": -62.311973571777344, + "logps/rejected": -65.3152847290039, + "loss": 0.6781, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09908489882946014, + "rewards/margins": 0.033005110919475555, + "rewards/rejected": -0.1320900171995163, + "step": 5420 + }, + { + "epoch": 1.8711233631977946, + "grad_norm": 2.1748769283294678, + "learning_rate": 6.299131027817401e-10, + "logits/chosen": -2.963130235671997, + "logits/rejected": -2.9388515949249268, + "logps/chosen": -62.30155563354492, + "logps/rejected": -66.90140533447266, + "loss": 0.6753, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.09591764956712723, + "rewards/margins": 0.038752518594264984, + "rewards/rejected": -0.1346701681613922, + "step": 5430 + }, + { + "epoch": 1.8745692625775328, + "grad_norm": 2.203587770462036, + "learning_rate": 5.968111332413095e-10, + "logits/chosen": -2.9019947052001953, + "logits/rejected": -2.8894219398498535, + "logps/chosen": -62.104217529296875, + "logps/rejected": -67.97303771972656, + "loss": 0.6753, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.09756793826818466, + "rewards/margins": 0.03838181495666504, + "rewards/rejected": -0.1359497606754303, + "step": 5440 + }, + { + "epoch": 1.8780151619572707, + "grad_norm": 2.005970001220703, + "learning_rate": 5.645920503823898e-10, + "logits/chosen": -3.024892568588257, + "logits/rejected": -2.9996724128723145, + "logps/chosen": -63.72404861450195, + "logps/rejected": -65.83196258544922, + "loss": 0.677, + "rewards/accuracies": 0.640625, + "rewards/chosen": -0.09647668153047562, + "rewards/margins": 0.03470000997185707, + "rewards/rejected": -0.13117669522762299, + "step": 5450 + }, + { + "epoch": 1.881461061337009, + "grad_norm": 2.098788022994995, + "learning_rate": 5.332570198640779e-10, + "logits/chosen": -2.9300484657287598, + "logits/rejected": -2.9011282920837402, + "logps/chosen": -65.2178955078125, + "logps/rejected": -67.35456848144531, + "loss": 0.6729, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.09465857595205307, + "rewards/margins": 0.04338352754712105, + "rewards/rejected": -0.13804210722446442, + "step": 5460 + }, + { + "epoch": 1.884906960716747, + "grad_norm": 2.382202625274658, + "learning_rate": 5.028071753612167e-10, + "logits/chosen": -2.869131565093994, + "logits/rejected": -2.8438637256622314, + "logps/chosen": -65.35198974609375, + "logps/rejected": -66.67762756347656, + "loss": 0.678, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.10016383975744247, + "rewards/margins": 0.033315885812044144, + "rewards/rejected": -0.13347972929477692, + "step": 5470 + }, + { + "epoch": 1.8883528600964852, + "grad_norm": 2.279611587524414, + "learning_rate": 4.73243618523353e-10, + "logits/chosen": -2.9345710277557373, + "logits/rejected": -2.9160847663879395, + "logps/chosen": -64.69429779052734, + "logps/rejected": -67.59037780761719, + "loss": 0.6772, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.10485859960317612, + "rewards/margins": 0.03470388427376747, + "rewards/rejected": -0.1395624727010727, + "step": 5480 + }, + { + "epoch": 1.8917987594762233, + "grad_norm": 2.3026857376098633, + "learning_rate": 4.4456741893491023e-10, + "logits/chosen": -2.977466583251953, + "logits/rejected": -2.9550909996032715, + "logps/chosen": -65.40562438964844, + "logps/rejected": -68.69434356689453, + "loss": 0.6757, + "rewards/accuracies": 0.628125011920929, + "rewards/chosen": -0.09687596559524536, + "rewards/margins": 0.03861897066235542, + "rewards/rejected": -0.13549493253231049, + "step": 5490 + }, + { + "epoch": 1.8952446588559613, + "grad_norm": 2.6653425693511963, + "learning_rate": 4.1677961407647345e-10, + "logits/chosen": -2.9842467308044434, + "logits/rejected": -2.9766757488250732, + "logps/chosen": -63.60997772216797, + "logps/rejected": -68.15296173095703, + "loss": 0.6821, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.10921657085418701, + "rewards/margins": 0.024912817403674126, + "rewards/rejected": -0.13412940502166748, + "step": 5500 + }, + { + "epoch": 1.8952446588559613, + "eval_logits/chosen": -3.043705701828003, + "eval_logits/rejected": -3.037916421890259, + "eval_logps/chosen": -64.89189147949219, + "eval_logps/rejected": -71.06378936767578, + "eval_loss": 0.6855042576789856, + "eval_rewards/accuracies": 0.5934014916419983, + "eval_rewards/chosen": -0.06179996207356453, + "eval_rewards/margins": 0.01703677512705326, + "eval_rewards/rejected": -0.07883673906326294, + "eval_runtime": 382.8045, + "eval_samples_per_second": 11.243, + "eval_steps_per_second": 1.405, + "step": 5500 + }, + { + "epoch": 1.8986905582356997, + "grad_norm": 2.232734203338623, + "learning_rate": 3.8988120928726274e-10, + "logits/chosen": -3.009869337081909, + "logits/rejected": -2.984719753265381, + "logps/chosen": -63.23027801513672, + "logps/rejected": -65.56352233886719, + "loss": 0.6737, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.09125113487243652, + "rewards/margins": 0.0418318547308445, + "rewards/rejected": -0.13308298587799072, + "step": 5510 + }, + { + "epoch": 1.9021364576154376, + "grad_norm": 2.130568504333496, + "learning_rate": 3.6387317772875457e-10, + "logits/chosen": -2.999778985977173, + "logits/rejected": -2.9881069660186768, + "logps/chosen": -65.87593841552734, + "logps/rejected": -69.14570617675781, + "loss": 0.6753, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.10248447954654694, + "rewards/margins": 0.03881872445344925, + "rewards/rejected": -0.1413031965494156, + "step": 5520 + }, + { + "epoch": 1.9055823569951758, + "grad_norm": 2.2380268573760986, + "learning_rate": 3.3875646034947634e-10, + "logits/chosen": -2.9597513675689697, + "logits/rejected": -2.928396224975586, + "logps/chosen": -62.51630401611328, + "logps/rejected": -67.06792449951172, + "loss": 0.6739, + "rewards/accuracies": 0.640625, + "rewards/chosen": -0.09396801888942719, + "rewards/margins": 0.04169601574540138, + "rewards/rejected": -0.13566403090953827, + "step": 5530 + }, + { + "epoch": 1.909028256374914, + "grad_norm": 2.1978256702423096, + "learning_rate": 3.145319658509699e-10, + "logits/chosen": -2.9985365867614746, + "logits/rejected": -2.961423635482788, + "logps/chosen": -63.85062789916992, + "logps/rejected": -65.30555725097656, + "loss": 0.6709, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.09511172026395798, + "rewards/margins": 0.04764672368764877, + "rewards/rejected": -0.14275844395160675, + "step": 5540 + }, + { + "epoch": 1.9124741557546519, + "grad_norm": 2.0679409503936768, + "learning_rate": 2.9120057065490365e-10, + "logits/chosen": -2.8995628356933594, + "logits/rejected": -2.882148265838623, + "logps/chosen": -64.40314483642578, + "logps/rejected": -66.68975830078125, + "loss": 0.6783, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.10052362829446793, + "rewards/margins": 0.032626282423734665, + "rewards/rejected": -0.1331499069929123, + "step": 5550 + }, + { + "epoch": 1.9159200551343902, + "grad_norm": 2.1227855682373047, + "learning_rate": 2.687631188713735e-10, + "logits/chosen": -3.0305912494659424, + "logits/rejected": -3.0032315254211426, + "logps/chosen": -67.49004364013672, + "logps/rejected": -68.33811950683594, + "loss": 0.675, + "rewards/accuracies": 0.628125011920929, + "rewards/chosen": -0.10781770944595337, + "rewards/margins": 0.039582930505275726, + "rewards/rejected": -0.1474006474018097, + "step": 5560 + }, + { + "epoch": 1.9193659545141282, + "grad_norm": 2.0987863540649414, + "learning_rate": 2.4722042226835993e-10, + "logits/chosen": -2.889064073562622, + "logits/rejected": -2.8677499294281006, + "logps/chosen": -65.31095886230469, + "logps/rejected": -65.89826965332031, + "loss": 0.6793, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.10466232150793076, + "rewards/margins": 0.0305057130753994, + "rewards/rejected": -0.13516804575920105, + "step": 5570 + }, + { + "epoch": 1.9228118538938663, + "grad_norm": 2.0254909992218018, + "learning_rate": 2.2657326024235755e-10, + "logits/chosen": -3.000871419906616, + "logits/rejected": -2.9850192070007324, + "logps/chosen": -64.88275909423828, + "logps/rejected": -66.03553771972656, + "loss": 0.6822, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.09776648133993149, + "rewards/margins": 0.02495993860065937, + "rewards/rejected": -0.12272641807794571, + "step": 5580 + }, + { + "epoch": 1.9262577532736045, + "grad_norm": 2.1231415271759033, + "learning_rate": 2.0682237979018636e-10, + "logits/chosen": -2.937507152557373, + "logits/rejected": -2.9114415645599365, + "logps/chosen": -65.44194030761719, + "logps/rejected": -63.282936096191406, + "loss": 0.6772, + "rewards/accuracies": 0.628125011920929, + "rewards/chosen": -0.09877943992614746, + "rewards/margins": 0.03481840714812279, + "rewards/rejected": -0.13359785079956055, + "step": 5590 + }, + { + "epoch": 1.9297036526533424, + "grad_norm": 2.185314893722534, + "learning_rate": 1.8796849548195215e-10, + "logits/chosen": -3.00420880317688, + "logits/rejected": -2.9785873889923096, + "logps/chosen": -63.3000373840332, + "logps/rejected": -66.32828521728516, + "loss": 0.6724, + "rewards/accuracies": 0.6468750238418579, + "rewards/chosen": -0.09202561527490616, + "rewards/margins": 0.04433347284793854, + "rewards/rejected": -0.1363590806722641, + "step": 5600 + }, + { + "epoch": 1.9297036526533424, + "eval_logits/chosen": -3.0436623096466064, + "eval_logits/rejected": -3.037909507751465, + "eval_logps/chosen": -64.89786529541016, + "eval_logps/rejected": -71.06352996826172, + "eval_loss": 0.6855349540710449, + "eval_rewards/accuracies": 0.5954925417900085, + "eval_rewards/chosen": -0.06185971945524216, + "eval_rewards/margins": 0.016974303871393204, + "eval_rewards/rejected": -0.07883401960134506, + "eval_runtime": 383.1627, + "eval_samples_per_second": 11.233, + "eval_steps_per_second": 1.404, + "step": 5600 + }, + { + "epoch": 1.9331495520330806, + "grad_norm": 2.23110032081604, + "learning_rate": 1.7001228943520075e-10, + "logits/chosen": -2.909824848175049, + "logits/rejected": -2.8948612213134766, + "logps/chosen": -61.312904357910156, + "logps/rejected": -68.531982421875, + "loss": 0.6767, + "rewards/accuracies": 0.628125011920929, + "rewards/chosen": -0.10082676261663437, + "rewards/margins": 0.03634323552250862, + "rewards/rejected": -0.1371699869632721, + "step": 5610 + }, + { + "epoch": 1.9365954514128187, + "grad_norm": 2.072162628173828, + "learning_rate": 1.5295441129024312e-10, + "logits/chosen": -2.9221153259277344, + "logits/rejected": -2.9079785346984863, + "logps/chosen": -65.98023986816406, + "logps/rejected": -65.65779113769531, + "loss": 0.6825, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.1077071875333786, + "rewards/margins": 0.024027761071920395, + "rewards/rejected": -0.1317349374294281, + "step": 5620 + }, + { + "epoch": 1.940041350792557, + "grad_norm": 2.2267067432403564, + "learning_rate": 1.3679547818664927e-10, + "logits/chosen": -2.920483112335205, + "logits/rejected": -2.8899428844451904, + "logps/chosen": -65.67730712890625, + "logps/rejected": -66.02433013916016, + "loss": 0.6754, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.09616454690694809, + "rewards/margins": 0.03814741224050522, + "rewards/rejected": -0.1343119889497757, + "step": 5630 + }, + { + "epoch": 1.943487250172295, + "grad_norm": 2.207374095916748, + "learning_rate": 1.2153607474091332e-10, + "logits/chosen": -2.9759020805358887, + "logits/rejected": -2.9456405639648438, + "logps/chosen": -65.62007141113281, + "logps/rejected": -65.35276794433594, + "loss": 0.6748, + "rewards/accuracies": 0.6656249761581421, + "rewards/chosen": -0.09680736809968948, + "rewards/margins": 0.039464615285396576, + "rewards/rejected": -0.13627198338508606, + "step": 5640 + }, + { + "epoch": 1.946933149552033, + "grad_norm": 2.3040826320648193, + "learning_rate": 1.0717675302531482e-10, + "logits/chosen": -2.946089029312134, + "logits/rejected": -2.925248861312866, + "logps/chosen": -68.79296875, + "logps/rejected": -67.72799682617188, + "loss": 0.6776, + "rewards/accuracies": 0.621874988079071, + "rewards/chosen": -0.09351503849029541, + "rewards/margins": 0.03450186923146248, + "rewards/rejected": -0.1280169039964676, + "step": 5650 + }, + { + "epoch": 1.9503790489317712, + "grad_norm": 2.2648158073425293, + "learning_rate": 9.371803254794308e-11, + "logits/chosen": -3.002150297164917, + "logits/rejected": -2.984257459640503, + "logps/chosen": -62.06635665893555, + "logps/rejected": -66.76404571533203, + "loss": 0.6714, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.09374018013477325, + "rewards/margins": 0.046740688383579254, + "rewards/rejected": -0.1404808610677719, + "step": 5660 + }, + { + "epoch": 1.9538249483115093, + "grad_norm": 2.20288348197937, + "learning_rate": 8.116040023388448e-11, + "logits/chosen": -2.9187135696411133, + "logits/rejected": -2.8906636238098145, + "logps/chosen": -64.13285064697266, + "logps/rejected": -65.82918548583984, + "loss": 0.6736, + "rewards/accuracies": 0.640625, + "rewards/chosen": -0.1016378179192543, + "rewards/margins": 0.04218579828739166, + "rewards/rejected": -0.14382360875606537, + "step": 5670 + }, + { + "epoch": 1.9572708476912473, + "grad_norm": 2.0635271072387695, + "learning_rate": 6.950431040763371e-11, + "logits/chosen": -2.9942047595977783, + "logits/rejected": -2.9737515449523926, + "logps/chosen": -65.24699401855469, + "logps/rejected": -66.28753662109375, + "loss": 0.6781, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.10666315257549286, + "rewards/margins": 0.033177699893713, + "rewards/rejected": -0.13984087109565735, + "step": 5680 + }, + { + "epoch": 1.9607167470709856, + "grad_norm": 2.0154037475585938, + "learning_rate": 5.875018477663752e-11, + "logits/chosen": -2.9185800552368164, + "logits/rejected": -2.8994216918945312, + "logps/chosen": -64.3661880493164, + "logps/rejected": -65.51478576660156, + "loss": 0.6779, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.10311891883611679, + "rewards/margins": 0.03355059400200844, + "rewards/rejected": -0.13666951656341553, + "step": 5690 + }, + { + "epoch": 1.9641626464507236, + "grad_norm": 2.2101991176605225, + "learning_rate": 4.8898412416040203e-11, + "logits/chosen": -2.9987101554870605, + "logits/rejected": -2.971622943878174, + "logps/chosen": -63.42491912841797, + "logps/rejected": -67.65690612792969, + "loss": 0.6745, + "rewards/accuracies": 0.640625, + "rewards/chosen": -0.09306361526250839, + "rewards/margins": 0.04037128761410713, + "rewards/rejected": -0.13343490660190582, + "step": 5700 + }, + { + "epoch": 1.9641626464507236, + "eval_logits/chosen": -3.0438084602355957, + "eval_logits/rejected": -3.037998914718628, + "eval_logps/chosen": -64.90370178222656, + "eval_logps/rejected": -71.07880401611328, + "eval_loss": 0.685490071773529, + "eval_rewards/accuracies": 0.595724880695343, + "eval_rewards/chosen": -0.06191807612776756, + "eval_rewards/margins": 0.01706887222826481, + "eval_rewards/rejected": -0.07898694276809692, + "eval_runtime": 382.8267, + "eval_samples_per_second": 11.243, + "eval_steps_per_second": 1.405, + "step": 5700 + }, + { + "epoch": 1.9676085458304617, + "grad_norm": 2.2737042903900146, + "learning_rate": 3.994934975461439e-11, + "logits/chosen": -2.9715285301208496, + "logits/rejected": -2.9414966106414795, + "logps/chosen": -64.19334411621094, + "logps/rejected": -65.5436782836914, + "loss": 0.6778, + "rewards/accuracies": 0.6156250238418579, + "rewards/chosen": -0.09507995843887329, + "rewards/margins": 0.033717669546604156, + "rewards/rejected": -0.12879762053489685, + "step": 5710 + }, + { + "epoch": 1.9710544452102, + "grad_norm": 2.0696685314178467, + "learning_rate": 3.190332056186018e-11, + "logits/chosen": -2.9221713542938232, + "logits/rejected": -2.9031994342803955, + "logps/chosen": -64.0558853149414, + "logps/rejected": -65.50578308105469, + "loss": 0.6817, + "rewards/accuracies": 0.590624988079071, + "rewards/chosen": -0.10295647382736206, + "rewards/margins": 0.02570858970284462, + "rewards/rejected": -0.12866505980491638, + "step": 5720 + }, + { + "epoch": 1.9745003445899378, + "grad_norm": 2.249941110610962, + "learning_rate": 2.4760615936289532e-11, + "logits/chosen": -2.905991315841675, + "logits/rejected": -2.884575366973877, + "logps/chosen": -64.17007446289062, + "logps/rejected": -67.10668182373047, + "loss": 0.6746, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.10047076642513275, + "rewards/margins": 0.04041288048028946, + "rewards/rejected": -0.1408836394548416, + "step": 5730 + }, + { + "epoch": 1.9779462439696762, + "grad_norm": 2.211496353149414, + "learning_rate": 1.8521494294898578e-11, + "logits/chosen": -2.997009754180908, + "logits/rejected": -2.9652652740478516, + "logps/chosen": -66.19706726074219, + "logps/rejected": -64.80754089355469, + "loss": 0.6729, + "rewards/accuracies": 0.640625, + "rewards/chosen": -0.09728576242923737, + "rewards/margins": 0.04374868795275688, + "rewards/rejected": -0.14103445410728455, + "step": 5740 + }, + { + "epoch": 1.9813921433494142, + "grad_norm": 1.8820128440856934, + "learning_rate": 1.318618136381955e-11, + "logits/chosen": -2.9602997303009033, + "logits/rejected": -2.9354488849639893, + "logps/chosen": -62.84728240966797, + "logps/rejected": -63.922279357910156, + "loss": 0.6815, + "rewards/accuracies": 0.621874988079071, + "rewards/chosen": -0.10357487201690674, + "rewards/margins": 0.026113441213965416, + "rewards/rejected": -0.1296883076429367, + "step": 5750 + }, + { + "epoch": 1.9848380427291523, + "grad_norm": 2.149010181427002, + "learning_rate": 8.75487017014953e-12, + "logits/chosen": -2.9395554065704346, + "logits/rejected": -2.9065656661987305, + "logps/chosen": -66.78532409667969, + "logps/rejected": -65.26570129394531, + "loss": 0.6724, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.09145772457122803, + "rewards/margins": 0.04497409611940384, + "rewards/rejected": -0.13643182814121246, + "step": 5760 + }, + { + "epoch": 1.9882839421088905, + "grad_norm": 2.0296709537506104, + "learning_rate": 5.227721034969934e-12, + "logits/chosen": -2.9788923263549805, + "logits/rejected": -2.9554014205932617, + "logps/chosen": -61.77091598510742, + "logps/rejected": -65.63267517089844, + "loss": 0.6753, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.0966632291674614, + "rewards/margins": 0.038477830588817596, + "rewards/rejected": -0.1351410448551178, + "step": 5770 + }, + { + "epoch": 1.9917298414886284, + "grad_norm": 2.1751058101654053, + "learning_rate": 2.6048615675483555e-12, + "logits/chosen": -2.9429450035095215, + "logits/rejected": -2.9181408882141113, + "logps/chosen": -63.3074836730957, + "logps/rejected": -65.84387969970703, + "loss": 0.6777, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.10151276737451553, + "rewards/margins": 0.033877789974212646, + "rewards/rejected": -0.13539054989814758, + "step": 5780 + }, + { + "epoch": 1.9951757408683668, + "grad_norm": 2.299027919769287, + "learning_rate": 8.863866607144999e-13, + "logits/chosen": -2.9754512310028076, + "logits/rejected": -2.9556355476379395, + "logps/chosen": -66.12899780273438, + "logps/rejected": -69.43649291992188, + "loss": 0.6805, + "rewards/accuracies": 0.596875011920929, + "rewards/chosen": -0.10944394022226334, + "rewards/margins": 0.028050214052200317, + "rewards/rejected": -0.13749414682388306, + "step": 5790 + }, + { + "epoch": 1.9986216402481047, + "grad_norm": 2.1351449489593506, + "learning_rate": 7.235848743236683e-14, + "logits/chosen": -2.9115071296691895, + "logits/rejected": -2.88586688041687, + "logps/chosen": -64.3592758178711, + "logps/rejected": -65.3360366821289, + "loss": 0.6767, + "rewards/accuracies": 0.640625, + "rewards/chosen": -0.09383897483348846, + "rewards/margins": 0.03549923375248909, + "rewards/rejected": -0.12933818995952606, + "step": 5800 + }, + { + "epoch": 1.9986216402481047, + "eval_logits/chosen": -3.0438718795776367, + "eval_logits/rejected": -3.0380868911743164, + "eval_logps/chosen": -64.8961181640625, + "eval_logps/rejected": -71.05843353271484, + "eval_loss": 0.6855542659759521, + "eval_rewards/accuracies": 0.5954925417900085, + "eval_rewards/chosen": -0.06184223294258118, + "eval_rewards/margins": 0.016940835863351822, + "eval_rewards/rejected": -0.0787830725312233, + "eval_runtime": 383.0406, + "eval_samples_per_second": 11.236, + "eval_steps_per_second": 1.405, + "step": 5800 + }, + { + "epoch": 2.0, + "step": 5804, + "total_flos": 0.0, + "train_loss": 0.6830481951767456, + "train_runtime": 69306.1943, + "train_samples_per_second": 2.68, + "train_steps_per_second": 0.084 + } + ], + "logging_steps": 10, + "max_steps": 5804, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}