diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4496 @@ +{ + "best_metric": 0.5855222344398499, + "best_model_checkpoint": "data/tinyllama_moe_dpo_ultrafeedback_v2_epochs3/checkpoint-2700", + "epoch": 2.998430141287284, + "eval_steps": 100, + "global_step": 2865, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 5.208333333333333e-09, + "logits/chosen": -3.0106258392333984, + "logits/rejected": -3.0041162967681885, + "logps/chosen": -291.6616516113281, + "logps/rejected": -273.537353515625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.01, + "learning_rate": 5.208333333333333e-08, + "logits/chosen": -3.0320074558258057, + "logits/rejected": -2.934544801712036, + "logps/chosen": -352.8655090332031, + "logps/rejected": -284.1784362792969, + "loss": 0.6931, + "rewards/accuracies": 0.4236111044883728, + "rewards/chosen": -0.00030098477145656943, + "rewards/margins": 6.371454219333827e-05, + "rewards/rejected": -0.00036469934275373816, + "step": 10 + }, + { + "epoch": 0.02, + "learning_rate": 1.0416666666666667e-07, + "logits/chosen": -2.9808428287506104, + "logits/rejected": -2.9612295627593994, + "logps/chosen": -309.6392822265625, + "logps/rejected": -278.2618103027344, + "loss": 0.6931, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.0001357152796117589, + "rewards/margins": 0.00045497194514609873, + "rewards/rejected": -0.0003192565927747637, + "step": 20 + }, + { + "epoch": 0.03, + "learning_rate": 1.5624999999999999e-07, + "logits/chosen": -3.041250705718994, + "logits/rejected": -2.9839859008789062, + "logps/chosen": -342.7677917480469, + "logps/rejected": -301.0032653808594, + "loss": 0.6933, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.00031170996953733265, + "rewards/margins": -0.00037957567838020623, + "rewards/rejected": 6.78658252581954e-05, + "step": 30 + }, + { + "epoch": 0.04, + "learning_rate": 2.0833333333333333e-07, + "logits/chosen": -3.019380807876587, + "logits/rejected": -2.974083423614502, + "logps/chosen": -331.848876953125, + "logps/rejected": -276.879150390625, + "loss": 0.6933, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -0.00034866915666498244, + "rewards/margins": -0.00015269347932189703, + "rewards/rejected": -0.00019597564823925495, + "step": 40 + }, + { + "epoch": 0.05, + "learning_rate": 2.604166666666667e-07, + "logits/chosen": -3.025217294692993, + "logits/rejected": -2.984266757965088, + "logps/chosen": -347.17523193359375, + "logps/rejected": -309.79034423828125, + "loss": 0.6932, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": -0.0004488878767006099, + "rewards/margins": -0.0009045412880368531, + "rewards/rejected": 0.0004556533822324127, + "step": 50 + }, + { + "epoch": 0.06, + "learning_rate": 3.1249999999999997e-07, + "logits/chosen": -3.008871078491211, + "logits/rejected": -2.947890281677246, + "logps/chosen": -348.37127685546875, + "logps/rejected": -318.5699462890625, + "loss": 0.6931, + "rewards/accuracies": 0.46875, + "rewards/chosen": -1.417404564563185e-05, + "rewards/margins": -0.00020001048687845469, + "rewards/rejected": 0.0001858363684732467, + "step": 60 + }, + { + "epoch": 0.07, + "learning_rate": 3.645833333333333e-07, + "logits/chosen": -3.078416585922241, + "logits/rejected": -3.026381492614746, + "logps/chosen": -382.86102294921875, + "logps/rejected": -335.41156005859375, + "loss": 0.6928, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": 0.0005409394507296383, + "rewards/margins": 0.0006863707094453275, + "rewards/rejected": -0.00014543140423484147, + "step": 70 + }, + { + "epoch": 0.08, + "learning_rate": 4.1666666666666667e-07, + "logits/chosen": -2.9282450675964355, + "logits/rejected": -2.872313976287842, + "logps/chosen": -355.75653076171875, + "logps/rejected": -294.4638366699219, + "loss": 0.6925, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.0010749772191047668, + "rewards/margins": 0.0013450259575620294, + "rewards/rejected": -0.0002700486802496016, + "step": 80 + }, + { + "epoch": 0.09, + "learning_rate": 4.6874999999999996e-07, + "logits/chosen": -3.0248095989227295, + "logits/rejected": -2.9788899421691895, + "logps/chosen": -348.9649963378906, + "logps/rejected": -311.44647216796875, + "loss": 0.6918, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": 0.0013544766698032618, + "rewards/margins": 0.0028665403369814157, + "rewards/rejected": -0.001512063667178154, + "step": 90 + }, + { + "epoch": 0.1, + "learning_rate": 4.999974255581648e-07, + "logits/chosen": -2.998736619949341, + "logits/rejected": -2.972041606903076, + "logps/chosen": -367.4033203125, + "logps/rejected": -341.64483642578125, + "loss": 0.6914, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.00238443398848176, + "rewards/margins": 0.0027559935115277767, + "rewards/rejected": -0.00037155949394218624, + "step": 100 + }, + { + "epoch": 0.1, + "eval_logits/chosen": -3.040524482727051, + "eval_logits/rejected": -2.987576484680176, + "eval_logps/chosen": -344.61151123046875, + "eval_logps/rejected": -302.75537109375, + "eval_loss": 0.6913198828697205, + "eval_rewards/accuracies": 0.6349206566810608, + "eval_rewards/chosen": 0.0043304311111569405, + "eval_rewards/margins": 0.004837073851376772, + "eval_rewards/rejected": -0.0005066432058811188, + "eval_runtime": 244.6034, + "eval_samples_per_second": 8.177, + "eval_steps_per_second": 0.258, + "step": 100 + }, + { + "epoch": 0.12, + "learning_rate": 4.999684636964402e-07, + "logits/chosen": -3.017376661300659, + "logits/rejected": -2.9261746406555176, + "logps/chosen": -315.4708557128906, + "logps/rejected": -251.8899383544922, + "loss": 0.6908, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": 0.0034377477131783962, + "rewards/margins": 0.0038212400395423174, + "rewards/rejected": -0.00038349232636392117, + "step": 110 + }, + { + "epoch": 0.13, + "learning_rate": 4.999073256611217e-07, + "logits/chosen": -2.9883151054382324, + "logits/rejected": -2.910341262817383, + "logps/chosen": -361.08331298828125, + "logps/rejected": -289.36883544921875, + "loss": 0.6904, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.007036352995783091, + "rewards/margins": 0.008973561227321625, + "rewards/rejected": -0.0019372075330466032, + "step": 120 + }, + { + "epoch": 0.14, + "learning_rate": 4.998140193219545e-07, + "logits/chosen": -3.0366291999816895, + "logits/rejected": -2.9711904525756836, + "logps/chosen": -381.4322204589844, + "logps/rejected": -310.4883728027344, + "loss": 0.6896, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.00792085099965334, + "rewards/margins": 0.0068405852653086185, + "rewards/rejected": 0.0010802658507600427, + "step": 130 + }, + { + "epoch": 0.15, + "learning_rate": 4.996885566894172e-07, + "logits/chosen": -2.9757721424102783, + "logits/rejected": -2.955821990966797, + "logps/chosen": -286.1941833496094, + "logps/rejected": -260.9352111816406, + "loss": 0.6888, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": 0.007854754105210304, + "rewards/margins": 0.008457413874566555, + "rewards/rejected": -0.0006026608753018081, + "step": 140 + }, + { + "epoch": 0.16, + "learning_rate": 4.995309539131771e-07, + "logits/chosen": -3.035512685775757, + "logits/rejected": -3.00437593460083, + "logps/chosen": -335.96539306640625, + "logps/rejected": -335.1185607910156, + "loss": 0.6891, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.00691782683134079, + "rewards/margins": 0.003046808298677206, + "rewards/rejected": 0.0038710187654942274, + "step": 150 + }, + { + "epoch": 0.17, + "learning_rate": 4.993412312800101e-07, + "logits/chosen": -3.0231873989105225, + "logits/rejected": -2.9608006477355957, + "logps/chosen": -354.9385681152344, + "logps/rejected": -332.57757568359375, + "loss": 0.687, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": 0.010783755220472813, + "rewards/margins": 0.010257494635879993, + "rewards/rejected": 0.0005262610502541065, + "step": 160 + }, + { + "epoch": 0.18, + "learning_rate": 4.991194132111906e-07, + "logits/chosen": -3.037797212600708, + "logits/rejected": -2.991992473602295, + "logps/chosen": -325.5256042480469, + "logps/rejected": -280.1507263183594, + "loss": 0.686, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.015455600805580616, + "rewards/margins": 0.014618036337196827, + "rewards/rejected": 0.0008375659817829728, + "step": 170 + }, + { + "epoch": 0.19, + "learning_rate": 4.988655282593471e-07, + "logits/chosen": -2.967064619064331, + "logits/rejected": -2.900017261505127, + "logps/chosen": -299.7720947265625, + "logps/rejected": -267.1083984375, + "loss": 0.6848, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": 0.011125795543193817, + "rewards/margins": 0.019041184335947037, + "rewards/rejected": -0.00791538879275322, + "step": 180 + }, + { + "epoch": 0.2, + "learning_rate": 4.985796091047871e-07, + "logits/chosen": -3.0344669818878174, + "logits/rejected": -2.965481996536255, + "logps/chosen": -342.3845520019531, + "logps/rejected": -299.60418701171875, + "loss": 0.6836, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": 0.017673885449767113, + "rewards/margins": 0.02089458890259266, + "rewards/rejected": -0.0032207041513174772, + "step": 190 + }, + { + "epoch": 0.21, + "learning_rate": 4.982616925512907e-07, + "logits/chosen": -2.9737305641174316, + "logits/rejected": -2.9367403984069824, + "logps/chosen": -344.54833984375, + "logps/rejected": -315.73516845703125, + "loss": 0.6836, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.02112628147006035, + "rewards/margins": 0.02438109926879406, + "rewards/rejected": -0.0032548182643949986, + "step": 200 + }, + { + "epoch": 0.21, + "eval_logits/chosen": -3.024318218231201, + "eval_logits/rejected": -2.9700303077697754, + "eval_logps/chosen": -343.5496826171875, + "eval_logps/rejected": -303.65081787109375, + "eval_loss": 0.6830371022224426, + "eval_rewards/accuracies": 0.6448412537574768, + "eval_rewards/chosen": 0.01494832057505846, + "eval_rewards/margins": 0.0244095791131258, + "eval_rewards/rejected": -0.009461257606744766, + "eval_runtime": 246.2124, + "eval_samples_per_second": 8.123, + "eval_steps_per_second": 0.256, + "step": 200 + }, + { + "epoch": 0.22, + "learning_rate": 4.979118195213728e-07, + "logits/chosen": -3.0241358280181885, + "logits/rejected": -2.944836378097534, + "logps/chosen": -367.56805419921875, + "logps/rejected": -298.8295593261719, + "loss": 0.682, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": 0.01484000962227583, + "rewards/margins": 0.023524824529886246, + "rewards/rejected": -0.008684814907610416, + "step": 210 + }, + { + "epoch": 0.23, + "learning_rate": 4.975300350510161e-07, + "logits/chosen": -3.002933979034424, + "logits/rejected": -2.9467215538024902, + "logps/chosen": -350.8382263183594, + "logps/rejected": -319.9858703613281, + "loss": 0.682, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.01789170131087303, + "rewards/margins": 0.02975825034081936, + "rewards/rejected": -0.011866547167301178, + "step": 220 + }, + { + "epoch": 0.24, + "learning_rate": 4.971163882838732e-07, + "logits/chosen": -2.9935097694396973, + "logits/rejected": -2.9317831993103027, + "logps/chosen": -349.8140563964844, + "logps/rejected": -288.98077392578125, + "loss": 0.6769, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.011691467836499214, + "rewards/margins": 0.029249072074890137, + "rewards/rejected": -0.017557602375745773, + "step": 230 + }, + { + "epoch": 0.25, + "learning_rate": 4.966709324649415e-07, + "logits/chosen": -2.9749813079833984, + "logits/rejected": -2.9110968112945557, + "logps/chosen": -335.0533447265625, + "logps/rejected": -266.52215576171875, + "loss": 0.6787, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.01340649463236332, + "rewards/margins": 0.027955498546361923, + "rewards/rejected": -0.014549002051353455, + "step": 240 + }, + { + "epoch": 0.26, + "learning_rate": 4.961937249337091e-07, + "logits/chosen": -2.9835305213928223, + "logits/rejected": -2.9534316062927246, + "logps/chosen": -320.1576232910156, + "logps/rejected": -320.8951721191406, + "loss": 0.68, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.009979739785194397, + "rewards/margins": 0.03077465295791626, + "rewards/rejected": -0.020794907584786415, + "step": 250 + }, + { + "epoch": 0.27, + "learning_rate": 4.956848271167743e-07, + "logits/chosen": -2.9606919288635254, + "logits/rejected": -2.9051098823547363, + "logps/chosen": -341.62701416015625, + "logps/rejected": -306.46734619140625, + "loss": 0.6771, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.004492092411965132, + "rewards/margins": 0.0419074110686779, + "rewards/rejected": -0.03741531819105148, + "step": 260 + }, + { + "epoch": 0.28, + "learning_rate": 4.951443045199382e-07, + "logits/chosen": -3.039965867996216, + "logits/rejected": -2.974046230316162, + "logps/chosen": -353.82366943359375, + "logps/rejected": -296.0869140625, + "loss": 0.6743, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.013023038394749165, + "rewards/margins": 0.053806107491254807, + "rewards/rejected": -0.04078306630253792, + "step": 270 + }, + { + "epoch": 0.29, + "learning_rate": 4.945722267197731e-07, + "logits/chosen": -3.0149953365325928, + "logits/rejected": -2.9855613708496094, + "logps/chosen": -353.22808837890625, + "logps/rejected": -332.72735595703125, + "loss": 0.6759, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.0013278704136610031, + "rewards/margins": 0.0351131446659565, + "rewards/rejected": -0.03644100949168205, + "step": 280 + }, + { + "epoch": 0.3, + "learning_rate": 4.939686673546668e-07, + "logits/chosen": -2.979219675064087, + "logits/rejected": -2.940520763397217, + "logps/chosen": -338.106201171875, + "logps/rejected": -298.94110107421875, + "loss": 0.6718, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.008899662643671036, + "rewards/margins": 0.0386703684926033, + "rewards/rejected": -0.047570034861564636, + "step": 290 + }, + { + "epoch": 0.31, + "learning_rate": 4.93333704115343e-07, + "logits/chosen": -2.9941365718841553, + "logits/rejected": -2.884251117706299, + "logps/chosen": -341.53387451171875, + "logps/rejected": -282.03814697265625, + "loss": 0.6662, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.00807519443333149, + "rewards/margins": 0.05831586569547653, + "rewards/rejected": -0.06639105826616287, + "step": 300 + }, + { + "epoch": 0.31, + "eval_logits/chosen": -2.9976212978363037, + "eval_logits/rejected": -2.942260980606079, + "eval_logps/chosen": -346.3836364746094, + "eval_logps/rejected": -309.570068359375, + "eval_loss": 0.6711614727973938, + "eval_rewards/accuracies": 0.6746031641960144, + "eval_rewards/chosen": -0.013390865176916122, + "eval_rewards/margins": 0.05526304244995117, + "eval_rewards/rejected": -0.06865391135215759, + "eval_runtime": 248.4655, + "eval_samples_per_second": 8.049, + "eval_steps_per_second": 0.254, + "step": 300 + }, + { + "epoch": 0.32, + "learning_rate": 4.926674187348617e-07, + "logits/chosen": -2.9200305938720703, + "logits/rejected": -2.890380382537842, + "logps/chosen": -342.61090087890625, + "logps/rejected": -319.4290466308594, + "loss": 0.6655, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.016244180500507355, + "rewards/margins": 0.06994068622589111, + "rewards/rejected": -0.08618486672639847, + "step": 310 + }, + { + "epoch": 0.33, + "learning_rate": 4.91969896978098e-07, + "logits/chosen": -2.929490804672241, + "logits/rejected": -2.892176866531372, + "logps/chosen": -334.2842712402344, + "logps/rejected": -318.81976318359375, + "loss": 0.6658, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.013753254897892475, + "rewards/margins": 0.05771704763174057, + "rewards/rejected": -0.07147030532360077, + "step": 320 + }, + { + "epoch": 0.35, + "learning_rate": 4.912412286307025e-07, + "logits/chosen": -2.908052921295166, + "logits/rejected": -2.8586883544921875, + "logps/chosen": -334.84564208984375, + "logps/rejected": -278.5325927734375, + "loss": 0.6698, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.014625480398535728, + "rewards/margins": 0.06231003254652023, + "rewards/rejected": -0.07693551480770111, + "step": 330 + }, + { + "epoch": 0.36, + "learning_rate": 4.904815074875438e-07, + "logits/chosen": -2.9351956844329834, + "logits/rejected": -2.882476806640625, + "logps/chosen": -297.5643310546875, + "logps/rejected": -270.01666259765625, + "loss": 0.6703, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.05379326269030571, + "rewards/margins": 0.038004521280527115, + "rewards/rejected": -0.09179778397083282, + "step": 340 + }, + { + "epoch": 0.37, + "learning_rate": 4.896908313406355e-07, + "logits/chosen": -2.9030866622924805, + "logits/rejected": -2.8945038318634033, + "logps/chosen": -336.22027587890625, + "logps/rejected": -334.49365234375, + "loss": 0.6683, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.05760540813207626, + "rewards/margins": 0.0570923313498497, + "rewards/rejected": -0.11469773948192596, + "step": 350 + }, + { + "epoch": 0.38, + "learning_rate": 4.88869301966548e-07, + "logits/chosen": -2.961442232131958, + "logits/rejected": -2.9141147136688232, + "logps/chosen": -337.873046875, + "logps/rejected": -292.5345153808594, + "loss": 0.6599, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.07577836513519287, + "rewards/margins": 0.0639306977391243, + "rewards/rejected": -0.13970905542373657, + "step": 360 + }, + { + "epoch": 0.39, + "learning_rate": 4.880170251133081e-07, + "logits/chosen": -2.8861405849456787, + "logits/rejected": -2.873425006866455, + "logps/chosen": -290.9164123535156, + "logps/rejected": -303.1529846191406, + "loss": 0.6579, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.0845533087849617, + "rewards/margins": 0.07466953992843628, + "rewards/rejected": -0.15922284126281738, + "step": 370 + }, + { + "epoch": 0.4, + "learning_rate": 4.871341104867864e-07, + "logits/chosen": -2.9816460609436035, + "logits/rejected": -2.9324944019317627, + "logps/chosen": -363.70806884765625, + "logps/rejected": -322.94158935546875, + "loss": 0.6615, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.07034312188625336, + "rewards/margins": 0.08779822289943695, + "rewards/rejected": -0.15814131498336792, + "step": 380 + }, + { + "epoch": 0.41, + "learning_rate": 4.862206717365765e-07, + "logits/chosen": -2.8958492279052734, + "logits/rejected": -2.8334128856658936, + "logps/chosen": -334.65643310546875, + "logps/rejected": -287.338134765625, + "loss": 0.6634, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.06828784197568893, + "rewards/margins": 0.08057762682437897, + "rewards/rejected": -0.1488654762506485, + "step": 390 + }, + { + "epoch": 0.42, + "learning_rate": 4.852768264413655e-07, + "logits/chosen": -2.973942756652832, + "logits/rejected": -2.9239089488983154, + "logps/chosen": -374.6542663574219, + "logps/rejected": -325.08868408203125, + "loss": 0.6538, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.0820477306842804, + "rewards/margins": 0.0960758849978447, + "rewards/rejected": -0.1781235933303833, + "step": 400 + }, + { + "epoch": 0.42, + "eval_logits/chosen": -2.954821825027466, + "eval_logits/rejected": -2.897944450378418, + "eval_logps/chosen": -353.1802062988281, + "eval_logps/rejected": -320.7437744140625, + "eval_loss": 0.6571324467658997, + "eval_rewards/accuracies": 0.6765872836112976, + "eval_rewards/chosen": -0.08135689049959183, + "eval_rewards/margins": 0.09903378784656525, + "eval_rewards/rejected": -0.18039065599441528, + "eval_runtime": 240.8082, + "eval_samples_per_second": 8.305, + "eval_steps_per_second": 0.262, + "step": 400 + }, + { + "epoch": 0.43, + "learning_rate": 4.843026960937995e-07, + "logits/chosen": -3.005959987640381, + "logits/rejected": -2.960634708404541, + "logps/chosen": -376.323974609375, + "logps/rejected": -351.9470520019531, + "loss": 0.6532, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.0597182922065258, + "rewards/margins": 0.10858096927404404, + "rewards/rejected": -0.16829927265644073, + "step": 410 + }, + { + "epoch": 0.44, + "learning_rate": 4.832984060848445e-07, + "logits/chosen": -2.883970260620117, + "logits/rejected": -2.8291678428649902, + "logps/chosen": -314.81866455078125, + "logps/rejected": -272.30511474609375, + "loss": 0.652, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.0751439556479454, + "rewards/margins": 0.10195841640233994, + "rewards/rejected": -0.17710237205028534, + "step": 420 + }, + { + "epoch": 0.45, + "learning_rate": 4.822640856876464e-07, + "logits/chosen": -2.9058735370635986, + "logits/rejected": -2.877462863922119, + "logps/chosen": -320.7926940917969, + "logps/rejected": -292.57293701171875, + "loss": 0.6553, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.09108451753854752, + "rewards/margins": 0.0795835480093956, + "rewards/rejected": -0.1706680804491043, + "step": 430 + }, + { + "epoch": 0.46, + "learning_rate": 4.811998680408907e-07, + "logits/chosen": -2.9125401973724365, + "logits/rejected": -2.903066873550415, + "logps/chosen": -323.45245361328125, + "logps/rejected": -306.0644226074219, + "loss": 0.6524, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.13419905304908752, + "rewards/margins": 0.06887730956077576, + "rewards/rejected": -0.2030763328075409, + "step": 440 + }, + { + "epoch": 0.47, + "learning_rate": 4.801058901316645e-07, + "logits/chosen": -2.8140056133270264, + "logits/rejected": -2.7843329906463623, + "logps/chosen": -326.67822265625, + "logps/rejected": -301.6144714355469, + "loss": 0.647, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.11106200516223907, + "rewards/margins": 0.14083310961723328, + "rewards/rejected": -0.25189512968063354, + "step": 450 + }, + { + "epoch": 0.48, + "learning_rate": 4.78982292777824e-07, + "logits/chosen": -2.905949354171753, + "logits/rejected": -2.8415050506591797, + "logps/chosen": -332.97406005859375, + "logps/rejected": -316.91937255859375, + "loss": 0.6423, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.121739961206913, + "rewards/margins": 0.10120918601751328, + "rewards/rejected": -0.22294914722442627, + "step": 460 + }, + { + "epoch": 0.49, + "learning_rate": 4.778292206098673e-07, + "logits/chosen": -2.9177937507629395, + "logits/rejected": -2.8414828777313232, + "logps/chosen": -376.5325927734375, + "logps/rejected": -326.441162109375, + "loss": 0.6372, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.11956751346588135, + "rewards/margins": 0.19230221211910248, + "rewards/rejected": -0.311869740486145, + "step": 470 + }, + { + "epoch": 0.5, + "learning_rate": 4.7664682205231877e-07, + "logits/chosen": -2.83701753616333, + "logits/rejected": -2.8011679649353027, + "logps/chosen": -291.61614990234375, + "logps/rejected": -290.9491882324219, + "loss": 0.6612, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.15953049063682556, + "rewards/margins": 0.07381532341241837, + "rewards/rejected": -0.23334583640098572, + "step": 480 + }, + { + "epoch": 0.51, + "learning_rate": 4.754352493046224e-07, + "logits/chosen": -2.9087753295898438, + "logits/rejected": -2.8327887058258057, + "logps/chosen": -348.1717224121094, + "logps/rejected": -318.3050842285156, + "loss": 0.6416, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.16357417404651642, + "rewards/margins": 0.13152363896369934, + "rewards/rejected": -0.29509779810905457, + "step": 490 + }, + { + "epoch": 0.52, + "learning_rate": 4.741946583215514e-07, + "logits/chosen": -2.8774545192718506, + "logits/rejected": -2.8319413661956787, + "logps/chosen": -337.0196838378906, + "logps/rejected": -319.65899658203125, + "loss": 0.6405, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.17791475355625153, + "rewards/margins": 0.15151168406009674, + "rewards/rejected": -0.32942643761634827, + "step": 500 + }, + { + "epoch": 0.52, + "eval_logits/chosen": -2.9120142459869385, + "eval_logits/rejected": -2.8540806770324707, + "eval_logps/chosen": -364.5343933105469, + "eval_logps/rejected": -337.2181396484375, + "eval_loss": 0.6447514891624451, + "eval_rewards/accuracies": 0.6726190447807312, + "eval_rewards/chosen": -0.19489827752113342, + "eval_rewards/margins": 0.1502356082201004, + "eval_rewards/rejected": -0.345133900642395, + "eval_runtime": 248.0229, + "eval_samples_per_second": 8.064, + "eval_steps_per_second": 0.254, + "step": 500 + }, + { + "epoch": 0.53, + "learning_rate": 4.729252087931332e-07, + "logits/chosen": -2.8862080574035645, + "logits/rejected": -2.8038384914398193, + "logps/chosen": -377.8186340332031, + "logps/rejected": -323.3064880371094, + "loss": 0.6386, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.17217716574668884, + "rewards/margins": 0.19356265664100647, + "rewards/rejected": -0.3657398223876953, + "step": 510 + }, + { + "epoch": 0.54, + "learning_rate": 4.716270641240941e-07, + "logits/chosen": -2.8480124473571777, + "logits/rejected": -2.811933994293213, + "logps/chosen": -320.3323974609375, + "logps/rejected": -317.38763427734375, + "loss": 0.6423, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.148501917719841, + "rewards/margins": 0.18107430636882782, + "rewards/rejected": -0.3295762240886688, + "step": 520 + }, + { + "epoch": 0.55, + "learning_rate": 4.703003914128258e-07, + "logits/chosen": -2.847687244415283, + "logits/rejected": -2.8126273155212402, + "logps/chosen": -355.64996337890625, + "logps/rejected": -330.8966369628906, + "loss": 0.6401, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.15957526862621307, + "rewards/margins": 0.15057061612606049, + "rewards/rejected": -0.31014585494995117, + "step": 530 + }, + { + "epoch": 0.57, + "learning_rate": 4.689453614298758e-07, + "logits/chosen": -2.8674135208129883, + "logits/rejected": -2.8252620697021484, + "logps/chosen": -375.3172302246094, + "logps/rejected": -377.0331726074219, + "loss": 0.6343, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.169221892952919, + "rewards/margins": 0.15300126373767853, + "rewards/rejected": -0.32222312688827515, + "step": 540 + }, + { + "epoch": 0.58, + "learning_rate": 4.6756214859596645e-07, + "logits/chosen": -2.8661575317382812, + "logits/rejected": -2.80385684967041, + "logps/chosen": -347.62969970703125, + "logps/rejected": -327.3890380859375, + "loss": 0.6463, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.24611690640449524, + "rewards/margins": 0.16446547210216522, + "rewards/rejected": -0.41058236360549927, + "step": 550 + }, + { + "epoch": 0.59, + "learning_rate": 4.661509309595426e-07, + "logits/chosen": -2.8666725158691406, + "logits/rejected": -2.801342487335205, + "logps/chosen": -344.13330078125, + "logps/rejected": -313.7489929199219, + "loss": 0.6312, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.24062931537628174, + "rewards/margins": 0.20030991733074188, + "rewards/rejected": -0.4409392476081848, + "step": 560 + }, + { + "epoch": 0.6, + "learning_rate": 4.647118901738537e-07, + "logits/chosen": -2.8669309616088867, + "logits/rejected": -2.7961795330047607, + "logps/chosen": -360.96478271484375, + "logps/rejected": -339.89886474609375, + "loss": 0.644, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.25550180673599243, + "rewards/margins": 0.14098383486270905, + "rewards/rejected": -0.39648565649986267, + "step": 570 + }, + { + "epoch": 0.61, + "learning_rate": 4.632452114735706e-07, + "logits/chosen": -2.814235210418701, + "logits/rejected": -2.757559061050415, + "logps/chosen": -350.2564697265625, + "logps/rejected": -310.9596252441406, + "loss": 0.6359, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.24496586620807648, + "rewards/margins": 0.14976012706756592, + "rewards/rejected": -0.3947260081768036, + "step": 580 + }, + { + "epoch": 0.62, + "learning_rate": 4.617510836509424e-07, + "logits/chosen": -2.8700594902038574, + "logits/rejected": -2.832390069961548, + "logps/chosen": -341.0650329589844, + "logps/rejected": -324.2391357421875, + "loss": 0.638, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.1659560203552246, + "rewards/margins": 0.10084130614995956, + "rewards/rejected": -0.26679736375808716, + "step": 590 + }, + { + "epoch": 0.63, + "learning_rate": 4.602296990314949e-07, + "logits/chosen": -2.8588919639587402, + "logits/rejected": -2.809976100921631, + "logps/chosen": -410.419189453125, + "logps/rejected": -396.41949462890625, + "loss": 0.6394, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.17672498524188995, + "rewards/margins": 0.19581544399261475, + "rewards/rejected": -0.3725404143333435, + "step": 600 + }, + { + "epoch": 0.63, + "eval_logits/chosen": -2.8732762336730957, + "eval_logits/rejected": -2.814659595489502, + "eval_logps/chosen": -368.0754089355469, + "eval_logps/rejected": -344.1863098144531, + "eval_loss": 0.6372315883636475, + "eval_rewards/accuracies": 0.682539701461792, + "eval_rewards/chosen": -0.2303084284067154, + "eval_rewards/margins": 0.18450765311717987, + "eval_rewards/rejected": -0.41481611132621765, + "eval_runtime": 248.1999, + "eval_samples_per_second": 8.058, + "eval_steps_per_second": 0.254, + "step": 600 + }, + { + "epoch": 0.64, + "learning_rate": 4.5868125344927397e-07, + "logits/chosen": -2.881749153137207, + "logits/rejected": -2.810695171356201, + "logps/chosen": -356.07464599609375, + "logps/rejected": -302.9112854003906, + "loss": 0.6283, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.22698974609375, + "rewards/margins": 0.1910962164402008, + "rewards/rejected": -0.4180859625339508, + "step": 610 + }, + { + "epoch": 0.65, + "learning_rate": 4.5710594622163814e-07, + "logits/chosen": -2.8739027976989746, + "logits/rejected": -2.8048624992370605, + "logps/chosen": -364.158935546875, + "logps/rejected": -319.1551208496094, + "loss": 0.6286, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.29370805621147156, + "rewards/margins": 0.15704287588596344, + "rewards/rejected": -0.4507509171962738, + "step": 620 + }, + { + "epoch": 0.66, + "learning_rate": 4.555039801236017e-07, + "logits/chosen": -2.746525287628174, + "logits/rejected": -2.707292079925537, + "logps/chosen": -338.1776123046875, + "logps/rejected": -335.0467224121094, + "loss": 0.6272, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.21730093657970428, + "rewards/margins": 0.19230973720550537, + "rewards/rejected": -0.40961068868637085, + "step": 630 + }, + { + "epoch": 0.67, + "learning_rate": 4.538755613617336e-07, + "logits/chosen": -2.791337251663208, + "logits/rejected": -2.7343456745147705, + "logps/chosen": -348.16802978515625, + "logps/rejected": -334.0476989746094, + "loss": 0.6273, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.23033007979393005, + "rewards/margins": 0.19607330858707428, + "rewards/rejected": -0.42640337347984314, + "step": 640 + }, + { + "epoch": 0.68, + "learning_rate": 4.522208995476145e-07, + "logits/chosen": -2.889570713043213, + "logits/rejected": -2.7822928428649902, + "logps/chosen": -401.09234619140625, + "logps/rejected": -364.5537414550781, + "loss": 0.6171, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.23586571216583252, + "rewards/margins": 0.25756314396858215, + "rewards/rejected": -0.4934287965297699, + "step": 650 + }, + { + "epoch": 0.69, + "learning_rate": 4.50540207670855e-07, + "logits/chosen": -2.8644819259643555, + "logits/rejected": -2.8281166553497314, + "logps/chosen": -379.15887451171875, + "logps/rejected": -360.2496337890625, + "loss": 0.6235, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.24248214066028595, + "rewards/margins": 0.2142285406589508, + "rewards/rejected": -0.45671066641807556, + "step": 660 + }, + { + "epoch": 0.7, + "learning_rate": 4.488337020716798e-07, + "logits/chosen": -2.8308560848236084, + "logits/rejected": -2.8049094676971436, + "logps/chosen": -365.39263916015625, + "logps/rejected": -360.0053405761719, + "loss": 0.6242, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.26678240299224854, + "rewards/margins": 0.21639254689216614, + "rewards/rejected": -0.4831749498844147, + "step": 670 + }, + { + "epoch": 0.71, + "learning_rate": 4.4710160241307993e-07, + "logits/chosen": -2.765575885772705, + "logits/rejected": -2.7461342811584473, + "logps/chosen": -347.0950012207031, + "logps/rejected": -324.2687683105469, + "loss": 0.6322, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.3175194561481476, + "rewards/margins": 0.09245363622903824, + "rewards/rejected": -0.4099730849266052, + "step": 680 + }, + { + "epoch": 0.72, + "learning_rate": 4.453441316525376e-07, + "logits/chosen": -2.7600603103637695, + "logits/rejected": -2.700854539871216, + "logps/chosen": -349.8055725097656, + "logps/rejected": -332.6409606933594, + "loss": 0.63, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.3004549443721771, + "rewards/margins": 0.1717255860567093, + "rewards/rejected": -0.4721805155277252, + "step": 690 + }, + { + "epoch": 0.73, + "learning_rate": 4.435615160133268e-07, + "logits/chosen": -2.791268825531006, + "logits/rejected": -2.7100212574005127, + "logps/chosen": -335.6754150390625, + "logps/rejected": -332.56280517578125, + "loss": 0.6218, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.34759417176246643, + "rewards/margins": 0.21199102699756622, + "rewards/rejected": -0.5595852136611938, + "step": 700 + }, + { + "epoch": 0.73, + "eval_logits/chosen": -2.8268723487854004, + "eval_logits/rejected": -2.766594648361206, + "eval_logps/chosen": -373.9845275878906, + "eval_logps/rejected": -353.7791748046875, + "eval_loss": 0.631250262260437, + "eval_rewards/accuracies": 0.682539701461792, + "eval_rewards/chosen": -0.28939977288246155, + "eval_rewards/margins": 0.22134484350681305, + "eval_rewards/rejected": -0.5107446312904358, + "eval_runtime": 250.2322, + "eval_samples_per_second": 7.993, + "eval_steps_per_second": 0.252, + "step": 700 + }, + { + "epoch": 0.74, + "learning_rate": 4.4175398495539397e-07, + "logits/chosen": -2.8154489994049072, + "logits/rejected": -2.7193312644958496, + "logps/chosen": -390.2218322753906, + "logps/rejected": -323.1303405761719, + "loss": 0.6142, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.25868695974349976, + "rewards/margins": 0.22367513179779053, + "rewards/rejected": -0.48236212134361267, + "step": 710 + }, + { + "epoch": 0.75, + "learning_rate": 4.3992177114582117e-07, + "logits/chosen": -2.8137097358703613, + "logits/rejected": -2.7677464485168457, + "logps/chosen": -371.6590576171875, + "logps/rejected": -349.91522216796875, + "loss": 0.6315, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.24545975029468536, + "rewards/margins": 0.20291368663311005, + "rewards/rejected": -0.4483734965324402, + "step": 720 + }, + { + "epoch": 0.76, + "learning_rate": 4.380651104288776e-07, + "logits/chosen": -2.79219913482666, + "logits/rejected": -2.7212865352630615, + "logps/chosen": -383.16070556640625, + "logps/rejected": -343.8384094238281, + "loss": 0.6285, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.2518269717693329, + "rewards/margins": 0.21711906790733337, + "rewards/rejected": -0.46894603967666626, + "step": 730 + }, + { + "epoch": 0.77, + "learning_rate": 4.3618424179566094e-07, + "logits/chosen": -2.7794361114501953, + "logits/rejected": -2.7013282775878906, + "logps/chosen": -409.0721130371094, + "logps/rejected": -345.78033447265625, + "loss": 0.6197, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.23068316280841827, + "rewards/margins": 0.23051229119300842, + "rewards/rejected": -0.4611954689025879, + "step": 740 + }, + { + "epoch": 0.78, + "learning_rate": 4.3427940735333436e-07, + "logits/chosen": -2.7824223041534424, + "logits/rejected": -2.7694931030273438, + "logps/chosen": -373.7041931152344, + "logps/rejected": -375.1035461425781, + "loss": 0.6172, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.33885717391967773, + "rewards/margins": 0.19208331406116486, + "rewards/rejected": -0.5309404134750366, + "step": 750 + }, + { + "epoch": 0.8, + "learning_rate": 4.323508522939624e-07, + "logits/chosen": -2.750168800354004, + "logits/rejected": -2.710522174835205, + "logps/chosen": -366.13519287109375, + "logps/rejected": -355.31829833984375, + "loss": 0.6092, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.4173508286476135, + "rewards/margins": 0.22161607444286346, + "rewards/rejected": -0.6389669179916382, + "step": 760 + }, + { + "epoch": 0.81, + "learning_rate": 4.3039882486294966e-07, + "logits/chosen": -2.7729387283325195, + "logits/rejected": -2.747562885284424, + "logps/chosen": -393.59881591796875, + "logps/rejected": -406.81768798828125, + "loss": 0.6199, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.38891881704330444, + "rewards/margins": 0.215108722448349, + "rewards/rejected": -0.604027509689331, + "step": 770 + }, + { + "epoch": 0.82, + "learning_rate": 4.2842357632708603e-07, + "logits/chosen": -2.7065768241882324, + "logits/rejected": -2.6670963764190674, + "logps/chosen": -340.0065002441406, + "logps/rejected": -324.4345397949219, + "loss": 0.6215, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.2909296154975891, + "rewards/margins": 0.1907232701778412, + "rewards/rejected": -0.4816528856754303, + "step": 780 + }, + { + "epoch": 0.83, + "learning_rate": 4.264253609422038e-07, + "logits/chosen": -2.7775344848632812, + "logits/rejected": -2.7437610626220703, + "logps/chosen": -391.9022521972656, + "logps/rejected": -384.43426513671875, + "loss": 0.6313, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.2783506214618683, + "rewards/margins": 0.23514008522033691, + "rewards/rejected": -0.5134907960891724, + "step": 790 + }, + { + "epoch": 0.84, + "learning_rate": 4.244044359204495e-07, + "logits/chosen": -2.713089942932129, + "logits/rejected": -2.6497960090637207, + "logps/chosen": -433.2242126464844, + "logps/rejected": -373.65130615234375, + "loss": 0.6035, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.29666823148727417, + "rewards/margins": 0.24842536449432373, + "rewards/rejected": -0.5450935959815979, + "step": 800 + }, + { + "epoch": 0.84, + "eval_logits/chosen": -2.7681221961975098, + "eval_logits/rejected": -2.7056021690368652, + "eval_logps/chosen": -381.1849060058594, + "eval_logps/rejected": -364.1535949707031, + "eval_loss": 0.6248704195022583, + "eval_rewards/accuracies": 0.6845238208770752, + "eval_rewards/chosen": -0.3614034950733185, + "eval_rewards/margins": 0.2530852258205414, + "eval_rewards/rejected": -0.6144886612892151, + "eval_runtime": 249.0101, + "eval_samples_per_second": 8.032, + "eval_steps_per_second": 0.253, + "step": 800 + }, + { + "epoch": 0.85, + "learning_rate": 4.223610613971753e-07, + "logits/chosen": -2.7306289672851562, + "logits/rejected": -2.6280109882354736, + "logps/chosen": -378.93804931640625, + "logps/rejected": -328.80938720703125, + "loss": 0.6265, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.3830115795135498, + "rewards/margins": 0.13664865493774414, + "rewards/rejected": -0.519660234451294, + "step": 810 + }, + { + "epoch": 0.86, + "learning_rate": 4.2029550039745396e-07, + "logits/chosen": -2.674085855484009, + "logits/rejected": -2.6277194023132324, + "logps/chosen": -331.6683654785156, + "logps/rejected": -327.3447570800781, + "loss": 0.6375, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.3621932566165924, + "rewards/margins": 0.2161780148744583, + "rewards/rejected": -0.5783712267875671, + "step": 820 + }, + { + "epoch": 0.87, + "learning_rate": 4.1820801880222236e-07, + "logits/chosen": -2.6937668323516846, + "logits/rejected": -2.678345203399658, + "logps/chosen": -336.51519775390625, + "logps/rejected": -338.04058837890625, + "loss": 0.6178, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.372277170419693, + "rewards/margins": 0.2381594181060791, + "rewards/rejected": -0.6104366183280945, + "step": 830 + }, + { + "epoch": 0.88, + "learning_rate": 4.160988853140567e-07, + "logits/chosen": -2.68011212348938, + "logits/rejected": -2.6486706733703613, + "logps/chosen": -405.85235595703125, + "logps/rejected": -377.17059326171875, + "loss": 0.6225, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.44021159410476685, + "rewards/margins": 0.13439835608005524, + "rewards/rejected": -0.5746098756790161, + "step": 840 + }, + { + "epoch": 0.89, + "learning_rate": 4.1396837142258507e-07, + "logits/chosen": -2.757357597351074, + "logits/rejected": -2.696622848510742, + "logps/chosen": -403.8123779296875, + "logps/rejected": -353.694091796875, + "loss": 0.6194, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.355672687292099, + "rewards/margins": 0.28402405977249146, + "rewards/rejected": -0.6396967172622681, + "step": 850 + }, + { + "epoch": 0.9, + "learning_rate": 4.1181675136954106e-07, + "logits/chosen": -2.753262758255005, + "logits/rejected": -2.715118646621704, + "logps/chosen": -377.8106384277344, + "logps/rejected": -364.499755859375, + "loss": 0.6216, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4113299250602722, + "rewards/margins": 0.2852802276611328, + "rewards/rejected": -0.6966102123260498, + "step": 860 + }, + { + "epoch": 0.91, + "learning_rate": 4.09644302113463e-07, + "logits/chosen": -2.6933841705322266, + "logits/rejected": -2.671048641204834, + "logps/chosen": -337.9107666015625, + "logps/rejected": -356.3653564453125, + "loss": 0.6052, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.40935665369033813, + "rewards/margins": 0.30245229601860046, + "rewards/rejected": -0.7118089199066162, + "step": 870 + }, + { + "epoch": 0.92, + "learning_rate": 4.0745130329404365e-07, + "logits/chosen": -2.701093912124634, + "logits/rejected": -2.63181471824646, + "logps/chosen": -388.31011962890625, + "logps/rejected": -363.6514587402344, + "loss": 0.6198, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.504043459892273, + "rewards/margins": 0.22630378603935242, + "rewards/rejected": -0.7303472757339478, + "step": 880 + }, + { + "epoch": 0.93, + "learning_rate": 4.052380371961347e-07, + "logits/chosen": -2.684615135192871, + "logits/rejected": -2.652864933013916, + "logps/chosen": -377.3784484863281, + "logps/rejected": -379.3691711425781, + "loss": 0.6286, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.5067042112350464, + "rewards/margins": 0.17588753998279572, + "rewards/rejected": -0.6825917363166809, + "step": 890 + }, + { + "epoch": 0.94, + "learning_rate": 4.030047887134108e-07, + "logits/chosen": -2.636793851852417, + "logits/rejected": -2.5783472061157227, + "logps/chosen": -407.3863220214844, + "logps/rejected": -394.4078674316406, + "loss": 0.6326, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.48934751749038696, + "rewards/margins": 0.17317138612270355, + "rewards/rejected": -0.6625188589096069, + "step": 900 + }, + { + "epoch": 0.94, + "eval_logits/chosen": -2.720741033554077, + "eval_logits/rejected": -2.65683913230896, + "eval_logps/chosen": -397.63446044921875, + "eval_logps/rejected": -382.7857360839844, + "eval_loss": 0.6203979253768921, + "eval_rewards/accuracies": 0.6845238208770752, + "eval_rewards/chosen": -0.5258990526199341, + "eval_rewards/margins": 0.274911493062973, + "eval_rewards/rejected": -0.8008105158805847, + "eval_runtime": 244.5075, + "eval_samples_per_second": 8.18, + "eval_steps_per_second": 0.258, + "step": 900 + }, + { + "epoch": 0.95, + "learning_rate": 4.007518453116979e-07, + "logits/chosen": -2.6805593967437744, + "logits/rejected": -2.6232123374938965, + "logps/chosen": -357.9183044433594, + "logps/rejected": -359.6230773925781, + "loss": 0.6057, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.5844573974609375, + "rewards/margins": 0.2230493128299713, + "rewards/rejected": -0.8075065612792969, + "step": 910 + }, + { + "epoch": 0.96, + "learning_rate": 3.984794969919702e-07, + "logits/chosen": -2.6928341388702393, + "logits/rejected": -2.6027913093566895, + "logps/chosen": -404.1787414550781, + "logps/rejected": -385.9433288574219, + "loss": 0.6132, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.5911322832107544, + "rewards/margins": 0.29310551285743713, + "rewards/rejected": -0.8842377662658691, + "step": 920 + }, + { + "epoch": 0.97, + "learning_rate": 3.96188036253021e-07, + "logits/chosen": -2.7151689529418945, + "logits/rejected": -2.6285009384155273, + "logps/chosen": -379.22552490234375, + "logps/rejected": -366.44720458984375, + "loss": 0.5874, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.46869564056396484, + "rewards/margins": 0.3491496443748474, + "rewards/rejected": -0.8178452253341675, + "step": 930 + }, + { + "epoch": 0.98, + "learning_rate": 3.938777580538119e-07, + "logits/chosen": -2.695061683654785, + "logits/rejected": -2.608922243118286, + "logps/chosen": -434.6171875, + "logps/rejected": -423.9627380371094, + "loss": 0.6077, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.5053583383560181, + "rewards/margins": 0.35829007625579834, + "rewards/rejected": -0.8636484146118164, + "step": 940 + }, + { + "epoch": 0.99, + "learning_rate": 3.9154895977550585e-07, + "logits/chosen": -2.7094149589538574, + "logits/rejected": -2.6375811100006104, + "logps/chosen": -389.6069641113281, + "logps/rejected": -375.0043029785156, + "loss": 0.6176, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.4657767415046692, + "rewards/margins": 0.2953377366065979, + "rewards/rejected": -0.7611144185066223, + "step": 950 + }, + { + "epoch": 1.0, + "learning_rate": 3.8920194118318725e-07, + "logits/chosen": -2.649557590484619, + "logits/rejected": -2.591104030609131, + "logps/chosen": -365.44061279296875, + "logps/rejected": -376.2007141113281, + "loss": 0.6041, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3993779420852661, + "rewards/margins": 0.3410153388977051, + "rewards/rejected": -0.740393340587616, + "step": 960 + }, + { + "epoch": 1.02, + "learning_rate": 3.868370043872768e-07, + "logits/chosen": -2.683077812194824, + "logits/rejected": -2.6237359046936035, + "logps/chosen": -416.84088134765625, + "logps/rejected": -408.75482177734375, + "loss": 0.5978, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.43280115723609924, + "rewards/margins": 0.34043940901756287, + "rewards/rejected": -0.7732406258583069, + "step": 970 + }, + { + "epoch": 1.03, + "learning_rate": 3.844544538046425e-07, + "logits/chosen": -2.617851972579956, + "logits/rejected": -2.592369556427002, + "logps/chosen": -354.9996032714844, + "logps/rejected": -388.5142822265625, + "loss": 0.5878, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.4673759341239929, + "rewards/margins": 0.39914676547050476, + "rewards/rejected": -0.8665226101875305, + "step": 980 + }, + { + "epoch": 1.04, + "learning_rate": 3.8205459611941577e-07, + "logits/chosen": -2.669250011444092, + "logits/rejected": -2.602257490158081, + "logps/chosen": -417.526611328125, + "logps/rejected": -391.24456787109375, + "loss": 0.6071, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.4427572190761566, + "rewards/margins": 0.2923399806022644, + "rewards/rejected": -0.7350972890853882, + "step": 990 + }, + { + "epoch": 1.05, + "learning_rate": 3.7963774024351423e-07, + "logits/chosen": -2.6895923614501953, + "logits/rejected": -2.6821107864379883, + "logps/chosen": -368.69537353515625, + "logps/rejected": -377.12225341796875, + "loss": 0.6103, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.48284831643104553, + "rewards/margins": 0.22640752792358398, + "rewards/rejected": -0.7092558145523071, + "step": 1000 + }, + { + "epoch": 1.05, + "eval_logits/chosen": -2.6968600749969482, + "eval_logits/rejected": -2.6322262287139893, + "eval_logps/chosen": -396.68231201171875, + "eval_logps/rejected": -384.4855651855469, + "eval_loss": 0.6144962310791016, + "eval_rewards/accuracies": 0.6944444179534912, + "eval_rewards/chosen": -0.5163776874542236, + "eval_rewards/margins": 0.30143067240715027, + "eval_rewards/rejected": -0.8178083896636963, + "eval_runtime": 246.7178, + "eval_samples_per_second": 8.106, + "eval_steps_per_second": 0.255, + "step": 1000 + }, + { + "epoch": 1.06, + "learning_rate": 3.7720419727687865e-07, + "logits/chosen": -2.6810877323150635, + "logits/rejected": -2.6230504512786865, + "logps/chosen": -413.1744689941406, + "logps/rejected": -382.6622314453125, + "loss": 0.5989, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5526056289672852, + "rewards/margins": 0.2373414933681488, + "rewards/rejected": -0.7899471521377563, + "step": 1010 + }, + { + "epoch": 1.07, + "learning_rate": 3.747542804674274e-07, + "logits/chosen": -2.661088228225708, + "logits/rejected": -2.6390786170959473, + "logps/chosen": -398.89813232421875, + "logps/rejected": -401.23431396484375, + "loss": 0.6005, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.46416956186294556, + "rewards/margins": 0.30864232778549194, + "rewards/rejected": -0.7728118300437927, + "step": 1020 + }, + { + "epoch": 1.08, + "learning_rate": 3.7228830517073527e-07, + "logits/chosen": -2.622685670852661, + "logits/rejected": -2.5972726345062256, + "logps/chosen": -360.4078063964844, + "logps/rejected": -360.20269775390625, + "loss": 0.5902, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3580246567726135, + "rewards/margins": 0.3438374400138855, + "rewards/rejected": -0.7018621563911438, + "step": 1030 + }, + { + "epoch": 1.09, + "learning_rate": 3.698065888094405e-07, + "logits/chosen": -2.6089298725128174, + "logits/rejected": -2.6035428047180176, + "logps/chosen": -365.2914123535156, + "logps/rejected": -406.7452087402344, + "loss": 0.6074, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.43625983595848083, + "rewards/margins": 0.285591185092926, + "rewards/rejected": -0.7218509912490845, + "step": 1040 + }, + { + "epoch": 1.1, + "learning_rate": 3.6730945083238594e-07, + "logits/chosen": -2.6788554191589355, + "logits/rejected": -2.60333251953125, + "logps/chosen": -396.67388916015625, + "logps/rejected": -369.4822998046875, + "loss": 0.5952, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.4486874043941498, + "rewards/margins": 0.3007965087890625, + "rewards/rejected": -0.7494838833808899, + "step": 1050 + }, + { + "epoch": 1.11, + "learning_rate": 3.64797212673499e-07, + "logits/chosen": -2.663841962814331, + "logits/rejected": -2.530757427215576, + "logps/chosen": -434.9602966308594, + "logps/rejected": -395.90704345703125, + "loss": 0.5741, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.434063196182251, + "rewards/margins": 0.4063114523887634, + "rewards/rejected": -0.840374767780304, + "step": 1060 + }, + { + "epoch": 1.12, + "learning_rate": 3.6227019771041664e-07, + "logits/chosen": -2.631474018096924, + "logits/rejected": -2.5514721870422363, + "logps/chosen": -367.2853698730469, + "logps/rejected": -334.1014709472656, + "loss": 0.6031, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.45699542760849, + "rewards/margins": 0.30584800243377686, + "rewards/rejected": -0.7628434896469116, + "step": 1070 + }, + { + "epoch": 1.13, + "learning_rate": 3.5972873122285994e-07, + "logits/chosen": -2.6002821922302246, + "logits/rejected": -2.5638835430145264, + "logps/chosen": -362.3927307128906, + "logps/rejected": -384.8812561035156, + "loss": 0.5971, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5179690718650818, + "rewards/margins": 0.29387664794921875, + "rewards/rejected": -0.8118457794189453, + "step": 1080 + }, + { + "epoch": 1.14, + "learning_rate": 3.571731403507635e-07, + "logits/chosen": -2.6326613426208496, + "logits/rejected": -2.5637834072113037, + "logps/chosen": -416.04046630859375, + "logps/rejected": -411.080810546875, + "loss": 0.5785, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.5498114228248596, + "rewards/margins": 0.36757707595825195, + "rewards/rejected": -0.9173885583877563, + "step": 1090 + }, + { + "epoch": 1.15, + "learning_rate": 3.5460375405216603e-07, + "logits/chosen": -2.665194034576416, + "logits/rejected": -2.601635456085205, + "logps/chosen": -397.65960693359375, + "logps/rejected": -401.44976806640625, + "loss": 0.6002, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6755828261375427, + "rewards/margins": 0.26671653985977173, + "rewards/rejected": -0.9422993659973145, + "step": 1100 + }, + { + "epoch": 1.15, + "eval_logits/chosen": -2.668804883956909, + "eval_logits/rejected": -2.6023752689361572, + "eval_logps/chosen": -396.83331298828125, + "eval_logps/rejected": -385.9577941894531, + "eval_loss": 0.6116264462471008, + "eval_rewards/accuracies": 0.692460298538208, + "eval_rewards/chosen": -0.5178873538970947, + "eval_rewards/margins": 0.31464365124702454, + "eval_rewards/rejected": -0.8325309753417969, + "eval_runtime": 250.5629, + "eval_samples_per_second": 7.982, + "eval_steps_per_second": 0.251, + "step": 1100 + }, + { + "epoch": 1.16, + "learning_rate": 3.520209030608662e-07, + "logits/chosen": -2.6696746349334717, + "logits/rejected": -2.6097488403320312, + "logps/chosen": -393.92529296875, + "logps/rejected": -380.6165466308594, + "loss": 0.5968, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.47735723853111267, + "rewards/margins": 0.25016671419143677, + "rewards/rejected": -0.727523922920227, + "step": 1110 + }, + { + "epoch": 1.17, + "learning_rate": 3.4942491984385066e-07, + "logits/chosen": -2.666564464569092, + "logits/rejected": -2.6042842864990234, + "logps/chosen": -403.80145263671875, + "logps/rejected": -381.906982421875, + "loss": 0.5944, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.4305171072483063, + "rewards/margins": 0.3264329731464386, + "rewards/rejected": -0.7569500207901001, + "step": 1120 + }, + { + "epoch": 1.18, + "learning_rate": 3.468161385584982e-07, + "logits/chosen": -2.6324963569641113, + "logits/rejected": -2.5880188941955566, + "logps/chosen": -381.2325744628906, + "logps/rejected": -379.158203125, + "loss": 0.5799, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3703802824020386, + "rewards/margins": 0.34355229139328003, + "rewards/rejected": -0.7139325141906738, + "step": 1130 + }, + { + "epoch": 1.19, + "learning_rate": 3.441948950095672e-07, + "logits/chosen": -2.6656455993652344, + "logits/rejected": -2.5824806690216064, + "logps/chosen": -407.565673828125, + "logps/rejected": -368.3275451660156, + "loss": 0.5875, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.43670734763145447, + "rewards/margins": 0.37962377071380615, + "rewards/rejected": -0.8163310885429382, + "step": 1140 + }, + { + "epoch": 1.2, + "learning_rate": 3.4156152660596994e-07, + "logits/chosen": -2.6464781761169434, + "logits/rejected": -2.5937983989715576, + "logps/chosen": -402.5704345703125, + "logps/rejected": -392.44952392578125, + "loss": 0.5997, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5296522974967957, + "rewards/margins": 0.3139727711677551, + "rewards/rejected": -0.8436250686645508, + "step": 1150 + }, + { + "epoch": 1.21, + "learning_rate": 3.3891637231734125e-07, + "logits/chosen": -2.6544508934020996, + "logits/rejected": -2.595885753631592, + "logps/chosen": -386.92401123046875, + "logps/rejected": -395.77642822265625, + "loss": 0.5905, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.44389739632606506, + "rewards/margins": 0.35769903659820557, + "rewards/rejected": -0.801596462726593, + "step": 1160 + }, + { + "epoch": 1.22, + "learning_rate": 3.3625977263040643e-07, + "logits/chosen": -2.666747570037842, + "logits/rejected": -2.5960164070129395, + "logps/chosen": -414.5166015625, + "logps/rejected": -367.41424560546875, + "loss": 0.5867, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.4271417260169983, + "rewards/margins": 0.2684404253959656, + "rewards/rejected": -0.6955822110176086, + "step": 1170 + }, + { + "epoch": 1.23, + "learning_rate": 3.3359206950515266e-07, + "logits/chosen": -2.6162502765655518, + "logits/rejected": -2.5295591354370117, + "logps/chosen": -400.8308410644531, + "logps/rejected": -366.78558349609375, + "loss": 0.5793, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5077975988388062, + "rewards/margins": 0.36180368065834045, + "rewards/rejected": -0.8696013689041138, + "step": 1180 + }, + { + "epoch": 1.25, + "learning_rate": 3.3091360633081236e-07, + "logits/chosen": -2.687870502471924, + "logits/rejected": -2.6193759441375732, + "logps/chosen": -402.45098876953125, + "logps/rejected": -392.7806701660156, + "loss": 0.5752, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.5487203598022461, + "rewards/margins": 0.37653276324272156, + "rewards/rejected": -0.9252530932426453, + "step": 1190 + }, + { + "epoch": 1.26, + "learning_rate": 3.2822472788166146e-07, + "logits/chosen": -2.6522316932678223, + "logits/rejected": -2.5871059894561768, + "logps/chosen": -410.39764404296875, + "logps/rejected": -405.1246643066406, + "loss": 0.5729, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.47655636072158813, + "rewards/margins": 0.35264235734939575, + "rewards/rejected": -0.8291987180709839, + "step": 1200 + }, + { + "epoch": 1.26, + "eval_logits/chosen": -2.6376395225524902, + "eval_logits/rejected": -2.570849657058716, + "eval_logps/chosen": -403.4270935058594, + "eval_logps/rejected": -394.70733642578125, + "eval_loss": 0.6083069443702698, + "eval_rewards/accuracies": 0.704365074634552, + "eval_rewards/chosen": -0.5838249325752258, + "eval_rewards/margins": 0.33620160818099976, + "eval_rewards/rejected": -0.9200265407562256, + "eval_runtime": 246.1597, + "eval_samples_per_second": 8.125, + "eval_steps_per_second": 0.256, + "step": 1200 + }, + { + "epoch": 1.27, + "learning_rate": 3.2552578027263955e-07, + "logits/chosen": -2.623401165008545, + "logits/rejected": -2.5443992614746094, + "logps/chosen": -361.82171630859375, + "logps/rejected": -374.11334228515625, + "loss": 0.5843, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5367065668106079, + "rewards/margins": 0.31009745597839355, + "rewards/rejected": -0.8468039631843567, + "step": 1210 + }, + { + "epoch": 1.28, + "learning_rate": 3.228171109147982e-07, + "logits/chosen": -2.643584728240967, + "logits/rejected": -2.5270514488220215, + "logps/chosen": -410.5106506347656, + "logps/rejected": -380.1089172363281, + "loss": 0.5983, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.44848957657814026, + "rewards/margins": 0.35878002643585205, + "rewards/rejected": -0.8072696924209595, + "step": 1220 + }, + { + "epoch": 1.29, + "learning_rate": 3.2009906847058125e-07, + "logits/chosen": -2.639606475830078, + "logits/rejected": -2.5487468242645264, + "logps/chosen": -424.10595703125, + "logps/rejected": -401.09674072265625, + "loss": 0.587, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.49688854813575745, + "rewards/margins": 0.3513595461845398, + "rewards/rejected": -0.8482481241226196, + "step": 1230 + }, + { + "epoch": 1.3, + "learning_rate": 3.1737200280894516e-07, + "logits/chosen": -2.5839312076568604, + "logits/rejected": -2.5728745460510254, + "logps/chosen": -371.0592041015625, + "logps/rejected": -380.7933349609375, + "loss": 0.602, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.523228645324707, + "rewards/margins": 0.30655089020729065, + "rewards/rejected": -0.8297795057296753, + "step": 1240 + }, + { + "epoch": 1.31, + "learning_rate": 3.146362649603233e-07, + "logits/chosen": -2.596813917160034, + "logits/rejected": -2.546060562133789, + "logps/chosen": -406.51953125, + "logps/rejected": -392.93890380859375, + "loss": 0.5825, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.5865792036056519, + "rewards/margins": 0.3280216157436371, + "rewards/rejected": -0.9146007299423218, + "step": 1250 + }, + { + "epoch": 1.32, + "learning_rate": 3.118922070714408e-07, + "logits/chosen": -2.569214105606079, + "logits/rejected": -2.54783034324646, + "logps/chosen": -347.00006103515625, + "logps/rejected": -362.70562744140625, + "loss": 0.602, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5019794702529907, + "rewards/margins": 0.271824449300766, + "rewards/rejected": -0.7738040089607239, + "step": 1260 + }, + { + "epoch": 1.33, + "learning_rate": 3.091401823599865e-07, + "logits/chosen": -2.594982624053955, + "logits/rejected": -2.5411128997802734, + "logps/chosen": -371.3302917480469, + "logps/rejected": -372.2278747558594, + "loss": 0.5734, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3963788151741028, + "rewards/margins": 0.3730164170265198, + "rewards/rejected": -0.7693952322006226, + "step": 1270 + }, + { + "epoch": 1.34, + "learning_rate": 3.063805450691458e-07, + "logits/chosen": -2.655991315841675, + "logits/rejected": -2.56858491897583, + "logps/chosen": -389.52099609375, + "logps/rejected": -369.41400146484375, + "loss": 0.577, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.39786046743392944, + "rewards/margins": 0.3743780255317688, + "rewards/rejected": -0.7722384929656982, + "step": 1280 + }, + { + "epoch": 1.35, + "learning_rate": 3.036136504220025e-07, + "logits/chosen": -2.6017796993255615, + "logits/rejected": -2.544804096221924, + "logps/chosen": -393.8155517578125, + "logps/rejected": -385.86541748046875, + "loss": 0.5892, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.46426922082901, + "rewards/margins": 0.35931476950645447, + "rewards/rejected": -0.8235839605331421, + "step": 1290 + }, + { + "epoch": 1.36, + "learning_rate": 3.0083985457581415e-07, + "logits/chosen": -2.5615665912628174, + "logits/rejected": -2.544076681137085, + "logps/chosen": -372.3790588378906, + "logps/rejected": -374.0905456542969, + "loss": 0.599, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.5213760137557983, + "rewards/margins": 0.2982157766819, + "rewards/rejected": -0.8195918202400208, + "step": 1300 + }, + { + "epoch": 1.36, + "eval_logits/chosen": -2.613380193710327, + "eval_logits/rejected": -2.545426368713379, + "eval_logps/chosen": -397.10211181640625, + "eval_logps/rejected": -387.2309875488281, + "eval_loss": 0.6077432632446289, + "eval_rewards/accuracies": 0.7103174328804016, + "eval_rewards/chosen": -0.5205760598182678, + "eval_rewards/margins": 0.32468709349632263, + "eval_rewards/rejected": -0.8452631831169128, + "eval_runtime": 247.7632, + "eval_samples_per_second": 8.072, + "eval_steps_per_second": 0.254, + "step": 1300 + }, + { + "epoch": 1.37, + "learning_rate": 2.9805951457616684e-07, + "logits/chosen": -2.5732953548431396, + "logits/rejected": -2.505056381225586, + "logps/chosen": -370.58367919921875, + "logps/rejected": -377.2376708984375, + "loss": 0.5948, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.4765905737876892, + "rewards/margins": 0.30376702547073364, + "rewards/rejected": -0.7803575396537781, + "step": 1310 + }, + { + "epoch": 1.38, + "learning_rate": 2.952729883110164e-07, + "logits/chosen": -2.560969829559326, + "logits/rejected": -2.4919774532318115, + "logps/chosen": -375.130859375, + "logps/rejected": -401.1944580078125, + "loss": 0.582, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.5600558519363403, + "rewards/margins": 0.3885023593902588, + "rewards/rejected": -0.9485582113265991, + "step": 1320 + }, + { + "epoch": 1.39, + "learning_rate": 2.924806344646205e-07, + "logits/chosen": -2.5645477771759033, + "logits/rejected": -2.5073866844177246, + "logps/chosen": -408.23126220703125, + "logps/rejected": -424.2571716308594, + "loss": 0.5979, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.6115280389785767, + "rewards/margins": 0.380671888589859, + "rewards/rejected": -0.9921998977661133, + "step": 1330 + }, + { + "epoch": 1.4, + "learning_rate": 2.896828124713684e-07, + "logits/chosen": -2.549287796020508, + "logits/rejected": -2.5001778602600098, + "logps/chosen": -386.74102783203125, + "logps/rejected": -394.2340393066406, + "loss": 0.6196, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6311862468719482, + "rewards/margins": 0.3289794921875, + "rewards/rejected": -0.960165798664093, + "step": 1340 + }, + { + "epoch": 1.41, + "learning_rate": 2.8687988246951437e-07, + "logits/chosen": -2.5453922748565674, + "logits/rejected": -2.518723726272583, + "logps/chosen": -342.259033203125, + "logps/rejected": -360.65850830078125, + "loss": 0.5773, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.47795337438583374, + "rewards/margins": 0.33936068415641785, + "rewards/rejected": -0.8173141479492188, + "step": 1350 + }, + { + "epoch": 1.42, + "learning_rate": 2.8407220525482047e-07, + "logits/chosen": -2.5927088260650635, + "logits/rejected": -2.489152431488037, + "logps/chosen": -413.27105712890625, + "logps/rejected": -376.00689697265625, + "loss": 0.5965, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4944841265678406, + "rewards/margins": 0.2662241756916046, + "rewards/rejected": -0.7607083916664124, + "step": 1360 + }, + { + "epoch": 1.43, + "learning_rate": 2.81260142234114e-07, + "logits/chosen": -2.6516880989074707, + "logits/rejected": -2.5429511070251465, + "logps/chosen": -403.22454833984375, + "logps/rejected": -371.4908142089844, + "loss": 0.5748, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.46684926748275757, + "rewards/margins": 0.3777723014354706, + "rewards/rejected": -0.8446215391159058, + "step": 1370 + }, + { + "epoch": 1.44, + "learning_rate": 2.7844405537876766e-07, + "logits/chosen": -2.5847606658935547, + "logits/rejected": -2.5084991455078125, + "logps/chosen": -383.04254150390625, + "logps/rejected": -428.8330078125, + "loss": 0.5614, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.47037452459335327, + "rewards/margins": 0.42992621660232544, + "rewards/rejected": -0.9003008008003235, + "step": 1380 + }, + { + "epoch": 1.45, + "learning_rate": 2.7562430717810586e-07, + "logits/chosen": -2.5550408363342285, + "logits/rejected": -2.54259991645813, + "logps/chosen": -390.04278564453125, + "logps/rejected": -383.87872314453125, + "loss": 0.5857, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.5229761004447937, + "rewards/margins": 0.38916486501693726, + "rewards/rejected": -0.912140965461731, + "step": 1390 + }, + { + "epoch": 1.47, + "learning_rate": 2.728012605927447e-07, + "logits/chosen": -2.605335235595703, + "logits/rejected": -2.5029044151306152, + "logps/chosen": -421.9798889160156, + "logps/rejected": -388.80914306640625, + "loss": 0.5821, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.57950758934021, + "rewards/margins": 0.4323285222053528, + "rewards/rejected": -1.011836290359497, + "step": 1400 + }, + { + "epoch": 1.47, + "eval_logits/chosen": -2.5899722576141357, + "eval_logits/rejected": -2.5210795402526855, + "eval_logps/chosen": -404.44964599609375, + "eval_logps/rejected": -398.31060791015625, + "eval_loss": 0.602461040019989, + "eval_rewards/accuracies": 0.7063491940498352, + "eval_rewards/chosen": -0.594050943851471, + "eval_rewards/margins": 0.36200812458992004, + "eval_rewards/rejected": -0.9560590386390686, + "eval_runtime": 240.2615, + "eval_samples_per_second": 8.324, + "eval_steps_per_second": 0.262, + "step": 1400 + }, + { + "epoch": 1.48, + "learning_rate": 2.699752790078714e-07, + "logits/chosen": -2.576601982116699, + "logits/rejected": -2.484036922454834, + "logps/chosen": -422.91094970703125, + "logps/rejected": -394.78570556640625, + "loss": 0.578, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.5908924341201782, + "rewards/margins": 0.4301966726779938, + "rewards/rejected": -1.0210891962051392, + "step": 1410 + }, + { + "epoch": 1.49, + "learning_rate": 2.6714672618646916e-07, + "logits/chosen": -2.550550699234009, + "logits/rejected": -2.5153775215148926, + "logps/chosen": -406.955322265625, + "logps/rejected": -409.8970642089844, + "loss": 0.5851, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.5765306949615479, + "rewards/margins": 0.34898003935813904, + "rewards/rejected": -0.9255107045173645, + "step": 1420 + }, + { + "epoch": 1.5, + "learning_rate": 2.643159662224931e-07, + "logits/chosen": -2.510113477706909, + "logits/rejected": -2.4473376274108887, + "logps/chosen": -401.7076110839844, + "logps/rejected": -385.7937927246094, + "loss": 0.5892, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.5977416634559631, + "rewards/margins": 0.2979085147380829, + "rewards/rejected": -0.8956502079963684, + "step": 1430 + }, + { + "epoch": 1.51, + "learning_rate": 2.6148336349400386e-07, + "logits/chosen": -2.581653118133545, + "logits/rejected": -2.4795095920562744, + "logps/chosen": -414.6515197753906, + "logps/rejected": -404.5352783203125, + "loss": 0.5781, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.4708114564418793, + "rewards/margins": 0.42272576689720154, + "rewards/rejected": -0.893537163734436, + "step": 1440 + }, + { + "epoch": 1.52, + "learning_rate": 2.5864928261626416e-07, + "logits/chosen": -2.5162148475646973, + "logits/rejected": -2.454847812652588, + "logps/chosen": -392.3681945800781, + "logps/rejected": -386.84161376953125, + "loss": 0.5779, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5253309011459351, + "rewards/margins": 0.3526184856891632, + "rewards/rejected": -0.8779493570327759, + "step": 1450 + }, + { + "epoch": 1.53, + "learning_rate": 2.558140883948058e-07, + "logits/chosen": -2.554378032684326, + "logits/rejected": -2.4660162925720215, + "logps/chosen": -411.94757080078125, + "logps/rejected": -378.79522705078125, + "loss": 0.5712, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.5396718978881836, + "rewards/margins": 0.370322585105896, + "rewards/rejected": -0.9099944829940796, + "step": 1460 + }, + { + "epoch": 1.54, + "learning_rate": 2.5297814577847116e-07, + "logits/chosen": -2.5671238899230957, + "logits/rejected": -2.525606155395508, + "logps/chosen": -404.15545654296875, + "logps/rejected": -415.3843688964844, + "loss": 0.5932, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.5830854177474976, + "rewards/margins": 0.32624879479408264, + "rewards/rejected": -0.9093341827392578, + "step": 1470 + }, + { + "epoch": 1.55, + "learning_rate": 2.501418198124365e-07, + "logits/chosen": -2.56373929977417, + "logits/rejected": -2.491987705230713, + "logps/chosen": -436.6615295410156, + "logps/rejected": -413.1544494628906, + "loss": 0.5685, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.5825528502464294, + "rewards/margins": 0.4194890856742859, + "rewards/rejected": -1.0020420551300049, + "step": 1480 + }, + { + "epoch": 1.56, + "learning_rate": 2.473054755912234e-07, + "logits/chosen": -2.5011954307556152, + "logits/rejected": -2.4363338947296143, + "logps/chosen": -406.9830017089844, + "logps/rejected": -418.9451599121094, + "loss": 0.5797, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.5498841404914856, + "rewards/margins": 0.49638238549232483, + "rewards/rejected": -1.0462663173675537, + "step": 1490 + }, + { + "epoch": 1.57, + "learning_rate": 2.444694782117033e-07, + "logits/chosen": -2.5000977516174316, + "logits/rejected": -2.443779468536377, + "logps/chosen": -392.17327880859375, + "logps/rejected": -432.06195068359375, + "loss": 0.574, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.6436781287193298, + "rewards/margins": 0.43835169076919556, + "rewards/rejected": -1.082029938697815, + "step": 1500 + }, + { + "epoch": 1.57, + "eval_logits/chosen": -2.5592637062072754, + "eval_logits/rejected": -2.488711357116699, + "eval_logps/chosen": -411.21783447265625, + "eval_logps/rejected": -407.4162292480469, + "eval_loss": 0.5976974964141846, + "eval_rewards/accuracies": 0.7142857313156128, + "eval_rewards/chosen": -0.6617329716682434, + "eval_rewards/margins": 0.3853819668292999, + "eval_rewards/rejected": -1.0471149682998657, + "eval_runtime": 249.6762, + "eval_samples_per_second": 8.01, + "eval_steps_per_second": 0.252, + "step": 1500 + }, + { + "epoch": 1.58, + "learning_rate": 2.416341927261016e-07, + "logits/chosen": -2.509632110595703, + "logits/rejected": -2.4712796211242676, + "logps/chosen": -388.2112121582031, + "logps/rejected": -393.1347961425781, + "loss": 0.5739, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6419853568077087, + "rewards/margins": 0.44171229004859924, + "rewards/rejected": -1.0836976766586304, + "step": 1510 + }, + { + "epoch": 1.59, + "learning_rate": 2.3879998409500845e-07, + "logits/chosen": -2.5448498725891113, + "logits/rejected": -2.4746804237365723, + "logps/chosen": -435.6138610839844, + "logps/rejected": -420.2179260253906, + "loss": 0.5764, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.7542072534561157, + "rewards/margins": 0.4248180389404297, + "rewards/rejected": -1.1790252923965454, + "step": 1520 + }, + { + "epoch": 1.6, + "learning_rate": 2.3596721714039998e-07, + "logits/chosen": -2.4864585399627686, + "logits/rejected": -2.4267334938049316, + "logps/chosen": -395.937744140625, + "logps/rejected": -394.3992919921875, + "loss": 0.5586, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.689933180809021, + "rewards/margins": 0.41859620809555054, + "rewards/rejected": -1.1085295677185059, + "step": 1530 + }, + { + "epoch": 1.61, + "learning_rate": 2.3313625649867824e-07, + "logits/chosen": -2.5655341148376465, + "logits/rejected": -2.4968507289886475, + "logps/chosen": -454.5184631347656, + "logps/rejected": -442.9791564941406, + "loss": 0.5844, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.721511960029602, + "rewards/margins": 0.4738715589046478, + "rewards/rejected": -1.1953833103179932, + "step": 1540 + }, + { + "epoch": 1.62, + "learning_rate": 2.303074665737355e-07, + "logits/chosen": -2.503943920135498, + "logits/rejected": -2.450364112854004, + "logps/chosen": -424.10760498046875, + "logps/rejected": -405.66070556640625, + "loss": 0.5654, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.673719048500061, + "rewards/margins": 0.3581104576587677, + "rewards/rejected": -1.0318294763565063, + "step": 1550 + }, + { + "epoch": 1.63, + "learning_rate": 2.274812114900469e-07, + "logits/chosen": -2.505079507827759, + "logits/rejected": -2.4630515575408936, + "logps/chosen": -368.4876403808594, + "logps/rejected": -392.97467041015625, + "loss": 0.585, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.6174124479293823, + "rewards/margins": 0.3502601981163025, + "rewards/rejected": -0.9676725268363953, + "step": 1560 + }, + { + "epoch": 1.64, + "learning_rate": 2.2465785504580074e-07, + "logits/chosen": -2.4787967205047607, + "logits/rejected": -2.406154155731201, + "logps/chosen": -432.4744567871094, + "logps/rejected": -413.21453857421875, + "loss": 0.5799, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.658542811870575, + "rewards/margins": 0.3376074433326721, + "rewards/rejected": -0.9961503744125366, + "step": 1570 + }, + { + "epoch": 1.65, + "learning_rate": 2.2183776066606947e-07, + "logits/chosen": -2.5223982334136963, + "logits/rejected": -2.51818585395813, + "logps/chosen": -380.69573974609375, + "logps/rejected": -432.8943786621094, + "loss": 0.5772, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.5959492921829224, + "rewards/margins": 0.47599101066589355, + "rewards/rejected": -1.0719401836395264, + "step": 1580 + }, + { + "epoch": 1.66, + "learning_rate": 2.190212913560298e-07, + "logits/chosen": -2.4563565254211426, + "logits/rejected": -2.4220333099365234, + "logps/chosen": -383.9895324707031, + "logps/rejected": -395.3768005371094, + "loss": 0.5726, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.6407332420349121, + "rewards/margins": 0.4355078339576721, + "rewards/rejected": -1.076241135597229, + "step": 1590 + }, + { + "epoch": 1.67, + "learning_rate": 2.1620880965423596e-07, + "logits/chosen": -2.488713502883911, + "logits/rejected": -2.434023380279541, + "logps/chosen": -390.2538757324219, + "logps/rejected": -380.91302490234375, + "loss": 0.5716, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.6403465867042542, + "rewards/margins": 0.4435759484767914, + "rewards/rejected": -1.0839226245880127, + "step": 1600 + }, + { + "epoch": 1.67, + "eval_logits/chosen": -2.536853313446045, + "eval_logits/rejected": -2.4650635719299316, + "eval_logps/chosen": -412.695556640625, + "eval_logps/rejected": -411.40203857421875, + "eval_loss": 0.5954813361167908, + "eval_rewards/accuracies": 0.72817462682724, + "eval_rewards/chosen": -0.6765100955963135, + "eval_rewards/margins": 0.41046345233917236, + "eval_rewards/rejected": -1.0869736671447754, + "eval_runtime": 247.9314, + "eval_samples_per_second": 8.067, + "eval_steps_per_second": 0.254, + "step": 1600 + }, + { + "epoch": 1.68, + "learning_rate": 2.134006775859537e-07, + "logits/chosen": -2.4386448860168457, + "logits/rejected": -2.4446792602539062, + "logps/chosen": -400.6998596191406, + "logps/rejected": -428.584716796875, + "loss": 0.5628, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7244473695755005, + "rewards/margins": 0.43817657232284546, + "rewards/rejected": -1.1626240015029907, + "step": 1610 + }, + { + "epoch": 1.7, + "learning_rate": 2.1059725661655948e-07, + "logits/chosen": -2.543206214904785, + "logits/rejected": -2.481884002685547, + "logps/chosen": -437.24176025390625, + "logps/rejected": -411.68115234375, + "loss": 0.5581, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.815970778465271, + "rewards/margins": 0.4566773474216461, + "rewards/rejected": -1.2726482152938843, + "step": 1620 + }, + { + "epoch": 1.71, + "learning_rate": 2.077989076050133e-07, + "logits/chosen": -2.4628424644470215, + "logits/rejected": -2.414001703262329, + "logps/chosen": -437.2959899902344, + "logps/rejected": -445.44427490234375, + "loss": 0.5536, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.874140739440918, + "rewards/margins": 0.4144424498081207, + "rewards/rejected": -1.2885833978652954, + "step": 1630 + }, + { + "epoch": 1.72, + "learning_rate": 2.050059907574076e-07, + "logits/chosen": -2.507554054260254, + "logits/rejected": -2.427834987640381, + "logps/chosen": -408.7767028808594, + "logps/rejected": -413.369873046875, + "loss": 0.5731, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.7224391102790833, + "rewards/margins": 0.4979207515716553, + "rewards/rejected": -1.2203600406646729, + "step": 1640 + }, + { + "epoch": 1.73, + "learning_rate": 2.022188655806016e-07, + "logits/chosen": -2.523545503616333, + "logits/rejected": -2.448479413986206, + "logps/chosen": -451.228515625, + "logps/rejected": -419.6627502441406, + "loss": 0.5857, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.7539777755737305, + "rewards/margins": 0.37716880440711975, + "rewards/rejected": -1.1311466693878174, + "step": 1650 + }, + { + "epoch": 1.74, + "learning_rate": 1.9943789083594564e-07, + "logits/chosen": -2.4754841327667236, + "logits/rejected": -2.428614377975464, + "logps/chosen": -385.0950622558594, + "logps/rejected": -403.7740783691406, + "loss": 0.5752, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.7256184816360474, + "rewards/margins": 0.4796815812587738, + "rewards/rejected": -1.2053000926971436, + "step": 1660 + }, + { + "epoch": 1.75, + "learning_rate": 1.9666342449310025e-07, + "logits/chosen": -2.530557870864868, + "logits/rejected": -2.4692397117614746, + "logps/chosen": -409.13922119140625, + "logps/rejected": -402.8389892578125, + "loss": 0.5764, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6921072602272034, + "rewards/margins": 0.4285035729408264, + "rewards/rejected": -1.1206107139587402, + "step": 1670 + }, + { + "epoch": 1.76, + "learning_rate": 1.938958236839588e-07, + "logits/chosen": -2.52577543258667, + "logits/rejected": -2.4166135787963867, + "logps/chosen": -440.7547302246094, + "logps/rejected": -413.27911376953125, + "loss": 0.5711, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.7067890167236328, + "rewards/margins": 0.4192212224006653, + "rewards/rejected": -1.1260101795196533, + "step": 1680 + }, + { + "epoch": 1.77, + "learning_rate": 1.9113544465667637e-07, + "logits/chosen": -2.46167254447937, + "logits/rejected": -2.4476611614227295, + "logps/chosen": -376.3783264160156, + "logps/rejected": -403.9447326660156, + "loss": 0.5749, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.6462227702140808, + "rewards/margins": 0.4927960932254791, + "rewards/rejected": -1.1390188932418823, + "step": 1690 + }, + { + "epoch": 1.78, + "learning_rate": 1.88382642729814e-07, + "logits/chosen": -2.5014305114746094, + "logits/rejected": -2.4548792839050293, + "logps/chosen": -411.25457763671875, + "logps/rejected": -420.2723693847656, + "loss": 0.5477, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.6595852971076965, + "rewards/margins": 0.5647996068000793, + "rewards/rejected": -1.2243849039077759, + "step": 1700 + }, + { + "epoch": 1.78, + "eval_logits/chosen": -2.5078556537628174, + "eval_logits/rejected": -2.4342150688171387, + "eval_logps/chosen": -425.2423095703125, + "eval_logps/rejected": -427.0003356933594, + "eval_loss": 0.5904152393341064, + "eval_rewards/accuracies": 0.7321428656578064, + "eval_rewards/chosen": -0.8019776940345764, + "eval_rewards/margins": 0.44097864627838135, + "eval_rewards/rejected": -1.242956280708313, + "eval_runtime": 248.3578, + "eval_samples_per_second": 8.053, + "eval_steps_per_second": 0.254, + "step": 1700 + }, + { + "epoch": 1.79, + "learning_rate": 1.856377722466009e-07, + "logits/chosen": -2.4711380004882812, + "logits/rejected": -2.408111810684204, + "logps/chosen": -441.938720703125, + "logps/rejected": -463.12432861328125, + "loss": 0.5691, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.7691561579704285, + "rewards/margins": 0.46461838483810425, + "rewards/rejected": -1.2337746620178223, + "step": 1710 + }, + { + "epoch": 1.8, + "learning_rate": 1.8290118652932364e-07, + "logits/chosen": -2.4911022186279297, + "logits/rejected": -2.45578670501709, + "logps/chosen": -388.4582214355469, + "logps/rejected": -418.9519958496094, + "loss": 0.5829, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.7712088823318481, + "rewards/margins": 0.48189014196395874, + "rewards/rejected": -1.2530990839004517, + "step": 1720 + }, + { + "epoch": 1.81, + "learning_rate": 1.8017323783384601e-07, + "logits/chosen": -2.524731397628784, + "logits/rejected": -2.4751369953155518, + "logps/chosen": -411.6083984375, + "logps/rejected": -451.7425231933594, + "loss": 0.5679, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.6576933860778809, + "rewards/margins": 0.4532436430454254, + "rewards/rejected": -1.1109369993209839, + "step": 1730 + }, + { + "epoch": 1.82, + "learning_rate": 1.7745427730426635e-07, + "logits/chosen": -2.5422208309173584, + "logits/rejected": -2.4694812297821045, + "logps/chosen": -394.8658447265625, + "logps/rejected": -415.96124267578125, + "loss": 0.5595, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.6860970258712769, + "rewards/margins": 0.5364459753036499, + "rewards/rejected": -1.2225428819656372, + "step": 1740 + }, + { + "epoch": 1.83, + "learning_rate": 1.7474465492771772e-07, + "logits/chosen": -2.4822356700897217, + "logits/rejected": -2.392840623855591, + "logps/chosen": -445.530029296875, + "logps/rejected": -415.55078125, + "loss": 0.5643, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.7760051488876343, + "rewards/margins": 0.48258695006370544, + "rewards/rejected": -1.258592128753662, + "step": 1750 + }, + { + "epoch": 1.84, + "learning_rate": 1.7204471948931758e-07, + "logits/chosen": -2.3855233192443848, + "logits/rejected": -2.342365026473999, + "logps/chosen": -368.435791015625, + "logps/rejected": -391.09893798828125, + "loss": 0.5579, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7862086296081543, + "rewards/margins": 0.42663902044296265, + "rewards/rejected": -1.2128477096557617, + "step": 1760 + }, + { + "epoch": 1.85, + "learning_rate": 1.6935481852727173e-07, + "logits/chosen": -2.4512977600097656, + "logits/rejected": -2.3995602130889893, + "logps/chosen": -432.49578857421875, + "logps/rejected": -437.99298095703125, + "loss": 0.5795, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.7520878314971924, + "rewards/margins": 0.4426315426826477, + "rewards/rejected": -1.1947194337844849, + "step": 1770 + }, + { + "epoch": 1.86, + "learning_rate": 1.6667529828813853e-07, + "logits/chosen": -2.5046944618225098, + "logits/rejected": -2.4515976905822754, + "logps/chosen": -406.6270446777344, + "logps/rejected": -427.93231201171875, + "loss": 0.5629, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.6665788888931274, + "rewards/margins": 0.5746535658836365, + "rewards/rejected": -1.2412325143814087, + "step": 1780 + }, + { + "epoch": 1.87, + "learning_rate": 1.640065036822605e-07, + "logits/chosen": -2.5178215503692627, + "logits/rejected": -2.4513187408447266, + "logps/chosen": -407.05023193359375, + "logps/rejected": -399.77423095703125, + "loss": 0.5559, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.682750403881073, + "rewards/margins": 0.4834769368171692, + "rewards/rejected": -1.1662273406982422, + "step": 1790 + }, + { + "epoch": 1.88, + "learning_rate": 1.6134877823936607e-07, + "logits/chosen": -2.4752984046936035, + "logits/rejected": -2.40830659866333, + "logps/chosen": -424.7567443847656, + "logps/rejected": -407.9845275878906, + "loss": 0.5718, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.758663535118103, + "rewards/margins": 0.4667127728462219, + "rewards/rejected": -1.2253763675689697, + "step": 1800 + }, + { + "epoch": 1.88, + "eval_logits/chosen": -2.4927799701690674, + "eval_logits/rejected": -2.4185616970062256, + "eval_logps/chosen": -424.36309814453125, + "eval_logps/rejected": -427.0936584472656, + "eval_loss": 0.5897929668426514, + "eval_rewards/accuracies": 0.7321428656578064, + "eval_rewards/chosen": -0.7931855320930481, + "eval_rewards/margins": 0.4507039487361908, + "eval_rewards/rejected": -1.2438894510269165, + "eval_runtime": 244.3171, + "eval_samples_per_second": 8.186, + "eval_steps_per_second": 0.258, + "step": 1800 + }, + { + "epoch": 1.89, + "learning_rate": 1.587024640643513e-07, + "logits/chosen": -2.446739673614502, + "logits/rejected": -2.4178757667541504, + "logps/chosen": -401.63641357421875, + "logps/rejected": -406.626708984375, + "loss": 0.5706, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.7500771284103394, + "rewards/margins": 0.4350431561470032, + "rewards/rejected": -1.1851202249526978, + "step": 1810 + }, + { + "epoch": 1.9, + "learning_rate": 1.5606790179324257e-07, + "logits/chosen": -2.469186305999756, + "logits/rejected": -2.3644776344299316, + "logps/chosen": -437.4012756347656, + "logps/rejected": -429.28369140625, + "loss": 0.553, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.7953532338142395, + "rewards/margins": 0.5376496315002441, + "rewards/rejected": -1.3330028057098389, + "step": 1820 + }, + { + "epoch": 1.92, + "learning_rate": 1.534454305493509e-07, + "logits/chosen": -2.4878664016723633, + "logits/rejected": -2.402182102203369, + "logps/chosen": -430.43035888671875, + "logps/rejected": -446.88055419921875, + "loss": 0.5665, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.718338131904602, + "rewards/margins": 0.5019052624702454, + "rewards/rejected": -1.2202433347702026, + "step": 1830 + }, + { + "epoch": 1.93, + "learning_rate": 1.5083538789961846e-07, + "logits/chosen": -2.4689135551452637, + "logits/rejected": -2.3651726245880127, + "logps/chosen": -424.533935546875, + "logps/rejected": -407.27264404296875, + "loss": 0.5746, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.7878087162971497, + "rewards/margins": 0.4385625720024109, + "rewards/rejected": -1.2263712882995605, + "step": 1840 + }, + { + "epoch": 1.94, + "learning_rate": 1.4823810981116767e-07, + "logits/chosen": -2.420289993286133, + "logits/rejected": -2.3955094814300537, + "logps/chosen": -412.322509765625, + "logps/rejected": -425.9512634277344, + "loss": 0.5538, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6696097254753113, + "rewards/margins": 0.4169555604457855, + "rewards/rejected": -1.086565375328064, + "step": 1850 + }, + { + "epoch": 1.95, + "learning_rate": 1.456539306080543e-07, + "logits/chosen": -2.4510982036590576, + "logits/rejected": -2.4162471294403076, + "logps/chosen": -413.22509765625, + "logps/rejected": -462.84381103515625, + "loss": 0.5746, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.7580252289772034, + "rewards/margins": 0.40020751953125, + "rewards/rejected": -1.1582326889038086, + "step": 1860 + }, + { + "epoch": 1.96, + "learning_rate": 1.4308318292823364e-07, + "logits/chosen": -2.4641025066375732, + "logits/rejected": -2.4155373573303223, + "logps/chosen": -411.8502502441406, + "logps/rejected": -425.70416259765625, + "loss": 0.5509, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.7296292185783386, + "rewards/margins": 0.44563156366348267, + "rewards/rejected": -1.1752609014511108, + "step": 1870 + }, + { + "epoch": 1.97, + "learning_rate": 1.4052619768074267e-07, + "logits/chosen": -2.449817180633545, + "logits/rejected": -2.390150547027588, + "logps/chosen": -420.39849853515625, + "logps/rejected": -426.16400146484375, + "loss": 0.575, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7945643663406372, + "rewards/margins": 0.48763760924339294, + "rewards/rejected": -1.2822020053863525, + "step": 1880 + }, + { + "epoch": 1.98, + "learning_rate": 1.3798330400310537e-07, + "logits/chosen": -2.404205322265625, + "logits/rejected": -2.3124032020568848, + "logps/chosen": -383.17266845703125, + "logps/rejected": -382.43841552734375, + "loss": 0.5779, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.6244784593582153, + "rewards/margins": 0.4246695935726166, + "rewards/rejected": -1.0491479635238647, + "step": 1890 + }, + { + "epoch": 1.99, + "learning_rate": 1.354548292189657e-07, + "logits/chosen": -2.437732458114624, + "logits/rejected": -2.3991034030914307, + "logps/chosen": -385.60443115234375, + "logps/rejected": -417.61468505859375, + "loss": 0.563, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.7070263624191284, + "rewards/margins": 0.44580182433128357, + "rewards/rejected": -1.1528282165527344, + "step": 1900 + }, + { + "epoch": 1.99, + "eval_logits/chosen": -2.496089220046997, + "eval_logits/rejected": -2.422304391860962, + "eval_logps/chosen": -413.7807312011719, + "eval_logps/rejected": -415.832763671875, + "eval_loss": 0.5903951525688171, + "eval_rewards/accuracies": 0.7202380895614624, + "eval_rewards/chosen": -0.687361478805542, + "eval_rewards/margins": 0.44391965866088867, + "eval_rewards/rejected": -1.1312810182571411, + "eval_runtime": 243.6107, + "eval_samples_per_second": 8.21, + "eval_steps_per_second": 0.259, + "step": 1900 + }, + { + "epoch": 2.0, + "learning_rate": 1.3294109879595412e-07, + "logits/chosen": -2.518566131591797, + "logits/rejected": -2.4946486949920654, + "logps/chosen": -407.2130126953125, + "logps/rejected": -429.19573974609375, + "loss": 0.5917, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.6605619192123413, + "rewards/margins": 0.3588492274284363, + "rewards/rejected": -1.0194110870361328, + "step": 1910 + }, + { + "epoch": 2.01, + "learning_rate": 1.304424363037932e-07, + "logits/chosen": -2.449763059616089, + "logits/rejected": -2.358640193939209, + "logps/chosen": -434.584716796875, + "logps/rejected": -435.99725341796875, + "loss": 0.5423, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.7439482808113098, + "rewards/margins": 0.45913758873939514, + "rewards/rejected": -1.2030858993530273, + "step": 1920 + }, + { + "epoch": 2.02, + "learning_rate": 1.2795916337264756e-07, + "logits/chosen": -2.46913480758667, + "logits/rejected": -2.377265214920044, + "logps/chosen": -426.83660888671875, + "logps/rejected": -421.1961364746094, + "loss": 0.5715, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.7704228162765503, + "rewards/margins": 0.47521066665649414, + "rewards/rejected": -1.2456334829330444, + "step": 1930 + }, + { + "epoch": 2.03, + "learning_rate": 1.2549159965172295e-07, + "logits/chosen": -2.4469761848449707, + "logits/rejected": -2.3427939414978027, + "logps/chosen": -425.2955017089844, + "logps/rejected": -424.61492919921875, + "loss": 0.5667, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.7805430293083191, + "rewards/margins": 0.49475497007369995, + "rewards/rejected": -1.275297999382019, + "step": 1940 + }, + { + "epoch": 2.04, + "learning_rate": 1.2304006276812122e-07, + "logits/chosen": -2.3801145553588867, + "logits/rejected": -2.350468397140503, + "logps/chosen": -366.3126220703125, + "logps/rejected": -400.9440002441406, + "loss": 0.5529, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.7851904630661011, + "rewards/margins": 0.4816361963748932, + "rewards/rejected": -1.2668267488479614, + "step": 1950 + }, + { + "epoch": 2.05, + "learning_rate": 1.2060486828595442e-07, + "logits/chosen": -2.4748592376708984, + "logits/rejected": -2.408360719680786, + "logps/chosen": -419.6524353027344, + "logps/rejected": -431.26055908203125, + "loss": 0.5753, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.8105200529098511, + "rewards/margins": 0.42913907766342163, + "rewards/rejected": -1.239659309387207, + "step": 1960 + }, + { + "epoch": 2.06, + "learning_rate": 1.1818632966572578e-07, + "logits/chosen": -2.4946236610412598, + "logits/rejected": -2.4172983169555664, + "logps/chosen": -416.42645263671875, + "logps/rejected": -442.043212890625, + "loss": 0.5641, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.8131068348884583, + "rewards/margins": 0.5074289441108704, + "rewards/rejected": -1.3205358982086182, + "step": 1970 + }, + { + "epoch": 2.07, + "learning_rate": 1.1578475822398032e-07, + "logits/chosen": -2.455920696258545, + "logits/rejected": -2.3736257553100586, + "logps/chosen": -422.4615783691406, + "logps/rejected": -441.34521484375, + "loss": 0.566, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.817348301410675, + "rewards/margins": 0.49631649255752563, + "rewards/rejected": -1.3136647939682007, + "step": 1980 + }, + { + "epoch": 2.08, + "learning_rate": 1.1340046309323206e-07, + "logits/chosen": -2.474325656890869, + "logits/rejected": -2.4197046756744385, + "logps/chosen": -406.9759826660156, + "logps/rejected": -424.93023681640625, + "loss": 0.5626, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.8615679740905762, + "rewards/margins": 0.4161076545715332, + "rewards/rejected": -1.2776756286621094, + "step": 1990 + }, + { + "epoch": 2.09, + "learning_rate": 1.1103375118217218e-07, + "logits/chosen": -2.4074063301086426, + "logits/rejected": -2.339216709136963, + "logps/chosen": -384.3484802246094, + "logps/rejected": -401.8287048339844, + "loss": 0.5633, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7149208784103394, + "rewards/margins": 0.4786139130592346, + "rewards/rejected": -1.1935349702835083, + "step": 2000 + }, + { + "epoch": 2.09, + "eval_logits/chosen": -2.4819118976593018, + "eval_logits/rejected": -2.407344341278076, + "eval_logps/chosen": -420.6850891113281, + "eval_logps/rejected": -423.75042724609375, + "eval_loss": 0.5884086489677429, + "eval_rewards/accuracies": 0.726190447807312, + "eval_rewards/chosen": -0.756405770778656, + "eval_rewards/margins": 0.45405152440071106, + "eval_rewards/rejected": -1.2104572057724, + "eval_runtime": 244.0013, + "eval_samples_per_second": 8.197, + "eval_steps_per_second": 0.258, + "step": 2000 + }, + { + "epoch": 2.1, + "learning_rate": 1.086849271361634e-07, + "logits/chosen": -2.4910244941711426, + "logits/rejected": -2.4178977012634277, + "logps/chosen": -421.26654052734375, + "logps/rejected": -442.17657470703125, + "loss": 0.5609, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.7127526998519897, + "rewards/margins": 0.517103374004364, + "rewards/rejected": -1.2298561334609985, + "step": 2010 + }, + { + "epoch": 2.11, + "learning_rate": 1.0635429329802578e-07, + "logits/chosen": -2.428316831588745, + "logits/rejected": -2.4025187492370605, + "logps/chosen": -385.6629943847656, + "logps/rejected": -423.72857666015625, + "loss": 0.5433, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7322330474853516, + "rewards/margins": 0.46380695700645447, + "rewards/rejected": -1.1960399150848389, + "step": 2020 + }, + { + "epoch": 2.12, + "learning_rate": 1.0404214966911895e-07, + "logits/chosen": -2.4701590538024902, + "logits/rejected": -2.3929755687713623, + "logps/chosen": -432.38629150390625, + "logps/rejected": -409.7430114746094, + "loss": 0.5412, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.7193957567214966, + "rewards/margins": 0.4175872206687927, + "rewards/rejected": -1.136983036994934, + "step": 2030 + }, + { + "epoch": 2.14, + "learning_rate": 1.0174879387072549e-07, + "logits/chosen": -2.4195656776428223, + "logits/rejected": -2.3856568336486816, + "logps/chosen": -378.31634521484375, + "logps/rejected": -429.25396728515625, + "loss": 0.5402, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.7908437848091125, + "rewards/margins": 0.4326706528663635, + "rewards/rejected": -1.2235145568847656, + "step": 2040 + }, + { + "epoch": 2.15, + "learning_rate": 9.947452110574098e-08, + "logits/chosen": -2.411748170852661, + "logits/rejected": -2.3596882820129395, + "logps/chosen": -426.4925842285156, + "logps/rejected": -443.3196716308594, + "loss": 0.5473, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.7338422536849976, + "rewards/margins": 0.503951907157898, + "rewards/rejected": -1.2377939224243164, + "step": 2050 + }, + { + "epoch": 2.16, + "learning_rate": 9.721962412067519e-08, + "logits/chosen": -2.401766061782837, + "logits/rejected": -2.316685676574707, + "logps/chosen": -408.12103271484375, + "logps/rejected": -398.24017333984375, + "loss": 0.5627, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.8616282343864441, + "rewards/margins": 0.36459067463874817, + "rewards/rejected": -1.2262189388275146, + "step": 2060 + }, + { + "epoch": 2.17, + "learning_rate": 9.498439316796913e-08, + "logits/chosen": -2.429500102996826, + "logits/rejected": -2.341491937637329, + "logps/chosen": -388.14208984375, + "logps/rejected": -396.15423583984375, + "loss": 0.551, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.7477534413337708, + "rewards/margins": 0.5198956727981567, + "rewards/rejected": -1.2676490545272827, + "step": 2070 + }, + { + "epoch": 2.18, + "learning_rate": 9.276911596863441e-08, + "logits/chosen": -2.430539608001709, + "logits/rejected": -2.3732991218566895, + "logps/chosen": -408.09771728515625, + "logps/rejected": -429.29705810546875, + "loss": 0.5612, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.6935451626777649, + "rewards/margins": 0.5118761658668518, + "rewards/rejected": -1.2054214477539062, + "step": 2080 + }, + { + "epoch": 2.19, + "learning_rate": 9.05740776752163e-08, + "logits/chosen": -2.534486770629883, + "logits/rejected": -2.425327777862549, + "logps/chosen": -466.2666931152344, + "logps/rejected": -450.0174255371094, + "loss": 0.5509, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.763168215751648, + "rewards/margins": 0.5992245674133301, + "rewards/rejected": -1.362392783164978, + "step": 2090 + }, + { + "epoch": 2.2, + "learning_rate": 8.839956083508959e-08, + "logits/chosen": -2.4332027435302734, + "logits/rejected": -2.4018168449401855, + "logps/chosen": -428.01910400390625, + "logps/rejected": -455.42889404296875, + "loss": 0.5564, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.7803698182106018, + "rewards/margins": 0.4613746106624603, + "rewards/rejected": -1.2417443990707397, + "step": 2100 + }, + { + "epoch": 2.2, + "eval_logits/chosen": -2.4696059226989746, + "eval_logits/rejected": -2.394796133041382, + "eval_logps/chosen": -426.54876708984375, + "eval_logps/rejected": -430.72430419921875, + "eval_loss": 0.587758481502533, + "eval_rewards/accuracies": 0.726190447807312, + "eval_rewards/chosen": -0.815041720867157, + "eval_rewards/margins": 0.4651543200016022, + "eval_rewards/rejected": -1.280196189880371, + "eval_runtime": 249.102, + "eval_samples_per_second": 8.029, + "eval_steps_per_second": 0.253, + "step": 2100 + }, + { + "epoch": 2.21, + "learning_rate": 8.624584535408836e-08, + "logits/chosen": -2.467682361602783, + "logits/rejected": -2.3958325386047363, + "logps/chosen": -442.32879638671875, + "logps/rejected": -444.9439392089844, + "loss": 0.5491, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.7700357437133789, + "rewards/margins": 0.5091968178749084, + "rewards/rejected": -1.2792325019836426, + "step": 2110 + }, + { + "epoch": 2.22, + "learning_rate": 8.411320846047637e-08, + "logits/chosen": -2.4758048057556152, + "logits/rejected": -2.452558755874634, + "logps/chosen": -398.3210754394531, + "logps/rejected": -397.4677734375, + "loss": 0.5505, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.7616570591926575, + "rewards/margins": 0.417407363653183, + "rewards/rejected": -1.179064393043518, + "step": 2120 + }, + { + "epoch": 2.23, + "learning_rate": 8.200192466926201e-08, + "logits/chosen": -2.4519848823547363, + "logits/rejected": -2.3826258182525635, + "logps/chosen": -436.55859375, + "logps/rejected": -458.65594482421875, + "loss": 0.5332, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.8493996858596802, + "rewards/margins": 0.5574203729629517, + "rewards/rejected": -1.4068200588226318, + "step": 2130 + }, + { + "epoch": 2.24, + "learning_rate": 7.991226574686241e-08, + "logits/chosen": -2.4183828830718994, + "logits/rejected": -2.3716251850128174, + "logps/chosen": -374.54803466796875, + "logps/rejected": -389.0216064453125, + "loss": 0.5528, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.7416827082633972, + "rewards/margins": 0.5108539462089539, + "rewards/rejected": -1.252536654472351, + "step": 2140 + }, + { + "epoch": 2.25, + "learning_rate": 7.784450067612138e-08, + "logits/chosen": -2.4434866905212402, + "logits/rejected": -2.3613460063934326, + "logps/chosen": -435.7193298339844, + "logps/rejected": -428.2972717285156, + "loss": 0.5349, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.7578371167182922, + "rewards/margins": 0.5514657497406006, + "rewards/rejected": -1.3093029260635376, + "step": 2150 + }, + { + "epoch": 2.26, + "learning_rate": 7.579889562168585e-08, + "logits/chosen": -2.434720993041992, + "logits/rejected": -2.359743595123291, + "logps/chosen": -392.49005126953125, + "logps/rejected": -418.412353515625, + "loss": 0.532, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.7233065962791443, + "rewards/margins": 0.6114121675491333, + "rewards/rejected": -1.3347185850143433, + "step": 2160 + }, + { + "epoch": 2.27, + "learning_rate": 7.377571389574474e-08, + "logits/chosen": -2.4690604209899902, + "logits/rejected": -2.412727117538452, + "logps/chosen": -407.22943115234375, + "logps/rejected": -429.3172302246094, + "loss": 0.5137, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.8272277116775513, + "rewards/margins": 0.5033684968948364, + "rewards/rejected": -1.3305962085723877, + "step": 2170 + }, + { + "epoch": 2.28, + "learning_rate": 7.177521592413505e-08, + "logits/chosen": -2.4891464710235596, + "logits/rejected": -2.4008851051330566, + "logps/chosen": -416.02374267578125, + "logps/rejected": -434.93853759765625, + "loss": 0.5715, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.84009850025177, + "rewards/margins": 0.4889064431190491, + "rewards/rejected": -1.3290048837661743, + "step": 2180 + }, + { + "epoch": 2.29, + "learning_rate": 6.979765921282021e-08, + "logits/chosen": -2.4268863201141357, + "logits/rejected": -2.3376641273498535, + "logps/chosen": -441.4808044433594, + "logps/rejected": -436.463134765625, + "loss": 0.5379, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.8702206611633301, + "rewards/margins": 0.5598399639129639, + "rewards/rejected": -1.430060625076294, + "step": 2190 + }, + { + "epoch": 2.3, + "learning_rate": 6.784329831474276e-08, + "logits/chosen": -2.4219398498535156, + "logits/rejected": -2.3306546211242676, + "logps/chosen": -419.054443359375, + "logps/rejected": -465.0423889160156, + "loss": 0.5373, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.7490620613098145, + "rewards/margins": 0.6311138272285461, + "rewards/rejected": -1.3801758289337158, + "step": 2200 + }, + { + "epoch": 2.3, + "eval_logits/chosen": -2.4548380374908447, + "eval_logits/rejected": -2.379462718963623, + "eval_logps/chosen": -432.9532470703125, + "eval_logps/rejected": -438.72894287109375, + "eval_loss": 0.5864917039871216, + "eval_rewards/accuracies": 0.7341269850730896, + "eval_rewards/chosen": -0.879087507724762, + "eval_rewards/margins": 0.481155127286911, + "eval_rewards/rejected": -1.3602426052093506, + "eval_runtime": 246.5339, + "eval_samples_per_second": 8.112, + "eval_steps_per_second": 0.256, + "step": 2200 + }, + { + "epoch": 2.31, + "learning_rate": 6.591238479705901e-08, + "logits/chosen": -2.487351179122925, + "logits/rejected": -2.405980348587036, + "logps/chosen": -408.71954345703125, + "logps/rejected": -426.19744873046875, + "loss": 0.5408, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.7479840517044067, + "rewards/margins": 0.6013373136520386, + "rewards/rejected": -1.3493213653564453, + "step": 2210 + }, + { + "epoch": 2.32, + "learning_rate": 6.40051672087562e-08, + "logits/chosen": -2.4331510066986084, + "logits/rejected": -2.3278965950012207, + "logps/chosen": -438.2916564941406, + "logps/rejected": -428.78485107421875, + "loss": 0.5201, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.8200156092643738, + "rewards/margins": 0.6146233677864075, + "rewards/rejected": -1.4346389770507812, + "step": 2220 + }, + { + "epoch": 2.33, + "learning_rate": 6.212189104865972e-08, + "logits/chosen": -2.448960781097412, + "logits/rejected": -2.387979030609131, + "logps/chosen": -428.59039306640625, + "logps/rejected": -447.049560546875, + "loss": 0.5591, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.7488642930984497, + "rewards/margins": 0.5563133955001831, + "rewards/rejected": -1.3051776885986328, + "step": 2230 + }, + { + "epoch": 2.34, + "learning_rate": 6.026279873383191e-08, + "logits/chosen": -2.3341879844665527, + "logits/rejected": -2.2485053539276123, + "logps/chosen": -434.8133850097656, + "logps/rejected": -454.1504821777344, + "loss": 0.5177, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.8202449083328247, + "rewards/margins": 0.660535454750061, + "rewards/rejected": -1.4807803630828857, + "step": 2240 + }, + { + "epoch": 2.35, + "learning_rate": 5.842812956836804e-08, + "logits/chosen": -2.5089287757873535, + "logits/rejected": -2.418996572494507, + "logps/chosen": -466.69097900390625, + "logps/rejected": -467.65948486328125, + "loss": 0.5671, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8531349897384644, + "rewards/margins": 0.5565911531448364, + "rewards/rejected": -1.4097262620925903, + "step": 2250 + }, + { + "epoch": 2.37, + "learning_rate": 5.661811971259284e-08, + "logits/chosen": -2.4990134239196777, + "logits/rejected": -2.427431106567383, + "logps/chosen": -414.7474060058594, + "logps/rejected": -434.27142333984375, + "loss": 0.5575, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.7728979587554932, + "rewards/margins": 0.4413650631904602, + "rewards/rejected": -1.2142630815505981, + "step": 2260 + }, + { + "epoch": 2.38, + "learning_rate": 5.483300215266168e-08, + "logits/chosen": -2.392768621444702, + "logits/rejected": -2.3579678535461426, + "logps/chosen": -379.36358642578125, + "logps/rejected": -460.744873046875, + "loss": 0.5327, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.820690929889679, + "rewards/margins": 0.571506679058075, + "rewards/rejected": -1.392197608947754, + "step": 2270 + }, + { + "epoch": 2.39, + "learning_rate": 5.307300667057049e-08, + "logits/chosen": -2.4417078495025635, + "logits/rejected": -2.350722551345825, + "logps/chosen": -456.86138916015625, + "logps/rejected": -441.4781188964844, + "loss": 0.5599, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.7839853763580322, + "rewards/margins": 0.4950861930847168, + "rewards/rejected": -1.279071569442749, + "step": 2280 + }, + { + "epoch": 2.4, + "learning_rate": 5.133835981457771e-08, + "logits/chosen": -2.4283745288848877, + "logits/rejected": -2.406078815460205, + "logps/chosen": -382.7402038574219, + "logps/rejected": -412.78192138671875, + "loss": 0.5472, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.8303159475326538, + "rewards/margins": 0.4645245671272278, + "rewards/rejected": -1.2948405742645264, + "step": 2290 + }, + { + "epoch": 2.41, + "learning_rate": 4.962928487004339e-08, + "logits/chosen": -2.4081058502197266, + "logits/rejected": -2.3385443687438965, + "logps/chosen": -403.322509765625, + "logps/rejected": -437.6986389160156, + "loss": 0.5559, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.7759246826171875, + "rewards/margins": 0.6043592691421509, + "rewards/rejected": -1.3802839517593384, + "step": 2300 + }, + { + "epoch": 2.41, + "eval_logits/chosen": -2.4495673179626465, + "eval_logits/rejected": -2.374314069747925, + "eval_logps/chosen": -429.79962158203125, + "eval_logps/rejected": -435.3000793457031, + "eval_loss": 0.587175726890564, + "eval_rewards/accuracies": 0.7242063283920288, + "eval_rewards/chosen": -0.8475508689880371, + "eval_rewards/margins": 0.478402704000473, + "eval_rewards/rejected": -1.325953483581543, + "eval_runtime": 245.2318, + "eval_samples_per_second": 8.156, + "eval_steps_per_second": 0.257, + "step": 2300 + }, + { + "epoch": 2.42, + "learning_rate": 4.794600183068687e-08, + "logits/chosen": -2.4469664096832275, + "logits/rejected": -2.386204719543457, + "logps/chosen": -409.12469482421875, + "logps/rejected": -439.278564453125, + "loss": 0.5463, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.8211742639541626, + "rewards/margins": 0.6064980626106262, + "rewards/rejected": -1.4276723861694336, + "step": 2310 + }, + { + "epoch": 2.43, + "learning_rate": 4.628872737026984e-08, + "logits/chosen": -2.4036548137664795, + "logits/rejected": -2.3470802307128906, + "logps/chosen": -398.6167907714844, + "logps/rejected": -412.97027587890625, + "loss": 0.554, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.8146356344223022, + "rewards/margins": 0.4236365258693695, + "rewards/rejected": -1.2382723093032837, + "step": 2320 + }, + { + "epoch": 2.44, + "learning_rate": 4.4657674814705085e-08, + "logits/chosen": -2.4584813117980957, + "logits/rejected": -2.3649630546569824, + "logps/chosen": -418.7437438964844, + "logps/rejected": -406.3216857910156, + "loss": 0.5604, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.7241760492324829, + "rewards/margins": 0.540256917476654, + "rewards/rejected": -1.2644331455230713, + "step": 2330 + }, + { + "epoch": 2.45, + "learning_rate": 4.305305411459773e-08, + "logits/chosen": -2.4563419818878174, + "logits/rejected": -2.410745859146118, + "logps/chosen": -436.936767578125, + "logps/rejected": -442.15948486328125, + "loss": 0.5533, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.7409111261367798, + "rewards/margins": 0.5352380275726318, + "rewards/rejected": -1.276149034500122, + "step": 2340 + }, + { + "epoch": 2.46, + "learning_rate": 4.1475071818219466e-08, + "logits/chosen": -2.3814117908477783, + "logits/rejected": -2.3320162296295166, + "logps/chosen": -450.9549865722656, + "logps/rejected": -440.48974609375, + "loss": 0.5545, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.8471084833145142, + "rewards/margins": 0.5237026214599609, + "rewards/rejected": -1.3708112239837646, + "step": 2350 + }, + { + "epoch": 2.47, + "learning_rate": 3.992393104492209e-08, + "logits/chosen": -2.4187827110290527, + "logits/rejected": -2.3232665061950684, + "logps/chosen": -408.9256286621094, + "logps/rejected": -428.8360290527344, + "loss": 0.5629, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.8053148984909058, + "rewards/margins": 0.47992628812789917, + "rewards/rejected": -1.2852413654327393, + "step": 2360 + }, + { + "epoch": 2.48, + "learning_rate": 3.839983145899148e-08, + "logits/chosen": -2.399820566177368, + "logits/rejected": -2.2919247150421143, + "logps/chosen": -429.00469970703125, + "logps/rejected": -428.13018798828125, + "loss": 0.5486, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.7831125259399414, + "rewards/margins": 0.573869526386261, + "rewards/rejected": -1.3569821119308472, + "step": 2370 + }, + { + "epoch": 2.49, + "learning_rate": 3.690296924394659e-08, + "logits/chosen": -2.3557441234588623, + "logits/rejected": -2.333583354949951, + "logps/chosen": -409.0820007324219, + "logps/rejected": -417.942138671875, + "loss": 0.5611, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.9311076402664185, + "rewards/margins": 0.3103558421134949, + "rewards/rejected": -1.2414636611938477, + "step": 2380 + }, + { + "epoch": 2.5, + "learning_rate": 3.543353707728672e-08, + "logits/chosen": -2.426609992980957, + "logits/rejected": -2.3464412689208984, + "logps/chosen": -406.8370666503906, + "logps/rejected": -408.04913330078125, + "loss": 0.5695, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.8274558186531067, + "rewards/margins": 0.5477991700172424, + "rewards/rejected": -1.3752549886703491, + "step": 2390 + }, + { + "epoch": 2.51, + "learning_rate": 3.3991724105689736e-08, + "logits/chosen": -2.3941025733947754, + "logits/rejected": -2.2982020378112793, + "logps/chosen": -423.27313232421875, + "logps/rejected": -424.16973876953125, + "loss": 0.5467, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.898272693157196, + "rewards/margins": 0.496847003698349, + "rewards/rejected": -1.3951194286346436, + "step": 2400 + }, + { + "epoch": 2.51, + "eval_logits/chosen": -2.4452052116394043, + "eval_logits/rejected": -2.3696937561035156, + "eval_logps/chosen": -429.87860107421875, + "eval_logps/rejected": -435.4400939941406, + "eval_loss": 0.5867913961410522, + "eval_rewards/accuracies": 0.7222222089767456, + "eval_rewards/chosen": -0.8483405113220215, + "eval_rewards/margins": 0.4790137708187103, + "eval_rewards/rejected": -1.3273543119430542, + "eval_runtime": 247.943, + "eval_samples_per_second": 8.066, + "eval_steps_per_second": 0.254, + "step": 2400 + }, + { + "epoch": 2.52, + "learning_rate": 3.257771592066499e-08, + "logits/chosen": -2.428584098815918, + "logits/rejected": -2.3538498878479004, + "logps/chosen": -428.6109924316406, + "logps/rejected": -435.99542236328125, + "loss": 0.5562, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.8293555378913879, + "rewards/margins": 0.58681321144104, + "rewards/rejected": -1.4161686897277832, + "step": 2410 + }, + { + "epoch": 2.53, + "learning_rate": 3.119169453466367e-08, + "logits/chosen": -2.5020382404327393, + "logits/rejected": -2.42659330368042, + "logps/chosen": -431.7511291503906, + "logps/rejected": -445.3756408691406, + "loss": 0.5214, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.7522695660591125, + "rewards/margins": 0.48350271582603455, + "rewards/rejected": -1.2357723712921143, + "step": 2420 + }, + { + "epoch": 2.54, + "learning_rate": 2.983383835765038e-08, + "logits/chosen": -2.4027044773101807, + "logits/rejected": -2.3710248470306396, + "logps/chosen": -436.78973388671875, + "logps/rejected": -459.3680725097656, + "loss": 0.5583, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.7804380655288696, + "rewards/margins": 0.5590247511863708, + "rewards/rejected": -1.3394627571105957, + "step": 2430 + }, + { + "epoch": 2.55, + "learning_rate": 2.8504322174137452e-08, + "logits/chosen": -2.419508695602417, + "logits/rejected": -2.393357992172241, + "logps/chosen": -372.4573059082031, + "logps/rejected": -410.5888671875, + "loss": 0.5364, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.818364143371582, + "rewards/margins": 0.5231537818908691, + "rewards/rejected": -1.341517686843872, + "step": 2440 + }, + { + "epoch": 2.56, + "learning_rate": 2.7203317120687214e-08, + "logits/chosen": -2.36185359954834, + "logits/rejected": -2.3087821006774902, + "logps/chosen": -438.8936462402344, + "logps/rejected": -449.9029846191406, + "loss": 0.5202, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.8277127146720886, + "rewards/margins": 0.5720094442367554, + "rewards/rejected": -1.3997222185134888, + "step": 2450 + }, + { + "epoch": 2.57, + "learning_rate": 2.5930990663882298e-08, + "logits/chosen": -2.5004594326019287, + "logits/rejected": -2.4132115840911865, + "logps/chosen": -463.64031982421875, + "logps/rejected": -466.9752502441406, + "loss": 0.53, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.8853015899658203, + "rewards/margins": 0.4785071015357971, + "rewards/rejected": -1.3638086318969727, + "step": 2460 + }, + { + "epoch": 2.59, + "learning_rate": 2.4687506578770195e-08, + "logits/chosen": -2.4215641021728516, + "logits/rejected": -2.373194456100464, + "logps/chosen": -429.7237854003906, + "logps/rejected": -465.56622314453125, + "loss": 0.5514, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.7883203029632568, + "rewards/margins": 0.5758455991744995, + "rewards/rejected": -1.3641657829284668, + "step": 2470 + }, + { + "epoch": 2.6, + "learning_rate": 2.3473024927780888e-08, + "logits/chosen": -2.3653247356414795, + "logits/rejected": -2.373579502105713, + "logps/chosen": -422.0316467285156, + "logps/rejected": -431.6985778808594, + "loss": 0.5492, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.880569577217102, + "rewards/margins": 0.3722376823425293, + "rewards/rejected": -1.252807378768921, + "step": 2480 + }, + { + "epoch": 2.61, + "learning_rate": 2.228770204012448e-08, + "logits/chosen": -2.4217820167541504, + "logits/rejected": -2.3459651470184326, + "logps/chosen": -400.4313049316406, + "logps/rejected": -409.916259765625, + "loss": 0.5498, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.8538491129875183, + "rewards/margins": 0.4800568222999573, + "rewards/rejected": -1.3339059352874756, + "step": 2490 + }, + { + "epoch": 2.62, + "learning_rate": 2.1131690491667547e-08, + "logits/chosen": -2.431408405303955, + "logits/rejected": -2.384169578552246, + "logps/chosen": -432.4293518066406, + "logps/rejected": -427.60626220703125, + "loss": 0.5666, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.9473799467086792, + "rewards/margins": 0.34295937418937683, + "rewards/rejected": -1.2903392314910889, + "step": 2500 + }, + { + "epoch": 2.62, + "eval_logits/chosen": -2.439899444580078, + "eval_logits/rejected": -2.364093780517578, + "eval_logps/chosen": -432.5810852050781, + "eval_logps/rejected": -438.9631042480469, + "eval_loss": 0.585797131061554, + "eval_rewards/accuracies": 0.7242063283920288, + "eval_rewards/chosen": -0.8753649592399597, + "eval_rewards/margins": 0.4872189164161682, + "eval_rewards/rejected": -1.3625837564468384, + "eval_runtime": 250.9047, + "eval_samples_per_second": 7.971, + "eval_steps_per_second": 0.251, + "step": 2500 + }, + { + "epoch": 2.63, + "learning_rate": 2.0005139085293942e-08, + "logits/chosen": -2.3748772144317627, + "logits/rejected": -2.366490364074707, + "logps/chosen": -394.9242248535156, + "logps/rejected": -445.06317138671875, + "loss": 0.5697, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.9380524754524231, + "rewards/margins": 0.4403650164604187, + "rewards/rejected": -1.3784174919128418, + "step": 2510 + }, + { + "epoch": 2.64, + "learning_rate": 1.8908192831750545e-08, + "logits/chosen": -2.3916258811950684, + "logits/rejected": -2.3020219802856445, + "logps/chosen": -426.46990966796875, + "logps/rejected": -416.8695373535156, + "loss": 0.5335, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.8564615249633789, + "rewards/margins": 0.5280572175979614, + "rewards/rejected": -1.3845187425613403, + "step": 2520 + }, + { + "epoch": 2.65, + "learning_rate": 1.7840992930981345e-08, + "logits/chosen": -2.4265730381011963, + "logits/rejected": -2.38875150680542, + "logps/chosen": -462.3692932128906, + "logps/rejected": -460.39337158203125, + "loss": 0.5537, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.8171685338020325, + "rewards/margins": 0.4775725305080414, + "rewards/rejected": -1.294741153717041, + "step": 2530 + }, + { + "epoch": 2.66, + "learning_rate": 1.6803676753952138e-08, + "logits/chosen": -2.3773293495178223, + "logits/rejected": -2.3178646564483643, + "logps/chosen": -409.1978759765625, + "logps/rejected": -452.56549072265625, + "loss": 0.529, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.7988417744636536, + "rewards/margins": 0.6674584150314331, + "rewards/rejected": -1.466300129890442, + "step": 2540 + }, + { + "epoch": 2.67, + "learning_rate": 1.5796377824967788e-08, + "logits/chosen": -2.4757285118103027, + "logits/rejected": -2.3862550258636475, + "logps/chosen": -461.76483154296875, + "logps/rejected": -448.6534118652344, + "loss": 0.5449, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.7839967012405396, + "rewards/margins": 0.5081378817558289, + "rewards/rejected": -1.2921345233917236, + "step": 2550 + }, + { + "epoch": 2.68, + "learning_rate": 1.481922580448533e-08, + "logits/chosen": -2.402846574783325, + "logits/rejected": -2.3772921562194824, + "logps/chosen": -418.25079345703125, + "logps/rejected": -474.2286682128906, + "loss": 0.5587, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.7962145805358887, + "rewards/margins": 0.4984118938446045, + "rewards/rejected": -1.2946264743804932, + "step": 2560 + }, + { + "epoch": 2.69, + "learning_rate": 1.3872346472423246e-08, + "logits/chosen": -2.4501101970672607, + "logits/rejected": -2.356166362762451, + "logps/chosen": -448.30303955078125, + "logps/rejected": -447.73712158203125, + "loss": 0.5517, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.8609638214111328, + "rewards/margins": 0.46454888582229614, + "rewards/rejected": -1.3255127668380737, + "step": 2570 + }, + { + "epoch": 2.7, + "learning_rate": 1.2955861711971745e-08, + "logits/chosen": -2.4125468730926514, + "logits/rejected": -2.312474012374878, + "logps/chosen": -454.7066955566406, + "logps/rejected": -423.25390625, + "loss": 0.5568, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.8763197064399719, + "rewards/margins": 0.5553014874458313, + "rewards/rejected": -1.4316211938858032, + "step": 2580 + }, + { + "epoch": 2.71, + "learning_rate": 1.2069889493903112e-08, + "logits/chosen": -2.40596342086792, + "logits/rejected": -2.3436505794525146, + "logps/chosen": -426.20660400390625, + "logps/rejected": -443.60552978515625, + "loss": 0.5476, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.8504959344863892, + "rewards/margins": 0.5777041912078857, + "rewards/rejected": -1.4282000064849854, + "step": 2590 + }, + { + "epoch": 2.72, + "learning_rate": 1.1214543861387039e-08, + "logits/chosen": -2.3759148120880127, + "logits/rejected": -2.3197531700134277, + "logps/chosen": -403.3725280761719, + "logps/rejected": -445.80767822265625, + "loss": 0.5113, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.731840193271637, + "rewards/margins": 0.7046986818313599, + "rewards/rejected": -1.4365389347076416, + "step": 2600 + }, + { + "epoch": 2.72, + "eval_logits/chosen": -2.4361400604248047, + "eval_logits/rejected": -2.3603618144989014, + "eval_logps/chosen": -434.4620361328125, + "eval_logps/rejected": -441.12109375, + "eval_loss": 0.5855809450149536, + "eval_rewards/accuracies": 0.7242063283920288, + "eval_rewards/chosen": -0.894174337387085, + "eval_rewards/margins": 0.4899892508983612, + "eval_rewards/rejected": -1.384163737297058, + "eval_runtime": 244.6014, + "eval_samples_per_second": 8.177, + "eval_steps_per_second": 0.258, + "step": 2600 + }, + { + "epoch": 2.73, + "learning_rate": 1.0389934915310344e-08, + "logits/chosen": -2.3394923210144043, + "logits/rejected": -2.297569990158081, + "logps/chosen": -414.2259826660156, + "logps/rejected": -447.39239501953125, + "loss": 0.5295, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.9291477203369141, + "rewards/margins": 0.49646610021591187, + "rewards/rejected": -1.4256137609481812, + "step": 2610 + }, + { + "epoch": 2.74, + "learning_rate": 9.596168800105081e-09, + "logits/chosen": -2.4024291038513184, + "logits/rejected": -2.347120523452759, + "logps/chosen": -436.79364013671875, + "logps/rejected": -450.7452697753906, + "loss": 0.5388, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.8427282571792603, + "rewards/margins": 0.5560713410377502, + "rewards/rejected": -1.3987995386123657, + "step": 2620 + }, + { + "epoch": 2.75, + "learning_rate": 8.833347690085258e-09, + "logits/chosen": -2.4367868900299072, + "logits/rejected": -2.39370059967041, + "logps/chosen": -430.2493591308594, + "logps/rejected": -459.7605895996094, + "loss": 0.5325, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.816397488117218, + "rewards/margins": 0.48605260252952576, + "rewards/rejected": -1.3024499416351318, + "step": 2630 + }, + { + "epoch": 2.76, + "learning_rate": 8.101569776295087e-09, + "logits/chosen": -2.44547700881958, + "logits/rejected": -2.3715062141418457, + "logps/chosen": -451.3525390625, + "logps/rejected": -472.49871826171875, + "loss": 0.5547, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.8327264785766602, + "rewards/margins": 0.5013109445571899, + "rewards/rejected": -1.3340375423431396, + "step": 2640 + }, + { + "epoch": 2.77, + "learning_rate": 7.400929253869537e-09, + "logits/chosen": -2.386373519897461, + "logits/rejected": -2.3548552989959717, + "logps/chosen": -411.06036376953125, + "logps/rejected": -401.76202392578125, + "loss": 0.5401, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.7602349519729614, + "rewards/margins": 0.544723391532898, + "rewards/rejected": -1.3049582242965698, + "step": 2650 + }, + { + "epoch": 2.78, + "learning_rate": 6.731516309909619e-09, + "logits/chosen": -2.4245498180389404, + "logits/rejected": -2.351013660430908, + "logps/chosen": -413.696533203125, + "logps/rejected": -431.78143310546875, + "loss": 0.5556, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.8400930166244507, + "rewards/margins": 0.5150105357170105, + "rewards/rejected": -1.3551037311553955, + "step": 2660 + }, + { + "epoch": 2.79, + "learning_rate": 6.093417111873306e-09, + "logits/chosen": -2.405090808868408, + "logits/rejected": -2.363133668899536, + "logps/chosen": -446.4725036621094, + "logps/rejected": -457.05291748046875, + "loss": 0.5637, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.940416157245636, + "rewards/margins": 0.46864986419677734, + "rewards/rejected": -1.4090659618377686, + "step": 2670 + }, + { + "epoch": 2.8, + "learning_rate": 5.486713796483966e-09, + "logits/chosen": -2.4188191890716553, + "logits/rejected": -2.4030518531799316, + "logps/chosen": -426.7608947753906, + "logps/rejected": -478.41424560546875, + "loss": 0.5375, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.9050353765487671, + "rewards/margins": 0.4908295273780823, + "rewards/rejected": -1.3958650827407837, + "step": 2680 + }, + { + "epoch": 2.82, + "learning_rate": 4.911484459157844e-09, + "logits/chosen": -2.355522632598877, + "logits/rejected": -2.2627921104431152, + "logps/chosen": -414.41058349609375, + "logps/rejected": -408.3012390136719, + "loss": 0.5264, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.8410286903381348, + "rewards/margins": 0.5534576177597046, + "rewards/rejected": -1.394486427307129, + "step": 2690 + }, + { + "epoch": 2.83, + "learning_rate": 4.36780314395116e-09, + "logits/chosen": -2.391401767730713, + "logits/rejected": -2.301657199859619, + "logps/chosen": -408.6781005859375, + "logps/rejected": -399.43438720703125, + "loss": 0.5601, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.82252436876297, + "rewards/margins": 0.43992215394973755, + "rewards/rejected": -1.262446403503418, + "step": 2700 + }, + { + "epoch": 2.83, + "eval_logits/chosen": -2.434546947479248, + "eval_logits/rejected": -2.35845685005188, + "eval_logps/chosen": -435.4488525390625, + "eval_logps/rejected": -442.29296875, + "eval_loss": 0.5855222344398499, + "eval_rewards/accuracies": 0.726190447807312, + "eval_rewards/chosen": -0.904042661190033, + "eval_rewards/margins": 0.49184030294418335, + "eval_rewards/rejected": -1.3958829641342163, + "eval_runtime": 247.9608, + "eval_samples_per_second": 8.066, + "eval_steps_per_second": 0.254, + "step": 2700 + }, + { + "epoch": 2.84, + "learning_rate": 3.8557398340296195e-09, + "logits/chosen": -2.4358608722686768, + "logits/rejected": -2.317959785461426, + "logps/chosen": -421.12969970703125, + "logps/rejected": -426.5562438964844, + "loss": 0.5643, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.9019654393196106, + "rewards/margins": 0.5431965589523315, + "rewards/rejected": -1.445162057876587, + "step": 2710 + }, + { + "epoch": 2.85, + "learning_rate": 3.3753604426595417e-09, + "logits/chosen": -2.3855860233306885, + "logits/rejected": -2.311525344848633, + "logps/chosen": -406.04010009765625, + "logps/rejected": -413.10186767578125, + "loss": 0.5756, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.8721002340316772, + "rewards/margins": 0.516608715057373, + "rewards/rejected": -1.3887090682983398, + "step": 2720 + }, + { + "epoch": 2.86, + "learning_rate": 2.926726804723917e-09, + "logits/chosen": -2.3775479793548584, + "logits/rejected": -2.3673160076141357, + "logps/chosen": -443.3018493652344, + "logps/rejected": -457.483154296875, + "loss": 0.5549, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8438417315483093, + "rewards/margins": 0.5027114748954773, + "rewards/rejected": -1.3465534448623657, + "step": 2730 + }, + { + "epoch": 2.87, + "learning_rate": 2.5098966687626954e-09, + "logits/chosen": -2.4318108558654785, + "logits/rejected": -2.3373677730560303, + "logps/chosen": -429.6363830566406, + "logps/rejected": -441.6180725097656, + "loss": 0.5237, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.8596477508544922, + "rewards/margins": 0.6358083486557007, + "rewards/rejected": -1.4954560995101929, + "step": 2740 + }, + { + "epoch": 2.88, + "learning_rate": 2.124923689539426e-09, + "logits/chosen": -2.4223837852478027, + "logits/rejected": -2.356449604034424, + "logps/chosen": -418.0809631347656, + "logps/rejected": -436.1025390625, + "loss": 0.5379, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8189032673835754, + "rewards/margins": 0.5849324464797974, + "rewards/rejected": -1.4038358926773071, + "step": 2750 + }, + { + "epoch": 2.89, + "learning_rate": 1.7718574211347537e-09, + "logits/chosen": -2.3926260471343994, + "logits/rejected": -2.3217663764953613, + "logps/chosen": -391.73358154296875, + "logps/rejected": -397.6258239746094, + "loss": 0.5441, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.8713921308517456, + "rewards/margins": 0.4508208632469177, + "rewards/rejected": -1.3222129344940186, + "step": 2760 + }, + { + "epoch": 2.9, + "learning_rate": 1.4507433105677703e-09, + "logits/chosen": -2.361290454864502, + "logits/rejected": -2.2786500453948975, + "logps/chosen": -435.0477600097656, + "logps/rejected": -461.3599548339844, + "loss": 0.5338, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.8669681549072266, + "rewards/margins": 0.5835943222045898, + "rewards/rejected": -1.4505623579025269, + "step": 2770 + }, + { + "epoch": 2.91, + "learning_rate": 1.1616226919460015e-09, + "logits/chosen": -2.339124917984009, + "logits/rejected": -2.259683132171631, + "logps/chosen": -381.10076904296875, + "logps/rejected": -404.74615478515625, + "loss": 0.5604, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.9128230214118958, + "rewards/margins": 0.4604857563972473, + "rewards/rejected": -1.3733086585998535, + "step": 2780 + }, + { + "epoch": 2.92, + "learning_rate": 9.045327811449676e-10, + "logits/chosen": -2.357779026031494, + "logits/rejected": -2.281944990158081, + "logps/chosen": -403.49346923828125, + "logps/rejected": -422.0860290527344, + "loss": 0.5393, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8404678106307983, + "rewards/margins": 0.6726306676864624, + "rewards/rejected": -1.5130985975265503, + "step": 2790 + }, + { + "epoch": 2.93, + "learning_rate": 6.795066710175157e-10, + "logits/chosen": -2.414121150970459, + "logits/rejected": -2.334519863128662, + "logps/chosen": -422.6380920410156, + "logps/rejected": -431.3646545410156, + "loss": 0.5303, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.8720327615737915, + "rewards/margins": 0.5439731478691101, + "rewards/rejected": -1.4160058498382568, + "step": 2800 + }, + { + "epoch": 2.93, + "eval_logits/chosen": -2.434152364730835, + "eval_logits/rejected": -2.358067512512207, + "eval_logps/chosen": -435.0786437988281, + "eval_logps/rejected": -441.68048095703125, + "eval_loss": 0.5856688618659973, + "eval_rewards/accuracies": 0.7242063283920288, + "eval_rewards/chosen": -0.9003406763076782, + "eval_rewards/margins": 0.4894171357154846, + "eval_rewards/rejected": -1.389757752418518, + "eval_runtime": 249.9152, + "eval_samples_per_second": 8.003, + "eval_steps_per_second": 0.252, + "step": 2800 + }, + { + "epoch": 2.94, + "learning_rate": 4.86573327134282e-10, + "logits/chosen": -2.392698287963867, + "logits/rejected": -2.3596174716949463, + "logps/chosen": -470.7052307128906, + "logps/rejected": -468.05133056640625, + "loss": 0.5605, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.9614042043685913, + "rewards/margins": 0.41344791650772095, + "rewards/rejected": -1.374852180480957, + "step": 2810 + }, + { + "epoch": 2.95, + "learning_rate": 3.2575758405506414e-10, + "logits/chosen": -2.449122428894043, + "logits/rejected": -2.3575618267059326, + "logps/chosen": -462.04248046875, + "logps/rejected": -493.18243408203125, + "loss": 0.5312, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.8269790410995483, + "rewards/margins": 0.6107988357543945, + "rewards/rejected": -1.4377778768539429, + "step": 2820 + }, + { + "epoch": 2.96, + "learning_rate": 1.9708014213221101e-10, + "logits/chosen": -2.3595032691955566, + "logits/rejected": -2.317110538482666, + "logps/chosen": -446.1568298339844, + "logps/rejected": -459.81341552734375, + "loss": 0.5503, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.9512478709220886, + "rewards/margins": 0.4871467649936676, + "rewards/rejected": -1.4383947849273682, + "step": 2830 + }, + { + "epoch": 2.97, + "learning_rate": 1.0055756484589339e-10, + "logits/chosen": -2.350008249282837, + "logits/rejected": -2.2946763038635254, + "logps/chosen": -425.847412109375, + "logps/rejected": -416.67535400390625, + "loss": 0.5338, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7966060042381287, + "rewards/margins": 0.5134121179580688, + "rewards/rejected": -1.3100181818008423, + "step": 2840 + }, + { + "epoch": 2.98, + "learning_rate": 3.620227667228137e-11, + "logits/chosen": -2.44765043258667, + "logits/rejected": -2.388314723968506, + "logps/chosen": -457.08624267578125, + "logps/rejected": -460.8662109375, + "loss": 0.5446, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.860503077507019, + "rewards/margins": 0.4813441336154938, + "rewards/rejected": -1.3418471813201904, + "step": 2850 + }, + { + "epoch": 2.99, + "learning_rate": 4.022561484018361e-12, + "logits/chosen": -2.4590046405792236, + "logits/rejected": -2.3513126373291016, + "logps/chosen": -433.288330078125, + "logps/rejected": -440.690673828125, + "loss": 0.5286, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.8339791297912598, + "rewards/margins": 0.594098687171936, + "rewards/rejected": -1.4280778169631958, + "step": 2860 + }, + { + "epoch": 3.0, + "step": 2865, + "total_flos": 0.0, + "train_loss": 0.5928295368507478, + "train_runtime": 47453.5759, + "train_samples_per_second": 3.865, + "train_steps_per_second": 0.06 + } + ], + "logging_steps": 10, + "max_steps": 2865, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 100, + "total_flos": 0.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}