{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9984591679506933, "eval_steps": 100, "global_step": 324, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 6.101982836222645, "learning_rate": 1.5151515151515152e-07, "logits/chosen": -0.362821102142334, "logits/rejected": -0.6466645002365112, "logps/chosen": -1025.3448486328125, "logps/rejected": -1304.718017578125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03, "grad_norm": 6.0068256381560765, "learning_rate": 1.5151515151515152e-06, "logits/chosen": -0.6083016991615295, "logits/rejected": -0.6111394166946411, "logps/chosen": -990.301025390625, "logps/rejected": -1385.5863037109375, "loss": 0.6912, "rewards/accuracies": 0.4583333432674408, "rewards/chosen": -0.00793336983770132, "rewards/margins": 0.0015673839952796698, "rewards/rejected": -0.009500754997134209, "step": 10 }, { "epoch": 0.06, "grad_norm": 6.64385894959999, "learning_rate": 3.0303030303030305e-06, "logits/chosen": -0.39747971296310425, "logits/rejected": -0.5266290903091431, "logps/chosen": -1019.9202270507812, "logps/rejected": -1275.5029296875, "loss": 0.6306, "rewards/accuracies": 0.71875, "rewards/chosen": -0.17681096494197845, "rewards/margins": 0.19123901426792145, "rewards/rejected": -0.3680500090122223, "step": 20 }, { "epoch": 0.09, "grad_norm": 5.146938095650329, "learning_rate": 4.5454545454545455e-06, "logits/chosen": -0.3289431631565094, "logits/rejected": -0.3537369966506958, "logps/chosen": -914.8097534179688, "logps/rejected": -1425.679443359375, "loss": 0.5081, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.23936215043067932, "rewards/margins": 0.8084670305252075, "rewards/rejected": -1.0478291511535645, "step": 30 }, { "epoch": 0.12, "grad_norm": 11.382612181193535, "learning_rate": 4.9928646847826494e-06, "logits/chosen": -0.27268069982528687, "logits/rejected": -0.3392156958580017, "logps/chosen": -1024.892578125, "logps/rejected": -1513.9617919921875, "loss": 0.4356, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.20300360023975372, "rewards/margins": 1.8095756769180298, "rewards/rejected": -2.0125787258148193, "step": 40 }, { "epoch": 0.15, "grad_norm": 4.485452143975147, "learning_rate": 4.958014217656855e-06, "logits/chosen": -0.23044054210186005, "logits/rejected": -0.25221356749534607, "logps/chosen": -967.2037353515625, "logps/rejected": -1537.017333984375, "loss": 0.3801, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -0.12485456466674805, "rewards/margins": 2.005478620529175, "rewards/rejected": -2.130333185195923, "step": 50 }, { "epoch": 0.18, "grad_norm": 3.8070457431642892, "learning_rate": 4.894543310469968e-06, "logits/chosen": -0.19517004489898682, "logits/rejected": -0.22597365081310272, "logps/chosen": -916.4852294921875, "logps/rejected": -1595.1839599609375, "loss": 0.3655, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.2139274626970291, "rewards/margins": 2.360320568084717, "rewards/rejected": -2.5742483139038086, "step": 60 }, { "epoch": 0.22, "grad_norm": 3.6141979805383726, "learning_rate": 4.803191000971128e-06, "logits/chosen": -0.17929306626319885, "logits/rejected": -0.18331752717494965, "logps/chosen": -965.8648681640625, "logps/rejected": -1572.9818115234375, "loss": 0.3243, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -0.4429554343223572, "rewards/margins": 2.501615047454834, "rewards/rejected": -2.944570541381836, "step": 70 }, { "epoch": 0.25, "grad_norm": 3.9698745789179712, "learning_rate": 4.68502097027319e-06, "logits/chosen": -0.18549516797065735, "logits/rejected": -0.30454546213150024, "logps/chosen": -881.955078125, "logps/rejected": -1555.6883544921875, "loss": 0.284, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.10575978457927704, "rewards/margins": 2.606116771697998, "rewards/rejected": -2.711876392364502, "step": 80 }, { "epoch": 0.28, "grad_norm": 4.915459926093003, "learning_rate": 4.541409157643027e-06, "logits/chosen": -0.2555353045463562, "logits/rejected": -0.3517759442329407, "logps/chosen": -1006.6803588867188, "logps/rejected": -1699.8876953125, "loss": 0.2626, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.24506595730781555, "rewards/margins": 3.7898712158203125, "rewards/rejected": -4.034937381744385, "step": 90 }, { "epoch": 0.31, "grad_norm": 5.579746591418778, "learning_rate": 4.374027739443953e-06, "logits/chosen": -0.2530584931373596, "logits/rejected": -0.39778950810432434, "logps/chosen": -1006.4603271484375, "logps/rejected": -1830.1383056640625, "loss": 0.249, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -0.23174142837524414, "rewards/margins": 4.588972091674805, "rewards/rejected": -4.820713996887207, "step": 100 }, { "epoch": 0.31, "eval_logits/chosen": -0.2358812391757965, "eval_logits/rejected": -0.266615092754364, "eval_logps/chosen": -535.3107299804688, "eval_logps/rejected": -1504.041259765625, "eval_loss": 0.3604305684566498, "eval_rewards/accuracies": 0.8942307829856873, "eval_rewards/chosen": -0.7724042534828186, "eval_rewards/margins": 7.1227898597717285, "eval_rewards/rejected": -7.8951945304870605, "eval_runtime": 41.33, "eval_samples_per_second": 9.581, "eval_steps_per_second": 0.315, "step": 100 }, { "epoch": 0.34, "grad_norm": 13.962571315802977, "learning_rate": 4.184825658775027e-06, "logits/chosen": -0.35490721464157104, "logits/rejected": -0.3757438659667969, "logps/chosen": -973.4483642578125, "logps/rejected": -1818.024658203125, "loss": 0.2291, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.5153377056121826, "rewards/margins": 3.8287353515625, "rewards/rejected": -4.344073295593262, "step": 110 }, { "epoch": 0.37, "grad_norm": 3.793575374162859, "learning_rate": 3.976005932514807e-06, "logits/chosen": -0.3033773601055145, "logits/rejected": -0.33670344948768616, "logps/chosen": -1026.07373046875, "logps/rejected": -1623.307861328125, "loss": 0.1906, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.40073472261428833, "rewards/margins": 3.711804151535034, "rewards/rejected": -4.112539768218994, "step": 120 }, { "epoch": 0.4, "grad_norm": 3.3455045057377073, "learning_rate": 3.7500000000000005e-06, "logits/chosen": -0.22906668484210968, "logits/rejected": -0.30051860213279724, "logps/chosen": -947.1951904296875, "logps/rejected": -1786.0419921875, "loss": 0.1972, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.5512313842773438, "rewards/margins": 3.6224727630615234, "rewards/rejected": -4.173704147338867, "step": 130 }, { "epoch": 0.43, "grad_norm": 5.442047168951181, "learning_rate": 3.5094394120160047e-06, "logits/chosen": -0.2941485047340393, "logits/rejected": -0.324366956949234, "logps/chosen": -1009.9542236328125, "logps/rejected": -1759.7135009765625, "loss": 0.2106, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.6786917448043823, "rewards/margins": 3.91167950630188, "rewards/rejected": -4.590371608734131, "step": 140 }, { "epoch": 0.46, "grad_norm": 4.215335437797857, "learning_rate": 3.257125189744877e-06, "logits/chosen": -0.32115620374679565, "logits/rejected": -0.36864355206489563, "logps/chosen": -954.0335693359375, "logps/rejected": -1671.050537109375, "loss": 0.1917, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.24637527763843536, "rewards/margins": 3.504626750946045, "rewards/rejected": -3.751002073287964, "step": 150 }, { "epoch": 0.49, "grad_norm": 2.836600464080731, "learning_rate": 2.9959952104467247e-06, "logits/chosen": -0.3462420105934143, "logits/rejected": -0.37651991844177246, "logps/chosen": -1160.772216796875, "logps/rejected": -1859.069091796875, "loss": 0.1688, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.4739566743373871, "rewards/margins": 4.305468559265137, "rewards/rejected": -4.779424667358398, "step": 160 }, { "epoch": 0.52, "grad_norm": 7.01626911931463, "learning_rate": 2.729089999626637e-06, "logits/chosen": -0.323803573846817, "logits/rejected": -0.38482701778411865, "logps/chosen": -950.5234375, "logps/rejected": -1763.620849609375, "loss": 0.1733, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.5791029930114746, "rewards/margins": 4.474118709564209, "rewards/rejected": -5.053222179412842, "step": 170 }, { "epoch": 0.55, "grad_norm": 2.8279150301129055, "learning_rate": 2.4595173279937464e-06, "logits/chosen": -0.373486191034317, "logits/rejected": -0.42519837617874146, "logps/chosen": -935.4544677734375, "logps/rejected": -1869.3843994140625, "loss": 0.126, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.6261727809906006, "rewards/margins": 5.2783613204956055, "rewards/rejected": -5.904534339904785, "step": 180 }, { "epoch": 0.59, "grad_norm": 5.49355443422865, "learning_rate": 2.190416025435675e-06, "logits/chosen": -0.40133827924728394, "logits/rejected": -0.4126282334327698, "logps/chosen": -1012.4228515625, "logps/rejected": -1692.1015625, "loss": 0.1903, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.6535183191299438, "rewards/margins": 4.786801815032959, "rewards/rejected": -5.440320014953613, "step": 190 }, { "epoch": 0.62, "grad_norm": 3.3031942791417244, "learning_rate": 1.9249194333484567e-06, "logits/chosen": -0.32231295108795166, "logits/rejected": -0.42156219482421875, "logps/chosen": -821.6759643554688, "logps/rejected": -1748.581787109375, "loss": 0.1374, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.3118464946746826, "rewards/margins": 4.042534828186035, "rewards/rejected": -4.354381561279297, "step": 200 }, { "epoch": 0.62, "eval_logits/chosen": -0.2821931540966034, "eval_logits/rejected": -0.1753174513578415, "eval_logps/chosen": -550.3823852539062, "eval_logps/rejected": -1521.086181640625, "eval_loss": 0.23887068033218384, "eval_rewards/accuracies": 0.9038461446762085, "eval_rewards/chosen": -0.9231204390525818, "eval_rewards/margins": 7.142522811889648, "eval_rewards/rejected": -8.065644264221191, "eval_runtime": 41.3932, "eval_samples_per_second": 9.567, "eval_steps_per_second": 0.314, "step": 200 }, { "epoch": 0.65, "grad_norm": 5.527965146477785, "learning_rate": 1.6661189208729492e-06, "logits/chosen": -0.38927820324897766, "logits/rejected": -0.5214006304740906, "logps/chosen": -1015.1365356445312, "logps/rejected": -1966.6363525390625, "loss": 0.1308, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.6492040157318115, "rewards/margins": 5.716192722320557, "rewards/rejected": -6.365396022796631, "step": 210 }, { "epoch": 0.68, "grad_norm": 3.602893800956427, "learning_rate": 1.4170278898446176e-06, "logits/chosen": -0.4857853055000305, "logits/rejected": -0.529462993144989, "logps/chosen": -1030.688232421875, "logps/rejected": -1932.1002197265625, "loss": 0.1249, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.7277423143386841, "rewards/margins": 5.351422309875488, "rewards/rejected": -6.079164028167725, "step": 220 }, { "epoch": 0.71, "grad_norm": 4.7188705667925674, "learning_rate": 1.1805466875731277e-06, "logits/chosen": -0.49866923689842224, "logits/rejected": -0.6342719793319702, "logps/chosen": -1055.165771484375, "logps/rejected": -1956.5159912109375, "loss": 0.1241, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.6889677047729492, "rewards/margins": 5.961886405944824, "rewards/rejected": -6.650854587554932, "step": 230 }, { "epoch": 0.74, "grad_norm": 3.655545564406198, "learning_rate": 9.594288359976817e-07, "logits/chosen": -0.48687905073165894, "logits/rejected": -0.5934125185012817, "logps/chosen": -927.3426513671875, "logps/rejected": -1977.37890625, "loss": 0.1424, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.49007558822631836, "rewards/margins": 5.872694969177246, "rewards/rejected": -6.362771034240723, "step": 240 }, { "epoch": 0.77, "grad_norm": 3.6147182029800553, "learning_rate": 7.56248970436493e-07, "logits/chosen": -0.4635826647281647, "logits/rejected": -0.5487635135650635, "logps/chosen": -992.0515747070312, "logps/rejected": -1830.902587890625, "loss": 0.133, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.31789669394493103, "rewards/margins": 5.5414719581604, "rewards/rejected": -5.859368801116943, "step": 250 }, { "epoch": 0.8, "grad_norm": 7.0194030888446655, "learning_rate": 5.733728612427772e-07, "logits/chosen": -0.48970723152160645, "logits/rejected": -0.5219728350639343, "logps/chosen": -932.720703125, "logps/rejected": -1843.7044677734375, "loss": 0.1156, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.4680628776550293, "rewards/margins": 5.688261032104492, "rewards/rejected": -6.1563239097595215, "step": 260 }, { "epoch": 0.83, "grad_norm": 3.4254663130723073, "learning_rate": 4.129298674268226e-07, "logits/chosen": -0.47387346625328064, "logits/rejected": -0.5744868516921997, "logps/chosen": -919.7806396484375, "logps/rejected": -2094.39404296875, "loss": 0.1271, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.5868527293205261, "rewards/margins": 6.565216064453125, "rewards/rejected": -7.152068138122559, "step": 270 }, { "epoch": 0.86, "grad_norm": 2.915035180237974, "learning_rate": 2.7678814298657735e-07, "logits/chosen": -0.48424941301345825, "logits/rejected": -0.5429133176803589, "logps/chosen": -989.7317504882812, "logps/rejected": -2131.172607421875, "loss": 0.112, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.640870213508606, "rewards/margins": 6.7747802734375, "rewards/rejected": -7.415650844573975, "step": 280 }, { "epoch": 0.89, "grad_norm": 4.035709496896224, "learning_rate": 1.6653288463741064e-07, "logits/chosen": -0.5001234412193298, "logits/rejected": -0.5271893739700317, "logps/chosen": -983.5861206054688, "logps/rejected": -2070.129150390625, "loss": 0.119, "rewards/accuracies": 0.96875, "rewards/chosen": -0.6213029623031616, "rewards/margins": 7.315102577209473, "rewards/rejected": -7.936405181884766, "step": 290 }, { "epoch": 0.92, "grad_norm": 3.63054147536607, "learning_rate": 8.344787421847216e-08, "logits/chosen": -0.4631820619106293, "logits/rejected": -0.5495749711990356, "logps/chosen": -926.2537231445312, "logps/rejected": -1872.173095703125, "loss": 0.0982, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.8281421661376953, "rewards/margins": 5.6122145652771, "rewards/rejected": -6.440356254577637, "step": 300 }, { "epoch": 0.92, "eval_logits/chosen": -0.3568806052207947, "eval_logits/rejected": -0.21113936603069305, "eval_logps/chosen": -567.682861328125, "eval_logps/rejected": -2033.01416015625, "eval_loss": 0.24133986234664917, "eval_rewards/accuracies": 0.8942307829856873, "eval_rewards/chosen": -1.0961254835128784, "eval_rewards/margins": 12.088796615600586, "eval_rewards/rejected": -13.184922218322754, "eval_runtime": 41.398, "eval_samples_per_second": 9.566, "eval_steps_per_second": 0.314, "step": 300 }, { "epoch": 0.96, "grad_norm": 7.390559743925079, "learning_rate": 2.850053069080344e-08, "logits/chosen": -0.4385458827018738, "logits/rejected": -0.5586596131324768, "logps/chosen": -981.732421875, "logps/rejected": -2004.222900390625, "loss": 0.1192, "rewards/accuracies": 0.96875, "rewards/chosen": -0.7575939893722534, "rewards/margins": 6.195023536682129, "rewards/rejected": -6.952617645263672, "step": 310 }, { "epoch": 0.99, "grad_norm": 7.4456834695369585, "learning_rate": 2.330645777598173e-09, "logits/chosen": -0.5341562032699585, "logits/rejected": -0.5692285299301147, "logps/chosen": -944.4517822265625, "logps/rejected": -1914.8863525390625, "loss": 0.1173, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.5554883480072021, "rewards/margins": 6.5892229080200195, "rewards/rejected": -7.144711494445801, "step": 320 }, { "epoch": 1.0, "step": 324, "total_flos": 0.0, "train_loss": 0.23018195102980107, "train_runtime": 4792.5755, "train_samples_per_second": 4.331, "train_steps_per_second": 0.068 } ], "logging_steps": 10, "max_steps": 324, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }