{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.7543859649122808, "eval_steps": 100, "global_step": 2500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.007017543859649123, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 1.3034, "step": 10 }, { "epoch": 0.014035087719298246, "grad_norm": 11.65329647064209, "learning_rate": 9.999987849060752e-07, "loss": 1.3006, "step": 20 }, { "epoch": 0.021052631578947368, "grad_norm": 8.014320373535156, "learning_rate": 9.999632438442366e-07, "loss": 1.233, "step": 30 }, { "epoch": 0.028070175438596492, "grad_norm": 7.890571594238281, "learning_rate": 9.998660418225644e-07, "loss": 1.1962, "step": 40 }, { "epoch": 0.03508771929824561, "grad_norm": 7.12827205657959, "learning_rate": 9.997081019722536e-07, "loss": 1.2213, "step": 50 }, { "epoch": 0.042105263157894736, "grad_norm": 7.200845718383789, "learning_rate": 9.99489443484293e-07, "loss": 1.1679, "step": 60 }, { "epoch": 0.04912280701754386, "grad_norm": 7.650635242462158, "learning_rate": 9.992100929274846e-07, "loss": 1.1699, "step": 70 }, { "epoch": 0.056140350877192984, "grad_norm": 7.227153778076172, "learning_rate": 9.988700842452145e-07, "loss": 1.1207, "step": 80 }, { "epoch": 0.06315789473684211, "grad_norm": 7.5115532875061035, "learning_rate": 9.984694587513297e-07, "loss": 1.1387, "step": 90 }, { "epoch": 0.07017543859649122, "grad_norm": 7.4819512367248535, "learning_rate": 9.980082651251174e-07, "loss": 1.1544, "step": 100 }, { "epoch": 0.07017543859649122, "eval_loss": 1.1328155994415283, "eval_runtime": 27.6835, "eval_samples_per_second": 173.388, "eval_steps_per_second": 2.709, "step": 100 }, { "epoch": 0.07719298245614035, "grad_norm": 7.3147759437561035, "learning_rate": 9.9748655940539e-07, "loss": 1.1726, "step": 110 }, { "epoch": 0.08421052631578947, "grad_norm": 7.672832489013672, "learning_rate": 9.969044049836765e-07, "loss": 1.115, "step": 120 }, { "epoch": 0.0912280701754386, "grad_norm": 7.895420551300049, "learning_rate": 9.962618725965194e-07, "loss": 1.1274, "step": 130 }, { "epoch": 0.09824561403508772, "grad_norm": 7.362156867980957, "learning_rate": 9.955590403168798e-07, "loss": 1.1401, "step": 140 }, { "epoch": 0.10526315789473684, "grad_norm": 7.586355209350586, "learning_rate": 9.947959935446506e-07, "loss": 1.1543, "step": 150 }, { "epoch": 0.11228070175438597, "grad_norm": 7.309718132019043, "learning_rate": 9.939728249962806e-07, "loss": 1.115, "step": 160 }, { "epoch": 0.11929824561403508, "grad_norm": 7.269148826599121, "learning_rate": 9.930896346935075e-07, "loss": 1.0933, "step": 170 }, { "epoch": 0.12631578947368421, "grad_norm": 7.365452766418457, "learning_rate": 9.921465299512052e-07, "loss": 1.0965, "step": 180 }, { "epoch": 0.13333333333333333, "grad_norm": 7.434603214263916, "learning_rate": 9.911436253643443e-07, "loss": 1.0972, "step": 190 }, { "epoch": 0.14035087719298245, "grad_norm": 7.557833194732666, "learning_rate": 9.900810427940668e-07, "loss": 1.1182, "step": 200 }, { "epoch": 0.14035087719298245, "eval_loss": 1.1001578569412231, "eval_runtime": 27.6607, "eval_samples_per_second": 173.531, "eval_steps_per_second": 2.711, "step": 200 }, { "epoch": 0.14736842105263157, "grad_norm": 7.197221279144287, "learning_rate": 9.889589113528808e-07, "loss": 1.0991, "step": 210 }, { "epoch": 0.1543859649122807, "grad_norm": 7.870287895202637, "learning_rate": 9.8777736738897e-07, "loss": 1.1135, "step": 220 }, { "epoch": 0.16140350877192983, "grad_norm": 7.257969379425049, "learning_rate": 9.865365544696286e-07, "loss": 1.1207, "step": 230 }, { "epoch": 0.16842105263157894, "grad_norm": 7.788718223571777, "learning_rate": 9.852366233638143e-07, "loss": 1.1084, "step": 240 }, { "epoch": 0.17543859649122806, "grad_norm": 7.723772048950195, "learning_rate": 9.838777320238312e-07, "loss": 1.0881, "step": 250 }, { "epoch": 0.1824561403508772, "grad_norm": 6.814189434051514, "learning_rate": 9.824600455661351e-07, "loss": 1.1118, "step": 260 }, { "epoch": 0.18947368421052632, "grad_norm": 7.434762477874756, "learning_rate": 9.809837362512718e-07, "loss": 1.0948, "step": 270 }, { "epoch": 0.19649122807017544, "grad_norm": 7.205653190612793, "learning_rate": 9.794489834629454e-07, "loss": 1.0837, "step": 280 }, { "epoch": 0.20350877192982456, "grad_norm": 7.118565559387207, "learning_rate": 9.77855973686222e-07, "loss": 1.092, "step": 290 }, { "epoch": 0.21052631578947367, "grad_norm": 7.293910503387451, "learning_rate": 9.762049004848705e-07, "loss": 1.1015, "step": 300 }, { "epoch": 0.21052631578947367, "eval_loss": 1.0845627784729004, "eval_runtime": 27.672, "eval_samples_per_second": 173.461, "eval_steps_per_second": 2.71, "step": 300 }, { "epoch": 0.21754385964912282, "grad_norm": 7.512034893035889, "learning_rate": 9.744959644778421e-07, "loss": 1.0836, "step": 310 }, { "epoch": 0.22456140350877193, "grad_norm": 7.277877330780029, "learning_rate": 9.727293733148942e-07, "loss": 1.0717, "step": 320 }, { "epoch": 0.23157894736842105, "grad_norm": 7.781631946563721, "learning_rate": 9.709053416513591e-07, "loss": 1.0391, "step": 330 }, { "epoch": 0.23859649122807017, "grad_norm": 7.217984199523926, "learning_rate": 9.690240911220617e-07, "loss": 1.1131, "step": 340 }, { "epoch": 0.24561403508771928, "grad_norm": 7.256911277770996, "learning_rate": 9.67085850314389e-07, "loss": 1.0628, "step": 350 }, { "epoch": 0.25263157894736843, "grad_norm": 7.053469657897949, "learning_rate": 9.650908547405143e-07, "loss": 1.0583, "step": 360 }, { "epoch": 0.2596491228070175, "grad_norm": 7.0806498527526855, "learning_rate": 9.630393468087817e-07, "loss": 1.0714, "step": 370 }, { "epoch": 0.26666666666666666, "grad_norm": 7.368037223815918, "learning_rate": 9.609315757942502e-07, "loss": 1.0629, "step": 380 }, { "epoch": 0.2736842105263158, "grad_norm": 7.083371639251709, "learning_rate": 9.58767797808406e-07, "loss": 1.0748, "step": 390 }, { "epoch": 0.2807017543859649, "grad_norm": 7.305485248565674, "learning_rate": 9.565482757680414e-07, "loss": 1.0736, "step": 400 }, { "epoch": 0.2807017543859649, "eval_loss": 1.072194218635559, "eval_runtime": 27.6671, "eval_samples_per_second": 173.491, "eval_steps_per_second": 2.711, "step": 400 }, { "epoch": 0.28771929824561404, "grad_norm": 7.741823196411133, "learning_rate": 9.542732793633097e-07, "loss": 1.0913, "step": 410 }, { "epoch": 0.29473684210526313, "grad_norm": 6.781225204467773, "learning_rate": 9.519430850249549e-07, "loss": 1.0826, "step": 420 }, { "epoch": 0.3017543859649123, "grad_norm": 6.993170738220215, "learning_rate": 9.495579758907229e-07, "loss": 1.0472, "step": 430 }, { "epoch": 0.3087719298245614, "grad_norm": 6.528597831726074, "learning_rate": 9.471182417709586e-07, "loss": 1.0795, "step": 440 }, { "epoch": 0.3157894736842105, "grad_norm": 7.972232341766357, "learning_rate": 9.446241791133907e-07, "loss": 1.0656, "step": 450 }, { "epoch": 0.32280701754385965, "grad_norm": 6.81664514541626, "learning_rate": 9.420760909671118e-07, "loss": 1.0888, "step": 460 }, { "epoch": 0.3298245614035088, "grad_norm": 6.822625160217285, "learning_rate": 9.394742869457546e-07, "loss": 1.0448, "step": 470 }, { "epoch": 0.3368421052631579, "grad_norm": 7.689866065979004, "learning_rate": 9.368190831898723e-07, "loss": 1.0705, "step": 480 }, { "epoch": 0.34385964912280703, "grad_norm": 6.757457256317139, "learning_rate": 9.341108023285237e-07, "loss": 1.0321, "step": 490 }, { "epoch": 0.3508771929824561, "grad_norm": 9.012947082519531, "learning_rate": 9.313497734400721e-07, "loss": 1.0783, "step": 500 }, { "epoch": 0.3508771929824561, "eval_loss": 1.060664415359497, "eval_runtime": 27.6699, "eval_samples_per_second": 173.474, "eval_steps_per_second": 2.711, "step": 500 }, { "epoch": 0.35789473684210527, "grad_norm": 6.598055362701416, "learning_rate": 9.28536332012199e-07, "loss": 1.0526, "step": 510 }, { "epoch": 0.3649122807017544, "grad_norm": 6.9514360427856445, "learning_rate": 9.2567081990114e-07, "loss": 1.055, "step": 520 }, { "epoch": 0.3719298245614035, "grad_norm": 7.644222259521484, "learning_rate": 9.227535852901462e-07, "loss": 1.0546, "step": 530 }, { "epoch": 0.37894736842105264, "grad_norm": 6.849003314971924, "learning_rate": 9.197849826471773e-07, "loss": 1.0819, "step": 540 }, { "epoch": 0.38596491228070173, "grad_norm": 7.057733535766602, "learning_rate": 9.167653726818304e-07, "loss": 1.0708, "step": 550 }, { "epoch": 0.3929824561403509, "grad_norm": 6.9738287925720215, "learning_rate": 9.136951223015112e-07, "loss": 1.0751, "step": 560 }, { "epoch": 0.4, "grad_norm": 7.2269511222839355, "learning_rate": 9.10574604566852e-07, "loss": 1.0437, "step": 570 }, { "epoch": 0.4070175438596491, "grad_norm": 7.4513654708862305, "learning_rate": 9.074041986463808e-07, "loss": 1.0553, "step": 580 }, { "epoch": 0.41403508771929826, "grad_norm": 7.455415725708008, "learning_rate": 9.041842897704501e-07, "loss": 1.0671, "step": 590 }, { "epoch": 0.42105263157894735, "grad_norm": 7.012011528015137, "learning_rate": 9.009152691844284e-07, "loss": 1.0663, "step": 600 }, { "epoch": 0.42105263157894735, "eval_loss": 1.051626205444336, "eval_runtime": 27.657, "eval_samples_per_second": 173.555, "eval_steps_per_second": 2.712, "step": 600 }, { "epoch": 0.4280701754385965, "grad_norm": 6.606391429901123, "learning_rate": 8.975975341011595e-07, "loss": 1.0385, "step": 610 }, { "epoch": 0.43508771929824563, "grad_norm": 7.090952396392822, "learning_rate": 8.942314876526991e-07, "loss": 1.0438, "step": 620 }, { "epoch": 0.4421052631578947, "grad_norm": 7.45530891418457, "learning_rate": 8.908175388413303e-07, "loss": 1.0519, "step": 630 }, { "epoch": 0.44912280701754387, "grad_norm": 7.6413960456848145, "learning_rate": 8.873561024898667e-07, "loss": 1.0705, "step": 640 }, { "epoch": 0.45614035087719296, "grad_norm": 7.025049209594727, "learning_rate": 8.838475991912481e-07, "loss": 1.0548, "step": 650 }, { "epoch": 0.4631578947368421, "grad_norm": 7.06046724319458, "learning_rate": 8.802924552574345e-07, "loss": 1.0465, "step": 660 }, { "epoch": 0.47017543859649125, "grad_norm": 7.351295471191406, "learning_rate": 8.766911026676063e-07, "loss": 1.0575, "step": 670 }, { "epoch": 0.47719298245614034, "grad_norm": 7.417140960693359, "learning_rate": 8.730439790156751e-07, "loss": 1.0686, "step": 680 }, { "epoch": 0.4842105263157895, "grad_norm": 7.903563499450684, "learning_rate": 8.693515274571121e-07, "loss": 1.0776, "step": 690 }, { "epoch": 0.49122807017543857, "grad_norm": 8.01221752166748, "learning_rate": 8.656141966551018e-07, "loss": 1.0621, "step": 700 }, { "epoch": 0.49122807017543857, "eval_loss": 1.043724775314331, "eval_runtime": 27.6712, "eval_samples_per_second": 173.466, "eval_steps_per_second": 2.71, "step": 700 }, { "epoch": 0.4982456140350877, "grad_norm": 7.052249431610107, "learning_rate": 8.618324407260249e-07, "loss": 1.0738, "step": 710 }, { "epoch": 0.5052631578947369, "grad_norm": 7.37591028213501, "learning_rate": 8.5800671918428e-07, "loss": 1.0607, "step": 720 }, { "epoch": 0.512280701754386, "grad_norm": 7.373082160949707, "learning_rate": 8.541374968864485e-07, "loss": 1.0602, "step": 730 }, { "epoch": 0.519298245614035, "grad_norm": 7.446669101715088, "learning_rate": 8.502252439748112e-07, "loss": 1.0462, "step": 740 }, { "epoch": 0.5263157894736842, "grad_norm": 6.634714603424072, "learning_rate": 8.462704358202216e-07, "loss": 1.0308, "step": 750 }, { "epoch": 0.5333333333333333, "grad_norm": 6.623584270477295, "learning_rate": 8.422735529643443e-07, "loss": 1.0462, "step": 760 }, { "epoch": 0.5403508771929825, "grad_norm": 7.110071659088135, "learning_rate": 8.382350810612663e-07, "loss": 1.0739, "step": 770 }, { "epoch": 0.5473684210526316, "grad_norm": 7.406259536743164, "learning_rate": 8.341555108184849e-07, "loss": 1.069, "step": 780 }, { "epoch": 0.5543859649122806, "grad_norm": 7.356163024902344, "learning_rate": 8.300353379372833e-07, "loss": 1.0542, "step": 790 }, { "epoch": 0.5614035087719298, "grad_norm": 7.522149562835693, "learning_rate": 8.258750630524983e-07, "loss": 1.0482, "step": 800 }, { "epoch": 0.5614035087719298, "eval_loss": 1.0357595682144165, "eval_runtime": 27.6785, "eval_samples_per_second": 173.42, "eval_steps_per_second": 2.71, "step": 800 }, { "epoch": 0.5684210526315789, "grad_norm": 6.716446399688721, "learning_rate": 8.216751916716899e-07, "loss": 1.0459, "step": 810 }, { "epoch": 0.5754385964912281, "grad_norm": 7.719761371612549, "learning_rate": 8.174362341137176e-07, "loss": 1.0271, "step": 820 }, { "epoch": 0.5824561403508772, "grad_norm": 7.073091983795166, "learning_rate": 8.13158705446732e-07, "loss": 1.0483, "step": 830 }, { "epoch": 0.5894736842105263, "grad_norm": 6.979051113128662, "learning_rate": 8.088431254255898e-07, "loss": 1.0293, "step": 840 }, { "epoch": 0.5964912280701754, "grad_norm": 7.095376014709473, "learning_rate": 8.044900184287006e-07, "loss": 1.0387, "step": 850 }, { "epoch": 0.6035087719298246, "grad_norm": 7.155153274536133, "learning_rate": 8.000999133943092e-07, "loss": 1.0448, "step": 860 }, { "epoch": 0.6105263157894737, "grad_norm": 7.818843841552734, "learning_rate": 7.956733437562258e-07, "loss": 1.047, "step": 870 }, { "epoch": 0.6175438596491228, "grad_norm": 7.174437046051025, "learning_rate": 7.912108473790091e-07, "loss": 1.0293, "step": 880 }, { "epoch": 0.624561403508772, "grad_norm": 7.124237060546875, "learning_rate": 7.867129664926123e-07, "loss": 1.0535, "step": 890 }, { "epoch": 0.631578947368421, "grad_norm": 7.362142562866211, "learning_rate": 7.821802476264965e-07, "loss": 1.0662, "step": 900 }, { "epoch": 0.631578947368421, "eval_loss": 1.0292896032333374, "eval_runtime": 27.6513, "eval_samples_per_second": 173.59, "eval_steps_per_second": 2.712, "step": 900 }, { "epoch": 0.6385964912280702, "grad_norm": 6.185942649841309, "learning_rate": 7.776132415432232e-07, "loss": 1.0311, "step": 910 }, { "epoch": 0.6456140350877193, "grad_norm": 7.229496955871582, "learning_rate": 7.73012503171533e-07, "loss": 1.0478, "step": 920 }, { "epoch": 0.6526315789473685, "grad_norm": 6.964082717895508, "learning_rate": 7.683785915389162e-07, "loss": 1.0355, "step": 930 }, { "epoch": 0.6596491228070176, "grad_norm": 7.6486077308654785, "learning_rate": 7.637120697036865e-07, "loss": 1.0078, "step": 940 }, { "epoch": 0.6666666666666666, "grad_norm": 7.581448554992676, "learning_rate": 7.590135046865651e-07, "loss": 1.0352, "step": 950 }, { "epoch": 0.6736842105263158, "grad_norm": 6.977712154388428, "learning_rate": 7.542834674017831e-07, "loss": 1.0352, "step": 960 }, { "epoch": 0.6807017543859649, "grad_norm": 7.210628986358643, "learning_rate": 7.495225325877103e-07, "loss": 1.0351, "step": 970 }, { "epoch": 0.6877192982456141, "grad_norm": 6.860006809234619, "learning_rate": 7.447312787370202e-07, "loss": 1.0244, "step": 980 }, { "epoch": 0.6947368421052632, "grad_norm": 7.080367088317871, "learning_rate": 7.399102880263983e-07, "loss": 1.0451, "step": 990 }, { "epoch": 0.7017543859649122, "grad_norm": 7.036980152130127, "learning_rate": 7.350601462458024e-07, "loss": 1.0727, "step": 1000 }, { "epoch": 0.7017543859649122, "eval_loss": 1.022666096687317, "eval_runtime": 27.6533, "eval_samples_per_second": 173.578, "eval_steps_per_second": 2.712, "step": 1000 }, { "epoch": 0.7087719298245614, "grad_norm": 6.67840576171875, "learning_rate": 7.301814427272848e-07, "loss": 1.0636, "step": 1010 }, { "epoch": 0.7157894736842105, "grad_norm": 7.1095452308654785, "learning_rate": 7.252747702733839e-07, "loss": 1.0088, "step": 1020 }, { "epoch": 0.7228070175438597, "grad_norm": 7.100186347961426, "learning_rate": 7.203407250850928e-07, "loss": 1.0245, "step": 1030 }, { "epoch": 0.7298245614035088, "grad_norm": 6.765640735626221, "learning_rate": 7.158771761692464e-07, "loss": 1.0095, "step": 1040 }, { "epoch": 0.7368421052631579, "grad_norm": 6.90313720703125, "learning_rate": 7.108927771727661e-07, "loss": 1.0188, "step": 1050 }, { "epoch": 0.743859649122807, "grad_norm": 6.8065948486328125, "learning_rate": 7.058827529721525e-07, "loss": 1.0339, "step": 1060 }, { "epoch": 0.7508771929824561, "grad_norm": 6.624533653259277, "learning_rate": 7.008477123264847e-07, "loss": 1.0346, "step": 1070 }, { "epoch": 0.7578947368421053, "grad_norm": 7.218606472015381, "learning_rate": 6.957882670345458e-07, "loss": 1.0379, "step": 1080 }, { "epoch": 0.7649122807017544, "grad_norm": 7.127339839935303, "learning_rate": 6.90705031860483e-07, "loss": 1.0205, "step": 1090 }, { "epoch": 0.7719298245614035, "grad_norm": 6.587140083312988, "learning_rate": 6.855986244591103e-07, "loss": 1.0263, "step": 1100 }, { "epoch": 0.7719298245614035, "eval_loss": 1.0174767971038818, "eval_runtime": 27.6964, "eval_samples_per_second": 173.308, "eval_steps_per_second": 2.708, "step": 1100 }, { "epoch": 0.7789473684210526, "grad_norm": 6.751448631286621, "learning_rate": 6.804696653008574e-07, "loss": 0.981, "step": 1110 }, { "epoch": 0.7859649122807018, "grad_norm": 7.036713600158691, "learning_rate": 6.753187775963772e-07, "loss": 1.0488, "step": 1120 }, { "epoch": 0.7929824561403509, "grad_norm": 6.959472179412842, "learning_rate": 6.701465872208216e-07, "loss": 1.0202, "step": 1130 }, { "epoch": 0.8, "grad_norm": 7.4908599853515625, "learning_rate": 6.649537226377914e-07, "loss": 1.0356, "step": 1140 }, { "epoch": 0.8070175438596491, "grad_norm": 8.565585136413574, "learning_rate": 6.597408148229741e-07, "loss": 1.0125, "step": 1150 }, { "epoch": 0.8140350877192982, "grad_norm": 7.0569167137146, "learning_rate": 6.545084971874736e-07, "loss": 1.0654, "step": 1160 }, { "epoch": 0.8210526315789474, "grad_norm": 6.795130252838135, "learning_rate": 6.492574055008473e-07, "loss": 1.046, "step": 1170 }, { "epoch": 0.8280701754385965, "grad_norm": 7.272831916809082, "learning_rate": 6.439881778138531e-07, "loss": 1.0238, "step": 1180 }, { "epoch": 0.8350877192982457, "grad_norm": 6.588538646697998, "learning_rate": 6.387014543809223e-07, "loss": 1.0155, "step": 1190 }, { "epoch": 0.8421052631578947, "grad_norm": 6.798887252807617, "learning_rate": 6.333978775823631e-07, "loss": 1.0187, "step": 1200 }, { "epoch": 0.8421052631578947, "eval_loss": 1.0141297578811646, "eval_runtime": 27.6602, "eval_samples_per_second": 173.534, "eval_steps_per_second": 2.711, "step": 1200 }, { "epoch": 0.8491228070175438, "grad_norm": 6.572112083435059, "learning_rate": 6.280780918463057e-07, "loss": 1.0355, "step": 1210 }, { "epoch": 0.856140350877193, "grad_norm": 7.28840970993042, "learning_rate": 6.227427435703995e-07, "loss": 1.0424, "step": 1220 }, { "epoch": 0.8631578947368421, "grad_norm": 8.068036079406738, "learning_rate": 6.173924810432704e-07, "loss": 1.0321, "step": 1230 }, { "epoch": 0.8701754385964913, "grad_norm": 6.726752281188965, "learning_rate": 6.12027954365748e-07, "loss": 1.0431, "step": 1240 }, { "epoch": 0.8771929824561403, "grad_norm": 6.742453098297119, "learning_rate": 6.066498153718734e-07, "loss": 1.0178, "step": 1250 }, { "epoch": 0.8842105263157894, "grad_norm": 6.598849296569824, "learning_rate": 6.01258717549696e-07, "loss": 1.0141, "step": 1260 }, { "epoch": 0.8912280701754386, "grad_norm": 6.771568775177002, "learning_rate": 5.958553159618692e-07, "loss": 0.9957, "step": 1270 }, { "epoch": 0.8982456140350877, "grad_norm": 7.0470380783081055, "learning_rate": 5.90440267166055e-07, "loss": 1.0387, "step": 1280 }, { "epoch": 0.9052631578947369, "grad_norm": 7.024428367614746, "learning_rate": 5.850142291351465e-07, "loss": 1.026, "step": 1290 }, { "epoch": 0.9122807017543859, "grad_norm": 7.074985027313232, "learning_rate": 5.795778611773197e-07, "loss": 1.0121, "step": 1300 }, { "epoch": 0.9122807017543859, "eval_loss": 1.0093048810958862, "eval_runtime": 27.6576, "eval_samples_per_second": 173.551, "eval_steps_per_second": 2.712, "step": 1300 }, { "epoch": 0.9192982456140351, "grad_norm": 7.012327194213867, "learning_rate": 5.741318238559209e-07, "loss": 1.0331, "step": 1310 }, { "epoch": 0.9263157894736842, "grad_norm": 6.710480690002441, "learning_rate": 5.686767789092041e-07, "loss": 1.012, "step": 1320 }, { "epoch": 0.9333333333333333, "grad_norm": 6.7387919425964355, "learning_rate": 5.632133891699231e-07, "loss": 0.9881, "step": 1330 }, { "epoch": 0.9403508771929825, "grad_norm": 6.965381145477295, "learning_rate": 5.577423184847931e-07, "loss": 1.0209, "step": 1340 }, { "epoch": 0.9473684210526315, "grad_norm": 7.125399589538574, "learning_rate": 5.522642316338268e-07, "loss": 1.0109, "step": 1350 }, { "epoch": 0.9543859649122807, "grad_norm": 7.273198127746582, "learning_rate": 5.467797942495589e-07, "loss": 1.0108, "step": 1360 }, { "epoch": 0.9614035087719298, "grad_norm": 6.802534580230713, "learning_rate": 5.412896727361662e-07, "loss": 1.025, "step": 1370 }, { "epoch": 0.968421052631579, "grad_norm": 7.282257080078125, "learning_rate": 5.357945341884935e-07, "loss": 1.0353, "step": 1380 }, { "epoch": 0.9754385964912281, "grad_norm": 6.752053260803223, "learning_rate": 5.302950463109969e-07, "loss": 1.0118, "step": 1390 }, { "epoch": 0.9824561403508771, "grad_norm": 6.847274303436279, "learning_rate": 5.247918773366111e-07, "loss": 1.0092, "step": 1400 }, { "epoch": 0.9824561403508771, "eval_loss": 1.003943681716919, "eval_runtime": 27.6644, "eval_samples_per_second": 173.508, "eval_steps_per_second": 2.711, "step": 1400 }, { "epoch": 0.9894736842105263, "grad_norm": 7.226211071014404, "learning_rate": 5.192856959455552e-07, "loss": 1.0278, "step": 1410 }, { "epoch": 0.9964912280701754, "grad_norm": 6.635247230529785, "learning_rate": 5.137771711840811e-07, "loss": 1.0163, "step": 1420 }, { "epoch": 1.0035087719298246, "grad_norm": 6.2100605964660645, "learning_rate": 5.082669723831793e-07, "loss": 0.928, "step": 1430 }, { "epoch": 1.0105263157894737, "grad_norm": 6.735259532928467, "learning_rate": 5.027557690772503e-07, "loss": 0.8903, "step": 1440 }, { "epoch": 1.0175438596491229, "grad_norm": 7.061236381530762, "learning_rate": 4.972442309227498e-07, "loss": 0.8721, "step": 1450 }, { "epoch": 1.024561403508772, "grad_norm": 6.729221820831299, "learning_rate": 4.917330276168208e-07, "loss": 0.8759, "step": 1460 }, { "epoch": 1.0315789473684212, "grad_norm": 6.925577640533447, "learning_rate": 4.86222828815919e-07, "loss": 0.866, "step": 1470 }, { "epoch": 1.03859649122807, "grad_norm": 6.847450256347656, "learning_rate": 4.807143040544446e-07, "loss": 0.8851, "step": 1480 }, { "epoch": 1.0456140350877192, "grad_norm": 7.24519157409668, "learning_rate": 4.752081226633888e-07, "loss": 0.8922, "step": 1490 }, { "epoch": 1.0526315789473684, "grad_norm": 6.8135085105896, "learning_rate": 4.697049536890033e-07, "loss": 0.8917, "step": 1500 }, { "epoch": 1.0526315789473684, "eval_loss": 1.0086382627487183, "eval_runtime": 27.6965, "eval_samples_per_second": 173.307, "eval_steps_per_second": 2.708, "step": 1500 }, { "epoch": 1.0596491228070175, "grad_norm": 6.774071216583252, "learning_rate": 4.6475522990138276e-07, "loss": 0.8773, "step": 1510 }, { "epoch": 1.0666666666666667, "grad_norm": 6.860315799713135, "learning_rate": 4.592596263646712e-07, "loss": 0.9042, "step": 1520 }, { "epoch": 1.0736842105263158, "grad_norm": 7.362914085388184, "learning_rate": 4.5376897311788825e-07, "loss": 0.8973, "step": 1530 }, { "epoch": 1.080701754385965, "grad_norm": 6.993128776550293, "learning_rate": 4.48283937320489e-07, "loss": 0.8533, "step": 1540 }, { "epoch": 1.087719298245614, "grad_norm": 7.575523853302002, "learning_rate": 4.4280518544936224e-07, "loss": 0.8896, "step": 1550 }, { "epoch": 1.0947368421052632, "grad_norm": 7.457510948181152, "learning_rate": 4.3733338321784777e-07, "loss": 0.873, "step": 1560 }, { "epoch": 1.1017543859649124, "grad_norm": 6.553786754608154, "learning_rate": 4.3186919549484777e-07, "loss": 0.8735, "step": 1570 }, { "epoch": 1.1087719298245613, "grad_norm": 7.161813259124756, "learning_rate": 4.264132862240387e-07, "loss": 0.8708, "step": 1580 }, { "epoch": 1.1157894736842104, "grad_norm": 7.342090129852295, "learning_rate": 4.2096631834319687e-07, "loss": 0.8627, "step": 1590 }, { "epoch": 1.1228070175438596, "grad_norm": 7.708263874053955, "learning_rate": 4.155289537036466e-07, "loss": 0.8916, "step": 1600 }, { "epoch": 1.1228070175438596, "eval_loss": 1.0080682039260864, "eval_runtime": 27.6601, "eval_samples_per_second": 173.535, "eval_steps_per_second": 2.711, "step": 1600 }, { "epoch": 1.1298245614035087, "grad_norm": 6.637975215911865, "learning_rate": 4.101018529898398e-07, "loss": 0.8598, "step": 1610 }, { "epoch": 1.1368421052631579, "grad_norm": 7.271252155303955, "learning_rate": 4.046856756390766e-07, "loss": 0.8632, "step": 1620 }, { "epoch": 1.143859649122807, "grad_norm": 6.89381742477417, "learning_rate": 3.99281079761379e-07, "loss": 0.8877, "step": 1630 }, { "epoch": 1.1508771929824562, "grad_norm": 7.032026290893555, "learning_rate": 3.938887220595252e-07, "loss": 0.879, "step": 1640 }, { "epoch": 1.1578947368421053, "grad_norm": 7.385174751281738, "learning_rate": 3.885092577492542e-07, "loss": 0.8893, "step": 1650 }, { "epoch": 1.1649122807017545, "grad_norm": 7.389017105102539, "learning_rate": 3.8314334047965207e-07, "loss": 0.8727, "step": 1660 }, { "epoch": 1.1719298245614036, "grad_norm": 6.653899192810059, "learning_rate": 3.7779162225372846e-07, "loss": 0.8941, "step": 1670 }, { "epoch": 1.1789473684210527, "grad_norm": 7.119126319885254, "learning_rate": 3.724547533491924e-07, "loss": 0.8676, "step": 1680 }, { "epoch": 1.1859649122807017, "grad_norm": 7.610691070556641, "learning_rate": 3.671333822394386e-07, "loss": 0.864, "step": 1690 }, { "epoch": 1.1929824561403508, "grad_norm": 6.851118564605713, "learning_rate": 3.6182815551475223e-07, "loss": 0.885, "step": 1700 }, { "epoch": 1.1929824561403508, "eval_loss": 1.0073468685150146, "eval_runtime": 27.66, "eval_samples_per_second": 173.536, "eval_steps_per_second": 2.712, "step": 1700 }, { "epoch": 1.2, "grad_norm": 7.08779764175415, "learning_rate": 3.565397178037429e-07, "loss": 0.875, "step": 1710 }, { "epoch": 1.207017543859649, "grad_norm": 6.938493728637695, "learning_rate": 3.5126871169501815e-07, "loss": 0.8823, "step": 1720 }, { "epoch": 1.2140350877192982, "grad_norm": 7.4112114906311035, "learning_rate": 3.4601577765910175e-07, "loss": 0.8428, "step": 1730 }, { "epoch": 1.2210526315789474, "grad_norm": 7.859072208404541, "learning_rate": 3.407815539706124e-07, "loss": 0.8659, "step": 1740 }, { "epoch": 1.2280701754385965, "grad_norm": 6.562801837921143, "learning_rate": 3.3556667663070835e-07, "loss": 0.8654, "step": 1750 }, { "epoch": 1.2350877192982457, "grad_norm": 7.658775806427002, "learning_rate": 3.303717792898073e-07, "loss": 0.8652, "step": 1760 }, { "epoch": 1.2421052631578948, "grad_norm": 7.275959491729736, "learning_rate": 3.2519749317059327e-07, "loss": 0.8957, "step": 1770 }, { "epoch": 1.2491228070175437, "grad_norm": 7.704782485961914, "learning_rate": 3.200444469913172e-07, "loss": 0.8737, "step": 1780 }, { "epoch": 1.256140350877193, "grad_norm": 7.395431995391846, "learning_rate": 3.1491326688940344e-07, "loss": 0.8542, "step": 1790 }, { "epoch": 1.263157894736842, "grad_norm": 6.88340425491333, "learning_rate": 3.0980457634536774e-07, "loss": 0.8843, "step": 1800 }, { "epoch": 1.263157894736842, "eval_loss": 1.0033657550811768, "eval_runtime": 27.6659, "eval_samples_per_second": 173.499, "eval_steps_per_second": 2.711, "step": 1800 }, { "epoch": 1.2701754385964912, "grad_norm": 6.7408766746521, "learning_rate": 3.0471899610706036e-07, "loss": 0.8331, "step": 1810 }, { "epoch": 1.2771929824561403, "grad_norm": 7.153403282165527, "learning_rate": 2.996571441142397e-07, "loss": 0.8465, "step": 1820 }, { "epoch": 1.2842105263157895, "grad_norm": 7.26017427444458, "learning_rate": 2.9461963542348733e-07, "loss": 0.8785, "step": 1830 }, { "epoch": 1.2912280701754386, "grad_norm": 7.271636486053467, "learning_rate": 2.896070821334736e-07, "loss": 0.8831, "step": 1840 }, { "epoch": 1.2982456140350878, "grad_norm": 6.8561201095581055, "learning_rate": 2.846200933105829e-07, "loss": 0.8578, "step": 1850 }, { "epoch": 1.305263157894737, "grad_norm": 7.387796878814697, "learning_rate": 2.7965927491490704e-07, "loss": 0.8439, "step": 1860 }, { "epoch": 1.312280701754386, "grad_norm": 7.401048183441162, "learning_rate": 2.747252297266162e-07, "loss": 0.8944, "step": 1870 }, { "epoch": 1.3192982456140352, "grad_norm": 7.2983527183532715, "learning_rate": 2.698185572727151e-07, "loss": 0.8689, "step": 1880 }, { "epoch": 1.3263157894736843, "grad_norm": 7.557769775390625, "learning_rate": 2.6493985375419775e-07, "loss": 0.885, "step": 1890 }, { "epoch": 1.3333333333333333, "grad_norm": 6.881629943847656, "learning_rate": 2.6008971197360175e-07, "loss": 0.8644, "step": 1900 }, { "epoch": 1.3333333333333333, "eval_loss": 1.0021144151687622, "eval_runtime": 27.6613, "eval_samples_per_second": 173.527, "eval_steps_per_second": 2.711, "step": 1900 }, { "epoch": 1.3403508771929824, "grad_norm": 7.333024978637695, "learning_rate": 2.5526872126297986e-07, "loss": 0.8912, "step": 1910 }, { "epoch": 1.3473684210526315, "grad_norm": 7.045767784118652, "learning_rate": 2.5047746741228977e-07, "loss": 0.8747, "step": 1920 }, { "epoch": 1.3543859649122807, "grad_norm": 7.227980613708496, "learning_rate": 2.457165325982169e-07, "loss": 0.8647, "step": 1930 }, { "epoch": 1.3614035087719298, "grad_norm": 7.303330898284912, "learning_rate": 2.4098649531343494e-07, "loss": 0.8657, "step": 1940 }, { "epoch": 1.368421052631579, "grad_norm": 7.276090621948242, "learning_rate": 2.362879302963135e-07, "loss": 0.8845, "step": 1950 }, { "epoch": 1.3754385964912281, "grad_norm": 7.321451663970947, "learning_rate": 2.3162140846108363e-07, "loss": 0.8487, "step": 1960 }, { "epoch": 1.3824561403508773, "grad_norm": 7.5262980461120605, "learning_rate": 2.2698749682846685e-07, "loss": 0.8762, "step": 1970 }, { "epoch": 1.3894736842105262, "grad_norm": 7.401157855987549, "learning_rate": 2.223867584567766e-07, "loss": 0.8748, "step": 1980 }, { "epoch": 1.3964912280701753, "grad_norm": 7.1058149337768555, "learning_rate": 2.1781975237350365e-07, "loss": 0.8641, "step": 1990 }, { "epoch": 1.4035087719298245, "grad_norm": 7.203502178192139, "learning_rate": 2.1328703350738765e-07, "loss": 0.8661, "step": 2000 }, { "epoch": 1.4035087719298245, "eval_loss": 1.000258445739746, "eval_runtime": 27.6622, "eval_samples_per_second": 173.522, "eval_steps_per_second": 2.711, "step": 2000 }, { "epoch": 1.4105263157894736, "grad_norm": 7.68574857711792, "learning_rate": 2.0878915262099096e-07, "loss": 0.8964, "step": 2010 }, { "epoch": 1.4175438596491228, "grad_norm": 7.339992523193359, "learning_rate": 2.0432665624377433e-07, "loss": 0.8779, "step": 2020 }, { "epoch": 1.424561403508772, "grad_norm": 7.711989879608154, "learning_rate": 1.999000866056908e-07, "loss": 0.8958, "step": 2030 }, { "epoch": 1.431578947368421, "grad_norm": 6.8218488693237305, "learning_rate": 1.9550998157129944e-07, "loss": 0.8848, "step": 2040 }, { "epoch": 1.4385964912280702, "grad_norm": 7.602545261383057, "learning_rate": 1.9115687457441022e-07, "loss": 0.8668, "step": 2050 }, { "epoch": 1.4456140350877194, "grad_norm": 7.199863433837891, "learning_rate": 1.8684129455326808e-07, "loss": 0.8705, "step": 2060 }, { "epoch": 1.4526315789473685, "grad_norm": 7.163413047790527, "learning_rate": 1.8256376588628235e-07, "loss": 0.8641, "step": 2070 }, { "epoch": 1.4596491228070176, "grad_norm": 7.178804397583008, "learning_rate": 1.7832480832830986e-07, "loss": 0.8526, "step": 2080 }, { "epoch": 1.4666666666666668, "grad_norm": 7.084789752960205, "learning_rate": 1.7412493694750173e-07, "loss": 0.8834, "step": 2090 }, { "epoch": 1.4736842105263157, "grad_norm": 7.647516250610352, "learning_rate": 1.6996466206271675e-07, "loss": 0.8712, "step": 2100 }, { "epoch": 1.4736842105263157, "eval_loss": 1.0002570152282715, "eval_runtime": 27.6725, "eval_samples_per_second": 173.457, "eval_steps_per_second": 2.71, "step": 2100 }, { "epoch": 1.4807017543859649, "grad_norm": 7.682786464691162, "learning_rate": 1.6584448918151518e-07, "loss": 0.8648, "step": 2110 }, { "epoch": 1.487719298245614, "grad_norm": 6.9408650398254395, "learning_rate": 1.6176491893873367e-07, "loss": 0.8775, "step": 2120 }, { "epoch": 1.4947368421052631, "grad_norm": 7.477031230926514, "learning_rate": 1.5772644703565564e-07, "loss": 0.8648, "step": 2130 }, { "epoch": 1.5017543859649123, "grad_norm": 7.054373741149902, "learning_rate": 1.537295641797785e-07, "loss": 0.8608, "step": 2140 }, { "epoch": 1.5087719298245614, "grad_norm": 6.98421049118042, "learning_rate": 1.4977475602518874e-07, "loss": 0.8653, "step": 2150 }, { "epoch": 1.5157894736842106, "grad_norm": 7.556164264678955, "learning_rate": 1.4586250311355132e-07, "loss": 0.8691, "step": 2160 }, { "epoch": 1.5228070175438595, "grad_norm": 7.721457004547119, "learning_rate": 1.4199328081572e-07, "loss": 0.8853, "step": 2170 }, { "epoch": 1.5298245614035086, "grad_norm": 7.5607428550720215, "learning_rate": 1.38167559273975e-07, "loss": 0.8647, "step": 2180 }, { "epoch": 1.5368421052631578, "grad_norm": 7.398414134979248, "learning_rate": 1.3438580334489818e-07, "loss": 0.8524, "step": 2190 }, { "epoch": 1.543859649122807, "grad_norm": 7.229887008666992, "learning_rate": 1.3064847254288796e-07, "loss": 0.8638, "step": 2200 }, { "epoch": 1.543859649122807, "eval_loss": 0.9979353547096252, "eval_runtime": 27.6809, "eval_samples_per_second": 173.405, "eval_steps_per_second": 2.709, "step": 2200 }, { "epoch": 1.550877192982456, "grad_norm": 7.479950428009033, "learning_rate": 1.26956020984325e-07, "loss": 0.8672, "step": 2210 }, { "epoch": 1.5578947368421052, "grad_norm": 7.526796340942383, "learning_rate": 1.2330889733239368e-07, "loss": 0.8882, "step": 2220 }, { "epoch": 1.5649122807017544, "grad_norm": 7.098681926727295, "learning_rate": 1.197075447425656e-07, "loss": 0.8564, "step": 2230 }, { "epoch": 1.5719298245614035, "grad_norm": 7.627535343170166, "learning_rate": 1.16152400808752e-07, "loss": 0.8778, "step": 2240 }, { "epoch": 1.5789473684210527, "grad_norm": 7.635378360748291, "learning_rate": 1.1264389751013325e-07, "loss": 0.8615, "step": 2250 }, { "epoch": 1.5859649122807018, "grad_norm": 7.256911754608154, "learning_rate": 1.0918246115866964e-07, "loss": 0.8828, "step": 2260 }, { "epoch": 1.592982456140351, "grad_norm": 7.054688453674316, "learning_rate": 1.0576851234730094e-07, "loss": 0.8602, "step": 2270 }, { "epoch": 1.6, "grad_norm": 7.2597479820251465, "learning_rate": 1.0240246589884045e-07, "loss": 0.8588, "step": 2280 }, { "epoch": 1.6070175438596492, "grad_norm": 7.462535381317139, "learning_rate": 9.90847308155715e-08, "loss": 0.8623, "step": 2290 }, { "epoch": 1.6140350877192984, "grad_norm": 7.354959487915039, "learning_rate": 9.581571022954987e-08, "loss": 0.8632, "step": 2300 }, { "epoch": 1.6140350877192984, "eval_loss": 0.9973437786102295, "eval_runtime": 27.6881, "eval_samples_per_second": 173.36, "eval_steps_per_second": 2.709, "step": 2300 }, { "epoch": 1.6210526315789475, "grad_norm": 7.283778667449951, "learning_rate": 9.259580135361927e-08, "loss": 0.8684, "step": 2310 }, { "epoch": 1.6280701754385964, "grad_norm": 7.570828914642334, "learning_rate": 8.942539543314798e-08, "loss": 0.8609, "step": 2320 }, { "epoch": 1.6350877192982456, "grad_norm": 7.366217613220215, "learning_rate": 8.630487769848876e-08, "loss": 0.8722, "step": 2330 }, { "epoch": 1.6421052631578947, "grad_norm": 7.667774200439453, "learning_rate": 8.32346273181696e-08, "loss": 0.8883, "step": 2340 }, { "epoch": 1.6491228070175439, "grad_norm": 8.111892700195312, "learning_rate": 8.021501735282266e-08, "loss": 0.8599, "step": 2350 }, { "epoch": 1.656140350877193, "grad_norm": 7.690216064453125, "learning_rate": 7.724641470985377e-08, "loss": 0.8951, "step": 2360 }, { "epoch": 1.663157894736842, "grad_norm": 7.080111980438232, "learning_rate": 7.432918009885996e-08, "loss": 0.865, "step": 2370 }, { "epoch": 1.670175438596491, "grad_norm": 7.580221176147461, "learning_rate": 7.146366798780096e-08, "loss": 0.8905, "step": 2380 }, { "epoch": 1.6771929824561402, "grad_norm": 6.910195827484131, "learning_rate": 6.865022655992798e-08, "loss": 0.8501, "step": 2390 }, { "epoch": 1.6842105263157894, "grad_norm": 7.176208972930908, "learning_rate": 6.588919767147638e-08, "loss": 0.8461, "step": 2400 }, { "epoch": 1.6842105263157894, "eval_loss": 0.9966626167297363, "eval_runtime": 27.668, "eval_samples_per_second": 173.486, "eval_steps_per_second": 2.711, "step": 2400 }, { "epoch": 1.6912280701754385, "grad_norm": 7.764338970184326, "learning_rate": 6.318091681012771e-08, "loss": 0.8711, "step": 2410 }, { "epoch": 1.6982456140350877, "grad_norm": 8.283316612243652, "learning_rate": 6.052571305424531e-08, "loss": 0.8738, "step": 2420 }, { "epoch": 1.7052631578947368, "grad_norm": 7.315950870513916, "learning_rate": 5.7923909032888295e-08, "loss": 0.8719, "step": 2430 }, { "epoch": 1.712280701754386, "grad_norm": 7.591914653778076, "learning_rate": 5.537582088660936e-08, "loss": 0.8708, "step": 2440 }, { "epoch": 1.719298245614035, "grad_norm": 7.378705978393555, "learning_rate": 5.2881758229041394e-08, "loss": 0.8722, "step": 2450 }, { "epoch": 1.7263157894736842, "grad_norm": 7.416294097900391, "learning_rate": 5.044202410927706e-08, "loss": 0.8586, "step": 2460 }, { "epoch": 1.7333333333333334, "grad_norm": 7.301969051361084, "learning_rate": 4.805691497504505e-08, "loss": 0.891, "step": 2470 }, { "epoch": 1.7403508771929825, "grad_norm": 6.946348190307617, "learning_rate": 4.5726720636690195e-08, "loss": 0.8871, "step": 2480 }, { "epoch": 1.7473684210526317, "grad_norm": 7.327394008636475, "learning_rate": 4.3451724231958645e-08, "loss": 0.8688, "step": 2490 }, { "epoch": 1.7543859649122808, "grad_norm": 7.17736291885376, "learning_rate": 4.123220219159418e-08, "loss": 0.8729, "step": 2500 }, { "epoch": 1.7543859649122808, "eval_loss": 0.9957481622695923, "eval_runtime": 27.665, "eval_samples_per_second": 173.504, "eval_steps_per_second": 2.711, "step": 2500 } ], "logging_steps": 10, "max_steps": 2850, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "total_flos": 6.354365204175782e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }