{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.7543859649122808,
  "eval_steps": 100,
  "global_step": 2500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.007017543859649123,
      "grad_norm": 0.0,
      "learning_rate": 1e-06,
      "loss": 1.3034,
      "step": 10
    },
    {
      "epoch": 0.014035087719298246,
      "grad_norm": 11.65329647064209,
      "learning_rate": 9.999987849060752e-07,
      "loss": 1.3006,
      "step": 20
    },
    {
      "epoch": 0.021052631578947368,
      "grad_norm": 8.014320373535156,
      "learning_rate": 9.999632438442366e-07,
      "loss": 1.233,
      "step": 30
    },
    {
      "epoch": 0.028070175438596492,
      "grad_norm": 7.890571594238281,
      "learning_rate": 9.998660418225644e-07,
      "loss": 1.1962,
      "step": 40
    },
    {
      "epoch": 0.03508771929824561,
      "grad_norm": 7.12827205657959,
      "learning_rate": 9.997081019722536e-07,
      "loss": 1.2213,
      "step": 50
    },
    {
      "epoch": 0.042105263157894736,
      "grad_norm": 7.200845718383789,
      "learning_rate": 9.99489443484293e-07,
      "loss": 1.1679,
      "step": 60
    },
    {
      "epoch": 0.04912280701754386,
      "grad_norm": 7.650635242462158,
      "learning_rate": 9.992100929274846e-07,
      "loss": 1.1699,
      "step": 70
    },
    {
      "epoch": 0.056140350877192984,
      "grad_norm": 7.227153778076172,
      "learning_rate": 9.988700842452145e-07,
      "loss": 1.1207,
      "step": 80
    },
    {
      "epoch": 0.06315789473684211,
      "grad_norm": 7.5115532875061035,
      "learning_rate": 9.984694587513297e-07,
      "loss": 1.1387,
      "step": 90
    },
    {
      "epoch": 0.07017543859649122,
      "grad_norm": 7.4819512367248535,
      "learning_rate": 9.980082651251174e-07,
      "loss": 1.1544,
      "step": 100
    },
    {
      "epoch": 0.07017543859649122,
      "eval_loss": 1.1328155994415283,
      "eval_runtime": 27.6835,
      "eval_samples_per_second": 173.388,
      "eval_steps_per_second": 2.709,
      "step": 100
    },
    {
      "epoch": 0.07719298245614035,
      "grad_norm": 7.3147759437561035,
      "learning_rate": 9.9748655940539e-07,
      "loss": 1.1726,
      "step": 110
    },
    {
      "epoch": 0.08421052631578947,
      "grad_norm": 7.672832489013672,
      "learning_rate": 9.969044049836765e-07,
      "loss": 1.115,
      "step": 120
    },
    {
      "epoch": 0.0912280701754386,
      "grad_norm": 7.895420551300049,
      "learning_rate": 9.962618725965194e-07,
      "loss": 1.1274,
      "step": 130
    },
    {
      "epoch": 0.09824561403508772,
      "grad_norm": 7.362156867980957,
      "learning_rate": 9.955590403168798e-07,
      "loss": 1.1401,
      "step": 140
    },
    {
      "epoch": 0.10526315789473684,
      "grad_norm": 7.586355209350586,
      "learning_rate": 9.947959935446506e-07,
      "loss": 1.1543,
      "step": 150
    },
    {
      "epoch": 0.11228070175438597,
      "grad_norm": 7.309718132019043,
      "learning_rate": 9.939728249962806e-07,
      "loss": 1.115,
      "step": 160
    },
    {
      "epoch": 0.11929824561403508,
      "grad_norm": 7.269148826599121,
      "learning_rate": 9.930896346935075e-07,
      "loss": 1.0933,
      "step": 170
    },
    {
      "epoch": 0.12631578947368421,
      "grad_norm": 7.365452766418457,
      "learning_rate": 9.921465299512052e-07,
      "loss": 1.0965,
      "step": 180
    },
    {
      "epoch": 0.13333333333333333,
      "grad_norm": 7.434603214263916,
      "learning_rate": 9.911436253643443e-07,
      "loss": 1.0972,
      "step": 190
    },
    {
      "epoch": 0.14035087719298245,
      "grad_norm": 7.557833194732666,
      "learning_rate": 9.900810427940668e-07,
      "loss": 1.1182,
      "step": 200
    },
    {
      "epoch": 0.14035087719298245,
      "eval_loss": 1.1001578569412231,
      "eval_runtime": 27.6607,
      "eval_samples_per_second": 173.531,
      "eval_steps_per_second": 2.711,
      "step": 200
    },
    {
      "epoch": 0.14736842105263157,
      "grad_norm": 7.197221279144287,
      "learning_rate": 9.889589113528808e-07,
      "loss": 1.0991,
      "step": 210
    },
    {
      "epoch": 0.1543859649122807,
      "grad_norm": 7.870287895202637,
      "learning_rate": 9.8777736738897e-07,
      "loss": 1.1135,
      "step": 220
    },
    {
      "epoch": 0.16140350877192983,
      "grad_norm": 7.257969379425049,
      "learning_rate": 9.865365544696286e-07,
      "loss": 1.1207,
      "step": 230
    },
    {
      "epoch": 0.16842105263157894,
      "grad_norm": 7.788718223571777,
      "learning_rate": 9.852366233638143e-07,
      "loss": 1.1084,
      "step": 240
    },
    {
      "epoch": 0.17543859649122806,
      "grad_norm": 7.723772048950195,
      "learning_rate": 9.838777320238312e-07,
      "loss": 1.0881,
      "step": 250
    },
    {
      "epoch": 0.1824561403508772,
      "grad_norm": 6.814189434051514,
      "learning_rate": 9.824600455661351e-07,
      "loss": 1.1118,
      "step": 260
    },
    {
      "epoch": 0.18947368421052632,
      "grad_norm": 7.434762477874756,
      "learning_rate": 9.809837362512718e-07,
      "loss": 1.0948,
      "step": 270
    },
    {
      "epoch": 0.19649122807017544,
      "grad_norm": 7.205653190612793,
      "learning_rate": 9.794489834629454e-07,
      "loss": 1.0837,
      "step": 280
    },
    {
      "epoch": 0.20350877192982456,
      "grad_norm": 7.118565559387207,
      "learning_rate": 9.77855973686222e-07,
      "loss": 1.092,
      "step": 290
    },
    {
      "epoch": 0.21052631578947367,
      "grad_norm": 7.293910503387451,
      "learning_rate": 9.762049004848705e-07,
      "loss": 1.1015,
      "step": 300
    },
    {
      "epoch": 0.21052631578947367,
      "eval_loss": 1.0845627784729004,
      "eval_runtime": 27.672,
      "eval_samples_per_second": 173.461,
      "eval_steps_per_second": 2.71,
      "step": 300
    },
    {
      "epoch": 0.21754385964912282,
      "grad_norm": 7.512034893035889,
      "learning_rate": 9.744959644778421e-07,
      "loss": 1.0836,
      "step": 310
    },
    {
      "epoch": 0.22456140350877193,
      "grad_norm": 7.277877330780029,
      "learning_rate": 9.727293733148942e-07,
      "loss": 1.0717,
      "step": 320
    },
    {
      "epoch": 0.23157894736842105,
      "grad_norm": 7.781631946563721,
      "learning_rate": 9.709053416513591e-07,
      "loss": 1.0391,
      "step": 330
    },
    {
      "epoch": 0.23859649122807017,
      "grad_norm": 7.217984199523926,
      "learning_rate": 9.690240911220617e-07,
      "loss": 1.1131,
      "step": 340
    },
    {
      "epoch": 0.24561403508771928,
      "grad_norm": 7.256911277770996,
      "learning_rate": 9.67085850314389e-07,
      "loss": 1.0628,
      "step": 350
    },
    {
      "epoch": 0.25263157894736843,
      "grad_norm": 7.053469657897949,
      "learning_rate": 9.650908547405143e-07,
      "loss": 1.0583,
      "step": 360
    },
    {
      "epoch": 0.2596491228070175,
      "grad_norm": 7.0806498527526855,
      "learning_rate": 9.630393468087817e-07,
      "loss": 1.0714,
      "step": 370
    },
    {
      "epoch": 0.26666666666666666,
      "grad_norm": 7.368037223815918,
      "learning_rate": 9.609315757942502e-07,
      "loss": 1.0629,
      "step": 380
    },
    {
      "epoch": 0.2736842105263158,
      "grad_norm": 7.083371639251709,
      "learning_rate": 9.58767797808406e-07,
      "loss": 1.0748,
      "step": 390
    },
    {
      "epoch": 0.2807017543859649,
      "grad_norm": 7.305485248565674,
      "learning_rate": 9.565482757680414e-07,
      "loss": 1.0736,
      "step": 400
    },
    {
      "epoch": 0.2807017543859649,
      "eval_loss": 1.072194218635559,
      "eval_runtime": 27.6671,
      "eval_samples_per_second": 173.491,
      "eval_steps_per_second": 2.711,
      "step": 400
    },
    {
      "epoch": 0.28771929824561404,
      "grad_norm": 7.741823196411133,
      "learning_rate": 9.542732793633097e-07,
      "loss": 1.0913,
      "step": 410
    },
    {
      "epoch": 0.29473684210526313,
      "grad_norm": 6.781225204467773,
      "learning_rate": 9.519430850249549e-07,
      "loss": 1.0826,
      "step": 420
    },
    {
      "epoch": 0.3017543859649123,
      "grad_norm": 6.993170738220215,
      "learning_rate": 9.495579758907229e-07,
      "loss": 1.0472,
      "step": 430
    },
    {
      "epoch": 0.3087719298245614,
      "grad_norm": 6.528597831726074,
      "learning_rate": 9.471182417709586e-07,
      "loss": 1.0795,
      "step": 440
    },
    {
      "epoch": 0.3157894736842105,
      "grad_norm": 7.972232341766357,
      "learning_rate": 9.446241791133907e-07,
      "loss": 1.0656,
      "step": 450
    },
    {
      "epoch": 0.32280701754385965,
      "grad_norm": 6.81664514541626,
      "learning_rate": 9.420760909671118e-07,
      "loss": 1.0888,
      "step": 460
    },
    {
      "epoch": 0.3298245614035088,
      "grad_norm": 6.822625160217285,
      "learning_rate": 9.394742869457546e-07,
      "loss": 1.0448,
      "step": 470
    },
    {
      "epoch": 0.3368421052631579,
      "grad_norm": 7.689866065979004,
      "learning_rate": 9.368190831898723e-07,
      "loss": 1.0705,
      "step": 480
    },
    {
      "epoch": 0.34385964912280703,
      "grad_norm": 6.757457256317139,
      "learning_rate": 9.341108023285237e-07,
      "loss": 1.0321,
      "step": 490
    },
    {
      "epoch": 0.3508771929824561,
      "grad_norm": 9.012947082519531,
      "learning_rate": 9.313497734400721e-07,
      "loss": 1.0783,
      "step": 500
    },
    {
      "epoch": 0.3508771929824561,
      "eval_loss": 1.060664415359497,
      "eval_runtime": 27.6699,
      "eval_samples_per_second": 173.474,
      "eval_steps_per_second": 2.711,
      "step": 500
    },
    {
      "epoch": 0.35789473684210527,
      "grad_norm": 6.598055362701416,
      "learning_rate": 9.28536332012199e-07,
      "loss": 1.0526,
      "step": 510
    },
    {
      "epoch": 0.3649122807017544,
      "grad_norm": 6.9514360427856445,
      "learning_rate": 9.2567081990114e-07,
      "loss": 1.055,
      "step": 520
    },
    {
      "epoch": 0.3719298245614035,
      "grad_norm": 7.644222259521484,
      "learning_rate": 9.227535852901462e-07,
      "loss": 1.0546,
      "step": 530
    },
    {
      "epoch": 0.37894736842105264,
      "grad_norm": 6.849003314971924,
      "learning_rate": 9.197849826471773e-07,
      "loss": 1.0819,
      "step": 540
    },
    {
      "epoch": 0.38596491228070173,
      "grad_norm": 7.057733535766602,
      "learning_rate": 9.167653726818304e-07,
      "loss": 1.0708,
      "step": 550
    },
    {
      "epoch": 0.3929824561403509,
      "grad_norm": 6.9738287925720215,
      "learning_rate": 9.136951223015112e-07,
      "loss": 1.0751,
      "step": 560
    },
    {
      "epoch": 0.4,
      "grad_norm": 7.2269511222839355,
      "learning_rate": 9.10574604566852e-07,
      "loss": 1.0437,
      "step": 570
    },
    {
      "epoch": 0.4070175438596491,
      "grad_norm": 7.4513654708862305,
      "learning_rate": 9.074041986463808e-07,
      "loss": 1.0553,
      "step": 580
    },
    {
      "epoch": 0.41403508771929826,
      "grad_norm": 7.455415725708008,
      "learning_rate": 9.041842897704501e-07,
      "loss": 1.0671,
      "step": 590
    },
    {
      "epoch": 0.42105263157894735,
      "grad_norm": 7.012011528015137,
      "learning_rate": 9.009152691844284e-07,
      "loss": 1.0663,
      "step": 600
    },
    {
      "epoch": 0.42105263157894735,
      "eval_loss": 1.051626205444336,
      "eval_runtime": 27.657,
      "eval_samples_per_second": 173.555,
      "eval_steps_per_second": 2.712,
      "step": 600
    },
    {
      "epoch": 0.4280701754385965,
      "grad_norm": 6.606391429901123,
      "learning_rate": 8.975975341011595e-07,
      "loss": 1.0385,
      "step": 610
    },
    {
      "epoch": 0.43508771929824563,
      "grad_norm": 7.090952396392822,
      "learning_rate": 8.942314876526991e-07,
      "loss": 1.0438,
      "step": 620
    },
    {
      "epoch": 0.4421052631578947,
      "grad_norm": 7.45530891418457,
      "learning_rate": 8.908175388413303e-07,
      "loss": 1.0519,
      "step": 630
    },
    {
      "epoch": 0.44912280701754387,
      "grad_norm": 7.6413960456848145,
      "learning_rate": 8.873561024898667e-07,
      "loss": 1.0705,
      "step": 640
    },
    {
      "epoch": 0.45614035087719296,
      "grad_norm": 7.025049209594727,
      "learning_rate": 8.838475991912481e-07,
      "loss": 1.0548,
      "step": 650
    },
    {
      "epoch": 0.4631578947368421,
      "grad_norm": 7.06046724319458,
      "learning_rate": 8.802924552574345e-07,
      "loss": 1.0465,
      "step": 660
    },
    {
      "epoch": 0.47017543859649125,
      "grad_norm": 7.351295471191406,
      "learning_rate": 8.766911026676063e-07,
      "loss": 1.0575,
      "step": 670
    },
    {
      "epoch": 0.47719298245614034,
      "grad_norm": 7.417140960693359,
      "learning_rate": 8.730439790156751e-07,
      "loss": 1.0686,
      "step": 680
    },
    {
      "epoch": 0.4842105263157895,
      "grad_norm": 7.903563499450684,
      "learning_rate": 8.693515274571121e-07,
      "loss": 1.0776,
      "step": 690
    },
    {
      "epoch": 0.49122807017543857,
      "grad_norm": 8.01221752166748,
      "learning_rate": 8.656141966551018e-07,
      "loss": 1.0621,
      "step": 700
    },
    {
      "epoch": 0.49122807017543857,
      "eval_loss": 1.043724775314331,
      "eval_runtime": 27.6712,
      "eval_samples_per_second": 173.466,
      "eval_steps_per_second": 2.71,
      "step": 700
    },
    {
      "epoch": 0.4982456140350877,
      "grad_norm": 7.052249431610107,
      "learning_rate": 8.618324407260249e-07,
      "loss": 1.0738,
      "step": 710
    },
    {
      "epoch": 0.5052631578947369,
      "grad_norm": 7.37591028213501,
      "learning_rate": 8.5800671918428e-07,
      "loss": 1.0607,
      "step": 720
    },
    {
      "epoch": 0.512280701754386,
      "grad_norm": 7.373082160949707,
      "learning_rate": 8.541374968864485e-07,
      "loss": 1.0602,
      "step": 730
    },
    {
      "epoch": 0.519298245614035,
      "grad_norm": 7.446669101715088,
      "learning_rate": 8.502252439748112e-07,
      "loss": 1.0462,
      "step": 740
    },
    {
      "epoch": 0.5263157894736842,
      "grad_norm": 6.634714603424072,
      "learning_rate": 8.462704358202216e-07,
      "loss": 1.0308,
      "step": 750
    },
    {
      "epoch": 0.5333333333333333,
      "grad_norm": 6.623584270477295,
      "learning_rate": 8.422735529643443e-07,
      "loss": 1.0462,
      "step": 760
    },
    {
      "epoch": 0.5403508771929825,
      "grad_norm": 7.110071659088135,
      "learning_rate": 8.382350810612663e-07,
      "loss": 1.0739,
      "step": 770
    },
    {
      "epoch": 0.5473684210526316,
      "grad_norm": 7.406259536743164,
      "learning_rate": 8.341555108184849e-07,
      "loss": 1.069,
      "step": 780
    },
    {
      "epoch": 0.5543859649122806,
      "grad_norm": 7.356163024902344,
      "learning_rate": 8.300353379372833e-07,
      "loss": 1.0542,
      "step": 790
    },
    {
      "epoch": 0.5614035087719298,
      "grad_norm": 7.522149562835693,
      "learning_rate": 8.258750630524983e-07,
      "loss": 1.0482,
      "step": 800
    },
    {
      "epoch": 0.5614035087719298,
      "eval_loss": 1.0357595682144165,
      "eval_runtime": 27.6785,
      "eval_samples_per_second": 173.42,
      "eval_steps_per_second": 2.71,
      "step": 800
    },
    {
      "epoch": 0.5684210526315789,
      "grad_norm": 6.716446399688721,
      "learning_rate": 8.216751916716899e-07,
      "loss": 1.0459,
      "step": 810
    },
    {
      "epoch": 0.5754385964912281,
      "grad_norm": 7.719761371612549,
      "learning_rate": 8.174362341137176e-07,
      "loss": 1.0271,
      "step": 820
    },
    {
      "epoch": 0.5824561403508772,
      "grad_norm": 7.073091983795166,
      "learning_rate": 8.13158705446732e-07,
      "loss": 1.0483,
      "step": 830
    },
    {
      "epoch": 0.5894736842105263,
      "grad_norm": 6.979051113128662,
      "learning_rate": 8.088431254255898e-07,
      "loss": 1.0293,
      "step": 840
    },
    {
      "epoch": 0.5964912280701754,
      "grad_norm": 7.095376014709473,
      "learning_rate": 8.044900184287006e-07,
      "loss": 1.0387,
      "step": 850
    },
    {
      "epoch": 0.6035087719298246,
      "grad_norm": 7.155153274536133,
      "learning_rate": 8.000999133943092e-07,
      "loss": 1.0448,
      "step": 860
    },
    {
      "epoch": 0.6105263157894737,
      "grad_norm": 7.818843841552734,
      "learning_rate": 7.956733437562258e-07,
      "loss": 1.047,
      "step": 870
    },
    {
      "epoch": 0.6175438596491228,
      "grad_norm": 7.174437046051025,
      "learning_rate": 7.912108473790091e-07,
      "loss": 1.0293,
      "step": 880
    },
    {
      "epoch": 0.624561403508772,
      "grad_norm": 7.124237060546875,
      "learning_rate": 7.867129664926123e-07,
      "loss": 1.0535,
      "step": 890
    },
    {
      "epoch": 0.631578947368421,
      "grad_norm": 7.362142562866211,
      "learning_rate": 7.821802476264965e-07,
      "loss": 1.0662,
      "step": 900
    },
    {
      "epoch": 0.631578947368421,
      "eval_loss": 1.0292896032333374,
      "eval_runtime": 27.6513,
      "eval_samples_per_second": 173.59,
      "eval_steps_per_second": 2.712,
      "step": 900
    },
    {
      "epoch": 0.6385964912280702,
      "grad_norm": 6.185942649841309,
      "learning_rate": 7.776132415432232e-07,
      "loss": 1.0311,
      "step": 910
    },
    {
      "epoch": 0.6456140350877193,
      "grad_norm": 7.229496955871582,
      "learning_rate": 7.73012503171533e-07,
      "loss": 1.0478,
      "step": 920
    },
    {
      "epoch": 0.6526315789473685,
      "grad_norm": 6.964082717895508,
      "learning_rate": 7.683785915389162e-07,
      "loss": 1.0355,
      "step": 930
    },
    {
      "epoch": 0.6596491228070176,
      "grad_norm": 7.6486077308654785,
      "learning_rate": 7.637120697036865e-07,
      "loss": 1.0078,
      "step": 940
    },
    {
      "epoch": 0.6666666666666666,
      "grad_norm": 7.581448554992676,
      "learning_rate": 7.590135046865651e-07,
      "loss": 1.0352,
      "step": 950
    },
    {
      "epoch": 0.6736842105263158,
      "grad_norm": 6.977712154388428,
      "learning_rate": 7.542834674017831e-07,
      "loss": 1.0352,
      "step": 960
    },
    {
      "epoch": 0.6807017543859649,
      "grad_norm": 7.210628986358643,
      "learning_rate": 7.495225325877103e-07,
      "loss": 1.0351,
      "step": 970
    },
    {
      "epoch": 0.6877192982456141,
      "grad_norm": 6.860006809234619,
      "learning_rate": 7.447312787370202e-07,
      "loss": 1.0244,
      "step": 980
    },
    {
      "epoch": 0.6947368421052632,
      "grad_norm": 7.080367088317871,
      "learning_rate": 7.399102880263983e-07,
      "loss": 1.0451,
      "step": 990
    },
    {
      "epoch": 0.7017543859649122,
      "grad_norm": 7.036980152130127,
      "learning_rate": 7.350601462458024e-07,
      "loss": 1.0727,
      "step": 1000
    },
    {
      "epoch": 0.7017543859649122,
      "eval_loss": 1.022666096687317,
      "eval_runtime": 27.6533,
      "eval_samples_per_second": 173.578,
      "eval_steps_per_second": 2.712,
      "step": 1000
    },
    {
      "epoch": 0.7087719298245614,
      "grad_norm": 6.67840576171875,
      "learning_rate": 7.301814427272848e-07,
      "loss": 1.0636,
      "step": 1010
    },
    {
      "epoch": 0.7157894736842105,
      "grad_norm": 7.1095452308654785,
      "learning_rate": 7.252747702733839e-07,
      "loss": 1.0088,
      "step": 1020
    },
    {
      "epoch": 0.7228070175438597,
      "grad_norm": 7.100186347961426,
      "learning_rate": 7.203407250850928e-07,
      "loss": 1.0245,
      "step": 1030
    },
    {
      "epoch": 0.7298245614035088,
      "grad_norm": 6.765640735626221,
      "learning_rate": 7.158771761692464e-07,
      "loss": 1.0095,
      "step": 1040
    },
    {
      "epoch": 0.7368421052631579,
      "grad_norm": 6.90313720703125,
      "learning_rate": 7.108927771727661e-07,
      "loss": 1.0188,
      "step": 1050
    },
    {
      "epoch": 0.743859649122807,
      "grad_norm": 6.8065948486328125,
      "learning_rate": 7.058827529721525e-07,
      "loss": 1.0339,
      "step": 1060
    },
    {
      "epoch": 0.7508771929824561,
      "grad_norm": 6.624533653259277,
      "learning_rate": 7.008477123264847e-07,
      "loss": 1.0346,
      "step": 1070
    },
    {
      "epoch": 0.7578947368421053,
      "grad_norm": 7.218606472015381,
      "learning_rate": 6.957882670345458e-07,
      "loss": 1.0379,
      "step": 1080
    },
    {
      "epoch": 0.7649122807017544,
      "grad_norm": 7.127339839935303,
      "learning_rate": 6.90705031860483e-07,
      "loss": 1.0205,
      "step": 1090
    },
    {
      "epoch": 0.7719298245614035,
      "grad_norm": 6.587140083312988,
      "learning_rate": 6.855986244591103e-07,
      "loss": 1.0263,
      "step": 1100
    },
    {
      "epoch": 0.7719298245614035,
      "eval_loss": 1.0174767971038818,
      "eval_runtime": 27.6964,
      "eval_samples_per_second": 173.308,
      "eval_steps_per_second": 2.708,
      "step": 1100
    },
    {
      "epoch": 0.7789473684210526,
      "grad_norm": 6.751448631286621,
      "learning_rate": 6.804696653008574e-07,
      "loss": 0.981,
      "step": 1110
    },
    {
      "epoch": 0.7859649122807018,
      "grad_norm": 7.036713600158691,
      "learning_rate": 6.753187775963772e-07,
      "loss": 1.0488,
      "step": 1120
    },
    {
      "epoch": 0.7929824561403509,
      "grad_norm": 6.959472179412842,
      "learning_rate": 6.701465872208216e-07,
      "loss": 1.0202,
      "step": 1130
    },
    {
      "epoch": 0.8,
      "grad_norm": 7.4908599853515625,
      "learning_rate": 6.649537226377914e-07,
      "loss": 1.0356,
      "step": 1140
    },
    {
      "epoch": 0.8070175438596491,
      "grad_norm": 8.565585136413574,
      "learning_rate": 6.597408148229741e-07,
      "loss": 1.0125,
      "step": 1150
    },
    {
      "epoch": 0.8140350877192982,
      "grad_norm": 7.0569167137146,
      "learning_rate": 6.545084971874736e-07,
      "loss": 1.0654,
      "step": 1160
    },
    {
      "epoch": 0.8210526315789474,
      "grad_norm": 6.795130252838135,
      "learning_rate": 6.492574055008473e-07,
      "loss": 1.046,
      "step": 1170
    },
    {
      "epoch": 0.8280701754385965,
      "grad_norm": 7.272831916809082,
      "learning_rate": 6.439881778138531e-07,
      "loss": 1.0238,
      "step": 1180
    },
    {
      "epoch": 0.8350877192982457,
      "grad_norm": 6.588538646697998,
      "learning_rate": 6.387014543809223e-07,
      "loss": 1.0155,
      "step": 1190
    },
    {
      "epoch": 0.8421052631578947,
      "grad_norm": 6.798887252807617,
      "learning_rate": 6.333978775823631e-07,
      "loss": 1.0187,
      "step": 1200
    },
    {
      "epoch": 0.8421052631578947,
      "eval_loss": 1.0141297578811646,
      "eval_runtime": 27.6602,
      "eval_samples_per_second": 173.534,
      "eval_steps_per_second": 2.711,
      "step": 1200
    },
    {
      "epoch": 0.8491228070175438,
      "grad_norm": 6.572112083435059,
      "learning_rate": 6.280780918463057e-07,
      "loss": 1.0355,
      "step": 1210
    },
    {
      "epoch": 0.856140350877193,
      "grad_norm": 7.28840970993042,
      "learning_rate": 6.227427435703995e-07,
      "loss": 1.0424,
      "step": 1220
    },
    {
      "epoch": 0.8631578947368421,
      "grad_norm": 8.068036079406738,
      "learning_rate": 6.173924810432704e-07,
      "loss": 1.0321,
      "step": 1230
    },
    {
      "epoch": 0.8701754385964913,
      "grad_norm": 6.726752281188965,
      "learning_rate": 6.12027954365748e-07,
      "loss": 1.0431,
      "step": 1240
    },
    {
      "epoch": 0.8771929824561403,
      "grad_norm": 6.742453098297119,
      "learning_rate": 6.066498153718734e-07,
      "loss": 1.0178,
      "step": 1250
    },
    {
      "epoch": 0.8842105263157894,
      "grad_norm": 6.598849296569824,
      "learning_rate": 6.01258717549696e-07,
      "loss": 1.0141,
      "step": 1260
    },
    {
      "epoch": 0.8912280701754386,
      "grad_norm": 6.771568775177002,
      "learning_rate": 5.958553159618692e-07,
      "loss": 0.9957,
      "step": 1270
    },
    {
      "epoch": 0.8982456140350877,
      "grad_norm": 7.0470380783081055,
      "learning_rate": 5.90440267166055e-07,
      "loss": 1.0387,
      "step": 1280
    },
    {
      "epoch": 0.9052631578947369,
      "grad_norm": 7.024428367614746,
      "learning_rate": 5.850142291351465e-07,
      "loss": 1.026,
      "step": 1290
    },
    {
      "epoch": 0.9122807017543859,
      "grad_norm": 7.074985027313232,
      "learning_rate": 5.795778611773197e-07,
      "loss": 1.0121,
      "step": 1300
    },
    {
      "epoch": 0.9122807017543859,
      "eval_loss": 1.0093048810958862,
      "eval_runtime": 27.6576,
      "eval_samples_per_second": 173.551,
      "eval_steps_per_second": 2.712,
      "step": 1300
    },
    {
      "epoch": 0.9192982456140351,
      "grad_norm": 7.012327194213867,
      "learning_rate": 5.741318238559209e-07,
      "loss": 1.0331,
      "step": 1310
    },
    {
      "epoch": 0.9263157894736842,
      "grad_norm": 6.710480690002441,
      "learning_rate": 5.686767789092041e-07,
      "loss": 1.012,
      "step": 1320
    },
    {
      "epoch": 0.9333333333333333,
      "grad_norm": 6.7387919425964355,
      "learning_rate": 5.632133891699231e-07,
      "loss": 0.9881,
      "step": 1330
    },
    {
      "epoch": 0.9403508771929825,
      "grad_norm": 6.965381145477295,
      "learning_rate": 5.577423184847931e-07,
      "loss": 1.0209,
      "step": 1340
    },
    {
      "epoch": 0.9473684210526315,
      "grad_norm": 7.125399589538574,
      "learning_rate": 5.522642316338268e-07,
      "loss": 1.0109,
      "step": 1350
    },
    {
      "epoch": 0.9543859649122807,
      "grad_norm": 7.273198127746582,
      "learning_rate": 5.467797942495589e-07,
      "loss": 1.0108,
      "step": 1360
    },
    {
      "epoch": 0.9614035087719298,
      "grad_norm": 6.802534580230713,
      "learning_rate": 5.412896727361662e-07,
      "loss": 1.025,
      "step": 1370
    },
    {
      "epoch": 0.968421052631579,
      "grad_norm": 7.282257080078125,
      "learning_rate": 5.357945341884935e-07,
      "loss": 1.0353,
      "step": 1380
    },
    {
      "epoch": 0.9754385964912281,
      "grad_norm": 6.752053260803223,
      "learning_rate": 5.302950463109969e-07,
      "loss": 1.0118,
      "step": 1390
    },
    {
      "epoch": 0.9824561403508771,
      "grad_norm": 6.847274303436279,
      "learning_rate": 5.247918773366111e-07,
      "loss": 1.0092,
      "step": 1400
    },
    {
      "epoch": 0.9824561403508771,
      "eval_loss": 1.003943681716919,
      "eval_runtime": 27.6644,
      "eval_samples_per_second": 173.508,
      "eval_steps_per_second": 2.711,
      "step": 1400
    },
    {
      "epoch": 0.9894736842105263,
      "grad_norm": 7.226211071014404,
      "learning_rate": 5.192856959455552e-07,
      "loss": 1.0278,
      "step": 1410
    },
    {
      "epoch": 0.9964912280701754,
      "grad_norm": 6.635247230529785,
      "learning_rate": 5.137771711840811e-07,
      "loss": 1.0163,
      "step": 1420
    },
    {
      "epoch": 1.0035087719298246,
      "grad_norm": 6.2100605964660645,
      "learning_rate": 5.082669723831793e-07,
      "loss": 0.928,
      "step": 1430
    },
    {
      "epoch": 1.0105263157894737,
      "grad_norm": 6.735259532928467,
      "learning_rate": 5.027557690772503e-07,
      "loss": 0.8903,
      "step": 1440
    },
    {
      "epoch": 1.0175438596491229,
      "grad_norm": 7.061236381530762,
      "learning_rate": 4.972442309227498e-07,
      "loss": 0.8721,
      "step": 1450
    },
    {
      "epoch": 1.024561403508772,
      "grad_norm": 6.729221820831299,
      "learning_rate": 4.917330276168208e-07,
      "loss": 0.8759,
      "step": 1460
    },
    {
      "epoch": 1.0315789473684212,
      "grad_norm": 6.925577640533447,
      "learning_rate": 4.86222828815919e-07,
      "loss": 0.866,
      "step": 1470
    },
    {
      "epoch": 1.03859649122807,
      "grad_norm": 6.847450256347656,
      "learning_rate": 4.807143040544446e-07,
      "loss": 0.8851,
      "step": 1480
    },
    {
      "epoch": 1.0456140350877192,
      "grad_norm": 7.24519157409668,
      "learning_rate": 4.752081226633888e-07,
      "loss": 0.8922,
      "step": 1490
    },
    {
      "epoch": 1.0526315789473684,
      "grad_norm": 6.8135085105896,
      "learning_rate": 4.697049536890033e-07,
      "loss": 0.8917,
      "step": 1500
    },
    {
      "epoch": 1.0526315789473684,
      "eval_loss": 1.0086382627487183,
      "eval_runtime": 27.6965,
      "eval_samples_per_second": 173.307,
      "eval_steps_per_second": 2.708,
      "step": 1500
    },
    {
      "epoch": 1.0596491228070175,
      "grad_norm": 6.774071216583252,
      "learning_rate": 4.6475522990138276e-07,
      "loss": 0.8773,
      "step": 1510
    },
    {
      "epoch": 1.0666666666666667,
      "grad_norm": 6.860315799713135,
      "learning_rate": 4.592596263646712e-07,
      "loss": 0.9042,
      "step": 1520
    },
    {
      "epoch": 1.0736842105263158,
      "grad_norm": 7.362914085388184,
      "learning_rate": 4.5376897311788825e-07,
      "loss": 0.8973,
      "step": 1530
    },
    {
      "epoch": 1.080701754385965,
      "grad_norm": 6.993128776550293,
      "learning_rate": 4.48283937320489e-07,
      "loss": 0.8533,
      "step": 1540
    },
    {
      "epoch": 1.087719298245614,
      "grad_norm": 7.575523853302002,
      "learning_rate": 4.4280518544936224e-07,
      "loss": 0.8896,
      "step": 1550
    },
    {
      "epoch": 1.0947368421052632,
      "grad_norm": 7.457510948181152,
      "learning_rate": 4.3733338321784777e-07,
      "loss": 0.873,
      "step": 1560
    },
    {
      "epoch": 1.1017543859649124,
      "grad_norm": 6.553786754608154,
      "learning_rate": 4.3186919549484777e-07,
      "loss": 0.8735,
      "step": 1570
    },
    {
      "epoch": 1.1087719298245613,
      "grad_norm": 7.161813259124756,
      "learning_rate": 4.264132862240387e-07,
      "loss": 0.8708,
      "step": 1580
    },
    {
      "epoch": 1.1157894736842104,
      "grad_norm": 7.342090129852295,
      "learning_rate": 4.2096631834319687e-07,
      "loss": 0.8627,
      "step": 1590
    },
    {
      "epoch": 1.1228070175438596,
      "grad_norm": 7.708263874053955,
      "learning_rate": 4.155289537036466e-07,
      "loss": 0.8916,
      "step": 1600
    },
    {
      "epoch": 1.1228070175438596,
      "eval_loss": 1.0080682039260864,
      "eval_runtime": 27.6601,
      "eval_samples_per_second": 173.535,
      "eval_steps_per_second": 2.711,
      "step": 1600
    },
    {
      "epoch": 1.1298245614035087,
      "grad_norm": 6.637975215911865,
      "learning_rate": 4.101018529898398e-07,
      "loss": 0.8598,
      "step": 1610
    },
    {
      "epoch": 1.1368421052631579,
      "grad_norm": 7.271252155303955,
      "learning_rate": 4.046856756390766e-07,
      "loss": 0.8632,
      "step": 1620
    },
    {
      "epoch": 1.143859649122807,
      "grad_norm": 6.89381742477417,
      "learning_rate": 3.99281079761379e-07,
      "loss": 0.8877,
      "step": 1630
    },
    {
      "epoch": 1.1508771929824562,
      "grad_norm": 7.032026290893555,
      "learning_rate": 3.938887220595252e-07,
      "loss": 0.879,
      "step": 1640
    },
    {
      "epoch": 1.1578947368421053,
      "grad_norm": 7.385174751281738,
      "learning_rate": 3.885092577492542e-07,
      "loss": 0.8893,
      "step": 1650
    },
    {
      "epoch": 1.1649122807017545,
      "grad_norm": 7.389017105102539,
      "learning_rate": 3.8314334047965207e-07,
      "loss": 0.8727,
      "step": 1660
    },
    {
      "epoch": 1.1719298245614036,
      "grad_norm": 6.653899192810059,
      "learning_rate": 3.7779162225372846e-07,
      "loss": 0.8941,
      "step": 1670
    },
    {
      "epoch": 1.1789473684210527,
      "grad_norm": 7.119126319885254,
      "learning_rate": 3.724547533491924e-07,
      "loss": 0.8676,
      "step": 1680
    },
    {
      "epoch": 1.1859649122807017,
      "grad_norm": 7.610691070556641,
      "learning_rate": 3.671333822394386e-07,
      "loss": 0.864,
      "step": 1690
    },
    {
      "epoch": 1.1929824561403508,
      "grad_norm": 6.851118564605713,
      "learning_rate": 3.6182815551475223e-07,
      "loss": 0.885,
      "step": 1700
    },
    {
      "epoch": 1.1929824561403508,
      "eval_loss": 1.0073468685150146,
      "eval_runtime": 27.66,
      "eval_samples_per_second": 173.536,
      "eval_steps_per_second": 2.712,
      "step": 1700
    },
    {
      "epoch": 1.2,
      "grad_norm": 7.08779764175415,
      "learning_rate": 3.565397178037429e-07,
      "loss": 0.875,
      "step": 1710
    },
    {
      "epoch": 1.207017543859649,
      "grad_norm": 6.938493728637695,
      "learning_rate": 3.5126871169501815e-07,
      "loss": 0.8823,
      "step": 1720
    },
    {
      "epoch": 1.2140350877192982,
      "grad_norm": 7.4112114906311035,
      "learning_rate": 3.4601577765910175e-07,
      "loss": 0.8428,
      "step": 1730
    },
    {
      "epoch": 1.2210526315789474,
      "grad_norm": 7.859072208404541,
      "learning_rate": 3.407815539706124e-07,
      "loss": 0.8659,
      "step": 1740
    },
    {
      "epoch": 1.2280701754385965,
      "grad_norm": 6.562801837921143,
      "learning_rate": 3.3556667663070835e-07,
      "loss": 0.8654,
      "step": 1750
    },
    {
      "epoch": 1.2350877192982457,
      "grad_norm": 7.658775806427002,
      "learning_rate": 3.303717792898073e-07,
      "loss": 0.8652,
      "step": 1760
    },
    {
      "epoch": 1.2421052631578948,
      "grad_norm": 7.275959491729736,
      "learning_rate": 3.2519749317059327e-07,
      "loss": 0.8957,
      "step": 1770
    },
    {
      "epoch": 1.2491228070175437,
      "grad_norm": 7.704782485961914,
      "learning_rate": 3.200444469913172e-07,
      "loss": 0.8737,
      "step": 1780
    },
    {
      "epoch": 1.256140350877193,
      "grad_norm": 7.395431995391846,
      "learning_rate": 3.1491326688940344e-07,
      "loss": 0.8542,
      "step": 1790
    },
    {
      "epoch": 1.263157894736842,
      "grad_norm": 6.88340425491333,
      "learning_rate": 3.0980457634536774e-07,
      "loss": 0.8843,
      "step": 1800
    },
    {
      "epoch": 1.263157894736842,
      "eval_loss": 1.0033657550811768,
      "eval_runtime": 27.6659,
      "eval_samples_per_second": 173.499,
      "eval_steps_per_second": 2.711,
      "step": 1800
    },
    {
      "epoch": 1.2701754385964912,
      "grad_norm": 6.7408766746521,
      "learning_rate": 3.0471899610706036e-07,
      "loss": 0.8331,
      "step": 1810
    },
    {
      "epoch": 1.2771929824561403,
      "grad_norm": 7.153403282165527,
      "learning_rate": 2.996571441142397e-07,
      "loss": 0.8465,
      "step": 1820
    },
    {
      "epoch": 1.2842105263157895,
      "grad_norm": 7.26017427444458,
      "learning_rate": 2.9461963542348733e-07,
      "loss": 0.8785,
      "step": 1830
    },
    {
      "epoch": 1.2912280701754386,
      "grad_norm": 7.271636486053467,
      "learning_rate": 2.896070821334736e-07,
      "loss": 0.8831,
      "step": 1840
    },
    {
      "epoch": 1.2982456140350878,
      "grad_norm": 6.8561201095581055,
      "learning_rate": 2.846200933105829e-07,
      "loss": 0.8578,
      "step": 1850
    },
    {
      "epoch": 1.305263157894737,
      "grad_norm": 7.387796878814697,
      "learning_rate": 2.7965927491490704e-07,
      "loss": 0.8439,
      "step": 1860
    },
    {
      "epoch": 1.312280701754386,
      "grad_norm": 7.401048183441162,
      "learning_rate": 2.747252297266162e-07,
      "loss": 0.8944,
      "step": 1870
    },
    {
      "epoch": 1.3192982456140352,
      "grad_norm": 7.2983527183532715,
      "learning_rate": 2.698185572727151e-07,
      "loss": 0.8689,
      "step": 1880
    },
    {
      "epoch": 1.3263157894736843,
      "grad_norm": 7.557769775390625,
      "learning_rate": 2.6493985375419775e-07,
      "loss": 0.885,
      "step": 1890
    },
    {
      "epoch": 1.3333333333333333,
      "grad_norm": 6.881629943847656,
      "learning_rate": 2.6008971197360175e-07,
      "loss": 0.8644,
      "step": 1900
    },
    {
      "epoch": 1.3333333333333333,
      "eval_loss": 1.0021144151687622,
      "eval_runtime": 27.6613,
      "eval_samples_per_second": 173.527,
      "eval_steps_per_second": 2.711,
      "step": 1900
    },
    {
      "epoch": 1.3403508771929824,
      "grad_norm": 7.333024978637695,
      "learning_rate": 2.5526872126297986e-07,
      "loss": 0.8912,
      "step": 1910
    },
    {
      "epoch": 1.3473684210526315,
      "grad_norm": 7.045767784118652,
      "learning_rate": 2.5047746741228977e-07,
      "loss": 0.8747,
      "step": 1920
    },
    {
      "epoch": 1.3543859649122807,
      "grad_norm": 7.227980613708496,
      "learning_rate": 2.457165325982169e-07,
      "loss": 0.8647,
      "step": 1930
    },
    {
      "epoch": 1.3614035087719298,
      "grad_norm": 7.303330898284912,
      "learning_rate": 2.4098649531343494e-07,
      "loss": 0.8657,
      "step": 1940
    },
    {
      "epoch": 1.368421052631579,
      "grad_norm": 7.276090621948242,
      "learning_rate": 2.362879302963135e-07,
      "loss": 0.8845,
      "step": 1950
    },
    {
      "epoch": 1.3754385964912281,
      "grad_norm": 7.321451663970947,
      "learning_rate": 2.3162140846108363e-07,
      "loss": 0.8487,
      "step": 1960
    },
    {
      "epoch": 1.3824561403508773,
      "grad_norm": 7.5262980461120605,
      "learning_rate": 2.2698749682846685e-07,
      "loss": 0.8762,
      "step": 1970
    },
    {
      "epoch": 1.3894736842105262,
      "grad_norm": 7.401157855987549,
      "learning_rate": 2.223867584567766e-07,
      "loss": 0.8748,
      "step": 1980
    },
    {
      "epoch": 1.3964912280701753,
      "grad_norm": 7.1058149337768555,
      "learning_rate": 2.1781975237350365e-07,
      "loss": 0.8641,
      "step": 1990
    },
    {
      "epoch": 1.4035087719298245,
      "grad_norm": 7.203502178192139,
      "learning_rate": 2.1328703350738765e-07,
      "loss": 0.8661,
      "step": 2000
    },
    {
      "epoch": 1.4035087719298245,
      "eval_loss": 1.000258445739746,
      "eval_runtime": 27.6622,
      "eval_samples_per_second": 173.522,
      "eval_steps_per_second": 2.711,
      "step": 2000
    },
    {
      "epoch": 1.4105263157894736,
      "grad_norm": 7.68574857711792,
      "learning_rate": 2.0878915262099096e-07,
      "loss": 0.8964,
      "step": 2010
    },
    {
      "epoch": 1.4175438596491228,
      "grad_norm": 7.339992523193359,
      "learning_rate": 2.0432665624377433e-07,
      "loss": 0.8779,
      "step": 2020
    },
    {
      "epoch": 1.424561403508772,
      "grad_norm": 7.711989879608154,
      "learning_rate": 1.999000866056908e-07,
      "loss": 0.8958,
      "step": 2030
    },
    {
      "epoch": 1.431578947368421,
      "grad_norm": 6.8218488693237305,
      "learning_rate": 1.9550998157129944e-07,
      "loss": 0.8848,
      "step": 2040
    },
    {
      "epoch": 1.4385964912280702,
      "grad_norm": 7.602545261383057,
      "learning_rate": 1.9115687457441022e-07,
      "loss": 0.8668,
      "step": 2050
    },
    {
      "epoch": 1.4456140350877194,
      "grad_norm": 7.199863433837891,
      "learning_rate": 1.8684129455326808e-07,
      "loss": 0.8705,
      "step": 2060
    },
    {
      "epoch": 1.4526315789473685,
      "grad_norm": 7.163413047790527,
      "learning_rate": 1.8256376588628235e-07,
      "loss": 0.8641,
      "step": 2070
    },
    {
      "epoch": 1.4596491228070176,
      "grad_norm": 7.178804397583008,
      "learning_rate": 1.7832480832830986e-07,
      "loss": 0.8526,
      "step": 2080
    },
    {
      "epoch": 1.4666666666666668,
      "grad_norm": 7.084789752960205,
      "learning_rate": 1.7412493694750173e-07,
      "loss": 0.8834,
      "step": 2090
    },
    {
      "epoch": 1.4736842105263157,
      "grad_norm": 7.647516250610352,
      "learning_rate": 1.6996466206271675e-07,
      "loss": 0.8712,
      "step": 2100
    },
    {
      "epoch": 1.4736842105263157,
      "eval_loss": 1.0002570152282715,
      "eval_runtime": 27.6725,
      "eval_samples_per_second": 173.457,
      "eval_steps_per_second": 2.71,
      "step": 2100
    },
    {
      "epoch": 1.4807017543859649,
      "grad_norm": 7.682786464691162,
      "learning_rate": 1.6584448918151518e-07,
      "loss": 0.8648,
      "step": 2110
    },
    {
      "epoch": 1.487719298245614,
      "grad_norm": 6.9408650398254395,
      "learning_rate": 1.6176491893873367e-07,
      "loss": 0.8775,
      "step": 2120
    },
    {
      "epoch": 1.4947368421052631,
      "grad_norm": 7.477031230926514,
      "learning_rate": 1.5772644703565564e-07,
      "loss": 0.8648,
      "step": 2130
    },
    {
      "epoch": 1.5017543859649123,
      "grad_norm": 7.054373741149902,
      "learning_rate": 1.537295641797785e-07,
      "loss": 0.8608,
      "step": 2140
    },
    {
      "epoch": 1.5087719298245614,
      "grad_norm": 6.98421049118042,
      "learning_rate": 1.4977475602518874e-07,
      "loss": 0.8653,
      "step": 2150
    },
    {
      "epoch": 1.5157894736842106,
      "grad_norm": 7.556164264678955,
      "learning_rate": 1.4586250311355132e-07,
      "loss": 0.8691,
      "step": 2160
    },
    {
      "epoch": 1.5228070175438595,
      "grad_norm": 7.721457004547119,
      "learning_rate": 1.4199328081572e-07,
      "loss": 0.8853,
      "step": 2170
    },
    {
      "epoch": 1.5298245614035086,
      "grad_norm": 7.5607428550720215,
      "learning_rate": 1.38167559273975e-07,
      "loss": 0.8647,
      "step": 2180
    },
    {
      "epoch": 1.5368421052631578,
      "grad_norm": 7.398414134979248,
      "learning_rate": 1.3438580334489818e-07,
      "loss": 0.8524,
      "step": 2190
    },
    {
      "epoch": 1.543859649122807,
      "grad_norm": 7.229887008666992,
      "learning_rate": 1.3064847254288796e-07,
      "loss": 0.8638,
      "step": 2200
    },
    {
      "epoch": 1.543859649122807,
      "eval_loss": 0.9979353547096252,
      "eval_runtime": 27.6809,
      "eval_samples_per_second": 173.405,
      "eval_steps_per_second": 2.709,
      "step": 2200
    },
    {
      "epoch": 1.550877192982456,
      "grad_norm": 7.479950428009033,
      "learning_rate": 1.26956020984325e-07,
      "loss": 0.8672,
      "step": 2210
    },
    {
      "epoch": 1.5578947368421052,
      "grad_norm": 7.526796340942383,
      "learning_rate": 1.2330889733239368e-07,
      "loss": 0.8882,
      "step": 2220
    },
    {
      "epoch": 1.5649122807017544,
      "grad_norm": 7.098681926727295,
      "learning_rate": 1.197075447425656e-07,
      "loss": 0.8564,
      "step": 2230
    },
    {
      "epoch": 1.5719298245614035,
      "grad_norm": 7.627535343170166,
      "learning_rate": 1.16152400808752e-07,
      "loss": 0.8778,
      "step": 2240
    },
    {
      "epoch": 1.5789473684210527,
      "grad_norm": 7.635378360748291,
      "learning_rate": 1.1264389751013325e-07,
      "loss": 0.8615,
      "step": 2250
    },
    {
      "epoch": 1.5859649122807018,
      "grad_norm": 7.256911754608154,
      "learning_rate": 1.0918246115866964e-07,
      "loss": 0.8828,
      "step": 2260
    },
    {
      "epoch": 1.592982456140351,
      "grad_norm": 7.054688453674316,
      "learning_rate": 1.0576851234730094e-07,
      "loss": 0.8602,
      "step": 2270
    },
    {
      "epoch": 1.6,
      "grad_norm": 7.2597479820251465,
      "learning_rate": 1.0240246589884045e-07,
      "loss": 0.8588,
      "step": 2280
    },
    {
      "epoch": 1.6070175438596492,
      "grad_norm": 7.462535381317139,
      "learning_rate": 9.90847308155715e-08,
      "loss": 0.8623,
      "step": 2290
    },
    {
      "epoch": 1.6140350877192984,
      "grad_norm": 7.354959487915039,
      "learning_rate": 9.581571022954987e-08,
      "loss": 0.8632,
      "step": 2300
    },
    {
      "epoch": 1.6140350877192984,
      "eval_loss": 0.9973437786102295,
      "eval_runtime": 27.6881,
      "eval_samples_per_second": 173.36,
      "eval_steps_per_second": 2.709,
      "step": 2300
    },
    {
      "epoch": 1.6210526315789475,
      "grad_norm": 7.283778667449951,
      "learning_rate": 9.259580135361927e-08,
      "loss": 0.8684,
      "step": 2310
    },
    {
      "epoch": 1.6280701754385964,
      "grad_norm": 7.570828914642334,
      "learning_rate": 8.942539543314798e-08,
      "loss": 0.8609,
      "step": 2320
    },
    {
      "epoch": 1.6350877192982456,
      "grad_norm": 7.366217613220215,
      "learning_rate": 8.630487769848876e-08,
      "loss": 0.8722,
      "step": 2330
    },
    {
      "epoch": 1.6421052631578947,
      "grad_norm": 7.667774200439453,
      "learning_rate": 8.32346273181696e-08,
      "loss": 0.8883,
      "step": 2340
    },
    {
      "epoch": 1.6491228070175439,
      "grad_norm": 8.111892700195312,
      "learning_rate": 8.021501735282266e-08,
      "loss": 0.8599,
      "step": 2350
    },
    {
      "epoch": 1.656140350877193,
      "grad_norm": 7.690216064453125,
      "learning_rate": 7.724641470985377e-08,
      "loss": 0.8951,
      "step": 2360
    },
    {
      "epoch": 1.663157894736842,
      "grad_norm": 7.080111980438232,
      "learning_rate": 7.432918009885996e-08,
      "loss": 0.865,
      "step": 2370
    },
    {
      "epoch": 1.670175438596491,
      "grad_norm": 7.580221176147461,
      "learning_rate": 7.146366798780096e-08,
      "loss": 0.8905,
      "step": 2380
    },
    {
      "epoch": 1.6771929824561402,
      "grad_norm": 6.910195827484131,
      "learning_rate": 6.865022655992798e-08,
      "loss": 0.8501,
      "step": 2390
    },
    {
      "epoch": 1.6842105263157894,
      "grad_norm": 7.176208972930908,
      "learning_rate": 6.588919767147638e-08,
      "loss": 0.8461,
      "step": 2400
    },
    {
      "epoch": 1.6842105263157894,
      "eval_loss": 0.9966626167297363,
      "eval_runtime": 27.668,
      "eval_samples_per_second": 173.486,
      "eval_steps_per_second": 2.711,
      "step": 2400
    },
    {
      "epoch": 1.6912280701754385,
      "grad_norm": 7.764338970184326,
      "learning_rate": 6.318091681012771e-08,
      "loss": 0.8711,
      "step": 2410
    },
    {
      "epoch": 1.6982456140350877,
      "grad_norm": 8.283316612243652,
      "learning_rate": 6.052571305424531e-08,
      "loss": 0.8738,
      "step": 2420
    },
    {
      "epoch": 1.7052631578947368,
      "grad_norm": 7.315950870513916,
      "learning_rate": 5.7923909032888295e-08,
      "loss": 0.8719,
      "step": 2430
    },
    {
      "epoch": 1.712280701754386,
      "grad_norm": 7.591914653778076,
      "learning_rate": 5.537582088660936e-08,
      "loss": 0.8708,
      "step": 2440
    },
    {
      "epoch": 1.719298245614035,
      "grad_norm": 7.378705978393555,
      "learning_rate": 5.2881758229041394e-08,
      "loss": 0.8722,
      "step": 2450
    },
    {
      "epoch": 1.7263157894736842,
      "grad_norm": 7.416294097900391,
      "learning_rate": 5.044202410927706e-08,
      "loss": 0.8586,
      "step": 2460
    },
    {
      "epoch": 1.7333333333333334,
      "grad_norm": 7.301969051361084,
      "learning_rate": 4.805691497504505e-08,
      "loss": 0.891,
      "step": 2470
    },
    {
      "epoch": 1.7403508771929825,
      "grad_norm": 6.946348190307617,
      "learning_rate": 4.5726720636690195e-08,
      "loss": 0.8871,
      "step": 2480
    },
    {
      "epoch": 1.7473684210526317,
      "grad_norm": 7.327394008636475,
      "learning_rate": 4.3451724231958645e-08,
      "loss": 0.8688,
      "step": 2490
    },
    {
      "epoch": 1.7543859649122808,
      "grad_norm": 7.17736291885376,
      "learning_rate": 4.123220219159418e-08,
      "loss": 0.8729,
      "step": 2500
    },
    {
      "epoch": 1.7543859649122808,
      "eval_loss": 0.9957481622695923,
      "eval_runtime": 27.665,
      "eval_samples_per_second": 173.504,
      "eval_steps_per_second": 2.711,
      "step": 2500
    }
  ],
  "logging_steps": 10,
  "max_steps": 2850,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 500,
  "total_flos": 6.354365204175782e+18,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}