{
  "best_metric": 1.1103906631469727,
  "best_model_checkpoint": "/home/wani/Desktop/roberta-pretrain/ckpt/roberta/pretrain/medium/256/checkpoint-12330",
  "epoch": 10.386703853378108,
  "eval_steps": 90,
  "global_step": 12330,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    { "epoch": 0.008423928510444533, "grad_norm": 5.073121070861816, "learning_rate": 4.166666666666667e-06, "loss": 7.2395, "step": 10 },
    { "epoch": 0.016847857020889066, "grad_norm": 4.587955474853516, "learning_rate": 8.333333333333334e-06, "loss": 7.0836, "step": 20 },
    { "epoch": 0.0252717855313336, "grad_norm": 3.8589327335357666, "learning_rate": 1.25e-05, "loss": 6.8156, "step": 30 },
    { "epoch": 0.03369571404177813, "grad_norm": 3.4427683353424072, "learning_rate": 1.6666666666666667e-05, "loss": 6.5549, "step": 40 },
    { "epoch": 0.04211964255222266, "grad_norm": 3.109060525894165, "learning_rate": 2.0833333333333333e-05, "loss": 6.3522, "step": 50 },
    { "epoch": 0.0505435710626672, "grad_norm": 2.86232590675354, "learning_rate": 2.5e-05, "loss": 6.1983, "step": 60 },
    { "epoch": 0.05896749957311173, "grad_norm": 2.6880924701690674, "learning_rate": 2.9166666666666666e-05, "loss": 6.0796, "step": 70 },
    { "epoch": 0.06739142808355626, "grad_norm": 2.490527629852295, "learning_rate": 3.3333333333333335e-05, "loss": 5.9754, "step": 80 },
    { "epoch": 0.0758153565940008, "grad_norm": 2.3156356811523438, "learning_rate": 3.75e-05, "loss": 5.8736, "step": 90 },
    { "epoch": 0.0758153565940008, "eval_accuracy": 0.22415329938580753, "eval_loss": 5.8054423332214355, "eval_runtime": 910.9652, "eval_samples_per_second": 548.183, "eval_steps_per_second": 5.076, "step": 90 },
    { "epoch": 0.08423928510444532, "grad_norm": 2.1557302474975586, "learning_rate": 4.1666666666666665e-05, "loss": 5.7691, "step": 100 },
    { "epoch": 0.09266321361488987, "grad_norm": 1.9360383749008179, "learning_rate": 4.5833333333333334e-05, "loss": 5.6653, "step": 110 },
    { "epoch": 0.1010871421253344, "grad_norm": 1.731399655342102, "learning_rate": 5e-05, "loss": 5.5598, "step": 120 },
    { "epoch": 0.10951107063577893, "grad_norm": 1.508693814277649, "learning_rate": 5.416666666666667e-05, "loss": 5.4574, "step": 130 },
    { "epoch": 0.11793499914622346, "grad_norm": 1.2835007905960083, "learning_rate": 5.833333333333333e-05, "loss": 5.3585, "step": 140 },
    { "epoch": 0.126358927656668, "grad_norm": 1.0747231245040894, "learning_rate": 6.25e-05, "loss": 5.2667, "step": 150 },
    { "epoch": 0.13478285616711252, "grad_norm": 0.852271318435669, "learning_rate": 6.666666666666667e-05, "loss": 5.1779, "step": 160 },
    { "epoch": 0.14320678467755707, "grad_norm": 0.7001814842224121, "learning_rate": 7.083333333333334e-05, "loss": 5.0965, "step": 170 },
    { "epoch": 0.1516307131880016, "grad_norm": 0.5657457709312439, "learning_rate": 7.5e-05, "loss": 5.0237, "step": 180 },
    { "epoch": 0.1516307131880016, "eval_accuracy": 0.23888299376264316, "eval_loss": 4.981535911560059, "eval_runtime": 882.341, "eval_samples_per_second": 565.967, "eval_steps_per_second": 5.241, "step": 180 },
    { "epoch": 0.16005464169844613, "grad_norm": 0.4981703758239746, "learning_rate": 7.916666666666666e-05, "loss": 4.9662, "step": 190 },
    { "epoch": 0.16847857020889065, "grad_norm": 0.40254291892051697, "learning_rate": 8.333333333333333e-05, "loss": 4.9195, "step": 200 },
    { "epoch": 0.1769024987193352, "grad_norm": 0.32726043462753296, "learning_rate": 8.75e-05, "loss": 4.8766, "step": 210 },
    { "epoch": 0.18532642722977974, "grad_norm": 0.2471727877855301, "learning_rate": 9.166666666666667e-05, "loss": 4.8458, "step": 220 },
    { "epoch": 0.19375035574022426, "grad_norm": 0.2568261921405792, "learning_rate": 9.583333333333334e-05, "loss": 4.8169, "step": 230 },
    { "epoch": 0.2021742842506688, "grad_norm": 0.19310955703258514, "learning_rate": 0.0001, "loss": 4.7926, "step": 240 },
    { "epoch": 0.21059821276111332, "grad_norm": 0.20584674179553986, "learning_rate": 0.00010416666666666667, "loss": 4.7714, "step": 250 },
    { "epoch": 0.21902214127155786, "grad_norm": 0.26360729336738586, "learning_rate": 0.00010833333333333334, "loss": 4.7511, "step": 260 },
    { "epoch": 0.22744606978200238, "grad_norm": 0.1681978851556778, "learning_rate": 0.00011250000000000001, "loss": 4.7309, "step": 270 },
    { "epoch": 0.22744606978200238, "eval_accuracy": 0.28488370423336357, "eval_loss": 4.706047534942627, "eval_runtime": 889.3977, "eval_samples_per_second": 561.477, "eval_steps_per_second": 5.199, "step": 270 },
    { "epoch": 0.23586999829244693, "grad_norm": 0.17959143221378326, "learning_rate": 0.00011666666666666667, "loss": 4.7148, "step": 280 },
    { "epoch": 0.24429392680289147, "grad_norm": 0.27109047770500183, "learning_rate": 0.00012083333333333333, "loss": 4.6989, "step": 290 },
    { "epoch": 0.252717855313336, "grad_norm": 0.2674080431461334, "learning_rate": 0.000125, "loss": 4.6826, "step": 300 },
    { "epoch": 0.2611417838237805, "grad_norm": 0.24386395514011383, "learning_rate": 0.00012916666666666667, "loss": 4.6707, "step": 310 },
    { "epoch": 0.26956571233422505, "grad_norm": 0.5274083614349365, "learning_rate": 0.00013333333333333334, "loss": 4.6553, "step": 320 },
    { "epoch": 0.2779896408446696, "grad_norm": 0.4005141258239746, "learning_rate": 0.0001375, "loss": 4.6446, "step": 330 },
    { "epoch": 0.28641356935511414, "grad_norm": 0.3732853829860687, "learning_rate": 0.00014166666666666668, "loss": 4.6315, "step": 340 },
    { "epoch": 0.29483749786555863, "grad_norm": 0.2742752730846405, "learning_rate": 0.00014583333333333335, "loss": 4.6221, "step": 350 },
    { "epoch": 0.3032614263760032, "grad_norm": 0.20482462644577026, "learning_rate": 0.00015, "loss": 4.6138, "step": 360 },
    { "epoch": 0.3032614263760032, "eval_accuracy": 0.28836420126551926, "eval_loss": 4.5933918952941895, "eval_runtime": 880.4452, "eval_samples_per_second": 567.186, "eval_steps_per_second": 5.252, "step": 360 },
    { "epoch": 0.3116853548864477, "grad_norm": 0.26613757014274597, "learning_rate": 0.00015416666666666668, "loss": 4.5983, "step": 370 },
    { "epoch": 0.32010928339689226, "grad_norm": 0.20205098390579224, "learning_rate": 0.00015833333333333332, "loss": 4.5922, "step": 380 },
    { "epoch": 0.3285332119073368, "grad_norm": 0.5084218978881836, "learning_rate": 0.00016250000000000002, "loss": 4.5826, "step": 390 },
    { "epoch": 0.3369571404177813, "grad_norm": 0.2835780084133148, "learning_rate": 0.00016666666666666666, "loss": 4.5771, "step": 400 },
    { "epoch": 0.34538106892822584, "grad_norm": 0.23976200819015503, "learning_rate": 0.00017083333333333333, "loss": 4.5726, "step": 410 },
    { "epoch": 0.3538049974386704, "grad_norm": 0.2275087982416153, "learning_rate": 0.000175, "loss": 4.5666, "step": 420 },
    { "epoch": 0.36222892594911493, "grad_norm": 0.27758899331092834, "learning_rate": 0.00017916666666666667, "loss": 4.5654, "step": 430 },
    { "epoch": 0.3706528544595595, "grad_norm": 0.18581350147724152, "learning_rate": 0.00018333333333333334, "loss": 4.5593, "step": 440 },
    { "epoch": 0.37907678297000397, "grad_norm": 0.1667676419019699, "learning_rate": 0.0001875, "loss": 4.5538, "step": 450 },
    { "epoch": 0.37907678297000397, "eval_accuracy": 0.28966679521500804, "eval_loss": 4.547606468200684, "eval_runtime": 890.3979, "eval_samples_per_second": 560.846, "eval_steps_per_second": 5.193, "step": 450 },
    { "epoch": 0.3875007114804485, "grad_norm": 0.32489290833473206, "learning_rate": 0.00019166666666666667, "loss": 4.5532, "step": 460 },
    { "epoch": 0.39592463999089306, "grad_norm": 0.7000045776367188, "learning_rate": 0.00019583333333333334, "loss": 4.5484, "step": 470 },
    { "epoch": 0.4043485685013376, "grad_norm": 0.43668240308761597, "learning_rate": 0.0002, "loss": 4.5489, "step": 480 },
    { "epoch": 0.4127724970117821, "grad_norm": 0.36716368794441223, "learning_rate": 0.00020416666666666668, "loss": 4.5459, "step": 490 },
    { "epoch": 0.42119642552222664, "grad_norm": 0.30332931876182556, "learning_rate": 0.00020833333333333335, "loss": 4.5418, "step": 500 },
    { "epoch": 0.4296203540326712, "grad_norm": 0.5920347571372986, "learning_rate": 0.0002125, "loss": 4.5406, "step": 510 },
    { "epoch": 0.4380442825431157, "grad_norm": 0.45020386576652527, "learning_rate": 0.00021666666666666668, "loss": 4.5372, "step": 520 },
    { "epoch": 0.44646821105356027, "grad_norm": 0.33357909321784973, "learning_rate": 0.00022083333333333333, "loss": 4.5367, "step": 530 },
    { "epoch": 0.45489213956400476, "grad_norm": 0.45888572931289673, "learning_rate": 0.00022500000000000002, "loss": 4.5344, "step": 540 },
    { "epoch": 0.45489213956400476, "eval_accuracy": 0.2902362393111046, "eval_loss": 4.531790256500244, "eval_runtime": 882.2427, "eval_samples_per_second": 566.03, "eval_steps_per_second": 5.241, "step": 540 },
    { "epoch": 0.4633160680744493, "grad_norm": 0.4458440840244293, "learning_rate": 0.00022916666666666666, "loss": 4.5328, "step": 550 },
    { "epoch": 0.47173999658489385, "grad_norm": 0.1917838305234909, "learning_rate": 0.00023333333333333333, "loss": 4.5296, "step": 560 },
    { "epoch": 0.4801639250953384, "grad_norm": 0.8310424089431763, "learning_rate": 0.0002375, "loss": 4.5275, "step": 570 },
    { "epoch": 0.48858785360578294, "grad_norm": 0.4216615855693817, "learning_rate": 0.00024166666666666667, "loss": 4.531, "step": 580 },
    { "epoch": 0.49701178211622743, "grad_norm": 0.2320231944322586, "learning_rate": 0.0002458333333333333, "loss": 4.5276, "step": 590 },
    { "epoch": 0.505435710626672, "grad_norm": 0.3115006983280182, "learning_rate": 0.00025, "loss": 4.5252, "step": 600 },
    { "epoch": 0.5138596391371165, "grad_norm": 0.13032270967960358, "learning_rate": 0.00025416666666666665, "loss": 4.5227, "step": 610 },
    { "epoch": 0.522283567647561, "grad_norm": 0.5333927273750305, "learning_rate": 0.00025833333333333334, "loss": 4.5214, "step": 620 },
    { "epoch": 0.5307074961580056, "grad_norm": 0.8976441025733948, "learning_rate": 0.00026250000000000004, "loss": 4.5218, "step": 630 },
    { "epoch": 0.5307074961580056, "eval_accuracy": 0.290083406000685, "eval_loss": 4.522771835327148, "eval_runtime": 892.1941, "eval_samples_per_second": 559.717, "eval_steps_per_second": 5.183, "step": 630 },
    { "epoch": 0.5391314246684501, "grad_norm": 0.1657322496175766, "learning_rate": 0.0002666666666666667, "loss": 4.523, "step": 640 },
    { "epoch": 0.5475553531788947, "grad_norm": 0.1890048235654831, "learning_rate": 0.0002708333333333333, "loss": 4.5185, "step": 650 },
    { "epoch": 0.5559792816893392, "grad_norm": 0.8254080414772034, "learning_rate": 0.000275, "loss": 4.5196, "step": 660 },
    { "epoch": 0.5644032101997837, "grad_norm": 0.1703944355249405, "learning_rate": 0.00027916666666666666, "loss": 4.52, "step": 670 },
    { "epoch": 0.5728271387102283, "grad_norm": 0.33486783504486084, "learning_rate": 0.00028333333333333335, "loss": 4.5139, "step": 680 },
    { "epoch": 0.5812510672206728, "grad_norm": 0.4759036600589752, "learning_rate": 0.0002875, "loss": 4.5158, "step": 690 },
    { "epoch": 0.5896749957311173, "grad_norm": 0.26314422488212585, "learning_rate": 0.0002916666666666667, "loss": 4.5135, "step": 700 },
    { "epoch": 0.5980989242415619, "grad_norm": 0.39898937940597534, "learning_rate": 0.00029583333333333333, "loss": 4.5114, "step": 710 },
    { "epoch": 0.6065228527520063, "grad_norm": 0.5003794431686401, "learning_rate": 0.0003, "loss": 4.5148, "step": 720 },
    { "epoch": 0.6065228527520063, "eval_accuracy": 0.2903979539286128, "eval_loss": 4.508981704711914, "eval_runtime": 878.8487, "eval_samples_per_second": 568.216, "eval_steps_per_second": 5.261, "step": 720 },
    { "epoch": 0.614946781262451, "grad_norm": 0.2276950627565384, "learning_rate": 0.00030416666666666667, "loss": 4.5111, "step": 730 },
    { "epoch": 0.6233707097728954, "grad_norm": 0.21725377440452576, "learning_rate": 0.00030833333333333337, "loss": 4.5088, "step": 740 },
    { "epoch": 0.6317946382833399, "grad_norm": 0.8084585666656494, "learning_rate": 0.0003125, "loss": 4.5074, "step": 750 },
    { "epoch": 0.6402185667937845, "grad_norm": 0.46915069222450256, "learning_rate": 0.00031666666666666665, "loss": 4.5072, "step": 760 },
    { "epoch": 0.648642495304229, "grad_norm": 0.15649260580539703, "learning_rate": 0.00032083333333333334, "loss": 4.5039, "step": 770 },
    { "epoch": 0.6570664238146736, "grad_norm": 0.42916274070739746, "learning_rate": 0.00032500000000000004, "loss": 4.5056, "step": 780 },
    { "epoch": 0.6654903523251181, "grad_norm": 0.287572979927063, "learning_rate": 0.0003291666666666667, "loss": 4.5045, "step": 790 },
    { "epoch": 0.6739142808355626, "grad_norm": 0.6869699358940125, "learning_rate": 0.0003333333333333333, "loss": 4.5029, "step": 800 },
    { "epoch": 0.6823382093460072, "grad_norm": 0.2973476052284241, "learning_rate": 0.0003375, "loss": 4.5009, "step": 810 },
    { "epoch": 0.6823382093460072, "eval_accuracy": 0.29041409279207236, "eval_loss": 4.497637748718262, "eval_runtime": 872.3603, "eval_samples_per_second": 572.442, "eval_steps_per_second": 5.301, "step": 810 },
    { "epoch": 0.6907621378564517, "grad_norm": 0.5773557424545288, "learning_rate": 0.00034166666666666666, "loss": 4.5024, "step": 820 },
    { "epoch": 0.6991860663668963, "grad_norm": 0.31921157240867615, "learning_rate": 0.00034583333333333335, "loss": 4.5006, "step": 830 },
    { "epoch": 0.7076099948773408, "grad_norm": 0.4232361912727356, "learning_rate": 0.00035, "loss": 4.5001, "step": 840 },
    { "epoch": 0.7160339233877853, "grad_norm": 0.30865538120269775, "learning_rate": 0.0003541666666666667, "loss": 4.4998, "step": 850 },
    { "epoch": 0.7244578518982299, "grad_norm": 0.6191368699073792, "learning_rate": 0.00035833333333333333, "loss": 4.4967, "step": 860 },
    { "epoch": 0.7328817804086744, "grad_norm": 0.3202773630619049, "learning_rate": 0.0003625, "loss": 4.499, "step": 870 },
    { "epoch": 0.741305708919119, "grad_norm": 0.3090028464794159, "learning_rate": 0.00036666666666666667, "loss": 4.4967, "step": 880 },
    { "epoch": 0.7497296374295634, "grad_norm": 0.9248805046081543, "learning_rate": 0.00037083333333333337, "loss": 4.4962, "step": 890 },
    { "epoch": 0.7581535659400079, "grad_norm": 0.27745822072029114, "learning_rate": 0.000375, "loss": 4.4956, "step": 900 },
    { "epoch": 0.7581535659400079, "eval_accuracy": 0.29047371761644103, "eval_loss": 4.492140293121338, "eval_runtime": 888.1144, "eval_samples_per_second": 562.288, "eval_steps_per_second": 5.207, "step": 900 },
    { "epoch": 0.7665774944504525, "grad_norm": 0.2972380518913269, "learning_rate": 0.00037916666666666665, "loss": 4.4936, "step": 910 },
    { "epoch": 0.775001422960897, "grad_norm": 1.4440104961395264, "learning_rate": 0.00038333333333333334, "loss": 4.4956, "step": 920 },
    { "epoch": 0.7834253514713415, "grad_norm": 0.2894129455089569, "learning_rate": 0.00038750000000000004, "loss": 4.4961, "step": 930 },
    { "epoch": 0.7918492799817861, "grad_norm": 0.22757315635681152, "learning_rate": 0.0003916666666666667, "loss": 4.495, "step": 940 },
    { "epoch": 0.8002732084922306, "grad_norm": 0.2084762305021286, "learning_rate": 0.0003958333333333333, "loss": 4.4921, "step": 950 },
    { "epoch": 0.8086971370026752, "grad_norm": 0.4823535084724426, "learning_rate": 0.0004, "loss": 4.4928, "step": 960 },
    { "epoch": 0.8171210655131197, "grad_norm": 0.22939594089984894, "learning_rate": 0.00040416666666666666, "loss": 4.4889, "step": 970 },
    { "epoch": 0.8255449940235642, "grad_norm": 0.4983462989330292, "learning_rate": 0.00040833333333333336, "loss": 4.4888, "step": 980 },
    { "epoch": 0.8339689225340088, "grad_norm": 0.7445792555809021, "learning_rate": 0.0004125, "loss": 4.4899, "step": 990 },
    { "epoch": 0.8339689225340088, "eval_accuracy": 0.2903607895100575, "eval_loss": 4.490144729614258, "eval_runtime": 872.9885, "eval_samples_per_second": 572.03, "eval_steps_per_second": 5.297, "step": 990 },
    { "epoch": 0.8423928510444533, "grad_norm": 0.3264559805393219, "learning_rate": 0.0004166666666666667, "loss": 4.4879, "step": 1000 },
    { "epoch": 0.8508167795548979, "grad_norm": 0.5130082964897156, "learning_rate": 0.00042083333333333333, "loss": 4.4881, "step": 1010 },
    { "epoch": 0.8592407080653424, "grad_norm": 0.2776341736316681, "learning_rate": 0.000425, "loss": 4.4872, "step": 1020 },
    { "epoch": 0.8676646365757869, "grad_norm": 0.9157618880271912, "learning_rate": 0.00042916666666666667, "loss": 4.4868, "step": 1030 },
    { "epoch": 0.8760885650862315, "grad_norm": 0.22099615633487701, "learning_rate": 0.00043333333333333337, "loss": 4.4877, "step": 1040 },
    { "epoch": 0.8845124935966759, "grad_norm": 0.2313142567873001, "learning_rate": 0.0004375, "loss": 4.4845, "step": 1050 },
    { "epoch": 0.8929364221071205, "grad_norm": 0.4353635907173157, "learning_rate": 0.00044166666666666665, "loss": 4.4888, "step": 1060 },
    { "epoch": 0.901360350617565, "grad_norm": 0.2390984743833542, "learning_rate": 0.00044583333333333335, "loss": 4.4827, "step": 1070 },
    { "epoch": 0.9097842791280095, "grad_norm": 0.31369632482528687, "learning_rate": 0.00045000000000000004, "loss": 4.4832, "step": 1080 },
    { "epoch": 0.9097842791280095, "eval_accuracy": 0.2904605834264481, "eval_loss": 4.480494499206543, "eval_runtime": 880.1337, "eval_samples_per_second": 567.386, "eval_steps_per_second": 5.254, "step": 1080 },
    { "epoch": 0.9182082076384541, "grad_norm": 0.6700971722602844, "learning_rate": 0.0004541666666666667, "loss": 4.483, "step": 1090 },
    { "epoch": 0.9266321361488986, "grad_norm": 0.25950998067855835, "learning_rate": 0.0004583333333333333, "loss": 4.4832, "step": 1100 },
    { "epoch": 0.9350560646593432, "grad_norm": 0.2840316593647003, "learning_rate": 0.0004625, "loss": 4.4819, "step": 1110 },
    { "epoch": 0.9434799931697877, "grad_norm": 0.6859279274940491, "learning_rate": 0.00046666666666666666, "loss": 4.4819, "step": 1120 },
    { "epoch": 0.9519039216802322, "grad_norm": 0.2865343391895294, "learning_rate": 0.00047083333333333336, "loss": 4.48, "step": 1130 },
    { "epoch": 0.9603278501906768, "grad_norm": 1.179539442062378, "learning_rate": 0.000475, "loss": 4.4762, "step": 1140 },
    { "epoch": 0.9687517787011213, "grad_norm": 0.4731704294681549, "learning_rate": 0.0004791666666666667, "loss": 4.4831, "step": 1150 },
    { "epoch": 0.9771757072115659, "grad_norm": 0.298757404088974, "learning_rate": 0.00048333333333333334, "loss": 4.4742, "step": 1160 },
    { "epoch": 0.9855996357220104, "grad_norm": 1.0954639911651611, "learning_rate": 0.0004875, "loss": 4.46, "step": 1170 },
    { "epoch": 0.9855996357220104, "eval_accuracy": 0.29021425691327735, "eval_loss": 4.458162784576416, "eval_runtime": 887.8161, "eval_samples_per_second": 562.477, "eval_steps_per_second": 5.208, "step": 1170 },
    { "epoch": 0.9940235642324549, "grad_norm": 0.441949725151062, "learning_rate": 0.0004916666666666666, "loss": 4.4549, "step": 1180 },
    { "epoch": 1.0024474927428995, "grad_norm": 0.5917736887931824, "learning_rate": 0.0004958333333333334, "loss": 4.4425, "step": 1190 },
    { "epoch": 1.010871421253344, "grad_norm": 0.3910304307937622, "learning_rate": 0.0005, "loss": 4.4376, "step": 1200 },
    { "epoch": 1.0192953497637884, "grad_norm": 0.446277916431427, "learning_rate": 0.0005041666666666667, "loss": 4.4284, "step": 1210 },
    { "epoch": 1.027719278274233, "grad_norm": 0.7843539118766785, "learning_rate": 0.0005083333333333333, "loss": 4.4216, "step": 1220 },
    { "epoch": 1.0361432067846776, "grad_norm": 0.5028587579727173, "learning_rate": 0.0005124999999999999, "loss": 4.418, "step": 1230 },
    { "epoch": 1.044567135295122, "grad_norm": 0.5062530636787415, "learning_rate": 0.0005166666666666667, "loss": 4.4099, "step": 1240 },
    { "epoch": 1.0529910638055666, "grad_norm": 0.4109475016593933, "learning_rate": 0.0005208333333333334, "loss": 4.4005, "step": 1250 },
    { "epoch": 1.0614149923160112, "grad_norm": 0.494357705116272, "learning_rate": 0.0005250000000000001, "loss": 4.3924, "step": 1260 },
    { "epoch": 1.0614149923160112, "eval_accuracy": 0.29121270831959656, "eval_loss": 4.368500232696533, "eval_runtime": 885.6194, "eval_samples_per_second": 563.872, "eval_steps_per_second": 5.221, "step": 1260 },
    { "epoch": 1.0698389208264556, "grad_norm": 0.4964124858379364, "learning_rate": 0.0005291666666666667, "loss": 4.3843, "step": 1270 },
    { "epoch": 1.0782628493369002, "grad_norm": 0.6328290700912476, "learning_rate": 0.0005333333333333334, "loss": 4.3756, "step": 1280 },
    { "epoch": 1.0866867778473448, "grad_norm": 0.8674759268760681, "learning_rate": 0.0005375, "loss": 4.3697, "step": 1290 },
    { "epoch": 1.0951107063577892, "grad_norm": 0.4631132185459137, "learning_rate": 0.0005416666666666666, "loss": 4.3676, "step": 1300 },
    { "epoch": 1.1035346348682338, "grad_norm": 0.5043870210647583, "learning_rate": 0.0005458333333333333, "loss": 4.3582, "step": 1310 },
    { "epoch": 1.1119585633786784, "grad_norm": 0.5791853666305542, "learning_rate": 0.00055, "loss": 4.3529, "step": 1320 },
    { "epoch": 1.120382491889123, "grad_norm": 0.6443321108818054, "learning_rate": 0.0005541666666666667, "loss": 4.3471, "step": 1330 },
    { "epoch": 1.1288064203995674, "grad_norm": 0.6193282008171082, "learning_rate": 0.0005583333333333333, "loss": 4.338, "step": 1340 },
    { "epoch": 1.137230348910012, "grad_norm": 0.6169930696487427, "learning_rate": 0.0005625000000000001, "loss": 4.3365, "step": 1350 },
    { "epoch": 1.137230348910012, "eval_accuracy": 0.2912005471998471, "eval_loss": 4.2970428466796875, "eval_runtime": 875.1704, "eval_samples_per_second": 570.604, "eval_steps_per_second": 5.284, "step": 1350 },
    { "epoch": 1.1456542774204566, "grad_norm": 0.8051270246505737, "learning_rate": 0.0005666666666666667, "loss": 4.3252, "step": 1360 },
    { "epoch": 1.154078205930901, "grad_norm": 0.7985979914665222, "learning_rate": 0.0005708333333333333, "loss": 4.3185, "step": 1370 },
    { "epoch": 1.1625021344413455, "grad_norm": 0.7459626793861389, "learning_rate": 0.000575, "loss": 4.3119, "step": 1380 },
    { "epoch": 1.1709260629517901, "grad_norm": 0.572289228439331, "learning_rate": 0.0005791666666666667, "loss": 4.3066, "step": 1390 },
    { "epoch": 1.1793499914622347, "grad_norm": 0.5565480589866638, "learning_rate": 0.0005833333333333334, "loss": 4.2973, "step": 1400 },
    { "epoch": 1.1877739199726791, "grad_norm": 0.789574384689331, "learning_rate": 0.0005875, "loss": 4.2922, "step": 1410 },
    { "epoch": 1.1961978484831237, "grad_norm": 1.0027601718902588, "learning_rate": 0.0005916666666666667, "loss": 4.2824, "step": 1420 },
    { "epoch": 1.204621776993568, "grad_norm": 0.8137519359588623, "learning_rate": 0.0005958333333333333, "loss": 4.2808, "step": 1430 },
    { "epoch": 1.2130457055040127, "grad_norm": 0.8705686330795288, "learning_rate": 0.0006, "loss": 4.2685, "step": 1440 },
    { "epoch": 1.2130457055040127, "eval_accuracy": 0.2922224943254529, "eval_loss": 4.225285053253174, "eval_runtime": 885.6768, "eval_samples_per_second": 563.835, "eval_steps_per_second": 5.221, "step": 1440 },
    { "epoch": 1.2214696340144573, "grad_norm": 1.0055943727493286, "learning_rate": 0.0006041666666666666, "loss": 4.2639, "step": 1450 },
    { "epoch": 1.229893562524902, "grad_norm": 0.9747255444526672, "learning_rate": 0.0006083333333333333, "loss": 4.2622, "step": 1460 },
    { "epoch": 1.2383174910353463, "grad_norm": 0.6799793243408203, "learning_rate": 0.0006125000000000001, "loss": 4.251, "step": 1470 },
    { "epoch": 1.2467414195457909, "grad_norm": 0.8863984942436218, "learning_rate": 0.0006166666666666667, "loss": 4.2476, "step": 1480 },
    { "epoch": 1.2551653480562355, "grad_norm": 0.891790509223938, "learning_rate": 0.0006208333333333334, "loss": 4.2434, "step": 1490 },
    { "epoch": 1.2635892765666799, "grad_norm": 0.731626033782959, "learning_rate": 0.000625, "loss": 4.233, "step": 1500 },
    { "epoch": 1.2720132050771245, "grad_norm": 0.7038396000862122, "learning_rate": 0.0006291666666666667, "loss": 4.2264, "step": 1510 },
    { "epoch": 1.280437133587569, "grad_norm": 1.0247654914855957, "learning_rate": 0.0006333333333333333, "loss": 4.2198, "step": 1520 },
    { "epoch": 1.2888610620980137, "grad_norm": 1.0854212045669556, "learning_rate": 0.0006374999999999999, "loss": 4.2126, "step": 1530 },
    { "epoch": 1.2888610620980137, "eval_accuracy": 0.2953678601775117, "eval_loss": 4.152132034301758, "eval_runtime": 880.7951, "eval_samples_per_second": 566.96, "eval_steps_per_second": 5.25, "step": 1530 },
    { "epoch": 1.297284990608458, "grad_norm": 0.8179611563682556, "learning_rate": 0.0006416666666666667, "loss": 4.2081, "step": 1540 },
    { "epoch": 1.3057089191189026, "grad_norm": 1.4174506664276123, "learning_rate": 0.0006458333333333334, "loss": 4.2027, "step": 1550 },
    { "epoch": 1.314132847629347, "grad_norm": 1.1611113548278809, "learning_rate": 0.0006500000000000001, "loss": 4.1992, "step": 1560 },
    { "epoch": 1.3225567761397916, "grad_norm": 1.1475598812103271, "learning_rate": 0.0006541666666666667, "loss": 4.1875, "step": 1570 },
    { "epoch": 1.3309807046502362, "grad_norm": 1.158115267753601, "learning_rate": 0.0006583333333333334, "loss": 4.1883, "step": 1580 },
    { "epoch": 1.3394046331606808, "grad_norm": 1.325655221939087, "learning_rate": 0.0006625, "loss": 4.181, "step": 1590 },
    { "epoch": 1.3478285616711254, "grad_norm": 1.077793836593628, "learning_rate": 0.0006666666666666666, "loss": 4.1727, "step": 1600 },
    { "epoch": 1.3562524901815698, "grad_norm": 1.2139134407043457, "learning_rate": 0.0006708333333333333, "loss": 4.1691, "step": 1610 },
    { "epoch": 1.3646764186920144, "grad_norm": 1.075778603553772, "learning_rate": 0.000675, "loss": 4.1563, "step": 1620 },
    { "epoch": 1.3646764186920144, "eval_accuracy": 0.2982954422675167, "eval_loss": 4.0783562660217285, "eval_runtime": 880.4076, "eval_samples_per_second": 567.21, "eval_steps_per_second": 5.252, "step": 1620 },
    { "epoch": 1.3731003472024588, "grad_norm": 1.8017152547836304, "learning_rate": 0.0006791666666666667, "loss": 4.1523, "step": 1630 },
    { "epoch": 1.3815242757129034, "grad_norm": 1.2614473104476929, "learning_rate": 0.0006833333333333333, "loss": 4.1481, "step": 1640 },
    { "epoch": 1.389948204223348, "grad_norm": 1.179167628288269, "learning_rate": 0.0006875, "loss": 4.1421, "step": 1650 },
    { "epoch": 1.3983721327337926, "grad_norm": 1.463998794555664, "learning_rate": 0.0006916666666666667, "loss": 4.1331, "step": 1660 },
    { "epoch": 1.406796061244237, "grad_norm": 1.086358666419983, "learning_rate": 0.0006958333333333334, "loss": 4.1276, "step": 1670 },
    { "epoch": 1.4152199897546816, "grad_norm": 1.3272647857666016, "learning_rate": 0.0007, "loss": 4.1357, "step": 1680 },
    { "epoch": 1.4236439182651262, "grad_norm": 1.4760971069335938, "learning_rate": 0.0007041666666666667, "loss": 4.1299, "step": 1690 },
    { "epoch": 1.4320678467755705, "grad_norm": 1.7591749429702759, "learning_rate": 0.0007083333333333334, "loss": 4.129, "step": 1700 },
    { "epoch": 1.4404917752860151, "grad_norm": 1.7945603132247925, "learning_rate": 0.0007125, "loss": 4.1221, "step": 1710 },
    { "epoch": 1.4404917752860151, "eval_accuracy": 0.3010639405026742, "eval_loss": 4.012106895446777, "eval_runtime": 881.7425, "eval_samples_per_second": 566.351, "eval_steps_per_second": 5.244, "step": 1710 },
    { "epoch": 1.4489157037964597, "grad_norm": 1.7016360759735107, "learning_rate": 0.0007166666666666667, "loss": 4.1043, "step": 1720 },
    { "epoch": 1.4573396323069043, "grad_norm": 1.8240207433700562, "learning_rate": 0.0007208333333333333, "loss": 4.1034, "step": 1730 },
    { "epoch": 1.4657635608173487, "grad_norm": 2.4510786533355713, "learning_rate": 0.000725, "loss": 4.0924, "step": 1740 },
    { "epoch": 1.4741874893277933, "grad_norm": 1.7411324977874756, "learning_rate": 0.0007291666666666666, "loss": 4.1041, "step": 1750 },
    { "epoch": 1.4826114178382377, "grad_norm": 1.1133612394332886, "learning_rate": 0.0007333333333333333, "loss": 4.1064, "step": 1760 },
    { "epoch": 1.4910353463486823, "grad_norm": 1.3936740159988403, "learning_rate": 0.0007375000000000001, "loss": 4.0954, "step": 1770 },
    { "epoch": 1.499459274859127, "grad_norm": 2.3855819702148438, "learning_rate": 0.0007416666666666667, "loss": 4.0836, "step": 1780 },
    { "epoch": 1.5078832033695715, "grad_norm": 1.2734453678131104, "learning_rate": 0.0007458333333333334, "loss": 4.0834, "step": 1790 },
    { "epoch": 1.516307131880016, "grad_norm": 1.432719349861145, "learning_rate": 0.00075, "loss": 4.0711, "step": 1800 },
    { "epoch": 1.516307131880016, "eval_accuracy": 0.3055703004736556, "eval_loss": 3.976287841796875, "eval_runtime": 881.3595, "eval_samples_per_second": 566.597, "eval_steps_per_second": 5.246, "step": 1800 },
    { "epoch": 1.5247310603904605, "grad_norm": 1.5839996337890625, "learning_rate": 0.0007541666666666667, "loss": 4.0712, "step": 1810 },
    { "epoch": 1.5331549889009048, "grad_norm": 3.0461270809173584, "learning_rate": 0.0007583333333333333, "loss": 4.0617, "step": 1820 },
    { "epoch": 1.5415789174113494, "grad_norm": 1.760568380355835, "learning_rate": 0.0007624999999999999, "loss": 4.0486, "step": 1830 },
    { "epoch": 1.550002845921794, "grad_norm": 1.6682184934616089, "learning_rate": 0.0007666666666666667, "loss": 4.0034, "step": 1840 },
    { "epoch": 1.5584267744322386, "grad_norm": 1.4350653886795044, "learning_rate": 0.0007708333333333334, "loss": 3.9644, "step": 1850 },
    { "epoch": 1.5668507029426832, "grad_norm": 1.4870712757110596, "learning_rate": 0.0007750000000000001, "loss": 3.9314, "step": 1860 },
    { "epoch": 1.5752746314531276, "grad_norm": 1.7954463958740234, "learning_rate": 0.0007791666666666667, "loss": 3.8939, "step": 1870 },
    { "epoch": 1.5836985599635722, "grad_norm": 2.1485602855682373, "learning_rate": 0.0007833333333333334, "loss": 3.8576, "step": 1880 },
    { "epoch": 1.5921224884740166, "grad_norm": 1.647570252418518, "learning_rate": 0.0007875, "loss": 3.8159, "step": 1890 },
    { "epoch": 1.5921224884740166, "eval_accuracy": 0.3353472770952767, "eval_loss": 3.6341910362243652, "eval_runtime": 881.1424, "eval_samples_per_second": 566.737, "eval_steps_per_second": 5.248, "step": 1890 },
    { "epoch": 1.6005464169844612, "grad_norm": 1.7171742916107178, "learning_rate": 0.0007916666666666666, "loss": 3.7812, "step": 1900 },
    { "epoch": 1.6089703454949058, "grad_norm": 2.12190580368042, "learning_rate": 0.0007958333333333333, "loss": 3.7402, "step": 1910 },
    { "epoch": 1.6173942740053504, "grad_norm": 1.7334414720535278, "learning_rate": 0.0008, "loss": 3.7025, "step": 1920 },
    { "epoch": 1.625818202515795, "grad_norm": 1.8880668878555298, "learning_rate": 0.0008041666666666667, "loss": 3.6808, "step": 1930 },
    { "epoch": 1.6342421310262394, "grad_norm": 2.3294591903686523, "learning_rate": 0.0008083333333333333, "loss": 3.6419, "step": 1940 },
    { "epoch": 1.642666059536684, "grad_norm": 2.4122796058654785, "learning_rate": 0.0008125000000000001, "loss": 3.6114, "step": 1950 },
    { "epoch": 1.6510899880471284, "grad_norm": 2.090388774871826, "learning_rate": 0.0008166666666666667, "loss": 3.5867, "step": 1960 },
    { "epoch": 1.659513916557573, "grad_norm": 2.267676830291748, "learning_rate": 0.0008208333333333334, "loss": 3.5501, "step": 1970 },
    { "epoch": 1.6679378450680176, "grad_norm": 2.253739833831787, "learning_rate": 0.000825, "loss": 3.5114, "step": 1980 },
    { "epoch": 1.6679378450680176, "eval_accuracy": 0.38861593633258434, "eval_loss": 3.2597665786743164, "eval_runtime": 889.3264, "eval_samples_per_second": 561.522, "eval_steps_per_second": 5.199, "step": 1980 },
    { "epoch": 1.6763617735784622, "grad_norm": 2.269505739212036, "learning_rate": 0.0008291666666666667, "loss": 3.4854, "step": 1990 },
    { "epoch": 1.6847857020889065, "grad_norm": 1.7237802743911743, "learning_rate": 0.0008333333333333334, "loss": 3.4651, "step": 2000 },
    { "epoch": 1.6932096305993511, "grad_norm": 2.1117663383483887, "learning_rate": 0.0008375, "loss": 3.4558, "step": 2010 },
    { "epoch": 1.7016335591097955, "grad_norm": 2.1351046562194824, "learning_rate": 0.0008416666666666667, "loss": 3.4256, "step": 2020 },
    { "epoch": 1.7100574876202401, "grad_norm": 2.326232671737671, "learning_rate": 0.0008458333333333333, "loss": 3.3998, "step": 2030 },
    { "epoch": 1.7184814161306847, "grad_norm": 2.1802730560302734, "learning_rate": 0.00085, "loss": 3.3865, "step": 2040 },
    { "epoch": 1.7269053446411293, "grad_norm": 2.042966604232788, "learning_rate": 0.0008541666666666666, "loss": 3.3539, "step": 2050 },
    { "epoch": 1.735329273151574, "grad_norm": 2.052464008331299, "learning_rate": 0.0008583333333333333, "loss": 3.3308, "step": 2060 },
    { "epoch": 1.7437532016620183, "grad_norm": 1.5790934562683105, "learning_rate": 0.0008625000000000001, "loss": 3.3122, "step": 2070 },
    { "epoch": 1.7437532016620183, "eval_accuracy": 0.41178756961484836, "eval_loss": 3.0882680416107178, "eval_runtime": 878.4742, "eval_samples_per_second": 568.458, "eval_steps_per_second": 5.264, "step": 2070 },
    { "epoch": 1.752177130172463, "grad_norm": 2.2859761714935303, "learning_rate": 0.0008666666666666667, "loss": 3.3034, "step": 2080 },
    { "epoch": 1.7606010586829073, "grad_norm": 2.912191867828369, "learning_rate": 0.0008708333333333334, "loss": 3.289, "step": 2090 },
    { "epoch": 1.7690249871933519, "grad_norm": 2.143118143081665, "learning_rate": 0.000875, "loss": 3.2547, "step": 2100 },
    { "epoch": 1.7774489157037965, "grad_norm": 1.8577404022216797, "learning_rate": 0.0008791666666666667, "loss": 3.2383, "step": 2110 },
    { "epoch": 1.785872844214241, "grad_norm": 1.9692562818527222, "learning_rate": 0.0008833333333333333, "loss": 3.2137, "step": 2120 },
    { "epoch": 1.7942967727246857, "grad_norm": 1.938915729522705, "learning_rate": 0.0008874999999999999, "loss": 3.1909, "step": 2130 },
    { "epoch": 1.80272070123513, "grad_norm": 1.395321011543274, "learning_rate": 0.0008916666666666667, "loss": 3.1346, "step": 2140 },
    { "epoch": 1.8111446297455744, "grad_norm": 1.8771544694900513, "learning_rate": 0.0008958333333333334, "loss": 3.1035, "step": 2150 },
    { "epoch": 1.819568558256019, "grad_norm": 1.5829336643218994, "learning_rate": 0.0009000000000000001, "loss": 3.0328, "step": 2160 },
    { "epoch": 1.819568558256019, "eval_accuracy": 0.45304088376136725, "eval_loss": 2.8062996864318848, "eval_runtime": 886.0675, "eval_samples_per_second": 563.587, "eval_steps_per_second": 5.219, "step": 2160 },
    { "epoch": 1.8279924867664636, "grad_norm": 1.5085866451263428, "learning_rate": 0.0009041666666666667, "loss": 3.0089, "step": 2170 },
    { "epoch": 1.8364164152769082, "grad_norm": 1.4988549947738647, "learning_rate": 0.0009083333333333334, "loss": 2.9786, "step": 2180 },
    { "epoch": 1.8448403437873528, "grad_norm": 1.5726799964904785, "learning_rate": 0.0009125, "loss": 2.936, "step": 2190 },
    { "epoch": 1.8532642722977972, "grad_norm": 1.2175358533859253, "learning_rate": 0.0009166666666666666, "loss": 2.8996, "step": 2200 },
    { "epoch": 1.8616882008082418, "grad_norm": 1.4195218086242676, "learning_rate": 0.0009208333333333333, "loss": 2.8664, "step": 2210 },
    { "epoch": 1.8701121293186862, "grad_norm": 1.1213312149047852, "learning_rate": 0.000925, "loss": 2.8382, "step": 2220 },
    { "epoch": 1.8785360578291308, "grad_norm": 1.169554591178894, "learning_rate": 0.0009291666666666667, "loss": 2.8026, "step": 2230 },
    { "epoch": 1.8869599863395754, "grad_norm": 1.4759305715560913, "learning_rate": 0.0009333333333333333, "loss": 2.7654, "step": 2240 },
    { "epoch": 1.89538391485002, "grad_norm": 1.3071763515472412, "learning_rate": 0.0009375, "loss": 2.7311, "step": 2250 },
    { "epoch": 1.89538391485002, "eval_accuracy": 0.4917409385648686, "eval_loss": 2.5433878898620605, "eval_runtime": 879.3794, "eval_samples_per_second": 567.873, "eval_steps_per_second": 5.258, "step": 2250 },
    { "epoch": 1.9038078433604646, "grad_norm": 0.9968194961547852, "learning_rate": 0.0009416666666666667, "loss": 2.7044, "step": 2260 },
    { "epoch": 1.912231771870909, "grad_norm": 1.1783692836761475, "learning_rate": 0.0009458333333333334, "loss": 2.6819, "step": 2270 },
    { "epoch": 1.9206557003813534, "grad_norm": 0.9856918454170227, "learning_rate": 0.00095, "loss": 2.6528, "step": 2280 },
    { "epoch": 1.929079628891798, "grad_norm": 1.0605028867721558, "learning_rate": 0.0009541666666666667, "loss": 2.6226, "step": 2290 },
    { "epoch": 1.9375035574022426, "grad_norm": 0.8553977608680725, "learning_rate": 0.0009583333333333334, "loss": 2.608, "step": 2300 },
    { "epoch": 1.9459274859126872, "grad_norm": 0.9543612599372864, "learning_rate": 0.0009625, "loss": 2.5865, "step": 2310 },
    { "epoch": 1.9543514144231318, "grad_norm": 1.1085282564163208, "learning_rate": 0.0009666666666666667, "loss": 2.5586, "step": 2320 },
    { "epoch": 1.9627753429335761, "grad_norm": 0.8689624667167664, "learning_rate": 0.0009708333333333333, "loss": 2.541, "step": 2330 },
    { "epoch": 1.9711992714440207, "grad_norm": 0.6790447235107422, "learning_rate": 0.000975, "loss": 2.5214, "step": 2340 },
    { "epoch": 1.9711992714440207, "eval_accuracy": 0.5198810557311793, "eval_loss": 2.3582663536071777, "eval_runtime": 891.4654, "eval_samples_per_second": 560.174, "eval_steps_per_second": 5.187, "step": 2340 },
    { "epoch": 1.9796231999544651, "grad_norm": 1.1572414636611938, "learning_rate": 0.0009791666666666666, "loss": 2.5126, "step": 2350 },
    { "epoch": 1.9880471284649097, "grad_norm": 0.8218650221824646, "learning_rate": 0.0009833333333333332, "loss": 2.4903, "step": 2360 },
    { "epoch": 1.9964710569753543, "grad_norm": 0.9195880889892578, "learning_rate": 0.0009875, "loss": 2.479, "step": 2370 },
    { "epoch": 2.004894985485799, "grad_norm": 0.6436383724212646, "learning_rate": 0.0009916666666666667, "loss": 2.4509, "step": 2380 },
    { "epoch": 2.0133189139962435, "grad_norm": 0.9757860898971558, "learning_rate": 0.0009958333333333334, "loss": 2.453, "step": 2390 },
    { "epoch": 2.021742842506688, "grad_norm": 0.8884423971176147, "learning_rate": 0.001, "loss": 2.428, "step": 2400 },
    { "epoch": 2.0301667710171323, "grad_norm": 1.097330093383789, "learning_rate": 0.000999009900990099, "loss": 2.4139, "step": 2410 },
    { "epoch": 2.038590699527577, "grad_norm": 1.095337152481079, "learning_rate": 0.0009980198019801981, "loss": 2.4024, "step": 2420 },
    { "epoch": 2.0470146280380215, "grad_norm": 1.0757551193237305, "learning_rate": 0.000997029702970297, "loss": 2.3853, "step": 2430 },
    { "epoch": 2.0470146280380215, "eval_accuracy": 0.538133837771306, "eval_loss": 2.2352097034454346, "eval_runtime": 883.4374, "eval_samples_per_second": 565.265, "eval_steps_per_second": 5.234, "step": 2430 },
    { "epoch": 2.055438556548466, "grad_norm": 0.9356153011322021, "learning_rate": 0.000996039603960396, "loss": 2.3669, "step": 2440 },
    { "epoch": 2.0638624850589107, "grad_norm": 0.8463107347488403, "learning_rate": 0.000995049504950495, "loss": 2.3604, "step": 2450 },
    { "epoch": 2.0722864135693553, "grad_norm": 0.8833483457565308, "learning_rate": 0.0009940594059405941, "loss": 2.3574, "step": 2460 },
    { "epoch": 2.0807103420797994, "grad_norm": 0.7081923484802246, "learning_rate": 0.0009930693069306932, "loss": 2.3338, "step": 2470 },
    { "epoch": 2.089134270590244, "grad_norm": 0.5993143916130066, "learning_rate": 0.000992079207920792, "loss": 2.3219, "step": 2480 },
    { "epoch": 2.0975581991006886, "grad_norm": 0.8431512117385864, "learning_rate": 0.000991089108910891, "loss": 2.3108, "step": 2490 },
    { "epoch": 2.1059821276111332, "grad_norm": 0.9983824491500854, "learning_rate": 0.0009900990099009901, "loss": 2.305, "step": 2500 },
    { "epoch": 2.114406056121578, "grad_norm": 0.6354156732559204, "learning_rate": 0.0009891089108910892, "loss": 2.2965, "step": 2510 },
    { "epoch": 2.1228299846320224, "grad_norm": 0.8491016626358032, "learning_rate": 0.0009881188118811882, "loss": 2.2763, "step": 2520 },
    { "epoch": 2.1228299846320224, "eval_accuracy": 0.5540495533549666, "eval_loss": 2.135758399963379, "eval_runtime": 895.5557, "eval_samples_per_second": 557.616, "eval_steps_per_second": 5.163, "step": 2520 },
    { "epoch": 2.131253913142467, "grad_norm": 0.6909253001213074, "learning_rate": 0.000987128712871287, "loss": 2.2696, "step": 2530 },
    { "epoch": 2.139677841652911, "grad_norm": 0.5072851181030273, "learning_rate": 0.000986138613861386, "loss": 2.2555, "step": 2540 },
    { "epoch": 2.148101770163356, "grad_norm": 0.7575969696044922, "learning_rate": 0.0009851485148514852, "loss": 2.2552, "step": 2550 },
    { "epoch": 2.1565256986738004, "grad_norm": 0.7418563365936279, "learning_rate": 0.0009841584158415842, "loss": 2.2439, "step": 2560 },
    { "epoch": 2.164949627184245, "grad_norm": 0.5893211960792542, "learning_rate": 0.0009831683168316833, "loss": 2.2282, "step": 2570 },
    { "epoch": 2.1733735556946896, "grad_norm": 0.892035186290741, "learning_rate": 0.000982178217821782, "loss": 2.2201, "step": 2580 },
    { "epoch": 2.181797484205134, "grad_norm": 0.688275933265686, "learning_rate": 0.0009811881188118811, "loss": 2.2174, "step": 2590 },
    { "epoch": 2.1902214127155784, "grad_norm": 0.5092687010765076, "learning_rate": 0.0009801980198019802, "loss": 2.2032, "step": 2600 },
    { "epoch": 2.198645341226023, "grad_norm": 0.6715185642242432, "learning_rate": 0.0009792079207920793, "loss": 2.189, "step": 2610 },
    { "epoch": 2.198645341226023, "eval_accuracy": 0.5674450081410035, "eval_loss": 2.053079605102539, "eval_runtime": 876.7453, "eval_samples_per_second": 569.579, "eval_steps_per_second": 5.274, "step": 2610 },
    { "epoch": 2.2070692697364676, "grad_norm": 0.5717750191688538, "learning_rate": 0.0009782178217821783, "loss": 2.1894, "step": 2620 },
    { "epoch": 2.215493198246912, "grad_norm": 0.7002500295639038, "learning_rate": 0.0009772277227722771, "loss": 2.1851, "step": 2630 },
    { "epoch": 2.2239171267573568, "grad_norm": 0.6041799783706665, "learning_rate": 0.0009762376237623762, "loss": 2.1899, "step": 2640 },
    { "epoch": 2.2323410552678014, "grad_norm": 0.40263745188713074, "learning_rate": 0.0009752475247524752, "loss": 2.1633, "step": 2650 },
    { "epoch": 2.240764983778246, "grad_norm": 0.47779303789138794, "learning_rate": 0.0009742574257425743, "loss": 2.1478, "step": 2660 },
    { "epoch": 2.24918891228869, "grad_norm": 0.8906975984573364, "learning_rate": 0.0009732673267326732, "loss": 2.1508, "step": 2670 },
    { "epoch": 2.2576128407991347, "grad_norm": 0.4588846266269684, "learning_rate": 0.0009722772277227723, "loss": 2.1422, "step": 2680 },
    { "epoch": 2.2660367693095793, "grad_norm": 0.6038916707038879, "learning_rate": 0.0009712871287128712, "loss": 2.1229, "step": 2690 },
    { "epoch": 2.274460697820024, "grad_norm": 0.792378842830658, "learning_rate": 0.0009702970297029703, "loss": 2.1262, "step": 2700 },
    { "epoch": 2.274460697820024, "eval_accuracy": 0.5767164906847645, "eval_loss": 1.9968212842941284, "eval_runtime": 890.0794, "eval_samples_per_second": 561.047, "eval_steps_per_second": 5.195, "step": 2700 },
    { "epoch": 2.2828846263304685, "grad_norm": 0.5215600728988647, "learning_rate": 0.0009693069306930693, "loss": 2.1315, "step": 2710 },
    { "epoch": 2.291308554840913, "grad_norm": 0.42443060874938965, "learning_rate": 0.0009683168316831683, "loss": 2.1075, "step": 2720 },
    { "epoch": 2.2997324833513577, "grad_norm": 0.7379765510559082, "learning_rate": 0.0009673267326732673, "loss": 2.0997, "step": 2730 },
    { "epoch": 2.308156411861802, "grad_norm": 0.532883882522583, "learning_rate": 0.0009663366336633663, "loss": 2.1009, "step": 2740 },
    { "epoch": 2.3165803403722465, "grad_norm": 0.4312550127506256, "learning_rate": 0.0009653465346534653, "loss": 2.0836, "step": 2750 },
    { "epoch": 2.325004268882691, "grad_norm": 0.42506101727485657, "learning_rate": 0.0009643564356435644, "loss": 2.0751, "step": 2760 },
    { "epoch": 2.3334281973931357, "grad_norm": 0.9728929400444031, "learning_rate": 0.0009633663366336633, "loss": 2.0755, "step": 2770 },
    { "epoch": 2.3418521259035803, "grad_norm": 0.4502295255661011, "learning_rate": 0.0009623762376237624, "loss": 2.0757, "step": 2780 },
    { "epoch": 2.350276054414025, "grad_norm": 0.6825786232948303, "learning_rate": 0.0009613861386138613, "loss": 2.0593, "step": 2790 },
    { "epoch": 2.350276054414025, "eval_accuracy": 0.5877788692302428, "eval_loss": 1.932070255279541, "eval_runtime": 877.2049, "eval_samples_per_second": 569.281, "eval_steps_per_second": 5.271, "step": 2790 },
    { "epoch": 2.3586999829244695, "grad_norm": 0.5142760276794434, "learning_rate": 0.0009603960396039604, "loss": 2.0529, "step": 2800 },
    { "epoch": 2.3671239114349136, "grad_norm": 0.613132119178772, "learning_rate": 0.0009594059405940594, "loss": 2.0423, "step": 2810 },
    { "epoch": 2.3755478399453582, "grad_norm": 0.7282253503799438, "learning_rate": 0.0009584158415841584, "loss": 2.0522, "step": 2820 },
    { "epoch": 2.383971768455803, "grad_norm": 0.37959426641464233, "learning_rate": 0.0009574257425742574, "loss": 2.0367, "step": 2830 },
    { "epoch": 2.3923956969662474, "grad_norm": 0.35326164960861206, "learning_rate": 0.0009564356435643564, "loss": 2.0233, "step": 2840 },
    { "epoch": 2.400819625476692, "grad_norm": 0.8196151256561279, "learning_rate": 0.0009554455445544554, "loss": 2.0264, "step": 2850 },
    { "epoch": 2.409243553987136, "grad_norm": 0.7122208476066589, "learning_rate": 0.0009544554455445545, "loss": 2.0308, "step": 2860 },
    { "epoch": 2.417667482497581, "grad_norm": 0.35665011405944824, "learning_rate": 0.0009534653465346534, "loss": 2.0133, "step": 2870 },
    { "epoch": 2.4260914110080254, "grad_norm": 0.3755228519439697, "learning_rate": 0.0009524752475247525, "loss": 1.9992, "step": 2880 },
    { "epoch": 2.4260914110080254, "eval_accuracy": 0.596780331496744, "eval_loss": 1.8819479942321777, "eval_runtime": 890.4504, "eval_samples_per_second": 560.813, "eval_steps_per_second": 5.193, "step": 2880 },
    { "epoch": 2.43451533951847, "grad_norm": 0.7018378376960754, "learning_rate": 0.0009514851485148514, "loss": 2.0013, "step": 2890 },
    { "epoch": 2.4429392680289146, "grad_norm": 0.4874301850795746, "learning_rate": 0.0009504950495049505, "loss": 1.9971, "step": 2900 },
    { "epoch": 2.451363196539359, "grad_norm": 0.45909377932548523, "learning_rate": 0.0009495049504950495, "loss": 1.9881, "step": 2910 },
    { "epoch": 2.459787125049804, "grad_norm": 0.4965904951095581, "learning_rate": 0.0009485148514851485, "loss": 1.989, "step": 2920 },
    { "epoch": 2.468211053560248, "grad_norm": 0.4780527949333191, "learning_rate": 0.0009475247524752475, "loss": 1.9795, "step": 2930 },
    { "epoch": 2.4766349820706925, "grad_norm": 0.5145118236541748, "learning_rate": 0.0009465346534653465, "loss": 1.973, "step": 2940 },
    { "epoch": 2.485058910581137, "grad_norm": 0.5469622015953064, "learning_rate": 0.0009455445544554455, "loss": 1.9692, "step": 2950 },
    { "epoch": 2.4934828390915817, "grad_norm": 0.5788788199424744, "learning_rate": 0.0009445544554455446, "loss": 1.9627, "step": 2960 },
    { "epoch": 2.5019067676020263, "grad_norm": 0.5380696654319763, "learning_rate": 0.0009435643564356435, "loss": 1.9624, "step": 2970 },
    { "epoch": 2.5019067676020263, "eval_accuracy": 0.6028271764812113, "eval_loss": 1.8441975116729736, "eval_runtime": 877.1334, "eval_samples_per_second": 569.327, "eval_steps_per_second": 5.272, "step": 2970 },
    { "epoch": 2.510330696112471, "grad_norm": 0.4939862787723541, "learning_rate": 0.0009425742574257426, "loss": 1.9576, "step": 2980 },
    { "epoch": 2.5187546246229155, "grad_norm": 0.4804815649986267, "learning_rate": 0.0009415841584158415, "loss": 1.948, "step": 2990 },
    { "epoch": 2.5271785531333597, "grad_norm": 0.529515266418457, "learning_rate": 0.0009405940594059406, "loss": 1.9414, "step": 3000 },
    { "epoch": 2.5356024816438043, "grad_norm": 0.5104151964187622, "learning_rate": 0.0009396039603960396, "loss": 1.9472, "step": 3010 },
    { "epoch": 2.544026410154249, "grad_norm": 0.36934202909469604, "learning_rate": 0.0009386138613861386, "loss": 1.9358, "step": 3020 },
    { "epoch": 2.5524503386646935, "grad_norm": 0.5956403017044067, "learning_rate": 0.0009376237623762376, "loss": 1.9272, "step": 3030 },
    { "epoch": 2.560874267175138, "grad_norm": 0.5035738348960876, "learning_rate": 0.0009366336633663367, "loss": 1.934, "step": 3040 },
    { "epoch": 2.5692981956855827, "grad_norm": 0.44133296608924866, "learning_rate": 0.0009356435643564357, "loss": 1.9192, "step": 3050 },
    { "epoch": 2.5777221241960273, "grad_norm": 0.617588996887207, "learning_rate": 0.0009346534653465348, "loss": 1.9189, "step": 3060 },
    { "epoch": 2.5777221241960273, "eval_accuracy": 0.6097417836200192, "eval_loss": 1.806692123413086, "eval_runtime": 890.173, "eval_samples_per_second": 560.988, "eval_steps_per_second": 5.194, "step": 3060 },
    { "epoch": 2.5861460527064715, "grad_norm": 0.4702962338924408, "learning_rate": 0.0009336633663366337, "loss": 1.9145, "step": 3070 },
    { "epoch": 2.594569981216916, "grad_norm": 0.37163108587265015, "learning_rate": 0.0009326732673267328, "loss": 1.907, "step": 3080 },
    { "epoch": 2.6029939097273607, "grad_norm": 0.8039525151252747, "learning_rate": 0.0009316831683168317, "loss": 1.9071, "step": 3090 },
    { "epoch": 2.6114178382378053, "grad_norm": 0.3594844341278076, "learning_rate": 0.0009306930693069308, "loss": 1.9109, "step": 3100 },
    { "epoch": 2.61984176674825, "grad_norm": 0.44677871465682983, "learning_rate": 0.0009297029702970298, "loss": 1.8948, "step": 3110 },
    { "epoch": 2.628265695258694, "grad_norm": 0.4496874511241913, "learning_rate": 0.0009287128712871288, "loss": 1.893, "step": 3120 },
    { "epoch": 2.636689623769139, "grad_norm": 0.44437769055366516, "learning_rate": 0.0009277227722772278, "loss": 1.8891, "step": 3130 },
    { "epoch": 2.6451135522795832, "grad_norm": 0.47511276602745056, "learning_rate": 0.0009267326732673268, "loss": 1.8828, "step": 3140 },
    { "epoch": 2.653537480790028, "grad_norm": 0.5357436537742615, "learning_rate": 0.0009257425742574258, "loss": 1.8802, "step": 3150 },
    { "epoch": 2.653537480790028, "eval_accuracy": 0.6167399590165771, "eval_loss": 1.7698620557785034, "eval_runtime": 887.5592, "eval_samples_per_second": 562.64, "eval_steps_per_second": 5.21, "step": 3150 },
    { "epoch": 2.6619614093004724, "grad_norm": 0.5014392137527466, "learning_rate": 0.0009247524752475249, "loss": 1.8819, "step": 3160 },
    { "epoch": 2.670385337810917, "grad_norm": 0.41872531175613403, "learning_rate": 0.0009237623762376238, "loss": 1.8736, "step": 3170 },
    { "epoch": 2.6788092663213616, "grad_norm": 0.4343492388725281, "learning_rate": 0.0009227722772277229, "loss": 1.8659, "step": 3180 },
    { "epoch": 2.687233194831806, "grad_norm": 0.45470404624938965, "learning_rate": 0.0009217821782178218, "loss": 1.8689, "step": 3190 },
    { "epoch": 2.695657123342251, "grad_norm": 0.4626518487930298,
"learning_rate": 0.0009207920792079209, |
|
"loss": 1.8606, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 2.704081051852695, |
|
"grad_norm": 0.4213305711746216, |
|
"learning_rate": 0.0009198019801980199, |
|
"loss": 1.8587, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 2.7125049803631396, |
|
"grad_norm": 0.5036765336990356, |
|
"learning_rate": 0.0009188118811881188, |
|
"loss": 1.8514, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 2.720928908873584, |
|
"grad_norm": 0.4738876223564148, |
|
"learning_rate": 0.0009178217821782179, |
|
"loss": 1.8506, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 2.729352837384029, |
|
"grad_norm": 0.3712784945964813, |
|
"learning_rate": 0.0009168316831683168, |
|
"loss": 1.8461, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 2.729352837384029, |
|
"eval_accuracy": 0.6231111347423419, |
|
"eval_loss": 1.7313838005065918, |
|
"eval_runtime": 889.784, |
|
"eval_samples_per_second": 561.233, |
|
"eval_steps_per_second": 5.197, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 2.7377767658944734, |
|
"grad_norm": 0.45651596784591675, |
|
"learning_rate": 0.0009158415841584159, |
|
"loss": 1.8405, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 2.7462006944049175, |
|
"grad_norm": 0.5253742933273315, |
|
"learning_rate": 0.000914851485148515, |
|
"loss": 1.839, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 2.754624622915362, |
|
"grad_norm": 0.4810900390148163, |
|
"learning_rate": 0.0009138613861386139, |
|
"loss": 1.8352, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 2.7630485514258067, |
|
"grad_norm": 0.42353251576423645, |
|
"learning_rate": 0.0009128712871287129, |
|
"loss": 1.8308, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 2.7714724799362513, |
|
"grad_norm": 0.34494903683662415, |
|
"learning_rate": 0.0009118811881188119, |
|
"loss": 1.8271, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 2.779896408446696, |
|
"grad_norm": 0.44857293367385864, |
|
"learning_rate": 0.0009108910891089109, |
|
"loss": 1.8272, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 2.7883203369571405, |
|
"grad_norm": 0.32810303568840027, |
|
"learning_rate": 0.00090990099009901, |
|
"loss": 1.8201, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 2.796744265467585, |
|
"grad_norm": 0.5814313292503357, |
|
"learning_rate": 0.0009089108910891089, |
|
"loss": 1.8181, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 2.8051681939780293, |
|
"grad_norm": 0.6469531655311584, |
|
"learning_rate": 0.000907920792079208, |
|
"loss": 1.8228, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 2.8051681939780293, |
|
"eval_accuracy": 0.627194729904968, |
|
"eval_loss": 1.7094751596450806, |
|
"eval_runtime": 879.8799, |
|
"eval_samples_per_second": 567.55, |
|
"eval_steps_per_second": 5.255, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 2.813592122488474, |
|
"grad_norm": 0.37370234727859497, |
|
"learning_rate": 0.0009069306930693069, |
|
"loss": 1.8143, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 2.8220160509989185, |
|
"grad_norm": 0.2818905711174011, |
|
"learning_rate": 0.000905940594059406, |
|
"loss": 1.8058, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 2.830439979509363, |
|
"grad_norm": 0.40032240748405457, |
|
"learning_rate": 0.000904950495049505, |
|
"loss": 1.8037, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 2.8388639080198077, |
|
"grad_norm": 0.4075703024864197, |
|
"learning_rate": 0.000903960396039604, |
|
"loss": 1.8042, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 2.8472878365302523, |
|
"grad_norm": 0.4188884496688843, |
|
"learning_rate": 0.000902970297029703, |
|
"loss": 1.7954, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 2.855711765040697, |
|
"grad_norm": 0.40151095390319824, |
|
"learning_rate": 0.000901980198019802, |
|
"loss": 1.8, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 2.864135693551141, |
|
"grad_norm": 0.38640516996383667, |
|
"learning_rate": 0.000900990099009901, |
|
"loss": 1.7897, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 2.8725596220615857, |
|
"grad_norm": 0.46775710582733154, |
|
"learning_rate": 0.0009000000000000001, |
|
"loss": 1.7889, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 2.8809835505720303, |
|
"grad_norm": 0.5004317760467529, |
|
"learning_rate": 0.000899009900990099, |
|
"loss": 1.7838, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 2.8809835505720303, |
|
"eval_accuracy": 0.6330453392339891, |
|
"eval_loss": 1.6756778955459595, |
|
"eval_runtime": 890.43, |
|
"eval_samples_per_second": 560.826, |
|
"eval_steps_per_second": 5.193, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 2.889407479082475, |
|
"grad_norm": 0.44054290652275085, |
|
"learning_rate": 0.0008980198019801981, |
|
"loss": 1.7839, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 2.8978314075929195, |
|
"grad_norm": 0.38003844022750854, |
|
"learning_rate": 0.000897029702970297, |
|
"loss": 1.7793, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 2.9062553361033636, |
|
"grad_norm": 0.3714471757411957, |
|
"learning_rate": 0.0008960396039603961, |
|
"loss": 1.7765, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 2.9146792646138087, |
|
"grad_norm": 0.4955293834209442, |
|
"learning_rate": 0.0008950495049504951, |
|
"loss": 1.7729, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 2.923103193124253, |
|
"grad_norm": 0.367481529712677, |
|
"learning_rate": 0.0008940594059405941, |
|
"loss": 1.7666, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 2.9315271216346974, |
|
"grad_norm": 0.48372742533683777, |
|
"learning_rate": 0.0008930693069306931, |
|
"loss": 1.7638, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 2.939951050145142, |
|
"grad_norm": 0.5356625318527222, |
|
"learning_rate": 0.0008920792079207921, |
|
"loss": 1.7625, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 2.9483749786555866, |
|
"grad_norm": 0.396090030670166, |
|
"learning_rate": 0.0008910891089108911, |
|
"loss": 1.7597, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 2.956798907166031, |
|
"grad_norm": 0.3071458041667938, |
|
"learning_rate": 0.0008900990099009902, |
|
"loss": 1.7513, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 2.956798907166031, |
|
"eval_accuracy": 0.640630813225039, |
|
"eval_loss": 1.6351577043533325, |
|
"eval_runtime": 887.1061, |
|
"eval_samples_per_second": 562.927, |
|
"eval_steps_per_second": 5.212, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 2.9652228356764754, |
|
"grad_norm": 0.7265316247940063, |
|
"learning_rate": 0.0008891089108910891, |
|
"loss": 1.7482, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 2.97364676418692, |
|
"grad_norm": 0.34152501821517944, |
|
"learning_rate": 0.0008881188118811882, |
|
"loss": 1.7454, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 2.9820706926973646, |
|
"grad_norm": 0.5570985078811646, |
|
"learning_rate": 0.0008871287128712871, |
|
"loss": 1.736, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 2.990494621207809, |
|
"grad_norm": 0.29268133640289307, |
|
"learning_rate": 0.0008861386138613862, |
|
"loss": 1.7323, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 2.998918549718254, |
|
"grad_norm": 0.4475082755088806, |
|
"learning_rate": 0.0008851485148514852, |
|
"loss": 1.7207, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 3.0073424782286984, |
|
"grad_norm": 0.39963921904563904, |
|
"learning_rate": 0.0008841584158415842, |
|
"loss": 1.7199, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 3.015766406739143, |
|
"grad_norm": 0.3290662169456482, |
|
"learning_rate": 0.0008831683168316832, |
|
"loss": 1.7103, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 3.024190335249587, |
|
"grad_norm": 0.4892579913139343, |
|
"learning_rate": 0.0008821782178217822, |
|
"loss": 1.7024, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 3.0326142637600317, |
|
"grad_norm": 0.45102205872535706, |
|
"learning_rate": 0.0008811881188118812, |
|
"loss": 1.7012, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 3.0326142637600317, |
|
"eval_accuracy": 0.65292687328356, |
|
"eval_loss": 1.578561544418335, |
|
"eval_runtime": 889.1801, |
|
"eval_samples_per_second": 561.614, |
|
"eval_steps_per_second": 5.2, |
|
"step": 3600 |
|
}, |
|
{ "epoch": 3.0410381922704763, "grad_norm": 0.38877975940704346, "learning_rate": 0.0008801980198019803, "loss": 1.6999, "step": 3610 },
{ "epoch": 3.049462120780921, "grad_norm": 0.32052722573280334, "learning_rate": 0.0008792079207920792, "loss": 1.6898, "step": 3620 },
{ "epoch": 3.0578860492913655, "grad_norm": 0.4076586365699768, "learning_rate": 0.0008782178217821783, "loss": 1.682, "step": 3630 },
{ "epoch": 3.06630997780181, "grad_norm": 0.3886164724826813, "learning_rate": 0.0008772277227722772, "loss": 1.6788, "step": 3640 },
{ "epoch": 3.0747339063122547, "grad_norm": 0.43478402495384216, "learning_rate": 0.0008762376237623763, "loss": 1.6757, "step": 3650 },
{ "epoch": 3.083157834822699, "grad_norm": 0.3681798279285431, "learning_rate": 0.0008752475247524753, "loss": 1.6725, "step": 3660 },
{ "epoch": 3.0915817633331435, "grad_norm": 0.44459056854248047, "learning_rate": 0.0008742574257425743, "loss": 1.6653, "step": 3670 },
{ "epoch": 3.100005691843588, "grad_norm": 0.3404163420200348, "learning_rate": 0.0008732673267326733, "loss": 1.6597, "step": 3680 },
{ "epoch": 3.1084296203540327, "grad_norm": 0.39622583985328674, "learning_rate": 0.0008722772277227722, "loss": 1.664, "step": 3690 },
{ "epoch": 3.1084296203540327, "eval_accuracy": 0.6616252383451875, "eval_loss": 1.5378377437591553, "eval_runtime": 880.004, "eval_samples_per_second": 567.47, "eval_steps_per_second": 5.255, "step": 3690 },
{ "epoch": 3.1168535488644773, "grad_norm": 0.36066505312919617, "learning_rate": 0.0008712871287128713, "loss": 1.6552, "step": 3700 },
{ "epoch": 3.125277477374922, "grad_norm": 0.45852380990982056, "learning_rate": 0.0008702970297029704, "loss": 1.6581, "step": 3710 },
{ "epoch": 3.1337014058853665, "grad_norm": 0.3647266924381256, "learning_rate": 0.0008693069306930693, "loss": 1.6493, "step": 3720 },
{ "epoch": 3.1421253343958107, "grad_norm": 0.4774695038795471, "learning_rate": 0.0008683168316831684, "loss": 1.6457, "step": 3730 },
{ "epoch": 3.1505492629062553, "grad_norm": 0.4143640398979187, "learning_rate": 0.0008673267326732673, "loss": 1.6436, "step": 3740 },
{ "epoch": 3.1589731914167, "grad_norm": 0.4920789897441864, "learning_rate": 0.0008663366336633663, "loss": 1.6431, "step": 3750 },
{ "epoch": 3.1673971199271445, "grad_norm": 0.40231600403785706, "learning_rate": 0.0008653465346534654, "loss": 1.6373, "step": 3760 },
{ "epoch": 3.175821048437589, "grad_norm": 0.35115131735801697, "learning_rate": 0.0008643564356435643, "loss": 1.6343, "step": 3770 },
{ "epoch": 3.1842449769480337, "grad_norm": 0.3814195990562439, "learning_rate": 0.0008633663366336634, "loss": 1.6345, "step": 3780 },
{ "epoch": 3.1842449769480337, "eval_accuracy": 0.6669776046149977, "eval_loss": 1.5131778717041016, "eval_runtime": 887.9268, "eval_samples_per_second": 562.407, "eval_steps_per_second": 5.208, "step": 3780 },
{ "epoch": 3.192668905458478, "grad_norm": 0.3229101896286011, "learning_rate": 0.0008623762376237623, "loss": 1.6281, "step": 3790 },
{ "epoch": 3.2010928339689224, "grad_norm": 0.4361475110054016, "learning_rate": 0.0008613861386138614, "loss": 1.6253, "step": 3800 },
{ "epoch": 3.209516762479367, "grad_norm": 0.3246362507343292, "learning_rate": 0.0008603960396039604, "loss": 1.6269, "step": 3810 },
{ "epoch": 3.2179406909898116, "grad_norm": 0.5126762390136719, "learning_rate": 0.0008594059405940594, "loss": 1.62, "step": 3820 },
{ "epoch": 3.226364619500256, "grad_norm": 0.3813638389110565, "learning_rate": 0.0008584158415841584, "loss": 1.6228, "step": 3830 },
{ "epoch": 3.234788548010701, "grad_norm": 0.5111351013183594, "learning_rate": 0.0008574257425742574, "loss": 1.6162, "step": 3840 },
{ "epoch": 3.243212476521145, "grad_norm": 0.3448195457458496, "learning_rate": 0.0008564356435643564, "loss": 1.6156, "step": 3850 },
{ "epoch": 3.2516364050315896, "grad_norm": 0.50129634141922, "learning_rate": 0.0008554455445544555, "loss": 1.6153, "step": 3860 },
{ "epoch": 3.260060333542034, "grad_norm": 0.3352351188659668, "learning_rate": 0.0008544554455445544, "loss": 1.6117, "step": 3870 },
{ "epoch": 3.260060333542034, "eval_accuracy": 0.6717362607348063, "eval_loss": 1.4890562295913696, "eval_runtime": 886.1465, "eval_samples_per_second": 563.537, "eval_steps_per_second": 5.218, "step": 3870 },
{ "epoch": 3.2684842620524788, "grad_norm": 0.38713541626930237, "learning_rate": 0.0008534653465346535, "loss": 1.6058, "step": 3880 },
{ "epoch": 3.2769081905629234, "grad_norm": 0.46299123764038086, "learning_rate": 0.0008524752475247524, "loss": 1.6053, "step": 3890 },
{ "epoch": 3.285332119073368, "grad_norm": 0.4045964181423187, "learning_rate": 0.0008514851485148515, "loss": 1.6064, "step": 3900 },
{ "epoch": 3.2937560475838126, "grad_norm": 0.37616729736328125, "learning_rate": 0.0008504950495049505, "loss": 1.6005, "step": 3910 },
{ "epoch": 3.3021799760942567, "grad_norm": 0.47833314538002014, "learning_rate": 0.0008495049504950495, "loss": 1.599, "step": 3920 },
{ "epoch": 3.3106039046047013, "grad_norm": 0.436625212430954, "learning_rate": 0.0008485148514851485, "loss": 1.5954, "step": 3930 },
{ "epoch": 3.319027833115146, "grad_norm": 0.3456842005252838, "learning_rate": 0.0008475247524752475, "loss": 1.5924, "step": 3940 },
{ "epoch": 3.3274517616255905, "grad_norm": 0.5403941869735718, "learning_rate": 0.0008465346534653465, "loss": 1.5915, "step": 3950 },
{ "epoch": 3.335875690136035, "grad_norm": 0.3622403144836426, "learning_rate": 0.0008455445544554456, "loss": 1.6013, "step": 3960 },
{ "epoch": 3.335875690136035, "eval_accuracy": 0.6740560565861919, "eval_loss": 1.475487232208252, "eval_runtime": 895.3114, "eval_samples_per_second": 557.768, "eval_steps_per_second": 5.165, "step": 3960 },
{ "epoch": 3.3442996186464797, "grad_norm": 0.2850242555141449, "learning_rate": 0.0008445544554455445, "loss": 1.5903, "step": 3970 },
{ "epoch": 3.3527235471569243, "grad_norm": 0.39831429719924927, "learning_rate": 0.0008435643564356436, "loss": 1.5846, "step": 3980 },
{ "epoch": 3.3611474756673685, "grad_norm": 0.4886794686317444, "learning_rate": 0.0008425742574257425, "loss": 1.5876, "step": 3990 },
{ "epoch": 3.369571404177813, "grad_norm": 0.35439977049827576, "learning_rate": 0.0008415841584158416, "loss": 1.5839, "step": 4000 },
{ "epoch": 3.3779953326882577, "grad_norm": 0.32369595766067505, "learning_rate": 0.0008405940594059406, "loss": 1.5797, "step": 4010 },
{ "epoch": 3.3864192611987023, "grad_norm": 0.48595139384269714, "learning_rate": 0.0008396039603960396, "loss": 1.58, "step": 4020 },
{ "epoch": 3.394843189709147, "grad_norm": 0.39331361651420593, "learning_rate": 0.0008386138613861386, "loss": 1.5786, "step": 4030 },
{ "epoch": 3.4032671182195915, "grad_norm": 0.31911513209342957, "learning_rate": 0.0008376237623762376, "loss": 1.5745, "step": 4040 },
{ "epoch": 3.411691046730036, "grad_norm": 0.319876104593277, "learning_rate": 0.0008366336633663366, "loss": 1.5749, "step": 4050 },
{ "epoch": 3.411691046730036, "eval_accuracy": 0.6780886041474171, "eval_loss": 1.4578139781951904, "eval_runtime": 880.4333, "eval_samples_per_second": 567.193, "eval_steps_per_second": 5.252, "step": 4050 },
{ "epoch": 3.4201149752404802, "grad_norm": 0.45969948172569275, "learning_rate": 0.0008356435643564357, "loss": 1.5759, "step": 4060 },
{ "epoch": 3.428538903750925, "grad_norm": 0.34449151158332825, "learning_rate": 0.0008346534653465346, "loss": 1.5707, "step": 4070 },
{ "epoch": 3.4369628322613694, "grad_norm": 0.3478371202945709, "learning_rate": 0.0008336633663366337, "loss": 1.5699, "step": 4080 },
{ "epoch": 3.445386760771814, "grad_norm": 0.5127679109573364, "learning_rate": 0.0008326732673267326, "loss": 1.5668, "step": 4090 },
{ "epoch": 3.4538106892822587, "grad_norm": 0.302216500043869, "learning_rate": 0.0008316831683168317, "loss": 1.5647, "step": 4100 },
{ "epoch": 3.4622346177927033, "grad_norm": 0.3295814096927643, "learning_rate": 0.0008306930693069307, "loss": 1.5628, "step": 4110 },
{ "epoch": 3.4706585463031474, "grad_norm": 0.4209032654762268, "learning_rate": 0.0008297029702970297, "loss": 1.5628, "step": 4120 },
{ "epoch": 3.479082474813592, "grad_norm": 0.34786614775657654, "learning_rate": 0.0008287128712871287, "loss": 1.5613, "step": 4130 },
{ "epoch": 3.4875064033240366, "grad_norm": 0.4870763421058655, "learning_rate": 0.0008277227722772277, "loss": 1.5584, "step": 4140 },
{ "epoch": 3.4875064033240366, "eval_accuracy": 0.6804383346028876, "eval_loss": 1.4444972276687622, "eval_runtime": 891.9286, "eval_samples_per_second": 559.883, "eval_steps_per_second": 5.184, "step": 4140 },
{ "epoch": 3.495930331834481, "grad_norm": 0.31641605496406555, "learning_rate": 0.0008267326732673267, "loss": 1.5581, "step": 4150 },
{ "epoch": 3.504354260344926, "grad_norm": 0.31303870677948, "learning_rate": 0.0008257425742574258, "loss": 1.5548, "step": 4160 },
{ "epoch": 3.5127781888553704, "grad_norm": 0.35413628816604614, "learning_rate": 0.0008247524752475247, "loss": 1.5506, "step": 4170 },
{ "epoch": 3.5212021173658146, "grad_norm": 0.39600226283073425, "learning_rate": 0.0008237623762376238, "loss": 1.5517, "step": 4180 },
{ "epoch": 3.529626045876259, "grad_norm": 0.3600960075855255, "learning_rate": 0.0008227722772277227, "loss": 1.5563, "step": 4190 },
{ "epoch": 3.5380499743867038, "grad_norm": 0.2877024710178375, "learning_rate": 0.0008217821782178218, "loss": 1.5467, "step": 4200 },
{ "epoch": 3.5464739028971484, "grad_norm": 0.42324578762054443, "learning_rate": 0.0008207920792079208, "loss": 1.546, "step": 4210 },
{ "epoch": 3.554897831407593, "grad_norm": 0.38907232880592346, "learning_rate": 0.0008198019801980197, "loss": 1.5458, "step": 4220 },
{ "epoch": 3.5633217599180376, "grad_norm": 0.34750425815582275, "learning_rate": 0.0008188118811881188, "loss": 1.5437, "step": 4230 },
{ "epoch": 3.5633217599180376, "eval_accuracy": 0.6840987986477044, "eval_loss": 1.4261698722839355, "eval_runtime": 886.2695, "eval_samples_per_second": 563.458, "eval_steps_per_second": 5.217, "step": 4230 },
{ "epoch": 3.571745688428482, "grad_norm": 0.3718611001968384, "learning_rate": 0.0008178217821782177, "loss": 1.546, "step": 4240 },
{ "epoch": 3.5801696169389263, "grad_norm": 0.39119917154312134, "learning_rate": 0.0008168316831683168, "loss": 1.5411, "step": 4250 },
{ "epoch": 3.588593545449371, "grad_norm": 0.45689284801483154, "learning_rate": 0.0008158415841584159, "loss": 1.5416, "step": 4260 },
{ "epoch": 3.5970174739598155, "grad_norm": 0.4029008150100708, "learning_rate": 0.0008148514851485148, "loss": 1.5364, "step": 4270 },
{ "epoch": 3.60544140247026, "grad_norm": 0.3843879997730255, "learning_rate": 0.0008138613861386138, "loss": 1.5368, "step": 4280 },
{ "epoch": 3.6138653309807047, "grad_norm": 0.33945897221565247, "learning_rate": 0.0008128712871287128, "loss": 1.5369, "step": 4290 },
{ "epoch": 3.6222892594911493, "grad_norm": 0.29753997921943665, "learning_rate": 0.000811881188118812, "loss": 1.5326, "step": 4300 },
{ "epoch": 3.630713188001594, "grad_norm": 0.4412858784198761, "learning_rate": 0.000810891089108911, "loss": 1.5316, "step": 4310 },
{ "epoch": 3.639137116512038, "grad_norm": 0.30377647280693054, "learning_rate": 0.00080990099009901, "loss": 1.5308, "step": 4320 },
{ "epoch": 3.639137116512038, "eval_accuracy": 0.6865785598346558, "eval_loss": 1.4111888408660889, "eval_runtime": 880.9823, "eval_samples_per_second": 566.84, "eval_steps_per_second": 5.249, "step": 4320 },
{ "epoch": 3.6475610450224827, "grad_norm": 0.3666999638080597, "learning_rate": 0.000808910891089109, "loss": 1.5279, "step": 4330 },
{ "epoch": 3.6559849735329273, "grad_norm": 0.3254301846027374, "learning_rate": 0.0008079207920792079, "loss": 1.5277, "step": 4340 },
{ "epoch": 3.664408902043372, "grad_norm": 0.4963987469673157, "learning_rate": 0.000806930693069307, "loss": 1.5286, "step": 4350 },
{ "epoch": 3.6728328305538165, "grad_norm": 0.34190070629119873, "learning_rate": 0.000805940594059406, "loss": 1.5294, "step": 4360 },
{ "epoch": 3.6812567590642606, "grad_norm": 0.35153254866600037, "learning_rate": 0.000804950495049505, "loss": 1.5217, "step": 4370 },
{ "epoch": 3.6896806875747057, "grad_norm": 0.345929354429245, "learning_rate": 0.000803960396039604, "loss": 1.52, "step": 4380 },
{ "epoch": 3.69810461608515, "grad_norm": 0.37540799379348755, "learning_rate": 0.000802970297029703, "loss": 1.5208, "step": 4390 },
{ "epoch": 3.7065285445955944, "grad_norm": 0.33499011397361755, "learning_rate": 0.000801980198019802, "loss": 1.5196, "step": 4400 },
{ "epoch": 3.714952473106039, "grad_norm": 0.3461949825286865, "learning_rate": 0.0008009900990099011, "loss": 1.5188, "step": 4410 },
{ "epoch": 3.714952473106039, "eval_accuracy": 0.6888913088166951, "eval_loss": 1.40292227268219, "eval_runtime": 882.772, "eval_samples_per_second": 565.691, "eval_steps_per_second": 5.238, "step": 4410 },
{ "epoch": 3.7233764016164836, "grad_norm": 0.36491358280181885, "learning_rate": 0.0008, "loss": 1.5171, "step": 4420 },
{ "epoch": 3.7318003301269282, "grad_norm": 0.2799367606639862, "learning_rate": 0.0007990099009900991, "loss": 1.5142, "step": 4430 },
{ "epoch": 3.7402242586373724, "grad_norm": 0.361971914768219, "learning_rate": 0.000798019801980198, "loss": 1.5145, "step": 4440 },
{ "epoch": 3.7486481871478174, "grad_norm": 0.2618056535720825, "learning_rate": 0.0007970297029702971, "loss": 1.5113, "step": 4450 },
{ "epoch": 3.7570721156582616, "grad_norm": 0.5228148698806763, "learning_rate": 0.0007960396039603961, "loss": 1.5111, "step": 4460 },
{ "epoch": 3.765496044168706, "grad_norm": 0.37740132212638855, "learning_rate": 0.0007950495049504951, "loss": 1.5121, "step": 4470 },
{ "epoch": 3.773919972679151, "grad_norm": 0.3701629340648651, "learning_rate": 0.0007940594059405941, "loss": 1.5083, "step": 4480 },
{ "epoch": 3.7823439011895954, "grad_norm": 0.3345108926296234, "learning_rate": 0.0007930693069306931, "loss": 1.5077, "step": 4490 },
{ "epoch": 3.79076782970004, "grad_norm": 0.3989773988723755, "learning_rate": 0.0007920792079207921, "loss": 1.5079, "step": 4500 },
{ "epoch": 3.79076782970004, "eval_accuracy": 0.6907081981543249, "eval_loss": 1.3909889459609985, "eval_runtime": 889.7203, "eval_samples_per_second": 561.273, "eval_steps_per_second": 5.197, "step": 4500 },
{ "epoch": 3.799191758210484, "grad_norm": 0.284728080034256, "learning_rate": 0.0007910891089108912, "loss": 1.5046, "step": 4510 },
{ "epoch": 3.8076156867209288, "grad_norm": 0.5029779672622681, "learning_rate": 0.0007900990099009901, "loss": 1.5049, "step": 4520 },
{ "epoch": 3.8160396152313734, "grad_norm": 0.32617345452308655, "learning_rate": 0.0007891089108910892, "loss": 1.5068, "step": 4530 },
{ "epoch": 3.824463543741818, "grad_norm": 0.36316540837287903, "learning_rate": 0.0007881188118811881, "loss": 1.4999, "step": 4540 },
{ "epoch": 3.8328874722522626, "grad_norm": 0.30240392684936523, "learning_rate": 0.0007871287128712872, "loss": 1.498, "step": 4550 },
{ "epoch": 3.841311400762707, "grad_norm": 0.3905390202999115, "learning_rate": 0.0007861386138613862, "loss": 1.4978, "step": 4560 },
{ "epoch": 3.8497353292731518, "grad_norm": 0.30473875999450684, "learning_rate": 0.0007851485148514852, "loss": 1.4965, "step": 4570 },
{ "epoch": 3.858159257783596, "grad_norm": 0.3675777316093445, "learning_rate": 0.0007841584158415842, "loss": 1.4957, "step": 4580 },
{ "epoch": 3.8665831862940405, "grad_norm": 0.394168883562088, "learning_rate": 0.0007831683168316832, "loss": 1.4936, "step": 4590 },
{ "epoch": 3.8665831862940405, "eval_accuracy": 0.6926193728848408, "eval_loss": 1.3844850063323975, "eval_runtime": 887.3028, "eval_samples_per_second": 562.802, "eval_steps_per_second": 5.211, "step": 4590 },
{ "epoch": 3.875007114804485, "grad_norm": 0.3404500186443329, "learning_rate": 0.0007821782178217822, "loss": 1.4956, "step": 4600 },
{ "epoch": 3.8834310433149297, "grad_norm": 0.3074527978897095, "learning_rate": 0.0007811881188118813, "loss": 1.4928, "step": 4610 },
{ "epoch": 3.8918549718253743, "grad_norm": 0.44941094517707825, "learning_rate": 0.0007801980198019802, "loss": 1.4911, "step": 4620 },
{ "epoch": 3.900278900335819, "grad_norm": 0.3098917603492737, "learning_rate": 0.0007792079207920793, "loss": 1.4918, "step": 4630 },
{ "epoch": 3.9087028288462635, "grad_norm": 0.37436243891716003, "learning_rate": 0.0007782178217821782, "loss": 1.4866, "step": 4640 },
{ "epoch": 3.9171267573567077, "grad_norm": 0.3058597445487976, "learning_rate": 0.0007772277227722773, "loss": 1.4896, "step": 4650 },
{ "epoch": 3.9255506858671523, "grad_norm": 0.34245744347572327, "learning_rate": 0.0007762376237623763, "loss": 1.4874, "step": 4660 },
{ "epoch": 3.933974614377597, "grad_norm": 0.3401254117488861, "learning_rate": 0.0007752475247524753, "loss": 1.4866, "step": 4670 },
{ "epoch": 3.9423985428880415, "grad_norm": 0.35778889060020447, "learning_rate": 0.0007742574257425743, "loss": 1.4818, "step": 4680 },
{ "epoch": 3.9423985428880415, "eval_accuracy": 0.6951155140000936, "eval_loss": 1.3689333200454712, "eval_runtime": 879.8095, "eval_samples_per_second": 567.596, "eval_steps_per_second": 5.256, "step": 4680 },
{ "epoch": 3.950822471398486, "grad_norm": 0.2895776927471161, "learning_rate": 0.0007732673267326733, "loss": 1.4822, "step": 4690 },
{ "epoch": 3.9592463999089302, "grad_norm": 0.3483330309391022, "learning_rate": 0.0007722772277227723, "loss": 1.4802, "step": 4700 },
{ "epoch": 3.9676703284193753, "grad_norm": 0.30115026235580444, "learning_rate": 0.0007712871287128714, "loss": 1.4838, "step": 4710 },
{ "epoch": 3.9760942569298194, "grad_norm": 0.32046666741371155, "learning_rate": 0.0007702970297029703, "loss": 1.4799, "step": 4720 },
{ "epoch": 3.984518185440264, "grad_norm": 0.3833225965499878, "learning_rate": 0.0007693069306930694, "loss": 1.4785, "step": 4730 },
{ "epoch": 3.9929421139507086, "grad_norm": 0.30888909101486206, "learning_rate": 0.0007683168316831683, "loss": 1.475, "step": 4740 },
{ "epoch": 4.001366042461153, "grad_norm": 0.32462459802627563, "learning_rate": 0.0007673267326732674, "loss": 1.4746, "step": 4750 },
{ "epoch": 4.009789970971598, "grad_norm": 0.3200187683105469, "learning_rate": 0.0007663366336633664, "loss": 1.4768, "step": 4760 },
{ "epoch": 4.018213899482042, "grad_norm": 0.3794704079627991, "learning_rate": 0.0007653465346534654, "loss": 1.4761, "step": 4770 },
{ "epoch": 4.018213899482042, "eval_accuracy": 0.6969660848927619, "eval_loss": 1.3595411777496338, "eval_runtime": 887.2228, "eval_samples_per_second": 562.853, "eval_steps_per_second": 5.212, "step": 4770 },
{ "epoch": 4.026637827992487, "grad_norm": 0.27933019399642944, "learning_rate": 0.0007643564356435644, "loss": 1.47, "step": 4780 },
{ "epoch": 4.035061756502931, "grad_norm": 0.32542508840560913, "learning_rate": 0.0007633663366336634, "loss": 1.4726, "step": 4790 },
{ "epoch": 4.043485685013376, "grad_norm": 0.3638169765472412, "learning_rate": 0.0007623762376237624, "loss": 1.4697, "step": 4800 },
{ "epoch": 4.05190961352382, "grad_norm": 0.3762564957141876, "learning_rate": 0.0007613861386138615, "loss": 1.4663, "step": 4810 },
{ "epoch": 4.0603335420342646, "grad_norm": 0.36758995056152344, "learning_rate": 0.0007603960396039604, "loss": 1.4729, "step": 4820 },
{ "epoch": 4.06875747054471, "grad_norm": 0.34590932726860046, "learning_rate": 0.0007594059405940595, "loss": 1.4665, "step": 4830 },
{ "epoch": 4.077181399055154, "grad_norm": 0.3242778182029724, "learning_rate": 0.0007584158415841584, "loss": 1.4639, "step": 4840 },
{ "epoch": 4.085605327565599, "grad_norm": 0.3849882185459137, "learning_rate": 0.0007574257425742574, "loss": 1.4613, "step": 4850 },
{ "epoch": 4.094029256076043, "grad_norm": 0.3495323061943054, "learning_rate": 0.0007564356435643565, "loss": 1.4598, "step": 4860 },
{ "epoch": 4.094029256076043, "eval_accuracy": 0.6996214986490302, "eval_loss": 1.3455697298049927, "eval_runtime": 887.3091, "eval_samples_per_second": 562.798, "eval_steps_per_second": 5.211, "step": 4860 },
{ "epoch": 4.102453184586488, "grad_norm": 0.3290145993232727, "learning_rate": 0.0007554455445544554, "loss": 1.4601, "step": 4870 },
{ "epoch": 4.110877113096932, "grad_norm": 0.34369096159935, "learning_rate": 0.0007544554455445545, "loss": 1.4603, "step": 4880 },
{ "epoch": 4.119301041607376, "grad_norm": 0.3350279629230499, "learning_rate": 0.0007534653465346534, "loss": 1.4609, "step": 4890 },
{ "epoch": 4.127724970117821, "grad_norm": 0.2575846016407013, "learning_rate": 0.0007524752475247525, "loss": 1.4565, "step": 4900 },
{ "epoch": 4.1361488986282655, "grad_norm": 0.3337861895561218, "learning_rate": 0.0007514851485148515, "loss": 1.4574, "step": 4910 },
{ "epoch": 4.144572827138711, "grad_norm": 0.3752147853374481, "learning_rate": 0.0007504950495049505, "loss": 1.4594, "step": 4920 },
{ "epoch": 4.152996755649155, "grad_norm": 0.29587122797966003, "learning_rate": 0.0007495049504950495, "loss": 1.4518, "step": 4930 },
{ "epoch": 4.161420684159599, "grad_norm": 0.2764742374420166, "learning_rate": 0.0007485148514851485, "loss": 1.4514, "step": 4940 },
{ "epoch": 4.169844612670044, "grad_norm": 0.4625591039657593, "learning_rate": 0.0007475247524752475, "loss": 1.4527, "step": 4950 },
{ "epoch": 4.169844612670044, "eval_accuracy": 0.701515475804278, "eval_loss": 1.3361947536468506, "eval_runtime": 883.9818, "eval_samples_per_second": 564.917, "eval_steps_per_second": 5.231, "step": 4950 },
{ "epoch": 4.178268541180488, "grad_norm": 0.29412004351615906, "learning_rate": 0.0007465346534653466, "loss": 1.4514, "step": 4960 },
{ "epoch": 4.186692469690933, "grad_norm": 0.3580242693424225, "learning_rate": 0.0007455445544554455, "loss": 1.4486, "step": 4970 },
{ "epoch": 4.195116398201377, "grad_norm": 0.46256908774375916, "learning_rate": 0.0007445544554455446, "loss": 1.4494, "step": 4980 },
{ "epoch": 4.203540326711822, "grad_norm": 0.3117842674255371, "learning_rate": 0.0007435643564356435, "loss": 1.4486, "step": 4990 },
{ "epoch": 4.2119642552222665, "grad_norm": 0.3382858335971832, "learning_rate": 0.0007425742574257426, "loss": 1.4452, "step": 5000 },
{ "epoch": 4.220388183732711, "grad_norm": 0.3153148889541626, "learning_rate": 0.0007415841584158416, "loss": 1.4465, "step": 5010 },
{ "epoch": 4.228812112243156, "grad_norm": 0.3635173439979553, "learning_rate": 0.0007405940594059406, "loss": 1.4443, "step": 5020 },
{ "epoch": 4.2372360407536, "grad_norm": 0.4260285794734955, "learning_rate": 0.0007396039603960396, "loss": 1.4454, "step": 5030 },
{ "epoch": 4.245659969264045, "grad_norm": 0.29188039898872375, "learning_rate": 0.0007386138613861386, "loss": 1.4442, "step": 5040 },
{ "epoch": 4.245659969264045, "eval_accuracy": 0.7031089800515327, "eval_loss": 1.3285191059112549, "eval_runtime": 890.9721, "eval_samples_per_second": 560.484, "eval_steps_per_second": 5.19, "step": 5040 },
{ "epoch": 4.254083897774489, "grad_norm": 0.5350555777549744, "learning_rate": 0.0007376237623762376, "loss": 1.4416, "step": 5050 },
{ "epoch": 4.262507826284934, "grad_norm": 0.35281315445899963, "learning_rate": 0.0007366336633663367, "loss": 1.4432, "step": 5060 },
{ "epoch": 4.270931754795378, "grad_norm": 0.37922871112823486, "learning_rate": 0.0007356435643564356, "loss": 1.4399, "step": 5070 },
{ "epoch": 4.279355683305822, "grad_norm": 0.3072182238101959, "learning_rate": 0.0007346534653465347, "loss": 1.4383, "step": 5080 },
{ "epoch": 4.287779611816267, "grad_norm": 0.30223241448402405, "learning_rate": 0.0007336633663366336, "loss": 1.4406, "step": 5090 },
{ "epoch": 4.296203540326712, "grad_norm": 0.5292770862579346, "learning_rate": 0.0007326732673267327, "loss": 1.4376, "step": 5100 },
{ "epoch": 4.304627468837157, "grad_norm": 0.35330840945243835, "learning_rate": 0.0007316831683168317, "loss": 1.4389, "step": 5110 },
{ "epoch": 4.313051397347601, "grad_norm": 0.30719104409217834, "learning_rate": 0.0007306930693069307, "loss": 1.4384, "step": 5120 },
{ "epoch": 4.321475325858046, "grad_norm": 0.34203872084617615, "learning_rate": 0.0007297029702970297, "loss": 1.4374, "step": 5130 },
{ "epoch": 4.321475325858046, "eval_accuracy": 0.7048288335521147, "eval_loss": 1.3187906742095947, "eval_runtime": 887.0787, "eval_samples_per_second": 562.944, "eval_steps_per_second": 5.213, "step": 5130 },
{ "epoch": 4.32989925436849, "grad_norm": 0.38140207529067993, "learning_rate": 0.0007287128712871287, "loss": 1.4353, "step": 5140 },
{ "epoch": 4.338323182878934, "grad_norm": 0.303752064704895, "learning_rate": 0.0007277227722772277, "loss": 1.4336, "step": 5150 },
{ "epoch": 4.346747111389379, "grad_norm": 0.290764719247818, "learning_rate": 0.0007267326732673268, "loss": 1.4304, "step": 5160 },
{ "epoch": 4.355171039899823, "grad_norm": 0.4335167407989502, "learning_rate": 0.0007257425742574257, "loss": 1.4327, "step": 5170 },
{ "epoch": 4.363594968410268, "grad_norm": 0.3198365271091461, "learning_rate": 0.0007247524752475248, "loss": 1.4319, "step": 5180 },
{ "epoch": 4.3720188969207125, "grad_norm": 0.41567763686180115, "learning_rate": 0.0007237623762376237, "loss": 1.4318, "step": 5190 },
{ "epoch": 4.380442825431157, "grad_norm": 0.3342703580856323, "learning_rate": 0.0007227722772277228, "loss": 1.4298, "step": 5200 },
{ "epoch": 4.388866753941602, "grad_norm": 0.25702279806137085, "learning_rate": 0.0007217821782178218, "loss": 1.4265, "step": 5210 },
{ "epoch": 4.397290682452046, "grad_norm": 0.26949411630630493, "learning_rate": 0.0007207920792079208, "loss": 1.4278, "step": 5220 },
{ "epoch": 4.397290682452046, "eval_accuracy": 0.7063243134470976, "eval_loss": 1.3113943338394165, "eval_runtime": 889.8031, "eval_samples_per_second": 561.221, "eval_steps_per_second": 5.197, "step": 5220 },
{ "epoch": 4.405714610962491, "grad_norm": 0.3861467242240906, "learning_rate": 0.0007198019801980198, "loss": 1.4318, "step": 5230 },
{ "epoch": 4.414138539472935, "grad_norm": 0.34858283400535583, "learning_rate": 0.0007188118811881188, "loss": 1.4291, "step": 5240 },
{ "epoch": 4.42256246798338, "grad_norm": 0.3346785604953766, "learning_rate": 0.0007178217821782178, "loss": 1.425, "step": 5250 },
{ "epoch": 4.430986396493824, "grad_norm": 0.3916323184967041, "learning_rate": 0.0007168316831683169, "loss": 1.4241, "step": 5260 },
{ "epoch": 4.439410325004269, "grad_norm": 0.2802947759628296, "learning_rate": 0.0007158415841584158, "loss": 1.4221, "step": 5270 },
{ "epoch": 4.4478342535147135, "grad_norm": 0.4092938303947449, "learning_rate": 0.0007148514851485149, "loss": 1.4236, "step": 5280 },
{ "epoch": 4.456258182025158, "grad_norm": 0.25096723437309265, "learning_rate": 0.0007138613861386138, "loss": 1.4235, "step": 5290 },
{ "epoch": 4.464682110535603, "grad_norm": 0.3570871949195862, "learning_rate": 0.0007128712871287129, "loss": 1.4216, "step": 5300 },
{ "epoch": 4.473106039046047, "grad_norm": 0.3168172240257263, "learning_rate": 0.0007118811881188119, "loss": 1.4236, "step": 5310 },
{ "epoch": 4.473106039046047, "eval_accuracy": 0.7076842136916008, "eval_loss": 1.307774543762207, "eval_runtime": 889.4836, "eval_samples_per_second": 561.422, "eval_steps_per_second": 5.199, "step": 5310 },
{ "epoch": 4.481529967556492, "grad_norm": 0.30059170722961426, "learning_rate": 0.0007108910891089109, "loss": 1.4193, "step": 5320 },
{ "epoch": 4.489953896066936, "grad_norm": 0.331824392080307, "learning_rate": 0.0007099009900990099, "loss": 1.4185, "step": 5330 },
{ "epoch": 4.49837782457738, "grad_norm": 0.3295821249485016, "learning_rate": 0.0007089108910891088, "loss": 1.4198, "step": 5340 },
{ "epoch": 4.506801753087825, "grad_norm": 0.3506734371185303, "learning_rate": 0.0007079207920792079, "loss": 1.4167, "step": 5350 },
{ "epoch": 4.515225681598269, "grad_norm": 0.3836129903793335, "learning_rate": 0.000706930693069307, "loss": 1.417, "step": 5360 },
{ "epoch": 4.5236496101087145, "grad_norm": 0.3046220541000366, "learning_rate": 0.0007059405940594059, "loss": 1.4177, "step": 5370 },
{ "epoch": 4.532073538619159, "grad_norm": 0.37655332684516907, "learning_rate": 0.000704950495049505, "loss": 1.4149, "step": 5380 },
{ "epoch": 4.540497467129603, "grad_norm": 0.32939672470092773, "learning_rate": 0.0007039603960396039, "loss": 1.4165, "step": 5390 },
{ "epoch": 4.548921395640048, "grad_norm": 0.2900882363319397, "learning_rate": 0.0007029702970297029, "loss": 1.4128, "step": 5400 },
{ "epoch": 4.548921395640048, "eval_accuracy": 0.7087959913049944, "eval_loss": 1.3013147115707397, "eval_runtime": 892.9333, "eval_samples_per_second": 559.253, "eval_steps_per_second": 5.178, "step": 5400 },
{ "epoch": 4.557345324150492, "grad_norm": 0.27651771903038025, "learning_rate": 0.000701980198019802, "loss": 1.4122, "step": 5410 },
{ "epoch": 4.565769252660937, "grad_norm": 0.4160715639591217, "learning_rate": 0.0007009900990099009, "loss": 1.4122, "step": 5420 },
{ "epoch": 4.574193181171381, "grad_norm": 0.2724072337150574, "learning_rate": 0.0007, "loss": 1.41, "step": 5430 },
{ "epoch": 4.582617109681826, "grad_norm": 0.35586145520210266, "learning_rate": 0.0006990099009900989, "loss": 1.4118, "step": 5440 },
{ "epoch": 4.59104103819227, "grad_norm": 0.3268265128135681, "learning_rate": 0.000698019801980198, "loss": 1.4117, "step": 5450 },
{ "epoch": 4.599464966702715, "grad_norm": 0.3230002522468567, "learning_rate": 0.000697029702970297, "loss": 1.4102, "step": 5460 },
{ "epoch": 4.60788889521316, "grad_norm": 0.25019174814224243, "learning_rate": 0.000696039603960396, "loss": 1.4102, "step": 5470 },
{ "epoch": 4.616312823723604, "grad_norm": 0.38475289940834045, "learning_rate": 0.000695049504950495, "loss": 1.4075, "step": 5480 },
{ "epoch": 4.624736752234049, "grad_norm": 0.39824309945106506, "learning_rate": 0.000694059405940594, "loss": 1.4077, "step": 5490 },
{ "epoch": 4.624736752234049, "eval_accuracy": 0.7098417264518991, "eval_loss": 1.2926928997039795, "eval_runtime": 881.9048, "eval_samples_per_second": 566.247, "eval_steps_per_second": 5.243, "step": 5490 },
{ "epoch": 4.633160680744493, "grad_norm": 0.3250022828578949, "learning_rate": 0.000693069306930693, "loss": 1.4068, "step": 5500 },
{ "epoch": 4.641584609254938, "grad_norm": 0.32388612627983093, "learning_rate": 0.0006920792079207921, "loss": 1.4062, "step": 5510 },
{ "epoch": 4.650008537765382, "grad_norm": 0.2806077003479004, "learning_rate": 0.000691089108910891, "loss": 1.4049, "step": 5520 },
{ "epoch": 4.658432466275826, "grad_norm": 0.33755025267601013, "learning_rate": 0.0006900990099009901, "loss": 1.4045, "step": 5530 },
{ "epoch": 4.666856394786271, "grad_norm": 0.4184636175632477, "learning_rate": 0.000689108910891089, "loss": 1.4042, "step": 5540 },
{ "epoch": 4.6752803232967155, "grad_norm": 0.34234240651130676, "learning_rate": 0.0006881188118811881, "loss": 1.4055, "step": 5550 },
{ "epoch": 4.6837042518071605, "grad_norm": 0.32120293378829956, "learning_rate": 0.0006871287128712872, "loss": 1.4014, "step": 5560 },
{ "epoch": 4.692128180317605, "grad_norm": 0.3810026943683624, "learning_rate": 0.0006861386138613862, "loss": 1.4039, "step": 5570 },
{ "epoch": 4.70055210882805, "grad_norm": 0.3171080946922302, "learning_rate": 0.0006851485148514852, "loss": 1.4025, "step": 5580 },
{ "epoch": 4.70055210882805, "eval_accuracy": 0.7115425686273988, "eval_loss": 1.285227656364441, "eval_runtime": 891.3368, "eval_samples_per_second": 560.255, "eval_steps_per_second": 5.188, "step": 5580 },
{ "epoch": 4.708976037338494, "grad_norm": 0.24618960916996002, "learning_rate": 0.0006841584158415842, "loss": 1.3983, "step": 5590 },
{ "epoch": 4.717399965848939, "grad_norm": 0.494895339012146, "learning_rate": 0.0006831683168316832, "loss": 1.4, "step": 5600 },
{ "epoch": 4.725823894359383, "grad_norm": 0.31908226013183594, "learning_rate": 0.0006821782178217823, "loss": 1.3983, "step": 5610 },
{ "epoch": 4.734247822869827, "grad_norm": 0.26488983631134033, "learning_rate": 0.0006811881188118812, "loss": 1.3956, "step": 5620 },
{ "epoch": 4.742671751380272, "grad_norm": 0.3156343102455139, "learning_rate": 0.0006801980198019803, "loss": 1.397, "step": 5630 },
{ "epoch": 4.7510956798907165, "grad_norm": 0.38938194513320923, "learning_rate": 0.0006792079207920792, "loss": 1.3987, "step": 5640 },
{ "epoch": 4.7595196084011615, "grad_norm": 0.27233967185020447, "learning_rate": 0.0006782178217821783, "loss": 1.3983, "step": 5650 },
{ "epoch": 4.767943536911606, "grad_norm": 0.347419410943985, "learning_rate": 0.0006772277227722773, "loss": 1.3953, "step": 5660 },
{ "epoch": 4.77636746542205, "grad_norm": 0.44131675362586975, "learning_rate": 0.0006762376237623763, "loss": 1.3956, "step": 5670 },
{ "epoch": 4.77636746542205, "eval_accuracy": 0.7112416746447588, "eval_loss": 1.290834665298462, "eval_runtime": 886.5668, "eval_samples_per_second": 563.269, "eval_steps_per_second": 5.216, "step": 5670 },
{ "epoch": 4.784791393932495, "grad_norm": 0.3185184895992279, "learning_rate": 0.0006752475247524753, "loss": 1.3976, "step": 5680 },
{ "epoch": 4.793215322442939, "grad_norm": 0.2549585998058319, "learning_rate": 0.0006742574257425743, "loss": 1.3931, "step": 5690 },
{ "epoch": 4.801639250953384, "grad_norm": 0.315294086933136, "learning_rate": 0.0006732673267326733, "loss": 1.393, "step": 5700 },
{ "epoch": 4.810063179463828, "grad_norm": 0.3866962492465973, "learning_rate": 0.0006722772277227724, "loss": 1.3923, "step": 5710 },
{ "epoch": 4.818487107974272, "grad_norm": 0.28364527225494385, "learning_rate": 0.0006712871287128713, "loss": 1.3924, "step": 5720 },
{ "epoch": 4.826911036484717, "grad_norm": 0.3253314793109894, "learning_rate": 0.0006702970297029704, "loss": 1.3914, "step": 5730 },
{ "epoch": 4.835334964995162, "grad_norm": 0.31215131282806396, "learning_rate": 0.0006693069306930693, "loss": 1.3903, "step": 5740 },
{ "epoch": 4.843758893505607, "grad_norm": 0.34929993748664856, "learning_rate": 0.0006683168316831684, "loss": 1.3894, "step": 5750 },
{ "epoch": 4.852182822016051, "grad_norm": 0.38991761207580566, "learning_rate": 0.0006673267326732674, "loss": 1.3924, "step": 5760 },
{ "epoch": 4.852182822016051, "eval_accuracy": 0.7133021748514282, "eval_loss": 1.2766938209533691, "eval_runtime": 881.7452, "eval_samples_per_second": 566.35, "eval_steps_per_second": 5.244, "step": 5760 },
{ "epoch": 4.860606750526496, "grad_norm": 0.2888573408126831, "learning_rate": 0.0006663366336633664, "loss": 1.3918, "step": 5770 },
{ "epoch": 4.86903067903694, "grad_norm": 0.3224232494831085, "learning_rate": 0.0006653465346534654, "loss": 1.3895, "step": 5780 },
{ "epoch": 4.877454607547385, "grad_norm": 0.3562750518321991, "learning_rate": 0.0006643564356435644, "loss": 1.387, "step": 5790 },
{ "epoch": 4.885878536057829, "grad_norm": 0.3339401185512543, "learning_rate": 0.0006633663366336634, "loss": 1.3886, "step": 5800 },
{ "epoch": 4.894302464568273, "grad_norm": 0.3022938072681427, "learning_rate": 0.0006623762376237625, "loss": 1.3858, "step": 5810 },
{ "epoch": 4.902726393078718, "grad_norm": 0.276065856218338, "learning_rate": 0.0006613861386138614, "loss": 1.386, "step": 5820 },
{ "epoch": 4.9111503215891625, "grad_norm": 0.3148975372314453, "learning_rate": 0.0006603960396039605, "loss": 1.385, "step": 5830 },
{ "epoch": 4.919574250099608, "grad_norm": 0.3374193608760834, "learning_rate": 0.0006594059405940594, "loss": 1.3842, "step": 5840 },
{ "epoch": 4.927998178610052, "grad_norm": 0.3293200135231018, "learning_rate": 0.0006584158415841585, "loss": 1.3835, "step": 5850 },
{ "epoch": 4.927998178610052, "eval_accuracy": 0.7147221912687882, "eval_loss": 1.2681052684783936, "eval_runtime": 890.793, "eval_samples_per_second": 560.597, "eval_steps_per_second": 5.191, "step": 5850 },
{ "epoch": 4.936422107120496, "grad_norm": 0.3032568693161011, "learning_rate": 0.0006574257425742575, "loss": 1.3828, "step": 5860 },
{ "epoch": 4.944846035630941, "grad_norm": 0.24251434206962585, "learning_rate": 0.0006564356435643565, "loss": 1.3818, "step": 5870 },
{ "epoch": 4.953269964141385, "grad_norm": 0.3096301257610321, "learning_rate": 0.0006554455445544555, "loss": 1.3814, "step": 5880 },
{ "epoch": 4.96169389265183, "grad_norm": 0.34841156005859375, "learning_rate": 0.0006544554455445545, "loss": 1.3823, "step": 5890 },
{ "epoch": 4.970117821162274, "grad_norm": 0.312688946723938, "learning_rate": 0.0006534653465346535, "loss": 1.3818, "step": 5900 },
{ "epoch": 4.978541749672719, "grad_norm": 0.30799320340156555, "learning_rate": 0.0006524752475247526, "loss": 1.379, "step": 5910 },
{ "epoch": 4.9869656781831635, "grad_norm": 0.3510371148586273, "learning_rate": 0.0006514851485148515, "loss": 1.3814, "step": 5920 },
{ "epoch": 4.9953896066936085, "grad_norm": 0.2894381582736969, "learning_rate": 0.0006504950495049506, "loss": 1.3812, "step": 5930 },
{ "epoch": 5.003813535204053, "grad_norm": 0.2685450315475464, "learning_rate": 0.0006495049504950495, "loss": 1.3788, "step": 5940 },
{ "epoch": 5.003813535204053, "eval_accuracy": 0.7160080315056353, "eval_loss": 1.2630343437194824, "eval_runtime": 883.8805, "eval_samples_per_second": 564.981, "eval_steps_per_second": 5.231, "step": 5940 },
{ "epoch": 5.012237463714497, "grad_norm": 0.38857927918434143, "learning_rate": 0.0006485148514851485, "loss": 1.3809, "step": 5950 },
{ "epoch": 5.020661392224942, "grad_norm": 0.2822309136390686, "learning_rate": 0.0006475247524752476, "loss": 1.3769, "step": 5960 },
{ "epoch": 5.029085320735386, "grad_norm": 0.2725491523742676, "learning_rate": 0.0006465346534653465, "loss": 1.3762, "step": 5970 },
{ "epoch": 5.037509249245831, "grad_norm": 0.32517486810684204, "learning_rate": 0.0006455445544554456, "loss": 1.377, "step": 5980 },
{ "epoch": 5.045933177756275, "grad_norm": 0.34373360872268677, "learning_rate": 0.0006445544554455445, "loss": 1.3774, "step": 5990 },
{ "epoch": 5.054357106266719, "grad_norm": 0.3029853403568268, "learning_rate": 0.0006435643564356436, "loss": 1.3746, "step": 6000 },
{ "epoch": 5.0627810347771645, "grad_norm": 0.5577653646469116, "learning_rate": 0.0006425742574257426, "loss": 1.378, "step": 6010 },
{ "epoch": 5.071204963287609, "grad_norm": 0.27967342734336853, "learning_rate": 0.0006415841584158416, "loss": 1.3779, "step": 6020 },
{ "epoch": 5.079628891798054, "grad_norm": 0.2680428624153137, "learning_rate": 0.0006405940594059406, "loss": 1.3733, "step": 6030 },
{ "epoch": 5.079628891798054, "eval_accuracy": 0.7168763989390342, "eval_loss": 1.258245825767517, "eval_runtime": 902.3568, "eval_samples_per_second": 553.413, "eval_steps_per_second": 5.124, "step": 6030 },
{ "epoch": 5.088052820308498, "grad_norm": 0.24522745609283447, "learning_rate": 0.0006396039603960396, "loss": 1.3692, "step": 6040 },
{ "epoch": 5.096476748818943, "grad_norm": 0.3076081871986389, "learning_rate": 0.0006386138613861386, "loss": 1.3724, "step": 6050 },
{ "epoch": 5.104900677329387, "grad_norm": 0.32096347212791443, "learning_rate": 0.0006376237623762377, "loss": 1.3737, "step": 6060 },
{ "epoch": 5.113324605839831, "grad_norm": 0.35196197032928467, "learning_rate": 0.0006366336633663366, "loss": 1.3719, "step": 6070 },
{ "epoch": 5.121748534350276, "grad_norm": 0.39065635204315186, "learning_rate": 0.0006356435643564357, "loss": 1.3719, "step": 6080 },
{ "epoch": 5.13017246286072, "grad_norm": 0.3439326882362366, "learning_rate": 0.0006346534653465346, "loss": 1.3749, "step": 6090 },
{ "epoch": 5.138596391371165, "grad_norm": 0.3175961673259735, "learning_rate": 0.0006336633663366337, "loss": 1.3679, "step": 6100 },
{ "epoch": 5.14702031988161, "grad_norm": 0.37071719765663147, "learning_rate": 0.0006326732673267327, "loss": 1.3706, "step": 6110 },
{ "epoch": 5.155444248392055, "grad_norm": 0.2499271035194397, "learning_rate": 0.0006316831683168317, "loss": 1.3685, "step": 6120 },
{ "epoch": 5.155444248392055, "eval_accuracy": 0.717981203712741, "eval_loss": 1.2521748542785645, "eval_runtime": 885.5528, "eval_samples_per_second": 563.914, "eval_steps_per_second": 5.222, "step": 6120 },
{ "epoch": 5.163868176902499, "grad_norm": 0.3951607346534729, "learning_rate": 0.0006306930693069307, "loss": 1.3671, "step": 6130 },
{ "epoch": 5.172292105412943, "grad_norm": 0.4264112114906311, "learning_rate": 0.0006297029702970297, "loss": 1.3652, "step": 6140 },
{ "epoch": 5.180716033923388, "grad_norm": 0.3097785711288452, "learning_rate": 0.0006287128712871287, "loss": 1.3695, "step": 6150 },
{ "epoch": 5.189139962433832, "grad_norm": 0.28887125849723816, "learning_rate": 0.0006277227722772278, "loss": 1.3658, "step": 6160 },
{ "epoch": 5.197563890944277, "grad_norm": 0.27163591980934143, "learning_rate": 0.0006267326732673267, "loss": 1.3655, "step": 6170 },
{ "epoch": 5.205987819454721, "grad_norm": 0.30266183614730835, "learning_rate": 0.0006257425742574258, "loss": 1.3631, "step": 6180 },
{ "epoch": 5.2144117479651655, "grad_norm": 0.3191784620285034, "learning_rate": 0.0006247524752475247,
|
"loss": 1.3667, |
|
"step": 6190 |
|
}, |
|
{ |
|
"epoch": 5.2228356764756105, |
|
"grad_norm": 0.30907300114631653, |
|
"learning_rate": 0.0006237623762376238, |
|
"loss": 1.3667, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 5.231259604986055, |
|
"grad_norm": 0.3120558559894562, |
|
"learning_rate": 0.0006227722772277228, |
|
"loss": 1.3638, |
|
"step": 6210 |
|
}, |
|
{ |
|
"epoch": 5.231259604986055, |
|
"eval_accuracy": 0.7190249020483522, |
|
"eval_loss": 1.2470471858978271, |
|
"eval_runtime": 893.7706, |
|
"eval_samples_per_second": 558.73, |
|
"eval_steps_per_second": 5.174, |
|
"step": 6210 |
|
}, |
|
{ |
|
"epoch": 5.2396835334965, |
|
"grad_norm": 0.35595396161079407, |
|
"learning_rate": 0.0006217821782178218, |
|
"loss": 1.3634, |
|
"step": 6220 |
|
}, |
|
{ |
|
"epoch": 5.248107462006944, |
|
"grad_norm": 0.33759573101997375, |
|
"learning_rate": 0.0006207920792079208, |
|
"loss": 1.3661, |
|
"step": 6230 |
|
}, |
|
{ |
|
"epoch": 5.256531390517389, |
|
"grad_norm": 0.26417672634124756, |
|
"learning_rate": 0.0006198019801980198, |
|
"loss": 1.3627, |
|
"step": 6240 |
|
}, |
|
{ |
|
"epoch": 5.264955319027833, |
|
"grad_norm": 0.28236111998558044, |
|
"learning_rate": 0.0006188118811881188, |
|
"loss": 1.362, |
|
"step": 6250 |
|
}, |
|
{ |
|
"epoch": 5.273379247538277, |
|
"grad_norm": 0.5903481245040894, |
|
"learning_rate": 0.0006178217821782179, |
|
"loss": 1.3619, |
|
"step": 6260 |
|
}, |
|
{ |
|
"epoch": 5.281803176048722, |
|
"grad_norm": 0.298475056886673, |
|
"learning_rate": 0.0006168316831683168, |
|
"loss": 1.3671, |
|
"step": 6270 |
|
}, |
|
{ |
|
"epoch": 5.2902271045591664, |
|
"grad_norm": 0.27397215366363525, |
|
"learning_rate": 0.0006158415841584159, |
|
"loss": 1.3611, |
|
"step": 6280 |
|
}, |
|
{ |
|
"epoch": 5.2986510330696115, |
|
"grad_norm": 0.28740593791007996, |
|
"learning_rate": 0.0006148514851485148, |
|
"loss": 1.3579, |
|
"step": 6290 |
|
}, |
|
{ |
|
"epoch": 5.307074961580056, |
|
"grad_norm": 0.274557888507843, |
|
"learning_rate": 0.0006138613861386139, |
|
"loss": 1.3587, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 5.307074961580056, |
|
"eval_accuracy": 0.719703789624826, |
|
"eval_loss": 1.2432972192764282, |
|
"eval_runtime": 881.2394, |
|
"eval_samples_per_second": 566.675, |
|
"eval_steps_per_second": 5.247, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 5.315498890090501, |
|
"grad_norm": 0.31431418657302856, |
|
"learning_rate": 0.0006128712871287129, |
|
"loss": 1.3565, |
|
"step": 6310 |
|
}, |
|
{ |
|
"epoch": 5.323922818600945, |
|
"grad_norm": 0.358239084482193, |
|
"learning_rate": 0.0006118811881188119, |
|
"loss": 1.3614, |
|
"step": 6320 |
|
}, |
|
{ |
|
"epoch": 5.332346747111389, |
|
"grad_norm": 0.3043140769004822, |
|
"learning_rate": 0.0006108910891089109, |
|
"loss": 1.3576, |
|
"step": 6330 |
|
}, |
|
{ |
|
"epoch": 5.340770675621834, |
|
"grad_norm": 0.2583385109901428, |
|
"learning_rate": 0.0006099009900990099, |
|
"loss": 1.3578, |
|
"step": 6340 |
|
}, |
|
{ |
|
"epoch": 5.349194604132278, |
|
"grad_norm": 0.3068407475948334, |
|
"learning_rate": 0.0006089108910891089, |
|
"loss": 1.3577, |
|
"step": 6350 |
|
}, |
|
{ |
|
"epoch": 5.357618532642723, |
|
"grad_norm": 0.2893878221511841, |
|
"learning_rate": 0.000607920792079208, |
|
"loss": 1.3569, |
|
"step": 6360 |
|
}, |
|
{ |
|
"epoch": 5.366042461153167, |
|
"grad_norm": 0.2883850634098053, |
|
"learning_rate": 0.0006069306930693069, |
|
"loss": 1.3555, |
|
"step": 6370 |
|
}, |
|
{ |
|
"epoch": 5.3744663896636125, |
|
"grad_norm": 0.3248838484287262, |
|
"learning_rate": 0.000605940594059406, |
|
"loss": 1.3561, |
|
"step": 6380 |
|
}, |
|
{ |
|
"epoch": 5.382890318174057, |
|
"grad_norm": 0.29167214035987854, |
|
"learning_rate": 0.0006049504950495049, |
|
"loss": 1.3582, |
|
"step": 6390 |
|
}, |
|
{ |
|
"epoch": 5.382890318174057, |
|
"eval_accuracy": 0.7203339064191229, |
|
"eval_loss": 1.241172432899475, |
|
"eval_runtime": 891.2006, |
|
"eval_samples_per_second": 560.341, |
|
"eval_steps_per_second": 5.189, |
|
"step": 6390 |
|
}, |
|
{ |
|
"epoch": 5.391314246684501, |
|
"grad_norm": 0.3090030550956726, |
|
"learning_rate": 0.000603960396039604, |
|
"loss": 1.3534, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 5.399738175194946, |
|
"grad_norm": 0.25337210297584534, |
|
"learning_rate": 0.000602970297029703, |
|
"loss": 1.3564, |
|
"step": 6410 |
|
}, |
|
{ |
|
"epoch": 5.40816210370539, |
|
"grad_norm": 0.25656768679618835, |
|
"learning_rate": 0.000601980198019802, |
|
"loss": 1.3549, |
|
"step": 6420 |
|
}, |
|
{ |
|
"epoch": 5.416586032215835, |
|
"grad_norm": 0.2951459288597107, |
|
"learning_rate": 0.000600990099009901, |
|
"loss": 1.3518, |
|
"step": 6430 |
|
}, |
|
{ |
|
"epoch": 5.425009960726279, |
|
"grad_norm": 0.2697450816631317, |
|
"learning_rate": 0.0006, |
|
"loss": 1.3531, |
|
"step": 6440 |
|
}, |
|
{ |
|
"epoch": 5.433433889236724, |
|
"grad_norm": 0.28866857290267944, |
|
"learning_rate": 0.000599009900990099, |
|
"loss": 1.3524, |
|
"step": 6450 |
|
}, |
|
{ |
|
"epoch": 5.441857817747168, |
|
"grad_norm": 0.26775673031806946, |
|
"learning_rate": 0.000598019801980198, |
|
"loss": 1.3505, |
|
"step": 6460 |
|
}, |
|
{ |
|
"epoch": 5.4502817462576125, |
|
"grad_norm": 0.3911271393299103, |
|
"learning_rate": 0.000597029702970297, |
|
"loss": 1.3516, |
|
"step": 6470 |
|
}, |
|
{ |
|
"epoch": 5.458705674768058, |
|
"grad_norm": 0.3151527941226959, |
|
"learning_rate": 0.000596039603960396, |
|
"loss": 1.353, |
|
"step": 6480 |
|
}, |
|
{ |
|
"epoch": 5.458705674768058, |
|
"eval_accuracy": 0.7213715986510872, |
|
"eval_loss": 1.2357591390609741, |
|
"eval_runtime": 888.8097, |
|
"eval_samples_per_second": 561.848, |
|
"eval_steps_per_second": 5.202, |
|
"step": 6480 |
|
}, |
|
{ |
|
"epoch": 5.467129603278502, |
|
"grad_norm": 0.32286888360977173, |
|
"learning_rate": 0.000595049504950495, |
|
"loss": 1.3527, |
|
"step": 6490 |
|
}, |
|
{ |
|
"epoch": 5.475553531788947, |
|
"grad_norm": 0.3933228850364685, |
|
"learning_rate": 0.000594059405940594, |
|
"loss": 1.3511, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 5.483977460299391, |
|
"grad_norm": 0.3246067762374878, |
|
"learning_rate": 0.0005930693069306931, |
|
"loss": 1.3524, |
|
"step": 6510 |
|
}, |
|
{ |
|
"epoch": 5.492401388809835, |
|
"grad_norm": 0.2912397086620331, |
|
"learning_rate": 0.000592079207920792, |
|
"loss": 1.3495, |
|
"step": 6520 |
|
}, |
|
{ |
|
"epoch": 5.50082531732028, |
|
"grad_norm": 0.3058258891105652, |
|
"learning_rate": 0.0005910891089108911, |
|
"loss": 1.3486, |
|
"step": 6530 |
|
}, |
|
{ |
|
"epoch": 5.509249245830724, |
|
"grad_norm": 0.310024231672287, |
|
"learning_rate": 0.00059009900990099, |
|
"loss": 1.3507, |
|
"step": 6540 |
|
}, |
|
{ |
|
"epoch": 5.517673174341169, |
|
"grad_norm": 0.289165198802948, |
|
"learning_rate": 0.0005891089108910891, |
|
"loss": 1.3475, |
|
"step": 6550 |
|
}, |
|
{ |
|
"epoch": 5.5260971028516135, |
|
"grad_norm": 0.324613094329834, |
|
"learning_rate": 0.0005881188118811881, |
|
"loss": 1.3489, |
|
"step": 6560 |
|
}, |
|
{ |
|
"epoch": 5.5345210313620585, |
|
"grad_norm": 0.3530217111110687, |
|
"learning_rate": 0.0005871287128712871, |
|
"loss": 1.3477, |
|
"step": 6570 |
|
}, |
|
{ |
|
"epoch": 5.5345210313620585, |
|
"eval_accuracy": 0.722217175302605, |
|
"eval_loss": 1.2293946743011475, |
|
"eval_runtime": 881.4092, |
|
"eval_samples_per_second": 566.565, |
|
"eval_steps_per_second": 5.246, |
|
"step": 6570 |
|
}, |
|
{ |
|
"epoch": 5.542944959872503, |
|
"grad_norm": 0.3527272045612335, |
|
"learning_rate": 0.0005861386138613861, |
|
"loss": 1.3447, |
|
"step": 6580 |
|
}, |
|
{ |
|
"epoch": 5.551368888382948, |
|
"grad_norm": 0.26519855856895447, |
|
"learning_rate": 0.0005851485148514851, |
|
"loss": 1.346, |
|
"step": 6590 |
|
}, |
|
{ |
|
"epoch": 5.559792816893392, |
|
"grad_norm": 0.29473376274108887, |
|
"learning_rate": 0.0005841584158415841, |
|
"loss": 1.3461, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 5.568216745403836, |
|
"grad_norm": 0.31212469935417175, |
|
"learning_rate": 0.0005831683168316832, |
|
"loss": 1.3454, |
|
"step": 6610 |
|
}, |
|
{ |
|
"epoch": 5.576640673914281, |
|
"grad_norm": 0.2541083097457886, |
|
"learning_rate": 0.0005821782178217821, |
|
"loss": 1.3451, |
|
"step": 6620 |
|
}, |
|
{ |
|
"epoch": 5.585064602424725, |
|
"grad_norm": 0.28075823187828064, |
|
"learning_rate": 0.0005811881188118812, |
|
"loss": 1.3417, |
|
"step": 6630 |
|
}, |
|
{ |
|
"epoch": 5.59348853093517, |
|
"grad_norm": 0.286945641040802, |
|
"learning_rate": 0.0005801980198019801, |
|
"loss": 1.3439, |
|
"step": 6640 |
|
}, |
|
{ |
|
"epoch": 5.6019124594456144, |
|
"grad_norm": 0.2825601100921631, |
|
"learning_rate": 0.0005792079207920792, |
|
"loss": 1.3447, |
|
"step": 6650 |
|
}, |
|
{ |
|
"epoch": 5.610336387956059, |
|
"grad_norm": 0.3023243844509125, |
|
"learning_rate": 0.0005782178217821782, |
|
"loss": 1.3428, |
|
"step": 6660 |
|
}, |
|
{ |
|
"epoch": 5.610336387956059, |
|
"eval_accuracy": 0.7226627197479346, |
|
"eval_loss": 1.2287484407424927, |
|
"eval_runtime": 893.8585, |
|
"eval_samples_per_second": 558.675, |
|
"eval_steps_per_second": 5.173, |
|
"step": 6660 |
|
}, |
|
{ |
|
"epoch": 5.618760316466504, |
|
"grad_norm": 0.2548897862434387, |
|
"learning_rate": 0.0005772277227722772, |
|
"loss": 1.3441, |
|
"step": 6670 |
|
}, |
|
{ |
|
"epoch": 5.627184244976948, |
|
"grad_norm": 0.28277119994163513, |
|
"learning_rate": 0.0005762376237623762, |
|
"loss": 1.3421, |
|
"step": 6680 |
|
}, |
|
{ |
|
"epoch": 5.635608173487393, |
|
"grad_norm": 0.35963568091392517, |
|
"learning_rate": 0.0005752475247524752, |
|
"loss": 1.3421, |
|
"step": 6690 |
|
}, |
|
{ |
|
"epoch": 5.644032101997837, |
|
"grad_norm": 0.2753046452999115, |
|
"learning_rate": 0.0005742574257425742, |
|
"loss": 1.3449, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 5.652456030508281, |
|
"grad_norm": 0.31272053718566895, |
|
"learning_rate": 0.0005732673267326733, |
|
"loss": 1.3418, |
|
"step": 6710 |
|
}, |
|
{ |
|
"epoch": 5.660879959018726, |
|
"grad_norm": 0.24427007138729095, |
|
"learning_rate": 0.0005722772277227722, |
|
"loss": 1.3409, |
|
"step": 6720 |
|
}, |
|
{ |
|
"epoch": 5.66930388752917, |
|
"grad_norm": 0.4038189649581909, |
|
"learning_rate": 0.0005712871287128713, |
|
"loss": 1.3387, |
|
"step": 6730 |
|
}, |
|
{ |
|
"epoch": 5.677727816039615, |
|
"grad_norm": 0.30009007453918457, |
|
"learning_rate": 0.0005702970297029702, |
|
"loss": 1.3425, |
|
"step": 6740 |
|
}, |
|
{ |
|
"epoch": 5.68615174455006, |
|
"grad_norm": 0.2813461720943451, |
|
"learning_rate": 0.0005693069306930693, |
|
"loss": 1.3396, |
|
"step": 6750 |
|
}, |
|
{ |
|
"epoch": 5.68615174455006, |
|
"eval_accuracy": 0.7239226758241876, |
|
"eval_loss": 1.2240657806396484, |
|
"eval_runtime": 898.7215, |
|
"eval_samples_per_second": 555.652, |
|
"eval_steps_per_second": 5.145, |
|
"step": 6750 |
|
}, |
|
{ |
|
"epoch": 5.694575673060505, |
|
"grad_norm": 0.4396764039993286, |
|
"learning_rate": 0.0005683168316831683, |
|
"loss": 1.3408, |
|
"step": 6760 |
|
}, |
|
{ |
|
"epoch": 5.702999601570949, |
|
"grad_norm": 0.2992042899131775, |
|
"learning_rate": 0.0005673267326732673, |
|
"loss": 1.3408, |
|
"step": 6770 |
|
}, |
|
{ |
|
"epoch": 5.711423530081394, |
|
"grad_norm": 0.2579440474510193, |
|
"learning_rate": 0.0005663366336633663, |
|
"loss": 1.3369, |
|
"step": 6780 |
|
}, |
|
{ |
|
"epoch": 5.719847458591838, |
|
"grad_norm": 0.32076653838157654, |
|
"learning_rate": 0.0005653465346534653, |
|
"loss": 1.3365, |
|
"step": 6790 |
|
}, |
|
{ |
|
"epoch": 5.728271387102282, |
|
"grad_norm": 0.3180268108844757, |
|
"learning_rate": 0.0005643564356435643, |
|
"loss": 1.339, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 5.736695315612727, |
|
"grad_norm": 0.27663713693618774, |
|
"learning_rate": 0.0005633663366336634, |
|
"loss": 1.3373, |
|
"step": 6810 |
|
}, |
|
{ |
|
"epoch": 5.745119244123171, |
|
"grad_norm": 0.27103811502456665, |
|
"learning_rate": 0.0005623762376237624, |
|
"loss": 1.3332, |
|
"step": 6820 |
|
}, |
|
{ |
|
"epoch": 5.753543172633616, |
|
"grad_norm": 0.34022676944732666, |
|
"learning_rate": 0.0005613861386138615, |
|
"loss": 1.3373, |
|
"step": 6830 |
|
}, |
|
{ |
|
"epoch": 5.7619671011440605, |
|
"grad_norm": 0.36838725209236145, |
|
"learning_rate": 0.0005603960396039604, |
|
"loss": 1.3384, |
|
"step": 6840 |
|
}, |
|
{ |
|
"epoch": 5.7619671011440605, |
|
"eval_accuracy": 0.7243312842270887, |
|
"eval_loss": 1.221815586090088, |
|
"eval_runtime": 891.7897, |
|
"eval_samples_per_second": 559.971, |
|
"eval_steps_per_second": 5.185, |
|
"step": 6840 |
|
}, |
|
{ |
|
"epoch": 5.770391029654505, |
|
"grad_norm": 0.2968374490737915, |
|
"learning_rate": 0.0005594059405940595, |
|
"loss": 1.3353, |
|
"step": 6850 |
|
}, |
|
{ |
|
"epoch": 5.77881495816495, |
|
"grad_norm": 0.36536258459091187, |
|
"learning_rate": 0.0005584158415841585, |
|
"loss": 1.3331, |
|
"step": 6860 |
|
}, |
|
{ |
|
"epoch": 5.787238886675394, |
|
"grad_norm": 0.2985541522502899, |
|
"learning_rate": 0.0005574257425742575, |
|
"loss": 1.3313, |
|
"step": 6870 |
|
}, |
|
{ |
|
"epoch": 5.795662815185839, |
|
"grad_norm": 0.33506348729133606, |
|
"learning_rate": 0.0005564356435643565, |
|
"loss": 1.3349, |
|
"step": 6880 |
|
}, |
|
{ |
|
"epoch": 5.804086743696283, |
|
"grad_norm": 0.31232866644859314, |
|
"learning_rate": 0.0005554455445544555, |
|
"loss": 1.3335, |
|
"step": 6890 |
|
}, |
|
{ |
|
"epoch": 5.812510672206728, |
|
"grad_norm": 0.27576977014541626, |
|
"learning_rate": 0.0005544554455445545, |
|
"loss": 1.3309, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 5.820934600717172, |
|
"grad_norm": 0.2526339590549469, |
|
"learning_rate": 0.0005534653465346536, |
|
"loss": 1.3318, |
|
"step": 6910 |
|
}, |
|
{ |
|
"epoch": 5.829358529227616, |
|
"grad_norm": 0.25774866342544556, |
|
"learning_rate": 0.0005524752475247525, |
|
"loss": 1.3329, |
|
"step": 6920 |
|
}, |
|
{ |
|
"epoch": 5.8377824577380615, |
|
"grad_norm": 0.34311917424201965, |
|
"learning_rate": 0.0005514851485148516, |
|
"loss": 1.3334, |
|
"step": 6930 |
|
}, |
|
{ |
|
"epoch": 5.8377824577380615, |
|
"eval_accuracy": 0.7251374384748042, |
|
"eval_loss": 1.216299057006836, |
|
"eval_runtime": 889.6984, |
|
"eval_samples_per_second": 561.287, |
|
"eval_steps_per_second": 5.197, |
|
"step": 6930 |
|
}, |
|
{ |
|
"epoch": 5.846206386248506, |
|
"grad_norm": 0.32087624073028564, |
|
"learning_rate": 0.0005504950495049505, |
|
"loss": 1.3338, |
|
"step": 6940 |
|
}, |
|
{ |
|
"epoch": 5.854630314758951, |
|
"grad_norm": 0.25447556376457214, |
|
"learning_rate": 0.0005495049504950496, |
|
"loss": 1.3315, |
|
"step": 6950 |
|
}, |
|
{ |
|
"epoch": 5.863054243269395, |
|
"grad_norm": 0.285826712846756, |
|
"learning_rate": 0.0005485148514851486, |
|
"loss": 1.3303, |
|
"step": 6960 |
|
}, |
|
{ |
|
"epoch": 5.87147817177984, |
|
"grad_norm": 0.2816094756126404, |
|
"learning_rate": 0.0005475247524752476, |
|
"loss": 1.3308, |
|
"step": 6970 |
|
}, |
|
{ |
|
"epoch": 5.879902100290284, |
|
"grad_norm": 0.30444055795669556, |
|
"learning_rate": 0.0005465346534653466, |
|
"loss": 1.3303, |
|
"step": 6980 |
|
}, |
|
{ |
|
"epoch": 5.888326028800728, |
|
"grad_norm": 0.3512563705444336, |
|
"learning_rate": 0.0005455445544554456, |
|
"loss": 1.3305, |
|
"step": 6990 |
|
}, |
|
{ |
|
"epoch": 5.896749957311173, |
|
"grad_norm": 0.2924775779247284, |
|
"learning_rate": 0.0005445544554455446, |
|
"loss": 1.3307, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 5.905173885821617, |
|
"grad_norm": 0.3497087359428406, |
|
"learning_rate": 0.0005435643564356437, |
|
"loss": 1.3295, |
|
"step": 7010 |
|
}, |
|
{ |
|
"epoch": 5.913597814332062, |
|
"grad_norm": 0.2714064419269562, |
|
"learning_rate": 0.0005425742574257426, |
|
"loss": 1.329, |
|
"step": 7020 |
|
}, |
|
{ |
|
"epoch": 5.913597814332062, |
|
"eval_accuracy": 0.7261800107692413, |
|
"eval_loss": 1.2115275859832764, |
|
"eval_runtime": 893.0627, |
|
"eval_samples_per_second": 559.172, |
|
"eval_steps_per_second": 5.178, |
|
"step": 7020 |
|
}, |
|
{ |
|
"epoch": 5.922021742842507, |
|
"grad_norm": 0.277203232049942, |
|
"learning_rate": 0.0005415841584158417, |
|
"loss": 1.3269, |
|
"step": 7030 |
|
}, |
|
{ |
|
"epoch": 5.930445671352951, |
|
"grad_norm": 0.3769485354423523, |
|
"learning_rate": 0.0005405940594059406, |
|
"loss": 1.3268, |
|
"step": 7040 |
|
}, |
|
{ |
|
"epoch": 5.938869599863396, |
|
"grad_norm": 0.2526576817035675, |
|
"learning_rate": 0.0005396039603960396, |
|
"loss": 1.3262, |
|
"step": 7050 |
|
}, |
|
{ |
|
"epoch": 5.94729352837384, |
|
"grad_norm": 0.2670144736766815, |
|
"learning_rate": 0.0005386138613861387, |
|
"loss": 1.327, |
|
"step": 7060 |
|
}, |
|
{ |
|
"epoch": 5.955717456884285, |
|
"grad_norm": 0.26662877202033997, |
|
"learning_rate": 0.0005376237623762376, |
|
"loss": 1.3277, |
|
"step": 7070 |
|
}, |
|
{ |
|
"epoch": 5.964141385394729, |
|
"grad_norm": 0.3263689875602722, |
|
"learning_rate": 0.0005366336633663367, |
|
"loss": 1.3271, |
|
"step": 7080 |
|
}, |
|
{ |
|
"epoch": 5.972565313905174, |
|
"grad_norm": 0.26732614636421204, |
|
"learning_rate": 0.0005356435643564356, |
|
"loss": 1.3264, |
|
"step": 7090 |
|
}, |
|
{ |
|
"epoch": 5.980989242415618, |
|
"grad_norm": 0.3332139551639557, |
|
"learning_rate": 0.0005346534653465347, |
|
"loss": 1.3266, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 5.989413170926063, |
|
"grad_norm": 0.3081839680671692, |
|
"learning_rate": 0.0005336633663366337, |
|
"loss": 1.325, |
|
"step": 7110 |
|
}, |
|
{ |
|
"epoch": 5.989413170926063, |
|
"eval_accuracy": 0.7263082386708871, |
|
"eval_loss": 1.2105002403259277, |
|
"eval_runtime": 893.0055, |
|
"eval_samples_per_second": 559.208, |
|
"eval_steps_per_second": 5.178, |
|
"step": 7110 |
|
}, |
|
{ |
|
"epoch": 5.997837099436508, |
|
"grad_norm": 0.2502419650554657, |
|
"learning_rate": 0.0005326732673267327, |
|
"loss": 1.3263, |
|
"step": 7120 |
|
}, |
|
{ |
|
"epoch": 6.006261027946952, |
|
"grad_norm": 0.2437312752008438, |
|
"learning_rate": 0.0005316831683168317, |
|
"loss": 1.3225, |
|
"step": 7130 |
|
}, |
|
{ |
|
"epoch": 6.014684956457397, |
|
"grad_norm": 0.3372795581817627, |
|
"learning_rate": 0.0005306930693069307, |
|
"loss": 1.3234, |
|
"step": 7140 |
|
}, |
|
{ |
|
"epoch": 6.023108884967841, |
|
"grad_norm": 0.2895912826061249, |
|
"learning_rate": 0.0005297029702970297, |
|
"loss": 1.3252, |
|
"step": 7150 |
|
}, |
|
{ |
|
"epoch": 6.031532813478286, |
|
"grad_norm": 0.28451213240623474, |
|
"learning_rate": 0.0005287128712871288, |
|
"loss": 1.3238, |
|
"step": 7160 |
|
}, |
|
{ |
|
"epoch": 6.03995674198873, |
|
"grad_norm": 0.2496078759431839, |
|
"learning_rate": 0.0005277227722772277, |
|
"loss": 1.323, |
|
"step": 7170 |
|
}, |
|
{ |
|
"epoch": 6.048380670499174, |
|
"grad_norm": 0.26850923895835876, |
|
"learning_rate": 0.0005267326732673268, |
|
"loss": 1.322, |
|
"step": 7180 |
|
}, |
|
{ |
|
"epoch": 6.056804599009619, |
|
"grad_norm": 0.30225685238838196, |
|
"learning_rate": 0.0005257425742574257, |
|
"loss": 1.3212, |
|
"step": 7190 |
|
}, |
|
{ |
|
"epoch": 6.0652285275200635, |
|
"grad_norm": 0.32349905371665955, |
|
"learning_rate": 0.0005247524752475248, |
|
"loss": 1.3219, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 6.0652285275200635, |
|
"eval_accuracy": 0.727180971273756, |
|
"eval_loss": 1.205489993095398, |
|
"eval_runtime": 890.8938, |
|
"eval_samples_per_second": 560.534, |
|
"eval_steps_per_second": 5.19, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 6.0736524560305085, |
|
"grad_norm": 0.29943209886550903, |
|
"learning_rate": 0.0005237623762376238, |
|
"loss": 1.3182, |
|
"step": 7210 |
|
}, |
|
{ |
|
"epoch": 6.082076384540953, |
|
"grad_norm": 0.30952343344688416, |
|
"learning_rate": 0.0005227722772277228, |
|
"loss": 1.3194, |
|
"step": 7220 |
|
}, |
|
{ |
|
"epoch": 6.090500313051398, |
|
"grad_norm": 0.3158267140388489, |
|
"learning_rate": 0.0005217821782178218, |
|
"loss": 1.319, |
|
"step": 7230 |
|
}, |
|
{ |
|
"epoch": 6.098924241561842, |
|
"grad_norm": 0.27009105682373047, |
|
"learning_rate": 0.0005207920792079208, |
|
"loss": 1.3212, |
|
"step": 7240 |
|
}, |
|
{ |
|
"epoch": 6.107348170072286, |
|
"grad_norm": 0.2660143971443176, |
|
"learning_rate": 0.0005198019801980198, |
|
"loss": 1.3181, |
|
"step": 7250 |
|
}, |
|
{ |
|
"epoch": 6.115772098582731, |
|
"grad_norm": 0.32289671897888184, |
|
"learning_rate": 0.0005188118811881189, |
|
"loss": 1.3166, |
|
"step": 7260 |
|
}, |
|
{ |
|
"epoch": 6.124196027093175, |
|
"grad_norm": 0.301577627658844, |
|
"learning_rate": 0.0005178217821782178, |
|
"loss": 1.3215, |
|
"step": 7270 |
|
}, |
|
{ |
|
"epoch": 6.13261995560362, |
|
"grad_norm": 0.26539114117622375, |
|
"learning_rate": 0.0005168316831683169, |
|
"loss": 1.3173, |
|
"step": 7280 |
|
}, |
|
{ |
|
"epoch": 6.141043884114064, |
|
"grad_norm": 0.30636703968048096, |
|
"learning_rate": 0.0005158415841584158, |
|
"loss": 1.319, |
|
"step": 7290 |
|
}, |
|
{ |
|
"epoch": 6.141043884114064, |
|
"eval_accuracy": 0.7278776618882268, |
|
"eval_loss": 1.2021031379699707, |
|
"eval_runtime": 893.3533, |
|
"eval_samples_per_second": 558.99, |
|
"eval_steps_per_second": 5.176, |
|
"step": 7290 |
|
}, |
|
{ |
|
"epoch": 6.1494678126245095, |
|
"grad_norm": 0.2906350791454315, |
|
"learning_rate": 0.0005148514851485149, |
|
"loss": 1.3177, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 6.157891741134954, |
|
"grad_norm": 0.33962422609329224, |
|
"learning_rate": 0.0005138613861386139, |
|
"loss": 1.3173, |
|
"step": 7310 |
|
}, |
|
{ |
|
"epoch": 6.166315669645398, |
|
"grad_norm": 0.29772093892097473, |
|
"learning_rate": 0.0005128712871287129, |
|
"loss": 1.3194, |
|
"step": 7320 |
|
}, |
|
{ |
|
"epoch": 6.174739598155843, |
|
"grad_norm": 0.27262043952941895, |
|
"learning_rate": 0.0005118811881188119, |
|
"loss": 1.3159, |
|
"step": 7330 |
|
}, |
|
{ |
|
"epoch": 6.183163526666287, |
|
"grad_norm": 0.2678314745426178, |
|
"learning_rate": 0.0005108910891089109, |
|
"loss": 1.3167, |
|
"step": 7340 |
|
}, |
|
{ |
|
"epoch": 6.191587455176732, |
|
"grad_norm": 0.3115740716457367, |
|
"learning_rate": 0.0005099009900990099, |
|
"loss": 1.3142, |
|
"step": 7350 |
|
}, |
|
{ |
|
"epoch": 6.200011383687176, |
|
"grad_norm": 0.2983403205871582, |
|
"learning_rate": 0.000508910891089109, |
|
"loss": 1.3158, |
|
"step": 7360 |
|
}, |
|
{ |
|
"epoch": 6.208435312197621, |
|
"grad_norm": 0.2797269821166992, |
|
"learning_rate": 0.0005079207920792079, |
|
"loss": 1.3163, |
|
"step": 7370 |
|
}, |
|
{ |
|
"epoch": 6.216859240708065, |
|
"grad_norm": 0.29581907391548157, |
|
"learning_rate": 0.000506930693069307, |
|
"loss": 1.3156, |
|
"step": 7380 |
|
}, |
|
{ |
|
"epoch": 6.216859240708065, |
|
"eval_accuracy": 0.7285335214596267, |
|
"eval_loss": 1.1984630823135376, |
|
"eval_runtime": 881.1088, |
|
"eval_samples_per_second": 566.759, |
|
"eval_steps_per_second": 5.248, |
|
"step": 7380 |
|
}, |
|
{ |
|
"epoch": 6.2252831692185095, |
|
"grad_norm": 0.2843240797519684, |
|
"learning_rate": 0.0005059405940594059, |
|
"loss": 1.3162, |
|
"step": 7390 |
|
}, |
|
{ |
|
"epoch": 6.233707097728955, |
|
"grad_norm": 0.2662515938282013, |
|
"learning_rate": 0.000504950495049505, |
|
"loss": 1.314, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 6.242131026239399, |
|
"grad_norm": 0.3370913565158844, |
|
"learning_rate": 0.000503960396039604, |
|
"loss": 1.3136, |
|
"step": 7410 |
|
}, |
|
{ |
|
"epoch": 6.250554954749844, |
|
"grad_norm": 0.29014459252357483, |
|
"learning_rate": 0.000502970297029703, |
|
"loss": 1.3127, |
|
"step": 7420 |
|
}, |
|
{ |
|
"epoch": 6.258978883260288, |
|
"grad_norm": 0.2779816687107086, |
|
"learning_rate": 0.000501980198019802, |
|
"loss": 1.3137, |
|
"step": 7430 |
|
}, |
|
{ |
|
"epoch": 6.267402811770733, |
|
"grad_norm": 0.2942447066307068, |
|
"learning_rate": 0.000500990099009901, |
|
"loss": 1.3138, |
|
"step": 7440 |
|
}, |
|
{ |
|
"epoch": 6.275826740281177, |
|
"grad_norm": 0.3536125719547272, |
|
"learning_rate": 0.0005, |
|
"loss": 1.3135, |
|
"step": 7450 |
|
}, |
|
{ |
|
"epoch": 6.284250668791621, |
|
"grad_norm": 0.29686686396598816, |
|
"learning_rate": 0.0004990099009900991, |
|
"loss": 1.3129, |
|
"step": 7460 |
|
}, |
|
{ |
|
"epoch": 6.292674597302066, |
|
"grad_norm": 0.30590084195137024, |
|
"learning_rate": 0.000498019801980198, |
|
"loss": 1.3114, |
|
"step": 7470 |
|
}, |
|
{ |
|
"epoch": 6.292674597302066, |
|
"eval_accuracy": 0.7293452386458654, |
|
"eval_loss": 1.1951327323913574, |
|
"eval_runtime": 893.3348, |
|
"eval_samples_per_second": 559.002, |
|
"eval_steps_per_second": 5.176, |
|
"step": 7470 |
|
}, |
|
{ |
|
"epoch": 6.3010985258125105, |
|
"grad_norm": 0.2687655985355377, |
|
"learning_rate": 0.0004970297029702971, |
|
"loss": 1.3125, |
|
"step": 7480 |
|
}, |
|
{ |
|
"epoch": 6.3095224543229556, |
|
"grad_norm": 0.31057268381118774, |
|
"learning_rate": 0.000496039603960396, |
|
"loss": 1.3106, |
|
"step": 7490 |
|
}, |
|
{ |
|
"epoch": 6.3179463828334, |
|
"grad_norm": 0.3097970187664032, |
|
"learning_rate": 0.0004950495049504951, |
|
"loss": 1.31, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 6.326370311343844, |
|
"grad_norm": 0.28469330072402954, |
|
"learning_rate": 0.0004940594059405941, |
|
"loss": 1.3098, |
|
"step": 7510 |
|
}, |
|
{ |
|
"epoch": 6.334794239854289, |
|
"grad_norm": 0.2911768853664398, |
|
"learning_rate": 0.000493069306930693, |
|
"loss": 1.3103, |
|
"step": 7520 |
|
}, |
|
{ |
|
"epoch": 6.343218168364733, |
|
"grad_norm": 0.2990330755710602, |
|
"learning_rate": 0.0004920792079207921, |
|
"loss": 1.3108, |
|
"step": 7530 |
|
}, |
|
{ |
|
"epoch": 6.351642096875178, |
|
"grad_norm": 0.2908383905887604, |
|
"learning_rate": 0.000491089108910891, |
|
"loss": 1.3092, |
|
"step": 7540 |
|
}, |
|
{ |
|
"epoch": 6.360066025385622, |
|
"grad_norm": 0.306233674287796, |
|
"learning_rate": 0.0004900990099009901, |
|
"loss": 1.3107, |
|
"step": 7550 |
|
}, |
|
{ |
|
"epoch": 6.368489953896067, |
|
"grad_norm": 0.2749456465244293, |
|
"learning_rate": 0.0004891089108910892, |
|
"loss": 1.3073, |
|
"step": 7560 |
|
}, |
|
{ |
|
"epoch": 6.368489953896067, |
|
"eval_accuracy": 0.7300212582744398, |
|
"eval_loss": 1.1918327808380127, |
|
"eval_runtime": 886.4778, |
|
"eval_samples_per_second": 563.326, |
|
"eval_steps_per_second": 5.216, |
|
"step": 7560 |
|
}, |
|
{ |
|
"epoch": 6.3769138824065115, |
|
"grad_norm": 0.2799837291240692, |
|
"learning_rate": 0.0004881188118811881, |
|
"loss": 1.3084, |
|
"step": 7570 |
|
}, |
|
{ |
|
"epoch": 6.385337810916956, |
|
"grad_norm": 0.3050614893436432, |
|
"learning_rate": 0.00048712871287128715, |
|
"loss": 1.3082, |
|
"step": 7580 |
|
}, |
|
{ |
|
"epoch": 6.393761739427401, |
|
"grad_norm": 0.2900220453739166, |
|
"learning_rate": 0.00048613861386138615, |
|
"loss": 1.3087, |
|
"step": 7590 |
|
}, |
|
{ |
|
"epoch": 6.402185667937845, |
|
"grad_norm": 0.2592508792877197, |
|
"learning_rate": 0.00048514851485148515, |
|
"loss": 1.3082, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 6.41060959644829, |
|
"grad_norm": 0.2503323256969452, |
|
"learning_rate": 0.00048415841584158414, |
|
"loss": 1.3066, |
|
"step": 7610 |
|
}, |
|
{ |
|
"epoch": 6.419033524958734, |
|
"grad_norm": 0.30254074931144714, |
|
"learning_rate": 0.00048316831683168314, |
|
"loss": 1.3079, |
|
"step": 7620 |
|
}, |
|
{ |
|
"epoch": 6.427457453469179, |
|
"grad_norm": 0.28869137167930603, |
|
"learning_rate": 0.0004821782178217822, |
|
"loss": 1.3061, |
|
"step": 7630 |
|
}, |
|
{ |
|
"epoch": 6.435881381979623, |
|
"grad_norm": 0.3226109445095062, |
|
"learning_rate": 0.0004811881188118812, |
|
"loss": 1.3051, |
|
"step": 7640 |
|
}, |
|
{ |
|
"epoch": 6.444305310490067, |
|
"grad_norm": 0.2900817096233368, |
|
"learning_rate": 0.0004801980198019802, |
|
"loss": 1.3062, |
|
"step": 7650 |
|
}, |
|
{ |
|
"epoch": 6.444305310490067, |
|
"eval_accuracy": 0.7304169114350704, |
|
"eval_loss": 1.1914669275283813, |
|
"eval_runtime": 888.5325, |
|
"eval_samples_per_second": 562.023, |
|
"eval_steps_per_second": 5.204, |
|
"step": 7650 |
|
}, |
|
{ |
|
"epoch": 6.452729239000512, |
|
"grad_norm": 0.3235354721546173, |
|
"learning_rate": 0.0004792079207920792, |
|
"loss": 1.3074, |
|
"step": 7660 |
|
}, |
|
{ |
|
"epoch": 6.461153167510957, |
|
"grad_norm": 0.26384827494621277, |
|
"learning_rate": 0.0004782178217821782, |
|
"loss": 1.3052, |
|
"step": 7670 |
|
}, |
|
{ |
|
"epoch": 6.469577096021402, |
|
"grad_norm": 0.27176037430763245, |
|
"learning_rate": 0.00047722772277227724, |
|
"loss": 1.3032, |
|
"step": 7680 |
|
}, |
|
{ |
|
"epoch": 6.478001024531846, |
|
"grad_norm": 0.27846911549568176, |
|
"learning_rate": 0.00047623762376237624, |
|
"loss": 1.3038, |
|
"step": 7690 |
|
}, |
|
{ |
|
"epoch": 6.48642495304229, |
|
"grad_norm": 0.32258498668670654, |
|
"learning_rate": 0.00047524752475247524, |
|
"loss": 1.3052, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 6.494848881552735, |
|
"grad_norm": 0.3000924587249756, |
|
"learning_rate": 0.00047425742574257423, |
|
"loss": 1.3046, |
|
"step": 7710 |
|
}, |
|
{ |
|
"epoch": 6.503272810063179, |
|
"grad_norm": 0.22748370468616486, |
|
"learning_rate": 0.00047326732673267323, |
|
"loss": 1.3054, |
|
"step": 7720 |
|
}, |
|
{ |
|
"epoch": 6.511696738573624, |
|
"grad_norm": 0.3552054464817047, |
|
"learning_rate": 0.0004722772277227723, |
|
"loss": 1.3026, |
|
"step": 7730 |
|
}, |
|
{ |
|
"epoch": 6.520120667084068, |
|
"grad_norm": 0.2629605531692505, |
|
"learning_rate": 0.0004712871287128713, |
|
"loss": 1.3021, |
|
"step": 7740 |
|
}, |
|
{ |
|
"epoch": 6.520120667084068, |
|
"eval_accuracy": 0.7311149976881265, |
|
"eval_loss": 1.1877076625823975, |
|
"eval_runtime": 883.1573, |
|
"eval_samples_per_second": 565.444, |
|
"eval_steps_per_second": 5.236, |
|
"step": 7740 |
|
}, |
|
{ |
|
"epoch": 6.528544595594513, |
|
"grad_norm": 0.31692177057266235, |
|
"learning_rate": 0.0004702970297029703, |
|
"loss": 1.3048, |
|
"step": 7750 |
|
}, |
|
{ |
|
"epoch": 6.5369685241049575, |
|
"grad_norm": 0.3689730167388916, |
|
"learning_rate": 0.0004693069306930693, |
|
"loss": 1.3016, |
|
"step": 7760 |
|
}, |
|
{ |
|
"epoch": 6.545392452615403, |
|
"grad_norm": 0.2619648277759552, |
|
"learning_rate": 0.00046831683168316833, |
|
"loss": 1.3018, |
|
"step": 7770 |
|
}, |
|
{ |
|
"epoch": 6.553816381125847, |
|
"grad_norm": 0.29713907837867737, |
|
"learning_rate": 0.0004673267326732674, |
|
"loss": 1.3007, |
|
"step": 7780 |
|
}, |
|
{ |
|
"epoch": 6.562240309636291, |
|
"grad_norm": 0.3426944315433502, |
|
"learning_rate": 0.0004663366336633664, |
|
"loss": 1.302, |
|
"step": 7790 |
|
}, |
|
{ |
|
"epoch": 6.570664238146736, |
|
"grad_norm": 0.30286312103271484, |
|
"learning_rate": 0.0004653465346534654, |
|
"loss": 1.3024, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 6.57908816665718, |
|
"grad_norm": 0.2533584237098694, |
|
"learning_rate": 0.0004643564356435644, |
|
"loss": 1.2991, |
|
"step": 7810 |
|
}, |
|
{ |
|
"epoch": 6.587512095167625, |
|
"grad_norm": 0.23465867340564728, |
|
"learning_rate": 0.0004633663366336634, |
|
"loss": 1.3007, |
|
"step": 7820 |
|
}, |
|
{ |
|
"epoch": 6.595936023678069, |
|
"grad_norm": 0.31729191541671753, |
|
"learning_rate": 0.00046237623762376243, |
|
"loss": 1.3, |
|
"step": 7830 |
|
}, |
|
{ |
|
"epoch": 6.595936023678069, |
|
"eval_accuracy": 0.7318502985148011, |
|
"eval_loss": 1.1818432807922363, |
|
"eval_runtime": 891.13, |
|
"eval_samples_per_second": 560.385, |
|
"eval_steps_per_second": 5.189, |
|
"step": 7830 |
|
}, |
|
{ |
|
"epoch": 6.6043599521885135, |
|
"grad_norm": 0.26264631748199463, |
|
"learning_rate": 0.00046138613861386143, |
|
"loss": 1.3003, |
|
"step": 7840 |
|
}, |
|
{ |
|
"epoch": 6.6127838806989585, |
|
"grad_norm": 0.26062801480293274, |
|
"learning_rate": 0.0004603960396039604, |
|
"loss": 1.2977, |
|
"step": 7850 |
|
}, |
|
{ |
|
"epoch": 6.621207809209403, |
|
"grad_norm": 0.2755686640739441, |
|
"learning_rate": 0.0004594059405940594, |
|
"loss": 1.2979, |
|
"step": 7860 |
|
}, |
|
{ |
|
"epoch": 6.629631737719848, |
|
"grad_norm": 0.32309025526046753, |
|
"learning_rate": 0.0004584158415841584, |
|
"loss": 1.297, |
|
"step": 7870 |
|
}, |
|
{ |
|
"epoch": 6.638055666230292, |
|
"grad_norm": 0.2709057927131653, |
|
"learning_rate": 0.0004574257425742575, |
|
"loss": 1.2999, |
|
"step": 7880 |
|
}, |
|
{ |
|
"epoch": 6.646479594740737, |
|
"grad_norm": 0.2785532772541046, |
|
"learning_rate": 0.00045643564356435647, |
|
"loss": 1.2959, |
|
"step": 7890 |
|
}, |
|
{ |
|
"epoch": 6.654903523251181, |
|
"grad_norm": 0.2822953164577484, |
|
"learning_rate": 0.00045544554455445547, |
|
"loss": 1.2984, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 6.663327451761625, |
|
"grad_norm": 0.2704668641090393, |
|
"learning_rate": 0.00045445544554455447, |
|
"loss": 1.2956, |
|
"step": 7910 |
|
}, |
|
{ |
|
"epoch": 6.67175138027207, |
|
"grad_norm": 0.3228791058063507, |
|
"learning_rate": 0.00045346534653465347, |
|
"loss": 1.2984, |
|
"step": 7920 |
|
}, |
|
{ |
|
"epoch": 6.67175138027207, |
|
"eval_accuracy": 0.7318941432804211, |
|
"eval_loss": 1.184158205986023, |
|
"eval_runtime": 883.7641, |
|
"eval_samples_per_second": 565.056, |
|
"eval_steps_per_second": 5.232, |
|
"step": 7920 |
|
}, |
|
{ |
|
"epoch": 6.680175308782514, |
|
"grad_norm": 0.2641367018222809, |
|
"learning_rate": 0.0004524752475247525, |
|
"loss": 1.299, |
|
"step": 7930 |
|
}, |
|
{ |
|
"epoch": 6.6885992372929595, |
|
"grad_norm": 0.28555190563201904, |
|
"learning_rate": 0.0004514851485148515, |
|
"loss": 1.2985, |
|
"step": 7940 |
|
}, |
|
{ |
|
"epoch": 6.697023165803404, |
|
"grad_norm": 0.2615039050579071, |
|
"learning_rate": 0.0004504950495049505, |
|
"loss": 1.294, |
|
"step": 7950 |
|
}, |
|
{ |
|
"epoch": 6.705447094313849, |
|
"grad_norm": 0.25349870324134827, |
|
"learning_rate": 0.0004495049504950495, |
|
"loss": 1.295, |
|
"step": 7960 |
|
}, |
|
{ |
|
"epoch": 6.713871022824293, |
|
"grad_norm": 0.3342011272907257, |
|
"learning_rate": 0.0004485148514851485, |
|
"loss": 1.2963, |
|
"step": 7970 |
|
}, |
|
{ |
|
"epoch": 6.722294951334737, |
|
"grad_norm": 0.2608206570148468, |
|
"learning_rate": 0.00044752475247524756, |
|
"loss": 1.2957, |
|
"step": 7980 |
|
}, |
|
{ |
|
"epoch": 6.730718879845182, |
|
"grad_norm": 0.27476873993873596, |
|
"learning_rate": 0.00044653465346534656, |
|
"loss": 1.2939, |
|
"step": 7990 |
|
}, |
|
{ |
|
"epoch": 6.739142808355626, |
|
"grad_norm": 0.3241907060146332, |
|
"learning_rate": 0.00044554455445544556, |
|
"loss": 1.2965, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 6.747566736866071, |
|
"grad_norm": 0.3494180142879486, |
|
"learning_rate": 0.00044455445544554456, |
|
"loss": 1.2962, |
|
"step": 8010 |
|
}, |
|
{ |
|
"epoch": 6.747566736866071, |
|
"eval_accuracy": 0.7322386411238602, |
|
"eval_loss": 1.182516098022461, |
|
"eval_runtime": 889.7545, |
|
"eval_samples_per_second": 561.251, |
|
"eval_steps_per_second": 5.197, |
|
"step": 8010 |
|
}, |
|
{ |
|
"epoch": 6.755990665376515, |
|
"grad_norm": 0.2616145610809326, |
|
"learning_rate": 0.00044356435643564356, |
|
"loss": 1.2958, |
|
"step": 8020 |
|
}, |
|
{ |
|
"epoch": 6.7644145938869595, |
|
"grad_norm": 0.29238995909690857, |
|
"learning_rate": 0.0004425742574257426, |
|
"loss": 1.293, |
|
"step": 8030 |
|
}, |
|
{ |
|
"epoch": 6.772838522397405, |
|
"grad_norm": 0.24060964584350586, |
|
"learning_rate": 0.0004415841584158416, |
|
"loss": 1.2948, |
|
"step": 8040 |
|
}, |
|
{ |
|
"epoch": 6.781262450907849, |
|
"grad_norm": 0.29363489151000977, |
|
"learning_rate": 0.0004405940594059406, |
|
"loss": 1.2928, |
|
"step": 8050 |
|
}, |
|
{ |
|
"epoch": 6.789686379418294, |
|
"grad_norm": 0.3320622444152832, |
|
"learning_rate": 0.0004396039603960396, |
|
"loss": 1.2925, |
|
"step": 8060 |
|
}, |
|
{ |
|
"epoch": 6.798110307928738, |
|
"grad_norm": 0.23857133090496063, |
|
"learning_rate": 0.0004386138613861386, |
|
"loss": 1.2943, |
|
"step": 8070 |
|
}, |
|
{ |
|
"epoch": 6.806534236439183, |
|
"grad_norm": 0.24713198840618134, |
|
"learning_rate": 0.00043762376237623765, |
|
"loss": 1.2938, |
|
"step": 8080 |
|
}, |
|
{ |
|
"epoch": 6.814958164949627, |
|
"grad_norm": 0.26270854473114014, |
|
"learning_rate": 0.00043663366336633665, |
|
"loss": 1.2916, |
|
"step": 8090 |
|
}, |
|
{ |
|
"epoch": 6.823382093460072, |
|
"grad_norm": 0.2450101524591446, |
|
"learning_rate": 0.00043564356435643565, |
|
"loss": 1.2931, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 6.823382093460072, |
|
"eval_accuracy": 0.7332625526391774, |
|
"eval_loss": 1.1757333278656006, |
|
"eval_runtime": 889.0249, |
|
"eval_samples_per_second": 561.712, |
|
"eval_steps_per_second": 5.201, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 6.831806021970516, |
|
"grad_norm": 0.27462685108184814, |
|
"learning_rate": 0.00043465346534653465, |
|
"loss": 1.2923, |
|
"step": 8110 |
|
}, |
|
{ |
|
"epoch": 6.8402299504809605, |
|
"grad_norm": 0.2707907259464264, |
|
"learning_rate": 0.00043366336633663365, |
|
"loss": 1.2925, |
|
"step": 8120 |
|
}, |
|
{ |
|
"epoch": 6.8486538789914055, |
|
"grad_norm": 0.24748317897319794, |
|
"learning_rate": 0.0004326732673267327, |
|
"loss": 1.2929, |
|
"step": 8130 |
|
}, |
|
{ |
|
"epoch": 6.85707780750185, |
|
"grad_norm": 0.226767897605896, |
|
"learning_rate": 0.0004316831683168317, |
|
"loss": 1.2883, |
|
"step": 8140 |
|
}, |
|
{ |
|
"epoch": 6.865501736012295, |
|
"grad_norm": 0.24889105558395386, |
|
"learning_rate": 0.0004306930693069307, |
|
"loss": 1.2893, |
|
"step": 8150 |
|
}, |
|
{ |
|
"epoch": 6.873925664522739, |
|
"grad_norm": 0.26075902581214905, |
|
"learning_rate": 0.0004297029702970297, |
|
"loss": 1.2893, |
|
"step": 8160 |
|
}, |
|
{ |
|
"epoch": 6.882349593033183, |
|
"grad_norm": 0.26210734248161316, |
|
"learning_rate": 0.0004287128712871287, |
|
"loss": 1.2868, |
|
"step": 8170 |
|
}, |
|
{ |
|
"epoch": 6.890773521543628, |
|
"grad_norm": 0.2559298872947693, |
|
"learning_rate": 0.00042772277227722774, |
|
"loss": 1.2886, |
|
"step": 8180 |
|
}, |
|
{ |
|
"epoch": 6.899197450054072, |
|
"grad_norm": 0.2503817081451416, |
|
"learning_rate": 0.00042673267326732674, |
|
"loss": 1.2883, |
|
"step": 8190 |
|
}, |
|
{ |
|
"epoch": 6.899197450054072, |
|
"eval_accuracy": 0.7335132915044345, |
|
"eval_loss": 1.1744158267974854, |
|
"eval_runtime": 885.5636, |
|
"eval_samples_per_second": 563.908, |
|
"eval_steps_per_second": 5.222, |
|
"step": 8190 |
|
}, |
|
{ |
|
"epoch": 6.907621378564517, |
|
"grad_norm": 0.24540117383003235, |
|
"learning_rate": 0.00042574257425742574, |
|
"loss": 1.2893, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 6.9160453070749615, |
|
"grad_norm": 0.3089258670806885, |
|
"learning_rate": 0.00042475247524752474, |
|
"loss": 1.2896, |
|
"step": 8210 |
|
}, |
|
{ |
|
"epoch": 6.9244692355854065, |
|
"grad_norm": 0.26888999342918396, |
|
"learning_rate": 0.00042376237623762374, |
|
"loss": 1.2895, |
|
"step": 8220 |
|
}, |
|
{ |
|
"epoch": 6.932893164095851, |
|
"grad_norm": 0.24743571877479553, |
|
"learning_rate": 0.0004227722772277228, |
|
"loss": 1.2884, |
|
"step": 8230 |
|
}, |
|
{ |
|
"epoch": 6.941317092606295, |
|
"grad_norm": 0.24364733695983887, |
|
"learning_rate": 0.0004217821782178218, |
|
"loss": 1.2879, |
|
"step": 8240 |
|
}, |
|
{ |
|
"epoch": 6.94974102111674, |
|
"grad_norm": 0.2963743507862091, |
|
"learning_rate": 0.0004207920792079208, |
|
"loss": 1.2878, |
|
"step": 8250 |
|
}, |
|
{ |
|
"epoch": 6.958164949627184, |
|
"grad_norm": 0.2444639950990677, |
|
"learning_rate": 0.0004198019801980198, |
|
"loss": 1.2871, |
|
"step": 8260 |
|
}, |
|
{ |
|
"epoch": 6.966588878137629, |
|
"grad_norm": 0.27140820026397705, |
|
"learning_rate": 0.0004188118811881188, |
|
"loss": 1.2878, |
|
"step": 8270 |
|
}, |
|
{ |
|
"epoch": 6.975012806648073, |
|
"grad_norm": 0.2628765404224396, |
|
"learning_rate": 0.00041782178217821784, |
|
"loss": 1.2873, |
|
"step": 8280 |
|
}, |
|
{ |
|
"epoch": 6.975012806648073, |
|
"eval_accuracy": 0.734204579286565, |
|
"eval_loss": 1.171156644821167, |
|
"eval_runtime": 888.1172, |
|
"eval_samples_per_second": 562.286, |
|
"eval_steps_per_second": 5.207, |
|
"step": 8280 |
|
}, |
|
{ |
|
"epoch": 6.983436735158518, |
|
"grad_norm": 0.2539413869380951, |
|
"learning_rate": 0.00041683168316831683, |
|
"loss": 1.2874, |
|
"step": 8290 |
|
}, |
|
{ |
|
"epoch": 6.991860663668962, |
|
"grad_norm": 0.29522642493247986, |
|
"learning_rate": 0.00041584158415841583, |
|
"loss": 1.2859, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 7.000284592179407, |
|
"grad_norm": 0.29553958773612976, |
|
"learning_rate": 0.00041485148514851483, |
|
"loss": 1.2878, |
|
"step": 8310 |
|
}, |
|
{ |
|
"epoch": 7.008708520689852, |
|
"grad_norm": 0.3111182153224945, |
|
"learning_rate": 0.00041386138613861383, |
|
"loss": 1.2874, |
|
"step": 8320 |
|
}, |
|
{ |
|
"epoch": 7.017132449200296, |
|
"grad_norm": 0.33146336674690247, |
|
"learning_rate": 0.0004128712871287129, |
|
"loss": 1.287, |
|
"step": 8330 |
|
}, |
|
{ |
|
"epoch": 7.025556377710741, |
|
"grad_norm": 0.27456361055374146, |
|
"learning_rate": 0.0004118811881188119, |
|
"loss": 1.2858, |
|
"step": 8340 |
|
}, |
|
{ |
|
"epoch": 7.033980306221185, |
|
"grad_norm": 0.29216212034225464, |
|
"learning_rate": 0.0004108910891089109, |
|
"loss": 1.2838, |
|
"step": 8350 |
|
}, |
|
{ |
|
"epoch": 7.042404234731629, |
|
"grad_norm": 0.24966631829738617, |
|
"learning_rate": 0.0004099009900990099, |
|
"loss": 1.2857, |
|
"step": 8360 |
|
}, |
|
{ |
|
"epoch": 7.050828163242074, |
|
"grad_norm": 0.2910294234752655, |
|
"learning_rate": 0.0004089108910891089, |
|
"loss": 1.2858, |
|
"step": 8370 |
|
}, |
|
{ |
|
"epoch": 7.050828163242074, |
|
"eval_accuracy": 0.7346228547150983, |
|
"eval_loss": 1.169946551322937, |
|
"eval_runtime": 890.9908, |
|
"eval_samples_per_second": 560.473, |
|
"eval_steps_per_second": 5.19, |
|
"step": 8370 |
|
}, |
|
{ |
|
"epoch": 7.059252091752518, |
|
"grad_norm": 0.26337358355522156, |
|
"learning_rate": 0.0004079207920792079, |
|
"loss": 1.2842, |
|
"step": 8380 |
|
}, |
|
{ |
|
"epoch": 7.067676020262963, |
|
"grad_norm": 0.2426845133304596, |
|
"learning_rate": 0.0004069306930693069, |
|
"loss": 1.2836, |
|
"step": 8390 |
|
}, |
|
{ |
|
"epoch": 7.0760999487734075, |
|
"grad_norm": 0.2740408778190613, |
|
"learning_rate": 0.000405940594059406, |
|
"loss": 1.2842, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 7.084523877283853, |
|
"grad_norm": 0.27966201305389404, |
|
"learning_rate": 0.000404950495049505, |
|
"loss": 1.2841, |
|
"step": 8410 |
|
}, |
|
{ |
|
"epoch": 7.092947805794297, |
|
"grad_norm": 0.3083817660808563, |
|
"learning_rate": 0.00040396039603960397, |
|
"loss": 1.2823, |
|
"step": 8420 |
|
}, |
|
{ |
|
"epoch": 7.101371734304741, |
|
"grad_norm": 0.30730104446411133, |
|
"learning_rate": 0.000402970297029703, |
|
"loss": 1.2845, |
|
"step": 8430 |
|
}, |
|
{ |
|
"epoch": 7.109795662815186, |
|
"grad_norm": 0.2973144054412842, |
|
"learning_rate": 0.000401980198019802, |
|
"loss": 1.2814, |
|
"step": 8440 |
|
}, |
|
{ |
|
"epoch": 7.11821959132563, |
|
"grad_norm": 0.2775426208972931, |
|
"learning_rate": 0.000400990099009901, |
|
"loss": 1.2823, |
|
"step": 8450 |
|
}, |
|
{ |
|
"epoch": 7.126643519836075, |
|
"grad_norm": 0.2734345495700836, |
|
"learning_rate": 0.0004, |
|
"loss": 1.2819, |
|
"step": 8460 |
|
}, |
|
{ |
|
"epoch": 7.126643519836075, |
|
"eval_accuracy": 0.735104089750221, |
|
"eval_loss": 1.1682698726654053, |
|
"eval_runtime": 886.7497, |
|
"eval_samples_per_second": 563.153, |
|
"eval_steps_per_second": 5.215, |
|
"step": 8460 |
|
}, |
|
{ |
|
"epoch": 7.135067448346519, |
|
"grad_norm": 0.27912047505378723, |
|
"learning_rate": 0.000399009900990099, |
|
"loss": 1.2826, |
|
"step": 8470 |
|
}, |
|
{ |
|
"epoch": 7.143491376856964, |
|
"grad_norm": 0.3084285855293274, |
|
"learning_rate": 0.00039801980198019807, |
|
"loss": 1.2811, |
|
"step": 8480 |
|
}, |
|
{ |
|
"epoch": 7.1519153053674085, |
|
"grad_norm": 0.30194783210754395, |
|
"learning_rate": 0.00039702970297029707, |
|
"loss": 1.2828, |
|
"step": 8490 |
|
}, |
|
{ |
|
"epoch": 7.160339233877853, |
|
"grad_norm": 0.25307685136795044, |
|
"learning_rate": 0.00039603960396039607, |
|
"loss": 1.2791, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 7.168763162388298, |
|
"grad_norm": 0.25018778443336487, |
|
"learning_rate": 0.00039504950495049506, |
|
"loss": 1.2796, |
|
"step": 8510 |
|
}, |
|
{ |
|
"epoch": 7.177187090898742, |
|
"grad_norm": 0.2541010081768036, |
|
"learning_rate": 0.00039405940594059406, |
|
"loss": 1.2812, |
|
"step": 8520 |
|
}, |
|
{ |
|
"epoch": 7.185611019409187, |
|
"grad_norm": 0.29745373129844666, |
|
"learning_rate": 0.0003930693069306931, |
|
"loss": 1.2828, |
|
"step": 8530 |
|
}, |
|
{ |
|
"epoch": 7.194034947919631, |
|
"grad_norm": 0.2740705907344818, |
|
"learning_rate": 0.0003920792079207921, |
|
"loss": 1.2812, |
|
"step": 8540 |
|
}, |
|
{ |
|
"epoch": 7.202458876430076, |
|
"grad_norm": 0.23998434841632843, |
|
"learning_rate": 0.0003910891089108911, |
|
"loss": 1.2781, |
|
"step": 8550 |
|
}, |
|
{ |
|
"epoch": 7.202458876430076, |
|
"eval_accuracy": 0.7354429371546514, |
|
"eval_loss": 1.1649537086486816, |
|
"eval_runtime": 891.9041, |
|
"eval_samples_per_second": 559.899, |
|
"eval_steps_per_second": 5.184, |
|
"step": 8550 |
|
}, |
|
{ |
|
"epoch": 7.21088280494052, |
|
"grad_norm": 0.2691722512245178, |
|
"learning_rate": 0.0003900990099009901, |
|
"loss": 1.2785, |
|
"step": 8560 |
|
}, |
|
{ |
|
"epoch": 7.219306733450964, |
|
"grad_norm": 0.28188225626945496, |
|
"learning_rate": 0.0003891089108910891, |
|
"loss": 1.2807, |
|
"step": 8570 |
|
}, |
|
{ |
|
"epoch": 7.2277306619614095, |
|
"grad_norm": 0.3311617970466614, |
|
"learning_rate": 0.00038811881188118816, |
|
"loss": 1.2809, |
|
"step": 8580 |
|
}, |
|
{ |
|
"epoch": 7.236154590471854, |
|
"grad_norm": 0.2717738747596741, |
|
"learning_rate": 0.00038712871287128716, |
|
"loss": 1.278, |
|
"step": 8590 |
|
}, |
|
{ |
|
"epoch": 7.244578518982299, |
|
"grad_norm": 0.27171820402145386, |
|
"learning_rate": 0.00038613861386138616, |
|
"loss": 1.2803, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 7.253002447492743, |
|
"grad_norm": 0.249137282371521, |
|
"learning_rate": 0.00038514851485148515, |
|
"loss": 1.277, |
|
"step": 8610 |
|
}, |
|
{ |
|
"epoch": 7.261426376003188, |
|
"grad_norm": 0.26939263939857483, |
|
"learning_rate": 0.00038415841584158415, |
|
"loss": 1.2773, |
|
"step": 8620 |
|
}, |
|
{ |
|
"epoch": 7.269850304513632, |
|
"grad_norm": 0.3177802860736847, |
|
"learning_rate": 0.0003831683168316832, |
|
"loss": 1.2763, |
|
"step": 8630 |
|
}, |
|
{ |
|
"epoch": 7.278274233024076, |
|
"grad_norm": 0.2421504557132721, |
|
"learning_rate": 0.0003821782178217822, |
|
"loss": 1.2771, |
|
"step": 8640 |
|
}, |
|
{ |
|
"epoch": 7.278274233024076, |
|
"eval_accuracy": 0.7357238880776348, |
|
"eval_loss": 1.1646403074264526, |
|
"eval_runtime": 878.5966, |
|
"eval_samples_per_second": 568.379, |
|
"eval_steps_per_second": 5.263, |
|
"step": 8640 |
|
}, |
|
{ |
|
"epoch": 7.286698161534521, |
|
"grad_norm": 0.28808215260505676, |
|
"learning_rate": 0.0003811881188118812, |
|
"loss": 1.2744, |
|
"step": 8650 |
|
}, |
|
{ |
|
"epoch": 7.295122090044965, |
|
"grad_norm": 0.26363667845726013, |
|
"learning_rate": 0.0003801980198019802, |
|
"loss": 1.2788, |
|
"step": 8660 |
|
}, |
|
{ |
|
"epoch": 7.30354601855541, |
|
"grad_norm": 0.35491064190864563, |
|
"learning_rate": 0.0003792079207920792, |
|
"loss": 1.2792, |
|
"step": 8670 |
|
}, |
|
{ |
|
"epoch": 7.311969947065855, |
|
"grad_norm": 0.3273920714855194, |
|
"learning_rate": 0.00037821782178217825, |
|
"loss": 1.278, |
|
"step": 8680 |
|
}, |
|
{ |
|
"epoch": 7.320393875576299, |
|
"grad_norm": 0.28319239616394043, |
|
"learning_rate": 0.00037722772277227725, |
|
"loss": 1.2762, |
|
"step": 8690 |
|
}, |
|
{ |
|
"epoch": 7.328817804086744, |
|
"grad_norm": 0.28414586186408997, |
|
"learning_rate": 0.00037623762376237625, |
|
"loss": 1.2769, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 7.337241732597188, |
|
"grad_norm": 0.25393033027648926, |
|
"learning_rate": 0.00037524752475247524, |
|
"loss": 1.2742, |
|
"step": 8710 |
|
}, |
|
{ |
|
"epoch": 7.345665661107633, |
|
"grad_norm": 0.25634288787841797, |
|
"learning_rate": 0.00037425742574257424, |
|
"loss": 1.2753, |
|
"step": 8720 |
|
}, |
|
{ |
|
"epoch": 7.354089589618077, |
|
"grad_norm": 0.2355813831090927, |
|
"learning_rate": 0.0003732673267326733, |
|
"loss": 1.2749, |
|
"step": 8730 |
|
}, |
|
{ |
|
"epoch": 7.354089589618077, |
|
"eval_accuracy": 0.7361996522899728, |
|
"eval_loss": 1.160847544670105, |
|
"eval_runtime": 889.4544, |
|
"eval_samples_per_second": 561.441, |
|
"eval_steps_per_second": 5.199, |
|
"step": 8730 |
|
}, |
|
{ |
|
"epoch": 7.362513518128522, |
|
"grad_norm": 0.24002189934253693, |
|
"learning_rate": 0.0003722772277227723, |
|
"loss": 1.2751, |
|
"step": 8740 |
|
}, |
|
{ |
|
"epoch": 7.370937446638966, |
|
"grad_norm": 0.2806450128555298, |
|
"learning_rate": 0.0003712871287128713, |
|
"loss": 1.275, |
|
"step": 8750 |
|
}, |
|
{ |
|
"epoch": 7.3793613751494105, |
|
"grad_norm": 0.24552834033966064, |
|
"learning_rate": 0.0003702970297029703, |
|
"loss": 1.2753, |
|
"step": 8760 |
|
}, |
|
{ |
|
"epoch": 7.3877853036598555, |
|
"grad_norm": 0.24814461171627045, |
|
"learning_rate": 0.0003693069306930693, |
|
"loss": 1.276, |
|
"step": 8770 |
|
}, |
|
{ |
|
"epoch": 7.3962092321703, |
|
"grad_norm": 0.26086533069610596, |
|
"learning_rate": 0.00036831683168316834, |
|
"loss": 1.2744, |
|
"step": 8780 |
|
}, |
|
{ |
|
"epoch": 7.404633160680745, |
|
"grad_norm": 0.2854679822921753, |
|
"learning_rate": 0.00036732673267326734, |
|
"loss": 1.2739, |
|
"step": 8790 |
|
}, |
|
{ |
|
"epoch": 7.413057089191189, |
|
"grad_norm": 0.24847003817558289, |
|
"learning_rate": 0.00036633663366336634, |
|
"loss": 1.2731, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 7.421481017701634, |
|
"grad_norm": 0.3230905532836914, |
|
"learning_rate": 0.00036534653465346533, |
|
"loss": 1.2732, |
|
"step": 8810 |
|
}, |
|
{ |
|
"epoch": 7.429904946212078, |
|
"grad_norm": 0.30264076590538025, |
|
"learning_rate": 0.00036435643564356433, |
|
"loss": 1.273, |
|
"step": 8820 |
|
}, |
|
{ |
|
"epoch": 7.429904946212078, |
|
"eval_accuracy": 0.7366944357714759, |
|
"eval_loss": 1.1585748195648193, |
|
"eval_runtime": 884.7129, |
|
"eval_samples_per_second": 564.45, |
|
"eval_steps_per_second": 5.227, |
|
"step": 8820 |
|
}, |
|
{ |
|
"epoch": 7.438328874722522, |
|
"grad_norm": 0.25705888867378235, |
|
"learning_rate": 0.0003633663366336634, |
|
"loss": 1.2738, |
|
"step": 8830 |
|
}, |
|
{ |
|
"epoch": 7.446752803232967, |
|
"grad_norm": 0.2455236166715622, |
|
"learning_rate": 0.0003623762376237624, |
|
"loss": 1.2727, |
|
"step": 8840 |
|
}, |
|
{ |
|
"epoch": 7.4551767317434114, |
|
"grad_norm": 0.2877678871154785, |
|
"learning_rate": 0.0003613861386138614, |
|
"loss": 1.2733, |
|
"step": 8850 |
|
}, |
|
{ |
|
"epoch": 7.4636006602538565, |
|
"grad_norm": 0.2644253969192505, |
|
"learning_rate": 0.0003603960396039604, |
|
"loss": 1.2711, |
|
"step": 8860 |
|
}, |
|
{ |
|
"epoch": 7.472024588764301, |
|
"grad_norm": 0.25103089213371277, |
|
"learning_rate": 0.0003594059405940594, |
|
"loss": 1.2727, |
|
"step": 8870 |
|
}, |
|
{ |
|
"epoch": 7.480448517274746, |
|
"grad_norm": 0.28732746839523315, |
|
"learning_rate": 0.00035841584158415843, |
|
"loss": 1.2729, |
|
"step": 8880 |
|
}, |
|
{ |
|
"epoch": 7.48887244578519, |
|
"grad_norm": 0.3096875846385956, |
|
"learning_rate": 0.00035742574257425743, |
|
"loss": 1.2733, |
|
"step": 8890 |
|
}, |
|
{ |
|
"epoch": 7.497296374295634, |
|
"grad_norm": 0.27695363759994507, |
|
"learning_rate": 0.0003564356435643564, |
|
"loss": 1.2719, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 7.505720302806079, |
|
"grad_norm": 0.26089048385620117, |
|
"learning_rate": 0.0003554455445544554, |
|
"loss": 1.2718, |
|
"step": 8910 |
|
}, |
|
{ |
|
"epoch": 7.505720302806079, |
|
"eval_accuracy": 0.7372118632602084, |
|
"eval_loss": 1.1557950973510742, |
|
"eval_runtime": 890.5411, |
|
"eval_samples_per_second": 560.756, |
|
"eval_steps_per_second": 5.192, |
|
"step": 8910 |
|
}, |
|
{ |
|
"epoch": 7.514144231316523, |
|
"grad_norm": 0.24578547477722168, |
|
"learning_rate": 0.0003544554455445544, |
|
"loss": 1.2723, |
|
"step": 8920 |
|
}, |
|
{ |
|
"epoch": 7.522568159826968, |
|
"grad_norm": 0.2624136209487915, |
|
"learning_rate": 0.0003534653465346535, |
|
"loss": 1.2708, |
|
"step": 8930 |
|
}, |
|
{ |
|
"epoch": 7.530992088337412, |
|
"grad_norm": 0.25748109817504883, |
|
"learning_rate": 0.0003524752475247525, |
|
"loss": 1.2708, |
|
"step": 8940 |
|
}, |
|
{ |
|
"epoch": 7.5394160168478574, |
|
"grad_norm": 0.28079208731651306, |
|
"learning_rate": 0.00035148514851485147, |
|
"loss": 1.2727, |
|
"step": 8950 |
|
}, |
|
{ |
|
"epoch": 7.547839945358302, |
|
"grad_norm": 0.2706407904624939, |
|
"learning_rate": 0.00035049504950495047, |
|
"loss": 1.2712, |
|
"step": 8960 |
|
}, |
|
{ |
|
"epoch": 7.556263873868746, |
|
"grad_norm": 0.27032172679901123, |
|
"learning_rate": 0.00034950495049504947, |
|
"loss": 1.2673, |
|
"step": 8970 |
|
}, |
|
{ |
|
"epoch": 7.564687802379191, |
|
"grad_norm": 0.24915465712547302, |
|
"learning_rate": 0.0003485148514851485, |
|
"loss": 1.2682, |
|
"step": 8980 |
|
}, |
|
{ |
|
"epoch": 7.573111730889635, |
|
"grad_norm": 0.24191108345985413, |
|
"learning_rate": 0.0003475247524752475, |
|
"loss": 1.2719, |
|
"step": 8990 |
|
}, |
|
{ |
|
"epoch": 7.58153565940008, |
|
"grad_norm": 0.2806965112686157, |
|
"learning_rate": 0.0003465346534653465, |
|
"loss": 1.2681, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 7.58153565940008, |
|
"eval_accuracy": 0.7375367942915361, |
|
"eval_loss": 1.1551363468170166, |
|
"eval_runtime": 876.3936, |
|
"eval_samples_per_second": 569.808, |
|
"eval_steps_per_second": 5.276, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 7.589959587910524, |
|
"grad_norm": 0.2909415364265442, |
|
"learning_rate": 0.0003455445544554455, |
|
"loss": 1.2687, |
|
"step": 9010 |
|
}, |
|
{ |
|
"epoch": 7.598383516420968, |
|
"grad_norm": 0.30222398042678833, |
|
"learning_rate": 0.0003445544554455445, |
|
"loss": 1.2684, |
|
"step": 9020 |
|
}, |
|
{ |
|
"epoch": 7.606807444931413, |
|
"grad_norm": 0.25246381759643555, |
|
"learning_rate": 0.0003435643564356436, |
|
"loss": 1.2689, |
|
"step": 9030 |
|
}, |
|
{ |
|
"epoch": 7.6152313734418575, |
|
"grad_norm": 0.25202953815460205, |
|
"learning_rate": 0.0003425742574257426, |
|
"loss": 1.2689, |
|
"step": 9040 |
|
}, |
|
{ |
|
"epoch": 7.623655301952303, |
|
"grad_norm": 0.2351432740688324, |
|
"learning_rate": 0.0003415841584158416, |
|
"loss": 1.2655, |
|
"step": 9050 |
|
}, |
|
{ |
|
"epoch": 7.632079230462747, |
|
"grad_norm": 0.26545044779777527, |
|
"learning_rate": 0.0003405940594059406, |
|
"loss": 1.2659, |
|
"step": 9060 |
|
}, |
|
{ |
|
"epoch": 7.640503158973192, |
|
"grad_norm": 0.248436838388443, |
|
"learning_rate": 0.0003396039603960396, |
|
"loss": 1.2677, |
|
"step": 9070 |
|
}, |
|
{ |
|
"epoch": 7.648927087483636, |
|
"grad_norm": 0.3021203279495239, |
|
"learning_rate": 0.00033861386138613867, |
|
"loss": 1.2692, |
|
"step": 9080 |
|
}, |
|
{ |
|
"epoch": 7.657351015994081, |
|
"grad_norm": 0.27577024698257446, |
|
"learning_rate": 0.00033762376237623766, |
|
"loss": 1.2672, |
|
"step": 9090 |
|
}, |
|
{ |
|
"epoch": 7.657351015994081, |
|
"eval_accuracy": 0.7378275299930978, |
|
"eval_loss": 1.1522574424743652, |
|
"eval_runtime": 891.8663, |
|
"eval_samples_per_second": 559.923, |
|
"eval_steps_per_second": 5.185, |
|
"step": 9090 |
|
}, |
|
{ |
|
"epoch": 7.665774944504525, |
|
"grad_norm": 0.2087612897157669, |
|
"learning_rate": 0.00033663366336633666, |
|
"loss": 1.2655, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 7.674198873014969, |
|
"grad_norm": 0.24880866706371307, |
|
"learning_rate": 0.00033564356435643566, |
|
"loss": 1.2677, |
|
"step": 9110 |
|
}, |
|
{ |
|
"epoch": 7.682622801525414, |
|
"grad_norm": 0.26335397362709045, |
|
"learning_rate": 0.00033465346534653466, |
|
"loss": 1.2647, |
|
"step": 9120 |
|
}, |
|
{ |
|
"epoch": 7.6910467300358585, |
|
"grad_norm": 0.25413015484809875, |
|
"learning_rate": 0.0003336633663366337, |
|
"loss": 1.265, |
|
"step": 9130 |
|
}, |
|
{ |
|
"epoch": 7.6994706585463035, |
|
"grad_norm": 0.3119896650314331, |
|
"learning_rate": 0.0003326732673267327, |
|
"loss": 1.2674, |
|
"step": 9140 |
|
}, |
|
{ |
|
"epoch": 7.707894587056748, |
|
"grad_norm": 0.2269907146692276, |
|
"learning_rate": 0.0003316831683168317, |
|
"loss": 1.2647, |
|
"step": 9150 |
|
}, |
|
{ |
|
"epoch": 7.716318515567192, |
|
"grad_norm": 0.31745684146881104, |
|
"learning_rate": 0.0003306930693069307, |
|
"loss": 1.2668, |
|
"step": 9160 |
|
}, |
|
{ |
|
"epoch": 7.724742444077637, |
|
"grad_norm": 0.28096485137939453, |
|
"learning_rate": 0.0003297029702970297, |
|
"loss": 1.2658, |
|
"step": 9170 |
|
}, |
|
{ |
|
"epoch": 7.733166372588081, |
|
"grad_norm": 0.26646697521209717, |
|
"learning_rate": 0.00032871287128712876, |
|
"loss": 1.2664, |
|
"step": 9180 |
|
}, |
|
{ |
|
"epoch": 7.733166372588081, |
|
"eval_accuracy": 0.7381772885380696, |
|
"eval_loss": 1.151962161064148, |
|
"eval_runtime": 889.9446, |
|
"eval_samples_per_second": 561.132, |
|
"eval_steps_per_second": 5.196, |
|
"step": 9180 |
|
}, |
|
{ |
|
"epoch": 7.741590301098526, |
|
"grad_norm": 0.24463273584842682, |
|
"learning_rate": 0.00032772277227722775, |
|
"loss": 1.2663, |
|
"step": 9190 |
|
}, |
|
{ |
|
"epoch": 7.75001422960897, |
|
"grad_norm": 0.23978425562381744, |
|
"learning_rate": 0.00032673267326732675, |
|
"loss": 1.2634, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 7.758438158119414, |
|
"grad_norm": 0.25662901997566223, |
|
"learning_rate": 0.00032574257425742575, |
|
"loss": 1.2651, |
|
"step": 9210 |
|
}, |
|
{ |
|
"epoch": 7.766862086629859, |
|
"grad_norm": 0.2697198688983917, |
|
"learning_rate": 0.00032475247524752475, |
|
"loss": 1.2628, |
|
"step": 9220 |
|
}, |
|
{ |
|
"epoch": 7.775286015140304, |
|
"grad_norm": 0.2753835618495941, |
|
"learning_rate": 0.0003237623762376238, |
|
"loss": 1.2632, |
|
"step": 9230 |
|
}, |
|
{ |
|
"epoch": 7.783709943650749, |
|
"grad_norm": 0.23303931951522827, |
|
"learning_rate": 0.0003227722772277228, |
|
"loss": 1.2625, |
|
"step": 9240 |
|
}, |
|
{ |
|
"epoch": 7.792133872161193, |
|
"grad_norm": 0.26077255606651306, |
|
"learning_rate": 0.0003217821782178218, |
|
"loss": 1.2648, |
|
"step": 9250 |
|
}, |
|
{ |
|
"epoch": 7.800557800671638, |
|
"grad_norm": 0.25494781136512756, |
|
"learning_rate": 0.0003207920792079208, |
|
"loss": 1.2648, |
|
"step": 9260 |
|
}, |
|
{ |
|
"epoch": 7.808981729182082, |
|
"grad_norm": 0.2447885125875473, |
|
"learning_rate": 0.0003198019801980198, |
|
"loss": 1.2645, |
|
"step": 9270 |
|
}, |
|
{ |
|
"epoch": 7.808981729182082, |
|
"eval_accuracy": 0.7385748699480129, |
|
"eval_loss": 1.1492513418197632, |
|
"eval_runtime": 885.3604, |
|
"eval_samples_per_second": 564.037, |
|
"eval_steps_per_second": 5.223, |
|
"step": 9270 |
|
}, |
|
{ |
|
"epoch": 7.817405657692527, |
|
"grad_norm": 0.23961922526359558, |
|
"learning_rate": 0.00031881188118811885, |
|
"loss": 1.2631, |
|
"step": 9280 |
|
}, |
|
{ |
|
"epoch": 7.825829586202971, |
|
"grad_norm": 0.2850695252418518, |
|
"learning_rate": 0.00031782178217821784, |
|
"loss": 1.2636, |
|
"step": 9290 |
|
}, |
|
{ |
|
"epoch": 7.834253514713415, |
|
"grad_norm": 0.257962167263031, |
|
"learning_rate": 0.00031683168316831684, |
|
"loss": 1.2647, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 7.84267744322386, |
|
"grad_norm": 0.28995752334594727, |
|
"learning_rate": 0.00031584158415841584, |
|
"loss": 1.2613, |
|
"step": 9310 |
|
}, |
|
{ |
|
"epoch": 7.851101371734305, |
|
"grad_norm": 0.23544956743717194, |
|
"learning_rate": 0.00031485148514851484, |
|
"loss": 1.261, |
|
"step": 9320 |
|
}, |
|
{ |
|
"epoch": 7.85952530024475, |
|
"grad_norm": 0.27855780720710754, |
|
"learning_rate": 0.0003138613861386139, |
|
"loss": 1.2615, |
|
"step": 9330 |
|
}, |
|
{ |
|
"epoch": 7.867949228755194, |
|
"grad_norm": 0.2668914198875427, |
|
"learning_rate": 0.0003128712871287129, |
|
"loss": 1.2629, |
|
"step": 9340 |
|
}, |
|
{ |
|
"epoch": 7.876373157265638, |
|
"grad_norm": 0.2561187446117401, |
|
"learning_rate": 0.0003118811881188119, |
|
"loss": 1.2614, |
|
"step": 9350 |
|
}, |
|
{ |
|
"epoch": 7.884797085776083, |
|
"grad_norm": 0.23943807184696198, |
|
"learning_rate": 0.0003108910891089109, |
|
"loss": 1.2591, |
|
"step": 9360 |
|
}, |
|
{ |
|
"epoch": 7.884797085776083, |
|
"eval_accuracy": 0.7389714933005799, |
|
"eval_loss": 1.1477636098861694, |
|
"eval_runtime": 884.2901, |
|
"eval_samples_per_second": 564.72, |
|
"eval_steps_per_second": 5.229, |
|
"step": 9360 |
|
}, |
|
{ |
|
"epoch": 7.893221014286527, |
|
"grad_norm": 0.3144013583660126, |
|
"learning_rate": 0.0003099009900990099, |
|
"loss": 1.2606, |
|
"step": 9370 |
|
}, |
|
{ |
|
"epoch": 7.901644942796972, |
|
"grad_norm": 0.30694615840911865, |
|
"learning_rate": 0.00030891089108910894, |
|
"loss": 1.2607, |
|
"step": 9380 |
|
}, |
|
{ |
|
"epoch": 7.910068871307416, |
|
"grad_norm": 0.28703033924102783, |
|
"learning_rate": 0.00030792079207920793, |
|
"loss": 1.2625, |
|
"step": 9390 |
|
}, |
|
{ |
|
"epoch": 7.918492799817861, |
|
"grad_norm": 0.24160224199295044, |
|
"learning_rate": 0.00030693069306930693, |
|
"loss": 1.2594, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 7.9269167283283055, |
|
"grad_norm": 0.26693734526634216, |
|
"learning_rate": 0.00030594059405940593, |
|
"loss": 1.2605, |
|
"step": 9410 |
|
}, |
|
{ |
|
"epoch": 7.935340656838751, |
|
"grad_norm": 0.23551449179649353, |
|
"learning_rate": 0.00030495049504950493, |
|
"loss": 1.2589, |
|
"step": 9420 |
|
}, |
|
{ |
|
"epoch": 7.943764585349195, |
|
"grad_norm": 0.23266945779323578, |
|
"learning_rate": 0.000303960396039604, |
|
"loss": 1.2575, |
|
"step": 9430 |
|
}, |
|
{ |
|
"epoch": 7.952188513859639, |
|
"grad_norm": 0.19307726621627808, |
|
"learning_rate": 0.000302970297029703, |
|
"loss": 1.2594, |
|
"step": 9440 |
|
}, |
|
{ |
|
"epoch": 7.960612442370084, |
|
"grad_norm": 0.2490869015455246, |
|
"learning_rate": 0.000301980198019802, |
|
"loss": 1.2594, |
|
"step": 9450 |
|
}, |
|
{ |
|
"epoch": 7.960612442370084, |
|
"eval_accuracy": 0.7392987654643606, |
|
"eval_loss": 1.1463170051574707, |
|
"eval_runtime": 887.3291, |
|
"eval_samples_per_second": 562.786, |
|
"eval_steps_per_second": 5.211, |
|
"step": 9450 |
|
}, |
|
{ |
|
"epoch": 7.969036370880528, |
|
"grad_norm": 0.24613766372203827, |
|
"learning_rate": 0.000300990099009901, |
|
"loss": 1.2586, |
|
"step": 9460 |
|
}, |
|
{ |
|
"epoch": 7.977460299390973, |
|
"grad_norm": 0.28653955459594727, |
|
"learning_rate": 0.0003, |
|
"loss": 1.2596, |
|
"step": 9470 |
|
}, |
|
{ |
|
"epoch": 7.985884227901417, |
|
"grad_norm": 0.2534151077270508, |
|
"learning_rate": 0.000299009900990099, |
|
"loss": 1.258, |
|
"step": 9480 |
|
}, |
|
{ |
|
"epoch": 7.994308156411861, |
|
"grad_norm": 0.2278260588645935, |
|
"learning_rate": 0.000298019801980198, |
|
"loss": 1.2596, |
|
"step": 9490 |
|
}, |
|
{ |
|
"epoch": 8.002732084922306, |
|
"grad_norm": 0.24955512583255768, |
|
"learning_rate": 0.000297029702970297, |
|
"loss": 1.2589, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 8.011156013432752, |
|
"grad_norm": 0.24727576971054077, |
|
"learning_rate": 0.000296039603960396, |
|
"loss": 1.259, |
|
"step": 9510 |
|
}, |
|
{ |
|
"epoch": 8.019579941943196, |
|
"grad_norm": 0.23246212303638458, |
|
"learning_rate": 0.000295049504950495, |
|
"loss": 1.2569, |
|
"step": 9520 |
|
}, |
|
{ |
|
"epoch": 8.02800387045364, |
|
"grad_norm": 0.31031736731529236, |
|
"learning_rate": 0.00029405940594059407, |
|
"loss": 1.2576, |
|
"step": 9530 |
|
}, |
|
{ |
|
"epoch": 8.036427798964084, |
|
"grad_norm": 0.25005343556404114, |
|
"learning_rate": 0.00029306930693069307, |
|
"loss": 1.2586, |
|
"step": 9540 |
|
}, |
|
{ |
|
"epoch": 8.036427798964084, |
|
"eval_accuracy": 0.7396166114825387, |
|
"eval_loss": 1.1443780660629272, |
|
"eval_runtime": 886.7087, |
|
"eval_samples_per_second": 563.179, |
|
"eval_steps_per_second": 5.215, |
|
"step": 9540 |
|
}, |
|
{ |
|
"epoch": 8.044851727474528, |
|
"grad_norm": 0.26693809032440186, |
|
"learning_rate": 0.00029207920792079207, |
|
"loss": 1.2565, |
|
"step": 9550 |
|
}, |
|
{ |
|
"epoch": 8.053275655984974, |
|
"grad_norm": 0.2694302797317505, |
|
"learning_rate": 0.00029108910891089107, |
|
"loss": 1.2578, |
|
"step": 9560 |
|
}, |
|
{ |
|
"epoch": 8.061699584495418, |
|
"grad_norm": 0.28717589378356934, |
|
"learning_rate": 0.00029009900990099006, |
|
"loss": 1.257, |
|
"step": 9570 |
|
}, |
|
{ |
|
"epoch": 8.070123513005862, |
|
"grad_norm": 0.2473517805337906, |
|
"learning_rate": 0.0002891089108910891, |
|
"loss": 1.2584, |
|
"step": 9580 |
|
}, |
|
{ |
|
"epoch": 8.078547441516307, |
|
"grad_norm": 0.238663449883461, |
|
"learning_rate": 0.0002881188118811881, |
|
"loss": 1.2565, |
|
"step": 9590 |
|
}, |
|
{ |
|
"epoch": 8.086971370026752, |
|
"grad_norm": 0.25168007612228394, |
|
"learning_rate": 0.0002871287128712871, |
|
"loss": 1.2601, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 8.095395298537197, |
|
"grad_norm": 0.2553163766860962, |
|
"learning_rate": 0.0002861386138613861, |
|
"loss": 1.2582, |
|
"step": 9610 |
|
}, |
|
{ |
|
"epoch": 8.10381922704764, |
|
"grad_norm": 0.22442133724689484, |
|
"learning_rate": 0.0002851485148514851, |
|
"loss": 1.2564, |
|
"step": 9620 |
|
}, |
|
{ |
|
"epoch": 8.112243155558085, |
|
"grad_norm": 0.2428729087114334, |
|
"learning_rate": 0.00028415841584158416, |
|
"loss": 1.2555, |
|
"step": 9630 |
|
}, |
|
{ |
|
"epoch": 8.112243155558085, |
|
"eval_accuracy": 0.7398516451845706, |
|
"eval_loss": 1.1434710025787354, |
|
"eval_runtime": 884.9135, |
|
"eval_samples_per_second": 564.322, |
|
"eval_steps_per_second": 5.225, |
|
"step": 9630 |
|
}, |
|
{ |
|
"epoch": 8.120667084068529, |
|
"grad_norm": 0.24635536968708038, |
|
"learning_rate": 0.00028316831683168316, |
|
"loss": 1.256, |
|
"step": 9640 |
|
}, |
|
{ |
|
"epoch": 8.129091012578975, |
|
"grad_norm": 0.25894826650619507, |
|
"learning_rate": 0.00028217821782178216, |
|
"loss": 1.2559, |
|
"step": 9650 |
|
}, |
|
{ |
|
"epoch": 8.13751494108942, |
|
"grad_norm": 0.28364095091819763, |
|
"learning_rate": 0.0002811881188118812, |
|
"loss": 1.2558, |
|
"step": 9660 |
|
}, |
|
{ |
|
"epoch": 8.145938869599863, |
|
"grad_norm": 0.27813902497291565, |
|
"learning_rate": 0.0002801980198019802, |
|
"loss": 1.2551, |
|
"step": 9670 |
|
}, |
|
{ |
|
"epoch": 8.154362798110308, |
|
"grad_norm": 0.25842994451522827, |
|
"learning_rate": 0.00027920792079207926, |
|
"loss": 1.2566, |
|
"step": 9680 |
|
}, |
|
{ |
|
"epoch": 8.162786726620752, |
|
"grad_norm": 0.28136196732521057, |
|
"learning_rate": 0.00027821782178217826, |
|
"loss": 1.2558, |
|
"step": 9690 |
|
}, |
|
{ |
|
"epoch": 8.171210655131198, |
|
"grad_norm": 0.24087685346603394, |
|
"learning_rate": 0.00027722772277227726, |
|
"loss": 1.2548, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 8.179634583641642, |
|
"grad_norm": 0.24687226116657257, |
|
"learning_rate": 0.00027623762376237626, |
|
"loss": 1.2585, |
|
"step": 9710 |
|
}, |
|
{ |
|
"epoch": 8.188058512152086, |
|
"grad_norm": 0.22570998966693878, |
|
"learning_rate": 0.00027524752475247525, |
|
"loss": 1.2534, |
|
"step": 9720 |
|
}, |
|
{ |
|
"epoch": 8.188058512152086, |
|
"eval_accuracy": 0.7402963892075639, |
|
"eval_loss": 1.1417516469955444, |
|
"eval_runtime": 887.2248, |
|
"eval_samples_per_second": 562.852, |
|
"eval_steps_per_second": 5.212, |
|
"step": 9720 |
|
}, |
|
{ |
|
"epoch": 8.19648244066253, |
|
"grad_norm": 0.2180325835943222, |
|
"learning_rate": 0.0002742574257425743, |
|
"loss": 1.254, |
|
"step": 9730 |
|
}, |
|
{ |
|
"epoch": 8.204906369172976, |
|
"grad_norm": 0.24650686979293823, |
|
"learning_rate": 0.0002732673267326733, |
|
"loss": 1.2549, |
|
"step": 9740 |
|
}, |
|
{ |
|
"epoch": 8.21333029768342, |
|
"grad_norm": 0.23055210709571838, |
|
"learning_rate": 0.0002722772277227723, |
|
"loss": 1.2533, |
|
"step": 9750 |
|
}, |
|
{ |
|
"epoch": 8.221754226193864, |
|
"grad_norm": 0.2486119419336319, |
|
"learning_rate": 0.0002712871287128713, |
|
"loss": 1.2535, |
|
"step": 9760 |
|
}, |
|
{ |
|
"epoch": 8.230178154704308, |
|
"grad_norm": 0.2295829951763153, |
|
"learning_rate": 0.0002702970297029703, |
|
"loss": 1.2532, |
|
"step": 9770 |
|
}, |
|
{ |
|
"epoch": 8.238602083214753, |
|
"grad_norm": 0.24997445940971375, |
|
"learning_rate": 0.00026930693069306935, |
|
"loss": 1.2531, |
|
"step": 9780 |
|
}, |
|
{ |
|
"epoch": 8.247026011725199, |
|
"grad_norm": 0.26696640253067017, |
|
"learning_rate": 0.00026831683168316835, |
|
"loss": 1.2537, |
|
"step": 9790 |
|
}, |
|
{ |
|
"epoch": 8.255449940235643, |
|
"grad_norm": 0.26139459013938904, |
|
"learning_rate": 0.00026732673267326735, |
|
"loss": 1.255, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 8.263873868746087, |
|
"grad_norm": 0.24359402060508728, |
|
"learning_rate": 0.00026633663366336635, |
|
"loss": 1.2531, |
|
"step": 9810 |
|
}, |
|
{ |
|
"epoch": 8.263873868746087, |
|
"eval_accuracy": 0.7405673501883495, |
|
"eval_loss": 1.139613389968872, |
|
"eval_runtime": 879.601, |
|
"eval_samples_per_second": 567.73, |
|
"eval_steps_per_second": 5.257, |
|
"step": 9810 |
|
}, |
|
{ |
|
"epoch": 8.272297797256531, |
|
"grad_norm": 0.2327917069196701, |
|
"learning_rate": 0.00026534653465346534, |
|
"loss": 1.2534, |
|
"step": 9820 |
|
}, |
|
{ |
|
"epoch": 8.280721725766975, |
|
"grad_norm": 0.25629815459251404, |
|
"learning_rate": 0.0002643564356435644, |
|
"loss": 1.2531, |
|
"step": 9830 |
|
}, |
|
{ |
|
"epoch": 8.289145654277421, |
|
"grad_norm": 0.22450138628482819, |
|
"learning_rate": 0.0002633663366336634, |
|
"loss": 1.2529, |
|
"step": 9840 |
|
}, |
|
{ |
|
"epoch": 8.297569582787865, |
|
"grad_norm": 0.2623524069786072, |
|
"learning_rate": 0.0002623762376237624, |
|
"loss": 1.2504, |
|
"step": 9850 |
|
}, |
|
{ |
|
"epoch": 8.30599351129831, |
|
"grad_norm": 0.2159668356180191, |
|
"learning_rate": 0.0002613861386138614, |
|
"loss": 1.2528, |
|
"step": 9860 |
|
}, |
|
{ |
|
"epoch": 8.314417439808754, |
|
"grad_norm": 0.24267102777957916, |
|
"learning_rate": 0.0002603960396039604, |
|
"loss": 1.2514, |
|
"step": 9870 |
|
}, |
|
{ |
|
"epoch": 8.322841368319198, |
|
"grad_norm": 0.2541745603084564, |
|
"learning_rate": 0.00025940594059405944, |
|
"loss": 1.2505, |
|
"step": 9880 |
|
}, |
|
{ |
|
"epoch": 8.331265296829644, |
|
"grad_norm": 0.28231385350227356, |
|
"learning_rate": 0.00025841584158415844, |
|
"loss": 1.2511, |
|
"step": 9890 |
|
}, |
|
{ |
|
"epoch": 8.339689225340088, |
|
"grad_norm": 0.2412833273410797, |
|
"learning_rate": 0.00025742574257425744, |
|
"loss": 1.2506, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 8.339689225340088, |
|
"eval_accuracy": 0.740612444763646, |
|
"eval_loss": 1.140478491783142, |
|
"eval_runtime": 884.9323, |
|
"eval_samples_per_second": 564.31, |
|
"eval_steps_per_second": 5.225, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 8.348113153850532, |
|
"grad_norm": 0.2641441524028778, |
|
"learning_rate": 0.00025643564356435644, |
|
"loss": 1.2519, |
|
"step": 9910 |
|
}, |
|
{ |
|
"epoch": 8.356537082360976, |
|
"grad_norm": 0.2675786316394806, |
|
"learning_rate": 0.00025544554455445543, |
|
"loss": 1.2516, |
|
"step": 9920 |
|
}, |
|
{ |
|
"epoch": 8.364961010871422, |
|
"grad_norm": 0.2118910253047943, |
|
"learning_rate": 0.0002544554455445545, |
|
"loss": 1.2511, |
|
"step": 9930 |
|
}, |
|
{ |
|
"epoch": 8.373384939381866, |
|
"grad_norm": 0.27223941683769226, |
|
"learning_rate": 0.0002534653465346535, |
|
"loss": 1.2519, |
|
"step": 9940 |
|
}, |
|
{ |
|
"epoch": 8.38180886789231, |
|
"grad_norm": 0.2487749308347702, |
|
"learning_rate": 0.0002524752475247525, |
|
"loss": 1.2506, |
|
"step": 9950 |
|
}, |
|
{ |
|
"epoch": 8.390232796402755, |
|
"grad_norm": 0.2320510894060135, |
|
"learning_rate": 0.0002514851485148515, |
|
"loss": 1.2534, |
|
"step": 9960 |
|
}, |
|
{ |
|
"epoch": 8.398656724913199, |
|
"grad_norm": 0.2474934607744217, |
|
"learning_rate": 0.0002504950495049505, |
|
"loss": 1.249, |
|
"step": 9970 |
|
}, |
|
{ |
|
"epoch": 8.407080653423645, |
|
"grad_norm": 0.23778343200683594, |
|
"learning_rate": 0.00024950495049504953, |
|
"loss": 1.2503, |
|
"step": 9980 |
|
}, |
|
{ |
|
"epoch": 8.415504581934089, |
|
"grad_norm": 0.2715946137905121, |
|
"learning_rate": 0.00024851485148514853, |
|
"loss": 1.2515, |
|
"step": 9990 |
|
}, |
|
{ |
|
"epoch": 8.415504581934089, |
|
"eval_accuracy": 0.7412818791412316, |
|
"eval_loss": 1.137270450592041, |
|
"eval_runtime": 885.4223, |
|
"eval_samples_per_second": 563.998, |
|
"eval_steps_per_second": 5.222, |
|
"step": 9990 |
|
}, |
|
{ |
|
"epoch": 8.423928510444533, |
|
"grad_norm": 0.26555290818214417, |
|
"learning_rate": 0.00024752475247524753, |
|
"loss": 1.2485, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 8.432352438954977, |
|
"grad_norm": 0.23698092997074127, |
|
"learning_rate": 0.0002465346534653465, |
|
"loss": 1.2498, |
|
"step": 10010 |
|
}, |
|
{ |
|
"epoch": 8.440776367465421, |
|
"grad_norm": 0.23015616834163666, |
|
"learning_rate": 0.0002455445544554455, |
|
"loss": 1.2482, |
|
"step": 10020 |
|
}, |
|
{ |
|
"epoch": 8.449200295975867, |
|
"grad_norm": 0.22911451756954193, |
|
"learning_rate": 0.0002445544554455446, |
|
"loss": 1.2503, |
|
"step": 10030 |
|
}, |
|
{ |
|
"epoch": 8.457624224486311, |
|
"grad_norm": 0.24171452224254608, |
|
"learning_rate": 0.00024356435643564357, |
|
"loss": 1.2485, |
|
"step": 10040 |
|
}, |
|
{ |
|
"epoch": 8.466048152996756, |
|
"grad_norm": 0.24717497825622559, |
|
"learning_rate": 0.00024257425742574257, |
|
"loss": 1.2503, |
|
"step": 10050 |
|
}, |
|
{ |
|
"epoch": 8.4744720815072, |
|
"grad_norm": 0.23118732869625092, |
|
"learning_rate": 0.00024158415841584157, |
|
"loss": 1.2488, |
|
"step": 10060 |
|
}, |
|
{ |
|
"epoch": 8.482896010017644, |
|
"grad_norm": 0.22151467204093933, |
|
"learning_rate": 0.0002405940594059406, |
|
"loss": 1.2484, |
|
"step": 10070 |
|
}, |
|
{ |
|
"epoch": 8.49131993852809, |
|
"grad_norm": 0.2284466177225113, |
|
"learning_rate": 0.0002396039603960396, |
|
"loss": 1.2487, |
|
"step": 10080 |
|
}, |
|
{ |
|
"epoch": 8.49131993852809, |
|
"eval_accuracy": 0.7414350855696202, |
|
"eval_loss": 1.134464144706726, |
|
"eval_runtime": 887.5421, |
|
"eval_samples_per_second": 562.65, |
|
"eval_steps_per_second": 5.21, |
|
"step": 10080 |
|
}, |
|
{ |
|
"epoch": 8.499743867038534, |
|
"grad_norm": 0.2377534806728363, |
|
"learning_rate": 0.00023861386138613862, |
|
"loss": 1.2491, |
|
"step": 10090 |
|
}, |
|
{ |
|
"epoch": 8.508167795548978, |
|
"grad_norm": 0.2649644613265991, |
|
"learning_rate": 0.00023762376237623762, |
|
"loss": 1.2467, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 8.516591724059422, |
|
"grad_norm": 0.22302138805389404, |
|
"learning_rate": 0.00023663366336633662, |
|
"loss": 1.2496, |
|
"step": 10110 |
|
}, |
|
{ |
|
"epoch": 8.525015652569868, |
|
"grad_norm": 0.24170257151126862, |
|
"learning_rate": 0.00023564356435643564, |
|
"loss": 1.2471, |
|
"step": 10120 |
|
}, |
|
{ |
|
"epoch": 8.533439581080312, |
|
"grad_norm": 0.2645774781703949, |
|
"learning_rate": 0.00023465346534653464, |
|
"loss": 1.2477, |
|
"step": 10130 |
|
}, |
|
{ |
|
"epoch": 8.541863509590756, |
|
"grad_norm": 0.24155734479427338, |
|
"learning_rate": 0.0002336633663366337, |
|
"loss": 1.2466, |
|
"step": 10140 |
|
}, |
|
{ |
|
"epoch": 8.5502874381012, |
|
"grad_norm": 0.23023132979869843, |
|
"learning_rate": 0.0002326732673267327, |
|
"loss": 1.2457, |
|
"step": 10150 |
|
}, |
|
{ |
|
"epoch": 8.558711366611645, |
|
"grad_norm": 0.2243080586194992, |
|
"learning_rate": 0.0002316831683168317, |
|
"loss": 1.2476, |
|
"step": 10160 |
|
}, |
|
{ |
|
"epoch": 8.56713529512209, |
|
"grad_norm": 0.278157114982605, |
|
"learning_rate": 0.00023069306930693071, |
|
"loss": 1.2462, |
|
"step": 10170 |
|
}, |
|
{ |
|
"epoch": 8.56713529512209, |
|
"eval_accuracy": 0.7417397824056636, |
|
"eval_loss": 1.1336922645568848, |
|
"eval_runtime": 892.4907, |
|
"eval_samples_per_second": 559.531, |
|
"eval_steps_per_second": 5.181, |
|
"step": 10170 |
|
}, |
|
{ |
|
"epoch": 8.575559223632535, |
|
"grad_norm": 0.24606026709079742, |
|
"learning_rate": 0.0002297029702970297, |
|
"loss": 1.2478, |
|
"step": 10180 |
|
}, |
|
{ |
|
"epoch": 8.583983152142979, |
|
"grad_norm": 0.23494498431682587, |
|
"learning_rate": 0.00022871287128712874, |
|
"loss": 1.2463, |
|
"step": 10190 |
|
}, |
|
{ |
|
"epoch": 8.592407080653423, |
|
"grad_norm": 0.21522320806980133, |
|
"learning_rate": 0.00022772277227722774, |
|
"loss": 1.2479, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 8.60083100916387, |
|
"grad_norm": 0.2655723989009857, |
|
"learning_rate": 0.00022673267326732673, |
|
"loss": 1.2468, |
|
"step": 10210 |
|
}, |
|
{ |
|
"epoch": 8.609254937674313, |
|
"grad_norm": 0.2444898933172226, |
|
"learning_rate": 0.00022574257425742576, |
|
"loss": 1.246, |
|
"step": 10220 |
|
}, |
|
{ |
|
"epoch": 8.617678866184757, |
|
"grad_norm": 0.2277156114578247, |
|
"learning_rate": 0.00022475247524752476, |
|
"loss": 1.2466, |
|
"step": 10230 |
|
}, |
|
{ |
|
"epoch": 8.626102794695202, |
|
"grad_norm": 0.22111962735652924, |
|
"learning_rate": 0.00022376237623762378, |
|
"loss": 1.2451, |
|
"step": 10240 |
|
}, |
|
{ |
|
"epoch": 8.634526723205646, |
|
"grad_norm": 0.23199447989463806, |
|
"learning_rate": 0.00022277227722772278, |
|
"loss": 1.2463, |
|
"step": 10250 |
|
}, |
|
{ |
|
"epoch": 8.642950651716092, |
|
"grad_norm": 0.22960427403450012, |
|
"learning_rate": 0.00022178217821782178, |
|
"loss": 1.2465, |
|
"step": 10260 |
|
}, |
|
{ |
|
"epoch": 8.642950651716092, |
|
"eval_accuracy": 0.7420823467349104, |
|
"eval_loss": 1.1322184801101685, |
|
"eval_runtime": 883.7567, |
|
"eval_samples_per_second": 565.061, |
|
"eval_steps_per_second": 5.232, |
|
"step": 10260 |
|
}, |
|
{ |
|
"epoch": 8.651374580226536, |
|
"grad_norm": 0.290622353553772, |
|
"learning_rate": 0.0002207920792079208, |
|
"loss": 1.2444, |
|
"step": 10270 |
|
}, |
|
{ |
|
"epoch": 8.65979850873698, |
|
"grad_norm": 0.2639337480068207, |
|
"learning_rate": 0.0002198019801980198, |
|
"loss": 1.247, |
|
"step": 10280 |
|
}, |
|
{ |
|
"epoch": 8.668222437247424, |
|
"grad_norm": 0.22477252781391144, |
|
"learning_rate": 0.00021881188118811883, |
|
"loss": 1.2443, |
|
"step": 10290 |
|
}, |
|
{ |
|
"epoch": 8.676646365757868, |
|
"grad_norm": 0.2989983558654785, |
|
"learning_rate": 0.00021782178217821783, |
|
"loss": 1.2461, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 8.685070294268314, |
|
"grad_norm": 0.22259776294231415, |
|
"learning_rate": 0.00021683168316831682, |
|
"loss": 1.2438, |
|
"step": 10310 |
|
}, |
|
{ |
|
"epoch": 8.693494222778758, |
|
"grad_norm": 0.21380363404750824, |
|
"learning_rate": 0.00021584158415841585, |
|
"loss": 1.2414, |
|
"step": 10320 |
|
}, |
|
{ |
|
"epoch": 8.701918151289203, |
|
"grad_norm": 0.23593538999557495, |
|
"learning_rate": 0.00021485148514851485, |
|
"loss": 1.2454, |
|
"step": 10330 |
|
}, |
|
{ |
|
"epoch": 8.710342079799647, |
|
"grad_norm": 0.25987499952316284, |
|
"learning_rate": 0.00021386138613861387, |
|
"loss": 1.2444, |
|
"step": 10340 |
|
}, |
|
{ |
|
"epoch": 8.71876600831009, |
|
"grad_norm": 0.21150009334087372, |
|
"learning_rate": 0.00021287128712871287, |
|
"loss": 1.2414, |
|
"step": 10350 |
|
}, |
|
{ |
|
"epoch": 8.71876600831009, |
|
"eval_accuracy": 0.7421671573662553, |
|
"eval_loss": 1.1316900253295898, |
|
"eval_runtime": 893.0033, |
|
"eval_samples_per_second": 559.21, |
|
"eval_steps_per_second": 5.178, |
|
"step": 10350 |
|
}, |
|
{ |
|
"epoch": 8.727189936820537, |
|
"grad_norm": 0.23628725111484528, |
|
"learning_rate": 0.00021188118811881187, |
|
"loss": 1.2432, |
|
"step": 10360 |
|
}, |
|
{ |
|
"epoch": 8.735613865330981, |
|
"grad_norm": 0.24477533996105194, |
|
"learning_rate": 0.0002108910891089109, |
|
"loss": 1.2447, |
|
"step": 10370 |
|
}, |
|
{ |
|
"epoch": 8.744037793841425, |
|
"grad_norm": 0.2156253159046173, |
|
"learning_rate": 0.0002099009900990099, |
|
"loss": 1.2452, |
|
"step": 10380 |
|
}, |
|
{ |
|
"epoch": 8.75246172235187, |
|
"grad_norm": 0.27982792258262634, |
|
"learning_rate": 0.00020891089108910892, |
|
"loss": 1.2434, |
|
"step": 10390 |
|
}, |
|
{ |
|
"epoch": 8.760885650862313, |
|
"grad_norm": 0.24025356769561768, |
|
"learning_rate": 0.00020792079207920792, |
|
"loss": 1.244, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 8.76930957937276, |
|
"grad_norm": 0.22768454253673553, |
|
"learning_rate": 0.00020693069306930691, |
|
"loss": 1.2427, |
|
"step": 10410 |
|
}, |
|
{ |
|
"epoch": 8.777733507883204, |
|
"grad_norm": 0.2676762640476227, |
|
"learning_rate": 0.00020594059405940594, |
|
"loss": 1.244, |
|
"step": 10420 |
|
}, |
|
{ |
|
"epoch": 8.786157436393648, |
|
"grad_norm": 0.23502378165721893, |
|
"learning_rate": 0.00020495049504950494, |
|
"loss": 1.244, |
|
"step": 10430 |
|
}, |
|
{ |
|
"epoch": 8.794581364904092, |
|
"grad_norm": 0.23354895412921906, |
|
"learning_rate": 0.00020396039603960396, |
|
"loss": 1.2435, |
|
"step": 10440 |
|
}, |
|
{ |
|
"epoch": 8.794581364904092, |
|
"eval_accuracy": 0.7425177306861277, |
|
"eval_loss": 1.1301963329315186, |
|
"eval_runtime": 885.137, |
|
"eval_samples_per_second": 564.179, |
|
"eval_steps_per_second": 5.224, |
|
"step": 10440 |
|
}, |
|
{ |
|
"epoch": 8.803005293414538, |
|
"grad_norm": 0.22738757729530334, |
|
"learning_rate": 0.000202970297029703, |
|
"loss": 1.2426, |
|
"step": 10450 |
|
}, |
|
{ |
|
"epoch": 8.811429221924982, |
|
"grad_norm": 0.20702116191387177, |
|
"learning_rate": 0.00020198019801980199, |
|
"loss": 1.243, |
|
"step": 10460 |
|
}, |
|
{ |
|
"epoch": 8.819853150435426, |
|
"grad_norm": 0.20945468544960022, |
|
"learning_rate": 0.000200990099009901, |
|
"loss": 1.2411, |
|
"step": 10470 |
|
}, |
|
{ |
|
"epoch": 8.82827707894587, |
|
"grad_norm": 0.21654458343982697, |
|
"learning_rate": 0.0002, |
|
"loss": 1.2428, |
|
"step": 10480 |
|
}, |
|
{ |
|
"epoch": 8.836701007456314, |
|
"grad_norm": 0.2217228263616562, |
|
"learning_rate": 0.00019900990099009903, |
|
"loss": 1.2405, |
|
"step": 10490 |
|
}, |
|
{ |
|
"epoch": 8.84512493596676, |
|
"grad_norm": 0.27619633078575134, |
|
"learning_rate": 0.00019801980198019803, |
|
"loss": 1.2424, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 8.853548864477204, |
|
"grad_norm": 0.2569934129714966, |
|
"learning_rate": 0.00019702970297029703, |
|
"loss": 1.2418, |
|
"step": 10510 |
|
}, |
|
{ |
|
"epoch": 8.861972792987649, |
|
"grad_norm": 0.2570299804210663, |
|
"learning_rate": 0.00019603960396039606, |
|
"loss": 1.2423, |
|
"step": 10520 |
|
}, |
|
{ |
|
"epoch": 8.870396721498093, |
|
"grad_norm": 0.22972337901592255, |
|
"learning_rate": 0.00019504950495049505, |
|
"loss": 1.2399, |
|
"step": 10530 |
|
}, |
|
{ |
|
"epoch": 8.870396721498093, |
|
"eval_accuracy": 0.7427001211705735, |
|
"eval_loss": 1.1304486989974976, |
|
"eval_runtime": 881.4454, |
|
"eval_samples_per_second": 566.542, |
|
"eval_steps_per_second": 5.246, |
|
"step": 10530 |
|
}, |
|
{ |
|
"epoch": 8.878820650008539, |
|
"grad_norm": 0.2365693300962448, |
|
"learning_rate": 0.00019405940594059408, |
|
"loss": 1.2426, |
|
"step": 10540 |
|
}, |
|
{ |
|
"epoch": 8.887244578518983, |
|
"grad_norm": 0.2252751588821411, |
|
"learning_rate": 0.00019306930693069308, |
|
"loss": 1.2406, |
|
"step": 10550 |
|
}, |
|
{ |
|
"epoch": 8.895668507029427, |
|
"grad_norm": 0.2205033302307129, |
|
"learning_rate": 0.00019207920792079208, |
|
"loss": 1.2419, |
|
"step": 10560 |
|
}, |
|
{ |
|
"epoch": 8.904092435539871, |
|
"grad_norm": 0.21468041837215424, |
|
"learning_rate": 0.0001910891089108911, |
|
"loss": 1.2406, |
|
"step": 10570 |
|
}, |
|
{ |
|
"epoch": 8.912516364050315, |
|
"grad_norm": 0.23669223487377167, |
|
"learning_rate": 0.0001900990099009901, |
|
"loss": 1.2401, |
|
"step": 10580 |
|
}, |
|
{ |
|
"epoch": 8.920940292560761, |
|
"grad_norm": 0.2412618100643158, |
|
"learning_rate": 0.00018910891089108913, |
|
"loss": 1.2402, |
|
"step": 10590 |
|
}, |
|
{ |
|
"epoch": 8.929364221071205, |
|
"grad_norm": 0.21675223112106323, |
|
"learning_rate": 0.00018811881188118812, |
|
"loss": 1.2417, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 8.93778814958165, |
|
"grad_norm": 0.24683676660060883, |
|
"learning_rate": 0.00018712871287128712, |
|
"loss": 1.2417, |
|
"step": 10610 |
|
}, |
|
{ |
|
"epoch": 8.946212078092094, |
|
"grad_norm": 0.21681492030620575, |
|
"learning_rate": 0.00018613861386138615, |
|
"loss": 1.2408, |
|
"step": 10620 |
|
}, |
|
{ |
|
"epoch": 8.946212078092094, |
|
"eval_accuracy": 0.7428579001690714, |
|
"eval_loss": 1.1290760040283203, |
|
"eval_runtime": 889.1418, |
|
"eval_samples_per_second": 561.638, |
|
"eval_steps_per_second": 5.201, |
|
"step": 10620 |
|
}, |
|
{ |
|
"epoch": 8.954636006602538, |
|
"grad_norm": 0.22117485105991364, |
|
"learning_rate": 0.00018514851485148514, |
|
"loss": 1.2399, |
|
"step": 10630 |
|
}, |
|
{ |
|
"epoch": 8.963059935112984, |
|
"grad_norm": 0.2180255800485611, |
|
"learning_rate": 0.00018415841584158417, |
|
"loss": 1.2378, |
|
"step": 10640 |
|
}, |
|
{ |
|
"epoch": 8.971483863623428, |
|
"grad_norm": 0.23244567215442657, |
|
"learning_rate": 0.00018316831683168317, |
|
"loss": 1.2402, |
|
"step": 10650 |
|
}, |
|
{ |
|
"epoch": 8.979907792133872, |
|
"grad_norm": 0.23777294158935547, |
|
"learning_rate": 0.00018217821782178217, |
|
"loss": 1.2417, |
|
"step": 10660 |
|
}, |
|
{ |
|
"epoch": 8.988331720644316, |
|
"grad_norm": 0.26418906450271606, |
|
"learning_rate": 0.0001811881188118812, |
|
"loss": 1.238, |
|
"step": 10670 |
|
}, |
|
{ |
|
"epoch": 8.99675564915476, |
|
"grad_norm": 0.21142803132534027, |
|
"learning_rate": 0.0001801980198019802, |
|
"loss": 1.2384, |
|
"step": 10680 |
|
}, |
|
{ |
|
"epoch": 9.005179577665206, |
|
"grad_norm": 0.21976542472839355, |
|
"learning_rate": 0.00017920792079207922, |
|
"loss": 1.2399, |
|
"step": 10690 |
|
}, |
|
{ |
|
"epoch": 9.01360350617565, |
|
"grad_norm": 0.2216147631406784, |
|
"learning_rate": 0.0001782178217821782, |
|
"loss": 1.2391, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 9.022027434686095, |
|
"grad_norm": 0.1873018890619278, |
|
"learning_rate": 0.0001772277227722772, |
|
"loss": 1.2368, |
|
"step": 10710 |
|
}, |
|
{ |
|
"epoch": 9.022027434686095, |
|
"eval_accuracy": 0.7431224622062498, |
|
"eval_loss": 1.1265127658843994, |
|
"eval_runtime": 891.5668, |
|
"eval_samples_per_second": 560.111, |
|
"eval_steps_per_second": 5.186, |
|
"step": 10710 |
|
}, |
|
{ |
|
"epoch": 9.030451363196539, |
|
"grad_norm": 0.23913191258907318, |
|
"learning_rate": 0.00017623762376237624, |
|
"loss": 1.2404, |
|
"step": 10720 |
|
}, |
|
{ |
|
"epoch": 9.038875291706983, |
|
"grad_norm": 0.21578449010849, |
|
"learning_rate": 0.00017524752475247524, |
|
"loss": 1.2388, |
|
"step": 10730 |
|
}, |
|
{ |
|
"epoch": 9.047299220217429, |
|
"grad_norm": 0.2038455754518509, |
|
"learning_rate": 0.00017425742574257426, |
|
"loss": 1.2402, |
|
"step": 10740 |
|
}, |
|
{ |
|
"epoch": 9.055723148727873, |
|
"grad_norm": 0.21903488039970398, |
|
"learning_rate": 0.00017326732673267326, |
|
"loss": 1.2383, |
|
"step": 10750 |
|
}, |
|
{ |
|
"epoch": 9.064147077238317, |
|
"grad_norm": 0.21970726549625397, |
|
"learning_rate": 0.00017227722772277226, |
|
"loss": 1.2386, |
|
"step": 10760 |
|
}, |
|
{ |
|
"epoch": 9.072571005748761, |
|
"grad_norm": 0.22701360285282135, |
|
"learning_rate": 0.0001712871287128713, |
|
"loss": 1.2391, |
|
"step": 10770 |
|
}, |
|
{ |
|
"epoch": 9.080994934259207, |
|
"grad_norm": 0.21777622401714325, |
|
"learning_rate": 0.0001702970297029703, |
|
"loss": 1.2388, |
|
"step": 10780 |
|
}, |
|
{ |
|
"epoch": 9.089418862769651, |
|
"grad_norm": 0.2336941659450531, |
|
"learning_rate": 0.00016930693069306933, |
|
"loss": 1.2383, |
|
"step": 10790 |
|
}, |
|
{ |
|
"epoch": 9.097842791280096, |
|
"grad_norm": 0.20545706152915955, |
|
"learning_rate": 0.00016831683168316833, |
|
"loss": 1.2376, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 9.097842791280096, |
|
"eval_accuracy": 0.7435866345331611, |
|
"eval_loss": 1.1250243186950684, |
|
"eval_runtime": 885.3582, |
|
"eval_samples_per_second": 564.038, |
|
"eval_steps_per_second": 5.223, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 9.10626671979054, |
|
"grad_norm": 0.23678459227085114, |
|
"learning_rate": 0.00016732673267326733, |
|
"loss": 1.2394, |
|
"step": 10810 |
|
}, |
|
{ |
|
"epoch": 9.114690648300984, |
|
"grad_norm": 0.24195948243141174, |
|
"learning_rate": 0.00016633663366336635, |
|
"loss": 1.238, |
|
"step": 10820 |
|
}, |
|
{ |
|
"epoch": 9.12311457681143, |
|
"grad_norm": 0.20026259124279022, |
|
"learning_rate": 0.00016534653465346535, |
|
"loss": 1.2364, |
|
"step": 10830 |
|
}, |
|
{ |
|
"epoch": 9.131538505321874, |
|
"grad_norm": 0.21753010153770447, |
|
"learning_rate": 0.00016435643564356438, |
|
"loss": 1.238, |
|
"step": 10840 |
|
}, |
|
{ |
|
"epoch": 9.139962433832318, |
|
"grad_norm": 0.20273657143115997, |
|
"learning_rate": 0.00016336633663366338, |
|
"loss": 1.2374, |
|
"step": 10850 |
|
}, |
|
{ |
|
"epoch": 9.148386362342762, |
|
"grad_norm": 0.21302086114883423, |
|
"learning_rate": 0.00016237623762376237, |
|
"loss": 1.2372, |
|
"step": 10860 |
|
}, |
|
{ |
|
"epoch": 9.156810290853207, |
|
"grad_norm": 0.23342467844486237, |
|
"learning_rate": 0.0001613861386138614, |
|
"loss": 1.2378, |
|
"step": 10870 |
|
}, |
|
{ |
|
"epoch": 9.165234219363652, |
|
"grad_norm": 0.24393875896930695, |
|
"learning_rate": 0.0001603960396039604, |
|
"loss": 1.2362, |
|
"step": 10880 |
|
}, |
|
{ |
|
"epoch": 9.173658147874097, |
|
"grad_norm": 0.19604717195034027, |
|
"learning_rate": 0.00015940594059405942, |
|
"loss": 1.237, |
|
"step": 10890 |
|
}, |
|
{ |
|
"epoch": 9.173658147874097, |
|
"eval_accuracy": 0.743667723412049, |
|
"eval_loss": 1.124830722808838, |
|
"eval_runtime": 887.4222, |
|
"eval_samples_per_second": 562.727, |
|
"eval_steps_per_second": 5.211, |
|
"step": 10890 |
|
}, |
|
{ |
|
"epoch": 9.18208207638454, |
|
"grad_norm": 0.19619697332382202, |
|
"learning_rate": 0.00015841584158415842, |
|
"loss": 1.2356, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 9.190506004894985, |
|
"grad_norm": 0.20415499806404114, |
|
"learning_rate": 0.00015742574257425742, |
|
"loss": 1.2373, |
|
"step": 10910 |
|
}, |
|
{ |
|
"epoch": 9.19892993340543, |
|
"grad_norm": 0.21602529287338257, |
|
"learning_rate": 0.00015643564356435644, |
|
"loss": 1.2369, |
|
"step": 10920 |
|
}, |
|
{ |
|
"epoch": 9.207353861915875, |
|
"grad_norm": 0.2266259491443634, |
|
"learning_rate": 0.00015544554455445544, |
|
"loss": 1.236, |
|
"step": 10930 |
|
}, |
|
{ |
|
"epoch": 9.21577779042632, |
|
"grad_norm": 0.2172340452671051, |
|
"learning_rate": 0.00015445544554455447, |
|
"loss": 1.236, |
|
"step": 10940 |
|
}, |
|
{ |
|
"epoch": 9.224201718936763, |
|
"grad_norm": 0.21929994225502014, |
|
"learning_rate": 0.00015346534653465347, |
|
"loss": 1.2381, |
|
"step": 10950 |
|
}, |
|
{ |
|
"epoch": 9.232625647447207, |
|
"grad_norm": 0.20617130398750305, |
|
"learning_rate": 0.00015247524752475246, |
|
"loss": 1.2346, |
|
"step": 10960 |
|
}, |
|
{ |
|
"epoch": 9.241049575957653, |
|
"grad_norm": 0.2271021008491516, |
|
"learning_rate": 0.0001514851485148515, |
|
"loss": 1.2364, |
|
"step": 10970 |
|
}, |
|
{ |
|
"epoch": 9.249473504468098, |
|
"grad_norm": 0.22377552092075348, |
|
"learning_rate": 0.0001504950495049505, |
|
"loss": 1.2342, |
|
"step": 10980 |
|
}, |
|
{ |
|
"epoch": 9.249473504468098, |
|
"eval_accuracy": 0.7438243969178056, |
|
"eval_loss": 1.124144434928894, |
|
"eval_runtime": 880.0851, |
|
"eval_samples_per_second": 567.418, |
|
"eval_steps_per_second": 5.254, |
|
"step": 10980 |
|
}, |
|
{ |
|
"epoch": 9.257897432978542, |
|
"grad_norm": 0.23195216059684753, |
|
"learning_rate": 0.0001495049504950495, |
|
"loss": 1.2347, |
|
"step": 10990 |
|
}, |
|
{ |
|
"epoch": 9.266321361488986, |
|
"grad_norm": 0.19934554398059845, |
|
"learning_rate": 0.0001485148514851485, |
|
"loss": 1.2359, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 9.27474528999943, |
|
"grad_norm": 0.19541287422180176, |
|
"learning_rate": 0.0001475247524752475, |
|
"loss": 1.2342, |
|
"step": 11010 |
|
}, |
|
{ |
|
"epoch": 9.283169218509876, |
|
"grad_norm": 0.2204955518245697, |
|
"learning_rate": 0.00014653465346534653, |
|
"loss": 1.2356, |
|
"step": 11020 |
|
}, |
|
{ |
|
"epoch": 9.29159314702032, |
|
"grad_norm": 0.22855669260025024, |
|
"learning_rate": 0.00014554455445544553, |
|
"loss": 1.2367, |
|
"step": 11030 |
|
}, |
|
{ |
|
"epoch": 9.300017075530764, |
|
"grad_norm": 0.20308193564414978, |
|
"learning_rate": 0.00014455445544554456, |
|
"loss": 1.235, |
|
"step": 11040 |
|
}, |
|
{ |
|
"epoch": 9.308441004041208, |
|
"grad_norm": 0.18201188743114471, |
|
"learning_rate": 0.00014356435643564356, |
|
"loss": 1.235, |
|
"step": 11050 |
|
}, |
|
{ |
|
"epoch": 9.316864932551653, |
|
"grad_norm": 0.199186772108078, |
|
"learning_rate": 0.00014257425742574255, |
|
"loss": 1.2348, |
|
"step": 11060 |
|
}, |
|
{ |
|
"epoch": 9.325288861062099, |
|
"grad_norm": 0.23214493691921234, |
|
"learning_rate": 0.00014158415841584158, |
|
"loss": 1.2335, |
|
"step": 11070 |
|
}, |
|
{ |
|
"epoch": 9.325288861062099, |
|
"eval_accuracy": 0.7438911749364814, |
|
"eval_loss": 1.123384714126587, |
|
"eval_runtime": 888.3176, |
|
"eval_samples_per_second": 562.159, |
|
"eval_steps_per_second": 5.205, |
|
"step": 11070 |
|
}, |
|
{ |
|
"epoch": 9.333712789572543, |
|
"grad_norm": 0.2128278762102127, |
|
"learning_rate": 0.0001405940594059406, |
|
"loss": 1.2337, |
|
"step": 11080 |
|
}, |
|
{ |
|
"epoch": 9.342136718082987, |
|
"grad_norm": 0.20257510244846344, |
|
"learning_rate": 0.00013960396039603963, |
|
"loss": 1.2357, |
|
"step": 11090 |
|
}, |
|
{ |
|
"epoch": 9.350560646593431, |
|
"grad_norm": 0.22038786113262177, |
|
"learning_rate": 0.00013861386138613863, |
|
"loss": 1.2333, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 9.358984575103877, |
|
"grad_norm": 0.2351042628288269, |
|
"learning_rate": 0.00013762376237623763, |
|
"loss": 1.235, |
|
"step": 11110 |
|
}, |
|
{ |
|
"epoch": 9.367408503614321, |
|
"grad_norm": 0.2042153775691986, |
|
"learning_rate": 0.00013663366336633665, |
|
"loss": 1.2339, |
|
"step": 11120 |
|
}, |
|
{ |
|
"epoch": 9.375832432124765, |
|
"grad_norm": 0.20065917074680328, |
|
"learning_rate": 0.00013564356435643565, |
|
"loss": 1.234, |
|
"step": 11130 |
|
}, |
|
{ |
|
"epoch": 9.38425636063521, |
|
"grad_norm": 0.22544540464878082, |
|
"learning_rate": 0.00013465346534653468, |
|
"loss": 1.2319, |
|
"step": 11140 |
|
}, |
|
{ |
|
"epoch": 9.392680289145654, |
|
"grad_norm": 0.2352074533700943, |
|
"learning_rate": 0.00013366336633663367, |
|
"loss": 1.2347, |
|
"step": 11150 |
|
}, |
|
{ |
|
"epoch": 9.4011042176561, |
|
"grad_norm": 0.2452593892812729, |
|
"learning_rate": 0.00013267326732673267, |
|
"loss": 1.2343, |
|
"step": 11160 |
|
}, |
|
{ |
|
"epoch": 9.4011042176561, |
|
"eval_accuracy": 0.7445740208736444, |
|
"eval_loss": 1.1202077865600586, |
|
"eval_runtime": 879.3984, |
|
"eval_samples_per_second": 567.861, |
|
"eval_steps_per_second": 5.258, |
|
"step": 11160 |
|
}, |
|
{ |
|
"epoch": 9.409528146166544, |
|
"grad_norm": 0.20848217606544495, |
|
"learning_rate": 0.0001316831683168317, |
|
"loss": 1.2315, |
|
"step": 11170 |
|
}, |
|
{ |
|
"epoch": 9.417952074676988, |
|
"grad_norm": 0.20628029108047485, |
|
"learning_rate": 0.0001306930693069307, |
|
"loss": 1.2326, |
|
"step": 11180 |
|
}, |
|
{ |
|
"epoch": 9.426376003187432, |
|
"grad_norm": 0.199026957154274, |
|
"learning_rate": 0.00012970297029702972, |
|
"loss": 1.2329, |
|
"step": 11190 |
|
}, |
|
{ |
|
"epoch": 9.434799931697876, |
|
"grad_norm": 0.21373671293258667, |
|
"learning_rate": 0.00012871287128712872, |
|
"loss": 1.2326, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 9.443223860208322, |
|
"grad_norm": 0.2015460729598999, |
|
"learning_rate": 0.00012772277227722772, |
|
"loss": 1.2327, |
|
"step": 11210 |
|
}, |
|
{ |
|
"epoch": 9.451647788718766, |
|
"grad_norm": 0.2228008210659027, |
|
"learning_rate": 0.00012673267326732674, |
|
"loss": 1.2334, |
|
"step": 11220 |
|
}, |
|
{ |
|
"epoch": 9.46007171722921, |
|
"grad_norm": 0.21561528742313385, |
|
"learning_rate": 0.00012574257425742574, |
|
"loss": 1.233, |
|
"step": 11230 |
|
}, |
|
{ |
|
"epoch": 9.468495645739655, |
|
"grad_norm": 0.2073032706975937, |
|
"learning_rate": 0.00012475247524752477, |
|
"loss": 1.2314, |
|
"step": 11240 |
|
}, |
|
{ |
|
"epoch": 9.4769195742501, |
|
"grad_norm": 0.19552037119865417, |
|
"learning_rate": 0.00012376237623762376, |
|
"loss": 1.2333, |
|
"step": 11250 |
|
}, |
|
{ |
|
"epoch": 9.4769195742501, |
|
"eval_accuracy": 0.744401638855597, |
|
"eval_loss": 1.1210565567016602, |
|
"eval_runtime": 888.2535, |
|
"eval_samples_per_second": 562.2, |
|
"eval_steps_per_second": 5.206, |
|
"step": 11250 |
|
}, |
|
{ |
|
"epoch": 9.485343502760545, |
|
"grad_norm": 0.20909276604652405, |
|
"learning_rate": 0.00012277227722772276, |
|
"loss": 1.2332, |
|
"step": 11260 |
|
}, |
|
{ |
|
"epoch": 9.493767431270989, |
|
"grad_norm": 0.210150346159935, |
|
"learning_rate": 0.00012178217821782179, |
|
"loss": 1.2308, |
|
"step": 11270 |
|
}, |
|
{ |
|
"epoch": 9.502191359781433, |
|
"grad_norm": 0.1982164978981018, |
|
"learning_rate": 0.00012079207920792079, |
|
"loss": 1.2305, |
|
"step": 11280 |
|
}, |
|
{ |
|
"epoch": 9.510615288291877, |
|
"grad_norm": 0.2049965262413025, |
|
"learning_rate": 0.0001198019801980198, |
|
"loss": 1.2334, |
|
"step": 11290 |
|
}, |
|
{ |
|
"epoch": 9.519039216802323, |
|
"grad_norm": 0.18243108689785004, |
|
"learning_rate": 0.00011881188118811881, |
|
"loss": 1.2335, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 9.527463145312767, |
|
"grad_norm": 0.2009328156709671, |
|
"learning_rate": 0.00011782178217821782, |
|
"loss": 1.2313, |
|
"step": 11310 |
|
}, |
|
{ |
|
"epoch": 9.535887073823211, |
|
"grad_norm": 0.19226033985614777, |
|
"learning_rate": 0.00011683168316831685, |
|
"loss": 1.2332, |
|
"step": 11320 |
|
}, |
|
{ |
|
"epoch": 9.544311002333655, |
|
"grad_norm": 0.20206843316555023, |
|
"learning_rate": 0.00011584158415841584, |
|
"loss": 1.2333, |
|
"step": 11330 |
|
}, |
|
{ |
|
"epoch": 9.5527349308441, |
|
"grad_norm": 0.20852382481098175, |
|
"learning_rate": 0.00011485148514851486, |
|
"loss": 1.2322, |
|
"step": 11340 |
|
}, |
|
{ |
|
"epoch": 9.5527349308441, |
|
"eval_accuracy": 0.7448142064493213, |
|
"eval_loss": 1.1182734966278076, |
|
"eval_runtime": 889.106, |
|
"eval_samples_per_second": 561.661, |
|
"eval_steps_per_second": 5.201, |
|
"step": 11340 |
|
}, |
|
{ |
|
"epoch": 9.561158859354546, |
|
"grad_norm": 0.19330884516239166, |
|
"learning_rate": 0.00011386138613861387, |
|
"loss": 1.2294, |
|
"step": 11350 |
|
}, |
|
{ |
|
"epoch": 9.56958278786499, |
|
"grad_norm": 0.17878125607967377, |
|
"learning_rate": 0.00011287128712871288, |
|
"loss": 1.2301, |
|
"step": 11360 |
|
}, |
|
{ |
|
"epoch": 9.578006716375434, |
|
"grad_norm": 0.20679515600204468, |
|
"learning_rate": 0.00011188118811881189, |
|
"loss": 1.2302, |
|
"step": 11370 |
|
}, |
|
{ |
|
"epoch": 9.586430644885878, |
|
"grad_norm": 0.20949432253837585, |
|
"learning_rate": 0.00011089108910891089, |
|
"loss": 1.2308, |
|
"step": 11380 |
|
}, |
|
{ |
|
"epoch": 9.594854573396322, |
|
"grad_norm": 0.21771377325057983, |
|
"learning_rate": 0.0001099009900990099, |
|
"loss": 1.2313, |
|
"step": 11390 |
|
}, |
|
{ |
|
"epoch": 9.603278501906768, |
|
"grad_norm": 0.1953546106815338, |
|
"learning_rate": 0.00010891089108910891, |
|
"loss": 1.2305, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 9.611702430417212, |
|
"grad_norm": 0.20105966925621033, |
|
"learning_rate": 0.00010792079207920792, |
|
"loss": 1.2294, |
|
"step": 11410 |
|
}, |
|
{ |
|
"epoch": 9.620126358927656, |
|
"grad_norm": 0.20625823736190796, |
|
"learning_rate": 0.00010693069306930694, |
|
"loss": 1.2287, |
|
"step": 11420 |
|
}, |
|
{ |
|
"epoch": 9.6285502874381, |
|
"grad_norm": 0.2024402767419815, |
|
"learning_rate": 0.00010594059405940593, |
|
"loss": 1.2309, |
|
"step": 11430 |
|
}, |
|
{ |
|
"epoch": 9.6285502874381, |
|
"eval_accuracy": 0.7450274546722492, |
|
"eval_loss": 1.1177880764007568, |
|
"eval_runtime": 889.3816, |
|
"eval_samples_per_second": 561.487, |
|
"eval_steps_per_second": 5.199, |
|
"step": 11430 |
|
}, |
|
{ |
|
"epoch": 9.636974215948547, |
|
"grad_norm": 0.20498992502689362, |
|
"learning_rate": 0.00010495049504950495, |
|
"loss": 1.228, |
|
"step": 11440 |
|
}, |
|
{ |
|
"epoch": 9.64539814445899, |
|
"grad_norm": 0.18760576844215393, |
|
"learning_rate": 0.00010396039603960396, |
|
"loss": 1.2287, |
|
"step": 11450 |
|
}, |
|
{ |
|
"epoch": 9.653822072969435, |
|
"grad_norm": 0.2059292048215866, |
|
"learning_rate": 0.00010297029702970297, |
|
"loss": 1.2284, |
|
"step": 11460 |
|
}, |
|
{ |
|
"epoch": 9.662246001479879, |
|
"grad_norm": 0.20898665487766266, |
|
"learning_rate": 0.00010198019801980198, |
|
"loss": 1.231, |
|
"step": 11470 |
|
}, |
|
{ |
|
"epoch": 9.670669929990323, |
|
"grad_norm": 0.20303255319595337, |
|
"learning_rate": 0.00010099009900990099, |
|
"loss": 1.2302, |
|
"step": 11480 |
|
}, |
|
{ |
|
"epoch": 9.679093858500769, |
|
"grad_norm": 0.20947200059890747, |
|
"learning_rate": 0.0001, |
|
"loss": 1.2314, |
|
"step": 11490 |
|
}, |
|
{ |
|
"epoch": 9.687517787011213, |
|
"grad_norm": 0.20898771286010742, |
|
"learning_rate": 9.900990099009902e-05, |
|
"loss": 1.2294, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 9.695941715521657, |
|
"grad_norm": 0.18466849625110626, |
|
"learning_rate": 9.801980198019803e-05, |
|
"loss": 1.2309, |
|
"step": 11510 |
|
}, |
|
{ |
|
"epoch": 9.704365644032102, |
|
"grad_norm": 0.1769760698080063, |
|
"learning_rate": 9.702970297029704e-05, |
|
"loss": 1.2282, |
|
"step": 11520 |
|
}, |
|
{ |
|
"epoch": 9.704365644032102, |
|
"eval_accuracy": 0.7449189101862153, |
|
"eval_loss": 1.118354082107544, |
|
"eval_runtime": 879.3937, |
|
"eval_samples_per_second": 567.864, |
|
"eval_steps_per_second": 5.258, |
|
"step": 11520 |
|
}, |
|
{ |
|
"epoch": 9.712789572542546, |
|
"grad_norm": 0.18270480632781982, |
|
"learning_rate": 9.603960396039604e-05, |
|
"loss": 1.2286, |
|
"step": 11530 |
|
}, |
|
{ |
|
"epoch": 9.721213501052992, |
|
"grad_norm": 0.1812662035226822, |
|
"learning_rate": 9.504950495049505e-05, |
|
"loss": 1.2279, |
|
"step": 11540 |
|
}, |
|
{ |
|
"epoch": 9.729637429563436, |
|
"grad_norm": 0.20632152259349823, |
|
"learning_rate": 9.405940594059406e-05, |
|
"loss": 1.2295, |
|
"step": 11550 |
|
}, |
|
{ |
|
"epoch": 9.73806135807388, |
|
"grad_norm": 0.19512777030467987, |
|
"learning_rate": 9.306930693069307e-05, |
|
"loss": 1.2292, |
|
"step": 11560 |
|
}, |
|
{ |
|
"epoch": 9.746485286584324, |
|
"grad_norm": 0.19665522873401642, |
|
"learning_rate": 9.207920792079209e-05, |
|
"loss": 1.2294, |
|
"step": 11570 |
|
}, |
|
{ |
|
"epoch": 9.75490921509477, |
|
"grad_norm": 0.18540680408477783, |
|
"learning_rate": 9.108910891089108e-05, |
|
"loss": 1.2297, |
|
"step": 11580 |
|
}, |
|
{ |
|
"epoch": 9.763333143605214, |
|
"grad_norm": 0.21472424268722534, |
|
"learning_rate": 9.00990099009901e-05, |
|
"loss": 1.2277, |
|
"step": 11590 |
|
}, |
|
{ |
|
"epoch": 9.771757072115658, |
|
"grad_norm": 0.2189822793006897, |
|
"learning_rate": 8.91089108910891e-05, |
|
"loss": 1.2293, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 9.780181000626103, |
|
"grad_norm": 0.19983939826488495, |
|
"learning_rate": 8.811881188118812e-05, |
|
"loss": 1.2287, |
|
"step": 11610 |
|
}, |
|
{ |
|
"epoch": 9.780181000626103, |
|
"eval_accuracy": 0.7452771934107217, |
|
"eval_loss": 1.1166530847549438, |
|
"eval_runtime": 886.9822, |
|
"eval_samples_per_second": 563.006, |
|
"eval_steps_per_second": 5.213, |
|
"step": 11610 |
|
}, |
|
{ |
|
"epoch": 9.788604929136547, |
|
"grad_norm": 0.1868014931678772, |
|
"learning_rate": 8.712871287128713e-05, |
|
"loss": 1.2296, |
|
"step": 11620 |
|
}, |
|
{ |
|
"epoch": 9.797028857646993, |
|
"grad_norm": 0.2048911601305008, |
|
"learning_rate": 8.613861386138613e-05, |
|
"loss": 1.2291, |
|
"step": 11630 |
|
}, |
|
{ |
|
"epoch": 9.805452786157437, |
|
"grad_norm": 0.2088802009820938, |
|
"learning_rate": 8.514851485148515e-05, |
|
"loss": 1.2271, |
|
"step": 11640 |
|
}, |
|
{ |
|
"epoch": 9.813876714667881, |
|
"grad_norm": 0.20058122277259827, |
|
"learning_rate": 8.415841584158417e-05, |
|
"loss": 1.2296, |
|
"step": 11650 |
|
}, |
|
{ |
|
"epoch": 9.822300643178325, |
|
"grad_norm": 0.1964656561613083, |
|
"learning_rate": 8.316831683168318e-05, |
|
"loss": 1.2272, |
|
"step": 11660 |
|
}, |
|
{ |
|
"epoch": 9.83072457168877, |
|
"grad_norm": 0.20214231312274933, |
|
"learning_rate": 8.217821782178219e-05, |
|
"loss": 1.2271, |
|
"step": 11670 |
|
}, |
|
{ |
|
"epoch": 9.839148500199215, |
|
"grad_norm": 0.19427910447120667, |
|
"learning_rate": 8.118811881188119e-05, |
|
"loss": 1.2264, |
|
"step": 11680 |
|
}, |
|
{ |
|
"epoch": 9.84757242870966, |
|
"grad_norm": 0.18842646479606628, |
|
"learning_rate": 8.01980198019802e-05, |
|
"loss": 1.2265, |
|
"step": 11690 |
|
}, |
|
{ |
|
"epoch": 9.855996357220103, |
|
"grad_norm": 0.18588952720165253, |
|
"learning_rate": 7.920792079207921e-05, |
|
"loss": 1.2279, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 9.855996357220103, |
|
"eval_accuracy": 0.7454476541387279, |
|
"eval_loss": 1.1153885126113892, |
|
"eval_runtime": 879.2745, |
|
"eval_samples_per_second": 567.941, |
|
"eval_steps_per_second": 5.259, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 9.864420285730548, |
|
"grad_norm": 0.18300525844097137, |
|
"learning_rate": 7.821782178217822e-05, |
|
"loss": 1.2268, |
|
"step": 11710 |
|
}, |
|
{ |
|
"epoch": 9.872844214240992, |
|
"grad_norm": 0.18436813354492188, |
|
"learning_rate": 7.722772277227723e-05, |
|
"loss": 1.2256, |
|
"step": 11720 |
|
}, |
|
{ |
|
"epoch": 9.881268142751438, |
|
"grad_norm": 0.19767363369464874, |
|
"learning_rate": 7.623762376237623e-05, |
|
"loss": 1.2246, |
|
"step": 11730 |
|
}, |
|
{ |
|
"epoch": 9.889692071261882, |
|
"grad_norm": 0.1749766319990158, |
|
"learning_rate": 7.524752475247524e-05, |
|
"loss": 1.2277, |
|
"step": 11740 |
|
}, |
|
{ |
|
"epoch": 9.898115999772326, |
|
"grad_norm": 0.17161355912685394, |
|
"learning_rate": 7.425742574257426e-05, |
|
"loss": 1.2262, |
|
"step": 11750 |
|
}, |
|
{ |
|
"epoch": 9.90653992828277, |
|
"grad_norm": 0.190937340259552, |
|
"learning_rate": 7.326732673267327e-05, |
|
"loss": 1.2276, |
|
"step": 11760 |
|
}, |
|
{ |
|
"epoch": 9.914963856793216, |
|
"grad_norm": 0.18256962299346924, |
|
"learning_rate": 7.227722772277228e-05, |
|
"loss": 1.2274, |
|
"step": 11770 |
|
}, |
|
{ |
|
"epoch": 9.92338778530366, |
|
"grad_norm": 0.1912631094455719, |
|
"learning_rate": 7.128712871287128e-05, |
|
"loss": 1.2243, |
|
"step": 11780 |
|
}, |
|
{ |
|
"epoch": 9.931811713814104, |
|
"grad_norm": 0.19331537187099457, |
|
"learning_rate": 7.02970297029703e-05, |
|
"loss": 1.2261, |
|
"step": 11790 |
|
}, |
|
{ |
|
"epoch": 9.931811713814104, |
|
"eval_accuracy": 0.7455543705350357, |
|
"eval_loss": 1.115136981010437, |
|
"eval_runtime": 887.3277, |
|
"eval_samples_per_second": 562.786, |
|
"eval_steps_per_second": 5.211, |
|
"step": 11790 |
|
}, |
|
{ |
|
"epoch": 9.940235642324549, |
|
"grad_norm": 0.17607170343399048, |
|
"learning_rate": 6.930693069306931e-05, |
|
"loss": 1.228, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 9.948659570834993, |
|
"grad_norm": 0.17280788719654083, |
|
"learning_rate": 6.831683168316833e-05, |
|
"loss": 1.2269, |
|
"step": 11810 |
|
}, |
|
{ |
|
"epoch": 9.957083499345439, |
|
"grad_norm": 0.19290916621685028, |
|
"learning_rate": 6.732673267326734e-05, |
|
"loss": 1.2279, |
|
"step": 11820 |
|
}, |
|
{ |
|
"epoch": 9.965507427855883, |
|
"grad_norm": 0.19125664234161377, |
|
"learning_rate": 6.633663366336634e-05, |
|
"loss": 1.227, |
|
"step": 11830 |
|
}, |
|
{ |
|
"epoch": 9.973931356366327, |
|
"grad_norm": 0.18251217901706696, |
|
"learning_rate": 6.534653465346535e-05, |
|
"loss": 1.2254, |
|
"step": 11840 |
|
}, |
|
{ |
|
"epoch": 9.982355284876771, |
|
"grad_norm": 0.19647039473056793, |
|
"learning_rate": 6.435643564356436e-05, |
|
"loss": 1.2261, |
|
"step": 11850 |
|
}, |
|
{ |
|
"epoch": 9.990779213387215, |
|
"grad_norm": 0.17714038491249084, |
|
"learning_rate": 6.336633663366337e-05, |
|
"loss": 1.2276, |
|
"step": 11860 |
|
}, |
|
{ |
|
"epoch": 9.999203141897661, |
|
"grad_norm": 0.18365037441253662, |
|
"learning_rate": 6.237623762376238e-05, |
|
"loss": 1.2261, |
|
"step": 11870 |
|
}, |
|
{ |
|
"epoch": 10.007627070408105, |
|
"grad_norm": 0.1910678595304489, |
|
"learning_rate": 6.138613861386138e-05, |
|
"loss": 1.2244, |
|
"step": 11880 |
|
}, |
|
{ |
|
"epoch": 10.007627070408105, |
|
"eval_accuracy": 0.7456593741030724, |
|
"eval_loss": 1.1154232025146484, |
|
"eval_runtime": 887.0764, |
|
"eval_samples_per_second": 562.946, |
|
"eval_steps_per_second": 5.213, |
|
"step": 11880 |
|
}, |
|
    {
      "epoch": 10.01605099891855,
      "grad_norm": 0.18324702978134155,
      "learning_rate": 6.039603960396039e-05,
      "loss": 1.2267,
      "step": 11890
    },
    {
      "epoch": 10.024474927428994,
      "grad_norm": 0.1686498522758484,
      "learning_rate": 5.9405940594059404e-05,
      "loss": 1.2242,
      "step": 11900
    },
    {
      "epoch": 10.03289885593944,
      "grad_norm": 0.17256265878677368,
      "learning_rate": 5.841584158415842e-05,
      "loss": 1.2239,
      "step": 11910
    },
    {
      "epoch": 10.041322784449884,
      "grad_norm": 0.19624483585357666,
      "learning_rate": 5.742574257425743e-05,
      "loss": 1.2258,
      "step": 11920
    },
    {
      "epoch": 10.049746712960328,
      "grad_norm": 0.17262500524520874,
      "learning_rate": 5.643564356435644e-05,
      "loss": 1.2258,
      "step": 11930
    },
    {
      "epoch": 10.058170641470772,
      "grad_norm": 0.1741054356098175,
      "learning_rate": 5.5445544554455445e-05,
      "loss": 1.2245,
      "step": 11940
    },
    {
      "epoch": 10.066594569981216,
      "grad_norm": 0.17313139140605927,
      "learning_rate": 5.4455445544554456e-05,
      "loss": 1.2256,
      "step": 11950
    },
    {
      "epoch": 10.075018498491662,
      "grad_norm": 0.18322905898094177,
      "learning_rate": 5.346534653465347e-05,
      "loss": 1.2243,
      "step": 11960
    },
    {
      "epoch": 10.083442427002106,
      "grad_norm": 0.18261946737766266,
      "learning_rate": 5.247524752475247e-05,
      "loss": 1.2252,
      "step": 11970
    },
    {
      "epoch": 10.083442427002106,
      "eval_accuracy": 0.7457714664313748,
      "eval_loss": 1.1143237352371216,
      "eval_runtime": 887.1041,
      "eval_samples_per_second": 562.928,
      "eval_steps_per_second": 5.212,
      "step": 11970
    },
    {
      "epoch": 10.09186635551255,
      "grad_norm": 0.1877572238445282,
      "learning_rate": 5.1485148514851485e-05,
      "loss": 1.2249,
      "step": 11980
    },
    {
      "epoch": 10.100290284022995,
      "grad_norm": 0.18356889486312866,
      "learning_rate": 5.0495049504950497e-05,
      "loss": 1.2255,
      "step": 11990
    },
    {
      "epoch": 10.108714212533439,
      "grad_norm": 0.1898818463087082,
      "learning_rate": 4.950495049504951e-05,
      "loss": 1.2241,
      "step": 12000
    },
    {
      "epoch": 10.117138141043885,
      "grad_norm": 0.17149324715137482,
      "learning_rate": 4.851485148514852e-05,
      "loss": 1.2257,
      "step": 12010
    },
    {
      "epoch": 10.125562069554329,
      "grad_norm": 0.16672831773757935,
      "learning_rate": 4.7524752475247525e-05,
      "loss": 1.2255,
      "step": 12020
    },
    {
      "epoch": 10.133985998064773,
      "grad_norm": 0.16820046305656433,
      "learning_rate": 4.653465346534654e-05,
      "loss": 1.225,
      "step": 12030
    },
    {
      "epoch": 10.142409926575217,
      "grad_norm": 0.17770229279994965,
      "learning_rate": 4.554455445544554e-05,
      "loss": 1.227,
      "step": 12040
    },
    {
      "epoch": 10.150833855085661,
      "grad_norm": 0.16082800924777985,
      "learning_rate": 4.455445544554455e-05,
      "loss": 1.2253,
      "step": 12050
    },
    {
      "epoch": 10.159257783596107,
      "grad_norm": 0.1669086515903473,
      "learning_rate": 4.3564356435643565e-05,
      "loss": 1.2241,
      "step": 12060
    },
    {
      "epoch": 10.159257783596107,
      "eval_accuracy": 0.7460534494522424,
      "eval_loss": 1.1121779680252075,
      "eval_runtime": 882.614,
      "eval_samples_per_second": 565.792,
      "eval_steps_per_second": 5.239,
      "step": 12060
    },
    {
      "epoch": 10.167681712106551,
      "grad_norm": 0.17394189536571503,
      "learning_rate": 4.257425742574258e-05,
      "loss": 1.2238,
      "step": 12070
    },
    {
      "epoch": 10.176105640616996,
      "grad_norm": 0.1611398160457611,
      "learning_rate": 4.158415841584159e-05,
      "loss": 1.2243,
      "step": 12080
    },
    {
      "epoch": 10.18452956912744,
      "grad_norm": 0.16469168663024902,
      "learning_rate": 4.0594059405940594e-05,
      "loss": 1.2232,
      "step": 12090
    },
    {
      "epoch": 10.192953497637886,
      "grad_norm": 0.1700202375650406,
      "learning_rate": 3.9603960396039605e-05,
      "loss": 1.2243,
      "step": 12100
    },
    {
      "epoch": 10.20137742614833,
      "grad_norm": 0.16961273550987244,
      "learning_rate": 3.861386138613862e-05,
      "loss": 1.2244,
      "step": 12110
    },
    {
      "epoch": 10.209801354658774,
      "grad_norm": 0.18176864087581635,
      "learning_rate": 3.762376237623762e-05,
      "loss": 1.2234,
      "step": 12120
    },
    {
      "epoch": 10.218225283169218,
      "grad_norm": 0.17132678627967834,
      "learning_rate": 3.6633663366336634e-05,
      "loss": 1.2231,
      "step": 12130
    },
    {
      "epoch": 10.226649211679662,
      "grad_norm": 0.1708788424730301,
      "learning_rate": 3.564356435643564e-05,
      "loss": 1.2228,
      "step": 12140
    },
    {
      "epoch": 10.235073140190108,
      "grad_norm": 0.16924616694450378,
      "learning_rate": 3.465346534653466e-05,
      "loss": 1.2241,
      "step": 12150
    },
    {
      "epoch": 10.235073140190108,
      "eval_accuracy": 0.7462807420235112,
      "eval_loss": 1.1115893125534058,
      "eval_runtime": 893.1249,
      "eval_samples_per_second": 559.133,
      "eval_steps_per_second": 5.177,
      "step": 12150
    },
    {
      "epoch": 10.243497068700552,
      "grad_norm": 0.1617705076932907,
      "learning_rate": 3.366336633663367e-05,
      "loss": 1.2239,
      "step": 12160
    },
    {
      "epoch": 10.251920997210997,
      "grad_norm": 0.17731362581253052,
      "learning_rate": 3.2673267326732674e-05,
      "loss": 1.2232,
      "step": 12170
    },
    {
      "epoch": 10.26034492572144,
      "grad_norm": 0.17324230074882507,
      "learning_rate": 3.1683168316831686e-05,
      "loss": 1.224,
      "step": 12180
    },
    {
      "epoch": 10.268768854231885,
      "grad_norm": 0.15266722440719604,
      "learning_rate": 3.069306930693069e-05,
      "loss": 1.224,
      "step": 12190
    },
    {
      "epoch": 10.27719278274233,
      "grad_norm": 0.1547342985868454,
      "learning_rate": 2.9702970297029702e-05,
      "loss": 1.2232,
      "step": 12200
    },
    {
      "epoch": 10.285616711252775,
      "grad_norm": 0.15873835980892181,
      "learning_rate": 2.8712871287128714e-05,
      "loss": 1.2221,
      "step": 12210
    },
    {
      "epoch": 10.29404063976322,
      "grad_norm": 0.15968631207942963,
      "learning_rate": 2.7722772277227722e-05,
      "loss": 1.223,
      "step": 12220
    },
    {
      "epoch": 10.302464568273663,
      "grad_norm": 0.15929782390594482,
      "learning_rate": 2.6732673267326734e-05,
      "loss": 1.2242,
      "step": 12230
    },
    {
      "epoch": 10.31088849678411,
      "grad_norm": 0.1512889713048935,
      "learning_rate": 2.5742574257425742e-05,
      "loss": 1.2223,
      "step": 12240
    },
    {
      "epoch": 10.31088849678411,
      "eval_accuracy": 0.7462616988558893,
      "eval_loss": 1.1114362478256226,
      "eval_runtime": 886.8923,
      "eval_samples_per_second": 563.063,
      "eval_steps_per_second": 5.214,
      "step": 12240
    },
    {
      "epoch": 10.319312425294553,
      "grad_norm": 0.15943297743797302,
      "learning_rate": 2.4752475247524754e-05,
      "loss": 1.2224,
      "step": 12250
    },
    {
      "epoch": 10.327736353804998,
      "grad_norm": 0.16134706139564514,
      "learning_rate": 2.3762376237623762e-05,
      "loss": 1.2218,
      "step": 12260
    },
    {
      "epoch": 10.336160282315442,
      "grad_norm": 0.15525278449058533,
      "learning_rate": 2.277227722772277e-05,
      "loss": 1.2237,
      "step": 12270
    },
    {
      "epoch": 10.344584210825886,
      "grad_norm": 0.1626599282026291,
      "learning_rate": 2.1782178217821783e-05,
      "loss": 1.2228,
      "step": 12280
    },
    {
      "epoch": 10.353008139336332,
      "grad_norm": 0.1533862203359604,
      "learning_rate": 2.0792079207920794e-05,
      "loss": 1.221,
      "step": 12290
    },
    {
      "epoch": 10.361432067846776,
      "grad_norm": 0.14988014101982117,
      "learning_rate": 1.9801980198019803e-05,
      "loss": 1.2238,
      "step": 12300
    },
    {
      "epoch": 10.36985599635722,
      "grad_norm": 0.15282054245471954,
      "learning_rate": 1.881188118811881e-05,
      "loss": 1.2202,
      "step": 12310
    },
    {
      "epoch": 10.378279924867664,
      "grad_norm": 0.1532844454050064,
      "learning_rate": 1.782178217821782e-05,
      "loss": 1.2222,
      "step": 12320
    },
    {
      "epoch": 10.386703853378108,
      "grad_norm": 0.15041793882846832,
      "learning_rate": 1.6831683168316834e-05,
      "loss": 1.2233,
      "step": 12330
    },
    {
      "epoch": 10.386703853378108,
      "eval_accuracy": 0.7464784909349403,
      "eval_loss": 1.1103906631469727,
      "eval_runtime": 893.2259,
      "eval_samples_per_second": 559.07,
      "eval_steps_per_second": 5.177,
      "step": 12330
    }
  ],
  "logging_steps": 10,
  "max_steps": 12500,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 11,
  "save_steps": 90,
  "total_flos": 3.205415169974477e+18,
  "train_batch_size": 108,
  "trial_name": null,
  "trial_params": null
}