diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,9885 @@ +{ + "best_metric": 1.1103906631469727, + "best_model_checkpoint": "/home/wani/Desktop/roberta-pretrain/ckpt/roberta/pretrain/medium/256/checkpoint-12330", + "epoch": 10.386703853378108, + "eval_steps": 90, + "global_step": 12330, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008423928510444533, + "grad_norm": 5.073121070861816, + "learning_rate": 4.166666666666667e-06, + "loss": 7.2395, + "step": 10 + }, + { + "epoch": 0.016847857020889066, + "grad_norm": 4.587955474853516, + "learning_rate": 8.333333333333334e-06, + "loss": 7.0836, + "step": 20 + }, + { + "epoch": 0.0252717855313336, + "grad_norm": 3.8589327335357666, + "learning_rate": 1.25e-05, + "loss": 6.8156, + "step": 30 + }, + { + "epoch": 0.03369571404177813, + "grad_norm": 3.4427683353424072, + "learning_rate": 1.6666666666666667e-05, + "loss": 6.5549, + "step": 40 + }, + { + "epoch": 0.04211964255222266, + "grad_norm": 3.109060525894165, + "learning_rate": 2.0833333333333333e-05, + "loss": 6.3522, + "step": 50 + }, + { + "epoch": 0.0505435710626672, + "grad_norm": 2.86232590675354, + "learning_rate": 2.5e-05, + "loss": 6.1983, + "step": 60 + }, + { + "epoch": 0.05896749957311173, + "grad_norm": 2.6880924701690674, + "learning_rate": 2.9166666666666666e-05, + "loss": 6.0796, + "step": 70 + }, + { + "epoch": 0.06739142808355626, + "grad_norm": 2.490527629852295, + "learning_rate": 3.3333333333333335e-05, + "loss": 5.9754, + "step": 80 + }, + { + "epoch": 0.0758153565940008, + "grad_norm": 2.3156356811523438, + "learning_rate": 3.75e-05, + "loss": 5.8736, + "step": 90 + }, + { + "epoch": 0.0758153565940008, + "eval_accuracy": 0.22415329938580753, + "eval_loss": 5.8054423332214355, + "eval_runtime": 910.9652, + "eval_samples_per_second": 548.183, + "eval_steps_per_second": 5.076, + "step": 90 + }, + { + "epoch": 0.08423928510444532, + "grad_norm": 2.1557302474975586, + "learning_rate": 4.1666666666666665e-05, + "loss": 5.7691, + "step": 100 + }, + { + "epoch": 0.09266321361488987, + "grad_norm": 1.9360383749008179, + "learning_rate": 4.5833333333333334e-05, + "loss": 5.6653, + "step": 110 + }, + { + "epoch": 0.1010871421253344, + "grad_norm": 1.731399655342102, + "learning_rate": 5e-05, + "loss": 5.5598, + "step": 120 + }, + { + "epoch": 0.10951107063577893, + "grad_norm": 1.508693814277649, + "learning_rate": 5.416666666666667e-05, + "loss": 5.4574, + "step": 130 + }, + { + "epoch": 0.11793499914622346, + "grad_norm": 1.2835007905960083, + "learning_rate": 5.833333333333333e-05, + "loss": 5.3585, + "step": 140 + }, + { + "epoch": 0.126358927656668, + "grad_norm": 1.0747231245040894, + "learning_rate": 6.25e-05, + "loss": 5.2667, + "step": 150 + }, + { + "epoch": 0.13478285616711252, + "grad_norm": 0.852271318435669, + "learning_rate": 6.666666666666667e-05, + "loss": 5.1779, + "step": 160 + }, + { + "epoch": 0.14320678467755707, + "grad_norm": 0.7001814842224121, + "learning_rate": 7.083333333333334e-05, + "loss": 5.0965, + "step": 170 + }, + { + "epoch": 0.1516307131880016, + "grad_norm": 0.5657457709312439, + "learning_rate": 7.5e-05, + "loss": 5.0237, + "step": 180 + }, + { + "epoch": 0.1516307131880016, + "eval_accuracy": 0.23888299376264316, + "eval_loss": 4.981535911560059, + "eval_runtime": 882.341, + "eval_samples_per_second": 565.967, + "eval_steps_per_second": 5.241, + "step": 180 + }, + { + "epoch": 0.16005464169844613, + "grad_norm": 0.4981703758239746, + "learning_rate": 7.916666666666666e-05, + "loss": 4.9662, + "step": 190 + }, + { + "epoch": 0.16847857020889065, + "grad_norm": 0.40254291892051697, + "learning_rate": 8.333333333333333e-05, + "loss": 4.9195, + "step": 200 + }, + { + "epoch": 0.1769024987193352, + "grad_norm": 0.32726043462753296, + "learning_rate": 8.75e-05, + "loss": 4.8766, + "step": 210 + }, + { + "epoch": 0.18532642722977974, + "grad_norm": 0.2471727877855301, + "learning_rate": 9.166666666666667e-05, + "loss": 4.8458, + "step": 220 + }, + { + "epoch": 0.19375035574022426, + "grad_norm": 0.2568261921405792, + "learning_rate": 9.583333333333334e-05, + "loss": 4.8169, + "step": 230 + }, + { + "epoch": 0.2021742842506688, + "grad_norm": 0.19310955703258514, + "learning_rate": 0.0001, + "loss": 4.7926, + "step": 240 + }, + { + "epoch": 0.21059821276111332, + "grad_norm": 0.20584674179553986, + "learning_rate": 0.00010416666666666667, + "loss": 4.7714, + "step": 250 + }, + { + "epoch": 0.21902214127155786, + "grad_norm": 0.26360729336738586, + "learning_rate": 0.00010833333333333334, + "loss": 4.7511, + "step": 260 + }, + { + "epoch": 0.22744606978200238, + "grad_norm": 0.1681978851556778, + "learning_rate": 0.00011250000000000001, + "loss": 4.7309, + "step": 270 + }, + { + "epoch": 0.22744606978200238, + "eval_accuracy": 0.28488370423336357, + "eval_loss": 4.706047534942627, + "eval_runtime": 889.3977, + "eval_samples_per_second": 561.477, + "eval_steps_per_second": 5.199, + "step": 270 + }, + { + "epoch": 0.23586999829244693, + "grad_norm": 0.17959143221378326, + "learning_rate": 0.00011666666666666667, + "loss": 4.7148, + "step": 280 + }, + { + "epoch": 0.24429392680289147, + "grad_norm": 0.27109047770500183, + "learning_rate": 0.00012083333333333333, + "loss": 4.6989, + "step": 290 + }, + { + "epoch": 0.252717855313336, + "grad_norm": 0.2674080431461334, + "learning_rate": 0.000125, + "loss": 4.6826, + "step": 300 + }, + { + "epoch": 0.2611417838237805, + "grad_norm": 0.24386395514011383, + "learning_rate": 0.00012916666666666667, + "loss": 4.6707, + "step": 310 + }, + { + "epoch": 0.26956571233422505, + "grad_norm": 0.5274083614349365, + "learning_rate": 0.00013333333333333334, + "loss": 4.6553, + "step": 320 + }, + { + "epoch": 0.2779896408446696, + "grad_norm": 0.4005141258239746, + "learning_rate": 0.0001375, + "loss": 4.6446, + "step": 330 + }, + { + "epoch": 0.28641356935511414, + "grad_norm": 0.3732853829860687, + "learning_rate": 0.00014166666666666668, + "loss": 4.6315, + "step": 340 + }, + { + "epoch": 0.29483749786555863, + "grad_norm": 0.2742752730846405, + "learning_rate": 0.00014583333333333335, + "loss": 4.6221, + "step": 350 + }, + { + "epoch": 0.3032614263760032, + "grad_norm": 0.20482462644577026, + "learning_rate": 0.00015, + "loss": 4.6138, + "step": 360 + }, + { + "epoch": 0.3032614263760032, + "eval_accuracy": 0.28836420126551926, + "eval_loss": 4.5933918952941895, + "eval_runtime": 880.4452, + "eval_samples_per_second": 567.186, + "eval_steps_per_second": 5.252, + "step": 360 + }, + { + "epoch": 0.3116853548864477, + "grad_norm": 0.26613757014274597, + "learning_rate": 0.00015416666666666668, + "loss": 4.5983, + "step": 370 + }, + { + "epoch": 0.32010928339689226, + "grad_norm": 0.20205098390579224, + "learning_rate": 0.00015833333333333332, + "loss": 4.5922, + "step": 380 + }, + { + "epoch": 0.3285332119073368, + "grad_norm": 0.5084218978881836, + "learning_rate": 0.00016250000000000002, + "loss": 4.5826, + "step": 390 + }, + { + "epoch": 0.3369571404177813, + "grad_norm": 0.2835780084133148, + "learning_rate": 0.00016666666666666666, + "loss": 4.5771, + "step": 400 + }, + { + "epoch": 0.34538106892822584, + "grad_norm": 0.23976200819015503, + "learning_rate": 0.00017083333333333333, + "loss": 4.5726, + "step": 410 + }, + { + "epoch": 0.3538049974386704, + "grad_norm": 0.2275087982416153, + "learning_rate": 0.000175, + "loss": 4.5666, + "step": 420 + }, + { + "epoch": 0.36222892594911493, + "grad_norm": 0.27758899331092834, + "learning_rate": 0.00017916666666666667, + "loss": 4.5654, + "step": 430 + }, + { + "epoch": 0.3706528544595595, + "grad_norm": 0.18581350147724152, + "learning_rate": 0.00018333333333333334, + "loss": 4.5593, + "step": 440 + }, + { + "epoch": 0.37907678297000397, + "grad_norm": 0.1667676419019699, + "learning_rate": 0.0001875, + "loss": 4.5538, + "step": 450 + }, + { + "epoch": 0.37907678297000397, + "eval_accuracy": 0.28966679521500804, + "eval_loss": 4.547606468200684, + "eval_runtime": 890.3979, + "eval_samples_per_second": 560.846, + "eval_steps_per_second": 5.193, + "step": 450 + }, + { + "epoch": 0.3875007114804485, + "grad_norm": 0.32489290833473206, + "learning_rate": 0.00019166666666666667, + "loss": 4.5532, + "step": 460 + }, + { + "epoch": 0.39592463999089306, + "grad_norm": 0.7000045776367188, + "learning_rate": 0.00019583333333333334, + "loss": 4.5484, + "step": 470 + }, + { + "epoch": 0.4043485685013376, + "grad_norm": 0.43668240308761597, + "learning_rate": 0.0002, + "loss": 4.5489, + "step": 480 + }, + { + "epoch": 0.4127724970117821, + "grad_norm": 0.36716368794441223, + "learning_rate": 0.00020416666666666668, + "loss": 4.5459, + "step": 490 + }, + { + "epoch": 0.42119642552222664, + "grad_norm": 0.30332931876182556, + "learning_rate": 0.00020833333333333335, + "loss": 4.5418, + "step": 500 + }, + { + "epoch": 0.4296203540326712, + "grad_norm": 0.5920347571372986, + "learning_rate": 0.0002125, + "loss": 4.5406, + "step": 510 + }, + { + "epoch": 0.4380442825431157, + "grad_norm": 0.45020386576652527, + "learning_rate": 0.00021666666666666668, + "loss": 4.5372, + "step": 520 + }, + { + "epoch": 0.44646821105356027, + "grad_norm": 0.33357909321784973, + "learning_rate": 0.00022083333333333333, + "loss": 4.5367, + "step": 530 + }, + { + "epoch": 0.45489213956400476, + "grad_norm": 0.45888572931289673, + "learning_rate": 0.00022500000000000002, + "loss": 4.5344, + "step": 540 + }, + { + "epoch": 0.45489213956400476, + "eval_accuracy": 0.2902362393111046, + "eval_loss": 4.531790256500244, + "eval_runtime": 882.2427, + "eval_samples_per_second": 566.03, + "eval_steps_per_second": 5.241, + "step": 540 + }, + { + "epoch": 0.4633160680744493, + "grad_norm": 0.4458440840244293, + "learning_rate": 0.00022916666666666666, + "loss": 4.5328, + "step": 550 + }, + { + "epoch": 0.47173999658489385, + "grad_norm": 0.1917838305234909, + "learning_rate": 0.00023333333333333333, + "loss": 4.5296, + "step": 560 + }, + { + "epoch": 0.4801639250953384, + "grad_norm": 0.8310424089431763, + "learning_rate": 0.0002375, + "loss": 4.5275, + "step": 570 + }, + { + "epoch": 0.48858785360578294, + "grad_norm": 0.4216615855693817, + "learning_rate": 0.00024166666666666667, + "loss": 4.531, + "step": 580 + }, + { + "epoch": 0.49701178211622743, + "grad_norm": 0.2320231944322586, + "learning_rate": 0.0002458333333333333, + "loss": 4.5276, + "step": 590 + }, + { + "epoch": 0.505435710626672, + "grad_norm": 0.3115006983280182, + "learning_rate": 0.00025, + "loss": 4.5252, + "step": 600 + }, + { + "epoch": 0.5138596391371165, + "grad_norm": 0.13032270967960358, + "learning_rate": 0.00025416666666666665, + "loss": 4.5227, + "step": 610 + }, + { + "epoch": 0.522283567647561, + "grad_norm": 0.5333927273750305, + "learning_rate": 0.00025833333333333334, + "loss": 4.5214, + "step": 620 + }, + { + "epoch": 0.5307074961580056, + "grad_norm": 0.8976441025733948, + "learning_rate": 0.00026250000000000004, + "loss": 4.5218, + "step": 630 + }, + { + "epoch": 0.5307074961580056, + "eval_accuracy": 0.290083406000685, + "eval_loss": 4.522771835327148, + "eval_runtime": 892.1941, + "eval_samples_per_second": 559.717, + "eval_steps_per_second": 5.183, + "step": 630 + }, + { + "epoch": 0.5391314246684501, + "grad_norm": 0.1657322496175766, + "learning_rate": 0.0002666666666666667, + "loss": 4.523, + "step": 640 + }, + { + "epoch": 0.5475553531788947, + "grad_norm": 0.1890048235654831, + "learning_rate": 0.0002708333333333333, + "loss": 4.5185, + "step": 650 + }, + { + "epoch": 0.5559792816893392, + "grad_norm": 0.8254080414772034, + "learning_rate": 0.000275, + "loss": 4.5196, + "step": 660 + }, + { + "epoch": 0.5644032101997837, + "grad_norm": 0.1703944355249405, + "learning_rate": 0.00027916666666666666, + "loss": 4.52, + "step": 670 + }, + { + "epoch": 0.5728271387102283, + "grad_norm": 0.33486783504486084, + "learning_rate": 0.00028333333333333335, + "loss": 4.5139, + "step": 680 + }, + { + "epoch": 0.5812510672206728, + "grad_norm": 0.4759036600589752, + "learning_rate": 0.0002875, + "loss": 4.5158, + "step": 690 + }, + { + "epoch": 0.5896749957311173, + "grad_norm": 0.26314422488212585, + "learning_rate": 0.0002916666666666667, + "loss": 4.5135, + "step": 700 + }, + { + "epoch": 0.5980989242415619, + "grad_norm": 0.39898937940597534, + "learning_rate": 0.00029583333333333333, + "loss": 4.5114, + "step": 710 + }, + { + "epoch": 0.6065228527520063, + "grad_norm": 0.5003794431686401, + "learning_rate": 0.0003, + "loss": 4.5148, + "step": 720 + }, + { + "epoch": 0.6065228527520063, + "eval_accuracy": 0.2903979539286128, + "eval_loss": 4.508981704711914, + "eval_runtime": 878.8487, + "eval_samples_per_second": 568.216, + "eval_steps_per_second": 5.261, + "step": 720 + }, + { + "epoch": 0.614946781262451, + "grad_norm": 0.2276950627565384, + "learning_rate": 0.00030416666666666667, + "loss": 4.5111, + "step": 730 + }, + { + "epoch": 0.6233707097728954, + "grad_norm": 0.21725377440452576, + "learning_rate": 0.00030833333333333337, + "loss": 4.5088, + "step": 740 + }, + { + "epoch": 0.6317946382833399, + "grad_norm": 0.8084585666656494, + "learning_rate": 0.0003125, + "loss": 4.5074, + "step": 750 + }, + { + "epoch": 0.6402185667937845, + "grad_norm": 0.46915069222450256, + "learning_rate": 0.00031666666666666665, + "loss": 4.5072, + "step": 760 + }, + { + "epoch": 0.648642495304229, + "grad_norm": 0.15649260580539703, + "learning_rate": 0.00032083333333333334, + "loss": 4.5039, + "step": 770 + }, + { + "epoch": 0.6570664238146736, + "grad_norm": 0.42916274070739746, + "learning_rate": 0.00032500000000000004, + "loss": 4.5056, + "step": 780 + }, + { + "epoch": 0.6654903523251181, + "grad_norm": 0.287572979927063, + "learning_rate": 0.0003291666666666667, + "loss": 4.5045, + "step": 790 + }, + { + "epoch": 0.6739142808355626, + "grad_norm": 0.6869699358940125, + "learning_rate": 0.0003333333333333333, + "loss": 4.5029, + "step": 800 + }, + { + "epoch": 0.6823382093460072, + "grad_norm": 0.2973476052284241, + "learning_rate": 0.0003375, + "loss": 4.5009, + "step": 810 + }, + { + "epoch": 0.6823382093460072, + "eval_accuracy": 0.29041409279207236, + "eval_loss": 4.497637748718262, + "eval_runtime": 872.3603, + "eval_samples_per_second": 572.442, + "eval_steps_per_second": 5.301, + "step": 810 + }, + { + "epoch": 0.6907621378564517, + "grad_norm": 0.5773557424545288, + "learning_rate": 0.00034166666666666666, + "loss": 4.5024, + "step": 820 + }, + { + "epoch": 0.6991860663668963, + "grad_norm": 0.31921157240867615, + "learning_rate": 0.00034583333333333335, + "loss": 4.5006, + "step": 830 + }, + { + "epoch": 0.7076099948773408, + "grad_norm": 0.4232361912727356, + "learning_rate": 0.00035, + "loss": 4.5001, + "step": 840 + }, + { + "epoch": 0.7160339233877853, + "grad_norm": 0.30865538120269775, + "learning_rate": 0.0003541666666666667, + "loss": 4.4998, + "step": 850 + }, + { + "epoch": 0.7244578518982299, + "grad_norm": 0.6191368699073792, + "learning_rate": 0.00035833333333333333, + "loss": 4.4967, + "step": 860 + }, + { + "epoch": 0.7328817804086744, + "grad_norm": 0.3202773630619049, + "learning_rate": 0.0003625, + "loss": 4.499, + "step": 870 + }, + { + "epoch": 0.741305708919119, + "grad_norm": 0.3090028464794159, + "learning_rate": 0.00036666666666666667, + "loss": 4.4967, + "step": 880 + }, + { + "epoch": 0.7497296374295634, + "grad_norm": 0.9248805046081543, + "learning_rate": 0.00037083333333333337, + "loss": 4.4962, + "step": 890 + }, + { + "epoch": 0.7581535659400079, + "grad_norm": 0.27745822072029114, + "learning_rate": 0.000375, + "loss": 4.4956, + "step": 900 + }, + { + "epoch": 0.7581535659400079, + "eval_accuracy": 0.29047371761644103, + "eval_loss": 4.492140293121338, + "eval_runtime": 888.1144, + "eval_samples_per_second": 562.288, + "eval_steps_per_second": 5.207, + "step": 900 + }, + { + "epoch": 0.7665774944504525, + "grad_norm": 0.2972380518913269, + "learning_rate": 0.00037916666666666665, + "loss": 4.4936, + "step": 910 + }, + { + "epoch": 0.775001422960897, + "grad_norm": 1.4440104961395264, + "learning_rate": 0.00038333333333333334, + "loss": 4.4956, + "step": 920 + }, + { + "epoch": 0.7834253514713415, + "grad_norm": 0.2894129455089569, + "learning_rate": 0.00038750000000000004, + "loss": 4.4961, + "step": 930 + }, + { + "epoch": 0.7918492799817861, + "grad_norm": 0.22757315635681152, + "learning_rate": 0.0003916666666666667, + "loss": 4.495, + "step": 940 + }, + { + "epoch": 0.8002732084922306, + "grad_norm": 0.2084762305021286, + "learning_rate": 0.0003958333333333333, + "loss": 4.4921, + "step": 950 + }, + { + "epoch": 0.8086971370026752, + "grad_norm": 0.4823535084724426, + "learning_rate": 0.0004, + "loss": 4.4928, + "step": 960 + }, + { + "epoch": 0.8171210655131197, + "grad_norm": 0.22939594089984894, + "learning_rate": 0.00040416666666666666, + "loss": 4.4889, + "step": 970 + }, + { + "epoch": 0.8255449940235642, + "grad_norm": 0.4983462989330292, + "learning_rate": 0.00040833333333333336, + "loss": 4.4888, + "step": 980 + }, + { + "epoch": 0.8339689225340088, + "grad_norm": 0.7445792555809021, + "learning_rate": 0.0004125, + "loss": 4.4899, + "step": 990 + }, + { + "epoch": 0.8339689225340088, + "eval_accuracy": 0.2903607895100575, + "eval_loss": 4.490144729614258, + "eval_runtime": 872.9885, + "eval_samples_per_second": 572.03, + "eval_steps_per_second": 5.297, + "step": 990 + }, + { + "epoch": 0.8423928510444533, + "grad_norm": 0.3264559805393219, + "learning_rate": 0.0004166666666666667, + "loss": 4.4879, + "step": 1000 + }, + { + "epoch": 0.8508167795548979, + "grad_norm": 0.5130082964897156, + "learning_rate": 0.00042083333333333333, + "loss": 4.4881, + "step": 1010 + }, + { + "epoch": 0.8592407080653424, + "grad_norm": 0.2776341736316681, + "learning_rate": 0.000425, + "loss": 4.4872, + "step": 1020 + }, + { + "epoch": 0.8676646365757869, + "grad_norm": 0.9157618880271912, + "learning_rate": 0.00042916666666666667, + "loss": 4.4868, + "step": 1030 + }, + { + "epoch": 0.8760885650862315, + "grad_norm": 0.22099615633487701, + "learning_rate": 0.00043333333333333337, + "loss": 4.4877, + "step": 1040 + }, + { + "epoch": 0.8845124935966759, + "grad_norm": 0.2313142567873001, + "learning_rate": 0.0004375, + "loss": 4.4845, + "step": 1050 + }, + { + "epoch": 0.8929364221071205, + "grad_norm": 0.4353635907173157, + "learning_rate": 0.00044166666666666665, + "loss": 4.4888, + "step": 1060 + }, + { + "epoch": 0.901360350617565, + "grad_norm": 0.2390984743833542, + "learning_rate": 0.00044583333333333335, + "loss": 4.4827, + "step": 1070 + }, + { + "epoch": 0.9097842791280095, + "grad_norm": 0.31369632482528687, + "learning_rate": 0.00045000000000000004, + "loss": 4.4832, + "step": 1080 + }, + { + "epoch": 0.9097842791280095, + "eval_accuracy": 0.2904605834264481, + "eval_loss": 4.480494499206543, + "eval_runtime": 880.1337, + "eval_samples_per_second": 567.386, + "eval_steps_per_second": 5.254, + "step": 1080 + }, + { + "epoch": 0.9182082076384541, + "grad_norm": 0.6700971722602844, + "learning_rate": 0.0004541666666666667, + "loss": 4.483, + "step": 1090 + }, + { + "epoch": 0.9266321361488986, + "grad_norm": 0.25950998067855835, + "learning_rate": 0.0004583333333333333, + "loss": 4.4832, + "step": 1100 + }, + { + "epoch": 0.9350560646593432, + "grad_norm": 0.2840316593647003, + "learning_rate": 0.0004625, + "loss": 4.4819, + "step": 1110 + }, + { + "epoch": 0.9434799931697877, + "grad_norm": 0.6859279274940491, + "learning_rate": 0.00046666666666666666, + "loss": 4.4819, + "step": 1120 + }, + { + "epoch": 0.9519039216802322, + "grad_norm": 0.2865343391895294, + "learning_rate": 0.00047083333333333336, + "loss": 4.48, + "step": 1130 + }, + { + "epoch": 0.9603278501906768, + "grad_norm": 1.179539442062378, + "learning_rate": 0.000475, + "loss": 4.4762, + "step": 1140 + }, + { + "epoch": 0.9687517787011213, + "grad_norm": 0.4731704294681549, + "learning_rate": 0.0004791666666666667, + "loss": 4.4831, + "step": 1150 + }, + { + "epoch": 0.9771757072115659, + "grad_norm": 0.298757404088974, + "learning_rate": 0.00048333333333333334, + "loss": 4.4742, + "step": 1160 + }, + { + "epoch": 0.9855996357220104, + "grad_norm": 1.0954639911651611, + "learning_rate": 0.0004875, + "loss": 4.46, + "step": 1170 + }, + { + "epoch": 0.9855996357220104, + "eval_accuracy": 0.29021425691327735, + "eval_loss": 4.458162784576416, + "eval_runtime": 887.8161, + "eval_samples_per_second": 562.477, + "eval_steps_per_second": 5.208, + "step": 1170 + }, + { + "epoch": 0.9940235642324549, + "grad_norm": 0.441949725151062, + "learning_rate": 0.0004916666666666666, + "loss": 4.4549, + "step": 1180 + }, + { + "epoch": 1.0024474927428995, + "grad_norm": 0.5917736887931824, + "learning_rate": 0.0004958333333333334, + "loss": 4.4425, + "step": 1190 + }, + { + "epoch": 1.010871421253344, + "grad_norm": 0.3910304307937622, + "learning_rate": 0.0005, + "loss": 4.4376, + "step": 1200 + }, + { + "epoch": 1.0192953497637884, + "grad_norm": 0.446277916431427, + "learning_rate": 0.0005041666666666667, + "loss": 4.4284, + "step": 1210 + }, + { + "epoch": 1.027719278274233, + "grad_norm": 0.7843539118766785, + "learning_rate": 0.0005083333333333333, + "loss": 4.4216, + "step": 1220 + }, + { + "epoch": 1.0361432067846776, + "grad_norm": 0.5028587579727173, + "learning_rate": 0.0005124999999999999, + "loss": 4.418, + "step": 1230 + }, + { + "epoch": 1.044567135295122, + "grad_norm": 0.5062530636787415, + "learning_rate": 0.0005166666666666667, + "loss": 4.4099, + "step": 1240 + }, + { + "epoch": 1.0529910638055666, + "grad_norm": 0.4109475016593933, + "learning_rate": 0.0005208333333333334, + "loss": 4.4005, + "step": 1250 + }, + { + "epoch": 1.0614149923160112, + "grad_norm": 0.494357705116272, + "learning_rate": 0.0005250000000000001, + "loss": 4.3924, + "step": 1260 + }, + { + "epoch": 1.0614149923160112, + "eval_accuracy": 0.29121270831959656, + "eval_loss": 4.368500232696533, + "eval_runtime": 885.6194, + "eval_samples_per_second": 563.872, + "eval_steps_per_second": 5.221, + "step": 1260 + }, + { + "epoch": 1.0698389208264556, + "grad_norm": 0.4964124858379364, + "learning_rate": 0.0005291666666666667, + "loss": 4.3843, + "step": 1270 + }, + { + "epoch": 1.0782628493369002, + "grad_norm": 0.6328290700912476, + "learning_rate": 0.0005333333333333334, + "loss": 4.3756, + "step": 1280 + }, + { + "epoch": 1.0866867778473448, + "grad_norm": 0.8674759268760681, + "learning_rate": 0.0005375, + "loss": 4.3697, + "step": 1290 + }, + { + "epoch": 1.0951107063577892, + "grad_norm": 0.4631132185459137, + "learning_rate": 0.0005416666666666666, + "loss": 4.3676, + "step": 1300 + }, + { + "epoch": 1.1035346348682338, + "grad_norm": 0.5043870210647583, + "learning_rate": 0.0005458333333333333, + "loss": 4.3582, + "step": 1310 + }, + { + "epoch": 1.1119585633786784, + "grad_norm": 0.5791853666305542, + "learning_rate": 0.00055, + "loss": 4.3529, + "step": 1320 + }, + { + "epoch": 1.120382491889123, + "grad_norm": 0.6443321108818054, + "learning_rate": 0.0005541666666666667, + "loss": 4.3471, + "step": 1330 + }, + { + "epoch": 1.1288064203995674, + "grad_norm": 0.6193282008171082, + "learning_rate": 0.0005583333333333333, + "loss": 4.338, + "step": 1340 + }, + { + "epoch": 1.137230348910012, + "grad_norm": 0.6169930696487427, + "learning_rate": 0.0005625000000000001, + "loss": 4.3365, + "step": 1350 + }, + { + "epoch": 1.137230348910012, + "eval_accuracy": 0.2912005471998471, + "eval_loss": 4.2970428466796875, + "eval_runtime": 875.1704, + "eval_samples_per_second": 570.604, + "eval_steps_per_second": 5.284, + "step": 1350 + }, + { + "epoch": 1.1456542774204566, + "grad_norm": 0.8051270246505737, + "learning_rate": 0.0005666666666666667, + "loss": 4.3252, + "step": 1360 + }, + { + "epoch": 1.154078205930901, + "grad_norm": 0.7985979914665222, + "learning_rate": 0.0005708333333333333, + "loss": 4.3185, + "step": 1370 + }, + { + "epoch": 1.1625021344413455, + "grad_norm": 0.7459626793861389, + "learning_rate": 0.000575, + "loss": 4.3119, + "step": 1380 + }, + { + "epoch": 1.1709260629517901, + "grad_norm": 0.572289228439331, + "learning_rate": 0.0005791666666666667, + "loss": 4.3066, + "step": 1390 + }, + { + "epoch": 1.1793499914622347, + "grad_norm": 0.5565480589866638, + "learning_rate": 0.0005833333333333334, + "loss": 4.2973, + "step": 1400 + }, + { + "epoch": 1.1877739199726791, + "grad_norm": 0.789574384689331, + "learning_rate": 0.0005875, + "loss": 4.2922, + "step": 1410 + }, + { + "epoch": 1.1961978484831237, + "grad_norm": 1.0027601718902588, + "learning_rate": 0.0005916666666666667, + "loss": 4.2824, + "step": 1420 + }, + { + "epoch": 1.204621776993568, + "grad_norm": 0.8137519359588623, + "learning_rate": 0.0005958333333333333, + "loss": 4.2808, + "step": 1430 + }, + { + "epoch": 1.2130457055040127, + "grad_norm": 0.8705686330795288, + "learning_rate": 0.0006, + "loss": 4.2685, + "step": 1440 + }, + { + "epoch": 1.2130457055040127, + "eval_accuracy": 0.2922224943254529, + "eval_loss": 4.225285053253174, + "eval_runtime": 885.6768, + "eval_samples_per_second": 563.835, + "eval_steps_per_second": 5.221, + "step": 1440 + }, + { + "epoch": 1.2214696340144573, + "grad_norm": 1.0055943727493286, + "learning_rate": 0.0006041666666666666, + "loss": 4.2639, + "step": 1450 + }, + { + "epoch": 1.229893562524902, + "grad_norm": 0.9747255444526672, + "learning_rate": 0.0006083333333333333, + "loss": 4.2622, + "step": 1460 + }, + { + "epoch": 1.2383174910353463, + "grad_norm": 0.6799793243408203, + "learning_rate": 0.0006125000000000001, + "loss": 4.251, + "step": 1470 + }, + { + "epoch": 1.2467414195457909, + "grad_norm": 0.8863984942436218, + "learning_rate": 0.0006166666666666667, + "loss": 4.2476, + "step": 1480 + }, + { + "epoch": 1.2551653480562355, + "grad_norm": 0.891790509223938, + "learning_rate": 0.0006208333333333334, + "loss": 4.2434, + "step": 1490 + }, + { + "epoch": 1.2635892765666799, + "grad_norm": 0.731626033782959, + "learning_rate": 0.000625, + "loss": 4.233, + "step": 1500 + }, + { + "epoch": 1.2720132050771245, + "grad_norm": 0.7038396000862122, + "learning_rate": 0.0006291666666666667, + "loss": 4.2264, + "step": 1510 + }, + { + "epoch": 1.280437133587569, + "grad_norm": 1.0247654914855957, + "learning_rate": 0.0006333333333333333, + "loss": 4.2198, + "step": 1520 + }, + { + "epoch": 1.2888610620980137, + "grad_norm": 1.0854212045669556, + "learning_rate": 0.0006374999999999999, + "loss": 4.2126, + "step": 1530 + }, + { + "epoch": 1.2888610620980137, + "eval_accuracy": 0.2953678601775117, + "eval_loss": 4.152132034301758, + "eval_runtime": 880.7951, + "eval_samples_per_second": 566.96, + "eval_steps_per_second": 5.25, + "step": 1530 + }, + { + "epoch": 1.297284990608458, + "grad_norm": 0.8179611563682556, + "learning_rate": 0.0006416666666666667, + "loss": 4.2081, + "step": 1540 + }, + { + "epoch": 1.3057089191189026, + "grad_norm": 1.4174506664276123, + "learning_rate": 0.0006458333333333334, + "loss": 4.2027, + "step": 1550 + }, + { + "epoch": 1.314132847629347, + "grad_norm": 1.1611113548278809, + "learning_rate": 0.0006500000000000001, + "loss": 4.1992, + "step": 1560 + }, + { + "epoch": 1.3225567761397916, + "grad_norm": 1.1475598812103271, + "learning_rate": 0.0006541666666666667, + "loss": 4.1875, + "step": 1570 + }, + { + "epoch": 1.3309807046502362, + "grad_norm": 1.158115267753601, + "learning_rate": 0.0006583333333333334, + "loss": 4.1883, + "step": 1580 + }, + { + "epoch": 1.3394046331606808, + "grad_norm": 1.325655221939087, + "learning_rate": 0.0006625, + "loss": 4.181, + "step": 1590 + }, + { + "epoch": 1.3478285616711254, + "grad_norm": 1.077793836593628, + "learning_rate": 0.0006666666666666666, + "loss": 4.1727, + "step": 1600 + }, + { + "epoch": 1.3562524901815698, + "grad_norm": 1.2139134407043457, + "learning_rate": 0.0006708333333333333, + "loss": 4.1691, + "step": 1610 + }, + { + "epoch": 1.3646764186920144, + "grad_norm": 1.075778603553772, + "learning_rate": 0.000675, + "loss": 4.1563, + "step": 1620 + }, + { + "epoch": 1.3646764186920144, + "eval_accuracy": 0.2982954422675167, + "eval_loss": 4.0783562660217285, + "eval_runtime": 880.4076, + "eval_samples_per_second": 567.21, + "eval_steps_per_second": 5.252, + "step": 1620 + }, + { + "epoch": 1.3731003472024588, + "grad_norm": 1.8017152547836304, + "learning_rate": 0.0006791666666666667, + "loss": 4.1523, + "step": 1630 + }, + { + "epoch": 1.3815242757129034, + "grad_norm": 1.2614473104476929, + "learning_rate": 0.0006833333333333333, + "loss": 4.1481, + "step": 1640 + }, + { + "epoch": 1.389948204223348, + "grad_norm": 1.179167628288269, + "learning_rate": 0.0006875, + "loss": 4.1421, + "step": 1650 + }, + { + "epoch": 1.3983721327337926, + "grad_norm": 1.463998794555664, + "learning_rate": 0.0006916666666666667, + "loss": 4.1331, + "step": 1660 + }, + { + "epoch": 1.406796061244237, + "grad_norm": 1.086358666419983, + "learning_rate": 0.0006958333333333334, + "loss": 4.1276, + "step": 1670 + }, + { + "epoch": 1.4152199897546816, + "grad_norm": 1.3272647857666016, + "learning_rate": 0.0007, + "loss": 4.1357, + "step": 1680 + }, + { + "epoch": 1.4236439182651262, + "grad_norm": 1.4760971069335938, + "learning_rate": 0.0007041666666666667, + "loss": 4.1299, + "step": 1690 + }, + { + "epoch": 1.4320678467755705, + "grad_norm": 1.7591749429702759, + "learning_rate": 0.0007083333333333334, + "loss": 4.129, + "step": 1700 + }, + { + "epoch": 1.4404917752860151, + "grad_norm": 1.7945603132247925, + "learning_rate": 0.0007125, + "loss": 4.1221, + "step": 1710 + }, + { + "epoch": 1.4404917752860151, + "eval_accuracy": 0.3010639405026742, + "eval_loss": 4.012106895446777, + "eval_runtime": 881.7425, + "eval_samples_per_second": 566.351, + "eval_steps_per_second": 5.244, + "step": 1710 + }, + { + "epoch": 1.4489157037964597, + "grad_norm": 1.7016360759735107, + "learning_rate": 0.0007166666666666667, + "loss": 4.1043, + "step": 1720 + }, + { + "epoch": 1.4573396323069043, + "grad_norm": 1.8240207433700562, + "learning_rate": 0.0007208333333333333, + "loss": 4.1034, + "step": 1730 + }, + { + "epoch": 1.4657635608173487, + "grad_norm": 2.4510786533355713, + "learning_rate": 0.000725, + "loss": 4.0924, + "step": 1740 + }, + { + "epoch": 1.4741874893277933, + "grad_norm": 1.7411324977874756, + "learning_rate": 0.0007291666666666666, + "loss": 4.1041, + "step": 1750 + }, + { + "epoch": 1.4826114178382377, + "grad_norm": 1.1133612394332886, + "learning_rate": 0.0007333333333333333, + "loss": 4.1064, + "step": 1760 + }, + { + "epoch": 1.4910353463486823, + "grad_norm": 1.3936740159988403, + "learning_rate": 0.0007375000000000001, + "loss": 4.0954, + "step": 1770 + }, + { + "epoch": 1.499459274859127, + "grad_norm": 2.3855819702148438, + "learning_rate": 0.0007416666666666667, + "loss": 4.0836, + "step": 1780 + }, + { + "epoch": 1.5078832033695715, + "grad_norm": 1.2734453678131104, + "learning_rate": 0.0007458333333333334, + "loss": 4.0834, + "step": 1790 + }, + { + "epoch": 1.516307131880016, + "grad_norm": 1.432719349861145, + "learning_rate": 0.00075, + "loss": 4.0711, + "step": 1800 + }, + { + "epoch": 1.516307131880016, + "eval_accuracy": 0.3055703004736556, + "eval_loss": 3.976287841796875, + "eval_runtime": 881.3595, + "eval_samples_per_second": 566.597, + "eval_steps_per_second": 5.246, + "step": 1800 + }, + { + "epoch": 1.5247310603904605, + "grad_norm": 1.5839996337890625, + "learning_rate": 0.0007541666666666667, + "loss": 4.0712, + "step": 1810 + }, + { + "epoch": 1.5331549889009048, + "grad_norm": 3.0461270809173584, + "learning_rate": 0.0007583333333333333, + "loss": 4.0617, + "step": 1820 + }, + { + "epoch": 1.5415789174113494, + "grad_norm": 1.760568380355835, + "learning_rate": 0.0007624999999999999, + "loss": 4.0486, + "step": 1830 + }, + { + "epoch": 1.550002845921794, + "grad_norm": 1.6682184934616089, + "learning_rate": 0.0007666666666666667, + "loss": 4.0034, + "step": 1840 + }, + { + "epoch": 1.5584267744322386, + "grad_norm": 1.4350653886795044, + "learning_rate": 0.0007708333333333334, + "loss": 3.9644, + "step": 1850 + }, + { + "epoch": 1.5668507029426832, + "grad_norm": 1.4870712757110596, + "learning_rate": 0.0007750000000000001, + "loss": 3.9314, + "step": 1860 + }, + { + "epoch": 1.5752746314531276, + "grad_norm": 1.7954463958740234, + "learning_rate": 0.0007791666666666667, + "loss": 3.8939, + "step": 1870 + }, + { + "epoch": 1.5836985599635722, + "grad_norm": 2.1485602855682373, + "learning_rate": 0.0007833333333333334, + "loss": 3.8576, + "step": 1880 + }, + { + "epoch": 1.5921224884740166, + "grad_norm": 1.647570252418518, + "learning_rate": 0.0007875, + "loss": 3.8159, + "step": 1890 + }, + { + "epoch": 1.5921224884740166, + "eval_accuracy": 0.3353472770952767, + "eval_loss": 3.6341910362243652, + "eval_runtime": 881.1424, + "eval_samples_per_second": 566.737, + "eval_steps_per_second": 5.248, + "step": 1890 + }, + { + "epoch": 1.6005464169844612, + "grad_norm": 1.7171742916107178, + "learning_rate": 0.0007916666666666666, + "loss": 3.7812, + "step": 1900 + }, + { + "epoch": 1.6089703454949058, + "grad_norm": 2.12190580368042, + "learning_rate": 0.0007958333333333333, + "loss": 3.7402, + "step": 1910 + }, + { + "epoch": 1.6173942740053504, + "grad_norm": 1.7334414720535278, + "learning_rate": 0.0008, + "loss": 3.7025, + "step": 1920 + }, + { + "epoch": 1.625818202515795, + "grad_norm": 1.8880668878555298, + "learning_rate": 0.0008041666666666667, + "loss": 3.6808, + "step": 1930 + }, + { + "epoch": 1.6342421310262394, + "grad_norm": 2.3294591903686523, + "learning_rate": 0.0008083333333333333, + "loss": 3.6419, + "step": 1940 + }, + { + "epoch": 1.642666059536684, + "grad_norm": 2.4122796058654785, + "learning_rate": 0.0008125000000000001, + "loss": 3.6114, + "step": 1950 + }, + { + "epoch": 1.6510899880471284, + "grad_norm": 2.090388774871826, + "learning_rate": 0.0008166666666666667, + "loss": 3.5867, + "step": 1960 + }, + { + "epoch": 1.659513916557573, + "grad_norm": 2.267676830291748, + "learning_rate": 0.0008208333333333334, + "loss": 3.5501, + "step": 1970 + }, + { + "epoch": 1.6679378450680176, + "grad_norm": 2.253739833831787, + "learning_rate": 0.000825, + "loss": 3.5114, + "step": 1980 + }, + { + "epoch": 1.6679378450680176, + "eval_accuracy": 0.38861593633258434, + "eval_loss": 3.2597665786743164, + "eval_runtime": 889.3264, + "eval_samples_per_second": 561.522, + "eval_steps_per_second": 5.199, + "step": 1980 + }, + { + "epoch": 1.6763617735784622, + "grad_norm": 2.269505739212036, + "learning_rate": 0.0008291666666666667, + "loss": 3.4854, + "step": 1990 + }, + { + "epoch": 1.6847857020889065, + "grad_norm": 1.7237802743911743, + "learning_rate": 0.0008333333333333334, + "loss": 3.4651, + "step": 2000 + }, + { + "epoch": 1.6932096305993511, + "grad_norm": 2.1117663383483887, + "learning_rate": 0.0008375, + "loss": 3.4558, + "step": 2010 + }, + { + "epoch": 1.7016335591097955, + "grad_norm": 2.1351046562194824, + "learning_rate": 0.0008416666666666667, + "loss": 3.4256, + "step": 2020 + }, + { + "epoch": 1.7100574876202401, + "grad_norm": 2.326232671737671, + "learning_rate": 0.0008458333333333333, + "loss": 3.3998, + "step": 2030 + }, + { + "epoch": 1.7184814161306847, + "grad_norm": 2.1802730560302734, + "learning_rate": 0.00085, + "loss": 3.3865, + "step": 2040 + }, + { + "epoch": 1.7269053446411293, + "grad_norm": 2.042966604232788, + "learning_rate": 0.0008541666666666666, + "loss": 3.3539, + "step": 2050 + }, + { + "epoch": 1.735329273151574, + "grad_norm": 2.052464008331299, + "learning_rate": 0.0008583333333333333, + "loss": 3.3308, + "step": 2060 + }, + { + "epoch": 1.7437532016620183, + "grad_norm": 1.5790934562683105, + "learning_rate": 0.0008625000000000001, + "loss": 3.3122, + "step": 2070 + }, + { + "epoch": 1.7437532016620183, + "eval_accuracy": 0.41178756961484836, + "eval_loss": 3.0882680416107178, + "eval_runtime": 878.4742, + "eval_samples_per_second": 568.458, + "eval_steps_per_second": 5.264, + "step": 2070 + }, + { + "epoch": 1.752177130172463, + "grad_norm": 2.2859761714935303, + "learning_rate": 0.0008666666666666667, + "loss": 3.3034, + "step": 2080 + }, + { + "epoch": 1.7606010586829073, + "grad_norm": 2.912191867828369, + "learning_rate": 0.0008708333333333334, + "loss": 3.289, + "step": 2090 + }, + { + "epoch": 1.7690249871933519, + "grad_norm": 2.143118143081665, + "learning_rate": 0.000875, + "loss": 3.2547, + "step": 2100 + }, + { + "epoch": 1.7774489157037965, + "grad_norm": 1.8577404022216797, + "learning_rate": 0.0008791666666666667, + "loss": 3.2383, + "step": 2110 + }, + { + "epoch": 1.785872844214241, + "grad_norm": 1.9692562818527222, + "learning_rate": 0.0008833333333333333, + "loss": 3.2137, + "step": 2120 + }, + { + "epoch": 1.7942967727246857, + "grad_norm": 1.938915729522705, + "learning_rate": 0.0008874999999999999, + "loss": 3.1909, + "step": 2130 + }, + { + "epoch": 1.80272070123513, + "grad_norm": 1.395321011543274, + "learning_rate": 0.0008916666666666667, + "loss": 3.1346, + "step": 2140 + }, + { + "epoch": 1.8111446297455744, + "grad_norm": 1.8771544694900513, + "learning_rate": 0.0008958333333333334, + "loss": 3.1035, + "step": 2150 + }, + { + "epoch": 1.819568558256019, + "grad_norm": 1.5829336643218994, + "learning_rate": 0.0009000000000000001, + "loss": 3.0328, + "step": 2160 + }, + { + "epoch": 1.819568558256019, + "eval_accuracy": 0.45304088376136725, + "eval_loss": 2.8062996864318848, + "eval_runtime": 886.0675, + "eval_samples_per_second": 563.587, + "eval_steps_per_second": 5.219, + "step": 2160 + }, + { + "epoch": 1.8279924867664636, + "grad_norm": 1.5085866451263428, + "learning_rate": 0.0009041666666666667, + "loss": 3.0089, + "step": 2170 + }, + { + "epoch": 1.8364164152769082, + "grad_norm": 1.4988549947738647, + "learning_rate": 0.0009083333333333334, + "loss": 2.9786, + "step": 2180 + }, + { + "epoch": 1.8448403437873528, + "grad_norm": 1.5726799964904785, + "learning_rate": 0.0009125, + "loss": 2.936, + "step": 2190 + }, + { + "epoch": 1.8532642722977972, + "grad_norm": 1.2175358533859253, + "learning_rate": 0.0009166666666666666, + "loss": 2.8996, + "step": 2200 + }, + { + "epoch": 1.8616882008082418, + "grad_norm": 1.4195218086242676, + "learning_rate": 0.0009208333333333333, + "loss": 2.8664, + "step": 2210 + }, + { + "epoch": 1.8701121293186862, + "grad_norm": 1.1213312149047852, + "learning_rate": 0.000925, + "loss": 2.8382, + "step": 2220 + }, + { + "epoch": 1.8785360578291308, + "grad_norm": 1.169554591178894, + "learning_rate": 0.0009291666666666667, + "loss": 2.8026, + "step": 2230 + }, + { + "epoch": 1.8869599863395754, + "grad_norm": 1.4759305715560913, + "learning_rate": 0.0009333333333333333, + "loss": 2.7654, + "step": 2240 + }, + { + "epoch": 1.89538391485002, + "grad_norm": 1.3071763515472412, + "learning_rate": 0.0009375, + "loss": 2.7311, + "step": 2250 + }, + { + "epoch": 1.89538391485002, + "eval_accuracy": 0.4917409385648686, + "eval_loss": 2.5433878898620605, + "eval_runtime": 879.3794, + "eval_samples_per_second": 567.873, + "eval_steps_per_second": 5.258, + "step": 2250 + }, + { + "epoch": 1.9038078433604646, + "grad_norm": 0.9968194961547852, + "learning_rate": 0.0009416666666666667, + "loss": 2.7044, + "step": 2260 + }, + { + "epoch": 1.912231771870909, + "grad_norm": 1.1783692836761475, + "learning_rate": 0.0009458333333333334, + "loss": 2.6819, + "step": 2270 + }, + { + "epoch": 1.9206557003813534, + "grad_norm": 0.9856918454170227, + "learning_rate": 0.00095, + "loss": 2.6528, + "step": 2280 + }, + { + "epoch": 1.929079628891798, + "grad_norm": 1.0605028867721558, + "learning_rate": 0.0009541666666666667, + "loss": 2.6226, + "step": 2290 + }, + { + "epoch": 1.9375035574022426, + "grad_norm": 0.8553977608680725, + "learning_rate": 0.0009583333333333334, + "loss": 2.608, + "step": 2300 + }, + { + "epoch": 1.9459274859126872, + "grad_norm": 0.9543612599372864, + "learning_rate": 0.0009625, + "loss": 2.5865, + "step": 2310 + }, + { + "epoch": 1.9543514144231318, + "grad_norm": 1.1085282564163208, + "learning_rate": 0.0009666666666666667, + "loss": 2.5586, + "step": 2320 + }, + { + "epoch": 1.9627753429335761, + "grad_norm": 0.8689624667167664, + "learning_rate": 0.0009708333333333333, + "loss": 2.541, + "step": 2330 + }, + { + "epoch": 1.9711992714440207, + "grad_norm": 0.6790447235107422, + "learning_rate": 0.000975, + "loss": 2.5214, + "step": 2340 + }, + { + "epoch": 1.9711992714440207, + "eval_accuracy": 0.5198810557311793, + "eval_loss": 2.3582663536071777, + "eval_runtime": 891.4654, + "eval_samples_per_second": 560.174, + "eval_steps_per_second": 5.187, + "step": 2340 + }, + { + "epoch": 1.9796231999544651, + "grad_norm": 1.1572414636611938, + "learning_rate": 0.0009791666666666666, + "loss": 2.5126, + "step": 2350 + }, + { + "epoch": 1.9880471284649097, + "grad_norm": 0.8218650221824646, + "learning_rate": 0.0009833333333333332, + "loss": 2.4903, + "step": 2360 + }, + { + "epoch": 1.9964710569753543, + "grad_norm": 0.9195880889892578, + "learning_rate": 0.0009875, + "loss": 2.479, + "step": 2370 + }, + { + "epoch": 2.004894985485799, + "grad_norm": 0.6436383724212646, + "learning_rate": 0.0009916666666666667, + "loss": 2.4509, + "step": 2380 + }, + { + "epoch": 2.0133189139962435, + "grad_norm": 0.9757860898971558, + "learning_rate": 0.0009958333333333334, + "loss": 2.453, + "step": 2390 + }, + { + "epoch": 2.021742842506688, + "grad_norm": 0.8884423971176147, + "learning_rate": 0.001, + "loss": 2.428, + "step": 2400 + }, + { + "epoch": 2.0301667710171323, + "grad_norm": 1.097330093383789, + "learning_rate": 0.000999009900990099, + "loss": 2.4139, + "step": 2410 + }, + { + "epoch": 2.038590699527577, + "grad_norm": 1.095337152481079, + "learning_rate": 0.0009980198019801981, + "loss": 2.4024, + "step": 2420 + }, + { + "epoch": 2.0470146280380215, + "grad_norm": 1.0757551193237305, + "learning_rate": 0.000997029702970297, + "loss": 2.3853, + "step": 2430 + }, + { + "epoch": 2.0470146280380215, + "eval_accuracy": 0.538133837771306, + "eval_loss": 2.2352097034454346, + "eval_runtime": 883.4374, + "eval_samples_per_second": 565.265, + "eval_steps_per_second": 5.234, + "step": 2430 + }, + { + "epoch": 2.055438556548466, + "grad_norm": 0.9356153011322021, + "learning_rate": 0.000996039603960396, + "loss": 2.3669, + "step": 2440 + }, + { + "epoch": 2.0638624850589107, + "grad_norm": 0.8463107347488403, + "learning_rate": 0.000995049504950495, + "loss": 2.3604, + "step": 2450 + }, + { + "epoch": 2.0722864135693553, + "grad_norm": 0.8833483457565308, + "learning_rate": 0.0009940594059405941, + "loss": 2.3574, + "step": 2460 + }, + { + "epoch": 2.0807103420797994, + "grad_norm": 0.7081923484802246, + "learning_rate": 0.0009930693069306932, + "loss": 2.3338, + "step": 2470 + }, + { + "epoch": 2.089134270590244, + "grad_norm": 0.5993143916130066, + "learning_rate": 0.000992079207920792, + "loss": 2.3219, + "step": 2480 + }, + { + "epoch": 2.0975581991006886, + "grad_norm": 0.8431512117385864, + "learning_rate": 0.000991089108910891, + "loss": 2.3108, + "step": 2490 + }, + { + "epoch": 2.1059821276111332, + "grad_norm": 0.9983824491500854, + "learning_rate": 0.0009900990099009901, + "loss": 2.305, + "step": 2500 + }, + { + "epoch": 2.114406056121578, + "grad_norm": 0.6354156732559204, + "learning_rate": 0.0009891089108910892, + "loss": 2.2965, + "step": 2510 + }, + { + "epoch": 2.1228299846320224, + "grad_norm": 0.8491016626358032, + "learning_rate": 0.0009881188118811882, + "loss": 2.2763, + "step": 2520 + }, + { + "epoch": 2.1228299846320224, + "eval_accuracy": 0.5540495533549666, + "eval_loss": 2.135758399963379, + "eval_runtime": 895.5557, + "eval_samples_per_second": 557.616, + "eval_steps_per_second": 5.163, + "step": 2520 + }, + { + "epoch": 2.131253913142467, + "grad_norm": 0.6909253001213074, + "learning_rate": 0.000987128712871287, + "loss": 2.2696, + "step": 2530 + }, + { + "epoch": 2.139677841652911, + "grad_norm": 0.5072851181030273, + "learning_rate": 0.000986138613861386, + "loss": 2.2555, + "step": 2540 + }, + { + "epoch": 2.148101770163356, + "grad_norm": 0.7575969696044922, + "learning_rate": 0.0009851485148514852, + "loss": 2.2552, + "step": 2550 + }, + { + "epoch": 2.1565256986738004, + "grad_norm": 0.7418563365936279, + "learning_rate": 0.0009841584158415842, + "loss": 2.2439, + "step": 2560 + }, + { + "epoch": 2.164949627184245, + "grad_norm": 0.5893211960792542, + "learning_rate": 0.0009831683168316833, + "loss": 2.2282, + "step": 2570 + }, + { + "epoch": 2.1733735556946896, + "grad_norm": 0.892035186290741, + "learning_rate": 0.000982178217821782, + "loss": 2.2201, + "step": 2580 + }, + { + "epoch": 2.181797484205134, + "grad_norm": 0.688275933265686, + "learning_rate": 0.0009811881188118811, + "loss": 2.2174, + "step": 2590 + }, + { + "epoch": 2.1902214127155784, + "grad_norm": 0.5092687010765076, + "learning_rate": 0.0009801980198019802, + "loss": 2.2032, + "step": 2600 + }, + { + "epoch": 2.198645341226023, + "grad_norm": 0.6715185642242432, + "learning_rate": 0.0009792079207920793, + "loss": 2.189, + "step": 2610 + }, + { + "epoch": 2.198645341226023, + "eval_accuracy": 0.5674450081410035, + "eval_loss": 2.053079605102539, + "eval_runtime": 876.7453, + "eval_samples_per_second": 569.579, + "eval_steps_per_second": 5.274, + "step": 2610 + }, + { + "epoch": 2.2070692697364676, + "grad_norm": 0.5717750191688538, + "learning_rate": 0.0009782178217821783, + "loss": 2.1894, + "step": 2620 + }, + { + "epoch": 2.215493198246912, + "grad_norm": 0.7002500295639038, + "learning_rate": 0.0009772277227722771, + "loss": 2.1851, + "step": 2630 + }, + { + "epoch": 2.2239171267573568, + "grad_norm": 0.6041799783706665, + "learning_rate": 0.0009762376237623762, + "loss": 2.1899, + "step": 2640 + }, + { + "epoch": 2.2323410552678014, + "grad_norm": 0.40263745188713074, + "learning_rate": 0.0009752475247524752, + "loss": 2.1633, + "step": 2650 + }, + { + "epoch": 2.240764983778246, + "grad_norm": 0.47779303789138794, + "learning_rate": 0.0009742574257425743, + "loss": 2.1478, + "step": 2660 + }, + { + "epoch": 2.24918891228869, + "grad_norm": 0.8906975984573364, + "learning_rate": 0.0009732673267326732, + "loss": 2.1508, + "step": 2670 + }, + { + "epoch": 2.2576128407991347, + "grad_norm": 0.4588846266269684, + "learning_rate": 0.0009722772277227723, + "loss": 2.1422, + "step": 2680 + }, + { + "epoch": 2.2660367693095793, + "grad_norm": 0.6038916707038879, + "learning_rate": 0.0009712871287128712, + "loss": 2.1229, + "step": 2690 + }, + { + "epoch": 2.274460697820024, + "grad_norm": 0.792378842830658, + "learning_rate": 0.0009702970297029703, + "loss": 2.1262, + "step": 2700 + }, + { + "epoch": 2.274460697820024, + "eval_accuracy": 0.5767164906847645, + "eval_loss": 1.9968212842941284, + "eval_runtime": 890.0794, + "eval_samples_per_second": 561.047, + "eval_steps_per_second": 5.195, + "step": 2700 + }, + { + "epoch": 2.2828846263304685, + "grad_norm": 0.5215600728988647, + "learning_rate": 0.0009693069306930693, + "loss": 2.1315, + "step": 2710 + }, + { + "epoch": 2.291308554840913, + "grad_norm": 0.42443060874938965, + "learning_rate": 0.0009683168316831683, + "loss": 2.1075, + "step": 2720 + }, + { + "epoch": 2.2997324833513577, + "grad_norm": 0.7379765510559082, + "learning_rate": 0.0009673267326732673, + "loss": 2.0997, + "step": 2730 + }, + { + "epoch": 2.308156411861802, + "grad_norm": 0.532883882522583, + "learning_rate": 0.0009663366336633663, + "loss": 2.1009, + "step": 2740 + }, + { + "epoch": 2.3165803403722465, + "grad_norm": 0.4312550127506256, + "learning_rate": 0.0009653465346534653, + "loss": 2.0836, + "step": 2750 + }, + { + "epoch": 2.325004268882691, + "grad_norm": 0.42506101727485657, + "learning_rate": 0.0009643564356435644, + "loss": 2.0751, + "step": 2760 + }, + { + "epoch": 2.3334281973931357, + "grad_norm": 0.9728929400444031, + "learning_rate": 0.0009633663366336633, + "loss": 2.0755, + "step": 2770 + }, + { + "epoch": 2.3418521259035803, + "grad_norm": 0.4502295255661011, + "learning_rate": 0.0009623762376237624, + "loss": 2.0757, + "step": 2780 + }, + { + "epoch": 2.350276054414025, + "grad_norm": 0.6825786232948303, + "learning_rate": 0.0009613861386138613, + "loss": 2.0593, + "step": 2790 + }, + { + "epoch": 2.350276054414025, + "eval_accuracy": 0.5877788692302428, + "eval_loss": 1.932070255279541, + "eval_runtime": 877.2049, + "eval_samples_per_second": 569.281, + "eval_steps_per_second": 5.271, + "step": 2790 + }, + { + "epoch": 2.3586999829244695, + "grad_norm": 0.5142760276794434, + "learning_rate": 0.0009603960396039604, + "loss": 2.0529, + "step": 2800 + }, + { + "epoch": 2.3671239114349136, + "grad_norm": 0.613132119178772, + "learning_rate": 0.0009594059405940594, + "loss": 2.0423, + "step": 2810 + }, + { + "epoch": 2.3755478399453582, + "grad_norm": 0.7282253503799438, + "learning_rate": 0.0009584158415841584, + "loss": 2.0522, + "step": 2820 + }, + { + "epoch": 2.383971768455803, + "grad_norm": 0.37959426641464233, + "learning_rate": 0.0009574257425742574, + "loss": 2.0367, + "step": 2830 + }, + { + "epoch": 2.3923956969662474, + "grad_norm": 0.35326164960861206, + "learning_rate": 0.0009564356435643564, + "loss": 2.0233, + "step": 2840 + }, + { + "epoch": 2.400819625476692, + "grad_norm": 0.8196151256561279, + "learning_rate": 0.0009554455445544554, + "loss": 2.0264, + "step": 2850 + }, + { + "epoch": 2.409243553987136, + "grad_norm": 0.7122208476066589, + "learning_rate": 0.0009544554455445545, + "loss": 2.0308, + "step": 2860 + }, + { + "epoch": 2.417667482497581, + "grad_norm": 0.35665011405944824, + "learning_rate": 0.0009534653465346534, + "loss": 2.0133, + "step": 2870 + }, + { + "epoch": 2.4260914110080254, + "grad_norm": 0.3755228519439697, + "learning_rate": 0.0009524752475247525, + "loss": 1.9992, + "step": 2880 + }, + { + "epoch": 2.4260914110080254, + "eval_accuracy": 0.596780331496744, + "eval_loss": 1.8819479942321777, + "eval_runtime": 890.4504, + "eval_samples_per_second": 560.813, + "eval_steps_per_second": 5.193, + "step": 2880 + }, + { + "epoch": 2.43451533951847, + "grad_norm": 0.7018378376960754, + "learning_rate": 0.0009514851485148514, + "loss": 2.0013, + "step": 2890 + }, + { + "epoch": 2.4429392680289146, + "grad_norm": 0.4874301850795746, + "learning_rate": 0.0009504950495049505, + "loss": 1.9971, + "step": 2900 + }, + { + "epoch": 2.451363196539359, + "grad_norm": 0.45909377932548523, + "learning_rate": 0.0009495049504950495, + "loss": 1.9881, + "step": 2910 + }, + { + "epoch": 2.459787125049804, + "grad_norm": 0.4965904951095581, + "learning_rate": 0.0009485148514851485, + "loss": 1.989, + "step": 2920 + }, + { + "epoch": 2.468211053560248, + "grad_norm": 0.4780527949333191, + "learning_rate": 0.0009475247524752475, + "loss": 1.9795, + "step": 2930 + }, + { + "epoch": 2.4766349820706925, + "grad_norm": 0.5145118236541748, + "learning_rate": 0.0009465346534653465, + "loss": 1.973, + "step": 2940 + }, + { + "epoch": 2.485058910581137, + "grad_norm": 0.5469622015953064, + "learning_rate": 0.0009455445544554455, + "loss": 1.9692, + "step": 2950 + }, + { + "epoch": 2.4934828390915817, + "grad_norm": 0.5788788199424744, + "learning_rate": 0.0009445544554455446, + "loss": 1.9627, + "step": 2960 + }, + { + "epoch": 2.5019067676020263, + "grad_norm": 0.5380696654319763, + "learning_rate": 0.0009435643564356435, + "loss": 1.9624, + "step": 2970 + }, + { + "epoch": 2.5019067676020263, + "eval_accuracy": 0.6028271764812113, + "eval_loss": 1.8441975116729736, + "eval_runtime": 877.1334, + "eval_samples_per_second": 569.327, + "eval_steps_per_second": 5.272, + "step": 2970 + }, + { + "epoch": 2.510330696112471, + "grad_norm": 0.4939862787723541, + "learning_rate": 0.0009425742574257426, + "loss": 1.9576, + "step": 2980 + }, + { + "epoch": 2.5187546246229155, + "grad_norm": 0.4804815649986267, + "learning_rate": 0.0009415841584158415, + "loss": 1.948, + "step": 2990 + }, + { + "epoch": 2.5271785531333597, + "grad_norm": 0.529515266418457, + "learning_rate": 0.0009405940594059406, + "loss": 1.9414, + "step": 3000 + }, + { + "epoch": 2.5356024816438043, + "grad_norm": 0.5104151964187622, + "learning_rate": 0.0009396039603960396, + "loss": 1.9472, + "step": 3010 + }, + { + "epoch": 2.544026410154249, + "grad_norm": 0.36934202909469604, + "learning_rate": 0.0009386138613861386, + "loss": 1.9358, + "step": 3020 + }, + { + "epoch": 2.5524503386646935, + "grad_norm": 0.5956403017044067, + "learning_rate": 0.0009376237623762376, + "loss": 1.9272, + "step": 3030 + }, + { + "epoch": 2.560874267175138, + "grad_norm": 0.5035738348960876, + "learning_rate": 0.0009366336633663367, + "loss": 1.934, + "step": 3040 + }, + { + "epoch": 2.5692981956855827, + "grad_norm": 0.44133296608924866, + "learning_rate": 0.0009356435643564357, + "loss": 1.9192, + "step": 3050 + }, + { + "epoch": 2.5777221241960273, + "grad_norm": 0.617588996887207, + "learning_rate": 0.0009346534653465348, + "loss": 1.9189, + "step": 3060 + }, + { + "epoch": 2.5777221241960273, + "eval_accuracy": 0.6097417836200192, + "eval_loss": 1.806692123413086, + "eval_runtime": 890.173, + "eval_samples_per_second": 560.988, + "eval_steps_per_second": 5.194, + "step": 3060 + }, + { + "epoch": 2.5861460527064715, + "grad_norm": 0.4702962338924408, + "learning_rate": 0.0009336633663366337, + "loss": 1.9145, + "step": 3070 + }, + { + "epoch": 2.594569981216916, + "grad_norm": 0.37163108587265015, + "learning_rate": 0.0009326732673267328, + "loss": 1.907, + "step": 3080 + }, + { + "epoch": 2.6029939097273607, + "grad_norm": 0.8039525151252747, + "learning_rate": 0.0009316831683168317, + "loss": 1.9071, + "step": 3090 + }, + { + "epoch": 2.6114178382378053, + "grad_norm": 0.3594844341278076, + "learning_rate": 0.0009306930693069308, + "loss": 1.9109, + "step": 3100 + }, + { + "epoch": 2.61984176674825, + "grad_norm": 0.44677871465682983, + "learning_rate": 0.0009297029702970298, + "loss": 1.8948, + "step": 3110 + }, + { + "epoch": 2.628265695258694, + "grad_norm": 0.4496874511241913, + "learning_rate": 0.0009287128712871288, + "loss": 1.893, + "step": 3120 + }, + { + "epoch": 2.636689623769139, + "grad_norm": 0.44437769055366516, + "learning_rate": 0.0009277227722772278, + "loss": 1.8891, + "step": 3130 + }, + { + "epoch": 2.6451135522795832, + "grad_norm": 0.47511276602745056, + "learning_rate": 0.0009267326732673268, + "loss": 1.8828, + "step": 3140 + }, + { + "epoch": 2.653537480790028, + "grad_norm": 0.5357436537742615, + "learning_rate": 0.0009257425742574258, + "loss": 1.8802, + "step": 3150 + }, + { + "epoch": 2.653537480790028, + "eval_accuracy": 0.6167399590165771, + "eval_loss": 1.7698620557785034, + "eval_runtime": 887.5592, + "eval_samples_per_second": 562.64, + "eval_steps_per_second": 5.21, + "step": 3150 + }, + { + "epoch": 2.6619614093004724, + "grad_norm": 0.5014392137527466, + "learning_rate": 0.0009247524752475249, + "loss": 1.8819, + "step": 3160 + }, + { + "epoch": 2.670385337810917, + "grad_norm": 0.41872531175613403, + "learning_rate": 0.0009237623762376238, + "loss": 1.8736, + "step": 3170 + }, + { + "epoch": 2.6788092663213616, + "grad_norm": 0.4343492388725281, + "learning_rate": 0.0009227722772277229, + "loss": 1.8659, + "step": 3180 + }, + { + "epoch": 2.687233194831806, + "grad_norm": 0.45470404624938965, + "learning_rate": 0.0009217821782178218, + "loss": 1.8689, + "step": 3190 + }, + { + "epoch": 2.695657123342251, + "grad_norm": 0.4626518487930298, + "learning_rate": 0.0009207920792079209, + "loss": 1.8606, + "step": 3200 + }, + { + "epoch": 2.704081051852695, + "grad_norm": 0.4213305711746216, + "learning_rate": 0.0009198019801980199, + "loss": 1.8587, + "step": 3210 + }, + { + "epoch": 2.7125049803631396, + "grad_norm": 0.5036765336990356, + "learning_rate": 0.0009188118811881188, + "loss": 1.8514, + "step": 3220 + }, + { + "epoch": 2.720928908873584, + "grad_norm": 0.4738876223564148, + "learning_rate": 0.0009178217821782179, + "loss": 1.8506, + "step": 3230 + }, + { + "epoch": 2.729352837384029, + "grad_norm": 0.3712784945964813, + "learning_rate": 0.0009168316831683168, + "loss": 1.8461, + "step": 3240 + }, + { + "epoch": 2.729352837384029, + "eval_accuracy": 0.6231111347423419, + "eval_loss": 1.7313838005065918, + "eval_runtime": 889.784, + "eval_samples_per_second": 561.233, + "eval_steps_per_second": 5.197, + "step": 3240 + }, + { + "epoch": 2.7377767658944734, + "grad_norm": 0.45651596784591675, + "learning_rate": 0.0009158415841584159, + "loss": 1.8405, + "step": 3250 + }, + { + "epoch": 2.7462006944049175, + "grad_norm": 0.5253742933273315, + "learning_rate": 0.000914851485148515, + "loss": 1.839, + "step": 3260 + }, + { + "epoch": 2.754624622915362, + "grad_norm": 0.4810900390148163, + "learning_rate": 0.0009138613861386139, + "loss": 1.8352, + "step": 3270 + }, + { + "epoch": 2.7630485514258067, + "grad_norm": 0.42353251576423645, + "learning_rate": 0.0009128712871287129, + "loss": 1.8308, + "step": 3280 + }, + { + "epoch": 2.7714724799362513, + "grad_norm": 0.34494903683662415, + "learning_rate": 0.0009118811881188119, + "loss": 1.8271, + "step": 3290 + }, + { + "epoch": 2.779896408446696, + "grad_norm": 0.44857293367385864, + "learning_rate": 0.0009108910891089109, + "loss": 1.8272, + "step": 3300 + }, + { + "epoch": 2.7883203369571405, + "grad_norm": 0.32810303568840027, + "learning_rate": 0.00090990099009901, + "loss": 1.8201, + "step": 3310 + }, + { + "epoch": 2.796744265467585, + "grad_norm": 0.5814313292503357, + "learning_rate": 0.0009089108910891089, + "loss": 1.8181, + "step": 3320 + }, + { + "epoch": 2.8051681939780293, + "grad_norm": 0.6469531655311584, + "learning_rate": 0.000907920792079208, + "loss": 1.8228, + "step": 3330 + }, + { + "epoch": 2.8051681939780293, + "eval_accuracy": 0.627194729904968, + "eval_loss": 1.7094751596450806, + "eval_runtime": 879.8799, + "eval_samples_per_second": 567.55, + "eval_steps_per_second": 5.255, + "step": 3330 + }, + { + "epoch": 2.813592122488474, + "grad_norm": 0.37370234727859497, + "learning_rate": 0.0009069306930693069, + "loss": 1.8143, + "step": 3340 + }, + { + "epoch": 2.8220160509989185, + "grad_norm": 0.2818905711174011, + "learning_rate": 0.000905940594059406, + "loss": 1.8058, + "step": 3350 + }, + { + "epoch": 2.830439979509363, + "grad_norm": 0.40032240748405457, + "learning_rate": 0.000904950495049505, + "loss": 1.8037, + "step": 3360 + }, + { + "epoch": 2.8388639080198077, + "grad_norm": 0.4075703024864197, + "learning_rate": 0.000903960396039604, + "loss": 1.8042, + "step": 3370 + }, + { + "epoch": 2.8472878365302523, + "grad_norm": 0.4188884496688843, + "learning_rate": 0.000902970297029703, + "loss": 1.7954, + "step": 3380 + }, + { + "epoch": 2.855711765040697, + "grad_norm": 0.40151095390319824, + "learning_rate": 0.000901980198019802, + "loss": 1.8, + "step": 3390 + }, + { + "epoch": 2.864135693551141, + "grad_norm": 0.38640516996383667, + "learning_rate": 0.000900990099009901, + "loss": 1.7897, + "step": 3400 + }, + { + "epoch": 2.8725596220615857, + "grad_norm": 0.46775710582733154, + "learning_rate": 0.0009000000000000001, + "loss": 1.7889, + "step": 3410 + }, + { + "epoch": 2.8809835505720303, + "grad_norm": 0.5004317760467529, + "learning_rate": 0.000899009900990099, + "loss": 1.7838, + "step": 3420 + }, + { + "epoch": 2.8809835505720303, + "eval_accuracy": 0.6330453392339891, + "eval_loss": 1.6756778955459595, + "eval_runtime": 890.43, + "eval_samples_per_second": 560.826, + "eval_steps_per_second": 5.193, + "step": 3420 + }, + { + "epoch": 2.889407479082475, + "grad_norm": 0.44054290652275085, + "learning_rate": 0.0008980198019801981, + "loss": 1.7839, + "step": 3430 + }, + { + "epoch": 2.8978314075929195, + "grad_norm": 0.38003844022750854, + "learning_rate": 0.000897029702970297, + "loss": 1.7793, + "step": 3440 + }, + { + "epoch": 2.9062553361033636, + "grad_norm": 0.3714471757411957, + "learning_rate": 0.0008960396039603961, + "loss": 1.7765, + "step": 3450 + }, + { + "epoch": 2.9146792646138087, + "grad_norm": 0.4955293834209442, + "learning_rate": 0.0008950495049504951, + "loss": 1.7729, + "step": 3460 + }, + { + "epoch": 2.923103193124253, + "grad_norm": 0.367481529712677, + "learning_rate": 0.0008940594059405941, + "loss": 1.7666, + "step": 3470 + }, + { + "epoch": 2.9315271216346974, + "grad_norm": 0.48372742533683777, + "learning_rate": 0.0008930693069306931, + "loss": 1.7638, + "step": 3480 + }, + { + "epoch": 2.939951050145142, + "grad_norm": 0.5356625318527222, + "learning_rate": 0.0008920792079207921, + "loss": 1.7625, + "step": 3490 + }, + { + "epoch": 2.9483749786555866, + "grad_norm": 0.396090030670166, + "learning_rate": 0.0008910891089108911, + "loss": 1.7597, + "step": 3500 + }, + { + "epoch": 2.956798907166031, + "grad_norm": 0.3071458041667938, + "learning_rate": 0.0008900990099009902, + "loss": 1.7513, + "step": 3510 + }, + { + "epoch": 2.956798907166031, + "eval_accuracy": 0.640630813225039, + "eval_loss": 1.6351577043533325, + "eval_runtime": 887.1061, + "eval_samples_per_second": 562.927, + "eval_steps_per_second": 5.212, + "step": 3510 + }, + { + "epoch": 2.9652228356764754, + "grad_norm": 0.7265316247940063, + "learning_rate": 0.0008891089108910891, + "loss": 1.7482, + "step": 3520 + }, + { + "epoch": 2.97364676418692, + "grad_norm": 0.34152501821517944, + "learning_rate": 0.0008881188118811882, + "loss": 1.7454, + "step": 3530 + }, + { + "epoch": 2.9820706926973646, + "grad_norm": 0.5570985078811646, + "learning_rate": 0.0008871287128712871, + "loss": 1.736, + "step": 3540 + }, + { + "epoch": 2.990494621207809, + "grad_norm": 0.29268133640289307, + "learning_rate": 0.0008861386138613862, + "loss": 1.7323, + "step": 3550 + }, + { + "epoch": 2.998918549718254, + "grad_norm": 0.4475082755088806, + "learning_rate": 0.0008851485148514852, + "loss": 1.7207, + "step": 3560 + }, + { + "epoch": 3.0073424782286984, + "grad_norm": 0.39963921904563904, + "learning_rate": 0.0008841584158415842, + "loss": 1.7199, + "step": 3570 + }, + { + "epoch": 3.015766406739143, + "grad_norm": 0.3290662169456482, + "learning_rate": 0.0008831683168316832, + "loss": 1.7103, + "step": 3580 + }, + { + "epoch": 3.024190335249587, + "grad_norm": 0.4892579913139343, + "learning_rate": 0.0008821782178217822, + "loss": 1.7024, + "step": 3590 + }, + { + "epoch": 3.0326142637600317, + "grad_norm": 0.45102205872535706, + "learning_rate": 0.0008811881188118812, + "loss": 1.7012, + "step": 3600 + }, + { + "epoch": 3.0326142637600317, + "eval_accuracy": 0.65292687328356, + "eval_loss": 1.578561544418335, + "eval_runtime": 889.1801, + "eval_samples_per_second": 561.614, + "eval_steps_per_second": 5.2, + "step": 3600 + }, + { + "epoch": 3.0410381922704763, + "grad_norm": 0.38877975940704346, + "learning_rate": 0.0008801980198019803, + "loss": 1.6999, + "step": 3610 + }, + { + "epoch": 3.049462120780921, + "grad_norm": 0.32052722573280334, + "learning_rate": 0.0008792079207920792, + "loss": 1.6898, + "step": 3620 + }, + { + "epoch": 3.0578860492913655, + "grad_norm": 0.4076586365699768, + "learning_rate": 0.0008782178217821783, + "loss": 1.682, + "step": 3630 + }, + { + "epoch": 3.06630997780181, + "grad_norm": 0.3886164724826813, + "learning_rate": 0.0008772277227722772, + "loss": 1.6788, + "step": 3640 + }, + { + "epoch": 3.0747339063122547, + "grad_norm": 0.43478402495384216, + "learning_rate": 0.0008762376237623763, + "loss": 1.6757, + "step": 3650 + }, + { + "epoch": 3.083157834822699, + "grad_norm": 0.3681798279285431, + "learning_rate": 0.0008752475247524753, + "loss": 1.6725, + "step": 3660 + }, + { + "epoch": 3.0915817633331435, + "grad_norm": 0.44459056854248047, + "learning_rate": 0.0008742574257425743, + "loss": 1.6653, + "step": 3670 + }, + { + "epoch": 3.100005691843588, + "grad_norm": 0.3404163420200348, + "learning_rate": 0.0008732673267326733, + "loss": 1.6597, + "step": 3680 + }, + { + "epoch": 3.1084296203540327, + "grad_norm": 0.39622583985328674, + "learning_rate": 0.0008722772277227722, + "loss": 1.664, + "step": 3690 + }, + { + "epoch": 3.1084296203540327, + "eval_accuracy": 0.6616252383451875, + "eval_loss": 1.5378377437591553, + "eval_runtime": 880.004, + "eval_samples_per_second": 567.47, + "eval_steps_per_second": 5.255, + "step": 3690 + }, + { + "epoch": 3.1168535488644773, + "grad_norm": 0.36066505312919617, + "learning_rate": 0.0008712871287128713, + "loss": 1.6552, + "step": 3700 + }, + { + "epoch": 3.125277477374922, + "grad_norm": 0.45852380990982056, + "learning_rate": 0.0008702970297029704, + "loss": 1.6581, + "step": 3710 + }, + { + "epoch": 3.1337014058853665, + "grad_norm": 0.3647266924381256, + "learning_rate": 0.0008693069306930693, + "loss": 1.6493, + "step": 3720 + }, + { + "epoch": 3.1421253343958107, + "grad_norm": 0.4774695038795471, + "learning_rate": 0.0008683168316831684, + "loss": 1.6457, + "step": 3730 + }, + { + "epoch": 3.1505492629062553, + "grad_norm": 0.4143640398979187, + "learning_rate": 0.0008673267326732673, + "loss": 1.6436, + "step": 3740 + }, + { + "epoch": 3.1589731914167, + "grad_norm": 0.4920789897441864, + "learning_rate": 0.0008663366336633663, + "loss": 1.6431, + "step": 3750 + }, + { + "epoch": 3.1673971199271445, + "grad_norm": 0.40231600403785706, + "learning_rate": 0.0008653465346534654, + "loss": 1.6373, + "step": 3760 + }, + { + "epoch": 3.175821048437589, + "grad_norm": 0.35115131735801697, + "learning_rate": 0.0008643564356435643, + "loss": 1.6343, + "step": 3770 + }, + { + "epoch": 3.1842449769480337, + "grad_norm": 0.3814195990562439, + "learning_rate": 0.0008633663366336634, + "loss": 1.6345, + "step": 3780 + }, + { + "epoch": 3.1842449769480337, + "eval_accuracy": 0.6669776046149977, + "eval_loss": 1.5131778717041016, + "eval_runtime": 887.9268, + "eval_samples_per_second": 562.407, + "eval_steps_per_second": 5.208, + "step": 3780 + }, + { + "epoch": 3.192668905458478, + "grad_norm": 0.3229101896286011, + "learning_rate": 0.0008623762376237623, + "loss": 1.6281, + "step": 3790 + }, + { + "epoch": 3.2010928339689224, + "grad_norm": 0.4361475110054016, + "learning_rate": 0.0008613861386138614, + "loss": 1.6253, + "step": 3800 + }, + { + "epoch": 3.209516762479367, + "grad_norm": 0.3246362507343292, + "learning_rate": 0.0008603960396039604, + "loss": 1.6269, + "step": 3810 + }, + { + "epoch": 3.2179406909898116, + "grad_norm": 0.5126762390136719, + "learning_rate": 0.0008594059405940594, + "loss": 1.62, + "step": 3820 + }, + { + "epoch": 3.226364619500256, + "grad_norm": 0.3813638389110565, + "learning_rate": 0.0008584158415841584, + "loss": 1.6228, + "step": 3830 + }, + { + "epoch": 3.234788548010701, + "grad_norm": 0.5111351013183594, + "learning_rate": 0.0008574257425742574, + "loss": 1.6162, + "step": 3840 + }, + { + "epoch": 3.243212476521145, + "grad_norm": 0.3448195457458496, + "learning_rate": 0.0008564356435643564, + "loss": 1.6156, + "step": 3850 + }, + { + "epoch": 3.2516364050315896, + "grad_norm": 0.50129634141922, + "learning_rate": 0.0008554455445544555, + "loss": 1.6153, + "step": 3860 + }, + { + "epoch": 3.260060333542034, + "grad_norm": 0.3352351188659668, + "learning_rate": 0.0008544554455445544, + "loss": 1.6117, + "step": 3870 + }, + { + "epoch": 3.260060333542034, + "eval_accuracy": 0.6717362607348063, + "eval_loss": 1.4890562295913696, + "eval_runtime": 886.1465, + "eval_samples_per_second": 563.537, + "eval_steps_per_second": 5.218, + "step": 3870 + }, + { + "epoch": 3.2684842620524788, + "grad_norm": 0.38713541626930237, + "learning_rate": 0.0008534653465346535, + "loss": 1.6058, + "step": 3880 + }, + { + "epoch": 3.2769081905629234, + "grad_norm": 0.46299123764038086, + "learning_rate": 0.0008524752475247524, + "loss": 1.6053, + "step": 3890 + }, + { + "epoch": 3.285332119073368, + "grad_norm": 0.4045964181423187, + "learning_rate": 0.0008514851485148515, + "loss": 1.6064, + "step": 3900 + }, + { + "epoch": 3.2937560475838126, + "grad_norm": 0.37616729736328125, + "learning_rate": 0.0008504950495049505, + "loss": 1.6005, + "step": 3910 + }, + { + "epoch": 3.3021799760942567, + "grad_norm": 0.47833314538002014, + "learning_rate": 0.0008495049504950495, + "loss": 1.599, + "step": 3920 + }, + { + "epoch": 3.3106039046047013, + "grad_norm": 0.436625212430954, + "learning_rate": 0.0008485148514851485, + "loss": 1.5954, + "step": 3930 + }, + { + "epoch": 3.319027833115146, + "grad_norm": 0.3456842005252838, + "learning_rate": 0.0008475247524752475, + "loss": 1.5924, + "step": 3940 + }, + { + "epoch": 3.3274517616255905, + "grad_norm": 0.5403941869735718, + "learning_rate": 0.0008465346534653465, + "loss": 1.5915, + "step": 3950 + }, + { + "epoch": 3.335875690136035, + "grad_norm": 0.3622403144836426, + "learning_rate": 0.0008455445544554456, + "loss": 1.6013, + "step": 3960 + }, + { + "epoch": 3.335875690136035, + "eval_accuracy": 0.6740560565861919, + "eval_loss": 1.475487232208252, + "eval_runtime": 895.3114, + "eval_samples_per_second": 557.768, + "eval_steps_per_second": 5.165, + "step": 3960 + }, + { + "epoch": 3.3442996186464797, + "grad_norm": 0.2850242555141449, + "learning_rate": 0.0008445544554455445, + "loss": 1.5903, + "step": 3970 + }, + { + "epoch": 3.3527235471569243, + "grad_norm": 0.39831429719924927, + "learning_rate": 0.0008435643564356436, + "loss": 1.5846, + "step": 3980 + }, + { + "epoch": 3.3611474756673685, + "grad_norm": 0.4886794686317444, + "learning_rate": 0.0008425742574257425, + "loss": 1.5876, + "step": 3990 + }, + { + "epoch": 3.369571404177813, + "grad_norm": 0.35439977049827576, + "learning_rate": 0.0008415841584158416, + "loss": 1.5839, + "step": 4000 + }, + { + "epoch": 3.3779953326882577, + "grad_norm": 0.32369595766067505, + "learning_rate": 0.0008405940594059406, + "loss": 1.5797, + "step": 4010 + }, + { + "epoch": 3.3864192611987023, + "grad_norm": 0.48595139384269714, + "learning_rate": 0.0008396039603960396, + "loss": 1.58, + "step": 4020 + }, + { + "epoch": 3.394843189709147, + "grad_norm": 0.39331361651420593, + "learning_rate": 0.0008386138613861386, + "loss": 1.5786, + "step": 4030 + }, + { + "epoch": 3.4032671182195915, + "grad_norm": 0.31911513209342957, + "learning_rate": 0.0008376237623762376, + "loss": 1.5745, + "step": 4040 + }, + { + "epoch": 3.411691046730036, + "grad_norm": 0.319876104593277, + "learning_rate": 0.0008366336633663366, + "loss": 1.5749, + "step": 4050 + }, + { + "epoch": 3.411691046730036, + "eval_accuracy": 0.6780886041474171, + "eval_loss": 1.4578139781951904, + "eval_runtime": 880.4333, + "eval_samples_per_second": 567.193, + "eval_steps_per_second": 5.252, + "step": 4050 + }, + { + "epoch": 3.4201149752404802, + "grad_norm": 0.45969948172569275, + "learning_rate": 0.0008356435643564357, + "loss": 1.5759, + "step": 4060 + }, + { + "epoch": 3.428538903750925, + "grad_norm": 0.34449151158332825, + "learning_rate": 0.0008346534653465346, + "loss": 1.5707, + "step": 4070 + }, + { + "epoch": 3.4369628322613694, + "grad_norm": 0.3478371202945709, + "learning_rate": 0.0008336633663366337, + "loss": 1.5699, + "step": 4080 + }, + { + "epoch": 3.445386760771814, + "grad_norm": 0.5127679109573364, + "learning_rate": 0.0008326732673267326, + "loss": 1.5668, + "step": 4090 + }, + { + "epoch": 3.4538106892822587, + "grad_norm": 0.302216500043869, + "learning_rate": 0.0008316831683168317, + "loss": 1.5647, + "step": 4100 + }, + { + "epoch": 3.4622346177927033, + "grad_norm": 0.3295814096927643, + "learning_rate": 0.0008306930693069307, + "loss": 1.5628, + "step": 4110 + }, + { + "epoch": 3.4706585463031474, + "grad_norm": 0.4209032654762268, + "learning_rate": 0.0008297029702970297, + "loss": 1.5628, + "step": 4120 + }, + { + "epoch": 3.479082474813592, + "grad_norm": 0.34786614775657654, + "learning_rate": 0.0008287128712871287, + "loss": 1.5613, + "step": 4130 + }, + { + "epoch": 3.4875064033240366, + "grad_norm": 0.4870763421058655, + "learning_rate": 0.0008277227722772277, + "loss": 1.5584, + "step": 4140 + }, + { + "epoch": 3.4875064033240366, + "eval_accuracy": 0.6804383346028876, + "eval_loss": 1.4444972276687622, + "eval_runtime": 891.9286, + "eval_samples_per_second": 559.883, + "eval_steps_per_second": 5.184, + "step": 4140 + }, + { + "epoch": 3.495930331834481, + "grad_norm": 0.31641605496406555, + "learning_rate": 0.0008267326732673267, + "loss": 1.5581, + "step": 4150 + }, + { + "epoch": 3.504354260344926, + "grad_norm": 0.31303870677948, + "learning_rate": 0.0008257425742574258, + "loss": 1.5548, + "step": 4160 + }, + { + "epoch": 3.5127781888553704, + "grad_norm": 0.35413628816604614, + "learning_rate": 0.0008247524752475247, + "loss": 1.5506, + "step": 4170 + }, + { + "epoch": 3.5212021173658146, + "grad_norm": 0.39600226283073425, + "learning_rate": 0.0008237623762376238, + "loss": 1.5517, + "step": 4180 + }, + { + "epoch": 3.529626045876259, + "grad_norm": 0.3600960075855255, + "learning_rate": 0.0008227722772277227, + "loss": 1.5563, + "step": 4190 + }, + { + "epoch": 3.5380499743867038, + "grad_norm": 0.2877024710178375, + "learning_rate": 0.0008217821782178218, + "loss": 1.5467, + "step": 4200 + }, + { + "epoch": 3.5464739028971484, + "grad_norm": 0.42324578762054443, + "learning_rate": 0.0008207920792079208, + "loss": 1.546, + "step": 4210 + }, + { + "epoch": 3.554897831407593, + "grad_norm": 0.38907232880592346, + "learning_rate": 0.0008198019801980197, + "loss": 1.5458, + "step": 4220 + }, + { + "epoch": 3.5633217599180376, + "grad_norm": 0.34750425815582275, + "learning_rate": 0.0008188118811881188, + "loss": 1.5437, + "step": 4230 + }, + { + "epoch": 3.5633217599180376, + "eval_accuracy": 0.6840987986477044, + "eval_loss": 1.4261698722839355, + "eval_runtime": 886.2695, + "eval_samples_per_second": 563.458, + "eval_steps_per_second": 5.217, + "step": 4230 + }, + { + "epoch": 3.571745688428482, + "grad_norm": 0.3718611001968384, + "learning_rate": 0.0008178217821782177, + "loss": 1.546, + "step": 4240 + }, + { + "epoch": 3.5801696169389263, + "grad_norm": 0.39119917154312134, + "learning_rate": 0.0008168316831683168, + "loss": 1.5411, + "step": 4250 + }, + { + "epoch": 3.588593545449371, + "grad_norm": 0.45689284801483154, + "learning_rate": 0.0008158415841584159, + "loss": 1.5416, + "step": 4260 + }, + { + "epoch": 3.5970174739598155, + "grad_norm": 0.4029008150100708, + "learning_rate": 0.0008148514851485148, + "loss": 1.5364, + "step": 4270 + }, + { + "epoch": 3.60544140247026, + "grad_norm": 0.3843879997730255, + "learning_rate": 0.0008138613861386138, + "loss": 1.5368, + "step": 4280 + }, + { + "epoch": 3.6138653309807047, + "grad_norm": 0.33945897221565247, + "learning_rate": 0.0008128712871287128, + "loss": 1.5369, + "step": 4290 + }, + { + "epoch": 3.6222892594911493, + "grad_norm": 0.29753997921943665, + "learning_rate": 0.000811881188118812, + "loss": 1.5326, + "step": 4300 + }, + { + "epoch": 3.630713188001594, + "grad_norm": 0.4412858784198761, + "learning_rate": 0.000810891089108911, + "loss": 1.5316, + "step": 4310 + }, + { + "epoch": 3.639137116512038, + "grad_norm": 0.30377647280693054, + "learning_rate": 0.00080990099009901, + "loss": 1.5308, + "step": 4320 + }, + { + "epoch": 3.639137116512038, + "eval_accuracy": 0.6865785598346558, + "eval_loss": 1.4111888408660889, + "eval_runtime": 880.9823, + "eval_samples_per_second": 566.84, + "eval_steps_per_second": 5.249, + "step": 4320 + }, + { + "epoch": 3.6475610450224827, + "grad_norm": 0.3666999638080597, + "learning_rate": 0.000808910891089109, + "loss": 1.5279, + "step": 4330 + }, + { + "epoch": 3.6559849735329273, + "grad_norm": 0.3254301846027374, + "learning_rate": 0.0008079207920792079, + "loss": 1.5277, + "step": 4340 + }, + { + "epoch": 3.664408902043372, + "grad_norm": 0.4963987469673157, + "learning_rate": 0.000806930693069307, + "loss": 1.5286, + "step": 4350 + }, + { + "epoch": 3.6728328305538165, + "grad_norm": 0.34190070629119873, + "learning_rate": 0.000805940594059406, + "loss": 1.5294, + "step": 4360 + }, + { + "epoch": 3.6812567590642606, + "grad_norm": 0.35153254866600037, + "learning_rate": 0.000804950495049505, + "loss": 1.5217, + "step": 4370 + }, + { + "epoch": 3.6896806875747057, + "grad_norm": 0.345929354429245, + "learning_rate": 0.000803960396039604, + "loss": 1.52, + "step": 4380 + }, + { + "epoch": 3.69810461608515, + "grad_norm": 0.37540799379348755, + "learning_rate": 0.000802970297029703, + "loss": 1.5208, + "step": 4390 + }, + { + "epoch": 3.7065285445955944, + "grad_norm": 0.33499011397361755, + "learning_rate": 0.000801980198019802, + "loss": 1.5196, + "step": 4400 + }, + { + "epoch": 3.714952473106039, + "grad_norm": 0.3461949825286865, + "learning_rate": 0.0008009900990099011, + "loss": 1.5188, + "step": 4410 + }, + { + "epoch": 3.714952473106039, + "eval_accuracy": 0.6888913088166951, + "eval_loss": 1.40292227268219, + "eval_runtime": 882.772, + "eval_samples_per_second": 565.691, + "eval_steps_per_second": 5.238, + "step": 4410 + }, + { + "epoch": 3.7233764016164836, + "grad_norm": 0.36491358280181885, + "learning_rate": 0.0008, + "loss": 1.5171, + "step": 4420 + }, + { + "epoch": 3.7318003301269282, + "grad_norm": 0.2799367606639862, + "learning_rate": 0.0007990099009900991, + "loss": 1.5142, + "step": 4430 + }, + { + "epoch": 3.7402242586373724, + "grad_norm": 0.361971914768219, + "learning_rate": 0.000798019801980198, + "loss": 1.5145, + "step": 4440 + }, + { + "epoch": 3.7486481871478174, + "grad_norm": 0.2618056535720825, + "learning_rate": 0.0007970297029702971, + "loss": 1.5113, + "step": 4450 + }, + { + "epoch": 3.7570721156582616, + "grad_norm": 0.5228148698806763, + "learning_rate": 0.0007960396039603961, + "loss": 1.5111, + "step": 4460 + }, + { + "epoch": 3.765496044168706, + "grad_norm": 0.37740132212638855, + "learning_rate": 0.0007950495049504951, + "loss": 1.5121, + "step": 4470 + }, + { + "epoch": 3.773919972679151, + "grad_norm": 0.3701629340648651, + "learning_rate": 0.0007940594059405941, + "loss": 1.5083, + "step": 4480 + }, + { + "epoch": 3.7823439011895954, + "grad_norm": 0.3345108926296234, + "learning_rate": 0.0007930693069306931, + "loss": 1.5077, + "step": 4490 + }, + { + "epoch": 3.79076782970004, + "grad_norm": 0.3989773988723755, + "learning_rate": 0.0007920792079207921, + "loss": 1.5079, + "step": 4500 + }, + { + "epoch": 3.79076782970004, + "eval_accuracy": 0.6907081981543249, + "eval_loss": 1.3909889459609985, + "eval_runtime": 889.7203, + "eval_samples_per_second": 561.273, + "eval_steps_per_second": 5.197, + "step": 4500 + }, + { + "epoch": 3.799191758210484, + "grad_norm": 0.284728080034256, + "learning_rate": 0.0007910891089108912, + "loss": 1.5046, + "step": 4510 + }, + { + "epoch": 3.8076156867209288, + "grad_norm": 0.5029779672622681, + "learning_rate": 0.0007900990099009901, + "loss": 1.5049, + "step": 4520 + }, + { + "epoch": 3.8160396152313734, + "grad_norm": 0.32617345452308655, + "learning_rate": 0.0007891089108910892, + "loss": 1.5068, + "step": 4530 + }, + { + "epoch": 3.824463543741818, + "grad_norm": 0.36316540837287903, + "learning_rate": 0.0007881188118811881, + "loss": 1.4999, + "step": 4540 + }, + { + "epoch": 3.8328874722522626, + "grad_norm": 0.30240392684936523, + "learning_rate": 0.0007871287128712872, + "loss": 1.498, + "step": 4550 + }, + { + "epoch": 3.841311400762707, + "grad_norm": 0.3905390202999115, + "learning_rate": 0.0007861386138613862, + "loss": 1.4978, + "step": 4560 + }, + { + "epoch": 3.8497353292731518, + "grad_norm": 0.30473875999450684, + "learning_rate": 0.0007851485148514852, + "loss": 1.4965, + "step": 4570 + }, + { + "epoch": 3.858159257783596, + "grad_norm": 0.3675777316093445, + "learning_rate": 0.0007841584158415842, + "loss": 1.4957, + "step": 4580 + }, + { + "epoch": 3.8665831862940405, + "grad_norm": 0.394168883562088, + "learning_rate": 0.0007831683168316832, + "loss": 1.4936, + "step": 4590 + }, + { + "epoch": 3.8665831862940405, + "eval_accuracy": 0.6926193728848408, + "eval_loss": 1.3844850063323975, + "eval_runtime": 887.3028, + "eval_samples_per_second": 562.802, + "eval_steps_per_second": 5.211, + "step": 4590 + }, + { + "epoch": 3.875007114804485, + "grad_norm": 0.3404500186443329, + "learning_rate": 0.0007821782178217822, + "loss": 1.4956, + "step": 4600 + }, + { + "epoch": 3.8834310433149297, + "grad_norm": 0.3074527978897095, + "learning_rate": 0.0007811881188118813, + "loss": 1.4928, + "step": 4610 + }, + { + "epoch": 3.8918549718253743, + "grad_norm": 0.44941094517707825, + "learning_rate": 0.0007801980198019802, + "loss": 1.4911, + "step": 4620 + }, + { + "epoch": 3.900278900335819, + "grad_norm": 0.3098917603492737, + "learning_rate": 0.0007792079207920793, + "loss": 1.4918, + "step": 4630 + }, + { + "epoch": 3.9087028288462635, + "grad_norm": 0.37436243891716003, + "learning_rate": 0.0007782178217821782, + "loss": 1.4866, + "step": 4640 + }, + { + "epoch": 3.9171267573567077, + "grad_norm": 0.3058597445487976, + "learning_rate": 0.0007772277227722773, + "loss": 1.4896, + "step": 4650 + }, + { + "epoch": 3.9255506858671523, + "grad_norm": 0.34245744347572327, + "learning_rate": 0.0007762376237623763, + "loss": 1.4874, + "step": 4660 + }, + { + "epoch": 3.933974614377597, + "grad_norm": 0.3401254117488861, + "learning_rate": 0.0007752475247524753, + "loss": 1.4866, + "step": 4670 + }, + { + "epoch": 3.9423985428880415, + "grad_norm": 0.35778889060020447, + "learning_rate": 0.0007742574257425743, + "loss": 1.4818, + "step": 4680 + }, + { + "epoch": 3.9423985428880415, + "eval_accuracy": 0.6951155140000936, + "eval_loss": 1.3689333200454712, + "eval_runtime": 879.8095, + "eval_samples_per_second": 567.596, + "eval_steps_per_second": 5.256, + "step": 4680 + }, + { + "epoch": 3.950822471398486, + "grad_norm": 0.2895776927471161, + "learning_rate": 0.0007732673267326733, + "loss": 1.4822, + "step": 4690 + }, + { + "epoch": 3.9592463999089302, + "grad_norm": 0.3483330309391022, + "learning_rate": 0.0007722772277227723, + "loss": 1.4802, + "step": 4700 + }, + { + "epoch": 3.9676703284193753, + "grad_norm": 0.30115026235580444, + "learning_rate": 0.0007712871287128714, + "loss": 1.4838, + "step": 4710 + }, + { + "epoch": 3.9760942569298194, + "grad_norm": 0.32046666741371155, + "learning_rate": 0.0007702970297029703, + "loss": 1.4799, + "step": 4720 + }, + { + "epoch": 3.984518185440264, + "grad_norm": 0.3833225965499878, + "learning_rate": 0.0007693069306930694, + "loss": 1.4785, + "step": 4730 + }, + { + "epoch": 3.9929421139507086, + "grad_norm": 0.30888909101486206, + "learning_rate": 0.0007683168316831683, + "loss": 1.475, + "step": 4740 + }, + { + "epoch": 4.001366042461153, + "grad_norm": 0.32462459802627563, + "learning_rate": 0.0007673267326732674, + "loss": 1.4746, + "step": 4750 + }, + { + "epoch": 4.009789970971598, + "grad_norm": 0.3200187683105469, + "learning_rate": 0.0007663366336633664, + "loss": 1.4768, + "step": 4760 + }, + { + "epoch": 4.018213899482042, + "grad_norm": 0.3794704079627991, + "learning_rate": 0.0007653465346534654, + "loss": 1.4761, + "step": 4770 + }, + { + "epoch": 4.018213899482042, + "eval_accuracy": 0.6969660848927619, + "eval_loss": 1.3595411777496338, + "eval_runtime": 887.2228, + "eval_samples_per_second": 562.853, + "eval_steps_per_second": 5.212, + "step": 4770 + }, + { + "epoch": 4.026637827992487, + "grad_norm": 0.27933019399642944, + "learning_rate": 0.0007643564356435644, + "loss": 1.47, + "step": 4780 + }, + { + "epoch": 4.035061756502931, + "grad_norm": 0.32542508840560913, + "learning_rate": 0.0007633663366336634, + "loss": 1.4726, + "step": 4790 + }, + { + "epoch": 4.043485685013376, + "grad_norm": 0.3638169765472412, + "learning_rate": 0.0007623762376237624, + "loss": 1.4697, + "step": 4800 + }, + { + "epoch": 4.05190961352382, + "grad_norm": 0.3762564957141876, + "learning_rate": 0.0007613861386138615, + "loss": 1.4663, + "step": 4810 + }, + { + "epoch": 4.0603335420342646, + "grad_norm": 0.36758995056152344, + "learning_rate": 0.0007603960396039604, + "loss": 1.4729, + "step": 4820 + }, + { + "epoch": 4.06875747054471, + "grad_norm": 0.34590932726860046, + "learning_rate": 0.0007594059405940595, + "loss": 1.4665, + "step": 4830 + }, + { + "epoch": 4.077181399055154, + "grad_norm": 0.3242778182029724, + "learning_rate": 0.0007584158415841584, + "loss": 1.4639, + "step": 4840 + }, + { + "epoch": 4.085605327565599, + "grad_norm": 0.3849882185459137, + "learning_rate": 0.0007574257425742574, + "loss": 1.4613, + "step": 4850 + }, + { + "epoch": 4.094029256076043, + "grad_norm": 0.3495323061943054, + "learning_rate": 0.0007564356435643565, + "loss": 1.4598, + "step": 4860 + }, + { + "epoch": 4.094029256076043, + "eval_accuracy": 0.6996214986490302, + "eval_loss": 1.3455697298049927, + "eval_runtime": 887.3091, + "eval_samples_per_second": 562.798, + "eval_steps_per_second": 5.211, + "step": 4860 + }, + { + "epoch": 4.102453184586488, + "grad_norm": 0.3290145993232727, + "learning_rate": 0.0007554455445544554, + "loss": 1.4601, + "step": 4870 + }, + { + "epoch": 4.110877113096932, + "grad_norm": 0.34369096159935, + "learning_rate": 0.0007544554455445545, + "loss": 1.4603, + "step": 4880 + }, + { + "epoch": 4.119301041607376, + "grad_norm": 0.3350279629230499, + "learning_rate": 0.0007534653465346534, + "loss": 1.4609, + "step": 4890 + }, + { + "epoch": 4.127724970117821, + "grad_norm": 0.2575846016407013, + "learning_rate": 0.0007524752475247525, + "loss": 1.4565, + "step": 4900 + }, + { + "epoch": 4.1361488986282655, + "grad_norm": 0.3337861895561218, + "learning_rate": 0.0007514851485148515, + "loss": 1.4574, + "step": 4910 + }, + { + "epoch": 4.144572827138711, + "grad_norm": 0.3752147853374481, + "learning_rate": 0.0007504950495049505, + "loss": 1.4594, + "step": 4920 + }, + { + "epoch": 4.152996755649155, + "grad_norm": 0.29587122797966003, + "learning_rate": 0.0007495049504950495, + "loss": 1.4518, + "step": 4930 + }, + { + "epoch": 4.161420684159599, + "grad_norm": 0.2764742374420166, + "learning_rate": 0.0007485148514851485, + "loss": 1.4514, + "step": 4940 + }, + { + "epoch": 4.169844612670044, + "grad_norm": 0.4625591039657593, + "learning_rate": 0.0007475247524752475, + "loss": 1.4527, + "step": 4950 + }, + { + "epoch": 4.169844612670044, + "eval_accuracy": 0.701515475804278, + "eval_loss": 1.3361947536468506, + "eval_runtime": 883.9818, + "eval_samples_per_second": 564.917, + "eval_steps_per_second": 5.231, + "step": 4950 + }, + { + "epoch": 4.178268541180488, + "grad_norm": 0.29412004351615906, + "learning_rate": 0.0007465346534653466, + "loss": 1.4514, + "step": 4960 + }, + { + "epoch": 4.186692469690933, + "grad_norm": 0.3580242693424225, + "learning_rate": 0.0007455445544554455, + "loss": 1.4486, + "step": 4970 + }, + { + "epoch": 4.195116398201377, + "grad_norm": 0.46256908774375916, + "learning_rate": 0.0007445544554455446, + "loss": 1.4494, + "step": 4980 + }, + { + "epoch": 4.203540326711822, + "grad_norm": 0.3117842674255371, + "learning_rate": 0.0007435643564356435, + "loss": 1.4486, + "step": 4990 + }, + { + "epoch": 4.2119642552222665, + "grad_norm": 0.3382858335971832, + "learning_rate": 0.0007425742574257426, + "loss": 1.4452, + "step": 5000 + }, + { + "epoch": 4.220388183732711, + "grad_norm": 0.3153148889541626, + "learning_rate": 0.0007415841584158416, + "loss": 1.4465, + "step": 5010 + }, + { + "epoch": 4.228812112243156, + "grad_norm": 0.3635173439979553, + "learning_rate": 0.0007405940594059406, + "loss": 1.4443, + "step": 5020 + }, + { + "epoch": 4.2372360407536, + "grad_norm": 0.4260285794734955, + "learning_rate": 0.0007396039603960396, + "loss": 1.4454, + "step": 5030 + }, + { + "epoch": 4.245659969264045, + "grad_norm": 0.29188039898872375, + "learning_rate": 0.0007386138613861386, + "loss": 1.4442, + "step": 5040 + }, + { + "epoch": 4.245659969264045, + "eval_accuracy": 0.7031089800515327, + "eval_loss": 1.3285191059112549, + "eval_runtime": 890.9721, + "eval_samples_per_second": 560.484, + "eval_steps_per_second": 5.19, + "step": 5040 + }, + { + "epoch": 4.254083897774489, + "grad_norm": 0.5350555777549744, + "learning_rate": 0.0007376237623762376, + "loss": 1.4416, + "step": 5050 + }, + { + "epoch": 4.262507826284934, + "grad_norm": 0.35281315445899963, + "learning_rate": 0.0007366336633663367, + "loss": 1.4432, + "step": 5060 + }, + { + "epoch": 4.270931754795378, + "grad_norm": 0.37922871112823486, + "learning_rate": 0.0007356435643564356, + "loss": 1.4399, + "step": 5070 + }, + { + "epoch": 4.279355683305822, + "grad_norm": 0.3072182238101959, + "learning_rate": 0.0007346534653465347, + "loss": 1.4383, + "step": 5080 + }, + { + "epoch": 4.287779611816267, + "grad_norm": 0.30223241448402405, + "learning_rate": 0.0007336633663366336, + "loss": 1.4406, + "step": 5090 + }, + { + "epoch": 4.296203540326712, + "grad_norm": 0.5292770862579346, + "learning_rate": 0.0007326732673267327, + "loss": 1.4376, + "step": 5100 + }, + { + "epoch": 4.304627468837157, + "grad_norm": 0.35330840945243835, + "learning_rate": 0.0007316831683168317, + "loss": 1.4389, + "step": 5110 + }, + { + "epoch": 4.313051397347601, + "grad_norm": 0.30719104409217834, + "learning_rate": 0.0007306930693069307, + "loss": 1.4384, + "step": 5120 + }, + { + "epoch": 4.321475325858046, + "grad_norm": 0.34203872084617615, + "learning_rate": 0.0007297029702970297, + "loss": 1.4374, + "step": 5130 + }, + { + "epoch": 4.321475325858046, + "eval_accuracy": 0.7048288335521147, + "eval_loss": 1.3187906742095947, + "eval_runtime": 887.0787, + "eval_samples_per_second": 562.944, + "eval_steps_per_second": 5.213, + "step": 5130 + }, + { + "epoch": 4.32989925436849, + "grad_norm": 0.38140207529067993, + "learning_rate": 0.0007287128712871287, + "loss": 1.4353, + "step": 5140 + }, + { + "epoch": 4.338323182878934, + "grad_norm": 0.303752064704895, + "learning_rate": 0.0007277227722772277, + "loss": 1.4336, + "step": 5150 + }, + { + "epoch": 4.346747111389379, + "grad_norm": 0.290764719247818, + "learning_rate": 0.0007267326732673268, + "loss": 1.4304, + "step": 5160 + }, + { + "epoch": 4.355171039899823, + "grad_norm": 0.4335167407989502, + "learning_rate": 0.0007257425742574257, + "loss": 1.4327, + "step": 5170 + }, + { + "epoch": 4.363594968410268, + "grad_norm": 0.3198365271091461, + "learning_rate": 0.0007247524752475248, + "loss": 1.4319, + "step": 5180 + }, + { + "epoch": 4.3720188969207125, + "grad_norm": 0.41567763686180115, + "learning_rate": 0.0007237623762376237, + "loss": 1.4318, + "step": 5190 + }, + { + "epoch": 4.380442825431157, + "grad_norm": 0.3342703580856323, + "learning_rate": 0.0007227722772277228, + "loss": 1.4298, + "step": 5200 + }, + { + "epoch": 4.388866753941602, + "grad_norm": 0.25702279806137085, + "learning_rate": 0.0007217821782178218, + "loss": 1.4265, + "step": 5210 + }, + { + "epoch": 4.397290682452046, + "grad_norm": 0.26949411630630493, + "learning_rate": 0.0007207920792079208, + "loss": 1.4278, + "step": 5220 + }, + { + "epoch": 4.397290682452046, + "eval_accuracy": 0.7063243134470976, + "eval_loss": 1.3113943338394165, + "eval_runtime": 889.8031, + "eval_samples_per_second": 561.221, + "eval_steps_per_second": 5.197, + "step": 5220 + }, + { + "epoch": 4.405714610962491, + "grad_norm": 0.3861467242240906, + "learning_rate": 0.0007198019801980198, + "loss": 1.4318, + "step": 5230 + }, + { + "epoch": 4.414138539472935, + "grad_norm": 0.34858283400535583, + "learning_rate": 0.0007188118811881188, + "loss": 1.4291, + "step": 5240 + }, + { + "epoch": 4.42256246798338, + "grad_norm": 0.3346785604953766, + "learning_rate": 0.0007178217821782178, + "loss": 1.425, + "step": 5250 + }, + { + "epoch": 4.430986396493824, + "grad_norm": 0.3916323184967041, + "learning_rate": 0.0007168316831683169, + "loss": 1.4241, + "step": 5260 + }, + { + "epoch": 4.439410325004269, + "grad_norm": 0.2802947759628296, + "learning_rate": 0.0007158415841584158, + "loss": 1.4221, + "step": 5270 + }, + { + "epoch": 4.4478342535147135, + "grad_norm": 0.4092938303947449, + "learning_rate": 0.0007148514851485149, + "loss": 1.4236, + "step": 5280 + }, + { + "epoch": 4.456258182025158, + "grad_norm": 0.25096723437309265, + "learning_rate": 0.0007138613861386138, + "loss": 1.4235, + "step": 5290 + }, + { + "epoch": 4.464682110535603, + "grad_norm": 0.3570871949195862, + "learning_rate": 0.0007128712871287129, + "loss": 1.4216, + "step": 5300 + }, + { + "epoch": 4.473106039046047, + "grad_norm": 0.3168172240257263, + "learning_rate": 0.0007118811881188119, + "loss": 1.4236, + "step": 5310 + }, + { + "epoch": 4.473106039046047, + "eval_accuracy": 0.7076842136916008, + "eval_loss": 1.307774543762207, + "eval_runtime": 889.4836, + "eval_samples_per_second": 561.422, + "eval_steps_per_second": 5.199, + "step": 5310 + }, + { + "epoch": 4.481529967556492, + "grad_norm": 0.30059170722961426, + "learning_rate": 0.0007108910891089109, + "loss": 1.4193, + "step": 5320 + }, + { + "epoch": 4.489953896066936, + "grad_norm": 0.331824392080307, + "learning_rate": 0.0007099009900990099, + "loss": 1.4185, + "step": 5330 + }, + { + "epoch": 4.49837782457738, + "grad_norm": 0.3295821249485016, + "learning_rate": 0.0007089108910891088, + "loss": 1.4198, + "step": 5340 + }, + { + "epoch": 4.506801753087825, + "grad_norm": 0.3506734371185303, + "learning_rate": 0.0007079207920792079, + "loss": 1.4167, + "step": 5350 + }, + { + "epoch": 4.515225681598269, + "grad_norm": 0.3836129903793335, + "learning_rate": 0.000706930693069307, + "loss": 1.417, + "step": 5360 + }, + { + "epoch": 4.5236496101087145, + "grad_norm": 0.3046220541000366, + "learning_rate": 0.0007059405940594059, + "loss": 1.4177, + "step": 5370 + }, + { + "epoch": 4.532073538619159, + "grad_norm": 0.37655332684516907, + "learning_rate": 0.000704950495049505, + "loss": 1.4149, + "step": 5380 + }, + { + "epoch": 4.540497467129603, + "grad_norm": 0.32939672470092773, + "learning_rate": 0.0007039603960396039, + "loss": 1.4165, + "step": 5390 + }, + { + "epoch": 4.548921395640048, + "grad_norm": 0.2900882363319397, + "learning_rate": 0.0007029702970297029, + "loss": 1.4128, + "step": 5400 + }, + { + "epoch": 4.548921395640048, + "eval_accuracy": 0.7087959913049944, + "eval_loss": 1.3013147115707397, + "eval_runtime": 892.9333, + "eval_samples_per_second": 559.253, + "eval_steps_per_second": 5.178, + "step": 5400 + }, + { + "epoch": 4.557345324150492, + "grad_norm": 0.27651771903038025, + "learning_rate": 0.000701980198019802, + "loss": 1.4122, + "step": 5410 + }, + { + "epoch": 4.565769252660937, + "grad_norm": 0.4160715639591217, + "learning_rate": 0.0007009900990099009, + "loss": 1.4122, + "step": 5420 + }, + { + "epoch": 4.574193181171381, + "grad_norm": 0.2724072337150574, + "learning_rate": 0.0007, + "loss": 1.41, + "step": 5430 + }, + { + "epoch": 4.582617109681826, + "grad_norm": 0.35586145520210266, + "learning_rate": 0.0006990099009900989, + "loss": 1.4118, + "step": 5440 + }, + { + "epoch": 4.59104103819227, + "grad_norm": 0.3268265128135681, + "learning_rate": 0.000698019801980198, + "loss": 1.4117, + "step": 5450 + }, + { + "epoch": 4.599464966702715, + "grad_norm": 0.3230002522468567, + "learning_rate": 0.000697029702970297, + "loss": 1.4102, + "step": 5460 + }, + { + "epoch": 4.60788889521316, + "grad_norm": 0.25019174814224243, + "learning_rate": 0.000696039603960396, + "loss": 1.4102, + "step": 5470 + }, + { + "epoch": 4.616312823723604, + "grad_norm": 0.38475289940834045, + "learning_rate": 0.000695049504950495, + "loss": 1.4075, + "step": 5480 + }, + { + "epoch": 4.624736752234049, + "grad_norm": 0.39824309945106506, + "learning_rate": 0.000694059405940594, + "loss": 1.4077, + "step": 5490 + }, + { + "epoch": 4.624736752234049, + "eval_accuracy": 0.7098417264518991, + "eval_loss": 1.2926928997039795, + "eval_runtime": 881.9048, + "eval_samples_per_second": 566.247, + "eval_steps_per_second": 5.243, + "step": 5490 + }, + { + "epoch": 4.633160680744493, + "grad_norm": 0.3250022828578949, + "learning_rate": 0.000693069306930693, + "loss": 1.4068, + "step": 5500 + }, + { + "epoch": 4.641584609254938, + "grad_norm": 0.32388612627983093, + "learning_rate": 0.0006920792079207921, + "loss": 1.4062, + "step": 5510 + }, + { + "epoch": 4.650008537765382, + "grad_norm": 0.2806077003479004, + "learning_rate": 0.000691089108910891, + "loss": 1.4049, + "step": 5520 + }, + { + "epoch": 4.658432466275826, + "grad_norm": 0.33755025267601013, + "learning_rate": 0.0006900990099009901, + "loss": 1.4045, + "step": 5530 + }, + { + "epoch": 4.666856394786271, + "grad_norm": 0.4184636175632477, + "learning_rate": 0.000689108910891089, + "loss": 1.4042, + "step": 5540 + }, + { + "epoch": 4.6752803232967155, + "grad_norm": 0.34234240651130676, + "learning_rate": 0.0006881188118811881, + "loss": 1.4055, + "step": 5550 + }, + { + "epoch": 4.6837042518071605, + "grad_norm": 0.32120293378829956, + "learning_rate": 0.0006871287128712872, + "loss": 1.4014, + "step": 5560 + }, + { + "epoch": 4.692128180317605, + "grad_norm": 0.3810026943683624, + "learning_rate": 0.0006861386138613862, + "loss": 1.4039, + "step": 5570 + }, + { + "epoch": 4.70055210882805, + "grad_norm": 0.3171080946922302, + "learning_rate": 0.0006851485148514852, + "loss": 1.4025, + "step": 5580 + }, + { + "epoch": 4.70055210882805, + "eval_accuracy": 0.7115425686273988, + "eval_loss": 1.285227656364441, + "eval_runtime": 891.3368, + "eval_samples_per_second": 560.255, + "eval_steps_per_second": 5.188, + "step": 5580 + }, + { + "epoch": 4.708976037338494, + "grad_norm": 0.24618960916996002, + "learning_rate": 0.0006841584158415842, + "loss": 1.3983, + "step": 5590 + }, + { + "epoch": 4.717399965848939, + "grad_norm": 0.494895339012146, + "learning_rate": 0.0006831683168316832, + "loss": 1.4, + "step": 5600 + }, + { + "epoch": 4.725823894359383, + "grad_norm": 0.31908226013183594, + "learning_rate": 0.0006821782178217823, + "loss": 1.3983, + "step": 5610 + }, + { + "epoch": 4.734247822869827, + "grad_norm": 0.26488983631134033, + "learning_rate": 0.0006811881188118812, + "loss": 1.3956, + "step": 5620 + }, + { + "epoch": 4.742671751380272, + "grad_norm": 0.3156343102455139, + "learning_rate": 0.0006801980198019803, + "loss": 1.397, + "step": 5630 + }, + { + "epoch": 4.7510956798907165, + "grad_norm": 0.38938194513320923, + "learning_rate": 0.0006792079207920792, + "loss": 1.3987, + "step": 5640 + }, + { + "epoch": 4.7595196084011615, + "grad_norm": 0.27233967185020447, + "learning_rate": 0.0006782178217821783, + "loss": 1.3983, + "step": 5650 + }, + { + "epoch": 4.767943536911606, + "grad_norm": 0.347419410943985, + "learning_rate": 0.0006772277227722773, + "loss": 1.3953, + "step": 5660 + }, + { + "epoch": 4.77636746542205, + "grad_norm": 0.44131675362586975, + "learning_rate": 0.0006762376237623763, + "loss": 1.3956, + "step": 5670 + }, + { + "epoch": 4.77636746542205, + "eval_accuracy": 0.7112416746447588, + "eval_loss": 1.290834665298462, + "eval_runtime": 886.5668, + "eval_samples_per_second": 563.269, + "eval_steps_per_second": 5.216, + "step": 5670 + }, + { + "epoch": 4.784791393932495, + "grad_norm": 0.3185184895992279, + "learning_rate": 0.0006752475247524753, + "loss": 1.3976, + "step": 5680 + }, + { + "epoch": 4.793215322442939, + "grad_norm": 0.2549585998058319, + "learning_rate": 0.0006742574257425743, + "loss": 1.3931, + "step": 5690 + }, + { + "epoch": 4.801639250953384, + "grad_norm": 0.315294086933136, + "learning_rate": 0.0006732673267326733, + "loss": 1.393, + "step": 5700 + }, + { + "epoch": 4.810063179463828, + "grad_norm": 0.3866962492465973, + "learning_rate": 0.0006722772277227724, + "loss": 1.3923, + "step": 5710 + }, + { + "epoch": 4.818487107974272, + "grad_norm": 0.28364527225494385, + "learning_rate": 0.0006712871287128713, + "loss": 1.3924, + "step": 5720 + }, + { + "epoch": 4.826911036484717, + "grad_norm": 0.3253314793109894, + "learning_rate": 0.0006702970297029704, + "loss": 1.3914, + "step": 5730 + }, + { + "epoch": 4.835334964995162, + "grad_norm": 0.31215131282806396, + "learning_rate": 0.0006693069306930693, + "loss": 1.3903, + "step": 5740 + }, + { + "epoch": 4.843758893505607, + "grad_norm": 0.34929993748664856, + "learning_rate": 0.0006683168316831684, + "loss": 1.3894, + "step": 5750 + }, + { + "epoch": 4.852182822016051, + "grad_norm": 0.38991761207580566, + "learning_rate": 0.0006673267326732674, + "loss": 1.3924, + "step": 5760 + }, + { + "epoch": 4.852182822016051, + "eval_accuracy": 0.7133021748514282, + "eval_loss": 1.2766938209533691, + "eval_runtime": 881.7452, + "eval_samples_per_second": 566.35, + "eval_steps_per_second": 5.244, + "step": 5760 + }, + { + "epoch": 4.860606750526496, + "grad_norm": 0.2888573408126831, + "learning_rate": 0.0006663366336633664, + "loss": 1.3918, + "step": 5770 + }, + { + "epoch": 4.86903067903694, + "grad_norm": 0.3224232494831085, + "learning_rate": 0.0006653465346534654, + "loss": 1.3895, + "step": 5780 + }, + { + "epoch": 4.877454607547385, + "grad_norm": 0.3562750518321991, + "learning_rate": 0.0006643564356435644, + "loss": 1.387, + "step": 5790 + }, + { + "epoch": 4.885878536057829, + "grad_norm": 0.3339401185512543, + "learning_rate": 0.0006633663366336634, + "loss": 1.3886, + "step": 5800 + }, + { + "epoch": 4.894302464568273, + "grad_norm": 0.3022938072681427, + "learning_rate": 0.0006623762376237625, + "loss": 1.3858, + "step": 5810 + }, + { + "epoch": 4.902726393078718, + "grad_norm": 0.276065856218338, + "learning_rate": 0.0006613861386138614, + "loss": 1.386, + "step": 5820 + }, + { + "epoch": 4.9111503215891625, + "grad_norm": 0.3148975372314453, + "learning_rate": 0.0006603960396039605, + "loss": 1.385, + "step": 5830 + }, + { + "epoch": 4.919574250099608, + "grad_norm": 0.3374193608760834, + "learning_rate": 0.0006594059405940594, + "loss": 1.3842, + "step": 5840 + }, + { + "epoch": 4.927998178610052, + "grad_norm": 0.3293200135231018, + "learning_rate": 0.0006584158415841585, + "loss": 1.3835, + "step": 5850 + }, + { + "epoch": 4.927998178610052, + "eval_accuracy": 0.7147221912687882, + "eval_loss": 1.2681052684783936, + "eval_runtime": 890.793, + "eval_samples_per_second": 560.597, + "eval_steps_per_second": 5.191, + "step": 5850 + }, + { + "epoch": 4.936422107120496, + "grad_norm": 0.3032568693161011, + "learning_rate": 0.0006574257425742575, + "loss": 1.3828, + "step": 5860 + }, + { + "epoch": 4.944846035630941, + "grad_norm": 0.24251434206962585, + "learning_rate": 0.0006564356435643565, + "loss": 1.3818, + "step": 5870 + }, + { + "epoch": 4.953269964141385, + "grad_norm": 0.3096301257610321, + "learning_rate": 0.0006554455445544555, + "loss": 1.3814, + "step": 5880 + }, + { + "epoch": 4.96169389265183, + "grad_norm": 0.34841156005859375, + "learning_rate": 0.0006544554455445545, + "loss": 1.3823, + "step": 5890 + }, + { + "epoch": 4.970117821162274, + "grad_norm": 0.312688946723938, + "learning_rate": 0.0006534653465346535, + "loss": 1.3818, + "step": 5900 + }, + { + "epoch": 4.978541749672719, + "grad_norm": 0.30799320340156555, + "learning_rate": 0.0006524752475247526, + "loss": 1.379, + "step": 5910 + }, + { + "epoch": 4.9869656781831635, + "grad_norm": 0.3510371148586273, + "learning_rate": 0.0006514851485148515, + "loss": 1.3814, + "step": 5920 + }, + { + "epoch": 4.9953896066936085, + "grad_norm": 0.2894381582736969, + "learning_rate": 0.0006504950495049506, + "loss": 1.3812, + "step": 5930 + }, + { + "epoch": 5.003813535204053, + "grad_norm": 0.2685450315475464, + "learning_rate": 0.0006495049504950495, + "loss": 1.3788, + "step": 5940 + }, + { + "epoch": 5.003813535204053, + "eval_accuracy": 0.7160080315056353, + "eval_loss": 1.2630343437194824, + "eval_runtime": 883.8805, + "eval_samples_per_second": 564.981, + "eval_steps_per_second": 5.231, + "step": 5940 + }, + { + "epoch": 5.012237463714497, + "grad_norm": 0.38857927918434143, + "learning_rate": 0.0006485148514851485, + "loss": 1.3809, + "step": 5950 + }, + { + "epoch": 5.020661392224942, + "grad_norm": 0.2822309136390686, + "learning_rate": 0.0006475247524752476, + "loss": 1.3769, + "step": 5960 + }, + { + "epoch": 5.029085320735386, + "grad_norm": 0.2725491523742676, + "learning_rate": 0.0006465346534653465, + "loss": 1.3762, + "step": 5970 + }, + { + "epoch": 5.037509249245831, + "grad_norm": 0.32517486810684204, + "learning_rate": 0.0006455445544554456, + "loss": 1.377, + "step": 5980 + }, + { + "epoch": 5.045933177756275, + "grad_norm": 0.34373360872268677, + "learning_rate": 0.0006445544554455445, + "loss": 1.3774, + "step": 5990 + }, + { + "epoch": 5.054357106266719, + "grad_norm": 0.3029853403568268, + "learning_rate": 0.0006435643564356436, + "loss": 1.3746, + "step": 6000 + }, + { + "epoch": 5.0627810347771645, + "grad_norm": 0.5577653646469116, + "learning_rate": 0.0006425742574257426, + "loss": 1.378, + "step": 6010 + }, + { + "epoch": 5.071204963287609, + "grad_norm": 0.27967342734336853, + "learning_rate": 0.0006415841584158416, + "loss": 1.3779, + "step": 6020 + }, + { + "epoch": 5.079628891798054, + "grad_norm": 0.2680428624153137, + "learning_rate": 0.0006405940594059406, + "loss": 1.3733, + "step": 6030 + }, + { + "epoch": 5.079628891798054, + "eval_accuracy": 0.7168763989390342, + "eval_loss": 1.258245825767517, + "eval_runtime": 902.3568, + "eval_samples_per_second": 553.413, + "eval_steps_per_second": 5.124, + "step": 6030 + }, + { + "epoch": 5.088052820308498, + "grad_norm": 0.24522745609283447, + "learning_rate": 0.0006396039603960396, + "loss": 1.3692, + "step": 6040 + }, + { + "epoch": 5.096476748818943, + "grad_norm": 0.3076081871986389, + "learning_rate": 0.0006386138613861386, + "loss": 1.3724, + "step": 6050 + }, + { + "epoch": 5.104900677329387, + "grad_norm": 0.32096347212791443, + "learning_rate": 0.0006376237623762377, + "loss": 1.3737, + "step": 6060 + }, + { + "epoch": 5.113324605839831, + "grad_norm": 0.35196197032928467, + "learning_rate": 0.0006366336633663366, + "loss": 1.3719, + "step": 6070 + }, + { + "epoch": 5.121748534350276, + "grad_norm": 0.39065635204315186, + "learning_rate": 0.0006356435643564357, + "loss": 1.3719, + "step": 6080 + }, + { + "epoch": 5.13017246286072, + "grad_norm": 0.3439326882362366, + "learning_rate": 0.0006346534653465346, + "loss": 1.3749, + "step": 6090 + }, + { + "epoch": 5.138596391371165, + "grad_norm": 0.3175961673259735, + "learning_rate": 0.0006336633663366337, + "loss": 1.3679, + "step": 6100 + }, + { + "epoch": 5.14702031988161, + "grad_norm": 0.37071719765663147, + "learning_rate": 0.0006326732673267327, + "loss": 1.3706, + "step": 6110 + }, + { + "epoch": 5.155444248392055, + "grad_norm": 0.2499271035194397, + "learning_rate": 0.0006316831683168317, + "loss": 1.3685, + "step": 6120 + }, + { + "epoch": 5.155444248392055, + "eval_accuracy": 0.717981203712741, + "eval_loss": 1.2521748542785645, + "eval_runtime": 885.5528, + "eval_samples_per_second": 563.914, + "eval_steps_per_second": 5.222, + "step": 6120 + }, + { + "epoch": 5.163868176902499, + "grad_norm": 0.3951607346534729, + "learning_rate": 0.0006306930693069307, + "loss": 1.3671, + "step": 6130 + }, + { + "epoch": 5.172292105412943, + "grad_norm": 0.4264112114906311, + "learning_rate": 0.0006297029702970297, + "loss": 1.3652, + "step": 6140 + }, + { + "epoch": 5.180716033923388, + "grad_norm": 0.3097785711288452, + "learning_rate": 0.0006287128712871287, + "loss": 1.3695, + "step": 6150 + }, + { + "epoch": 5.189139962433832, + "grad_norm": 0.28887125849723816, + "learning_rate": 0.0006277227722772278, + "loss": 1.3658, + "step": 6160 + }, + { + "epoch": 5.197563890944277, + "grad_norm": 0.27163591980934143, + "learning_rate": 0.0006267326732673267, + "loss": 1.3655, + "step": 6170 + }, + { + "epoch": 5.205987819454721, + "grad_norm": 0.30266183614730835, + "learning_rate": 0.0006257425742574258, + "loss": 1.3631, + "step": 6180 + }, + { + "epoch": 5.2144117479651655, + "grad_norm": 0.3191784620285034, + "learning_rate": 0.0006247524752475247, + "loss": 1.3667, + "step": 6190 + }, + { + "epoch": 5.2228356764756105, + "grad_norm": 0.30907300114631653, + "learning_rate": 0.0006237623762376238, + "loss": 1.3667, + "step": 6200 + }, + { + "epoch": 5.231259604986055, + "grad_norm": 0.3120558559894562, + "learning_rate": 0.0006227722772277228, + "loss": 1.3638, + "step": 6210 + }, + { + "epoch": 5.231259604986055, + "eval_accuracy": 0.7190249020483522, + "eval_loss": 1.2470471858978271, + "eval_runtime": 893.7706, + "eval_samples_per_second": 558.73, + "eval_steps_per_second": 5.174, + "step": 6210 + }, + { + "epoch": 5.2396835334965, + "grad_norm": 0.35595396161079407, + "learning_rate": 0.0006217821782178218, + "loss": 1.3634, + "step": 6220 + }, + { + "epoch": 5.248107462006944, + "grad_norm": 0.33759573101997375, + "learning_rate": 0.0006207920792079208, + "loss": 1.3661, + "step": 6230 + }, + { + "epoch": 5.256531390517389, + "grad_norm": 0.26417672634124756, + "learning_rate": 0.0006198019801980198, + "loss": 1.3627, + "step": 6240 + }, + { + "epoch": 5.264955319027833, + "grad_norm": 0.28236111998558044, + "learning_rate": 0.0006188118811881188, + "loss": 1.362, + "step": 6250 + }, + { + "epoch": 5.273379247538277, + "grad_norm": 0.5903481245040894, + "learning_rate": 0.0006178217821782179, + "loss": 1.3619, + "step": 6260 + }, + { + "epoch": 5.281803176048722, + "grad_norm": 0.298475056886673, + "learning_rate": 0.0006168316831683168, + "loss": 1.3671, + "step": 6270 + }, + { + "epoch": 5.2902271045591664, + "grad_norm": 0.27397215366363525, + "learning_rate": 0.0006158415841584159, + "loss": 1.3611, + "step": 6280 + }, + { + "epoch": 5.2986510330696115, + "grad_norm": 0.28740593791007996, + "learning_rate": 0.0006148514851485148, + "loss": 1.3579, + "step": 6290 + }, + { + "epoch": 5.307074961580056, + "grad_norm": 0.274557888507843, + "learning_rate": 0.0006138613861386139, + "loss": 1.3587, + "step": 6300 + }, + { + "epoch": 5.307074961580056, + "eval_accuracy": 0.719703789624826, + "eval_loss": 1.2432972192764282, + "eval_runtime": 881.2394, + "eval_samples_per_second": 566.675, + "eval_steps_per_second": 5.247, + "step": 6300 + }, + { + "epoch": 5.315498890090501, + "grad_norm": 0.31431418657302856, + "learning_rate": 0.0006128712871287129, + "loss": 1.3565, + "step": 6310 + }, + { + "epoch": 5.323922818600945, + "grad_norm": 0.358239084482193, + "learning_rate": 0.0006118811881188119, + "loss": 1.3614, + "step": 6320 + }, + { + "epoch": 5.332346747111389, + "grad_norm": 0.3043140769004822, + "learning_rate": 0.0006108910891089109, + "loss": 1.3576, + "step": 6330 + }, + { + "epoch": 5.340770675621834, + "grad_norm": 0.2583385109901428, + "learning_rate": 0.0006099009900990099, + "loss": 1.3578, + "step": 6340 + }, + { + "epoch": 5.349194604132278, + "grad_norm": 0.3068407475948334, + "learning_rate": 0.0006089108910891089, + "loss": 1.3577, + "step": 6350 + }, + { + "epoch": 5.357618532642723, + "grad_norm": 0.2893878221511841, + "learning_rate": 0.000607920792079208, + "loss": 1.3569, + "step": 6360 + }, + { + "epoch": 5.366042461153167, + "grad_norm": 0.2883850634098053, + "learning_rate": 0.0006069306930693069, + "loss": 1.3555, + "step": 6370 + }, + { + "epoch": 5.3744663896636125, + "grad_norm": 0.3248838484287262, + "learning_rate": 0.000605940594059406, + "loss": 1.3561, + "step": 6380 + }, + { + "epoch": 5.382890318174057, + "grad_norm": 0.29167214035987854, + "learning_rate": 0.0006049504950495049, + "loss": 1.3582, + "step": 6390 + }, + { + "epoch": 5.382890318174057, + "eval_accuracy": 0.7203339064191229, + "eval_loss": 1.241172432899475, + "eval_runtime": 891.2006, + "eval_samples_per_second": 560.341, + "eval_steps_per_second": 5.189, + "step": 6390 + }, + { + "epoch": 5.391314246684501, + "grad_norm": 0.3090030550956726, + "learning_rate": 0.000603960396039604, + "loss": 1.3534, + "step": 6400 + }, + { + "epoch": 5.399738175194946, + "grad_norm": 0.25337210297584534, + "learning_rate": 0.000602970297029703, + "loss": 1.3564, + "step": 6410 + }, + { + "epoch": 5.40816210370539, + "grad_norm": 0.25656768679618835, + "learning_rate": 0.000601980198019802, + "loss": 1.3549, + "step": 6420 + }, + { + "epoch": 5.416586032215835, + "grad_norm": 0.2951459288597107, + "learning_rate": 0.000600990099009901, + "loss": 1.3518, + "step": 6430 + }, + { + "epoch": 5.425009960726279, + "grad_norm": 0.2697450816631317, + "learning_rate": 0.0006, + "loss": 1.3531, + "step": 6440 + }, + { + "epoch": 5.433433889236724, + "grad_norm": 0.28866857290267944, + "learning_rate": 0.000599009900990099, + "loss": 1.3524, + "step": 6450 + }, + { + "epoch": 5.441857817747168, + "grad_norm": 0.26775673031806946, + "learning_rate": 0.000598019801980198, + "loss": 1.3505, + "step": 6460 + }, + { + "epoch": 5.4502817462576125, + "grad_norm": 0.3911271393299103, + "learning_rate": 0.000597029702970297, + "loss": 1.3516, + "step": 6470 + }, + { + "epoch": 5.458705674768058, + "grad_norm": 0.3151527941226959, + "learning_rate": 0.000596039603960396, + "loss": 1.353, + "step": 6480 + }, + { + "epoch": 5.458705674768058, + "eval_accuracy": 0.7213715986510872, + "eval_loss": 1.2357591390609741, + "eval_runtime": 888.8097, + "eval_samples_per_second": 561.848, + "eval_steps_per_second": 5.202, + "step": 6480 + }, + { + "epoch": 5.467129603278502, + "grad_norm": 0.32286888360977173, + "learning_rate": 0.000595049504950495, + "loss": 1.3527, + "step": 6490 + }, + { + "epoch": 5.475553531788947, + "grad_norm": 0.3933228850364685, + "learning_rate": 0.000594059405940594, + "loss": 1.3511, + "step": 6500 + }, + { + "epoch": 5.483977460299391, + "grad_norm": 0.3246067762374878, + "learning_rate": 0.0005930693069306931, + "loss": 1.3524, + "step": 6510 + }, + { + "epoch": 5.492401388809835, + "grad_norm": 0.2912397086620331, + "learning_rate": 0.000592079207920792, + "loss": 1.3495, + "step": 6520 + }, + { + "epoch": 5.50082531732028, + "grad_norm": 0.3058258891105652, + "learning_rate": 0.0005910891089108911, + "loss": 1.3486, + "step": 6530 + }, + { + "epoch": 5.509249245830724, + "grad_norm": 0.310024231672287, + "learning_rate": 0.00059009900990099, + "loss": 1.3507, + "step": 6540 + }, + { + "epoch": 5.517673174341169, + "grad_norm": 0.289165198802948, + "learning_rate": 0.0005891089108910891, + "loss": 1.3475, + "step": 6550 + }, + { + "epoch": 5.5260971028516135, + "grad_norm": 0.324613094329834, + "learning_rate": 0.0005881188118811881, + "loss": 1.3489, + "step": 6560 + }, + { + "epoch": 5.5345210313620585, + "grad_norm": 0.3530217111110687, + "learning_rate": 0.0005871287128712871, + "loss": 1.3477, + "step": 6570 + }, + { + "epoch": 5.5345210313620585, + "eval_accuracy": 0.722217175302605, + "eval_loss": 1.2293946743011475, + "eval_runtime": 881.4092, + "eval_samples_per_second": 566.565, + "eval_steps_per_second": 5.246, + "step": 6570 + }, + { + "epoch": 5.542944959872503, + "grad_norm": 0.3527272045612335, + "learning_rate": 0.0005861386138613861, + "loss": 1.3447, + "step": 6580 + }, + { + "epoch": 5.551368888382948, + "grad_norm": 0.26519855856895447, + "learning_rate": 0.0005851485148514851, + "loss": 1.346, + "step": 6590 + }, + { + "epoch": 5.559792816893392, + "grad_norm": 0.29473376274108887, + "learning_rate": 0.0005841584158415841, + "loss": 1.3461, + "step": 6600 + }, + { + "epoch": 5.568216745403836, + "grad_norm": 0.31212469935417175, + "learning_rate": 0.0005831683168316832, + "loss": 1.3454, + "step": 6610 + }, + { + "epoch": 5.576640673914281, + "grad_norm": 0.2541083097457886, + "learning_rate": 0.0005821782178217821, + "loss": 1.3451, + "step": 6620 + }, + { + "epoch": 5.585064602424725, + "grad_norm": 0.28075823187828064, + "learning_rate": 0.0005811881188118812, + "loss": 1.3417, + "step": 6630 + }, + { + "epoch": 5.59348853093517, + "grad_norm": 0.286945641040802, + "learning_rate": 0.0005801980198019801, + "loss": 1.3439, + "step": 6640 + }, + { + "epoch": 5.6019124594456144, + "grad_norm": 0.2825601100921631, + "learning_rate": 0.0005792079207920792, + "loss": 1.3447, + "step": 6650 + }, + { + "epoch": 5.610336387956059, + "grad_norm": 0.3023243844509125, + "learning_rate": 0.0005782178217821782, + "loss": 1.3428, + "step": 6660 + }, + { + "epoch": 5.610336387956059, + "eval_accuracy": 0.7226627197479346, + "eval_loss": 1.2287484407424927, + "eval_runtime": 893.8585, + "eval_samples_per_second": 558.675, + "eval_steps_per_second": 5.173, + "step": 6660 + }, + { + "epoch": 5.618760316466504, + "grad_norm": 0.2548897862434387, + "learning_rate": 0.0005772277227722772, + "loss": 1.3441, + "step": 6670 + }, + { + "epoch": 5.627184244976948, + "grad_norm": 0.28277119994163513, + "learning_rate": 0.0005762376237623762, + "loss": 1.3421, + "step": 6680 + }, + { + "epoch": 5.635608173487393, + "grad_norm": 0.35963568091392517, + "learning_rate": 0.0005752475247524752, + "loss": 1.3421, + "step": 6690 + }, + { + "epoch": 5.644032101997837, + "grad_norm": 0.2753046452999115, + "learning_rate": 0.0005742574257425742, + "loss": 1.3449, + "step": 6700 + }, + { + "epoch": 5.652456030508281, + "grad_norm": 0.31272053718566895, + "learning_rate": 0.0005732673267326733, + "loss": 1.3418, + "step": 6710 + }, + { + "epoch": 5.660879959018726, + "grad_norm": 0.24427007138729095, + "learning_rate": 0.0005722772277227722, + "loss": 1.3409, + "step": 6720 + }, + { + "epoch": 5.66930388752917, + "grad_norm": 0.4038189649581909, + "learning_rate": 0.0005712871287128713, + "loss": 1.3387, + "step": 6730 + }, + { + "epoch": 5.677727816039615, + "grad_norm": 0.30009007453918457, + "learning_rate": 0.0005702970297029702, + "loss": 1.3425, + "step": 6740 + }, + { + "epoch": 5.68615174455006, + "grad_norm": 0.2813461720943451, + "learning_rate": 0.0005693069306930693, + "loss": 1.3396, + "step": 6750 + }, + { + "epoch": 5.68615174455006, + "eval_accuracy": 0.7239226758241876, + "eval_loss": 1.2240657806396484, + "eval_runtime": 898.7215, + "eval_samples_per_second": 555.652, + "eval_steps_per_second": 5.145, + "step": 6750 + }, + { + "epoch": 5.694575673060505, + "grad_norm": 0.4396764039993286, + "learning_rate": 0.0005683168316831683, + "loss": 1.3408, + "step": 6760 + }, + { + "epoch": 5.702999601570949, + "grad_norm": 0.2992042899131775, + "learning_rate": 0.0005673267326732673, + "loss": 1.3408, + "step": 6770 + }, + { + "epoch": 5.711423530081394, + "grad_norm": 0.2579440474510193, + "learning_rate": 0.0005663366336633663, + "loss": 1.3369, + "step": 6780 + }, + { + "epoch": 5.719847458591838, + "grad_norm": 0.32076653838157654, + "learning_rate": 0.0005653465346534653, + "loss": 1.3365, + "step": 6790 + }, + { + "epoch": 5.728271387102282, + "grad_norm": 0.3180268108844757, + "learning_rate": 0.0005643564356435643, + "loss": 1.339, + "step": 6800 + }, + { + "epoch": 5.736695315612727, + "grad_norm": 0.27663713693618774, + "learning_rate": 0.0005633663366336634, + "loss": 1.3373, + "step": 6810 + }, + { + "epoch": 5.745119244123171, + "grad_norm": 0.27103811502456665, + "learning_rate": 0.0005623762376237624, + "loss": 1.3332, + "step": 6820 + }, + { + "epoch": 5.753543172633616, + "grad_norm": 0.34022676944732666, + "learning_rate": 0.0005613861386138615, + "loss": 1.3373, + "step": 6830 + }, + { + "epoch": 5.7619671011440605, + "grad_norm": 0.36838725209236145, + "learning_rate": 0.0005603960396039604, + "loss": 1.3384, + "step": 6840 + }, + { + "epoch": 5.7619671011440605, + "eval_accuracy": 0.7243312842270887, + "eval_loss": 1.221815586090088, + "eval_runtime": 891.7897, + "eval_samples_per_second": 559.971, + "eval_steps_per_second": 5.185, + "step": 6840 + }, + { + "epoch": 5.770391029654505, + "grad_norm": 0.2968374490737915, + "learning_rate": 0.0005594059405940595, + "loss": 1.3353, + "step": 6850 + }, + { + "epoch": 5.77881495816495, + "grad_norm": 0.36536258459091187, + "learning_rate": 0.0005584158415841585, + "loss": 1.3331, + "step": 6860 + }, + { + "epoch": 5.787238886675394, + "grad_norm": 0.2985541522502899, + "learning_rate": 0.0005574257425742575, + "loss": 1.3313, + "step": 6870 + }, + { + "epoch": 5.795662815185839, + "grad_norm": 0.33506348729133606, + "learning_rate": 0.0005564356435643565, + "loss": 1.3349, + "step": 6880 + }, + { + "epoch": 5.804086743696283, + "grad_norm": 0.31232866644859314, + "learning_rate": 0.0005554455445544555, + "loss": 1.3335, + "step": 6890 + }, + { + "epoch": 5.812510672206728, + "grad_norm": 0.27576977014541626, + "learning_rate": 0.0005544554455445545, + "loss": 1.3309, + "step": 6900 + }, + { + "epoch": 5.820934600717172, + "grad_norm": 0.2526339590549469, + "learning_rate": 0.0005534653465346536, + "loss": 1.3318, + "step": 6910 + }, + { + "epoch": 5.829358529227616, + "grad_norm": 0.25774866342544556, + "learning_rate": 0.0005524752475247525, + "loss": 1.3329, + "step": 6920 + }, + { + "epoch": 5.8377824577380615, + "grad_norm": 0.34311917424201965, + "learning_rate": 0.0005514851485148516, + "loss": 1.3334, + "step": 6930 + }, + { + "epoch": 5.8377824577380615, + "eval_accuracy": 0.7251374384748042, + "eval_loss": 1.216299057006836, + "eval_runtime": 889.6984, + "eval_samples_per_second": 561.287, + "eval_steps_per_second": 5.197, + "step": 6930 + }, + { + "epoch": 5.846206386248506, + "grad_norm": 0.32087624073028564, + "learning_rate": 0.0005504950495049505, + "loss": 1.3338, + "step": 6940 + }, + { + "epoch": 5.854630314758951, + "grad_norm": 0.25447556376457214, + "learning_rate": 0.0005495049504950496, + "loss": 1.3315, + "step": 6950 + }, + { + "epoch": 5.863054243269395, + "grad_norm": 0.285826712846756, + "learning_rate": 0.0005485148514851486, + "loss": 1.3303, + "step": 6960 + }, + { + "epoch": 5.87147817177984, + "grad_norm": 0.2816094756126404, + "learning_rate": 0.0005475247524752476, + "loss": 1.3308, + "step": 6970 + }, + { + "epoch": 5.879902100290284, + "grad_norm": 0.30444055795669556, + "learning_rate": 0.0005465346534653466, + "loss": 1.3303, + "step": 6980 + }, + { + "epoch": 5.888326028800728, + "grad_norm": 0.3512563705444336, + "learning_rate": 0.0005455445544554456, + "loss": 1.3305, + "step": 6990 + }, + { + "epoch": 5.896749957311173, + "grad_norm": 0.2924775779247284, + "learning_rate": 0.0005445544554455446, + "loss": 1.3307, + "step": 7000 + }, + { + "epoch": 5.905173885821617, + "grad_norm": 0.3497087359428406, + "learning_rate": 0.0005435643564356437, + "loss": 1.3295, + "step": 7010 + }, + { + "epoch": 5.913597814332062, + "grad_norm": 0.2714064419269562, + "learning_rate": 0.0005425742574257426, + "loss": 1.329, + "step": 7020 + }, + { + "epoch": 5.913597814332062, + "eval_accuracy": 0.7261800107692413, + "eval_loss": 1.2115275859832764, + "eval_runtime": 893.0627, + "eval_samples_per_second": 559.172, + "eval_steps_per_second": 5.178, + "step": 7020 + }, + { + "epoch": 5.922021742842507, + "grad_norm": 0.277203232049942, + "learning_rate": 0.0005415841584158417, + "loss": 1.3269, + "step": 7030 + }, + { + "epoch": 5.930445671352951, + "grad_norm": 0.3769485354423523, + "learning_rate": 0.0005405940594059406, + "loss": 1.3268, + "step": 7040 + }, + { + "epoch": 5.938869599863396, + "grad_norm": 0.2526576817035675, + "learning_rate": 0.0005396039603960396, + "loss": 1.3262, + "step": 7050 + }, + { + "epoch": 5.94729352837384, + "grad_norm": 0.2670144736766815, + "learning_rate": 0.0005386138613861387, + "loss": 1.327, + "step": 7060 + }, + { + "epoch": 5.955717456884285, + "grad_norm": 0.26662877202033997, + "learning_rate": 0.0005376237623762376, + "loss": 1.3277, + "step": 7070 + }, + { + "epoch": 5.964141385394729, + "grad_norm": 0.3263689875602722, + "learning_rate": 0.0005366336633663367, + "loss": 1.3271, + "step": 7080 + }, + { + "epoch": 5.972565313905174, + "grad_norm": 0.26732614636421204, + "learning_rate": 0.0005356435643564356, + "loss": 1.3264, + "step": 7090 + }, + { + "epoch": 5.980989242415618, + "grad_norm": 0.3332139551639557, + "learning_rate": 0.0005346534653465347, + "loss": 1.3266, + "step": 7100 + }, + { + "epoch": 5.989413170926063, + "grad_norm": 0.3081839680671692, + "learning_rate": 0.0005336633663366337, + "loss": 1.325, + "step": 7110 + }, + { + "epoch": 5.989413170926063, + "eval_accuracy": 0.7263082386708871, + "eval_loss": 1.2105002403259277, + "eval_runtime": 893.0055, + "eval_samples_per_second": 559.208, + "eval_steps_per_second": 5.178, + "step": 7110 + }, + { + "epoch": 5.997837099436508, + "grad_norm": 0.2502419650554657, + "learning_rate": 0.0005326732673267327, + "loss": 1.3263, + "step": 7120 + }, + { + "epoch": 6.006261027946952, + "grad_norm": 0.2437312752008438, + "learning_rate": 0.0005316831683168317, + "loss": 1.3225, + "step": 7130 + }, + { + "epoch": 6.014684956457397, + "grad_norm": 0.3372795581817627, + "learning_rate": 0.0005306930693069307, + "loss": 1.3234, + "step": 7140 + }, + { + "epoch": 6.023108884967841, + "grad_norm": 0.2895912826061249, + "learning_rate": 0.0005297029702970297, + "loss": 1.3252, + "step": 7150 + }, + { + "epoch": 6.031532813478286, + "grad_norm": 0.28451213240623474, + "learning_rate": 0.0005287128712871288, + "loss": 1.3238, + "step": 7160 + }, + { + "epoch": 6.03995674198873, + "grad_norm": 0.2496078759431839, + "learning_rate": 0.0005277227722772277, + "loss": 1.323, + "step": 7170 + }, + { + "epoch": 6.048380670499174, + "grad_norm": 0.26850923895835876, + "learning_rate": 0.0005267326732673268, + "loss": 1.322, + "step": 7180 + }, + { + "epoch": 6.056804599009619, + "grad_norm": 0.30225685238838196, + "learning_rate": 0.0005257425742574257, + "loss": 1.3212, + "step": 7190 + }, + { + "epoch": 6.0652285275200635, + "grad_norm": 0.32349905371665955, + "learning_rate": 0.0005247524752475248, + "loss": 1.3219, + "step": 7200 + }, + { + "epoch": 6.0652285275200635, + "eval_accuracy": 0.727180971273756, + "eval_loss": 1.205489993095398, + "eval_runtime": 890.8938, + "eval_samples_per_second": 560.534, + "eval_steps_per_second": 5.19, + "step": 7200 + }, + { + "epoch": 6.0736524560305085, + "grad_norm": 0.29943209886550903, + "learning_rate": 0.0005237623762376238, + "loss": 1.3182, + "step": 7210 + }, + { + "epoch": 6.082076384540953, + "grad_norm": 0.30952343344688416, + "learning_rate": 0.0005227722772277228, + "loss": 1.3194, + "step": 7220 + }, + { + "epoch": 6.090500313051398, + "grad_norm": 0.3158267140388489, + "learning_rate": 0.0005217821782178218, + "loss": 1.319, + "step": 7230 + }, + { + "epoch": 6.098924241561842, + "grad_norm": 0.27009105682373047, + "learning_rate": 0.0005207920792079208, + "loss": 1.3212, + "step": 7240 + }, + { + "epoch": 6.107348170072286, + "grad_norm": 0.2660143971443176, + "learning_rate": 0.0005198019801980198, + "loss": 1.3181, + "step": 7250 + }, + { + "epoch": 6.115772098582731, + "grad_norm": 0.32289671897888184, + "learning_rate": 0.0005188118811881189, + "loss": 1.3166, + "step": 7260 + }, + { + "epoch": 6.124196027093175, + "grad_norm": 0.301577627658844, + "learning_rate": 0.0005178217821782178, + "loss": 1.3215, + "step": 7270 + }, + { + "epoch": 6.13261995560362, + "grad_norm": 0.26539114117622375, + "learning_rate": 0.0005168316831683169, + "loss": 1.3173, + "step": 7280 + }, + { + "epoch": 6.141043884114064, + "grad_norm": 0.30636703968048096, + "learning_rate": 0.0005158415841584158, + "loss": 1.319, + "step": 7290 + }, + { + "epoch": 6.141043884114064, + "eval_accuracy": 0.7278776618882268, + "eval_loss": 1.2021031379699707, + "eval_runtime": 893.3533, + "eval_samples_per_second": 558.99, + "eval_steps_per_second": 5.176, + "step": 7290 + }, + { + "epoch": 6.1494678126245095, + "grad_norm": 0.2906350791454315, + "learning_rate": 0.0005148514851485149, + "loss": 1.3177, + "step": 7300 + }, + { + "epoch": 6.157891741134954, + "grad_norm": 0.33962422609329224, + "learning_rate": 0.0005138613861386139, + "loss": 1.3173, + "step": 7310 + }, + { + "epoch": 6.166315669645398, + "grad_norm": 0.29772093892097473, + "learning_rate": 0.0005128712871287129, + "loss": 1.3194, + "step": 7320 + }, + { + "epoch": 6.174739598155843, + "grad_norm": 0.27262043952941895, + "learning_rate": 0.0005118811881188119, + "loss": 1.3159, + "step": 7330 + }, + { + "epoch": 6.183163526666287, + "grad_norm": 0.2678314745426178, + "learning_rate": 0.0005108910891089109, + "loss": 1.3167, + "step": 7340 + }, + { + "epoch": 6.191587455176732, + "grad_norm": 0.3115740716457367, + "learning_rate": 0.0005099009900990099, + "loss": 1.3142, + "step": 7350 + }, + { + "epoch": 6.200011383687176, + "grad_norm": 0.2983403205871582, + "learning_rate": 0.000508910891089109, + "loss": 1.3158, + "step": 7360 + }, + { + "epoch": 6.208435312197621, + "grad_norm": 0.2797269821166992, + "learning_rate": 0.0005079207920792079, + "loss": 1.3163, + "step": 7370 + }, + { + "epoch": 6.216859240708065, + "grad_norm": 0.29581907391548157, + "learning_rate": 0.000506930693069307, + "loss": 1.3156, + "step": 7380 + }, + { + "epoch": 6.216859240708065, + "eval_accuracy": 0.7285335214596267, + "eval_loss": 1.1984630823135376, + "eval_runtime": 881.1088, + "eval_samples_per_second": 566.759, + "eval_steps_per_second": 5.248, + "step": 7380 + }, + { + "epoch": 6.2252831692185095, + "grad_norm": 0.2843240797519684, + "learning_rate": 0.0005059405940594059, + "loss": 1.3162, + "step": 7390 + }, + { + "epoch": 6.233707097728955, + "grad_norm": 0.2662515938282013, + "learning_rate": 0.000504950495049505, + "loss": 1.314, + "step": 7400 + }, + { + "epoch": 6.242131026239399, + "grad_norm": 0.3370913565158844, + "learning_rate": 0.000503960396039604, + "loss": 1.3136, + "step": 7410 + }, + { + "epoch": 6.250554954749844, + "grad_norm": 0.29014459252357483, + "learning_rate": 0.000502970297029703, + "loss": 1.3127, + "step": 7420 + }, + { + "epoch": 6.258978883260288, + "grad_norm": 0.2779816687107086, + "learning_rate": 0.000501980198019802, + "loss": 1.3137, + "step": 7430 + }, + { + "epoch": 6.267402811770733, + "grad_norm": 0.2942447066307068, + "learning_rate": 0.000500990099009901, + "loss": 1.3138, + "step": 7440 + }, + { + "epoch": 6.275826740281177, + "grad_norm": 0.3536125719547272, + "learning_rate": 0.0005, + "loss": 1.3135, + "step": 7450 + }, + { + "epoch": 6.284250668791621, + "grad_norm": 0.29686686396598816, + "learning_rate": 0.0004990099009900991, + "loss": 1.3129, + "step": 7460 + }, + { + "epoch": 6.292674597302066, + "grad_norm": 0.30590084195137024, + "learning_rate": 0.000498019801980198, + "loss": 1.3114, + "step": 7470 + }, + { + "epoch": 6.292674597302066, + "eval_accuracy": 0.7293452386458654, + "eval_loss": 1.1951327323913574, + "eval_runtime": 893.3348, + "eval_samples_per_second": 559.002, + "eval_steps_per_second": 5.176, + "step": 7470 + }, + { + "epoch": 6.3010985258125105, + "grad_norm": 0.2687655985355377, + "learning_rate": 0.0004970297029702971, + "loss": 1.3125, + "step": 7480 + }, + { + "epoch": 6.3095224543229556, + "grad_norm": 0.31057268381118774, + "learning_rate": 0.000496039603960396, + "loss": 1.3106, + "step": 7490 + }, + { + "epoch": 6.3179463828334, + "grad_norm": 0.3097970187664032, + "learning_rate": 0.0004950495049504951, + "loss": 1.31, + "step": 7500 + }, + { + "epoch": 6.326370311343844, + "grad_norm": 0.28469330072402954, + "learning_rate": 0.0004940594059405941, + "loss": 1.3098, + "step": 7510 + }, + { + "epoch": 6.334794239854289, + "grad_norm": 0.2911768853664398, + "learning_rate": 0.000493069306930693, + "loss": 1.3103, + "step": 7520 + }, + { + "epoch": 6.343218168364733, + "grad_norm": 0.2990330755710602, + "learning_rate": 0.0004920792079207921, + "loss": 1.3108, + "step": 7530 + }, + { + "epoch": 6.351642096875178, + "grad_norm": 0.2908383905887604, + "learning_rate": 0.000491089108910891, + "loss": 1.3092, + "step": 7540 + }, + { + "epoch": 6.360066025385622, + "grad_norm": 0.306233674287796, + "learning_rate": 0.0004900990099009901, + "loss": 1.3107, + "step": 7550 + }, + { + "epoch": 6.368489953896067, + "grad_norm": 0.2749456465244293, + "learning_rate": 0.0004891089108910892, + "loss": 1.3073, + "step": 7560 + }, + { + "epoch": 6.368489953896067, + "eval_accuracy": 0.7300212582744398, + "eval_loss": 1.1918327808380127, + "eval_runtime": 886.4778, + "eval_samples_per_second": 563.326, + "eval_steps_per_second": 5.216, + "step": 7560 + }, + { + "epoch": 6.3769138824065115, + "grad_norm": 0.2799837291240692, + "learning_rate": 0.0004881188118811881, + "loss": 1.3084, + "step": 7570 + }, + { + "epoch": 6.385337810916956, + "grad_norm": 0.3050614893436432, + "learning_rate": 0.00048712871287128715, + "loss": 1.3082, + "step": 7580 + }, + { + "epoch": 6.393761739427401, + "grad_norm": 0.2900220453739166, + "learning_rate": 0.00048613861386138615, + "loss": 1.3087, + "step": 7590 + }, + { + "epoch": 6.402185667937845, + "grad_norm": 0.2592508792877197, + "learning_rate": 0.00048514851485148515, + "loss": 1.3082, + "step": 7600 + }, + { + "epoch": 6.41060959644829, + "grad_norm": 0.2503323256969452, + "learning_rate": 0.00048415841584158414, + "loss": 1.3066, + "step": 7610 + }, + { + "epoch": 6.419033524958734, + "grad_norm": 0.30254074931144714, + "learning_rate": 0.00048316831683168314, + "loss": 1.3079, + "step": 7620 + }, + { + "epoch": 6.427457453469179, + "grad_norm": 0.28869137167930603, + "learning_rate": 0.0004821782178217822, + "loss": 1.3061, + "step": 7630 + }, + { + "epoch": 6.435881381979623, + "grad_norm": 0.3226109445095062, + "learning_rate": 0.0004811881188118812, + "loss": 1.3051, + "step": 7640 + }, + { + "epoch": 6.444305310490067, + "grad_norm": 0.2900817096233368, + "learning_rate": 0.0004801980198019802, + "loss": 1.3062, + "step": 7650 + }, + { + "epoch": 6.444305310490067, + "eval_accuracy": 0.7304169114350704, + "eval_loss": 1.1914669275283813, + "eval_runtime": 888.5325, + "eval_samples_per_second": 562.023, + "eval_steps_per_second": 5.204, + "step": 7650 + }, + { + "epoch": 6.452729239000512, + "grad_norm": 0.3235354721546173, + "learning_rate": 0.0004792079207920792, + "loss": 1.3074, + "step": 7660 + }, + { + "epoch": 6.461153167510957, + "grad_norm": 0.26384827494621277, + "learning_rate": 0.0004782178217821782, + "loss": 1.3052, + "step": 7670 + }, + { + "epoch": 6.469577096021402, + "grad_norm": 0.27176037430763245, + "learning_rate": 0.00047722772277227724, + "loss": 1.3032, + "step": 7680 + }, + { + "epoch": 6.478001024531846, + "grad_norm": 0.27846911549568176, + "learning_rate": 0.00047623762376237624, + "loss": 1.3038, + "step": 7690 + }, + { + "epoch": 6.48642495304229, + "grad_norm": 0.32258498668670654, + "learning_rate": 0.00047524752475247524, + "loss": 1.3052, + "step": 7700 + }, + { + "epoch": 6.494848881552735, + "grad_norm": 0.3000924587249756, + "learning_rate": 0.00047425742574257423, + "loss": 1.3046, + "step": 7710 + }, + { + "epoch": 6.503272810063179, + "grad_norm": 0.22748370468616486, + "learning_rate": 0.00047326732673267323, + "loss": 1.3054, + "step": 7720 + }, + { + "epoch": 6.511696738573624, + "grad_norm": 0.3552054464817047, + "learning_rate": 0.0004722772277227723, + "loss": 1.3026, + "step": 7730 + }, + { + "epoch": 6.520120667084068, + "grad_norm": 0.2629605531692505, + "learning_rate": 0.0004712871287128713, + "loss": 1.3021, + "step": 7740 + }, + { + "epoch": 6.520120667084068, + "eval_accuracy": 0.7311149976881265, + "eval_loss": 1.1877076625823975, + "eval_runtime": 883.1573, + "eval_samples_per_second": 565.444, + "eval_steps_per_second": 5.236, + "step": 7740 + }, + { + "epoch": 6.528544595594513, + "grad_norm": 0.31692177057266235, + "learning_rate": 0.0004702970297029703, + "loss": 1.3048, + "step": 7750 + }, + { + "epoch": 6.5369685241049575, + "grad_norm": 0.3689730167388916, + "learning_rate": 0.0004693069306930693, + "loss": 1.3016, + "step": 7760 + }, + { + "epoch": 6.545392452615403, + "grad_norm": 0.2619648277759552, + "learning_rate": 0.00046831683168316833, + "loss": 1.3018, + "step": 7770 + }, + { + "epoch": 6.553816381125847, + "grad_norm": 0.29713907837867737, + "learning_rate": 0.0004673267326732674, + "loss": 1.3007, + "step": 7780 + }, + { + "epoch": 6.562240309636291, + "grad_norm": 0.3426944315433502, + "learning_rate": 0.0004663366336633664, + "loss": 1.302, + "step": 7790 + }, + { + "epoch": 6.570664238146736, + "grad_norm": 0.30286312103271484, + "learning_rate": 0.0004653465346534654, + "loss": 1.3024, + "step": 7800 + }, + { + "epoch": 6.57908816665718, + "grad_norm": 0.2533584237098694, + "learning_rate": 0.0004643564356435644, + "loss": 1.2991, + "step": 7810 + }, + { + "epoch": 6.587512095167625, + "grad_norm": 0.23465867340564728, + "learning_rate": 0.0004633663366336634, + "loss": 1.3007, + "step": 7820 + }, + { + "epoch": 6.595936023678069, + "grad_norm": 0.31729191541671753, + "learning_rate": 0.00046237623762376243, + "loss": 1.3, + "step": 7830 + }, + { + "epoch": 6.595936023678069, + "eval_accuracy": 0.7318502985148011, + "eval_loss": 1.1818432807922363, + "eval_runtime": 891.13, + "eval_samples_per_second": 560.385, + "eval_steps_per_second": 5.189, + "step": 7830 + }, + { + "epoch": 6.6043599521885135, + "grad_norm": 0.26264631748199463, + "learning_rate": 0.00046138613861386143, + "loss": 1.3003, + "step": 7840 + }, + { + "epoch": 6.6127838806989585, + "grad_norm": 0.26062801480293274, + "learning_rate": 0.0004603960396039604, + "loss": 1.2977, + "step": 7850 + }, + { + "epoch": 6.621207809209403, + "grad_norm": 0.2755686640739441, + "learning_rate": 0.0004594059405940594, + "loss": 1.2979, + "step": 7860 + }, + { + "epoch": 6.629631737719848, + "grad_norm": 0.32309025526046753, + "learning_rate": 0.0004584158415841584, + "loss": 1.297, + "step": 7870 + }, + { + "epoch": 6.638055666230292, + "grad_norm": 0.2709057927131653, + "learning_rate": 0.0004574257425742575, + "loss": 1.2999, + "step": 7880 + }, + { + "epoch": 6.646479594740737, + "grad_norm": 0.2785532772541046, + "learning_rate": 0.00045643564356435647, + "loss": 1.2959, + "step": 7890 + }, + { + "epoch": 6.654903523251181, + "grad_norm": 0.2822953164577484, + "learning_rate": 0.00045544554455445547, + "loss": 1.2984, + "step": 7900 + }, + { + "epoch": 6.663327451761625, + "grad_norm": 0.2704668641090393, + "learning_rate": 0.00045445544554455447, + "loss": 1.2956, + "step": 7910 + }, + { + "epoch": 6.67175138027207, + "grad_norm": 0.3228791058063507, + "learning_rate": 0.00045346534653465347, + "loss": 1.2984, + "step": 7920 + }, + { + "epoch": 6.67175138027207, + "eval_accuracy": 0.7318941432804211, + "eval_loss": 1.184158205986023, + "eval_runtime": 883.7641, + "eval_samples_per_second": 565.056, + "eval_steps_per_second": 5.232, + "step": 7920 + }, + { + "epoch": 6.680175308782514, + "grad_norm": 0.2641367018222809, + "learning_rate": 0.0004524752475247525, + "loss": 1.299, + "step": 7930 + }, + { + "epoch": 6.6885992372929595, + "grad_norm": 0.28555190563201904, + "learning_rate": 0.0004514851485148515, + "loss": 1.2985, + "step": 7940 + }, + { + "epoch": 6.697023165803404, + "grad_norm": 0.2615039050579071, + "learning_rate": 0.0004504950495049505, + "loss": 1.294, + "step": 7950 + }, + { + "epoch": 6.705447094313849, + "grad_norm": 0.25349870324134827, + "learning_rate": 0.0004495049504950495, + "loss": 1.295, + "step": 7960 + }, + { + "epoch": 6.713871022824293, + "grad_norm": 0.3342011272907257, + "learning_rate": 0.0004485148514851485, + "loss": 1.2963, + "step": 7970 + }, + { + "epoch": 6.722294951334737, + "grad_norm": 0.2608206570148468, + "learning_rate": 0.00044752475247524756, + "loss": 1.2957, + "step": 7980 + }, + { + "epoch": 6.730718879845182, + "grad_norm": 0.27476873993873596, + "learning_rate": 0.00044653465346534656, + "loss": 1.2939, + "step": 7990 + }, + { + "epoch": 6.739142808355626, + "grad_norm": 0.3241907060146332, + "learning_rate": 0.00044554455445544556, + "loss": 1.2965, + "step": 8000 + }, + { + "epoch": 6.747566736866071, + "grad_norm": 0.3494180142879486, + "learning_rate": 0.00044455445544554456, + "loss": 1.2962, + "step": 8010 + }, + { + "epoch": 6.747566736866071, + "eval_accuracy": 0.7322386411238602, + "eval_loss": 1.182516098022461, + "eval_runtime": 889.7545, + "eval_samples_per_second": 561.251, + "eval_steps_per_second": 5.197, + "step": 8010 + }, + { + "epoch": 6.755990665376515, + "grad_norm": 0.2616145610809326, + "learning_rate": 0.00044356435643564356, + "loss": 1.2958, + "step": 8020 + }, + { + "epoch": 6.7644145938869595, + "grad_norm": 0.29238995909690857, + "learning_rate": 0.0004425742574257426, + "loss": 1.293, + "step": 8030 + }, + { + "epoch": 6.772838522397405, + "grad_norm": 0.24060964584350586, + "learning_rate": 0.0004415841584158416, + "loss": 1.2948, + "step": 8040 + }, + { + "epoch": 6.781262450907849, + "grad_norm": 0.29363489151000977, + "learning_rate": 0.0004405940594059406, + "loss": 1.2928, + "step": 8050 + }, + { + "epoch": 6.789686379418294, + "grad_norm": 0.3320622444152832, + "learning_rate": 0.0004396039603960396, + "loss": 1.2925, + "step": 8060 + }, + { + "epoch": 6.798110307928738, + "grad_norm": 0.23857133090496063, + "learning_rate": 0.0004386138613861386, + "loss": 1.2943, + "step": 8070 + }, + { + "epoch": 6.806534236439183, + "grad_norm": 0.24713198840618134, + "learning_rate": 0.00043762376237623765, + "loss": 1.2938, + "step": 8080 + }, + { + "epoch": 6.814958164949627, + "grad_norm": 0.26270854473114014, + "learning_rate": 0.00043663366336633665, + "loss": 1.2916, + "step": 8090 + }, + { + "epoch": 6.823382093460072, + "grad_norm": 0.2450101524591446, + "learning_rate": 0.00043564356435643565, + "loss": 1.2931, + "step": 8100 + }, + { + "epoch": 6.823382093460072, + "eval_accuracy": 0.7332625526391774, + "eval_loss": 1.1757333278656006, + "eval_runtime": 889.0249, + "eval_samples_per_second": 561.712, + "eval_steps_per_second": 5.201, + "step": 8100 + }, + { + "epoch": 6.831806021970516, + "grad_norm": 0.27462685108184814, + "learning_rate": 0.00043465346534653465, + "loss": 1.2923, + "step": 8110 + }, + { + "epoch": 6.8402299504809605, + "grad_norm": 0.2707907259464264, + "learning_rate": 0.00043366336633663365, + "loss": 1.2925, + "step": 8120 + }, + { + "epoch": 6.8486538789914055, + "grad_norm": 0.24748317897319794, + "learning_rate": 0.0004326732673267327, + "loss": 1.2929, + "step": 8130 + }, + { + "epoch": 6.85707780750185, + "grad_norm": 0.226767897605896, + "learning_rate": 0.0004316831683168317, + "loss": 1.2883, + "step": 8140 + }, + { + "epoch": 6.865501736012295, + "grad_norm": 0.24889105558395386, + "learning_rate": 0.0004306930693069307, + "loss": 1.2893, + "step": 8150 + }, + { + "epoch": 6.873925664522739, + "grad_norm": 0.26075902581214905, + "learning_rate": 0.0004297029702970297, + "loss": 1.2893, + "step": 8160 + }, + { + "epoch": 6.882349593033183, + "grad_norm": 0.26210734248161316, + "learning_rate": 0.0004287128712871287, + "loss": 1.2868, + "step": 8170 + }, + { + "epoch": 6.890773521543628, + "grad_norm": 0.2559298872947693, + "learning_rate": 0.00042772277227722774, + "loss": 1.2886, + "step": 8180 + }, + { + "epoch": 6.899197450054072, + "grad_norm": 0.2503817081451416, + "learning_rate": 0.00042673267326732674, + "loss": 1.2883, + "step": 8190 + }, + { + "epoch": 6.899197450054072, + "eval_accuracy": 0.7335132915044345, + "eval_loss": 1.1744158267974854, + "eval_runtime": 885.5636, + "eval_samples_per_second": 563.908, + "eval_steps_per_second": 5.222, + "step": 8190 + }, + { + "epoch": 6.907621378564517, + "grad_norm": 0.24540117383003235, + "learning_rate": 0.00042574257425742574, + "loss": 1.2893, + "step": 8200 + }, + { + "epoch": 6.9160453070749615, + "grad_norm": 0.3089258670806885, + "learning_rate": 0.00042475247524752474, + "loss": 1.2896, + "step": 8210 + }, + { + "epoch": 6.9244692355854065, + "grad_norm": 0.26888999342918396, + "learning_rate": 0.00042376237623762374, + "loss": 1.2895, + "step": 8220 + }, + { + "epoch": 6.932893164095851, + "grad_norm": 0.24743571877479553, + "learning_rate": 0.0004227722772277228, + "loss": 1.2884, + "step": 8230 + }, + { + "epoch": 6.941317092606295, + "grad_norm": 0.24364733695983887, + "learning_rate": 0.0004217821782178218, + "loss": 1.2879, + "step": 8240 + }, + { + "epoch": 6.94974102111674, + "grad_norm": 0.2963743507862091, + "learning_rate": 0.0004207920792079208, + "loss": 1.2878, + "step": 8250 + }, + { + "epoch": 6.958164949627184, + "grad_norm": 0.2444639950990677, + "learning_rate": 0.0004198019801980198, + "loss": 1.2871, + "step": 8260 + }, + { + "epoch": 6.966588878137629, + "grad_norm": 0.27140820026397705, + "learning_rate": 0.0004188118811881188, + "loss": 1.2878, + "step": 8270 + }, + { + "epoch": 6.975012806648073, + "grad_norm": 0.2628765404224396, + "learning_rate": 0.00041782178217821784, + "loss": 1.2873, + "step": 8280 + }, + { + "epoch": 6.975012806648073, + "eval_accuracy": 0.734204579286565, + "eval_loss": 1.171156644821167, + "eval_runtime": 888.1172, + "eval_samples_per_second": 562.286, + "eval_steps_per_second": 5.207, + "step": 8280 + }, + { + "epoch": 6.983436735158518, + "grad_norm": 0.2539413869380951, + "learning_rate": 0.00041683168316831683, + "loss": 1.2874, + "step": 8290 + }, + { + "epoch": 6.991860663668962, + "grad_norm": 0.29522642493247986, + "learning_rate": 0.00041584158415841583, + "loss": 1.2859, + "step": 8300 + }, + { + "epoch": 7.000284592179407, + "grad_norm": 0.29553958773612976, + "learning_rate": 0.00041485148514851483, + "loss": 1.2878, + "step": 8310 + }, + { + "epoch": 7.008708520689852, + "grad_norm": 0.3111182153224945, + "learning_rate": 0.00041386138613861383, + "loss": 1.2874, + "step": 8320 + }, + { + "epoch": 7.017132449200296, + "grad_norm": 0.33146336674690247, + "learning_rate": 0.0004128712871287129, + "loss": 1.287, + "step": 8330 + }, + { + "epoch": 7.025556377710741, + "grad_norm": 0.27456361055374146, + "learning_rate": 0.0004118811881188119, + "loss": 1.2858, + "step": 8340 + }, + { + "epoch": 7.033980306221185, + "grad_norm": 0.29216212034225464, + "learning_rate": 0.0004108910891089109, + "loss": 1.2838, + "step": 8350 + }, + { + "epoch": 7.042404234731629, + "grad_norm": 0.24966631829738617, + "learning_rate": 0.0004099009900990099, + "loss": 1.2857, + "step": 8360 + }, + { + "epoch": 7.050828163242074, + "grad_norm": 0.2910294234752655, + "learning_rate": 0.0004089108910891089, + "loss": 1.2858, + "step": 8370 + }, + { + "epoch": 7.050828163242074, + "eval_accuracy": 0.7346228547150983, + "eval_loss": 1.169946551322937, + "eval_runtime": 890.9908, + "eval_samples_per_second": 560.473, + "eval_steps_per_second": 5.19, + "step": 8370 + }, + { + "epoch": 7.059252091752518, + "grad_norm": 0.26337358355522156, + "learning_rate": 0.0004079207920792079, + "loss": 1.2842, + "step": 8380 + }, + { + "epoch": 7.067676020262963, + "grad_norm": 0.2426845133304596, + "learning_rate": 0.0004069306930693069, + "loss": 1.2836, + "step": 8390 + }, + { + "epoch": 7.0760999487734075, + "grad_norm": 0.2740408778190613, + "learning_rate": 0.000405940594059406, + "loss": 1.2842, + "step": 8400 + }, + { + "epoch": 7.084523877283853, + "grad_norm": 0.27966201305389404, + "learning_rate": 0.000404950495049505, + "loss": 1.2841, + "step": 8410 + }, + { + "epoch": 7.092947805794297, + "grad_norm": 0.3083817660808563, + "learning_rate": 0.00040396039603960397, + "loss": 1.2823, + "step": 8420 + }, + { + "epoch": 7.101371734304741, + "grad_norm": 0.30730104446411133, + "learning_rate": 0.000402970297029703, + "loss": 1.2845, + "step": 8430 + }, + { + "epoch": 7.109795662815186, + "grad_norm": 0.2973144054412842, + "learning_rate": 0.000401980198019802, + "loss": 1.2814, + "step": 8440 + }, + { + "epoch": 7.11821959132563, + "grad_norm": 0.2775426208972931, + "learning_rate": 0.000400990099009901, + "loss": 1.2823, + "step": 8450 + }, + { + "epoch": 7.126643519836075, + "grad_norm": 0.2734345495700836, + "learning_rate": 0.0004, + "loss": 1.2819, + "step": 8460 + }, + { + "epoch": 7.126643519836075, + "eval_accuracy": 0.735104089750221, + "eval_loss": 1.1682698726654053, + "eval_runtime": 886.7497, + "eval_samples_per_second": 563.153, + "eval_steps_per_second": 5.215, + "step": 8460 + }, + { + "epoch": 7.135067448346519, + "grad_norm": 0.27912047505378723, + "learning_rate": 0.000399009900990099, + "loss": 1.2826, + "step": 8470 + }, + { + "epoch": 7.143491376856964, + "grad_norm": 0.3084285855293274, + "learning_rate": 0.00039801980198019807, + "loss": 1.2811, + "step": 8480 + }, + { + "epoch": 7.1519153053674085, + "grad_norm": 0.30194783210754395, + "learning_rate": 0.00039702970297029707, + "loss": 1.2828, + "step": 8490 + }, + { + "epoch": 7.160339233877853, + "grad_norm": 0.25307685136795044, + "learning_rate": 0.00039603960396039607, + "loss": 1.2791, + "step": 8500 + }, + { + "epoch": 7.168763162388298, + "grad_norm": 0.25018778443336487, + "learning_rate": 0.00039504950495049506, + "loss": 1.2796, + "step": 8510 + }, + { + "epoch": 7.177187090898742, + "grad_norm": 0.2541010081768036, + "learning_rate": 0.00039405940594059406, + "loss": 1.2812, + "step": 8520 + }, + { + "epoch": 7.185611019409187, + "grad_norm": 0.29745373129844666, + "learning_rate": 0.0003930693069306931, + "loss": 1.2828, + "step": 8530 + }, + { + "epoch": 7.194034947919631, + "grad_norm": 0.2740705907344818, + "learning_rate": 0.0003920792079207921, + "loss": 1.2812, + "step": 8540 + }, + { + "epoch": 7.202458876430076, + "grad_norm": 0.23998434841632843, + "learning_rate": 0.0003910891089108911, + "loss": 1.2781, + "step": 8550 + }, + { + "epoch": 7.202458876430076, + "eval_accuracy": 0.7354429371546514, + "eval_loss": 1.1649537086486816, + "eval_runtime": 891.9041, + "eval_samples_per_second": 559.899, + "eval_steps_per_second": 5.184, + "step": 8550 + }, + { + "epoch": 7.21088280494052, + "grad_norm": 0.2691722512245178, + "learning_rate": 0.0003900990099009901, + "loss": 1.2785, + "step": 8560 + }, + { + "epoch": 7.219306733450964, + "grad_norm": 0.28188225626945496, + "learning_rate": 0.0003891089108910891, + "loss": 1.2807, + "step": 8570 + }, + { + "epoch": 7.2277306619614095, + "grad_norm": 0.3311617970466614, + "learning_rate": 0.00038811881188118816, + "loss": 1.2809, + "step": 8580 + }, + { + "epoch": 7.236154590471854, + "grad_norm": 0.2717738747596741, + "learning_rate": 0.00038712871287128716, + "loss": 1.278, + "step": 8590 + }, + { + "epoch": 7.244578518982299, + "grad_norm": 0.27171820402145386, + "learning_rate": 0.00038613861386138616, + "loss": 1.2803, + "step": 8600 + }, + { + "epoch": 7.253002447492743, + "grad_norm": 0.249137282371521, + "learning_rate": 0.00038514851485148515, + "loss": 1.277, + "step": 8610 + }, + { + "epoch": 7.261426376003188, + "grad_norm": 0.26939263939857483, + "learning_rate": 0.00038415841584158415, + "loss": 1.2773, + "step": 8620 + }, + { + "epoch": 7.269850304513632, + "grad_norm": 0.3177802860736847, + "learning_rate": 0.0003831683168316832, + "loss": 1.2763, + "step": 8630 + }, + { + "epoch": 7.278274233024076, + "grad_norm": 0.2421504557132721, + "learning_rate": 0.0003821782178217822, + "loss": 1.2771, + "step": 8640 + }, + { + "epoch": 7.278274233024076, + "eval_accuracy": 0.7357238880776348, + "eval_loss": 1.1646403074264526, + "eval_runtime": 878.5966, + "eval_samples_per_second": 568.379, + "eval_steps_per_second": 5.263, + "step": 8640 + }, + { + "epoch": 7.286698161534521, + "grad_norm": 0.28808215260505676, + "learning_rate": 0.0003811881188118812, + "loss": 1.2744, + "step": 8650 + }, + { + "epoch": 7.295122090044965, + "grad_norm": 0.26363667845726013, + "learning_rate": 0.0003801980198019802, + "loss": 1.2788, + "step": 8660 + }, + { + "epoch": 7.30354601855541, + "grad_norm": 0.35491064190864563, + "learning_rate": 0.0003792079207920792, + "loss": 1.2792, + "step": 8670 + }, + { + "epoch": 7.311969947065855, + "grad_norm": 0.3273920714855194, + "learning_rate": 0.00037821782178217825, + "loss": 1.278, + "step": 8680 + }, + { + "epoch": 7.320393875576299, + "grad_norm": 0.28319239616394043, + "learning_rate": 0.00037722772277227725, + "loss": 1.2762, + "step": 8690 + }, + { + "epoch": 7.328817804086744, + "grad_norm": 0.28414586186408997, + "learning_rate": 0.00037623762376237625, + "loss": 1.2769, + "step": 8700 + }, + { + "epoch": 7.337241732597188, + "grad_norm": 0.25393033027648926, + "learning_rate": 0.00037524752475247524, + "loss": 1.2742, + "step": 8710 + }, + { + "epoch": 7.345665661107633, + "grad_norm": 0.25634288787841797, + "learning_rate": 0.00037425742574257424, + "loss": 1.2753, + "step": 8720 + }, + { + "epoch": 7.354089589618077, + "grad_norm": 0.2355813831090927, + "learning_rate": 0.0003732673267326733, + "loss": 1.2749, + "step": 8730 + }, + { + "epoch": 7.354089589618077, + "eval_accuracy": 0.7361996522899728, + "eval_loss": 1.160847544670105, + "eval_runtime": 889.4544, + "eval_samples_per_second": 561.441, + "eval_steps_per_second": 5.199, + "step": 8730 + }, + { + "epoch": 7.362513518128522, + "grad_norm": 0.24002189934253693, + "learning_rate": 0.0003722772277227723, + "loss": 1.2751, + "step": 8740 + }, + { + "epoch": 7.370937446638966, + "grad_norm": 0.2806450128555298, + "learning_rate": 0.0003712871287128713, + "loss": 1.275, + "step": 8750 + }, + { + "epoch": 7.3793613751494105, + "grad_norm": 0.24552834033966064, + "learning_rate": 0.0003702970297029703, + "loss": 1.2753, + "step": 8760 + }, + { + "epoch": 7.3877853036598555, + "grad_norm": 0.24814461171627045, + "learning_rate": 0.0003693069306930693, + "loss": 1.276, + "step": 8770 + }, + { + "epoch": 7.3962092321703, + "grad_norm": 0.26086533069610596, + "learning_rate": 0.00036831683168316834, + "loss": 1.2744, + "step": 8780 + }, + { + "epoch": 7.404633160680745, + "grad_norm": 0.2854679822921753, + "learning_rate": 0.00036732673267326734, + "loss": 1.2739, + "step": 8790 + }, + { + "epoch": 7.413057089191189, + "grad_norm": 0.24847003817558289, + "learning_rate": 0.00036633663366336634, + "loss": 1.2731, + "step": 8800 + }, + { + "epoch": 7.421481017701634, + "grad_norm": 0.3230905532836914, + "learning_rate": 0.00036534653465346533, + "loss": 1.2732, + "step": 8810 + }, + { + "epoch": 7.429904946212078, + "grad_norm": 0.30264076590538025, + "learning_rate": 0.00036435643564356433, + "loss": 1.273, + "step": 8820 + }, + { + "epoch": 7.429904946212078, + "eval_accuracy": 0.7366944357714759, + "eval_loss": 1.1585748195648193, + "eval_runtime": 884.7129, + "eval_samples_per_second": 564.45, + "eval_steps_per_second": 5.227, + "step": 8820 + }, + { + "epoch": 7.438328874722522, + "grad_norm": 0.25705888867378235, + "learning_rate": 0.0003633663366336634, + "loss": 1.2738, + "step": 8830 + }, + { + "epoch": 7.446752803232967, + "grad_norm": 0.2455236166715622, + "learning_rate": 0.0003623762376237624, + "loss": 1.2727, + "step": 8840 + }, + { + "epoch": 7.4551767317434114, + "grad_norm": 0.2877678871154785, + "learning_rate": 0.0003613861386138614, + "loss": 1.2733, + "step": 8850 + }, + { + "epoch": 7.4636006602538565, + "grad_norm": 0.2644253969192505, + "learning_rate": 0.0003603960396039604, + "loss": 1.2711, + "step": 8860 + }, + { + "epoch": 7.472024588764301, + "grad_norm": 0.25103089213371277, + "learning_rate": 0.0003594059405940594, + "loss": 1.2727, + "step": 8870 + }, + { + "epoch": 7.480448517274746, + "grad_norm": 0.28732746839523315, + "learning_rate": 0.00035841584158415843, + "loss": 1.2729, + "step": 8880 + }, + { + "epoch": 7.48887244578519, + "grad_norm": 0.3096875846385956, + "learning_rate": 0.00035742574257425743, + "loss": 1.2733, + "step": 8890 + }, + { + "epoch": 7.497296374295634, + "grad_norm": 0.27695363759994507, + "learning_rate": 0.0003564356435643564, + "loss": 1.2719, + "step": 8900 + }, + { + "epoch": 7.505720302806079, + "grad_norm": 0.26089048385620117, + "learning_rate": 0.0003554455445544554, + "loss": 1.2718, + "step": 8910 + }, + { + "epoch": 7.505720302806079, + "eval_accuracy": 0.7372118632602084, + "eval_loss": 1.1557950973510742, + "eval_runtime": 890.5411, + "eval_samples_per_second": 560.756, + "eval_steps_per_second": 5.192, + "step": 8910 + }, + { + "epoch": 7.514144231316523, + "grad_norm": 0.24578547477722168, + "learning_rate": 0.0003544554455445544, + "loss": 1.2723, + "step": 8920 + }, + { + "epoch": 7.522568159826968, + "grad_norm": 0.2624136209487915, + "learning_rate": 0.0003534653465346535, + "loss": 1.2708, + "step": 8930 + }, + { + "epoch": 7.530992088337412, + "grad_norm": 0.25748109817504883, + "learning_rate": 0.0003524752475247525, + "loss": 1.2708, + "step": 8940 + }, + { + "epoch": 7.5394160168478574, + "grad_norm": 0.28079208731651306, + "learning_rate": 0.00035148514851485147, + "loss": 1.2727, + "step": 8950 + }, + { + "epoch": 7.547839945358302, + "grad_norm": 0.2706407904624939, + "learning_rate": 0.00035049504950495047, + "loss": 1.2712, + "step": 8960 + }, + { + "epoch": 7.556263873868746, + "grad_norm": 0.27032172679901123, + "learning_rate": 0.00034950495049504947, + "loss": 1.2673, + "step": 8970 + }, + { + "epoch": 7.564687802379191, + "grad_norm": 0.24915465712547302, + "learning_rate": 0.0003485148514851485, + "loss": 1.2682, + "step": 8980 + }, + { + "epoch": 7.573111730889635, + "grad_norm": 0.24191108345985413, + "learning_rate": 0.0003475247524752475, + "loss": 1.2719, + "step": 8990 + }, + { + "epoch": 7.58153565940008, + "grad_norm": 0.2806965112686157, + "learning_rate": 0.0003465346534653465, + "loss": 1.2681, + "step": 9000 + }, + { + "epoch": 7.58153565940008, + "eval_accuracy": 0.7375367942915361, + "eval_loss": 1.1551363468170166, + "eval_runtime": 876.3936, + "eval_samples_per_second": 569.808, + "eval_steps_per_second": 5.276, + "step": 9000 + }, + { + "epoch": 7.589959587910524, + "grad_norm": 0.2909415364265442, + "learning_rate": 0.0003455445544554455, + "loss": 1.2687, + "step": 9010 + }, + { + "epoch": 7.598383516420968, + "grad_norm": 0.30222398042678833, + "learning_rate": 0.0003445544554455445, + "loss": 1.2684, + "step": 9020 + }, + { + "epoch": 7.606807444931413, + "grad_norm": 0.25246381759643555, + "learning_rate": 0.0003435643564356436, + "loss": 1.2689, + "step": 9030 + }, + { + "epoch": 7.6152313734418575, + "grad_norm": 0.25202953815460205, + "learning_rate": 0.0003425742574257426, + "loss": 1.2689, + "step": 9040 + }, + { + "epoch": 7.623655301952303, + "grad_norm": 0.2351432740688324, + "learning_rate": 0.0003415841584158416, + "loss": 1.2655, + "step": 9050 + }, + { + "epoch": 7.632079230462747, + "grad_norm": 0.26545044779777527, + "learning_rate": 0.0003405940594059406, + "loss": 1.2659, + "step": 9060 + }, + { + "epoch": 7.640503158973192, + "grad_norm": 0.248436838388443, + "learning_rate": 0.0003396039603960396, + "loss": 1.2677, + "step": 9070 + }, + { + "epoch": 7.648927087483636, + "grad_norm": 0.3021203279495239, + "learning_rate": 0.00033861386138613867, + "loss": 1.2692, + "step": 9080 + }, + { + "epoch": 7.657351015994081, + "grad_norm": 0.27577024698257446, + "learning_rate": 0.00033762376237623766, + "loss": 1.2672, + "step": 9090 + }, + { + "epoch": 7.657351015994081, + "eval_accuracy": 0.7378275299930978, + "eval_loss": 1.1522574424743652, + "eval_runtime": 891.8663, + "eval_samples_per_second": 559.923, + "eval_steps_per_second": 5.185, + "step": 9090 + }, + { + "epoch": 7.665774944504525, + "grad_norm": 0.2087612897157669, + "learning_rate": 0.00033663366336633666, + "loss": 1.2655, + "step": 9100 + }, + { + "epoch": 7.674198873014969, + "grad_norm": 0.24880866706371307, + "learning_rate": 0.00033564356435643566, + "loss": 1.2677, + "step": 9110 + }, + { + "epoch": 7.682622801525414, + "grad_norm": 0.26335397362709045, + "learning_rate": 0.00033465346534653466, + "loss": 1.2647, + "step": 9120 + }, + { + "epoch": 7.6910467300358585, + "grad_norm": 0.25413015484809875, + "learning_rate": 0.0003336633663366337, + "loss": 1.265, + "step": 9130 + }, + { + "epoch": 7.6994706585463035, + "grad_norm": 0.3119896650314331, + "learning_rate": 0.0003326732673267327, + "loss": 1.2674, + "step": 9140 + }, + { + "epoch": 7.707894587056748, + "grad_norm": 0.2269907146692276, + "learning_rate": 0.0003316831683168317, + "loss": 1.2647, + "step": 9150 + }, + { + "epoch": 7.716318515567192, + "grad_norm": 0.31745684146881104, + "learning_rate": 0.0003306930693069307, + "loss": 1.2668, + "step": 9160 + }, + { + "epoch": 7.724742444077637, + "grad_norm": 0.28096485137939453, + "learning_rate": 0.0003297029702970297, + "loss": 1.2658, + "step": 9170 + }, + { + "epoch": 7.733166372588081, + "grad_norm": 0.26646697521209717, + "learning_rate": 0.00032871287128712876, + "loss": 1.2664, + "step": 9180 + }, + { + "epoch": 7.733166372588081, + "eval_accuracy": 0.7381772885380696, + "eval_loss": 1.151962161064148, + "eval_runtime": 889.9446, + "eval_samples_per_second": 561.132, + "eval_steps_per_second": 5.196, + "step": 9180 + }, + { + "epoch": 7.741590301098526, + "grad_norm": 0.24463273584842682, + "learning_rate": 0.00032772277227722775, + "loss": 1.2663, + "step": 9190 + }, + { + "epoch": 7.75001422960897, + "grad_norm": 0.23978425562381744, + "learning_rate": 0.00032673267326732675, + "loss": 1.2634, + "step": 9200 + }, + { + "epoch": 7.758438158119414, + "grad_norm": 0.25662901997566223, + "learning_rate": 0.00032574257425742575, + "loss": 1.2651, + "step": 9210 + }, + { + "epoch": 7.766862086629859, + "grad_norm": 0.2697198688983917, + "learning_rate": 0.00032475247524752475, + "loss": 1.2628, + "step": 9220 + }, + { + "epoch": 7.775286015140304, + "grad_norm": 0.2753835618495941, + "learning_rate": 0.0003237623762376238, + "loss": 1.2632, + "step": 9230 + }, + { + "epoch": 7.783709943650749, + "grad_norm": 0.23303931951522827, + "learning_rate": 0.0003227722772277228, + "loss": 1.2625, + "step": 9240 + }, + { + "epoch": 7.792133872161193, + "grad_norm": 0.26077255606651306, + "learning_rate": 0.0003217821782178218, + "loss": 1.2648, + "step": 9250 + }, + { + "epoch": 7.800557800671638, + "grad_norm": 0.25494781136512756, + "learning_rate": 0.0003207920792079208, + "loss": 1.2648, + "step": 9260 + }, + { + "epoch": 7.808981729182082, + "grad_norm": 0.2447885125875473, + "learning_rate": 0.0003198019801980198, + "loss": 1.2645, + "step": 9270 + }, + { + "epoch": 7.808981729182082, + "eval_accuracy": 0.7385748699480129, + "eval_loss": 1.1492513418197632, + "eval_runtime": 885.3604, + "eval_samples_per_second": 564.037, + "eval_steps_per_second": 5.223, + "step": 9270 + }, + { + "epoch": 7.817405657692527, + "grad_norm": 0.23961922526359558, + "learning_rate": 0.00031881188118811885, + "loss": 1.2631, + "step": 9280 + }, + { + "epoch": 7.825829586202971, + "grad_norm": 0.2850695252418518, + "learning_rate": 0.00031782178217821784, + "loss": 1.2636, + "step": 9290 + }, + { + "epoch": 7.834253514713415, + "grad_norm": 0.257962167263031, + "learning_rate": 0.00031683168316831684, + "loss": 1.2647, + "step": 9300 + }, + { + "epoch": 7.84267744322386, + "grad_norm": 0.28995752334594727, + "learning_rate": 0.00031584158415841584, + "loss": 1.2613, + "step": 9310 + }, + { + "epoch": 7.851101371734305, + "grad_norm": 0.23544956743717194, + "learning_rate": 0.00031485148514851484, + "loss": 1.261, + "step": 9320 + }, + { + "epoch": 7.85952530024475, + "grad_norm": 0.27855780720710754, + "learning_rate": 0.0003138613861386139, + "loss": 1.2615, + "step": 9330 + }, + { + "epoch": 7.867949228755194, + "grad_norm": 0.2668914198875427, + "learning_rate": 0.0003128712871287129, + "loss": 1.2629, + "step": 9340 + }, + { + "epoch": 7.876373157265638, + "grad_norm": 0.2561187446117401, + "learning_rate": 0.0003118811881188119, + "loss": 1.2614, + "step": 9350 + }, + { + "epoch": 7.884797085776083, + "grad_norm": 0.23943807184696198, + "learning_rate": 0.0003108910891089109, + "loss": 1.2591, + "step": 9360 + }, + { + "epoch": 7.884797085776083, + "eval_accuracy": 0.7389714933005799, + "eval_loss": 1.1477636098861694, + "eval_runtime": 884.2901, + "eval_samples_per_second": 564.72, + "eval_steps_per_second": 5.229, + "step": 9360 + }, + { + "epoch": 7.893221014286527, + "grad_norm": 0.3144013583660126, + "learning_rate": 0.0003099009900990099, + "loss": 1.2606, + "step": 9370 + }, + { + "epoch": 7.901644942796972, + "grad_norm": 0.30694615840911865, + "learning_rate": 0.00030891089108910894, + "loss": 1.2607, + "step": 9380 + }, + { + "epoch": 7.910068871307416, + "grad_norm": 0.28703033924102783, + "learning_rate": 0.00030792079207920793, + "loss": 1.2625, + "step": 9390 + }, + { + "epoch": 7.918492799817861, + "grad_norm": 0.24160224199295044, + "learning_rate": 0.00030693069306930693, + "loss": 1.2594, + "step": 9400 + }, + { + "epoch": 7.9269167283283055, + "grad_norm": 0.26693734526634216, + "learning_rate": 0.00030594059405940593, + "loss": 1.2605, + "step": 9410 + }, + { + "epoch": 7.935340656838751, + "grad_norm": 0.23551449179649353, + "learning_rate": 0.00030495049504950493, + "loss": 1.2589, + "step": 9420 + }, + { + "epoch": 7.943764585349195, + "grad_norm": 0.23266945779323578, + "learning_rate": 0.000303960396039604, + "loss": 1.2575, + "step": 9430 + }, + { + "epoch": 7.952188513859639, + "grad_norm": 0.19307726621627808, + "learning_rate": 0.000302970297029703, + "loss": 1.2594, + "step": 9440 + }, + { + "epoch": 7.960612442370084, + "grad_norm": 0.2490869015455246, + "learning_rate": 0.000301980198019802, + "loss": 1.2594, + "step": 9450 + }, + { + "epoch": 7.960612442370084, + "eval_accuracy": 0.7392987654643606, + "eval_loss": 1.1463170051574707, + "eval_runtime": 887.3291, + "eval_samples_per_second": 562.786, + "eval_steps_per_second": 5.211, + "step": 9450 + }, + { + "epoch": 7.969036370880528, + "grad_norm": 0.24613766372203827, + "learning_rate": 0.000300990099009901, + "loss": 1.2586, + "step": 9460 + }, + { + "epoch": 7.977460299390973, + "grad_norm": 0.28653955459594727, + "learning_rate": 0.0003, + "loss": 1.2596, + "step": 9470 + }, + { + "epoch": 7.985884227901417, + "grad_norm": 0.2534151077270508, + "learning_rate": 0.000299009900990099, + "loss": 1.258, + "step": 9480 + }, + { + "epoch": 7.994308156411861, + "grad_norm": 0.2278260588645935, + "learning_rate": 0.000298019801980198, + "loss": 1.2596, + "step": 9490 + }, + { + "epoch": 8.002732084922306, + "grad_norm": 0.24955512583255768, + "learning_rate": 0.000297029702970297, + "loss": 1.2589, + "step": 9500 + }, + { + "epoch": 8.011156013432752, + "grad_norm": 0.24727576971054077, + "learning_rate": 0.000296039603960396, + "loss": 1.259, + "step": 9510 + }, + { + "epoch": 8.019579941943196, + "grad_norm": 0.23246212303638458, + "learning_rate": 0.000295049504950495, + "loss": 1.2569, + "step": 9520 + }, + { + "epoch": 8.02800387045364, + "grad_norm": 0.31031736731529236, + "learning_rate": 0.00029405940594059407, + "loss": 1.2576, + "step": 9530 + }, + { + "epoch": 8.036427798964084, + "grad_norm": 0.25005343556404114, + "learning_rate": 0.00029306930693069307, + "loss": 1.2586, + "step": 9540 + }, + { + "epoch": 8.036427798964084, + "eval_accuracy": 0.7396166114825387, + "eval_loss": 1.1443780660629272, + "eval_runtime": 886.7087, + "eval_samples_per_second": 563.179, + "eval_steps_per_second": 5.215, + "step": 9540 + }, + { + "epoch": 8.044851727474528, + "grad_norm": 0.26693809032440186, + "learning_rate": 0.00029207920792079207, + "loss": 1.2565, + "step": 9550 + }, + { + "epoch": 8.053275655984974, + "grad_norm": 0.2694302797317505, + "learning_rate": 0.00029108910891089107, + "loss": 1.2578, + "step": 9560 + }, + { + "epoch": 8.061699584495418, + "grad_norm": 0.28717589378356934, + "learning_rate": 0.00029009900990099006, + "loss": 1.257, + "step": 9570 + }, + { + "epoch": 8.070123513005862, + "grad_norm": 0.2473517805337906, + "learning_rate": 0.0002891089108910891, + "loss": 1.2584, + "step": 9580 + }, + { + "epoch": 8.078547441516307, + "grad_norm": 0.238663449883461, + "learning_rate": 0.0002881188118811881, + "loss": 1.2565, + "step": 9590 + }, + { + "epoch": 8.086971370026752, + "grad_norm": 0.25168007612228394, + "learning_rate": 0.0002871287128712871, + "loss": 1.2601, + "step": 9600 + }, + { + "epoch": 8.095395298537197, + "grad_norm": 0.2553163766860962, + "learning_rate": 0.0002861386138613861, + "loss": 1.2582, + "step": 9610 + }, + { + "epoch": 8.10381922704764, + "grad_norm": 0.22442133724689484, + "learning_rate": 0.0002851485148514851, + "loss": 1.2564, + "step": 9620 + }, + { + "epoch": 8.112243155558085, + "grad_norm": 0.2428729087114334, + "learning_rate": 0.00028415841584158416, + "loss": 1.2555, + "step": 9630 + }, + { + "epoch": 8.112243155558085, + "eval_accuracy": 0.7398516451845706, + "eval_loss": 1.1434710025787354, + "eval_runtime": 884.9135, + "eval_samples_per_second": 564.322, + "eval_steps_per_second": 5.225, + "step": 9630 + }, + { + "epoch": 8.120667084068529, + "grad_norm": 0.24635536968708038, + "learning_rate": 0.00028316831683168316, + "loss": 1.256, + "step": 9640 + }, + { + "epoch": 8.129091012578975, + "grad_norm": 0.25894826650619507, + "learning_rate": 0.00028217821782178216, + "loss": 1.2559, + "step": 9650 + }, + { + "epoch": 8.13751494108942, + "grad_norm": 0.28364095091819763, + "learning_rate": 0.0002811881188118812, + "loss": 1.2558, + "step": 9660 + }, + { + "epoch": 8.145938869599863, + "grad_norm": 0.27813902497291565, + "learning_rate": 0.0002801980198019802, + "loss": 1.2551, + "step": 9670 + }, + { + "epoch": 8.154362798110308, + "grad_norm": 0.25842994451522827, + "learning_rate": 0.00027920792079207926, + "loss": 1.2566, + "step": 9680 + }, + { + "epoch": 8.162786726620752, + "grad_norm": 0.28136196732521057, + "learning_rate": 0.00027821782178217826, + "loss": 1.2558, + "step": 9690 + }, + { + "epoch": 8.171210655131198, + "grad_norm": 0.24087685346603394, + "learning_rate": 0.00027722772277227726, + "loss": 1.2548, + "step": 9700 + }, + { + "epoch": 8.179634583641642, + "grad_norm": 0.24687226116657257, + "learning_rate": 0.00027623762376237626, + "loss": 1.2585, + "step": 9710 + }, + { + "epoch": 8.188058512152086, + "grad_norm": 0.22570998966693878, + "learning_rate": 0.00027524752475247525, + "loss": 1.2534, + "step": 9720 + }, + { + "epoch": 8.188058512152086, + "eval_accuracy": 0.7402963892075639, + "eval_loss": 1.1417516469955444, + "eval_runtime": 887.2248, + "eval_samples_per_second": 562.852, + "eval_steps_per_second": 5.212, + "step": 9720 + }, + { + "epoch": 8.19648244066253, + "grad_norm": 0.2180325835943222, + "learning_rate": 0.0002742574257425743, + "loss": 1.254, + "step": 9730 + }, + { + "epoch": 8.204906369172976, + "grad_norm": 0.24650686979293823, + "learning_rate": 0.0002732673267326733, + "loss": 1.2549, + "step": 9740 + }, + { + "epoch": 8.21333029768342, + "grad_norm": 0.23055210709571838, + "learning_rate": 0.0002722772277227723, + "loss": 1.2533, + "step": 9750 + }, + { + "epoch": 8.221754226193864, + "grad_norm": 0.2486119419336319, + "learning_rate": 0.0002712871287128713, + "loss": 1.2535, + "step": 9760 + }, + { + "epoch": 8.230178154704308, + "grad_norm": 0.2295829951763153, + "learning_rate": 0.0002702970297029703, + "loss": 1.2532, + "step": 9770 + }, + { + "epoch": 8.238602083214753, + "grad_norm": 0.24997445940971375, + "learning_rate": 0.00026930693069306935, + "loss": 1.2531, + "step": 9780 + }, + { + "epoch": 8.247026011725199, + "grad_norm": 0.26696640253067017, + "learning_rate": 0.00026831683168316835, + "loss": 1.2537, + "step": 9790 + }, + { + "epoch": 8.255449940235643, + "grad_norm": 0.26139459013938904, + "learning_rate": 0.00026732673267326735, + "loss": 1.255, + "step": 9800 + }, + { + "epoch": 8.263873868746087, + "grad_norm": 0.24359402060508728, + "learning_rate": 0.00026633663366336635, + "loss": 1.2531, + "step": 9810 + }, + { + "epoch": 8.263873868746087, + "eval_accuracy": 0.7405673501883495, + "eval_loss": 1.139613389968872, + "eval_runtime": 879.601, + "eval_samples_per_second": 567.73, + "eval_steps_per_second": 5.257, + "step": 9810 + }, + { + "epoch": 8.272297797256531, + "grad_norm": 0.2327917069196701, + "learning_rate": 0.00026534653465346534, + "loss": 1.2534, + "step": 9820 + }, + { + "epoch": 8.280721725766975, + "grad_norm": 0.25629815459251404, + "learning_rate": 0.0002643564356435644, + "loss": 1.2531, + "step": 9830 + }, + { + "epoch": 8.289145654277421, + "grad_norm": 0.22450138628482819, + "learning_rate": 0.0002633663366336634, + "loss": 1.2529, + "step": 9840 + }, + { + "epoch": 8.297569582787865, + "grad_norm": 0.2623524069786072, + "learning_rate": 0.0002623762376237624, + "loss": 1.2504, + "step": 9850 + }, + { + "epoch": 8.30599351129831, + "grad_norm": 0.2159668356180191, + "learning_rate": 0.0002613861386138614, + "loss": 1.2528, + "step": 9860 + }, + { + "epoch": 8.314417439808754, + "grad_norm": 0.24267102777957916, + "learning_rate": 0.0002603960396039604, + "loss": 1.2514, + "step": 9870 + }, + { + "epoch": 8.322841368319198, + "grad_norm": 0.2541745603084564, + "learning_rate": 0.00025940594059405944, + "loss": 1.2505, + "step": 9880 + }, + { + "epoch": 8.331265296829644, + "grad_norm": 0.28231385350227356, + "learning_rate": 0.00025841584158415844, + "loss": 1.2511, + "step": 9890 + }, + { + "epoch": 8.339689225340088, + "grad_norm": 0.2412833273410797, + "learning_rate": 0.00025742574257425744, + "loss": 1.2506, + "step": 9900 + }, + { + "epoch": 8.339689225340088, + "eval_accuracy": 0.740612444763646, + "eval_loss": 1.140478491783142, + "eval_runtime": 884.9323, + "eval_samples_per_second": 564.31, + "eval_steps_per_second": 5.225, + "step": 9900 + }, + { + "epoch": 8.348113153850532, + "grad_norm": 0.2641441524028778, + "learning_rate": 0.00025643564356435644, + "loss": 1.2519, + "step": 9910 + }, + { + "epoch": 8.356537082360976, + "grad_norm": 0.2675786316394806, + "learning_rate": 0.00025544554455445543, + "loss": 1.2516, + "step": 9920 + }, + { + "epoch": 8.364961010871422, + "grad_norm": 0.2118910253047943, + "learning_rate": 0.0002544554455445545, + "loss": 1.2511, + "step": 9930 + }, + { + "epoch": 8.373384939381866, + "grad_norm": 0.27223941683769226, + "learning_rate": 0.0002534653465346535, + "loss": 1.2519, + "step": 9940 + }, + { + "epoch": 8.38180886789231, + "grad_norm": 0.2487749308347702, + "learning_rate": 0.0002524752475247525, + "loss": 1.2506, + "step": 9950 + }, + { + "epoch": 8.390232796402755, + "grad_norm": 0.2320510894060135, + "learning_rate": 0.0002514851485148515, + "loss": 1.2534, + "step": 9960 + }, + { + "epoch": 8.398656724913199, + "grad_norm": 0.2474934607744217, + "learning_rate": 0.0002504950495049505, + "loss": 1.249, + "step": 9970 + }, + { + "epoch": 8.407080653423645, + "grad_norm": 0.23778343200683594, + "learning_rate": 0.00024950495049504953, + "loss": 1.2503, + "step": 9980 + }, + { + "epoch": 8.415504581934089, + "grad_norm": 0.2715946137905121, + "learning_rate": 0.00024851485148514853, + "loss": 1.2515, + "step": 9990 + }, + { + "epoch": 8.415504581934089, + "eval_accuracy": 0.7412818791412316, + "eval_loss": 1.137270450592041, + "eval_runtime": 885.4223, + "eval_samples_per_second": 563.998, + "eval_steps_per_second": 5.222, + "step": 9990 + }, + { + "epoch": 8.423928510444533, + "grad_norm": 0.26555290818214417, + "learning_rate": 0.00024752475247524753, + "loss": 1.2485, + "step": 10000 + }, + { + "epoch": 8.432352438954977, + "grad_norm": 0.23698092997074127, + "learning_rate": 0.0002465346534653465, + "loss": 1.2498, + "step": 10010 + }, + { + "epoch": 8.440776367465421, + "grad_norm": 0.23015616834163666, + "learning_rate": 0.0002455445544554455, + "loss": 1.2482, + "step": 10020 + }, + { + "epoch": 8.449200295975867, + "grad_norm": 0.22911451756954193, + "learning_rate": 0.0002445544554455446, + "loss": 1.2503, + "step": 10030 + }, + { + "epoch": 8.457624224486311, + "grad_norm": 0.24171452224254608, + "learning_rate": 0.00024356435643564357, + "loss": 1.2485, + "step": 10040 + }, + { + "epoch": 8.466048152996756, + "grad_norm": 0.24717497825622559, + "learning_rate": 0.00024257425742574257, + "loss": 1.2503, + "step": 10050 + }, + { + "epoch": 8.4744720815072, + "grad_norm": 0.23118732869625092, + "learning_rate": 0.00024158415841584157, + "loss": 1.2488, + "step": 10060 + }, + { + "epoch": 8.482896010017644, + "grad_norm": 0.22151467204093933, + "learning_rate": 0.0002405940594059406, + "loss": 1.2484, + "step": 10070 + }, + { + "epoch": 8.49131993852809, + "grad_norm": 0.2284466177225113, + "learning_rate": 0.0002396039603960396, + "loss": 1.2487, + "step": 10080 + }, + { + "epoch": 8.49131993852809, + "eval_accuracy": 0.7414350855696202, + "eval_loss": 1.134464144706726, + "eval_runtime": 887.5421, + "eval_samples_per_second": 562.65, + "eval_steps_per_second": 5.21, + "step": 10080 + }, + { + "epoch": 8.499743867038534, + "grad_norm": 0.2377534806728363, + "learning_rate": 0.00023861386138613862, + "loss": 1.2491, + "step": 10090 + }, + { + "epoch": 8.508167795548978, + "grad_norm": 0.2649644613265991, + "learning_rate": 0.00023762376237623762, + "loss": 1.2467, + "step": 10100 + }, + { + "epoch": 8.516591724059422, + "grad_norm": 0.22302138805389404, + "learning_rate": 0.00023663366336633662, + "loss": 1.2496, + "step": 10110 + }, + { + "epoch": 8.525015652569868, + "grad_norm": 0.24170257151126862, + "learning_rate": 0.00023564356435643564, + "loss": 1.2471, + "step": 10120 + }, + { + "epoch": 8.533439581080312, + "grad_norm": 0.2645774781703949, + "learning_rate": 0.00023465346534653464, + "loss": 1.2477, + "step": 10130 + }, + { + "epoch": 8.541863509590756, + "grad_norm": 0.24155734479427338, + "learning_rate": 0.0002336633663366337, + "loss": 1.2466, + "step": 10140 + }, + { + "epoch": 8.5502874381012, + "grad_norm": 0.23023132979869843, + "learning_rate": 0.0002326732673267327, + "loss": 1.2457, + "step": 10150 + }, + { + "epoch": 8.558711366611645, + "grad_norm": 0.2243080586194992, + "learning_rate": 0.0002316831683168317, + "loss": 1.2476, + "step": 10160 + }, + { + "epoch": 8.56713529512209, + "grad_norm": 0.278157114982605, + "learning_rate": 0.00023069306930693071, + "loss": 1.2462, + "step": 10170 + }, + { + "epoch": 8.56713529512209, + "eval_accuracy": 0.7417397824056636, + "eval_loss": 1.1336922645568848, + "eval_runtime": 892.4907, + "eval_samples_per_second": 559.531, + "eval_steps_per_second": 5.181, + "step": 10170 + }, + { + "epoch": 8.575559223632535, + "grad_norm": 0.24606026709079742, + "learning_rate": 0.0002297029702970297, + "loss": 1.2478, + "step": 10180 + }, + { + "epoch": 8.583983152142979, + "grad_norm": 0.23494498431682587, + "learning_rate": 0.00022871287128712874, + "loss": 1.2463, + "step": 10190 + }, + { + "epoch": 8.592407080653423, + "grad_norm": 0.21522320806980133, + "learning_rate": 0.00022772277227722774, + "loss": 1.2479, + "step": 10200 + }, + { + "epoch": 8.60083100916387, + "grad_norm": 0.2655723989009857, + "learning_rate": 0.00022673267326732673, + "loss": 1.2468, + "step": 10210 + }, + { + "epoch": 8.609254937674313, + "grad_norm": 0.2444898933172226, + "learning_rate": 0.00022574257425742576, + "loss": 1.246, + "step": 10220 + }, + { + "epoch": 8.617678866184757, + "grad_norm": 0.2277156114578247, + "learning_rate": 0.00022475247524752476, + "loss": 1.2466, + "step": 10230 + }, + { + "epoch": 8.626102794695202, + "grad_norm": 0.22111962735652924, + "learning_rate": 0.00022376237623762378, + "loss": 1.2451, + "step": 10240 + }, + { + "epoch": 8.634526723205646, + "grad_norm": 0.23199447989463806, + "learning_rate": 0.00022277227722772278, + "loss": 1.2463, + "step": 10250 + }, + { + "epoch": 8.642950651716092, + "grad_norm": 0.22960427403450012, + "learning_rate": 0.00022178217821782178, + "loss": 1.2465, + "step": 10260 + }, + { + "epoch": 8.642950651716092, + "eval_accuracy": 0.7420823467349104, + "eval_loss": 1.1322184801101685, + "eval_runtime": 883.7567, + "eval_samples_per_second": 565.061, + "eval_steps_per_second": 5.232, + "step": 10260 + }, + { + "epoch": 8.651374580226536, + "grad_norm": 0.290622353553772, + "learning_rate": 0.0002207920792079208, + "loss": 1.2444, + "step": 10270 + }, + { + "epoch": 8.65979850873698, + "grad_norm": 0.2639337480068207, + "learning_rate": 0.0002198019801980198, + "loss": 1.247, + "step": 10280 + }, + { + "epoch": 8.668222437247424, + "grad_norm": 0.22477252781391144, + "learning_rate": 0.00021881188118811883, + "loss": 1.2443, + "step": 10290 + }, + { + "epoch": 8.676646365757868, + "grad_norm": 0.2989983558654785, + "learning_rate": 0.00021782178217821783, + "loss": 1.2461, + "step": 10300 + }, + { + "epoch": 8.685070294268314, + "grad_norm": 0.22259776294231415, + "learning_rate": 0.00021683168316831682, + "loss": 1.2438, + "step": 10310 + }, + { + "epoch": 8.693494222778758, + "grad_norm": 0.21380363404750824, + "learning_rate": 0.00021584158415841585, + "loss": 1.2414, + "step": 10320 + }, + { + "epoch": 8.701918151289203, + "grad_norm": 0.23593538999557495, + "learning_rate": 0.00021485148514851485, + "loss": 1.2454, + "step": 10330 + }, + { + "epoch": 8.710342079799647, + "grad_norm": 0.25987499952316284, + "learning_rate": 0.00021386138613861387, + "loss": 1.2444, + "step": 10340 + }, + { + "epoch": 8.71876600831009, + "grad_norm": 0.21150009334087372, + "learning_rate": 0.00021287128712871287, + "loss": 1.2414, + "step": 10350 + }, + { + "epoch": 8.71876600831009, + "eval_accuracy": 0.7421671573662553, + "eval_loss": 1.1316900253295898, + "eval_runtime": 893.0033, + "eval_samples_per_second": 559.21, + "eval_steps_per_second": 5.178, + "step": 10350 + }, + { + "epoch": 8.727189936820537, + "grad_norm": 0.23628725111484528, + "learning_rate": 0.00021188118811881187, + "loss": 1.2432, + "step": 10360 + }, + { + "epoch": 8.735613865330981, + "grad_norm": 0.24477533996105194, + "learning_rate": 0.0002108910891089109, + "loss": 1.2447, + "step": 10370 + }, + { + "epoch": 8.744037793841425, + "grad_norm": 0.2156253159046173, + "learning_rate": 0.0002099009900990099, + "loss": 1.2452, + "step": 10380 + }, + { + "epoch": 8.75246172235187, + "grad_norm": 0.27982792258262634, + "learning_rate": 0.00020891089108910892, + "loss": 1.2434, + "step": 10390 + }, + { + "epoch": 8.760885650862313, + "grad_norm": 0.24025356769561768, + "learning_rate": 0.00020792079207920792, + "loss": 1.244, + "step": 10400 + }, + { + "epoch": 8.76930957937276, + "grad_norm": 0.22768454253673553, + "learning_rate": 0.00020693069306930691, + "loss": 1.2427, + "step": 10410 + }, + { + "epoch": 8.777733507883204, + "grad_norm": 0.2676762640476227, + "learning_rate": 0.00020594059405940594, + "loss": 1.244, + "step": 10420 + }, + { + "epoch": 8.786157436393648, + "grad_norm": 0.23502378165721893, + "learning_rate": 0.00020495049504950494, + "loss": 1.244, + "step": 10430 + }, + { + "epoch": 8.794581364904092, + "grad_norm": 0.23354895412921906, + "learning_rate": 0.00020396039603960396, + "loss": 1.2435, + "step": 10440 + }, + { + "epoch": 8.794581364904092, + "eval_accuracy": 0.7425177306861277, + "eval_loss": 1.1301963329315186, + "eval_runtime": 885.137, + "eval_samples_per_second": 564.179, + "eval_steps_per_second": 5.224, + "step": 10440 + }, + { + "epoch": 8.803005293414538, + "grad_norm": 0.22738757729530334, + "learning_rate": 0.000202970297029703, + "loss": 1.2426, + "step": 10450 + }, + { + "epoch": 8.811429221924982, + "grad_norm": 0.20702116191387177, + "learning_rate": 0.00020198019801980199, + "loss": 1.243, + "step": 10460 + }, + { + "epoch": 8.819853150435426, + "grad_norm": 0.20945468544960022, + "learning_rate": 0.000200990099009901, + "loss": 1.2411, + "step": 10470 + }, + { + "epoch": 8.82827707894587, + "grad_norm": 0.21654458343982697, + "learning_rate": 0.0002, + "loss": 1.2428, + "step": 10480 + }, + { + "epoch": 8.836701007456314, + "grad_norm": 0.2217228263616562, + "learning_rate": 0.00019900990099009903, + "loss": 1.2405, + "step": 10490 + }, + { + "epoch": 8.84512493596676, + "grad_norm": 0.27619633078575134, + "learning_rate": 0.00019801980198019803, + "loss": 1.2424, + "step": 10500 + }, + { + "epoch": 8.853548864477204, + "grad_norm": 0.2569934129714966, + "learning_rate": 0.00019702970297029703, + "loss": 1.2418, + "step": 10510 + }, + { + "epoch": 8.861972792987649, + "grad_norm": 0.2570299804210663, + "learning_rate": 0.00019603960396039606, + "loss": 1.2423, + "step": 10520 + }, + { + "epoch": 8.870396721498093, + "grad_norm": 0.22972337901592255, + "learning_rate": 0.00019504950495049505, + "loss": 1.2399, + "step": 10530 + }, + { + "epoch": 8.870396721498093, + "eval_accuracy": 0.7427001211705735, + "eval_loss": 1.1304486989974976, + "eval_runtime": 881.4454, + "eval_samples_per_second": 566.542, + "eval_steps_per_second": 5.246, + "step": 10530 + }, + { + "epoch": 8.878820650008539, + "grad_norm": 0.2365693300962448, + "learning_rate": 0.00019405940594059408, + "loss": 1.2426, + "step": 10540 + }, + { + "epoch": 8.887244578518983, + "grad_norm": 0.2252751588821411, + "learning_rate": 0.00019306930693069308, + "loss": 1.2406, + "step": 10550 + }, + { + "epoch": 8.895668507029427, + "grad_norm": 0.2205033302307129, + "learning_rate": 0.00019207920792079208, + "loss": 1.2419, + "step": 10560 + }, + { + "epoch": 8.904092435539871, + "grad_norm": 0.21468041837215424, + "learning_rate": 0.0001910891089108911, + "loss": 1.2406, + "step": 10570 + }, + { + "epoch": 8.912516364050315, + "grad_norm": 0.23669223487377167, + "learning_rate": 0.0001900990099009901, + "loss": 1.2401, + "step": 10580 + }, + { + "epoch": 8.920940292560761, + "grad_norm": 0.2412618100643158, + "learning_rate": 0.00018910891089108913, + "loss": 1.2402, + "step": 10590 + }, + { + "epoch": 8.929364221071205, + "grad_norm": 0.21675223112106323, + "learning_rate": 0.00018811881188118812, + "loss": 1.2417, + "step": 10600 + }, + { + "epoch": 8.93778814958165, + "grad_norm": 0.24683676660060883, + "learning_rate": 0.00018712871287128712, + "loss": 1.2417, + "step": 10610 + }, + { + "epoch": 8.946212078092094, + "grad_norm": 0.21681492030620575, + "learning_rate": 0.00018613861386138615, + "loss": 1.2408, + "step": 10620 + }, + { + "epoch": 8.946212078092094, + "eval_accuracy": 0.7428579001690714, + "eval_loss": 1.1290760040283203, + "eval_runtime": 889.1418, + "eval_samples_per_second": 561.638, + "eval_steps_per_second": 5.201, + "step": 10620 + }, + { + "epoch": 8.954636006602538, + "grad_norm": 0.22117485105991364, + "learning_rate": 0.00018514851485148514, + "loss": 1.2399, + "step": 10630 + }, + { + "epoch": 8.963059935112984, + "grad_norm": 0.2180255800485611, + "learning_rate": 0.00018415841584158417, + "loss": 1.2378, + "step": 10640 + }, + { + "epoch": 8.971483863623428, + "grad_norm": 0.23244567215442657, + "learning_rate": 0.00018316831683168317, + "loss": 1.2402, + "step": 10650 + }, + { + "epoch": 8.979907792133872, + "grad_norm": 0.23777294158935547, + "learning_rate": 0.00018217821782178217, + "loss": 1.2417, + "step": 10660 + }, + { + "epoch": 8.988331720644316, + "grad_norm": 0.26418906450271606, + "learning_rate": 0.0001811881188118812, + "loss": 1.238, + "step": 10670 + }, + { + "epoch": 8.99675564915476, + "grad_norm": 0.21142803132534027, + "learning_rate": 0.0001801980198019802, + "loss": 1.2384, + "step": 10680 + }, + { + "epoch": 9.005179577665206, + "grad_norm": 0.21976542472839355, + "learning_rate": 0.00017920792079207922, + "loss": 1.2399, + "step": 10690 + }, + { + "epoch": 9.01360350617565, + "grad_norm": 0.2216147631406784, + "learning_rate": 0.0001782178217821782, + "loss": 1.2391, + "step": 10700 + }, + { + "epoch": 9.022027434686095, + "grad_norm": 0.1873018890619278, + "learning_rate": 0.0001772277227722772, + "loss": 1.2368, + "step": 10710 + }, + { + "epoch": 9.022027434686095, + "eval_accuracy": 0.7431224622062498, + "eval_loss": 1.1265127658843994, + "eval_runtime": 891.5668, + "eval_samples_per_second": 560.111, + "eval_steps_per_second": 5.186, + "step": 10710 + }, + { + "epoch": 9.030451363196539, + "grad_norm": 0.23913191258907318, + "learning_rate": 0.00017623762376237624, + "loss": 1.2404, + "step": 10720 + }, + { + "epoch": 9.038875291706983, + "grad_norm": 0.21578449010849, + "learning_rate": 0.00017524752475247524, + "loss": 1.2388, + "step": 10730 + }, + { + "epoch": 9.047299220217429, + "grad_norm": 0.2038455754518509, + "learning_rate": 0.00017425742574257426, + "loss": 1.2402, + "step": 10740 + }, + { + "epoch": 9.055723148727873, + "grad_norm": 0.21903488039970398, + "learning_rate": 0.00017326732673267326, + "loss": 1.2383, + "step": 10750 + }, + { + "epoch": 9.064147077238317, + "grad_norm": 0.21970726549625397, + "learning_rate": 0.00017227722772277226, + "loss": 1.2386, + "step": 10760 + }, + { + "epoch": 9.072571005748761, + "grad_norm": 0.22701360285282135, + "learning_rate": 0.0001712871287128713, + "loss": 1.2391, + "step": 10770 + }, + { + "epoch": 9.080994934259207, + "grad_norm": 0.21777622401714325, + "learning_rate": 0.0001702970297029703, + "loss": 1.2388, + "step": 10780 + }, + { + "epoch": 9.089418862769651, + "grad_norm": 0.2336941659450531, + "learning_rate": 0.00016930693069306933, + "loss": 1.2383, + "step": 10790 + }, + { + "epoch": 9.097842791280096, + "grad_norm": 0.20545706152915955, + "learning_rate": 0.00016831683168316833, + "loss": 1.2376, + "step": 10800 + }, + { + "epoch": 9.097842791280096, + "eval_accuracy": 0.7435866345331611, + "eval_loss": 1.1250243186950684, + "eval_runtime": 885.3582, + "eval_samples_per_second": 564.038, + "eval_steps_per_second": 5.223, + "step": 10800 + }, + { + "epoch": 9.10626671979054, + "grad_norm": 0.23678459227085114, + "learning_rate": 0.00016732673267326733, + "loss": 1.2394, + "step": 10810 + }, + { + "epoch": 9.114690648300984, + "grad_norm": 0.24195948243141174, + "learning_rate": 0.00016633663366336635, + "loss": 1.238, + "step": 10820 + }, + { + "epoch": 9.12311457681143, + "grad_norm": 0.20026259124279022, + "learning_rate": 0.00016534653465346535, + "loss": 1.2364, + "step": 10830 + }, + { + "epoch": 9.131538505321874, + "grad_norm": 0.21753010153770447, + "learning_rate": 0.00016435643564356438, + "loss": 1.238, + "step": 10840 + }, + { + "epoch": 9.139962433832318, + "grad_norm": 0.20273657143115997, + "learning_rate": 0.00016336633663366338, + "loss": 1.2374, + "step": 10850 + }, + { + "epoch": 9.148386362342762, + "grad_norm": 0.21302086114883423, + "learning_rate": 0.00016237623762376237, + "loss": 1.2372, + "step": 10860 + }, + { + "epoch": 9.156810290853207, + "grad_norm": 0.23342467844486237, + "learning_rate": 0.0001613861386138614, + "loss": 1.2378, + "step": 10870 + }, + { + "epoch": 9.165234219363652, + "grad_norm": 0.24393875896930695, + "learning_rate": 0.0001603960396039604, + "loss": 1.2362, + "step": 10880 + }, + { + "epoch": 9.173658147874097, + "grad_norm": 0.19604717195034027, + "learning_rate": 0.00015940594059405942, + "loss": 1.237, + "step": 10890 + }, + { + "epoch": 9.173658147874097, + "eval_accuracy": 0.743667723412049, + "eval_loss": 1.124830722808838, + "eval_runtime": 887.4222, + "eval_samples_per_second": 562.727, + "eval_steps_per_second": 5.211, + "step": 10890 + }, + { + "epoch": 9.18208207638454, + "grad_norm": 0.19619697332382202, + "learning_rate": 0.00015841584158415842, + "loss": 1.2356, + "step": 10900 + }, + { + "epoch": 9.190506004894985, + "grad_norm": 0.20415499806404114, + "learning_rate": 0.00015742574257425742, + "loss": 1.2373, + "step": 10910 + }, + { + "epoch": 9.19892993340543, + "grad_norm": 0.21602529287338257, + "learning_rate": 0.00015643564356435644, + "loss": 1.2369, + "step": 10920 + }, + { + "epoch": 9.207353861915875, + "grad_norm": 0.2266259491443634, + "learning_rate": 0.00015544554455445544, + "loss": 1.236, + "step": 10930 + }, + { + "epoch": 9.21577779042632, + "grad_norm": 0.2172340452671051, + "learning_rate": 0.00015445544554455447, + "loss": 1.236, + "step": 10940 + }, + { + "epoch": 9.224201718936763, + "grad_norm": 0.21929994225502014, + "learning_rate": 0.00015346534653465347, + "loss": 1.2381, + "step": 10950 + }, + { + "epoch": 9.232625647447207, + "grad_norm": 0.20617130398750305, + "learning_rate": 0.00015247524752475246, + "loss": 1.2346, + "step": 10960 + }, + { + "epoch": 9.241049575957653, + "grad_norm": 0.2271021008491516, + "learning_rate": 0.0001514851485148515, + "loss": 1.2364, + "step": 10970 + }, + { + "epoch": 9.249473504468098, + "grad_norm": 0.22377552092075348, + "learning_rate": 0.0001504950495049505, + "loss": 1.2342, + "step": 10980 + }, + { + "epoch": 9.249473504468098, + "eval_accuracy": 0.7438243969178056, + "eval_loss": 1.124144434928894, + "eval_runtime": 880.0851, + "eval_samples_per_second": 567.418, + "eval_steps_per_second": 5.254, + "step": 10980 + }, + { + "epoch": 9.257897432978542, + "grad_norm": 0.23195216059684753, + "learning_rate": 0.0001495049504950495, + "loss": 1.2347, + "step": 10990 + }, + { + "epoch": 9.266321361488986, + "grad_norm": 0.19934554398059845, + "learning_rate": 0.0001485148514851485, + "loss": 1.2359, + "step": 11000 + }, + { + "epoch": 9.27474528999943, + "grad_norm": 0.19541287422180176, + "learning_rate": 0.0001475247524752475, + "loss": 1.2342, + "step": 11010 + }, + { + "epoch": 9.283169218509876, + "grad_norm": 0.2204955518245697, + "learning_rate": 0.00014653465346534653, + "loss": 1.2356, + "step": 11020 + }, + { + "epoch": 9.29159314702032, + "grad_norm": 0.22855669260025024, + "learning_rate": 0.00014554455445544553, + "loss": 1.2367, + "step": 11030 + }, + { + "epoch": 9.300017075530764, + "grad_norm": 0.20308193564414978, + "learning_rate": 0.00014455445544554456, + "loss": 1.235, + "step": 11040 + }, + { + "epoch": 9.308441004041208, + "grad_norm": 0.18201188743114471, + "learning_rate": 0.00014356435643564356, + "loss": 1.235, + "step": 11050 + }, + { + "epoch": 9.316864932551653, + "grad_norm": 0.199186772108078, + "learning_rate": 0.00014257425742574255, + "loss": 1.2348, + "step": 11060 + }, + { + "epoch": 9.325288861062099, + "grad_norm": 0.23214493691921234, + "learning_rate": 0.00014158415841584158, + "loss": 1.2335, + "step": 11070 + }, + { + "epoch": 9.325288861062099, + "eval_accuracy": 0.7438911749364814, + "eval_loss": 1.123384714126587, + "eval_runtime": 888.3176, + "eval_samples_per_second": 562.159, + "eval_steps_per_second": 5.205, + "step": 11070 + }, + { + "epoch": 9.333712789572543, + "grad_norm": 0.2128278762102127, + "learning_rate": 0.0001405940594059406, + "loss": 1.2337, + "step": 11080 + }, + { + "epoch": 9.342136718082987, + "grad_norm": 0.20257510244846344, + "learning_rate": 0.00013960396039603963, + "loss": 1.2357, + "step": 11090 + }, + { + "epoch": 9.350560646593431, + "grad_norm": 0.22038786113262177, + "learning_rate": 0.00013861386138613863, + "loss": 1.2333, + "step": 11100 + }, + { + "epoch": 9.358984575103877, + "grad_norm": 0.2351042628288269, + "learning_rate": 0.00013762376237623763, + "loss": 1.235, + "step": 11110 + }, + { + "epoch": 9.367408503614321, + "grad_norm": 0.2042153775691986, + "learning_rate": 0.00013663366336633665, + "loss": 1.2339, + "step": 11120 + }, + { + "epoch": 9.375832432124765, + "grad_norm": 0.20065917074680328, + "learning_rate": 0.00013564356435643565, + "loss": 1.234, + "step": 11130 + }, + { + "epoch": 9.38425636063521, + "grad_norm": 0.22544540464878082, + "learning_rate": 0.00013465346534653468, + "loss": 1.2319, + "step": 11140 + }, + { + "epoch": 9.392680289145654, + "grad_norm": 0.2352074533700943, + "learning_rate": 0.00013366336633663367, + "loss": 1.2347, + "step": 11150 + }, + { + "epoch": 9.4011042176561, + "grad_norm": 0.2452593892812729, + "learning_rate": 0.00013267326732673267, + "loss": 1.2343, + "step": 11160 + }, + { + "epoch": 9.4011042176561, + "eval_accuracy": 0.7445740208736444, + "eval_loss": 1.1202077865600586, + "eval_runtime": 879.3984, + "eval_samples_per_second": 567.861, + "eval_steps_per_second": 5.258, + "step": 11160 + }, + { + "epoch": 9.409528146166544, + "grad_norm": 0.20848217606544495, + "learning_rate": 0.0001316831683168317, + "loss": 1.2315, + "step": 11170 + }, + { + "epoch": 9.417952074676988, + "grad_norm": 0.20628029108047485, + "learning_rate": 0.0001306930693069307, + "loss": 1.2326, + "step": 11180 + }, + { + "epoch": 9.426376003187432, + "grad_norm": 0.199026957154274, + "learning_rate": 0.00012970297029702972, + "loss": 1.2329, + "step": 11190 + }, + { + "epoch": 9.434799931697876, + "grad_norm": 0.21373671293258667, + "learning_rate": 0.00012871287128712872, + "loss": 1.2326, + "step": 11200 + }, + { + "epoch": 9.443223860208322, + "grad_norm": 0.2015460729598999, + "learning_rate": 0.00012772277227722772, + "loss": 1.2327, + "step": 11210 + }, + { + "epoch": 9.451647788718766, + "grad_norm": 0.2228008210659027, + "learning_rate": 0.00012673267326732674, + "loss": 1.2334, + "step": 11220 + }, + { + "epoch": 9.46007171722921, + "grad_norm": 0.21561528742313385, + "learning_rate": 0.00012574257425742574, + "loss": 1.233, + "step": 11230 + }, + { + "epoch": 9.468495645739655, + "grad_norm": 0.2073032706975937, + "learning_rate": 0.00012475247524752477, + "loss": 1.2314, + "step": 11240 + }, + { + "epoch": 9.4769195742501, + "grad_norm": 0.19552037119865417, + "learning_rate": 0.00012376237623762376, + "loss": 1.2333, + "step": 11250 + }, + { + "epoch": 9.4769195742501, + "eval_accuracy": 0.744401638855597, + "eval_loss": 1.1210565567016602, + "eval_runtime": 888.2535, + "eval_samples_per_second": 562.2, + "eval_steps_per_second": 5.206, + "step": 11250 + }, + { + "epoch": 9.485343502760545, + "grad_norm": 0.20909276604652405, + "learning_rate": 0.00012277227722772276, + "loss": 1.2332, + "step": 11260 + }, + { + "epoch": 9.493767431270989, + "grad_norm": 0.210150346159935, + "learning_rate": 0.00012178217821782179, + "loss": 1.2308, + "step": 11270 + }, + { + "epoch": 9.502191359781433, + "grad_norm": 0.1982164978981018, + "learning_rate": 0.00012079207920792079, + "loss": 1.2305, + "step": 11280 + }, + { + "epoch": 9.510615288291877, + "grad_norm": 0.2049965262413025, + "learning_rate": 0.0001198019801980198, + "loss": 1.2334, + "step": 11290 + }, + { + "epoch": 9.519039216802323, + "grad_norm": 0.18243108689785004, + "learning_rate": 0.00011881188118811881, + "loss": 1.2335, + "step": 11300 + }, + { + "epoch": 9.527463145312767, + "grad_norm": 0.2009328156709671, + "learning_rate": 0.00011782178217821782, + "loss": 1.2313, + "step": 11310 + }, + { + "epoch": 9.535887073823211, + "grad_norm": 0.19226033985614777, + "learning_rate": 0.00011683168316831685, + "loss": 1.2332, + "step": 11320 + }, + { + "epoch": 9.544311002333655, + "grad_norm": 0.20206843316555023, + "learning_rate": 0.00011584158415841584, + "loss": 1.2333, + "step": 11330 + }, + { + "epoch": 9.5527349308441, + "grad_norm": 0.20852382481098175, + "learning_rate": 0.00011485148514851486, + "loss": 1.2322, + "step": 11340 + }, + { + "epoch": 9.5527349308441, + "eval_accuracy": 0.7448142064493213, + "eval_loss": 1.1182734966278076, + "eval_runtime": 889.106, + "eval_samples_per_second": 561.661, + "eval_steps_per_second": 5.201, + "step": 11340 + }, + { + "epoch": 9.561158859354546, + "grad_norm": 0.19330884516239166, + "learning_rate": 0.00011386138613861387, + "loss": 1.2294, + "step": 11350 + }, + { + "epoch": 9.56958278786499, + "grad_norm": 0.17878125607967377, + "learning_rate": 0.00011287128712871288, + "loss": 1.2301, + "step": 11360 + }, + { + "epoch": 9.578006716375434, + "grad_norm": 0.20679515600204468, + "learning_rate": 0.00011188118811881189, + "loss": 1.2302, + "step": 11370 + }, + { + "epoch": 9.586430644885878, + "grad_norm": 0.20949432253837585, + "learning_rate": 0.00011089108910891089, + "loss": 1.2308, + "step": 11380 + }, + { + "epoch": 9.594854573396322, + "grad_norm": 0.21771377325057983, + "learning_rate": 0.0001099009900990099, + "loss": 1.2313, + "step": 11390 + }, + { + "epoch": 9.603278501906768, + "grad_norm": 0.1953546106815338, + "learning_rate": 0.00010891089108910891, + "loss": 1.2305, + "step": 11400 + }, + { + "epoch": 9.611702430417212, + "grad_norm": 0.20105966925621033, + "learning_rate": 0.00010792079207920792, + "loss": 1.2294, + "step": 11410 + }, + { + "epoch": 9.620126358927656, + "grad_norm": 0.20625823736190796, + "learning_rate": 0.00010693069306930694, + "loss": 1.2287, + "step": 11420 + }, + { + "epoch": 9.6285502874381, + "grad_norm": 0.2024402767419815, + "learning_rate": 0.00010594059405940593, + "loss": 1.2309, + "step": 11430 + }, + { + "epoch": 9.6285502874381, + "eval_accuracy": 0.7450274546722492, + "eval_loss": 1.1177880764007568, + "eval_runtime": 889.3816, + "eval_samples_per_second": 561.487, + "eval_steps_per_second": 5.199, + "step": 11430 + }, + { + "epoch": 9.636974215948547, + "grad_norm": 0.20498992502689362, + "learning_rate": 0.00010495049504950495, + "loss": 1.228, + "step": 11440 + }, + { + "epoch": 9.64539814445899, + "grad_norm": 0.18760576844215393, + "learning_rate": 0.00010396039603960396, + "loss": 1.2287, + "step": 11450 + }, + { + "epoch": 9.653822072969435, + "grad_norm": 0.2059292048215866, + "learning_rate": 0.00010297029702970297, + "loss": 1.2284, + "step": 11460 + }, + { + "epoch": 9.662246001479879, + "grad_norm": 0.20898665487766266, + "learning_rate": 0.00010198019801980198, + "loss": 1.231, + "step": 11470 + }, + { + "epoch": 9.670669929990323, + "grad_norm": 0.20303255319595337, + "learning_rate": 0.00010099009900990099, + "loss": 1.2302, + "step": 11480 + }, + { + "epoch": 9.679093858500769, + "grad_norm": 0.20947200059890747, + "learning_rate": 0.0001, + "loss": 1.2314, + "step": 11490 + }, + { + "epoch": 9.687517787011213, + "grad_norm": 0.20898771286010742, + "learning_rate": 9.900990099009902e-05, + "loss": 1.2294, + "step": 11500 + }, + { + "epoch": 9.695941715521657, + "grad_norm": 0.18466849625110626, + "learning_rate": 9.801980198019803e-05, + "loss": 1.2309, + "step": 11510 + }, + { + "epoch": 9.704365644032102, + "grad_norm": 0.1769760698080063, + "learning_rate": 9.702970297029704e-05, + "loss": 1.2282, + "step": 11520 + }, + { + "epoch": 9.704365644032102, + "eval_accuracy": 0.7449189101862153, + "eval_loss": 1.118354082107544, + "eval_runtime": 879.3937, + "eval_samples_per_second": 567.864, + "eval_steps_per_second": 5.258, + "step": 11520 + }, + { + "epoch": 9.712789572542546, + "grad_norm": 0.18270480632781982, + "learning_rate": 9.603960396039604e-05, + "loss": 1.2286, + "step": 11530 + }, + { + "epoch": 9.721213501052992, + "grad_norm": 0.1812662035226822, + "learning_rate": 9.504950495049505e-05, + "loss": 1.2279, + "step": 11540 + }, + { + "epoch": 9.729637429563436, + "grad_norm": 0.20632152259349823, + "learning_rate": 9.405940594059406e-05, + "loss": 1.2295, + "step": 11550 + }, + { + "epoch": 9.73806135807388, + "grad_norm": 0.19512777030467987, + "learning_rate": 9.306930693069307e-05, + "loss": 1.2292, + "step": 11560 + }, + { + "epoch": 9.746485286584324, + "grad_norm": 0.19665522873401642, + "learning_rate": 9.207920792079209e-05, + "loss": 1.2294, + "step": 11570 + }, + { + "epoch": 9.75490921509477, + "grad_norm": 0.18540680408477783, + "learning_rate": 9.108910891089108e-05, + "loss": 1.2297, + "step": 11580 + }, + { + "epoch": 9.763333143605214, + "grad_norm": 0.21472424268722534, + "learning_rate": 9.00990099009901e-05, + "loss": 1.2277, + "step": 11590 + }, + { + "epoch": 9.771757072115658, + "grad_norm": 0.2189822793006897, + "learning_rate": 8.91089108910891e-05, + "loss": 1.2293, + "step": 11600 + }, + { + "epoch": 9.780181000626103, + "grad_norm": 0.19983939826488495, + "learning_rate": 8.811881188118812e-05, + "loss": 1.2287, + "step": 11610 + }, + { + "epoch": 9.780181000626103, + "eval_accuracy": 0.7452771934107217, + "eval_loss": 1.1166530847549438, + "eval_runtime": 886.9822, + "eval_samples_per_second": 563.006, + "eval_steps_per_second": 5.213, + "step": 11610 + }, + { + "epoch": 9.788604929136547, + "grad_norm": 0.1868014931678772, + "learning_rate": 8.712871287128713e-05, + "loss": 1.2296, + "step": 11620 + }, + { + "epoch": 9.797028857646993, + "grad_norm": 0.2048911601305008, + "learning_rate": 8.613861386138613e-05, + "loss": 1.2291, + "step": 11630 + }, + { + "epoch": 9.805452786157437, + "grad_norm": 0.2088802009820938, + "learning_rate": 8.514851485148515e-05, + "loss": 1.2271, + "step": 11640 + }, + { + "epoch": 9.813876714667881, + "grad_norm": 0.20058122277259827, + "learning_rate": 8.415841584158417e-05, + "loss": 1.2296, + "step": 11650 + }, + { + "epoch": 9.822300643178325, + "grad_norm": 0.1964656561613083, + "learning_rate": 8.316831683168318e-05, + "loss": 1.2272, + "step": 11660 + }, + { + "epoch": 9.83072457168877, + "grad_norm": 0.20214231312274933, + "learning_rate": 8.217821782178219e-05, + "loss": 1.2271, + "step": 11670 + }, + { + "epoch": 9.839148500199215, + "grad_norm": 0.19427910447120667, + "learning_rate": 8.118811881188119e-05, + "loss": 1.2264, + "step": 11680 + }, + { + "epoch": 9.84757242870966, + "grad_norm": 0.18842646479606628, + "learning_rate": 8.01980198019802e-05, + "loss": 1.2265, + "step": 11690 + }, + { + "epoch": 9.855996357220103, + "grad_norm": 0.18588952720165253, + "learning_rate": 7.920792079207921e-05, + "loss": 1.2279, + "step": 11700 + }, + { + "epoch": 9.855996357220103, + "eval_accuracy": 0.7454476541387279, + "eval_loss": 1.1153885126113892, + "eval_runtime": 879.2745, + "eval_samples_per_second": 567.941, + "eval_steps_per_second": 5.259, + "step": 11700 + }, + { + "epoch": 9.864420285730548, + "grad_norm": 0.18300525844097137, + "learning_rate": 7.821782178217822e-05, + "loss": 1.2268, + "step": 11710 + }, + { + "epoch": 9.872844214240992, + "grad_norm": 0.18436813354492188, + "learning_rate": 7.722772277227723e-05, + "loss": 1.2256, + "step": 11720 + }, + { + "epoch": 9.881268142751438, + "grad_norm": 0.19767363369464874, + "learning_rate": 7.623762376237623e-05, + "loss": 1.2246, + "step": 11730 + }, + { + "epoch": 9.889692071261882, + "grad_norm": 0.1749766319990158, + "learning_rate": 7.524752475247524e-05, + "loss": 1.2277, + "step": 11740 + }, + { + "epoch": 9.898115999772326, + "grad_norm": 0.17161355912685394, + "learning_rate": 7.425742574257426e-05, + "loss": 1.2262, + "step": 11750 + }, + { + "epoch": 9.90653992828277, + "grad_norm": 0.190937340259552, + "learning_rate": 7.326732673267327e-05, + "loss": 1.2276, + "step": 11760 + }, + { + "epoch": 9.914963856793216, + "grad_norm": 0.18256962299346924, + "learning_rate": 7.227722772277228e-05, + "loss": 1.2274, + "step": 11770 + }, + { + "epoch": 9.92338778530366, + "grad_norm": 0.1912631094455719, + "learning_rate": 7.128712871287128e-05, + "loss": 1.2243, + "step": 11780 + }, + { + "epoch": 9.931811713814104, + "grad_norm": 0.19331537187099457, + "learning_rate": 7.02970297029703e-05, + "loss": 1.2261, + "step": 11790 + }, + { + "epoch": 9.931811713814104, + "eval_accuracy": 0.7455543705350357, + "eval_loss": 1.115136981010437, + "eval_runtime": 887.3277, + "eval_samples_per_second": 562.786, + "eval_steps_per_second": 5.211, + "step": 11790 + }, + { + "epoch": 9.940235642324549, + "grad_norm": 0.17607170343399048, + "learning_rate": 6.930693069306931e-05, + "loss": 1.228, + "step": 11800 + }, + { + "epoch": 9.948659570834993, + "grad_norm": 0.17280788719654083, + "learning_rate": 6.831683168316833e-05, + "loss": 1.2269, + "step": 11810 + }, + { + "epoch": 9.957083499345439, + "grad_norm": 0.19290916621685028, + "learning_rate": 6.732673267326734e-05, + "loss": 1.2279, + "step": 11820 + }, + { + "epoch": 9.965507427855883, + "grad_norm": 0.19125664234161377, + "learning_rate": 6.633663366336634e-05, + "loss": 1.227, + "step": 11830 + }, + { + "epoch": 9.973931356366327, + "grad_norm": 0.18251217901706696, + "learning_rate": 6.534653465346535e-05, + "loss": 1.2254, + "step": 11840 + }, + { + "epoch": 9.982355284876771, + "grad_norm": 0.19647039473056793, + "learning_rate": 6.435643564356436e-05, + "loss": 1.2261, + "step": 11850 + }, + { + "epoch": 9.990779213387215, + "grad_norm": 0.17714038491249084, + "learning_rate": 6.336633663366337e-05, + "loss": 1.2276, + "step": 11860 + }, + { + "epoch": 9.999203141897661, + "grad_norm": 0.18365037441253662, + "learning_rate": 6.237623762376238e-05, + "loss": 1.2261, + "step": 11870 + }, + { + "epoch": 10.007627070408105, + "grad_norm": 0.1910678595304489, + "learning_rate": 6.138613861386138e-05, + "loss": 1.2244, + "step": 11880 + }, + { + "epoch": 10.007627070408105, + "eval_accuracy": 0.7456593741030724, + "eval_loss": 1.1154232025146484, + "eval_runtime": 887.0764, + "eval_samples_per_second": 562.946, + "eval_steps_per_second": 5.213, + "step": 11880 + }, + { + "epoch": 10.01605099891855, + "grad_norm": 0.18324702978134155, + "learning_rate": 6.039603960396039e-05, + "loss": 1.2267, + "step": 11890 + }, + { + "epoch": 10.024474927428994, + "grad_norm": 0.1686498522758484, + "learning_rate": 5.9405940594059404e-05, + "loss": 1.2242, + "step": 11900 + }, + { + "epoch": 10.03289885593944, + "grad_norm": 0.17256265878677368, + "learning_rate": 5.841584158415842e-05, + "loss": 1.2239, + "step": 11910 + }, + { + "epoch": 10.041322784449884, + "grad_norm": 0.19624483585357666, + "learning_rate": 5.742574257425743e-05, + "loss": 1.2258, + "step": 11920 + }, + { + "epoch": 10.049746712960328, + "grad_norm": 0.17262500524520874, + "learning_rate": 5.643564356435644e-05, + "loss": 1.2258, + "step": 11930 + }, + { + "epoch": 10.058170641470772, + "grad_norm": 0.1741054356098175, + "learning_rate": 5.5445544554455445e-05, + "loss": 1.2245, + "step": 11940 + }, + { + "epoch": 10.066594569981216, + "grad_norm": 0.17313139140605927, + "learning_rate": 5.4455445544554456e-05, + "loss": 1.2256, + "step": 11950 + }, + { + "epoch": 10.075018498491662, + "grad_norm": 0.18322905898094177, + "learning_rate": 5.346534653465347e-05, + "loss": 1.2243, + "step": 11960 + }, + { + "epoch": 10.083442427002106, + "grad_norm": 0.18261946737766266, + "learning_rate": 5.247524752475247e-05, + "loss": 1.2252, + "step": 11970 + }, + { + "epoch": 10.083442427002106, + "eval_accuracy": 0.7457714664313748, + "eval_loss": 1.1143237352371216, + "eval_runtime": 887.1041, + "eval_samples_per_second": 562.928, + "eval_steps_per_second": 5.212, + "step": 11970 + }, + { + "epoch": 10.09186635551255, + "grad_norm": 0.1877572238445282, + "learning_rate": 5.1485148514851485e-05, + "loss": 1.2249, + "step": 11980 + }, + { + "epoch": 10.100290284022995, + "grad_norm": 0.18356889486312866, + "learning_rate": 5.0495049504950497e-05, + "loss": 1.2255, + "step": 11990 + }, + { + "epoch": 10.108714212533439, + "grad_norm": 0.1898818463087082, + "learning_rate": 4.950495049504951e-05, + "loss": 1.2241, + "step": 12000 + }, + { + "epoch": 10.117138141043885, + "grad_norm": 0.17149324715137482, + "learning_rate": 4.851485148514852e-05, + "loss": 1.2257, + "step": 12010 + }, + { + "epoch": 10.125562069554329, + "grad_norm": 0.16672831773757935, + "learning_rate": 4.7524752475247525e-05, + "loss": 1.2255, + "step": 12020 + }, + { + "epoch": 10.133985998064773, + "grad_norm": 0.16820046305656433, + "learning_rate": 4.653465346534654e-05, + "loss": 1.225, + "step": 12030 + }, + { + "epoch": 10.142409926575217, + "grad_norm": 0.17770229279994965, + "learning_rate": 4.554455445544554e-05, + "loss": 1.227, + "step": 12040 + }, + { + "epoch": 10.150833855085661, + "grad_norm": 0.16082800924777985, + "learning_rate": 4.455445544554455e-05, + "loss": 1.2253, + "step": 12050 + }, + { + "epoch": 10.159257783596107, + "grad_norm": 0.1669086515903473, + "learning_rate": 4.3564356435643565e-05, + "loss": 1.2241, + "step": 12060 + }, + { + "epoch": 10.159257783596107, + "eval_accuracy": 0.7460534494522424, + "eval_loss": 1.1121779680252075, + "eval_runtime": 882.614, + "eval_samples_per_second": 565.792, + "eval_steps_per_second": 5.239, + "step": 12060 + }, + { + "epoch": 10.167681712106551, + "grad_norm": 0.17394189536571503, + "learning_rate": 4.257425742574258e-05, + "loss": 1.2238, + "step": 12070 + }, + { + "epoch": 10.176105640616996, + "grad_norm": 0.1611398160457611, + "learning_rate": 4.158415841584159e-05, + "loss": 1.2243, + "step": 12080 + }, + { + "epoch": 10.18452956912744, + "grad_norm": 0.16469168663024902, + "learning_rate": 4.0594059405940594e-05, + "loss": 1.2232, + "step": 12090 + }, + { + "epoch": 10.192953497637886, + "grad_norm": 0.1700202375650406, + "learning_rate": 3.9603960396039605e-05, + "loss": 1.2243, + "step": 12100 + }, + { + "epoch": 10.20137742614833, + "grad_norm": 0.16961273550987244, + "learning_rate": 3.861386138613862e-05, + "loss": 1.2244, + "step": 12110 + }, + { + "epoch": 10.209801354658774, + "grad_norm": 0.18176864087581635, + "learning_rate": 3.762376237623762e-05, + "loss": 1.2234, + "step": 12120 + }, + { + "epoch": 10.218225283169218, + "grad_norm": 0.17132678627967834, + "learning_rate": 3.6633663366336634e-05, + "loss": 1.2231, + "step": 12130 + }, + { + "epoch": 10.226649211679662, + "grad_norm": 0.1708788424730301, + "learning_rate": 3.564356435643564e-05, + "loss": 1.2228, + "step": 12140 + }, + { + "epoch": 10.235073140190108, + "grad_norm": 0.16924616694450378, + "learning_rate": 3.465346534653466e-05, + "loss": 1.2241, + "step": 12150 + }, + { + "epoch": 10.235073140190108, + "eval_accuracy": 0.7462807420235112, + "eval_loss": 1.1115893125534058, + "eval_runtime": 893.1249, + "eval_samples_per_second": 559.133, + "eval_steps_per_second": 5.177, + "step": 12150 + }, + { + "epoch": 10.243497068700552, + "grad_norm": 0.1617705076932907, + "learning_rate": 3.366336633663367e-05, + "loss": 1.2239, + "step": 12160 + }, + { + "epoch": 10.251920997210997, + "grad_norm": 0.17731362581253052, + "learning_rate": 3.2673267326732674e-05, + "loss": 1.2232, + "step": 12170 + }, + { + "epoch": 10.26034492572144, + "grad_norm": 0.17324230074882507, + "learning_rate": 3.1683168316831686e-05, + "loss": 1.224, + "step": 12180 + }, + { + "epoch": 10.268768854231885, + "grad_norm": 0.15266722440719604, + "learning_rate": 3.069306930693069e-05, + "loss": 1.224, + "step": 12190 + }, + { + "epoch": 10.27719278274233, + "grad_norm": 0.1547342985868454, + "learning_rate": 2.9702970297029702e-05, + "loss": 1.2232, + "step": 12200 + }, + { + "epoch": 10.285616711252775, + "grad_norm": 0.15873835980892181, + "learning_rate": 2.8712871287128714e-05, + "loss": 1.2221, + "step": 12210 + }, + { + "epoch": 10.29404063976322, + "grad_norm": 0.15968631207942963, + "learning_rate": 2.7722772277227722e-05, + "loss": 1.223, + "step": 12220 + }, + { + "epoch": 10.302464568273663, + "grad_norm": 0.15929782390594482, + "learning_rate": 2.6732673267326734e-05, + "loss": 1.2242, + "step": 12230 + }, + { + "epoch": 10.31088849678411, + "grad_norm": 0.1512889713048935, + "learning_rate": 2.5742574257425742e-05, + "loss": 1.2223, + "step": 12240 + }, + { + "epoch": 10.31088849678411, + "eval_accuracy": 0.7462616988558893, + "eval_loss": 1.1114362478256226, + "eval_runtime": 886.8923, + "eval_samples_per_second": 563.063, + "eval_steps_per_second": 5.214, + "step": 12240 + }, + { + "epoch": 10.319312425294553, + "grad_norm": 0.15943297743797302, + "learning_rate": 2.4752475247524754e-05, + "loss": 1.2224, + "step": 12250 + }, + { + "epoch": 10.327736353804998, + "grad_norm": 0.16134706139564514, + "learning_rate": 2.3762376237623762e-05, + "loss": 1.2218, + "step": 12260 + }, + { + "epoch": 10.336160282315442, + "grad_norm": 0.15525278449058533, + "learning_rate": 2.277227722772277e-05, + "loss": 1.2237, + "step": 12270 + }, + { + "epoch": 10.344584210825886, + "grad_norm": 0.1626599282026291, + "learning_rate": 2.1782178217821783e-05, + "loss": 1.2228, + "step": 12280 + }, + { + "epoch": 10.353008139336332, + "grad_norm": 0.1533862203359604, + "learning_rate": 2.0792079207920794e-05, + "loss": 1.221, + "step": 12290 + }, + { + "epoch": 10.361432067846776, + "grad_norm": 0.14988014101982117, + "learning_rate": 1.9801980198019803e-05, + "loss": 1.2238, + "step": 12300 + }, + { + "epoch": 10.36985599635722, + "grad_norm": 0.15282054245471954, + "learning_rate": 1.881188118811881e-05, + "loss": 1.2202, + "step": 12310 + }, + { + "epoch": 10.378279924867664, + "grad_norm": 0.1532844454050064, + "learning_rate": 1.782178217821782e-05, + "loss": 1.2222, + "step": 12320 + }, + { + "epoch": 10.386703853378108, + "grad_norm": 0.15041793882846832, + "learning_rate": 1.6831683168316834e-05, + "loss": 1.2233, + "step": 12330 + }, + { + "epoch": 10.386703853378108, + "eval_accuracy": 0.7464784909349403, + "eval_loss": 1.1103906631469727, + "eval_runtime": 893.2259, + "eval_samples_per_second": 559.07, + "eval_steps_per_second": 5.177, + "step": 12330 + } + ], + "logging_steps": 10, + "max_steps": 12500, + "num_input_tokens_seen": 0, + "num_train_epochs": 11, + "save_steps": 90, + "total_flos": 3.205415169974477e+18, + "train_batch_size": 108, + "trial_name": null, + "trial_params": null +}