{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.49997953419835456,
"eval_steps": 4886,
"global_step": 24430,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0009823584789816217,
"grad_norm": 640.0,
"learning_rate": 0.001,
"loss": 11885.6341,
"step": 48
},
{
"epoch": 0.0019647169579632435,
"grad_norm": 454.0,
"learning_rate": 0.001,
"loss": 8966.3691,
"step": 96
},
{
"epoch": 0.0029470754369448652,
"grad_norm": 572.0,
"learning_rate": 0.001,
"loss": 7738.5072,
"step": 144
},
{
"epoch": 0.003929433915926487,
"grad_norm": 636.0,
"learning_rate": 0.001,
"loss": 7036.127,
"step": 192
},
{
"epoch": 0.004911792394908109,
"grad_norm": 748.0,
"learning_rate": 0.001,
"loss": 6540.5202,
"step": 240
},
{
"epoch": 0.0058941508738897305,
"grad_norm": 556.0,
"learning_rate": 0.001,
"loss": 6263.0488,
"step": 288
},
{
"epoch": 0.006876509352871352,
"grad_norm": 532.0,
"learning_rate": 0.001,
"loss": 5950.1823,
"step": 336
},
{
"epoch": 0.007858867831852974,
"grad_norm": 928.0,
"learning_rate": 0.001,
"loss": 5705.2292,
"step": 384
},
{
"epoch": 0.008841226310834595,
"grad_norm": 444.0,
"learning_rate": 0.001,
"loss": 5496.4583,
"step": 432
},
{
"epoch": 0.009823584789816217,
"grad_norm": 656.0,
"learning_rate": 0.001,
"loss": 5272.5752,
"step": 480
},
{
"epoch": 0.010805943268797838,
"grad_norm": 612.0,
"learning_rate": 0.001,
"loss": 5051.2663,
"step": 528
},
{
"epoch": 0.011788301747779461,
"grad_norm": 608.0,
"learning_rate": 0.001,
"loss": 4938.0895,
"step": 576
},
{
"epoch": 0.012770660226761082,
"grad_norm": 466.0,
"learning_rate": 0.001,
"loss": 4740.762,
"step": 624
},
{
"epoch": 0.013753018705742704,
"grad_norm": 438.0,
"learning_rate": 0.001,
"loss": 4573.4443,
"step": 672
},
{
"epoch": 0.014735377184724325,
"grad_norm": 696.0,
"learning_rate": 0.001,
"loss": 4539.9521,
"step": 720
},
{
"epoch": 0.015717735663705948,
"grad_norm": 892.0,
"learning_rate": 0.001,
"loss": 4435.4001,
"step": 768
},
{
"epoch": 0.01670009414268757,
"grad_norm": 704.0,
"learning_rate": 0.001,
"loss": 4239.0426,
"step": 816
},
{
"epoch": 0.01768245262166919,
"grad_norm": 548.0,
"learning_rate": 0.001,
"loss": 4189.9281,
"step": 864
},
{
"epoch": 0.018664811100650814,
"grad_norm": 540.0,
"learning_rate": 0.001,
"loss": 4104.7835,
"step": 912
},
{
"epoch": 0.019647169579632435,
"grad_norm": 592.0,
"learning_rate": 0.001,
"loss": 4044.3369,
"step": 960
},
{
"epoch": 0.020629528058614056,
"grad_norm": 536.0,
"learning_rate": 0.001,
"loss": 3936.5283,
"step": 1008
},
{
"epoch": 0.021611886537595677,
"grad_norm": 604.0,
"learning_rate": 0.001,
"loss": 3915.6911,
"step": 1056
},
{
"epoch": 0.0225942450165773,
"grad_norm": 458.0,
"learning_rate": 0.001,
"loss": 3759.7747,
"step": 1104
},
{
"epoch": 0.023576603495558922,
"grad_norm": 636.0,
"learning_rate": 0.001,
"loss": 3760.4476,
"step": 1152
},
{
"epoch": 0.024558961974540543,
"grad_norm": 672.0,
"learning_rate": 0.001,
"loss": 3672.9059,
"step": 1200
},
{
"epoch": 0.025541320453522164,
"grad_norm": 592.0,
"learning_rate": 0.001,
"loss": 3645.0697,
"step": 1248
},
{
"epoch": 0.026523678932503784,
"grad_norm": 552.0,
"learning_rate": 0.001,
"loss": 3528.9896,
"step": 1296
},
{
"epoch": 0.02750603741148541,
"grad_norm": 470.0,
"learning_rate": 0.001,
"loss": 3488.8187,
"step": 1344
},
{
"epoch": 0.02848839589046703,
"grad_norm": 580.0,
"learning_rate": 0.001,
"loss": 3466.627,
"step": 1392
},
{
"epoch": 0.02947075436944865,
"grad_norm": 584.0,
"learning_rate": 0.001,
"loss": 3399.1475,
"step": 1440
},
{
"epoch": 0.03045311284843027,
"grad_norm": 732.0,
"learning_rate": 0.001,
"loss": 3363.9762,
"step": 1488
},
{
"epoch": 0.031435471327411896,
"grad_norm": 616.0,
"learning_rate": 0.001,
"loss": 3337.4564,
"step": 1536
},
{
"epoch": 0.03241782980639352,
"grad_norm": 540.0,
"learning_rate": 0.001,
"loss": 3298.4583,
"step": 1584
},
{
"epoch": 0.03340018828537514,
"grad_norm": 512.0,
"learning_rate": 0.001,
"loss": 3212.2949,
"step": 1632
},
{
"epoch": 0.03438254676435676,
"grad_norm": 512.0,
"learning_rate": 0.001,
"loss": 3217.6631,
"step": 1680
},
{
"epoch": 0.03536490524333838,
"grad_norm": 624.0,
"learning_rate": 0.001,
"loss": 3175.7318,
"step": 1728
},
{
"epoch": 0.03634726372232,
"grad_norm": 520.0,
"learning_rate": 0.001,
"loss": 3140.3923,
"step": 1776
},
{
"epoch": 0.03732962220130163,
"grad_norm": 664.0,
"learning_rate": 0.001,
"loss": 3099.8044,
"step": 1824
},
{
"epoch": 0.03831198068028325,
"grad_norm": 604.0,
"learning_rate": 0.001,
"loss": 3114.8079,
"step": 1872
},
{
"epoch": 0.03929433915926487,
"grad_norm": 478.0,
"learning_rate": 0.001,
"loss": 3048.9001,
"step": 1920
},
{
"epoch": 0.04027669763824649,
"grad_norm": 520.0,
"learning_rate": 0.001,
"loss": 3018.8714,
"step": 1968
},
{
"epoch": 0.04125905611722811,
"grad_norm": 456.0,
"learning_rate": 0.001,
"loss": 2981.1152,
"step": 2016
},
{
"epoch": 0.04224141459620973,
"grad_norm": 628.0,
"learning_rate": 0.001,
"loss": 2999.249,
"step": 2064
},
{
"epoch": 0.04322377307519135,
"grad_norm": 804.0,
"learning_rate": 0.001,
"loss": 2942.3376,
"step": 2112
},
{
"epoch": 0.044206131554172974,
"grad_norm": 600.0,
"learning_rate": 0.001,
"loss": 2890.7354,
"step": 2160
},
{
"epoch": 0.0451884900331546,
"grad_norm": 632.0,
"learning_rate": 0.001,
"loss": 2896.4242,
"step": 2208
},
{
"epoch": 0.04617084851213622,
"grad_norm": 536.0,
"learning_rate": 0.001,
"loss": 2874.8643,
"step": 2256
},
{
"epoch": 0.047153206991117844,
"grad_norm": 494.0,
"learning_rate": 0.001,
"loss": 2807.6911,
"step": 2304
},
{
"epoch": 0.048135565470099465,
"grad_norm": 548.0,
"learning_rate": 0.001,
"loss": 2820.04,
"step": 2352
},
{
"epoch": 0.049117923949081085,
"grad_norm": 636.0,
"learning_rate": 0.001,
"loss": 2787.0247,
"step": 2400
},
{
"epoch": 0.050100282428062706,
"grad_norm": 624.0,
"learning_rate": 0.001,
"loss": 2782.2428,
"step": 2448
},
{
"epoch": 0.05108264090704433,
"grad_norm": 628.0,
"learning_rate": 0.001,
"loss": 2725.3781,
"step": 2496
},
{
"epoch": 0.05206499938602595,
"grad_norm": 540.0,
"learning_rate": 0.001,
"loss": 2755.7458,
"step": 2544
},
{
"epoch": 0.05304735786500757,
"grad_norm": 568.0,
"learning_rate": 0.001,
"loss": 2699.16,
"step": 2592
},
{
"epoch": 0.0540297163439892,
"grad_norm": 808.0,
"learning_rate": 0.001,
"loss": 2680.3232,
"step": 2640
},
{
"epoch": 0.05501207482297082,
"grad_norm": 564.0,
"learning_rate": 0.001,
"loss": 2669.6646,
"step": 2688
},
{
"epoch": 0.05599443330195244,
"grad_norm": 552.0,
"learning_rate": 0.001,
"loss": 2683.8433,
"step": 2736
},
{
"epoch": 0.05697679178093406,
"grad_norm": 636.0,
"learning_rate": 0.001,
"loss": 2643.8172,
"step": 2784
},
{
"epoch": 0.05795915025991568,
"grad_norm": 580.0,
"learning_rate": 0.001,
"loss": 2649.0441,
"step": 2832
},
{
"epoch": 0.0589415087388973,
"grad_norm": 512.0,
"learning_rate": 0.001,
"loss": 2615.9657,
"step": 2880
},
{
"epoch": 0.05992386721787892,
"grad_norm": 688.0,
"learning_rate": 0.001,
"loss": 2608.1457,
"step": 2928
},
{
"epoch": 0.06090622569686054,
"grad_norm": 696.0,
"learning_rate": 0.001,
"loss": 2590.1567,
"step": 2976
},
{
"epoch": 0.06188858417584217,
"grad_norm": 704.0,
"learning_rate": 0.001,
"loss": 2627.8358,
"step": 3024
},
{
"epoch": 0.06287094265482379,
"grad_norm": 704.0,
"learning_rate": 0.001,
"loss": 2538.9543,
"step": 3072
},
{
"epoch": 0.06385330113380541,
"grad_norm": 568.0,
"learning_rate": 0.001,
"loss": 2482.9673,
"step": 3120
},
{
"epoch": 0.06483565961278703,
"grad_norm": 548.0,
"learning_rate": 0.001,
"loss": 2530.4771,
"step": 3168
},
{
"epoch": 0.06581801809176865,
"grad_norm": 456.0,
"learning_rate": 0.001,
"loss": 2496.41,
"step": 3216
},
{
"epoch": 0.06680037657075028,
"grad_norm": 684.0,
"learning_rate": 0.001,
"loss": 2518.8866,
"step": 3264
},
{
"epoch": 0.0677827350497319,
"grad_norm": 664.0,
"learning_rate": 0.001,
"loss": 2475.0793,
"step": 3312
},
{
"epoch": 0.06876509352871352,
"grad_norm": 812.0,
"learning_rate": 0.001,
"loss": 2461.3527,
"step": 3360
},
{
"epoch": 0.06974745200769514,
"grad_norm": 490.0,
"learning_rate": 0.001,
"loss": 2467.4508,
"step": 3408
},
{
"epoch": 0.07072981048667676,
"grad_norm": 648.0,
"learning_rate": 0.001,
"loss": 2443.8037,
"step": 3456
},
{
"epoch": 0.07171216896565838,
"grad_norm": 664.0,
"learning_rate": 0.001,
"loss": 2445.9336,
"step": 3504
},
{
"epoch": 0.07269452744464,
"grad_norm": 524.0,
"learning_rate": 0.001,
"loss": 2411.4482,
"step": 3552
},
{
"epoch": 0.07367688592362164,
"grad_norm": 608.0,
"learning_rate": 0.001,
"loss": 2417.4673,
"step": 3600
},
{
"epoch": 0.07465924440260326,
"grad_norm": 504.0,
"learning_rate": 0.001,
"loss": 2420.4196,
"step": 3648
},
{
"epoch": 0.07564160288158488,
"grad_norm": 564.0,
"learning_rate": 0.001,
"loss": 2390.8983,
"step": 3696
},
{
"epoch": 0.0766239613605665,
"grad_norm": 664.0,
"learning_rate": 0.001,
"loss": 2377.8607,
"step": 3744
},
{
"epoch": 0.07760631983954812,
"grad_norm": 616.0,
"learning_rate": 0.001,
"loss": 2359.1242,
"step": 3792
},
{
"epoch": 0.07858867831852974,
"grad_norm": 636.0,
"learning_rate": 0.001,
"loss": 2385.3102,
"step": 3840
},
{
"epoch": 0.07957103679751136,
"grad_norm": 454.0,
"learning_rate": 0.001,
"loss": 2373.0225,
"step": 3888
},
{
"epoch": 0.08055339527649298,
"grad_norm": 502.0,
"learning_rate": 0.001,
"loss": 2361.2386,
"step": 3936
},
{
"epoch": 0.0815357537554746,
"grad_norm": 506.0,
"learning_rate": 0.001,
"loss": 2341.1328,
"step": 3984
},
{
"epoch": 0.08251811223445622,
"grad_norm": 472.0,
"learning_rate": 0.001,
"loss": 2308.069,
"step": 4032
},
{
"epoch": 0.08350047071343784,
"grad_norm": 472.0,
"learning_rate": 0.001,
"loss": 2305.2542,
"step": 4080
},
{
"epoch": 0.08448282919241946,
"grad_norm": 502.0,
"learning_rate": 0.001,
"loss": 2338.4048,
"step": 4128
},
{
"epoch": 0.08546518767140109,
"grad_norm": 628.0,
"learning_rate": 0.001,
"loss": 2307.96,
"step": 4176
},
{
"epoch": 0.0864475461503827,
"grad_norm": 516.0,
"learning_rate": 0.001,
"loss": 2314.313,
"step": 4224
},
{
"epoch": 0.08742990462936433,
"grad_norm": 748.0,
"learning_rate": 0.001,
"loss": 2270.4917,
"step": 4272
},
{
"epoch": 0.08841226310834595,
"grad_norm": 576.0,
"learning_rate": 0.001,
"loss": 2292.9497,
"step": 4320
},
{
"epoch": 0.08939462158732757,
"grad_norm": 604.0,
"learning_rate": 0.001,
"loss": 2274.6584,
"step": 4368
},
{
"epoch": 0.0903769800663092,
"grad_norm": 580.0,
"learning_rate": 0.001,
"loss": 2275.2266,
"step": 4416
},
{
"epoch": 0.09135933854529082,
"grad_norm": 548.0,
"learning_rate": 0.001,
"loss": 2262.757,
"step": 4464
},
{
"epoch": 0.09234169702427245,
"grad_norm": 628.0,
"learning_rate": 0.001,
"loss": 2257.687,
"step": 4512
},
{
"epoch": 0.09332405550325407,
"grad_norm": 544.0,
"learning_rate": 0.001,
"loss": 2259.9118,
"step": 4560
},
{
"epoch": 0.09430641398223569,
"grad_norm": 580.0,
"learning_rate": 0.001,
"loss": 2224.4427,
"step": 4608
},
{
"epoch": 0.09528877246121731,
"grad_norm": 628.0,
"learning_rate": 0.001,
"loss": 2248.397,
"step": 4656
},
{
"epoch": 0.09627113094019893,
"grad_norm": 600.0,
"learning_rate": 0.001,
"loss": 2203.2843,
"step": 4704
},
{
"epoch": 0.09725348941918055,
"grad_norm": 780.0,
"learning_rate": 0.001,
"loss": 2223.5656,
"step": 4752
},
{
"epoch": 0.09823584789816217,
"grad_norm": 536.0,
"learning_rate": 0.001,
"loss": 2169.4321,
"step": 4800
},
{
"epoch": 0.09921820637714379,
"grad_norm": 516.0,
"learning_rate": 0.001,
"loss": 2183.4987,
"step": 4848
},
{
"epoch": 0.09999590683967091,
"eval_loss": 2087.82763671875,
"eval_runtime": 9.0001,
"eval_samples_per_second": 111.11,
"eval_steps_per_second": 1.444,
"step": 4886
},
{
"epoch": 0.10020056485612541,
"grad_norm": 620.0,
"learning_rate": 0.001,
"loss": 2210.3151,
"step": 4896
},
{
"epoch": 0.10118292333510703,
"grad_norm": 616.0,
"learning_rate": 0.001,
"loss": 2208.1779,
"step": 4944
},
{
"epoch": 0.10216528181408865,
"grad_norm": 592.0,
"learning_rate": 0.001,
"loss": 2166.7116,
"step": 4992
},
{
"epoch": 0.10314764029307028,
"grad_norm": 596.0,
"learning_rate": 0.001,
"loss": 2191.4295,
"step": 5040
},
{
"epoch": 0.1041299987720519,
"grad_norm": 684.0,
"learning_rate": 0.001,
"loss": 2155.1141,
"step": 5088
},
{
"epoch": 0.10511235725103352,
"grad_norm": 512.0,
"learning_rate": 0.001,
"loss": 2135.7635,
"step": 5136
},
{
"epoch": 0.10609471573001514,
"grad_norm": 506.0,
"learning_rate": 0.001,
"loss": 2155.5701,
"step": 5184
},
{
"epoch": 0.10707707420899677,
"grad_norm": 480.0,
"learning_rate": 0.001,
"loss": 2150.0086,
"step": 5232
},
{
"epoch": 0.1080594326879784,
"grad_norm": 540.0,
"learning_rate": 0.001,
"loss": 2142.4181,
"step": 5280
},
{
"epoch": 0.10904179116696001,
"grad_norm": 572.0,
"learning_rate": 0.001,
"loss": 2116.3011,
"step": 5328
},
{
"epoch": 0.11002414964594164,
"grad_norm": 548.0,
"learning_rate": 0.001,
"loss": 2141.0239,
"step": 5376
},
{
"epoch": 0.11100650812492326,
"grad_norm": 676.0,
"learning_rate": 0.001,
"loss": 2119.1307,
"step": 5424
},
{
"epoch": 0.11198886660390488,
"grad_norm": 656.0,
"learning_rate": 0.001,
"loss": 2137.8016,
"step": 5472
},
{
"epoch": 0.1129712250828865,
"grad_norm": 676.0,
"learning_rate": 0.001,
"loss": 2119.2923,
"step": 5520
},
{
"epoch": 0.11395358356186812,
"grad_norm": 588.0,
"learning_rate": 0.001,
"loss": 2120.9912,
"step": 5568
},
{
"epoch": 0.11493594204084974,
"grad_norm": 612.0,
"learning_rate": 0.001,
"loss": 2111.5037,
"step": 5616
},
{
"epoch": 0.11591830051983136,
"grad_norm": 588.0,
"learning_rate": 0.001,
"loss": 2119.6444,
"step": 5664
},
{
"epoch": 0.11690065899881298,
"grad_norm": 700.0,
"learning_rate": 0.001,
"loss": 2078.1807,
"step": 5712
},
{
"epoch": 0.1178830174777946,
"grad_norm": 564.0,
"learning_rate": 0.001,
"loss": 2095.8706,
"step": 5760
},
{
"epoch": 0.11886537595677622,
"grad_norm": 552.0,
"learning_rate": 0.001,
"loss": 2080.8527,
"step": 5808
},
{
"epoch": 0.11984773443575784,
"grad_norm": 488.0,
"learning_rate": 0.001,
"loss": 2062.9159,
"step": 5856
},
{
"epoch": 0.12083009291473946,
"grad_norm": 616.0,
"learning_rate": 0.001,
"loss": 2060.964,
"step": 5904
},
{
"epoch": 0.12181245139372109,
"grad_norm": 648.0,
"learning_rate": 0.001,
"loss": 2088.8507,
"step": 5952
},
{
"epoch": 0.12279480987270272,
"grad_norm": 604.0,
"learning_rate": 0.001,
"loss": 2052.1393,
"step": 6000
},
{
"epoch": 0.12377716835168434,
"grad_norm": 720.0,
"learning_rate": 0.001,
"loss": 2043.2277,
"step": 6048
},
{
"epoch": 0.12475952683066596,
"grad_norm": 616.0,
"learning_rate": 0.001,
"loss": 2043.3983,
"step": 6096
},
{
"epoch": 0.12574188530964758,
"grad_norm": 668.0,
"learning_rate": 0.001,
"loss": 2080.6297,
"step": 6144
},
{
"epoch": 0.1267242437886292,
"grad_norm": 532.0,
"learning_rate": 0.001,
"loss": 2059.5207,
"step": 6192
},
{
"epoch": 0.12770660226761082,
"grad_norm": 568.0,
"learning_rate": 0.001,
"loss": 2030.5203,
"step": 6240
},
{
"epoch": 0.12868896074659245,
"grad_norm": 560.0,
"learning_rate": 0.001,
"loss": 2047.7404,
"step": 6288
},
{
"epoch": 0.12967131922557407,
"grad_norm": 624.0,
"learning_rate": 0.001,
"loss": 2043.3193,
"step": 6336
},
{
"epoch": 0.1306536777045557,
"grad_norm": 592.0,
"learning_rate": 0.001,
"loss": 2051.0589,
"step": 6384
},
{
"epoch": 0.1316360361835373,
"grad_norm": 876.0,
"learning_rate": 0.001,
"loss": 2054.3232,
"step": 6432
},
{
"epoch": 0.13261839466251893,
"grad_norm": 544.0,
"learning_rate": 0.001,
"loss": 2047.3159,
"step": 6480
},
{
"epoch": 0.13360075314150055,
"grad_norm": 648.0,
"learning_rate": 0.001,
"loss": 2029.021,
"step": 6528
},
{
"epoch": 0.13458311162048217,
"grad_norm": 556.0,
"learning_rate": 0.001,
"loss": 2027.506,
"step": 6576
},
{
"epoch": 0.1355654700994638,
"grad_norm": 672.0,
"learning_rate": 0.001,
"loss": 2034.3325,
"step": 6624
},
{
"epoch": 0.1365478285784454,
"grad_norm": 648.0,
"learning_rate": 0.001,
"loss": 1988.6841,
"step": 6672
},
{
"epoch": 0.13753018705742703,
"grad_norm": 592.0,
"learning_rate": 0.001,
"loss": 1998.0236,
"step": 6720
},
{
"epoch": 0.13851254553640865,
"grad_norm": 552.0,
"learning_rate": 0.001,
"loss": 2008.8337,
"step": 6768
},
{
"epoch": 0.13949490401539028,
"grad_norm": 780.0,
"learning_rate": 0.001,
"loss": 2008.4787,
"step": 6816
},
{
"epoch": 0.1404772624943719,
"grad_norm": 716.0,
"learning_rate": 0.001,
"loss": 1995.237,
"step": 6864
},
{
"epoch": 0.14145962097335352,
"grad_norm": 804.0,
"learning_rate": 0.001,
"loss": 1996.2018,
"step": 6912
},
{
"epoch": 0.14244197945233514,
"grad_norm": 652.0,
"learning_rate": 0.001,
"loss": 1992.167,
"step": 6960
},
{
"epoch": 0.14342433793131676,
"grad_norm": 544.0,
"learning_rate": 0.001,
"loss": 1985.2515,
"step": 7008
},
{
"epoch": 0.14440669641029838,
"grad_norm": 600.0,
"learning_rate": 0.001,
"loss": 1989.0208,
"step": 7056
},
{
"epoch": 0.14538905488928,
"grad_norm": 712.0,
"learning_rate": 0.001,
"loss": 1993.743,
"step": 7104
},
{
"epoch": 0.14637141336826162,
"grad_norm": 580.0,
"learning_rate": 0.001,
"loss": 1986.2668,
"step": 7152
},
{
"epoch": 0.14735377184724327,
"grad_norm": 716.0,
"learning_rate": 0.001,
"loss": 1988.6514,
"step": 7200
},
{
"epoch": 0.1483361303262249,
"grad_norm": 452.0,
"learning_rate": 0.001,
"loss": 1971.7622,
"step": 7248
},
{
"epoch": 0.1493184888052065,
"grad_norm": 576.0,
"learning_rate": 0.001,
"loss": 1977.0863,
"step": 7296
},
{
"epoch": 0.15030084728418813,
"grad_norm": 708.0,
"learning_rate": 0.001,
"loss": 1968.3294,
"step": 7344
},
{
"epoch": 0.15128320576316975,
"grad_norm": 572.0,
"learning_rate": 0.001,
"loss": 1981.1888,
"step": 7392
},
{
"epoch": 0.15226556424215137,
"grad_norm": 748.0,
"learning_rate": 0.001,
"loss": 1937.5469,
"step": 7440
},
{
"epoch": 0.153247922721133,
"grad_norm": 672.0,
"learning_rate": 0.001,
"loss": 1944.2785,
"step": 7488
},
{
"epoch": 0.15423028120011462,
"grad_norm": 816.0,
"learning_rate": 0.001,
"loss": 1934.2336,
"step": 7536
},
{
"epoch": 0.15521263967909624,
"grad_norm": 740.0,
"learning_rate": 0.001,
"loss": 1953.3698,
"step": 7584
},
{
"epoch": 0.15619499815807786,
"grad_norm": 652.0,
"learning_rate": 0.001,
"loss": 1951.9084,
"step": 7632
},
{
"epoch": 0.15717735663705948,
"grad_norm": 652.0,
"learning_rate": 0.001,
"loss": 1934.8753,
"step": 7680
},
{
"epoch": 0.1581597151160411,
"grad_norm": 652.0,
"learning_rate": 0.001,
"loss": 1923.8843,
"step": 7728
},
{
"epoch": 0.15914207359502272,
"grad_norm": 612.0,
"learning_rate": 0.001,
"loss": 1935.5955,
"step": 7776
},
{
"epoch": 0.16012443207400434,
"grad_norm": 724.0,
"learning_rate": 0.001,
"loss": 1962.8574,
"step": 7824
},
{
"epoch": 0.16110679055298596,
"grad_norm": 540.0,
"learning_rate": 0.001,
"loss": 1955.3468,
"step": 7872
},
{
"epoch": 0.16208914903196758,
"grad_norm": 752.0,
"learning_rate": 0.001,
"loss": 1915.3901,
"step": 7920
},
{
"epoch": 0.1630715075109492,
"grad_norm": 572.0,
"learning_rate": 0.001,
"loss": 1944.2292,
"step": 7968
},
{
"epoch": 0.16405386598993082,
"grad_norm": 668.0,
"learning_rate": 0.001,
"loss": 1926.0425,
"step": 8016
},
{
"epoch": 0.16503622446891245,
"grad_norm": 556.0,
"learning_rate": 0.001,
"loss": 1938.1131,
"step": 8064
},
{
"epoch": 0.16601858294789407,
"grad_norm": 756.0,
"learning_rate": 0.001,
"loss": 1925.4678,
"step": 8112
},
{
"epoch": 0.1670009414268757,
"grad_norm": 848.0,
"learning_rate": 0.001,
"loss": 1921.8462,
"step": 8160
},
{
"epoch": 0.1679832999058573,
"grad_norm": 588.0,
"learning_rate": 0.001,
"loss": 1890.1263,
"step": 8208
},
{
"epoch": 0.16896565838483893,
"grad_norm": 580.0,
"learning_rate": 0.001,
"loss": 1923.7113,
"step": 8256
},
{
"epoch": 0.16994801686382055,
"grad_norm": 712.0,
"learning_rate": 0.001,
"loss": 1902.661,
"step": 8304
},
{
"epoch": 0.17093037534280217,
"grad_norm": 676.0,
"learning_rate": 0.001,
"loss": 1898.4054,
"step": 8352
},
{
"epoch": 0.1719127338217838,
"grad_norm": 604.0,
"learning_rate": 0.001,
"loss": 1899.0542,
"step": 8400
},
{
"epoch": 0.1728950923007654,
"grad_norm": 700.0,
"learning_rate": 0.001,
"loss": 1906.8057,
"step": 8448
},
{
"epoch": 0.17387745077974703,
"grad_norm": 716.0,
"learning_rate": 0.001,
"loss": 1908.1032,
"step": 8496
},
{
"epoch": 0.17485980925872865,
"grad_norm": 628.0,
"learning_rate": 0.001,
"loss": 1928.3221,
"step": 8544
},
{
"epoch": 0.17584216773771028,
"grad_norm": 664.0,
"learning_rate": 0.001,
"loss": 1890.7321,
"step": 8592
},
{
"epoch": 0.1768245262166919,
"grad_norm": 556.0,
"learning_rate": 0.001,
"loss": 1910.8001,
"step": 8640
},
{
"epoch": 0.17780688469567352,
"grad_norm": 648.0,
"learning_rate": 0.001,
"loss": 1908.4972,
"step": 8688
},
{
"epoch": 0.17878924317465514,
"grad_norm": 608.0,
"learning_rate": 0.001,
"loss": 1870.7344,
"step": 8736
},
{
"epoch": 0.1797716016536368,
"grad_norm": 644.0,
"learning_rate": 0.001,
"loss": 1901.4289,
"step": 8784
},
{
"epoch": 0.1807539601326184,
"grad_norm": 580.0,
"learning_rate": 0.001,
"loss": 1883.8433,
"step": 8832
},
{
"epoch": 0.18173631861160003,
"grad_norm": 828.0,
"learning_rate": 0.001,
"loss": 1869.978,
"step": 8880
},
{
"epoch": 0.18271867709058165,
"grad_norm": 652.0,
"learning_rate": 0.001,
"loss": 1895.2178,
"step": 8928
},
{
"epoch": 0.18370103556956327,
"grad_norm": 680.0,
"learning_rate": 0.001,
"loss": 1857.217,
"step": 8976
},
{
"epoch": 0.1846833940485449,
"grad_norm": 608.0,
"learning_rate": 0.001,
"loss": 1880.6992,
"step": 9024
},
{
"epoch": 0.1856657525275265,
"grad_norm": 664.0,
"learning_rate": 0.001,
"loss": 1869.5422,
"step": 9072
},
{
"epoch": 0.18664811100650813,
"grad_norm": 720.0,
"learning_rate": 0.001,
"loss": 1898.1034,
"step": 9120
},
{
"epoch": 0.18763046948548975,
"grad_norm": 604.0,
"learning_rate": 0.001,
"loss": 1887.1818,
"step": 9168
},
{
"epoch": 0.18861282796447137,
"grad_norm": 672.0,
"learning_rate": 0.001,
"loss": 1869.6294,
"step": 9216
},
{
"epoch": 0.189595186443453,
"grad_norm": 572.0,
"learning_rate": 0.001,
"loss": 1857.5962,
"step": 9264
},
{
"epoch": 0.19057754492243462,
"grad_norm": 732.0,
"learning_rate": 0.001,
"loss": 1869.6444,
"step": 9312
},
{
"epoch": 0.19155990340141624,
"grad_norm": 848.0,
"learning_rate": 0.001,
"loss": 1869.8807,
"step": 9360
},
{
"epoch": 0.19254226188039786,
"grad_norm": 740.0,
"learning_rate": 0.001,
"loss": 1857.5882,
"step": 9408
},
{
"epoch": 0.19352462035937948,
"grad_norm": 652.0,
"learning_rate": 0.001,
"loss": 1835.6294,
"step": 9456
},
{
"epoch": 0.1945069788383611,
"grad_norm": 608.0,
"learning_rate": 0.001,
"loss": 1853.5081,
"step": 9504
},
{
"epoch": 0.19548933731734272,
"grad_norm": 648.0,
"learning_rate": 0.001,
"loss": 1866.897,
"step": 9552
},
{
"epoch": 0.19647169579632434,
"grad_norm": 724.0,
"learning_rate": 0.001,
"loss": 1848.0703,
"step": 9600
},
{
"epoch": 0.19745405427530596,
"grad_norm": 628.0,
"learning_rate": 0.001,
"loss": 1867.1585,
"step": 9648
},
{
"epoch": 0.19843641275428758,
"grad_norm": 612.0,
"learning_rate": 0.001,
"loss": 1864.8203,
"step": 9696
},
{
"epoch": 0.1994187712332692,
"grad_norm": 572.0,
"learning_rate": 0.001,
"loss": 1827.0848,
"step": 9744
},
{
"epoch": 0.19999181367934182,
"eval_loss": 1771.5172119140625,
"eval_runtime": 9.0052,
"eval_samples_per_second": 111.047,
"eval_steps_per_second": 1.444,
"step": 9772
},
{
"epoch": 0.20040112971225083,
"grad_norm": 784.0,
"learning_rate": 0.001,
"loss": 1850.5506,
"step": 9792
},
{
"epoch": 0.20138348819123245,
"grad_norm": 728.0,
"learning_rate": 0.001,
"loss": 1853.3254,
"step": 9840
},
{
"epoch": 0.20236584667021407,
"grad_norm": 744.0,
"learning_rate": 0.001,
"loss": 1884.8763,
"step": 9888
},
{
"epoch": 0.2033482051491957,
"grad_norm": 740.0,
"learning_rate": 0.001,
"loss": 1852.5361,
"step": 9936
},
{
"epoch": 0.2043305636281773,
"grad_norm": 780.0,
"learning_rate": 0.001,
"loss": 1840.8996,
"step": 9984
},
{
"epoch": 0.20531292210715893,
"grad_norm": 676.0,
"learning_rate": 0.001,
"loss": 1848.5868,
"step": 10032
},
{
"epoch": 0.20629528058614055,
"grad_norm": 764.0,
"learning_rate": 0.001,
"loss": 1848.7498,
"step": 10080
},
{
"epoch": 0.20727763906512217,
"grad_norm": 856.0,
"learning_rate": 0.001,
"loss": 1845.7381,
"step": 10128
},
{
"epoch": 0.2082599975441038,
"grad_norm": 740.0,
"learning_rate": 0.001,
"loss": 1851.979,
"step": 10176
},
{
"epoch": 0.2092423560230854,
"grad_norm": 644.0,
"learning_rate": 0.001,
"loss": 1850.7716,
"step": 10224
},
{
"epoch": 0.21022471450206703,
"grad_norm": 828.0,
"learning_rate": 0.001,
"loss": 1844.3057,
"step": 10272
},
{
"epoch": 0.21120707298104865,
"grad_norm": 688.0,
"learning_rate": 0.001,
"loss": 1820.6678,
"step": 10320
},
{
"epoch": 0.21218943146003028,
"grad_norm": 832.0,
"learning_rate": 0.001,
"loss": 1793.4041,
"step": 10368
},
{
"epoch": 0.21317178993901192,
"grad_norm": 648.0,
"learning_rate": 0.001,
"loss": 1853.5828,
"step": 10416
},
{
"epoch": 0.21415414841799355,
"grad_norm": 780.0,
"learning_rate": 0.001,
"loss": 1816.429,
"step": 10464
},
{
"epoch": 0.21513650689697517,
"grad_norm": 708.0,
"learning_rate": 0.001,
"loss": 1827.7533,
"step": 10512
},
{
"epoch": 0.2161188653759568,
"grad_norm": 812.0,
"learning_rate": 0.001,
"loss": 1807.555,
"step": 10560
},
{
"epoch": 0.2171012238549384,
"grad_norm": 728.0,
"learning_rate": 0.001,
"loss": 1824.4678,
"step": 10608
},
{
"epoch": 0.21808358233392003,
"grad_norm": 760.0,
"learning_rate": 0.001,
"loss": 1832.056,
"step": 10656
},
{
"epoch": 0.21906594081290165,
"grad_norm": 672.0,
"learning_rate": 0.001,
"loss": 1819.7812,
"step": 10704
},
{
"epoch": 0.22004829929188327,
"grad_norm": 808.0,
"learning_rate": 0.001,
"loss": 1837.8351,
"step": 10752
},
{
"epoch": 0.2210306577708649,
"grad_norm": 716.0,
"learning_rate": 0.001,
"loss": 1823.1432,
"step": 10800
},
{
"epoch": 0.2220130162498465,
"grad_norm": 768.0,
"learning_rate": 0.001,
"loss": 1810.9959,
"step": 10848
},
{
"epoch": 0.22299537472882813,
"grad_norm": 756.0,
"learning_rate": 0.001,
"loss": 1848.2907,
"step": 10896
},
{
"epoch": 0.22397773320780975,
"grad_norm": 744.0,
"learning_rate": 0.001,
"loss": 1786.6442,
"step": 10944
},
{
"epoch": 0.22496009168679137,
"grad_norm": 740.0,
"learning_rate": 0.001,
"loss": 1804.0133,
"step": 10992
},
{
"epoch": 0.225942450165773,
"grad_norm": 640.0,
"learning_rate": 0.001,
"loss": 1813.6567,
"step": 11040
},
{
"epoch": 0.22692480864475462,
"grad_norm": 696.0,
"learning_rate": 0.001,
"loss": 1812.9946,
"step": 11088
},
{
"epoch": 0.22790716712373624,
"grad_norm": 696.0,
"learning_rate": 0.001,
"loss": 1816.8553,
"step": 11136
},
{
"epoch": 0.22888952560271786,
"grad_norm": 640.0,
"learning_rate": 0.001,
"loss": 1801.8009,
"step": 11184
},
{
"epoch": 0.22987188408169948,
"grad_norm": 680.0,
"learning_rate": 0.001,
"loss": 1816.7332,
"step": 11232
},
{
"epoch": 0.2308542425606811,
"grad_norm": 692.0,
"learning_rate": 0.001,
"loss": 1799.8708,
"step": 11280
},
{
"epoch": 0.23183660103966272,
"grad_norm": 680.0,
"learning_rate": 0.001,
"loss": 1791.1471,
"step": 11328
},
{
"epoch": 0.23281895951864434,
"grad_norm": 700.0,
"learning_rate": 0.001,
"loss": 1812.3979,
"step": 11376
},
{
"epoch": 0.23380131799762596,
"grad_norm": 748.0,
"learning_rate": 0.001,
"loss": 1807.806,
"step": 11424
},
{
"epoch": 0.23478367647660758,
"grad_norm": 716.0,
"learning_rate": 0.001,
"loss": 1811.6502,
"step": 11472
},
{
"epoch": 0.2357660349555892,
"grad_norm": 736.0,
"learning_rate": 0.001,
"loss": 1821.8501,
"step": 11520
},
{
"epoch": 0.23674839343457083,
"grad_norm": 648.0,
"learning_rate": 0.001,
"loss": 1777.6597,
"step": 11568
},
{
"epoch": 0.23773075191355245,
"grad_norm": 768.0,
"learning_rate": 0.001,
"loss": 1804.2365,
"step": 11616
},
{
"epoch": 0.23871311039253407,
"grad_norm": 672.0,
"learning_rate": 0.001,
"loss": 1794.9201,
"step": 11664
},
{
"epoch": 0.2396954688715157,
"grad_norm": 696.0,
"learning_rate": 0.001,
"loss": 1775.2284,
"step": 11712
},
{
"epoch": 0.2406778273504973,
"grad_norm": 700.0,
"learning_rate": 0.001,
"loss": 1785.5417,
"step": 11760
},
{
"epoch": 0.24166018582947893,
"grad_norm": 840.0,
"learning_rate": 0.001,
"loss": 1792.2282,
"step": 11808
},
{
"epoch": 0.24264254430846055,
"grad_norm": 992.0,
"learning_rate": 0.001,
"loss": 1799.9831,
"step": 11856
},
{
"epoch": 0.24362490278744217,
"grad_norm": 872.0,
"learning_rate": 0.001,
"loss": 1804.1024,
"step": 11904
},
{
"epoch": 0.2446072612664238,
"grad_norm": 668.0,
"learning_rate": 0.001,
"loss": 1785.5889,
"step": 11952
},
{
"epoch": 0.24558961974540544,
"grad_norm": 872.0,
"learning_rate": 0.001,
"loss": 1785.6185,
"step": 12000
},
{
"epoch": 0.24657197822438706,
"grad_norm": 784.0,
"learning_rate": 0.001,
"loss": 1785.6107,
"step": 12048
},
{
"epoch": 0.24755433670336868,
"grad_norm": 644.0,
"learning_rate": 0.001,
"loss": 1789.2995,
"step": 12096
},
{
"epoch": 0.2485366951823503,
"grad_norm": 772.0,
"learning_rate": 0.001,
"loss": 1780.3151,
"step": 12144
},
{
"epoch": 0.24951905366133192,
"grad_norm": 728.0,
"learning_rate": 0.001,
"loss": 1769.0786,
"step": 12192
},
{
"epoch": 0.2505014121403135,
"grad_norm": 752.0,
"learning_rate": 0.001,
"loss": 1801.0431,
"step": 12240
},
{
"epoch": 0.25148377061929517,
"grad_norm": 844.0,
"learning_rate": 0.001,
"loss": 1796.9209,
"step": 12288
},
{
"epoch": 0.25246612909827676,
"grad_norm": 1088.0,
"learning_rate": 0.001,
"loss": 1795.411,
"step": 12336
},
{
"epoch": 0.2534484875772584,
"grad_norm": 1320.0,
"learning_rate": 0.001,
"loss": 1802.1553,
"step": 12384
},
{
"epoch": 0.25443084605624,
"grad_norm": 716.0,
"learning_rate": 0.001,
"loss": 1772.7713,
"step": 12432
},
{
"epoch": 0.25541320453522165,
"grad_norm": 744.0,
"learning_rate": 0.001,
"loss": 1787.7516,
"step": 12480
},
{
"epoch": 0.25639556301420324,
"grad_norm": 744.0,
"learning_rate": 0.001,
"loss": 1785.005,
"step": 12528
},
{
"epoch": 0.2573779214931849,
"grad_norm": 716.0,
"learning_rate": 0.001,
"loss": 1763.7612,
"step": 12576
},
{
"epoch": 0.2583602799721665,
"grad_norm": 752.0,
"learning_rate": 0.001,
"loss": 1777.0677,
"step": 12624
},
{
"epoch": 0.25934263845114813,
"grad_norm": 712.0,
"learning_rate": 0.001,
"loss": 1761.1235,
"step": 12672
},
{
"epoch": 0.2603249969301297,
"grad_norm": 696.0,
"learning_rate": 0.001,
"loss": 1795.2174,
"step": 12720
},
{
"epoch": 0.2613073554091114,
"grad_norm": 816.0,
"learning_rate": 0.001,
"loss": 1763.8905,
"step": 12768
},
{
"epoch": 0.262289713888093,
"grad_norm": 852.0,
"learning_rate": 0.001,
"loss": 1761.4404,
"step": 12816
},
{
"epoch": 0.2632720723670746,
"grad_norm": 608.0,
"learning_rate": 0.001,
"loss": 1762.6668,
"step": 12864
},
{
"epoch": 0.26425443084605627,
"grad_norm": 892.0,
"learning_rate": 0.001,
"loss": 1725.5638,
"step": 12912
},
{
"epoch": 0.26523678932503786,
"grad_norm": 724.0,
"learning_rate": 0.001,
"loss": 1762.1764,
"step": 12960
},
{
"epoch": 0.2662191478040195,
"grad_norm": 728.0,
"learning_rate": 0.001,
"loss": 1764.0163,
"step": 13008
},
{
"epoch": 0.2672015062830011,
"grad_norm": 664.0,
"learning_rate": 0.001,
"loss": 1754.1729,
"step": 13056
},
{
"epoch": 0.26818386476198275,
"grad_norm": 724.0,
"learning_rate": 0.001,
"loss": 1772.8592,
"step": 13104
},
{
"epoch": 0.26916622324096434,
"grad_norm": 704.0,
"learning_rate": 0.001,
"loss": 1780.2349,
"step": 13152
},
{
"epoch": 0.270148581719946,
"grad_norm": 720.0,
"learning_rate": 0.001,
"loss": 1764.6678,
"step": 13200
},
{
"epoch": 0.2711309401989276,
"grad_norm": 772.0,
"learning_rate": 0.001,
"loss": 1757.6209,
"step": 13248
},
{
"epoch": 0.27211329867790923,
"grad_norm": 748.0,
"learning_rate": 0.001,
"loss": 1749.5741,
"step": 13296
},
{
"epoch": 0.2730956571568908,
"grad_norm": 804.0,
"learning_rate": 0.001,
"loss": 1758.4183,
"step": 13344
},
{
"epoch": 0.2740780156358725,
"grad_norm": 712.0,
"learning_rate": 0.001,
"loss": 1770.6115,
"step": 13392
},
{
"epoch": 0.27506037411485407,
"grad_norm": 784.0,
"learning_rate": 0.001,
"loss": 1752.6141,
"step": 13440
},
{
"epoch": 0.2760427325938357,
"grad_norm": 700.0,
"learning_rate": 0.001,
"loss": 1732.4147,
"step": 13488
},
{
"epoch": 0.2770250910728173,
"grad_norm": 780.0,
"learning_rate": 0.001,
"loss": 1757.4318,
"step": 13536
},
{
"epoch": 0.27800744955179896,
"grad_norm": 764.0,
"learning_rate": 0.001,
"loss": 1746.452,
"step": 13584
},
{
"epoch": 0.27898980803078055,
"grad_norm": 696.0,
"learning_rate": 0.001,
"loss": 1733.8742,
"step": 13632
},
{
"epoch": 0.2799721665097622,
"grad_norm": 824.0,
"learning_rate": 0.001,
"loss": 1761.4808,
"step": 13680
},
{
"epoch": 0.2809545249887438,
"grad_norm": 664.0,
"learning_rate": 0.001,
"loss": 1749.8506,
"step": 13728
},
{
"epoch": 0.28193688346772544,
"grad_norm": 776.0,
"learning_rate": 0.001,
"loss": 1734.1479,
"step": 13776
},
{
"epoch": 0.28291924194670703,
"grad_norm": 708.0,
"learning_rate": 0.001,
"loss": 1721.951,
"step": 13824
},
{
"epoch": 0.2839016004256887,
"grad_norm": 748.0,
"learning_rate": 0.001,
"loss": 1741.5046,
"step": 13872
},
{
"epoch": 0.2848839589046703,
"grad_norm": 764.0,
"learning_rate": 0.001,
"loss": 1743.1763,
"step": 13920
},
{
"epoch": 0.2858663173836519,
"grad_norm": 808.0,
"learning_rate": 0.001,
"loss": 1767.7448,
"step": 13968
},
{
"epoch": 0.2868486758626335,
"grad_norm": 696.0,
"learning_rate": 0.001,
"loss": 1717.0291,
"step": 14016
},
{
"epoch": 0.28783103434161517,
"grad_norm": 804.0,
"learning_rate": 0.001,
"loss": 1731.1566,
"step": 14064
},
{
"epoch": 0.28881339282059676,
"grad_norm": 788.0,
"learning_rate": 0.001,
"loss": 1720.1551,
"step": 14112
},
{
"epoch": 0.2897957512995784,
"grad_norm": 732.0,
"learning_rate": 0.001,
"loss": 1743.5467,
"step": 14160
},
{
"epoch": 0.29077810977856,
"grad_norm": 680.0,
"learning_rate": 0.001,
"loss": 1736.5747,
"step": 14208
},
{
"epoch": 0.29176046825754165,
"grad_norm": 740.0,
"learning_rate": 0.001,
"loss": 1737.3779,
"step": 14256
},
{
"epoch": 0.29274282673652324,
"grad_norm": 760.0,
"learning_rate": 0.001,
"loss": 1718.3322,
"step": 14304
},
{
"epoch": 0.2937251852155049,
"grad_norm": 760.0,
"learning_rate": 0.001,
"loss": 1736.3989,
"step": 14352
},
{
"epoch": 0.29470754369448654,
"grad_norm": 832.0,
"learning_rate": 0.001,
"loss": 1738.8551,
"step": 14400
},
{
"epoch": 0.29568990217346813,
"grad_norm": 900.0,
"learning_rate": 0.001,
"loss": 1711.0072,
"step": 14448
},
{
"epoch": 0.2966722606524498,
"grad_norm": 748.0,
"learning_rate": 0.001,
"loss": 1709.6022,
"step": 14496
},
{
"epoch": 0.2976546191314314,
"grad_norm": 704.0,
"learning_rate": 0.001,
"loss": 1741.5581,
"step": 14544
},
{
"epoch": 0.298636977610413,
"grad_norm": 684.0,
"learning_rate": 0.001,
"loss": 1715.0571,
"step": 14592
},
{
"epoch": 0.2996193360893946,
"grad_norm": 728.0,
"learning_rate": 0.001,
"loss": 1733.5199,
"step": 14640
},
{
"epoch": 0.29998772051901273,
"eval_loss": 1650.6409912109375,
"eval_runtime": 9.0148,
"eval_samples_per_second": 110.929,
"eval_steps_per_second": 1.442,
"step": 14658
},
{
"epoch": 0.30060169456837627,
"grad_norm": 708.0,
"learning_rate": 0.001,
"loss": 1719.6375,
"step": 14688
},
{
"epoch": 0.30158405304735786,
"grad_norm": 616.0,
"learning_rate": 0.001,
"loss": 1728.5174,
"step": 14736
},
{
"epoch": 0.3025664115263395,
"grad_norm": 624.0,
"learning_rate": 0.001,
"loss": 1724.9813,
"step": 14784
},
{
"epoch": 0.3035487700053211,
"grad_norm": 716.0,
"learning_rate": 0.001,
"loss": 1704.8024,
"step": 14832
},
{
"epoch": 0.30453112848430275,
"grad_norm": 748.0,
"learning_rate": 0.001,
"loss": 1714.6235,
"step": 14880
},
{
"epoch": 0.30551348696328434,
"grad_norm": 1048.0,
"learning_rate": 0.001,
"loss": 1734.4709,
"step": 14928
},
{
"epoch": 0.306495845442266,
"grad_norm": 688.0,
"learning_rate": 0.001,
"loss": 1721.2712,
"step": 14976
},
{
"epoch": 0.3074782039212476,
"grad_norm": 724.0,
"learning_rate": 0.001,
"loss": 1752.32,
"step": 15024
},
{
"epoch": 0.30846056240022923,
"grad_norm": 792.0,
"learning_rate": 0.001,
"loss": 1711.9393,
"step": 15072
},
{
"epoch": 0.3094429208792108,
"grad_norm": 904.0,
"learning_rate": 0.001,
"loss": 1722.3177,
"step": 15120
},
{
"epoch": 0.3104252793581925,
"grad_norm": 768.0,
"learning_rate": 0.001,
"loss": 1737.7088,
"step": 15168
},
{
"epoch": 0.31140763783717407,
"grad_norm": 768.0,
"learning_rate": 0.001,
"loss": 1728.0853,
"step": 15216
},
{
"epoch": 0.3123899963161557,
"grad_norm": 776.0,
"learning_rate": 0.001,
"loss": 1711.749,
"step": 15264
},
{
"epoch": 0.3133723547951373,
"grad_norm": 840.0,
"learning_rate": 0.001,
"loss": 1717.5446,
"step": 15312
},
{
"epoch": 0.31435471327411896,
"grad_norm": 696.0,
"learning_rate": 0.001,
"loss": 1718.4888,
"step": 15360
},
{
"epoch": 0.31533707175310055,
"grad_norm": 692.0,
"learning_rate": 0.001,
"loss": 1722.6672,
"step": 15408
},
{
"epoch": 0.3163194302320822,
"grad_norm": 864.0,
"learning_rate": 0.001,
"loss": 1715.826,
"step": 15456
},
{
"epoch": 0.3173017887110638,
"grad_norm": 712.0,
"learning_rate": 0.001,
"loss": 1714.7765,
"step": 15504
},
{
"epoch": 0.31828414719004544,
"grad_norm": 712.0,
"learning_rate": 0.001,
"loss": 1718.0269,
"step": 15552
},
{
"epoch": 0.31926650566902703,
"grad_norm": 748.0,
"learning_rate": 0.001,
"loss": 1691.7855,
"step": 15600
},
{
"epoch": 0.3202488641480087,
"grad_norm": 888.0,
"learning_rate": 0.001,
"loss": 1717.6808,
"step": 15648
},
{
"epoch": 0.3212312226269903,
"grad_norm": 680.0,
"learning_rate": 0.001,
"loss": 1719.3472,
"step": 15696
},
{
"epoch": 0.3222135811059719,
"grad_norm": 868.0,
"learning_rate": 0.001,
"loss": 1691.0158,
"step": 15744
},
{
"epoch": 0.3231959395849535,
"grad_norm": 756.0,
"learning_rate": 0.001,
"loss": 1704.2186,
"step": 15792
},
{
"epoch": 0.32417829806393517,
"grad_norm": 888.0,
"learning_rate": 0.001,
"loss": 1723.0382,
"step": 15840
},
{
"epoch": 0.32516065654291676,
"grad_norm": 912.0,
"learning_rate": 0.001,
"loss": 1702.2889,
"step": 15888
},
{
"epoch": 0.3261430150218984,
"grad_norm": 820.0,
"learning_rate": 0.001,
"loss": 1728.0734,
"step": 15936
},
{
"epoch": 0.32712537350088006,
"grad_norm": 788.0,
"learning_rate": 0.001,
"loss": 1720.2152,
"step": 15984
},
{
"epoch": 0.32810773197986165,
"grad_norm": 808.0,
"learning_rate": 0.001,
"loss": 1702.1133,
"step": 16032
},
{
"epoch": 0.3290900904588433,
"grad_norm": 836.0,
"learning_rate": 0.001,
"loss": 1720.4746,
"step": 16080
},
{
"epoch": 0.3300724489378249,
"grad_norm": 836.0,
"learning_rate": 0.001,
"loss": 1689.6606,
"step": 16128
},
{
"epoch": 0.33105480741680654,
"grad_norm": 728.0,
"learning_rate": 0.001,
"loss": 1689.0417,
"step": 16176
},
{
"epoch": 0.33203716589578813,
"grad_norm": 848.0,
"learning_rate": 0.001,
"loss": 1703.012,
"step": 16224
},
{
"epoch": 0.3330195243747698,
"grad_norm": 756.0,
"learning_rate": 0.001,
"loss": 1700.2785,
"step": 16272
},
{
"epoch": 0.3340018828537514,
"grad_norm": 756.0,
"learning_rate": 0.001,
"loss": 1709.3231,
"step": 16320
},
{
"epoch": 0.334984241332733,
"grad_norm": 960.0,
"learning_rate": 0.001,
"loss": 1715.8831,
"step": 16368
},
{
"epoch": 0.3359665998117146,
"grad_norm": 692.0,
"learning_rate": 0.001,
"loss": 1695.813,
"step": 16416
},
{
"epoch": 0.33694895829069627,
"grad_norm": 688.0,
"learning_rate": 0.001,
"loss": 1685.9803,
"step": 16464
},
{
"epoch": 0.33793131676967786,
"grad_norm": 876.0,
"learning_rate": 0.001,
"loss": 1704.5868,
"step": 16512
},
{
"epoch": 0.3389136752486595,
"grad_norm": 732.0,
"learning_rate": 0.001,
"loss": 1681.2751,
"step": 16560
},
{
"epoch": 0.3398960337276411,
"grad_norm": 748.0,
"learning_rate": 0.001,
"loss": 1690.0252,
"step": 16608
},
{
"epoch": 0.34087839220662275,
"grad_norm": 780.0,
"learning_rate": 0.001,
"loss": 1698.1932,
"step": 16656
},
{
"epoch": 0.34186075068560434,
"grad_norm": 856.0,
"learning_rate": 0.001,
"loss": 1692.6128,
"step": 16704
},
{
"epoch": 0.342843109164586,
"grad_norm": 880.0,
"learning_rate": 0.001,
"loss": 1696.6901,
"step": 16752
},
{
"epoch": 0.3438254676435676,
"grad_norm": 688.0,
"learning_rate": 0.001,
"loss": 1693.9344,
"step": 16800
},
{
"epoch": 0.34480782612254923,
"grad_norm": 688.0,
"learning_rate": 0.001,
"loss": 1704.4855,
"step": 16848
},
{
"epoch": 0.3457901846015308,
"grad_norm": 692.0,
"learning_rate": 0.001,
"loss": 1705.7817,
"step": 16896
},
{
"epoch": 0.3467725430805125,
"grad_norm": 760.0,
"learning_rate": 0.001,
"loss": 1690.8944,
"step": 16944
},
{
"epoch": 0.34775490155949407,
"grad_norm": 868.0,
"learning_rate": 0.001,
"loss": 1685.9479,
"step": 16992
},
{
"epoch": 0.3487372600384757,
"grad_norm": 744.0,
"learning_rate": 0.001,
"loss": 1698.2961,
"step": 17040
},
{
"epoch": 0.3497196185174573,
"grad_norm": 688.0,
"learning_rate": 0.001,
"loss": 1693.7596,
"step": 17088
},
{
"epoch": 0.35070197699643896,
"grad_norm": 760.0,
"learning_rate": 0.001,
"loss": 1702.9092,
"step": 17136
},
{
"epoch": 0.35168433547542055,
"grad_norm": 788.0,
"learning_rate": 0.001,
"loss": 1672.0039,
"step": 17184
},
{
"epoch": 0.3526666939544022,
"grad_norm": 772.0,
"learning_rate": 0.001,
"loss": 1673.5811,
"step": 17232
},
{
"epoch": 0.3536490524333838,
"grad_norm": 976.0,
"learning_rate": 0.001,
"loss": 1697.0251,
"step": 17280
},
{
"epoch": 0.35463141091236544,
"grad_norm": 740.0,
"learning_rate": 0.001,
"loss": 1699.7508,
"step": 17328
},
{
"epoch": 0.35561376939134703,
"grad_norm": 704.0,
"learning_rate": 0.001,
"loss": 1708.7798,
"step": 17376
},
{
"epoch": 0.3565961278703287,
"grad_norm": 716.0,
"learning_rate": 0.001,
"loss": 1693.451,
"step": 17424
},
{
"epoch": 0.3575784863493103,
"grad_norm": 904.0,
"learning_rate": 0.001,
"loss": 1676.6382,
"step": 17472
},
{
"epoch": 0.3585608448282919,
"grad_norm": 700.0,
"learning_rate": 0.001,
"loss": 1691.7266,
"step": 17520
},
{
"epoch": 0.3595432033072736,
"grad_norm": 1012.0,
"learning_rate": 0.001,
"loss": 1666.9458,
"step": 17568
},
{
"epoch": 0.36052556178625517,
"grad_norm": 804.0,
"learning_rate": 0.001,
"loss": 1675.5584,
"step": 17616
},
{
"epoch": 0.3615079202652368,
"grad_norm": 856.0,
"learning_rate": 0.001,
"loss": 1685.9595,
"step": 17664
},
{
"epoch": 0.3624902787442184,
"grad_norm": 836.0,
"learning_rate": 0.001,
"loss": 1695.4741,
"step": 17712
},
{
"epoch": 0.36347263722320006,
"grad_norm": 696.0,
"learning_rate": 0.001,
"loss": 1689.256,
"step": 17760
},
{
"epoch": 0.36445499570218165,
"grad_norm": 732.0,
"learning_rate": 0.001,
"loss": 1689.8599,
"step": 17808
},
{
"epoch": 0.3654373541811633,
"grad_norm": 876.0,
"learning_rate": 0.001,
"loss": 1668.894,
"step": 17856
},
{
"epoch": 0.3664197126601449,
"grad_norm": 780.0,
"learning_rate": 0.001,
"loss": 1684.5539,
"step": 17904
},
{
"epoch": 0.36740207113912654,
"grad_norm": 952.0,
"learning_rate": 0.001,
"loss": 1683.5539,
"step": 17952
},
{
"epoch": 0.36838442961810813,
"grad_norm": 860.0,
"learning_rate": 0.001,
"loss": 1695.5282,
"step": 18000
},
{
"epoch": 0.3693667880970898,
"grad_norm": 844.0,
"learning_rate": 0.001,
"loss": 1667.8563,
"step": 18048
},
{
"epoch": 0.3703491465760714,
"grad_norm": 800.0,
"learning_rate": 0.001,
"loss": 1671.2471,
"step": 18096
},
{
"epoch": 0.371331505055053,
"grad_norm": 760.0,
"learning_rate": 0.001,
"loss": 1664.4082,
"step": 18144
},
{
"epoch": 0.3723138635340346,
"grad_norm": 1120.0,
"learning_rate": 0.001,
"loss": 1666.0448,
"step": 18192
},
{
"epoch": 0.37329622201301627,
"grad_norm": 824.0,
"learning_rate": 0.001,
"loss": 1665.9009,
"step": 18240
},
{
"epoch": 0.37427858049199786,
"grad_norm": 872.0,
"learning_rate": 0.001,
"loss": 1663.6131,
"step": 18288
},
{
"epoch": 0.3752609389709795,
"grad_norm": 804.0,
"learning_rate": 0.001,
"loss": 1665.7214,
"step": 18336
},
{
"epoch": 0.3762432974499611,
"grad_norm": 768.0,
"learning_rate": 0.001,
"loss": 1663.0591,
"step": 18384
},
{
"epoch": 0.37722565592894275,
"grad_norm": 988.0,
"learning_rate": 0.001,
"loss": 1683.5985,
"step": 18432
},
{
"epoch": 0.37820801440792434,
"grad_norm": 804.0,
"learning_rate": 0.001,
"loss": 1661.8081,
"step": 18480
},
{
"epoch": 0.379190372886906,
"grad_norm": 776.0,
"learning_rate": 0.001,
"loss": 1685.9769,
"step": 18528
},
{
"epoch": 0.3801727313658876,
"grad_norm": 920.0,
"learning_rate": 0.001,
"loss": 1676.7816,
"step": 18576
},
{
"epoch": 0.38115508984486923,
"grad_norm": 800.0,
"learning_rate": 0.001,
"loss": 1669.9821,
"step": 18624
},
{
"epoch": 0.3821374483238508,
"grad_norm": 828.0,
"learning_rate": 0.001,
"loss": 1654.353,
"step": 18672
},
{
"epoch": 0.3831198068028325,
"grad_norm": 756.0,
"learning_rate": 0.001,
"loss": 1641.133,
"step": 18720
},
{
"epoch": 0.38410216528181407,
"grad_norm": 772.0,
"learning_rate": 0.001,
"loss": 1654.8257,
"step": 18768
},
{
"epoch": 0.3850845237607957,
"grad_norm": 856.0,
"learning_rate": 0.001,
"loss": 1674.0685,
"step": 18816
},
{
"epoch": 0.3860668822397773,
"grad_norm": 980.0,
"learning_rate": 0.001,
"loss": 1663.0804,
"step": 18864
},
{
"epoch": 0.38704924071875896,
"grad_norm": 852.0,
"learning_rate": 0.001,
"loss": 1676.6375,
"step": 18912
},
{
"epoch": 0.38803159919774055,
"grad_norm": 788.0,
"learning_rate": 0.001,
"loss": 1681.3968,
"step": 18960
},
{
"epoch": 0.3890139576767222,
"grad_norm": 740.0,
"learning_rate": 0.001,
"loss": 1658.5428,
"step": 19008
},
{
"epoch": 0.3899963161557038,
"grad_norm": 772.0,
"learning_rate": 0.001,
"loss": 1693.6683,
"step": 19056
},
{
"epoch": 0.39097867463468544,
"grad_norm": 1224.0,
"learning_rate": 0.001,
"loss": 1637.9217,
"step": 19104
},
{
"epoch": 0.3919610331136671,
"grad_norm": 896.0,
"learning_rate": 0.001,
"loss": 1680.59,
"step": 19152
},
{
"epoch": 0.3929433915926487,
"grad_norm": 716.0,
"learning_rate": 0.001,
"loss": 1663.9777,
"step": 19200
},
{
"epoch": 0.39392575007163033,
"grad_norm": 940.0,
"learning_rate": 0.001,
"loss": 1672.4491,
"step": 19248
},
{
"epoch": 0.3949081085506119,
"grad_norm": 696.0,
"learning_rate": 0.001,
"loss": 1668.8294,
"step": 19296
},
{
"epoch": 0.3958904670295936,
"grad_norm": 740.0,
"learning_rate": 0.001,
"loss": 1652.0028,
"step": 19344
},
{
"epoch": 0.39687282550857517,
"grad_norm": 900.0,
"learning_rate": 0.001,
"loss": 1670.4543,
"step": 19392
},
{
"epoch": 0.3978551839875568,
"grad_norm": 832.0,
"learning_rate": 0.001,
"loss": 1674.2799,
"step": 19440
},
{
"epoch": 0.3988375424665384,
"grad_norm": 896.0,
"learning_rate": 0.001,
"loss": 1637.7557,
"step": 19488
},
{
"epoch": 0.39981990094552006,
"grad_norm": 772.0,
"learning_rate": 0.001,
"loss": 1659.3999,
"step": 19536
},
{
"epoch": 0.39998362735868365,
"eval_loss": 1588.186767578125,
"eval_runtime": 9.0185,
"eval_samples_per_second": 110.884,
"eval_steps_per_second": 1.441,
"step": 19544
},
{
"epoch": 0.40080225942450165,
"grad_norm": 812.0,
"learning_rate": 0.001,
"loss": 1663.3135,
"step": 19584
},
{
"epoch": 0.4017846179034833,
"grad_norm": 820.0,
"learning_rate": 0.001,
"loss": 1648.4126,
"step": 19632
},
{
"epoch": 0.4027669763824649,
"grad_norm": 828.0,
"learning_rate": 0.001,
"loss": 1629.137,
"step": 19680
},
{
"epoch": 0.40374933486144654,
"grad_norm": 840.0,
"learning_rate": 0.001,
"loss": 1675.1574,
"step": 19728
},
{
"epoch": 0.40473169334042813,
"grad_norm": 892.0,
"learning_rate": 0.001,
"loss": 1651.4735,
"step": 19776
},
{
"epoch": 0.4057140518194098,
"grad_norm": 732.0,
"learning_rate": 0.001,
"loss": 1659.3854,
"step": 19824
},
{
"epoch": 0.4066964102983914,
"grad_norm": 808.0,
"learning_rate": 0.001,
"loss": 1676.7249,
"step": 19872
},
{
"epoch": 0.407678768777373,
"grad_norm": 1088.0,
"learning_rate": 0.001,
"loss": 1658.8581,
"step": 19920
},
{
"epoch": 0.4086611272563546,
"grad_norm": 772.0,
"learning_rate": 0.001,
"loss": 1663.8711,
"step": 19968
},
{
"epoch": 0.40964348573533627,
"grad_norm": 724.0,
"learning_rate": 0.001,
"loss": 1657.056,
"step": 20016
},
{
"epoch": 0.41062584421431786,
"grad_norm": 920.0,
"learning_rate": 0.001,
"loss": 1637.993,
"step": 20064
},
{
"epoch": 0.4116082026932995,
"grad_norm": 636.0,
"learning_rate": 0.001,
"loss": 1644.8833,
"step": 20112
},
{
"epoch": 0.4125905611722811,
"grad_norm": 956.0,
"learning_rate": 0.001,
"loss": 1656.4339,
"step": 20160
},
{
"epoch": 0.41357291965126275,
"grad_norm": 792.0,
"learning_rate": 0.001,
"loss": 1652.4888,
"step": 20208
},
{
"epoch": 0.41455527813024434,
"grad_norm": 832.0,
"learning_rate": 0.001,
"loss": 1642.5142,
"step": 20256
},
{
"epoch": 0.415537636609226,
"grad_norm": 724.0,
"learning_rate": 0.001,
"loss": 1668.1183,
"step": 20304
},
{
"epoch": 0.4165199950882076,
"grad_norm": 796.0,
"learning_rate": 0.001,
"loss": 1660.486,
"step": 20352
},
{
"epoch": 0.41750235356718923,
"grad_norm": 788.0,
"learning_rate": 0.001,
"loss": 1655.4614,
"step": 20400
},
{
"epoch": 0.4184847120461708,
"grad_norm": 844.0,
"learning_rate": 0.001,
"loss": 1650.6449,
"step": 20448
},
{
"epoch": 0.4194670705251525,
"grad_norm": 1080.0,
"learning_rate": 0.001,
"loss": 1639.0168,
"step": 20496
},
{
"epoch": 0.42044942900413407,
"grad_norm": 944.0,
"learning_rate": 0.001,
"loss": 1655.6951,
"step": 20544
},
{
"epoch": 0.4214317874831157,
"grad_norm": 716.0,
"learning_rate": 0.001,
"loss": 1644.007,
"step": 20592
},
{
"epoch": 0.4224141459620973,
"grad_norm": 896.0,
"learning_rate": 0.001,
"loss": 1614.1253,
"step": 20640
},
{
"epoch": 0.42339650444107896,
"grad_norm": 736.0,
"learning_rate": 0.001,
"loss": 1637.6761,
"step": 20688
},
{
"epoch": 0.42437886292006055,
"grad_norm": 752.0,
"learning_rate": 0.001,
"loss": 1635.8405,
"step": 20736
},
{
"epoch": 0.4253612213990422,
"grad_norm": 768.0,
"learning_rate": 0.001,
"loss": 1653.1647,
"step": 20784
},
{
"epoch": 0.42634357987802385,
"grad_norm": 832.0,
"learning_rate": 0.001,
"loss": 1657.5373,
"step": 20832
},
{
"epoch": 0.42732593835700544,
"grad_norm": 832.0,
"learning_rate": 0.001,
"loss": 1649.7998,
"step": 20880
},
{
"epoch": 0.4283082968359871,
"grad_norm": 848.0,
"learning_rate": 0.001,
"loss": 1639.043,
"step": 20928
},
{
"epoch": 0.4292906553149687,
"grad_norm": 716.0,
"learning_rate": 0.001,
"loss": 1656.0706,
"step": 20976
},
{
"epoch": 0.43027301379395033,
"grad_norm": 924.0,
"learning_rate": 0.001,
"loss": 1640.4744,
"step": 21024
},
{
"epoch": 0.4312553722729319,
"grad_norm": 832.0,
"learning_rate": 0.001,
"loss": 1640.8403,
"step": 21072
},
{
"epoch": 0.4322377307519136,
"grad_norm": 828.0,
"learning_rate": 0.001,
"loss": 1638.4172,
"step": 21120
},
{
"epoch": 0.43322008923089517,
"grad_norm": 732.0,
"learning_rate": 0.001,
"loss": 1662.0506,
"step": 21168
},
{
"epoch": 0.4342024477098768,
"grad_norm": 752.0,
"learning_rate": 0.001,
"loss": 1634.0928,
"step": 21216
},
{
"epoch": 0.4351848061888584,
"grad_norm": 976.0,
"learning_rate": 0.001,
"loss": 1650.1844,
"step": 21264
},
{
"epoch": 0.43616716466784006,
"grad_norm": 864.0,
"learning_rate": 0.001,
"loss": 1637.9504,
"step": 21312
},
{
"epoch": 0.43714952314682165,
"grad_norm": 660.0,
"learning_rate": 0.001,
"loss": 1651.0807,
"step": 21360
},
{
"epoch": 0.4381318816258033,
"grad_norm": 884.0,
"learning_rate": 0.001,
"loss": 1639.6099,
"step": 21408
},
{
"epoch": 0.4391142401047849,
"grad_norm": 752.0,
"learning_rate": 0.001,
"loss": 1636.7961,
"step": 21456
},
{
"epoch": 0.44009659858376654,
"grad_norm": 704.0,
"learning_rate": 0.001,
"loss": 1630.2891,
"step": 21504
},
{
"epoch": 0.44107895706274813,
"grad_norm": 868.0,
"learning_rate": 0.001,
"loss": 1651.1029,
"step": 21552
},
{
"epoch": 0.4420613155417298,
"grad_norm": 1200.0,
"learning_rate": 0.001,
"loss": 1658.1079,
"step": 21600
},
{
"epoch": 0.4430436740207114,
"grad_norm": 820.0,
"learning_rate": 0.001,
"loss": 1650.035,
"step": 21648
},
{
"epoch": 0.444026032499693,
"grad_norm": 628.0,
"learning_rate": 0.001,
"loss": 1651.446,
"step": 21696
},
{
"epoch": 0.4450083909786746,
"grad_norm": 936.0,
"learning_rate": 0.001,
"loss": 1651.8545,
"step": 21744
},
{
"epoch": 0.44599074945765627,
"grad_norm": 724.0,
"learning_rate": 0.001,
"loss": 1633.7931,
"step": 21792
},
{
"epoch": 0.44697310793663786,
"grad_norm": 868.0,
"learning_rate": 0.001,
"loss": 1643.6271,
"step": 21840
},
{
"epoch": 0.4479554664156195,
"grad_norm": 884.0,
"learning_rate": 0.001,
"loss": 1628.9087,
"step": 21888
},
{
"epoch": 0.4489378248946011,
"grad_norm": 808.0,
"learning_rate": 0.001,
"loss": 1633.3311,
"step": 21936
},
{
"epoch": 0.44992018337358275,
"grad_norm": 776.0,
"learning_rate": 0.001,
"loss": 1636.3483,
"step": 21984
},
{
"epoch": 0.45090254185256434,
"grad_norm": 740.0,
"learning_rate": 0.001,
"loss": 1627.1842,
"step": 22032
},
{
"epoch": 0.451884900331546,
"grad_norm": 832.0,
"learning_rate": 0.001,
"loss": 1632.6536,
"step": 22080
},
{
"epoch": 0.4528672588105276,
"grad_norm": 920.0,
"learning_rate": 0.001,
"loss": 1663.3418,
"step": 22128
},
{
"epoch": 0.45384961728950923,
"grad_norm": 752.0,
"learning_rate": 0.001,
"loss": 1635.7738,
"step": 22176
},
{
"epoch": 0.4548319757684908,
"grad_norm": 752.0,
"learning_rate": 0.001,
"loss": 1636.0459,
"step": 22224
},
{
"epoch": 0.4558143342474725,
"grad_norm": 892.0,
"learning_rate": 0.001,
"loss": 1627.4956,
"step": 22272
},
{
"epoch": 0.45679669272645407,
"grad_norm": 860.0,
"learning_rate": 0.001,
"loss": 1628.1626,
"step": 22320
},
{
"epoch": 0.4577790512054357,
"grad_norm": 776.0,
"learning_rate": 0.001,
"loss": 1628.0701,
"step": 22368
},
{
"epoch": 0.45876140968441737,
"grad_norm": 792.0,
"learning_rate": 0.001,
"loss": 1644.1922,
"step": 22416
},
{
"epoch": 0.45974376816339896,
"grad_norm": 776.0,
"learning_rate": 0.001,
"loss": 1608.5988,
"step": 22464
},
{
"epoch": 0.4607261266423806,
"grad_norm": 860.0,
"learning_rate": 0.001,
"loss": 1637.166,
"step": 22512
},
{
"epoch": 0.4617084851213622,
"grad_norm": 924.0,
"learning_rate": 0.001,
"loss": 1626.6854,
"step": 22560
},
{
"epoch": 0.46269084360034385,
"grad_norm": 876.0,
"learning_rate": 0.001,
"loss": 1639.1245,
"step": 22608
},
{
"epoch": 0.46367320207932544,
"grad_norm": 936.0,
"learning_rate": 0.001,
"loss": 1634.3815,
"step": 22656
},
{
"epoch": 0.4646555605583071,
"grad_norm": 912.0,
"learning_rate": 0.001,
"loss": 1606.1912,
"step": 22704
},
{
"epoch": 0.4656379190372887,
"grad_norm": 952.0,
"learning_rate": 0.001,
"loss": 1620.5391,
"step": 22752
},
{
"epoch": 0.46662027751627033,
"grad_norm": 960.0,
"learning_rate": 0.001,
"loss": 1612.0667,
"step": 22800
},
{
"epoch": 0.4676026359952519,
"grad_norm": 832.0,
"learning_rate": 0.001,
"loss": 1651.9868,
"step": 22848
},
{
"epoch": 0.4685849944742336,
"grad_norm": 712.0,
"learning_rate": 0.001,
"loss": 1629.237,
"step": 22896
},
{
"epoch": 0.46956735295321517,
"grad_norm": 864.0,
"learning_rate": 0.001,
"loss": 1618.2004,
"step": 22944
},
{
"epoch": 0.4705497114321968,
"grad_norm": 728.0,
"learning_rate": 0.001,
"loss": 1625.5379,
"step": 22992
},
{
"epoch": 0.4715320699111784,
"grad_norm": 836.0,
"learning_rate": 0.001,
"loss": 1622.8146,
"step": 23040
},
{
"epoch": 0.47251442839016006,
"grad_norm": 1064.0,
"learning_rate": 0.001,
"loss": 1623.9705,
"step": 23088
},
{
"epoch": 0.47349678686914165,
"grad_norm": 860.0,
"learning_rate": 0.001,
"loss": 1626.2383,
"step": 23136
},
{
"epoch": 0.4744791453481233,
"grad_norm": 1120.0,
"learning_rate": 0.001,
"loss": 1634.2668,
"step": 23184
},
{
"epoch": 0.4754615038271049,
"grad_norm": 796.0,
"learning_rate": 0.001,
"loss": 1642.5649,
"step": 23232
},
{
"epoch": 0.47644386230608654,
"grad_norm": 1072.0,
"learning_rate": 0.001,
"loss": 1633.4873,
"step": 23280
},
{
"epoch": 0.47742622078506813,
"grad_norm": 904.0,
"learning_rate": 0.001,
"loss": 1604.186,
"step": 23328
},
{
"epoch": 0.4784085792640498,
"grad_norm": 852.0,
"learning_rate": 0.001,
"loss": 1608.5158,
"step": 23376
},
{
"epoch": 0.4793909377430314,
"grad_norm": 860.0,
"learning_rate": 0.001,
"loss": 1624.3991,
"step": 23424
},
{
"epoch": 0.480373296222013,
"grad_norm": 820.0,
"learning_rate": 0.001,
"loss": 1640.4948,
"step": 23472
},
{
"epoch": 0.4813556547009946,
"grad_norm": 700.0,
"learning_rate": 0.001,
"loss": 1593.3931,
"step": 23520
},
{
"epoch": 0.48233801317997627,
"grad_norm": 824.0,
"learning_rate": 0.001,
"loss": 1622.2378,
"step": 23568
},
{
"epoch": 0.48332037165895786,
"grad_norm": 936.0,
"learning_rate": 0.001,
"loss": 1612.6034,
"step": 23616
},
{
"epoch": 0.4843027301379395,
"grad_norm": 732.0,
"learning_rate": 0.001,
"loss": 1624.4165,
"step": 23664
},
{
"epoch": 0.4852850886169211,
"grad_norm": 820.0,
"learning_rate": 0.001,
"loss": 1634.2424,
"step": 23712
},
{
"epoch": 0.48626744709590275,
"grad_norm": 896.0,
"learning_rate": 0.001,
"loss": 1616.617,
"step": 23760
},
{
"epoch": 0.48724980557488434,
"grad_norm": 732.0,
"learning_rate": 0.001,
"loss": 1629.8065,
"step": 23808
},
{
"epoch": 0.488232164053866,
"grad_norm": 876.0,
"learning_rate": 0.001,
"loss": 1611.0832,
"step": 23856
},
{
"epoch": 0.4892145225328476,
"grad_norm": 1004.0,
"learning_rate": 0.001,
"loss": 1596.9705,
"step": 23904
},
{
"epoch": 0.49019688101182923,
"grad_norm": 812.0,
"learning_rate": 0.001,
"loss": 1612.6437,
"step": 23952
},
{
"epoch": 0.4911792394908109,
"grad_norm": 764.0,
"learning_rate": 0.001,
"loss": 1612.0643,
"step": 24000
},
{
"epoch": 0.4921615979697925,
"grad_norm": 944.0,
"learning_rate": 0.001,
"loss": 1620.1268,
"step": 24048
},
{
"epoch": 0.4931439564487741,
"grad_norm": 920.0,
"learning_rate": 0.001,
"loss": 1619.8875,
"step": 24096
},
{
"epoch": 0.4941263149277557,
"grad_norm": 808.0,
"learning_rate": 0.001,
"loss": 1608.6463,
"step": 24144
},
{
"epoch": 0.49510867340673737,
"grad_norm": 784.0,
"learning_rate": 0.001,
"loss": 1609.1462,
"step": 24192
},
{
"epoch": 0.49609103188571896,
"grad_norm": 796.0,
"learning_rate": 0.001,
"loss": 1610.4935,
"step": 24240
},
{
"epoch": 0.4970733903647006,
"grad_norm": 896.0,
"learning_rate": 0.001,
"loss": 1622.8371,
"step": 24288
},
{
"epoch": 0.4980557488436822,
"grad_norm": 768.0,
"learning_rate": 0.001,
"loss": 1617.0732,
"step": 24336
},
{
"epoch": 0.49903810732266385,
"grad_norm": 920.0,
"learning_rate": 0.001,
"loss": 1615.2142,
"step": 24384
},
{
"epoch": 0.49997953419835456,
"eval_loss": 1546.7662353515625,
"eval_runtime": 9.0072,
"eval_samples_per_second": 111.022,
"eval_steps_per_second": 1.443,
"step": 24430
}
],
"logging_steps": 48,
"max_steps": 48862,
"num_input_tokens_seen": 0,
"num_train_epochs": 9223372036854775807,
"save_steps": 4886,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.7859053283033743e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}