{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.49997953419835456,
  "eval_steps": 4886,
  "global_step": 24430,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0009823584789816217,
      "grad_norm": 640.0,
      "learning_rate": 0.001,
      "loss": 11885.6341,
      "step": 48
    },
    {
      "epoch": 0.0019647169579632435,
      "grad_norm": 454.0,
      "learning_rate": 0.001,
      "loss": 8966.3691,
      "step": 96
    },
    {
      "epoch": 0.0029470754369448652,
      "grad_norm": 572.0,
      "learning_rate": 0.001,
      "loss": 7738.5072,
      "step": 144
    },
    {
      "epoch": 0.003929433915926487,
      "grad_norm": 636.0,
      "learning_rate": 0.001,
      "loss": 7036.127,
      "step": 192
    },
    {
      "epoch": 0.004911792394908109,
      "grad_norm": 748.0,
      "learning_rate": 0.001,
      "loss": 6540.5202,
      "step": 240
    },
    {
      "epoch": 0.0058941508738897305,
      "grad_norm": 556.0,
      "learning_rate": 0.001,
      "loss": 6263.0488,
      "step": 288
    },
    {
      "epoch": 0.006876509352871352,
      "grad_norm": 532.0,
      "learning_rate": 0.001,
      "loss": 5950.1823,
      "step": 336
    },
    {
      "epoch": 0.007858867831852974,
      "grad_norm": 928.0,
      "learning_rate": 0.001,
      "loss": 5705.2292,
      "step": 384
    },
    {
      "epoch": 0.008841226310834595,
      "grad_norm": 444.0,
      "learning_rate": 0.001,
      "loss": 5496.4583,
      "step": 432
    },
    {
      "epoch": 0.009823584789816217,
      "grad_norm": 656.0,
      "learning_rate": 0.001,
      "loss": 5272.5752,
      "step": 480
    },
    {
      "epoch": 0.010805943268797838,
      "grad_norm": 612.0,
      "learning_rate": 0.001,
      "loss": 5051.2663,
      "step": 528
    },
    {
      "epoch": 0.011788301747779461,
      "grad_norm": 608.0,
      "learning_rate": 0.001,
      "loss": 4938.0895,
      "step": 576
    },
    {
      "epoch": 0.012770660226761082,
      "grad_norm": 466.0,
      "learning_rate": 0.001,
      "loss": 4740.762,
      "step": 624
    },
    {
      "epoch": 0.013753018705742704,
      "grad_norm": 438.0,
      "learning_rate": 0.001,
      "loss": 4573.4443,
      "step": 672
    },
    {
      "epoch": 0.014735377184724325,
      "grad_norm": 696.0,
      "learning_rate": 0.001,
      "loss": 4539.9521,
      "step": 720
    },
    {
      "epoch": 0.015717735663705948,
      "grad_norm": 892.0,
      "learning_rate": 0.001,
      "loss": 4435.4001,
      "step": 768
    },
    {
      "epoch": 0.01670009414268757,
      "grad_norm": 704.0,
      "learning_rate": 0.001,
      "loss": 4239.0426,
      "step": 816
    },
    {
      "epoch": 0.01768245262166919,
      "grad_norm": 548.0,
      "learning_rate": 0.001,
      "loss": 4189.9281,
      "step": 864
    },
    {
      "epoch": 0.018664811100650814,
      "grad_norm": 540.0,
      "learning_rate": 0.001,
      "loss": 4104.7835,
      "step": 912
    },
    {
      "epoch": 0.019647169579632435,
      "grad_norm": 592.0,
      "learning_rate": 0.001,
      "loss": 4044.3369,
      "step": 960
    },
    {
      "epoch": 0.020629528058614056,
      "grad_norm": 536.0,
      "learning_rate": 0.001,
      "loss": 3936.5283,
      "step": 1008
    },
    {
      "epoch": 0.021611886537595677,
      "grad_norm": 604.0,
      "learning_rate": 0.001,
      "loss": 3915.6911,
      "step": 1056
    },
    {
      "epoch": 0.0225942450165773,
      "grad_norm": 458.0,
      "learning_rate": 0.001,
      "loss": 3759.7747,
      "step": 1104
    },
    {
      "epoch": 0.023576603495558922,
      "grad_norm": 636.0,
      "learning_rate": 0.001,
      "loss": 3760.4476,
      "step": 1152
    },
    {
      "epoch": 0.024558961974540543,
      "grad_norm": 672.0,
      "learning_rate": 0.001,
      "loss": 3672.9059,
      "step": 1200
    },
    {
      "epoch": 0.025541320453522164,
      "grad_norm": 592.0,
      "learning_rate": 0.001,
      "loss": 3645.0697,
      "step": 1248
    },
    {
      "epoch": 0.026523678932503784,
      "grad_norm": 552.0,
      "learning_rate": 0.001,
      "loss": 3528.9896,
      "step": 1296
    },
    {
      "epoch": 0.02750603741148541,
      "grad_norm": 470.0,
      "learning_rate": 0.001,
      "loss": 3488.8187,
      "step": 1344
    },
    {
      "epoch": 0.02848839589046703,
      "grad_norm": 580.0,
      "learning_rate": 0.001,
      "loss": 3466.627,
      "step": 1392
    },
    {
      "epoch": 0.02947075436944865,
      "grad_norm": 584.0,
      "learning_rate": 0.001,
      "loss": 3399.1475,
      "step": 1440
    },
    {
      "epoch": 0.03045311284843027,
      "grad_norm": 732.0,
      "learning_rate": 0.001,
      "loss": 3363.9762,
      "step": 1488
    },
    {
      "epoch": 0.031435471327411896,
      "grad_norm": 616.0,
      "learning_rate": 0.001,
      "loss": 3337.4564,
      "step": 1536
    },
    {
      "epoch": 0.03241782980639352,
      "grad_norm": 540.0,
      "learning_rate": 0.001,
      "loss": 3298.4583,
      "step": 1584
    },
    {
      "epoch": 0.03340018828537514,
      "grad_norm": 512.0,
      "learning_rate": 0.001,
      "loss": 3212.2949,
      "step": 1632
    },
    {
      "epoch": 0.03438254676435676,
      "grad_norm": 512.0,
      "learning_rate": 0.001,
      "loss": 3217.6631,
      "step": 1680
    },
    {
      "epoch": 0.03536490524333838,
      "grad_norm": 624.0,
      "learning_rate": 0.001,
      "loss": 3175.7318,
      "step": 1728
    },
    {
      "epoch": 0.03634726372232,
      "grad_norm": 520.0,
      "learning_rate": 0.001,
      "loss": 3140.3923,
      "step": 1776
    },
    {
      "epoch": 0.03732962220130163,
      "grad_norm": 664.0,
      "learning_rate": 0.001,
      "loss": 3099.8044,
      "step": 1824
    },
    {
      "epoch": 0.03831198068028325,
      "grad_norm": 604.0,
      "learning_rate": 0.001,
      "loss": 3114.8079,
      "step": 1872
    },
    {
      "epoch": 0.03929433915926487,
      "grad_norm": 478.0,
      "learning_rate": 0.001,
      "loss": 3048.9001,
      "step": 1920
    },
    {
      "epoch": 0.04027669763824649,
      "grad_norm": 520.0,
      "learning_rate": 0.001,
      "loss": 3018.8714,
      "step": 1968
    },
    {
      "epoch": 0.04125905611722811,
      "grad_norm": 456.0,
      "learning_rate": 0.001,
      "loss": 2981.1152,
      "step": 2016
    },
    {
      "epoch": 0.04224141459620973,
      "grad_norm": 628.0,
      "learning_rate": 0.001,
      "loss": 2999.249,
      "step": 2064
    },
    {
      "epoch": 0.04322377307519135,
      "grad_norm": 804.0,
      "learning_rate": 0.001,
      "loss": 2942.3376,
      "step": 2112
    },
    {
      "epoch": 0.044206131554172974,
      "grad_norm": 600.0,
      "learning_rate": 0.001,
      "loss": 2890.7354,
      "step": 2160
    },
    {
      "epoch": 0.0451884900331546,
      "grad_norm": 632.0,
      "learning_rate": 0.001,
      "loss": 2896.4242,
      "step": 2208
    },
    {
      "epoch": 0.04617084851213622,
      "grad_norm": 536.0,
      "learning_rate": 0.001,
      "loss": 2874.8643,
      "step": 2256
    },
    {
      "epoch": 0.047153206991117844,
      "grad_norm": 494.0,
      "learning_rate": 0.001,
      "loss": 2807.6911,
      "step": 2304
    },
    {
      "epoch": 0.048135565470099465,
      "grad_norm": 548.0,
      "learning_rate": 0.001,
      "loss": 2820.04,
      "step": 2352
    },
    {
      "epoch": 0.049117923949081085,
      "grad_norm": 636.0,
      "learning_rate": 0.001,
      "loss": 2787.0247,
      "step": 2400
    },
    {
      "epoch": 0.050100282428062706,
      "grad_norm": 624.0,
      "learning_rate": 0.001,
      "loss": 2782.2428,
      "step": 2448
    },
    {
      "epoch": 0.05108264090704433,
      "grad_norm": 628.0,
      "learning_rate": 0.001,
      "loss": 2725.3781,
      "step": 2496
    },
    {
      "epoch": 0.05206499938602595,
      "grad_norm": 540.0,
      "learning_rate": 0.001,
      "loss": 2755.7458,
      "step": 2544
    },
    {
      "epoch": 0.05304735786500757,
      "grad_norm": 568.0,
      "learning_rate": 0.001,
      "loss": 2699.16,
      "step": 2592
    },
    {
      "epoch": 0.0540297163439892,
      "grad_norm": 808.0,
      "learning_rate": 0.001,
      "loss": 2680.3232,
      "step": 2640
    },
    {
      "epoch": 0.05501207482297082,
      "grad_norm": 564.0,
      "learning_rate": 0.001,
      "loss": 2669.6646,
      "step": 2688
    },
    {
      "epoch": 0.05599443330195244,
      "grad_norm": 552.0,
      "learning_rate": 0.001,
      "loss": 2683.8433,
      "step": 2736
    },
    {
      "epoch": 0.05697679178093406,
      "grad_norm": 636.0,
      "learning_rate": 0.001,
      "loss": 2643.8172,
      "step": 2784
    },
    {
      "epoch": 0.05795915025991568,
      "grad_norm": 580.0,
      "learning_rate": 0.001,
      "loss": 2649.0441,
      "step": 2832
    },
    {
      "epoch": 0.0589415087388973,
      "grad_norm": 512.0,
      "learning_rate": 0.001,
      "loss": 2615.9657,
      "step": 2880
    },
    {
      "epoch": 0.05992386721787892,
      "grad_norm": 688.0,
      "learning_rate": 0.001,
      "loss": 2608.1457,
      "step": 2928
    },
    {
      "epoch": 0.06090622569686054,
      "grad_norm": 696.0,
      "learning_rate": 0.001,
      "loss": 2590.1567,
      "step": 2976
    },
    {
      "epoch": 0.06188858417584217,
      "grad_norm": 704.0,
      "learning_rate": 0.001,
      "loss": 2627.8358,
      "step": 3024
    },
    {
      "epoch": 0.06287094265482379,
      "grad_norm": 704.0,
      "learning_rate": 0.001,
      "loss": 2538.9543,
      "step": 3072
    },
    {
      "epoch": 0.06385330113380541,
      "grad_norm": 568.0,
      "learning_rate": 0.001,
      "loss": 2482.9673,
      "step": 3120
    },
    {
      "epoch": 0.06483565961278703,
      "grad_norm": 548.0,
      "learning_rate": 0.001,
      "loss": 2530.4771,
      "step": 3168
    },
    {
      "epoch": 0.06581801809176865,
      "grad_norm": 456.0,
      "learning_rate": 0.001,
      "loss": 2496.41,
      "step": 3216
    },
    {
      "epoch": 0.06680037657075028,
      "grad_norm": 684.0,
      "learning_rate": 0.001,
      "loss": 2518.8866,
      "step": 3264
    },
    {
      "epoch": 0.0677827350497319,
      "grad_norm": 664.0,
      "learning_rate": 0.001,
      "loss": 2475.0793,
      "step": 3312
    },
    {
      "epoch": 0.06876509352871352,
      "grad_norm": 812.0,
      "learning_rate": 0.001,
      "loss": 2461.3527,
      "step": 3360
    },
    {
      "epoch": 0.06974745200769514,
      "grad_norm": 490.0,
      "learning_rate": 0.001,
      "loss": 2467.4508,
      "step": 3408
    },
    {
      "epoch": 0.07072981048667676,
      "grad_norm": 648.0,
      "learning_rate": 0.001,
      "loss": 2443.8037,
      "step": 3456
    },
    {
      "epoch": 0.07171216896565838,
      "grad_norm": 664.0,
      "learning_rate": 0.001,
      "loss": 2445.9336,
      "step": 3504
    },
    {
      "epoch": 0.07269452744464,
      "grad_norm": 524.0,
      "learning_rate": 0.001,
      "loss": 2411.4482,
      "step": 3552
    },
    {
      "epoch": 0.07367688592362164,
      "grad_norm": 608.0,
      "learning_rate": 0.001,
      "loss": 2417.4673,
      "step": 3600
    },
    {
      "epoch": 0.07465924440260326,
      "grad_norm": 504.0,
      "learning_rate": 0.001,
      "loss": 2420.4196,
      "step": 3648
    },
    {
      "epoch": 0.07564160288158488,
      "grad_norm": 564.0,
      "learning_rate": 0.001,
      "loss": 2390.8983,
      "step": 3696
    },
    {
      "epoch": 0.0766239613605665,
      "grad_norm": 664.0,
      "learning_rate": 0.001,
      "loss": 2377.8607,
      "step": 3744
    },
    {
      "epoch": 0.07760631983954812,
      "grad_norm": 616.0,
      "learning_rate": 0.001,
      "loss": 2359.1242,
      "step": 3792
    },
    {
      "epoch": 0.07858867831852974,
      "grad_norm": 636.0,
      "learning_rate": 0.001,
      "loss": 2385.3102,
      "step": 3840
    },
    {
      "epoch": 0.07957103679751136,
      "grad_norm": 454.0,
      "learning_rate": 0.001,
      "loss": 2373.0225,
      "step": 3888
    },
    {
      "epoch": 0.08055339527649298,
      "grad_norm": 502.0,
      "learning_rate": 0.001,
      "loss": 2361.2386,
      "step": 3936
    },
    {
      "epoch": 0.0815357537554746,
      "grad_norm": 506.0,
      "learning_rate": 0.001,
      "loss": 2341.1328,
      "step": 3984
    },
    {
      "epoch": 0.08251811223445622,
      "grad_norm": 472.0,
      "learning_rate": 0.001,
      "loss": 2308.069,
      "step": 4032
    },
    {
      "epoch": 0.08350047071343784,
      "grad_norm": 472.0,
      "learning_rate": 0.001,
      "loss": 2305.2542,
      "step": 4080
    },
    {
      "epoch": 0.08448282919241946,
      "grad_norm": 502.0,
      "learning_rate": 0.001,
      "loss": 2338.4048,
      "step": 4128
    },
    {
      "epoch": 0.08546518767140109,
      "grad_norm": 628.0,
      "learning_rate": 0.001,
      "loss": 2307.96,
      "step": 4176
    },
    {
      "epoch": 0.0864475461503827,
      "grad_norm": 516.0,
      "learning_rate": 0.001,
      "loss": 2314.313,
      "step": 4224
    },
    {
      "epoch": 0.08742990462936433,
      "grad_norm": 748.0,
      "learning_rate": 0.001,
      "loss": 2270.4917,
      "step": 4272
    },
    {
      "epoch": 0.08841226310834595,
      "grad_norm": 576.0,
      "learning_rate": 0.001,
      "loss": 2292.9497,
      "step": 4320
    },
    {
      "epoch": 0.08939462158732757,
      "grad_norm": 604.0,
      "learning_rate": 0.001,
      "loss": 2274.6584,
      "step": 4368
    },
    {
      "epoch": 0.0903769800663092,
      "grad_norm": 580.0,
      "learning_rate": 0.001,
      "loss": 2275.2266,
      "step": 4416
    },
    {
      "epoch": 0.09135933854529082,
      "grad_norm": 548.0,
      "learning_rate": 0.001,
      "loss": 2262.757,
      "step": 4464
    },
    {
      "epoch": 0.09234169702427245,
      "grad_norm": 628.0,
      "learning_rate": 0.001,
      "loss": 2257.687,
      "step": 4512
    },
    {
      "epoch": 0.09332405550325407,
      "grad_norm": 544.0,
      "learning_rate": 0.001,
      "loss": 2259.9118,
      "step": 4560
    },
    {
      "epoch": 0.09430641398223569,
      "grad_norm": 580.0,
      "learning_rate": 0.001,
      "loss": 2224.4427,
      "step": 4608
    },
    {
      "epoch": 0.09528877246121731,
      "grad_norm": 628.0,
      "learning_rate": 0.001,
      "loss": 2248.397,
      "step": 4656
    },
    {
      "epoch": 0.09627113094019893,
      "grad_norm": 600.0,
      "learning_rate": 0.001,
      "loss": 2203.2843,
      "step": 4704
    },
    {
      "epoch": 0.09725348941918055,
      "grad_norm": 780.0,
      "learning_rate": 0.001,
      "loss": 2223.5656,
      "step": 4752
    },
    {
      "epoch": 0.09823584789816217,
      "grad_norm": 536.0,
      "learning_rate": 0.001,
      "loss": 2169.4321,
      "step": 4800
    },
    {
      "epoch": 0.09921820637714379,
      "grad_norm": 516.0,
      "learning_rate": 0.001,
      "loss": 2183.4987,
      "step": 4848
    },
    {
      "epoch": 0.09999590683967091,
      "eval_loss": 2087.82763671875,
      "eval_runtime": 9.0001,
      "eval_samples_per_second": 111.11,
      "eval_steps_per_second": 1.444,
      "step": 4886
    },
    {
      "epoch": 0.10020056485612541,
      "grad_norm": 620.0,
      "learning_rate": 0.001,
      "loss": 2210.3151,
      "step": 4896
    },
    {
      "epoch": 0.10118292333510703,
      "grad_norm": 616.0,
      "learning_rate": 0.001,
      "loss": 2208.1779,
      "step": 4944
    },
    {
      "epoch": 0.10216528181408865,
      "grad_norm": 592.0,
      "learning_rate": 0.001,
      "loss": 2166.7116,
      "step": 4992
    },
    {
      "epoch": 0.10314764029307028,
      "grad_norm": 596.0,
      "learning_rate": 0.001,
      "loss": 2191.4295,
      "step": 5040
    },
    {
      "epoch": 0.1041299987720519,
      "grad_norm": 684.0,
      "learning_rate": 0.001,
      "loss": 2155.1141,
      "step": 5088
    },
    {
      "epoch": 0.10511235725103352,
      "grad_norm": 512.0,
      "learning_rate": 0.001,
      "loss": 2135.7635,
      "step": 5136
    },
    {
      "epoch": 0.10609471573001514,
      "grad_norm": 506.0,
      "learning_rate": 0.001,
      "loss": 2155.5701,
      "step": 5184
    },
    {
      "epoch": 0.10707707420899677,
      "grad_norm": 480.0,
      "learning_rate": 0.001,
      "loss": 2150.0086,
      "step": 5232
    },
    {
      "epoch": 0.1080594326879784,
      "grad_norm": 540.0,
      "learning_rate": 0.001,
      "loss": 2142.4181,
      "step": 5280
    },
    {
      "epoch": 0.10904179116696001,
      "grad_norm": 572.0,
      "learning_rate": 0.001,
      "loss": 2116.3011,
      "step": 5328
    },
    {
      "epoch": 0.11002414964594164,
      "grad_norm": 548.0,
      "learning_rate": 0.001,
      "loss": 2141.0239,
      "step": 5376
    },
    {
      "epoch": 0.11100650812492326,
      "grad_norm": 676.0,
      "learning_rate": 0.001,
      "loss": 2119.1307,
      "step": 5424
    },
    {
      "epoch": 0.11198886660390488,
      "grad_norm": 656.0,
      "learning_rate": 0.001,
      "loss": 2137.8016,
      "step": 5472
    },
    {
      "epoch": 0.1129712250828865,
      "grad_norm": 676.0,
      "learning_rate": 0.001,
      "loss": 2119.2923,
      "step": 5520
    },
    {
      "epoch": 0.11395358356186812,
      "grad_norm": 588.0,
      "learning_rate": 0.001,
      "loss": 2120.9912,
      "step": 5568
    },
    {
      "epoch": 0.11493594204084974,
      "grad_norm": 612.0,
      "learning_rate": 0.001,
      "loss": 2111.5037,
      "step": 5616
    },
    {
      "epoch": 0.11591830051983136,
      "grad_norm": 588.0,
      "learning_rate": 0.001,
      "loss": 2119.6444,
      "step": 5664
    },
    {
      "epoch": 0.11690065899881298,
      "grad_norm": 700.0,
      "learning_rate": 0.001,
      "loss": 2078.1807,
      "step": 5712
    },
    {
      "epoch": 0.1178830174777946,
      "grad_norm": 564.0,
      "learning_rate": 0.001,
      "loss": 2095.8706,
      "step": 5760
    },
    {
      "epoch": 0.11886537595677622,
      "grad_norm": 552.0,
      "learning_rate": 0.001,
      "loss": 2080.8527,
      "step": 5808
    },
    {
      "epoch": 0.11984773443575784,
      "grad_norm": 488.0,
      "learning_rate": 0.001,
      "loss": 2062.9159,
      "step": 5856
    },
    {
      "epoch": 0.12083009291473946,
      "grad_norm": 616.0,
      "learning_rate": 0.001,
      "loss": 2060.964,
      "step": 5904
    },
    {
      "epoch": 0.12181245139372109,
      "grad_norm": 648.0,
      "learning_rate": 0.001,
      "loss": 2088.8507,
      "step": 5952
    },
    {
      "epoch": 0.12279480987270272,
      "grad_norm": 604.0,
      "learning_rate": 0.001,
      "loss": 2052.1393,
      "step": 6000
    },
    {
      "epoch": 0.12377716835168434,
      "grad_norm": 720.0,
      "learning_rate": 0.001,
      "loss": 2043.2277,
      "step": 6048
    },
    {
      "epoch": 0.12475952683066596,
      "grad_norm": 616.0,
      "learning_rate": 0.001,
      "loss": 2043.3983,
      "step": 6096
    },
    {
      "epoch": 0.12574188530964758,
      "grad_norm": 668.0,
      "learning_rate": 0.001,
      "loss": 2080.6297,
      "step": 6144
    },
    {
      "epoch": 0.1267242437886292,
      "grad_norm": 532.0,
      "learning_rate": 0.001,
      "loss": 2059.5207,
      "step": 6192
    },
    {
      "epoch": 0.12770660226761082,
      "grad_norm": 568.0,
      "learning_rate": 0.001,
      "loss": 2030.5203,
      "step": 6240
    },
    {
      "epoch": 0.12868896074659245,
      "grad_norm": 560.0,
      "learning_rate": 0.001,
      "loss": 2047.7404,
      "step": 6288
    },
    {
      "epoch": 0.12967131922557407,
      "grad_norm": 624.0,
      "learning_rate": 0.001,
      "loss": 2043.3193,
      "step": 6336
    },
    {
      "epoch": 0.1306536777045557,
      "grad_norm": 592.0,
      "learning_rate": 0.001,
      "loss": 2051.0589,
      "step": 6384
    },
    {
      "epoch": 0.1316360361835373,
      "grad_norm": 876.0,
      "learning_rate": 0.001,
      "loss": 2054.3232,
      "step": 6432
    },
    {
      "epoch": 0.13261839466251893,
      "grad_norm": 544.0,
      "learning_rate": 0.001,
      "loss": 2047.3159,
      "step": 6480
    },
    {
      "epoch": 0.13360075314150055,
      "grad_norm": 648.0,
      "learning_rate": 0.001,
      "loss": 2029.021,
      "step": 6528
    },
    {
      "epoch": 0.13458311162048217,
      "grad_norm": 556.0,
      "learning_rate": 0.001,
      "loss": 2027.506,
      "step": 6576
    },
    {
      "epoch": 0.1355654700994638,
      "grad_norm": 672.0,
      "learning_rate": 0.001,
      "loss": 2034.3325,
      "step": 6624
    },
    {
      "epoch": 0.1365478285784454,
      "grad_norm": 648.0,
      "learning_rate": 0.001,
      "loss": 1988.6841,
      "step": 6672
    },
    {
      "epoch": 0.13753018705742703,
      "grad_norm": 592.0,
      "learning_rate": 0.001,
      "loss": 1998.0236,
      "step": 6720
    },
    {
      "epoch": 0.13851254553640865,
      "grad_norm": 552.0,
      "learning_rate": 0.001,
      "loss": 2008.8337,
      "step": 6768
    },
    {
      "epoch": 0.13949490401539028,
      "grad_norm": 780.0,
      "learning_rate": 0.001,
      "loss": 2008.4787,
      "step": 6816
    },
    {
      "epoch": 0.1404772624943719,
      "grad_norm": 716.0,
      "learning_rate": 0.001,
      "loss": 1995.237,
      "step": 6864
    },
    {
      "epoch": 0.14145962097335352,
      "grad_norm": 804.0,
      "learning_rate": 0.001,
      "loss": 1996.2018,
      "step": 6912
    },
    {
      "epoch": 0.14244197945233514,
      "grad_norm": 652.0,
      "learning_rate": 0.001,
      "loss": 1992.167,
      "step": 6960
    },
    {
      "epoch": 0.14342433793131676,
      "grad_norm": 544.0,
      "learning_rate": 0.001,
      "loss": 1985.2515,
      "step": 7008
    },
    {
      "epoch": 0.14440669641029838,
      "grad_norm": 600.0,
      "learning_rate": 0.001,
      "loss": 1989.0208,
      "step": 7056
    },
    {
      "epoch": 0.14538905488928,
      "grad_norm": 712.0,
      "learning_rate": 0.001,
      "loss": 1993.743,
      "step": 7104
    },
    {
      "epoch": 0.14637141336826162,
      "grad_norm": 580.0,
      "learning_rate": 0.001,
      "loss": 1986.2668,
      "step": 7152
    },
    {
      "epoch": 0.14735377184724327,
      "grad_norm": 716.0,
      "learning_rate": 0.001,
      "loss": 1988.6514,
      "step": 7200
    },
    {
      "epoch": 0.1483361303262249,
      "grad_norm": 452.0,
      "learning_rate": 0.001,
      "loss": 1971.7622,
      "step": 7248
    },
    {
      "epoch": 0.1493184888052065,
      "grad_norm": 576.0,
      "learning_rate": 0.001,
      "loss": 1977.0863,
      "step": 7296
    },
    {
      "epoch": 0.15030084728418813,
      "grad_norm": 708.0,
      "learning_rate": 0.001,
      "loss": 1968.3294,
      "step": 7344
    },
    {
      "epoch": 0.15128320576316975,
      "grad_norm": 572.0,
      "learning_rate": 0.001,
      "loss": 1981.1888,
      "step": 7392
    },
    {
      "epoch": 0.15226556424215137,
      "grad_norm": 748.0,
      "learning_rate": 0.001,
      "loss": 1937.5469,
      "step": 7440
    },
    {
      "epoch": 0.153247922721133,
      "grad_norm": 672.0,
      "learning_rate": 0.001,
      "loss": 1944.2785,
      "step": 7488
    },
    {
      "epoch": 0.15423028120011462,
      "grad_norm": 816.0,
      "learning_rate": 0.001,
      "loss": 1934.2336,
      "step": 7536
    },
    {
      "epoch": 0.15521263967909624,
      "grad_norm": 740.0,
      "learning_rate": 0.001,
      "loss": 1953.3698,
      "step": 7584
    },
    {
      "epoch": 0.15619499815807786,
      "grad_norm": 652.0,
      "learning_rate": 0.001,
      "loss": 1951.9084,
      "step": 7632
    },
    {
      "epoch": 0.15717735663705948,
      "grad_norm": 652.0,
      "learning_rate": 0.001,
      "loss": 1934.8753,
      "step": 7680
    },
    {
      "epoch": 0.1581597151160411,
      "grad_norm": 652.0,
      "learning_rate": 0.001,
      "loss": 1923.8843,
      "step": 7728
    },
    {
      "epoch": 0.15914207359502272,
      "grad_norm": 612.0,
      "learning_rate": 0.001,
      "loss": 1935.5955,
      "step": 7776
    },
    {
      "epoch": 0.16012443207400434,
      "grad_norm": 724.0,
      "learning_rate": 0.001,
      "loss": 1962.8574,
      "step": 7824
    },
    {
      "epoch": 0.16110679055298596,
      "grad_norm": 540.0,
      "learning_rate": 0.001,
      "loss": 1955.3468,
      "step": 7872
    },
    {
      "epoch": 0.16208914903196758,
      "grad_norm": 752.0,
      "learning_rate": 0.001,
      "loss": 1915.3901,
      "step": 7920
    },
    {
      "epoch": 0.1630715075109492,
      "grad_norm": 572.0,
      "learning_rate": 0.001,
      "loss": 1944.2292,
      "step": 7968
    },
    {
      "epoch": 0.16405386598993082,
      "grad_norm": 668.0,
      "learning_rate": 0.001,
      "loss": 1926.0425,
      "step": 8016
    },
    {
      "epoch": 0.16503622446891245,
      "grad_norm": 556.0,
      "learning_rate": 0.001,
      "loss": 1938.1131,
      "step": 8064
    },
    {
      "epoch": 0.16601858294789407,
      "grad_norm": 756.0,
      "learning_rate": 0.001,
      "loss": 1925.4678,
      "step": 8112
    },
    {
      "epoch": 0.1670009414268757,
      "grad_norm": 848.0,
      "learning_rate": 0.001,
      "loss": 1921.8462,
      "step": 8160
    },
    {
      "epoch": 0.1679832999058573,
      "grad_norm": 588.0,
      "learning_rate": 0.001,
      "loss": 1890.1263,
      "step": 8208
    },
    {
      "epoch": 0.16896565838483893,
      "grad_norm": 580.0,
      "learning_rate": 0.001,
      "loss": 1923.7113,
      "step": 8256
    },
    {
      "epoch": 0.16994801686382055,
      "grad_norm": 712.0,
      "learning_rate": 0.001,
      "loss": 1902.661,
      "step": 8304
    },
    {
      "epoch": 0.17093037534280217,
      "grad_norm": 676.0,
      "learning_rate": 0.001,
      "loss": 1898.4054,
      "step": 8352
    },
    {
      "epoch": 0.1719127338217838,
      "grad_norm": 604.0,
      "learning_rate": 0.001,
      "loss": 1899.0542,
      "step": 8400
    },
    {
      "epoch": 0.1728950923007654,
      "grad_norm": 700.0,
      "learning_rate": 0.001,
      "loss": 1906.8057,
      "step": 8448
    },
    {
      "epoch": 0.17387745077974703,
      "grad_norm": 716.0,
      "learning_rate": 0.001,
      "loss": 1908.1032,
      "step": 8496
    },
    {
      "epoch": 0.17485980925872865,
      "grad_norm": 628.0,
      "learning_rate": 0.001,
      "loss": 1928.3221,
      "step": 8544
    },
    {
      "epoch": 0.17584216773771028,
      "grad_norm": 664.0,
      "learning_rate": 0.001,
      "loss": 1890.7321,
      "step": 8592
    },
    {
      "epoch": 0.1768245262166919,
      "grad_norm": 556.0,
      "learning_rate": 0.001,
      "loss": 1910.8001,
      "step": 8640
    },
    {
      "epoch": 0.17780688469567352,
      "grad_norm": 648.0,
      "learning_rate": 0.001,
      "loss": 1908.4972,
      "step": 8688
    },
    {
      "epoch": 0.17878924317465514,
      "grad_norm": 608.0,
      "learning_rate": 0.001,
      "loss": 1870.7344,
      "step": 8736
    },
    {
      "epoch": 0.1797716016536368,
      "grad_norm": 644.0,
      "learning_rate": 0.001,
      "loss": 1901.4289,
      "step": 8784
    },
    {
      "epoch": 0.1807539601326184,
      "grad_norm": 580.0,
      "learning_rate": 0.001,
      "loss": 1883.8433,
      "step": 8832
    },
    {
      "epoch": 0.18173631861160003,
      "grad_norm": 828.0,
      "learning_rate": 0.001,
      "loss": 1869.978,
      "step": 8880
    },
    {
      "epoch": 0.18271867709058165,
      "grad_norm": 652.0,
      "learning_rate": 0.001,
      "loss": 1895.2178,
      "step": 8928
    },
    {
      "epoch": 0.18370103556956327,
      "grad_norm": 680.0,
      "learning_rate": 0.001,
      "loss": 1857.217,
      "step": 8976
    },
    {
      "epoch": 0.1846833940485449,
      "grad_norm": 608.0,
      "learning_rate": 0.001,
      "loss": 1880.6992,
      "step": 9024
    },
    {
      "epoch": 0.1856657525275265,
      "grad_norm": 664.0,
      "learning_rate": 0.001,
      "loss": 1869.5422,
      "step": 9072
    },
    {
      "epoch": 0.18664811100650813,
      "grad_norm": 720.0,
      "learning_rate": 0.001,
      "loss": 1898.1034,
      "step": 9120
    },
    {
      "epoch": 0.18763046948548975,
      "grad_norm": 604.0,
      "learning_rate": 0.001,
      "loss": 1887.1818,
      "step": 9168
    },
    {
      "epoch": 0.18861282796447137,
      "grad_norm": 672.0,
      "learning_rate": 0.001,
      "loss": 1869.6294,
      "step": 9216
    },
    {
      "epoch": 0.189595186443453,
      "grad_norm": 572.0,
      "learning_rate": 0.001,
      "loss": 1857.5962,
      "step": 9264
    },
    {
      "epoch": 0.19057754492243462,
      "grad_norm": 732.0,
      "learning_rate": 0.001,
      "loss": 1869.6444,
      "step": 9312
    },
    {
      "epoch": 0.19155990340141624,
      "grad_norm": 848.0,
      "learning_rate": 0.001,
      "loss": 1869.8807,
      "step": 9360
    },
    {
      "epoch": 0.19254226188039786,
      "grad_norm": 740.0,
      "learning_rate": 0.001,
      "loss": 1857.5882,
      "step": 9408
    },
    {
      "epoch": 0.19352462035937948,
      "grad_norm": 652.0,
      "learning_rate": 0.001,
      "loss": 1835.6294,
      "step": 9456
    },
    {
      "epoch": 0.1945069788383611,
      "grad_norm": 608.0,
      "learning_rate": 0.001,
      "loss": 1853.5081,
      "step": 9504
    },
    {
      "epoch": 0.19548933731734272,
      "grad_norm": 648.0,
      "learning_rate": 0.001,
      "loss": 1866.897,
      "step": 9552
    },
    {
      "epoch": 0.19647169579632434,
      "grad_norm": 724.0,
      "learning_rate": 0.001,
      "loss": 1848.0703,
      "step": 9600
    },
    {
      "epoch": 0.19745405427530596,
      "grad_norm": 628.0,
      "learning_rate": 0.001,
      "loss": 1867.1585,
      "step": 9648
    },
    {
      "epoch": 0.19843641275428758,
      "grad_norm": 612.0,
      "learning_rate": 0.001,
      "loss": 1864.8203,
      "step": 9696
    },
    {
      "epoch": 0.1994187712332692,
      "grad_norm": 572.0,
      "learning_rate": 0.001,
      "loss": 1827.0848,
      "step": 9744
    },
    {
      "epoch": 0.19999181367934182,
      "eval_loss": 1771.5172119140625,
      "eval_runtime": 9.0052,
      "eval_samples_per_second": 111.047,
      "eval_steps_per_second": 1.444,
      "step": 9772
    },
    {
      "epoch": 0.20040112971225083,
      "grad_norm": 784.0,
      "learning_rate": 0.001,
      "loss": 1850.5506,
      "step": 9792
    },
    {
      "epoch": 0.20138348819123245,
      "grad_norm": 728.0,
      "learning_rate": 0.001,
      "loss": 1853.3254,
      "step": 9840
    },
    {
      "epoch": 0.20236584667021407,
      "grad_norm": 744.0,
      "learning_rate": 0.001,
      "loss": 1884.8763,
      "step": 9888
    },
    {
      "epoch": 0.2033482051491957,
      "grad_norm": 740.0,
      "learning_rate": 0.001,
      "loss": 1852.5361,
      "step": 9936
    },
    {
      "epoch": 0.2043305636281773,
      "grad_norm": 780.0,
      "learning_rate": 0.001,
      "loss": 1840.8996,
      "step": 9984
    },
    {
      "epoch": 0.20531292210715893,
      "grad_norm": 676.0,
      "learning_rate": 0.001,
      "loss": 1848.5868,
      "step": 10032
    },
    {
      "epoch": 0.20629528058614055,
      "grad_norm": 764.0,
      "learning_rate": 0.001,
      "loss": 1848.7498,
      "step": 10080
    },
    {
      "epoch": 0.20727763906512217,
      "grad_norm": 856.0,
      "learning_rate": 0.001,
      "loss": 1845.7381,
      "step": 10128
    },
    {
      "epoch": 0.2082599975441038,
      "grad_norm": 740.0,
      "learning_rate": 0.001,
      "loss": 1851.979,
      "step": 10176
    },
    {
      "epoch": 0.2092423560230854,
      "grad_norm": 644.0,
      "learning_rate": 0.001,
      "loss": 1850.7716,
      "step": 10224
    },
    {
      "epoch": 0.21022471450206703,
      "grad_norm": 828.0,
      "learning_rate": 0.001,
      "loss": 1844.3057,
      "step": 10272
    },
    {
      "epoch": 0.21120707298104865,
      "grad_norm": 688.0,
      "learning_rate": 0.001,
      "loss": 1820.6678,
      "step": 10320
    },
    {
      "epoch": 0.21218943146003028,
      "grad_norm": 832.0,
      "learning_rate": 0.001,
      "loss": 1793.4041,
      "step": 10368
    },
    {
      "epoch": 0.21317178993901192,
      "grad_norm": 648.0,
      "learning_rate": 0.001,
      "loss": 1853.5828,
      "step": 10416
    },
    {
      "epoch": 0.21415414841799355,
      "grad_norm": 780.0,
      "learning_rate": 0.001,
      "loss": 1816.429,
      "step": 10464
    },
    {
      "epoch": 0.21513650689697517,
      "grad_norm": 708.0,
      "learning_rate": 0.001,
      "loss": 1827.7533,
      "step": 10512
    },
    {
      "epoch": 0.2161188653759568,
      "grad_norm": 812.0,
      "learning_rate": 0.001,
      "loss": 1807.555,
      "step": 10560
    },
    {
      "epoch": 0.2171012238549384,
      "grad_norm": 728.0,
      "learning_rate": 0.001,
      "loss": 1824.4678,
      "step": 10608
    },
    {
      "epoch": 0.21808358233392003,
      "grad_norm": 760.0,
      "learning_rate": 0.001,
      "loss": 1832.056,
      "step": 10656
    },
    {
      "epoch": 0.21906594081290165,
      "grad_norm": 672.0,
      "learning_rate": 0.001,
      "loss": 1819.7812,
      "step": 10704
    },
    {
      "epoch": 0.22004829929188327,
      "grad_norm": 808.0,
      "learning_rate": 0.001,
      "loss": 1837.8351,
      "step": 10752
    },
    {
      "epoch": 0.2210306577708649,
      "grad_norm": 716.0,
      "learning_rate": 0.001,
      "loss": 1823.1432,
      "step": 10800
    },
    {
      "epoch": 0.2220130162498465,
      "grad_norm": 768.0,
      "learning_rate": 0.001,
      "loss": 1810.9959,
      "step": 10848
    },
    {
      "epoch": 0.22299537472882813,
      "grad_norm": 756.0,
      "learning_rate": 0.001,
      "loss": 1848.2907,
      "step": 10896
    },
    {
      "epoch": 0.22397773320780975,
      "grad_norm": 744.0,
      "learning_rate": 0.001,
      "loss": 1786.6442,
      "step": 10944
    },
    {
      "epoch": 0.22496009168679137,
      "grad_norm": 740.0,
      "learning_rate": 0.001,
      "loss": 1804.0133,
      "step": 10992
    },
    {
      "epoch": 0.225942450165773,
      "grad_norm": 640.0,
      "learning_rate": 0.001,
      "loss": 1813.6567,
      "step": 11040
    },
    {
      "epoch": 0.22692480864475462,
      "grad_norm": 696.0,
      "learning_rate": 0.001,
      "loss": 1812.9946,
      "step": 11088
    },
    {
      "epoch": 0.22790716712373624,
      "grad_norm": 696.0,
      "learning_rate": 0.001,
      "loss": 1816.8553,
      "step": 11136
    },
    {
      "epoch": 0.22888952560271786,
      "grad_norm": 640.0,
      "learning_rate": 0.001,
      "loss": 1801.8009,
      "step": 11184
    },
    {
      "epoch": 0.22987188408169948,
      "grad_norm": 680.0,
      "learning_rate": 0.001,
      "loss": 1816.7332,
      "step": 11232
    },
    {
      "epoch": 0.2308542425606811,
      "grad_norm": 692.0,
      "learning_rate": 0.001,
      "loss": 1799.8708,
      "step": 11280
    },
    {
      "epoch": 0.23183660103966272,
      "grad_norm": 680.0,
      "learning_rate": 0.001,
      "loss": 1791.1471,
      "step": 11328
    },
    {
      "epoch": 0.23281895951864434,
      "grad_norm": 700.0,
      "learning_rate": 0.001,
      "loss": 1812.3979,
      "step": 11376
    },
    {
      "epoch": 0.23380131799762596,
      "grad_norm": 748.0,
      "learning_rate": 0.001,
      "loss": 1807.806,
      "step": 11424
    },
    {
      "epoch": 0.23478367647660758,
      "grad_norm": 716.0,
      "learning_rate": 0.001,
      "loss": 1811.6502,
      "step": 11472
    },
    {
      "epoch": 0.2357660349555892,
      "grad_norm": 736.0,
      "learning_rate": 0.001,
      "loss": 1821.8501,
      "step": 11520
    },
    {
      "epoch": 0.23674839343457083,
      "grad_norm": 648.0,
      "learning_rate": 0.001,
      "loss": 1777.6597,
      "step": 11568
    },
    {
      "epoch": 0.23773075191355245,
      "grad_norm": 768.0,
      "learning_rate": 0.001,
      "loss": 1804.2365,
      "step": 11616
    },
    {
      "epoch": 0.23871311039253407,
      "grad_norm": 672.0,
      "learning_rate": 0.001,
      "loss": 1794.9201,
      "step": 11664
    },
    {
      "epoch": 0.2396954688715157,
      "grad_norm": 696.0,
      "learning_rate": 0.001,
      "loss": 1775.2284,
      "step": 11712
    },
    {
      "epoch": 0.2406778273504973,
      "grad_norm": 700.0,
      "learning_rate": 0.001,
      "loss": 1785.5417,
      "step": 11760
    },
    {
      "epoch": 0.24166018582947893,
      "grad_norm": 840.0,
      "learning_rate": 0.001,
      "loss": 1792.2282,
      "step": 11808
    },
    {
      "epoch": 0.24264254430846055,
      "grad_norm": 992.0,
      "learning_rate": 0.001,
      "loss": 1799.9831,
      "step": 11856
    },
    {
      "epoch": 0.24362490278744217,
      "grad_norm": 872.0,
      "learning_rate": 0.001,
      "loss": 1804.1024,
      "step": 11904
    },
    {
      "epoch": 0.2446072612664238,
      "grad_norm": 668.0,
      "learning_rate": 0.001,
      "loss": 1785.5889,
      "step": 11952
    },
    {
      "epoch": 0.24558961974540544,
      "grad_norm": 872.0,
      "learning_rate": 0.001,
      "loss": 1785.6185,
      "step": 12000
    },
    {
      "epoch": 0.24657197822438706,
      "grad_norm": 784.0,
      "learning_rate": 0.001,
      "loss": 1785.6107,
      "step": 12048
    },
    {
      "epoch": 0.24755433670336868,
      "grad_norm": 644.0,
      "learning_rate": 0.001,
      "loss": 1789.2995,
      "step": 12096
    },
    {
      "epoch": 0.2485366951823503,
      "grad_norm": 772.0,
      "learning_rate": 0.001,
      "loss": 1780.3151,
      "step": 12144
    },
    {
      "epoch": 0.24951905366133192,
      "grad_norm": 728.0,
      "learning_rate": 0.001,
      "loss": 1769.0786,
      "step": 12192
    },
    {
      "epoch": 0.2505014121403135,
      "grad_norm": 752.0,
      "learning_rate": 0.001,
      "loss": 1801.0431,
      "step": 12240
    },
    {
      "epoch": 0.25148377061929517,
      "grad_norm": 844.0,
      "learning_rate": 0.001,
      "loss": 1796.9209,
      "step": 12288
    },
    {
      "epoch": 0.25246612909827676,
      "grad_norm": 1088.0,
      "learning_rate": 0.001,
      "loss": 1795.411,
      "step": 12336
    },
    {
      "epoch": 0.2534484875772584,
      "grad_norm": 1320.0,
      "learning_rate": 0.001,
      "loss": 1802.1553,
      "step": 12384
    },
    {
      "epoch": 0.25443084605624,
      "grad_norm": 716.0,
      "learning_rate": 0.001,
      "loss": 1772.7713,
      "step": 12432
    },
    {
      "epoch": 0.25541320453522165,
      "grad_norm": 744.0,
      "learning_rate": 0.001,
      "loss": 1787.7516,
      "step": 12480
    },
    {
      "epoch": 0.25639556301420324,
      "grad_norm": 744.0,
      "learning_rate": 0.001,
      "loss": 1785.005,
      "step": 12528
    },
    {
      "epoch": 0.2573779214931849,
      "grad_norm": 716.0,
      "learning_rate": 0.001,
      "loss": 1763.7612,
      "step": 12576
    },
    {
      "epoch": 0.2583602799721665,
      "grad_norm": 752.0,
      "learning_rate": 0.001,
      "loss": 1777.0677,
      "step": 12624
    },
    {
      "epoch": 0.25934263845114813,
      "grad_norm": 712.0,
      "learning_rate": 0.001,
      "loss": 1761.1235,
      "step": 12672
    },
    {
      "epoch": 0.2603249969301297,
      "grad_norm": 696.0,
      "learning_rate": 0.001,
      "loss": 1795.2174,
      "step": 12720
    },
    {
      "epoch": 0.2613073554091114,
      "grad_norm": 816.0,
      "learning_rate": 0.001,
      "loss": 1763.8905,
      "step": 12768
    },
    {
      "epoch": 0.262289713888093,
      "grad_norm": 852.0,
      "learning_rate": 0.001,
      "loss": 1761.4404,
      "step": 12816
    },
    {
      "epoch": 0.2632720723670746,
      "grad_norm": 608.0,
      "learning_rate": 0.001,
      "loss": 1762.6668,
      "step": 12864
    },
    {
      "epoch": 0.26425443084605627,
      "grad_norm": 892.0,
      "learning_rate": 0.001,
      "loss": 1725.5638,
      "step": 12912
    },
    {
      "epoch": 0.26523678932503786,
      "grad_norm": 724.0,
      "learning_rate": 0.001,
      "loss": 1762.1764,
      "step": 12960
    },
    {
      "epoch": 0.2662191478040195,
      "grad_norm": 728.0,
      "learning_rate": 0.001,
      "loss": 1764.0163,
      "step": 13008
    },
    {
      "epoch": 0.2672015062830011,
      "grad_norm": 664.0,
      "learning_rate": 0.001,
      "loss": 1754.1729,
      "step": 13056
    },
    {
      "epoch": 0.26818386476198275,
      "grad_norm": 724.0,
      "learning_rate": 0.001,
      "loss": 1772.8592,
      "step": 13104
    },
    {
      "epoch": 0.26916622324096434,
      "grad_norm": 704.0,
      "learning_rate": 0.001,
      "loss": 1780.2349,
      "step": 13152
    },
    {
      "epoch": 0.270148581719946,
      "grad_norm": 720.0,
      "learning_rate": 0.001,
      "loss": 1764.6678,
      "step": 13200
    },
    {
      "epoch": 0.2711309401989276,
      "grad_norm": 772.0,
      "learning_rate": 0.001,
      "loss": 1757.6209,
      "step": 13248
    },
    {
      "epoch": 0.27211329867790923,
      "grad_norm": 748.0,
      "learning_rate": 0.001,
      "loss": 1749.5741,
      "step": 13296
    },
    {
      "epoch": 0.2730956571568908,
      "grad_norm": 804.0,
      "learning_rate": 0.001,
      "loss": 1758.4183,
      "step": 13344
    },
    {
      "epoch": 0.2740780156358725,
      "grad_norm": 712.0,
      "learning_rate": 0.001,
      "loss": 1770.6115,
      "step": 13392
    },
    {
      "epoch": 0.27506037411485407,
      "grad_norm": 784.0,
      "learning_rate": 0.001,
      "loss": 1752.6141,
      "step": 13440
    },
    {
      "epoch": 0.2760427325938357,
      "grad_norm": 700.0,
      "learning_rate": 0.001,
      "loss": 1732.4147,
      "step": 13488
    },
    {
      "epoch": 0.2770250910728173,
      "grad_norm": 780.0,
      "learning_rate": 0.001,
      "loss": 1757.4318,
      "step": 13536
    },
    {
      "epoch": 0.27800744955179896,
      "grad_norm": 764.0,
      "learning_rate": 0.001,
      "loss": 1746.452,
      "step": 13584
    },
    {
      "epoch": 0.27898980803078055,
      "grad_norm": 696.0,
      "learning_rate": 0.001,
      "loss": 1733.8742,
      "step": 13632
    },
    {
      "epoch": 0.2799721665097622,
      "grad_norm": 824.0,
      "learning_rate": 0.001,
      "loss": 1761.4808,
      "step": 13680
    },
    {
      "epoch": 0.2809545249887438,
      "grad_norm": 664.0,
      "learning_rate": 0.001,
      "loss": 1749.8506,
      "step": 13728
    },
    {
      "epoch": 0.28193688346772544,
      "grad_norm": 776.0,
      "learning_rate": 0.001,
      "loss": 1734.1479,
      "step": 13776
    },
    {
      "epoch": 0.28291924194670703,
      "grad_norm": 708.0,
      "learning_rate": 0.001,
      "loss": 1721.951,
      "step": 13824
    },
    {
      "epoch": 0.2839016004256887,
      "grad_norm": 748.0,
      "learning_rate": 0.001,
      "loss": 1741.5046,
      "step": 13872
    },
    {
      "epoch": 0.2848839589046703,
      "grad_norm": 764.0,
      "learning_rate": 0.001,
      "loss": 1743.1763,
      "step": 13920
    },
    {
      "epoch": 0.2858663173836519,
      "grad_norm": 808.0,
      "learning_rate": 0.001,
      "loss": 1767.7448,
      "step": 13968
    },
    {
      "epoch": 0.2868486758626335,
      "grad_norm": 696.0,
      "learning_rate": 0.001,
      "loss": 1717.0291,
      "step": 14016
    },
    {
      "epoch": 0.28783103434161517,
      "grad_norm": 804.0,
      "learning_rate": 0.001,
      "loss": 1731.1566,
      "step": 14064
    },
    {
      "epoch": 0.28881339282059676,
      "grad_norm": 788.0,
      "learning_rate": 0.001,
      "loss": 1720.1551,
      "step": 14112
    },
    {
      "epoch": 0.2897957512995784,
      "grad_norm": 732.0,
      "learning_rate": 0.001,
      "loss": 1743.5467,
      "step": 14160
    },
    {
      "epoch": 0.29077810977856,
      "grad_norm": 680.0,
      "learning_rate": 0.001,
      "loss": 1736.5747,
      "step": 14208
    },
    {
      "epoch": 0.29176046825754165,
      "grad_norm": 740.0,
      "learning_rate": 0.001,
      "loss": 1737.3779,
      "step": 14256
    },
    {
      "epoch": 0.29274282673652324,
      "grad_norm": 760.0,
      "learning_rate": 0.001,
      "loss": 1718.3322,
      "step": 14304
    },
    {
      "epoch": 0.2937251852155049,
      "grad_norm": 760.0,
      "learning_rate": 0.001,
      "loss": 1736.3989,
      "step": 14352
    },
    {
      "epoch": 0.29470754369448654,
      "grad_norm": 832.0,
      "learning_rate": 0.001,
      "loss": 1738.8551,
      "step": 14400
    },
    {
      "epoch": 0.29568990217346813,
      "grad_norm": 900.0,
      "learning_rate": 0.001,
      "loss": 1711.0072,
      "step": 14448
    },
    {
      "epoch": 0.2966722606524498,
      "grad_norm": 748.0,
      "learning_rate": 0.001,
      "loss": 1709.6022,
      "step": 14496
    },
    {
      "epoch": 0.2976546191314314,
      "grad_norm": 704.0,
      "learning_rate": 0.001,
      "loss": 1741.5581,
      "step": 14544
    },
    {
      "epoch": 0.298636977610413,
      "grad_norm": 684.0,
      "learning_rate": 0.001,
      "loss": 1715.0571,
      "step": 14592
    },
    {
      "epoch": 0.2996193360893946,
      "grad_norm": 728.0,
      "learning_rate": 0.001,
      "loss": 1733.5199,
      "step": 14640
    },
    {
      "epoch": 0.29998772051901273,
      "eval_loss": 1650.6409912109375,
      "eval_runtime": 9.0148,
      "eval_samples_per_second": 110.929,
      "eval_steps_per_second": 1.442,
      "step": 14658
    },
    {
      "epoch": 0.30060169456837627,
      "grad_norm": 708.0,
      "learning_rate": 0.001,
      "loss": 1719.6375,
      "step": 14688
    },
    {
      "epoch": 0.30158405304735786,
      "grad_norm": 616.0,
      "learning_rate": 0.001,
      "loss": 1728.5174,
      "step": 14736
    },
    {
      "epoch": 0.3025664115263395,
      "grad_norm": 624.0,
      "learning_rate": 0.001,
      "loss": 1724.9813,
      "step": 14784
    },
    {
      "epoch": 0.3035487700053211,
      "grad_norm": 716.0,
      "learning_rate": 0.001,
      "loss": 1704.8024,
      "step": 14832
    },
    {
      "epoch": 0.30453112848430275,
      "grad_norm": 748.0,
      "learning_rate": 0.001,
      "loss": 1714.6235,
      "step": 14880
    },
    {
      "epoch": 0.30551348696328434,
      "grad_norm": 1048.0,
      "learning_rate": 0.001,
      "loss": 1734.4709,
      "step": 14928
    },
    {
      "epoch": 0.306495845442266,
      "grad_norm": 688.0,
      "learning_rate": 0.001,
      "loss": 1721.2712,
      "step": 14976
    },
    {
      "epoch": 0.3074782039212476,
      "grad_norm": 724.0,
      "learning_rate": 0.001,
      "loss": 1752.32,
      "step": 15024
    },
    {
      "epoch": 0.30846056240022923,
      "grad_norm": 792.0,
      "learning_rate": 0.001,
      "loss": 1711.9393,
      "step": 15072
    },
    {
      "epoch": 0.3094429208792108,
      "grad_norm": 904.0,
      "learning_rate": 0.001,
      "loss": 1722.3177,
      "step": 15120
    },
    {
      "epoch": 0.3104252793581925,
      "grad_norm": 768.0,
      "learning_rate": 0.001,
      "loss": 1737.7088,
      "step": 15168
    },
    {
      "epoch": 0.31140763783717407,
      "grad_norm": 768.0,
      "learning_rate": 0.001,
      "loss": 1728.0853,
      "step": 15216
    },
    {
      "epoch": 0.3123899963161557,
      "grad_norm": 776.0,
      "learning_rate": 0.001,
      "loss": 1711.749,
      "step": 15264
    },
    {
      "epoch": 0.3133723547951373,
      "grad_norm": 840.0,
      "learning_rate": 0.001,
      "loss": 1717.5446,
      "step": 15312
    },
    {
      "epoch": 0.31435471327411896,
      "grad_norm": 696.0,
      "learning_rate": 0.001,
      "loss": 1718.4888,
      "step": 15360
    },
    {
      "epoch": 0.31533707175310055,
      "grad_norm": 692.0,
      "learning_rate": 0.001,
      "loss": 1722.6672,
      "step": 15408
    },
    {
      "epoch": 0.3163194302320822,
      "grad_norm": 864.0,
      "learning_rate": 0.001,
      "loss": 1715.826,
      "step": 15456
    },
    {
      "epoch": 0.3173017887110638,
      "grad_norm": 712.0,
      "learning_rate": 0.001,
      "loss": 1714.7765,
      "step": 15504
    },
    {
      "epoch": 0.31828414719004544,
      "grad_norm": 712.0,
      "learning_rate": 0.001,
      "loss": 1718.0269,
      "step": 15552
    },
    {
      "epoch": 0.31926650566902703,
      "grad_norm": 748.0,
      "learning_rate": 0.001,
      "loss": 1691.7855,
      "step": 15600
    },
    {
      "epoch": 0.3202488641480087,
      "grad_norm": 888.0,
      "learning_rate": 0.001,
      "loss": 1717.6808,
      "step": 15648
    },
    {
      "epoch": 0.3212312226269903,
      "grad_norm": 680.0,
      "learning_rate": 0.001,
      "loss": 1719.3472,
      "step": 15696
    },
    {
      "epoch": 0.3222135811059719,
      "grad_norm": 868.0,
      "learning_rate": 0.001,
      "loss": 1691.0158,
      "step": 15744
    },
    {
      "epoch": 0.3231959395849535,
      "grad_norm": 756.0,
      "learning_rate": 0.001,
      "loss": 1704.2186,
      "step": 15792
    },
    {
      "epoch": 0.32417829806393517,
      "grad_norm": 888.0,
      "learning_rate": 0.001,
      "loss": 1723.0382,
      "step": 15840
    },
    {
      "epoch": 0.32516065654291676,
      "grad_norm": 912.0,
      "learning_rate": 0.001,
      "loss": 1702.2889,
      "step": 15888
    },
    {
      "epoch": 0.3261430150218984,
      "grad_norm": 820.0,
      "learning_rate": 0.001,
      "loss": 1728.0734,
      "step": 15936
    },
    {
      "epoch": 0.32712537350088006,
      "grad_norm": 788.0,
      "learning_rate": 0.001,
      "loss": 1720.2152,
      "step": 15984
    },
    {
      "epoch": 0.32810773197986165,
      "grad_norm": 808.0,
      "learning_rate": 0.001,
      "loss": 1702.1133,
      "step": 16032
    },
    {
      "epoch": 0.3290900904588433,
      "grad_norm": 836.0,
      "learning_rate": 0.001,
      "loss": 1720.4746,
      "step": 16080
    },
    {
      "epoch": 0.3300724489378249,
      "grad_norm": 836.0,
      "learning_rate": 0.001,
      "loss": 1689.6606,
      "step": 16128
    },
    {
      "epoch": 0.33105480741680654,
      "grad_norm": 728.0,
      "learning_rate": 0.001,
      "loss": 1689.0417,
      "step": 16176
    },
    {
      "epoch": 0.33203716589578813,
      "grad_norm": 848.0,
      "learning_rate": 0.001,
      "loss": 1703.012,
      "step": 16224
    },
    {
      "epoch": 0.3330195243747698,
      "grad_norm": 756.0,
      "learning_rate": 0.001,
      "loss": 1700.2785,
      "step": 16272
    },
    {
      "epoch": 0.3340018828537514,
      "grad_norm": 756.0,
      "learning_rate": 0.001,
      "loss": 1709.3231,
      "step": 16320
    },
    {
      "epoch": 0.334984241332733,
      "grad_norm": 960.0,
      "learning_rate": 0.001,
      "loss": 1715.8831,
      "step": 16368
    },
    {
      "epoch": 0.3359665998117146,
      "grad_norm": 692.0,
      "learning_rate": 0.001,
      "loss": 1695.813,
      "step": 16416
    },
    {
      "epoch": 0.33694895829069627,
      "grad_norm": 688.0,
      "learning_rate": 0.001,
      "loss": 1685.9803,
      "step": 16464
    },
    {
      "epoch": 0.33793131676967786,
      "grad_norm": 876.0,
      "learning_rate": 0.001,
      "loss": 1704.5868,
      "step": 16512
    },
    {
      "epoch": 0.3389136752486595,
      "grad_norm": 732.0,
      "learning_rate": 0.001,
      "loss": 1681.2751,
      "step": 16560
    },
    {
      "epoch": 0.3398960337276411,
      "grad_norm": 748.0,
      "learning_rate": 0.001,
      "loss": 1690.0252,
      "step": 16608
    },
    {
      "epoch": 0.34087839220662275,
      "grad_norm": 780.0,
      "learning_rate": 0.001,
      "loss": 1698.1932,
      "step": 16656
    },
    {
      "epoch": 0.34186075068560434,
      "grad_norm": 856.0,
      "learning_rate": 0.001,
      "loss": 1692.6128,
      "step": 16704
    },
    {
      "epoch": 0.342843109164586,
      "grad_norm": 880.0,
      "learning_rate": 0.001,
      "loss": 1696.6901,
      "step": 16752
    },
    {
      "epoch": 0.3438254676435676,
      "grad_norm": 688.0,
      "learning_rate": 0.001,
      "loss": 1693.9344,
      "step": 16800
    },
    {
      "epoch": 0.34480782612254923,
      "grad_norm": 688.0,
      "learning_rate": 0.001,
      "loss": 1704.4855,
      "step": 16848
    },
    {
      "epoch": 0.3457901846015308,
      "grad_norm": 692.0,
      "learning_rate": 0.001,
      "loss": 1705.7817,
      "step": 16896
    },
    {
      "epoch": 0.3467725430805125,
      "grad_norm": 760.0,
      "learning_rate": 0.001,
      "loss": 1690.8944,
      "step": 16944
    },
    {
      "epoch": 0.34775490155949407,
      "grad_norm": 868.0,
      "learning_rate": 0.001,
      "loss": 1685.9479,
      "step": 16992
    },
    {
      "epoch": 0.3487372600384757,
      "grad_norm": 744.0,
      "learning_rate": 0.001,
      "loss": 1698.2961,
      "step": 17040
    },
    {
      "epoch": 0.3497196185174573,
      "grad_norm": 688.0,
      "learning_rate": 0.001,
      "loss": 1693.7596,
      "step": 17088
    },
    {
      "epoch": 0.35070197699643896,
      "grad_norm": 760.0,
      "learning_rate": 0.001,
      "loss": 1702.9092,
      "step": 17136
    },
    {
      "epoch": 0.35168433547542055,
      "grad_norm": 788.0,
      "learning_rate": 0.001,
      "loss": 1672.0039,
      "step": 17184
    },
    {
      "epoch": 0.3526666939544022,
      "grad_norm": 772.0,
      "learning_rate": 0.001,
      "loss": 1673.5811,
      "step": 17232
    },
    {
      "epoch": 0.3536490524333838,
      "grad_norm": 976.0,
      "learning_rate": 0.001,
      "loss": 1697.0251,
      "step": 17280
    },
    {
      "epoch": 0.35463141091236544,
      "grad_norm": 740.0,
      "learning_rate": 0.001,
      "loss": 1699.7508,
      "step": 17328
    },
    {
      "epoch": 0.35561376939134703,
      "grad_norm": 704.0,
      "learning_rate": 0.001,
      "loss": 1708.7798,
      "step": 17376
    },
    {
      "epoch": 0.3565961278703287,
      "grad_norm": 716.0,
      "learning_rate": 0.001,
      "loss": 1693.451,
      "step": 17424
    },
    {
      "epoch": 0.3575784863493103,
      "grad_norm": 904.0,
      "learning_rate": 0.001,
      "loss": 1676.6382,
      "step": 17472
    },
    {
      "epoch": 0.3585608448282919,
      "grad_norm": 700.0,
      "learning_rate": 0.001,
      "loss": 1691.7266,
      "step": 17520
    },
    {
      "epoch": 0.3595432033072736,
      "grad_norm": 1012.0,
      "learning_rate": 0.001,
      "loss": 1666.9458,
      "step": 17568
    },
    {
      "epoch": 0.36052556178625517,
      "grad_norm": 804.0,
      "learning_rate": 0.001,
      "loss": 1675.5584,
      "step": 17616
    },
    {
      "epoch": 0.3615079202652368,
      "grad_norm": 856.0,
      "learning_rate": 0.001,
      "loss": 1685.9595,
      "step": 17664
    },
    {
      "epoch": 0.3624902787442184,
      "grad_norm": 836.0,
      "learning_rate": 0.001,
      "loss": 1695.4741,
      "step": 17712
    },
    {
      "epoch": 0.36347263722320006,
      "grad_norm": 696.0,
      "learning_rate": 0.001,
      "loss": 1689.256,
      "step": 17760
    },
    {
      "epoch": 0.36445499570218165,
      "grad_norm": 732.0,
      "learning_rate": 0.001,
      "loss": 1689.8599,
      "step": 17808
    },
    {
      "epoch": 0.3654373541811633,
      "grad_norm": 876.0,
      "learning_rate": 0.001,
      "loss": 1668.894,
      "step": 17856
    },
    {
      "epoch": 0.3664197126601449,
      "grad_norm": 780.0,
      "learning_rate": 0.001,
      "loss": 1684.5539,
      "step": 17904
    },
    {
      "epoch": 0.36740207113912654,
      "grad_norm": 952.0,
      "learning_rate": 0.001,
      "loss": 1683.5539,
      "step": 17952
    },
    {
      "epoch": 0.36838442961810813,
      "grad_norm": 860.0,
      "learning_rate": 0.001,
      "loss": 1695.5282,
      "step": 18000
    },
    {
      "epoch": 0.3693667880970898,
      "grad_norm": 844.0,
      "learning_rate": 0.001,
      "loss": 1667.8563,
      "step": 18048
    },
    {
      "epoch": 0.3703491465760714,
      "grad_norm": 800.0,
      "learning_rate": 0.001,
      "loss": 1671.2471,
      "step": 18096
    },
    {
      "epoch": 0.371331505055053,
      "grad_norm": 760.0,
      "learning_rate": 0.001,
      "loss": 1664.4082,
      "step": 18144
    },
    {
      "epoch": 0.3723138635340346,
      "grad_norm": 1120.0,
      "learning_rate": 0.001,
      "loss": 1666.0448,
      "step": 18192
    },
    {
      "epoch": 0.37329622201301627,
      "grad_norm": 824.0,
      "learning_rate": 0.001,
      "loss": 1665.9009,
      "step": 18240
    },
    {
      "epoch": 0.37427858049199786,
      "grad_norm": 872.0,
      "learning_rate": 0.001,
      "loss": 1663.6131,
      "step": 18288
    },
    {
      "epoch": 0.3752609389709795,
      "grad_norm": 804.0,
      "learning_rate": 0.001,
      "loss": 1665.7214,
      "step": 18336
    },
    {
      "epoch": 0.3762432974499611,
      "grad_norm": 768.0,
      "learning_rate": 0.001,
      "loss": 1663.0591,
      "step": 18384
    },
    {
      "epoch": 0.37722565592894275,
      "grad_norm": 988.0,
      "learning_rate": 0.001,
      "loss": 1683.5985,
      "step": 18432
    },
    {
      "epoch": 0.37820801440792434,
      "grad_norm": 804.0,
      "learning_rate": 0.001,
      "loss": 1661.8081,
      "step": 18480
    },
    {
      "epoch": 0.379190372886906,
      "grad_norm": 776.0,
      "learning_rate": 0.001,
      "loss": 1685.9769,
      "step": 18528
    },
    {
      "epoch": 0.3801727313658876,
      "grad_norm": 920.0,
      "learning_rate": 0.001,
      "loss": 1676.7816,
      "step": 18576
    },
    {
      "epoch": 0.38115508984486923,
      "grad_norm": 800.0,
      "learning_rate": 0.001,
      "loss": 1669.9821,
      "step": 18624
    },
    {
      "epoch": 0.3821374483238508,
      "grad_norm": 828.0,
      "learning_rate": 0.001,
      "loss": 1654.353,
      "step": 18672
    },
    {
      "epoch": 0.3831198068028325,
      "grad_norm": 756.0,
      "learning_rate": 0.001,
      "loss": 1641.133,
      "step": 18720
    },
    {
      "epoch": 0.38410216528181407,
      "grad_norm": 772.0,
      "learning_rate": 0.001,
      "loss": 1654.8257,
      "step": 18768
    },
    {
      "epoch": 0.3850845237607957,
      "grad_norm": 856.0,
      "learning_rate": 0.001,
      "loss": 1674.0685,
      "step": 18816
    },
    {
      "epoch": 0.3860668822397773,
      "grad_norm": 980.0,
      "learning_rate": 0.001,
      "loss": 1663.0804,
      "step": 18864
    },
    {
      "epoch": 0.38704924071875896,
      "grad_norm": 852.0,
      "learning_rate": 0.001,
      "loss": 1676.6375,
      "step": 18912
    },
    {
      "epoch": 0.38803159919774055,
      "grad_norm": 788.0,
      "learning_rate": 0.001,
      "loss": 1681.3968,
      "step": 18960
    },
    {
      "epoch": 0.3890139576767222,
      "grad_norm": 740.0,
      "learning_rate": 0.001,
      "loss": 1658.5428,
      "step": 19008
    },
    {
      "epoch": 0.3899963161557038,
      "grad_norm": 772.0,
      "learning_rate": 0.001,
      "loss": 1693.6683,
      "step": 19056
    },
    {
      "epoch": 0.39097867463468544,
      "grad_norm": 1224.0,
      "learning_rate": 0.001,
      "loss": 1637.9217,
      "step": 19104
    },
    {
      "epoch": 0.3919610331136671,
      "grad_norm": 896.0,
      "learning_rate": 0.001,
      "loss": 1680.59,
      "step": 19152
    },
    {
      "epoch": 0.3929433915926487,
      "grad_norm": 716.0,
      "learning_rate": 0.001,
      "loss": 1663.9777,
      "step": 19200
    },
    {
      "epoch": 0.39392575007163033,
      "grad_norm": 940.0,
      "learning_rate": 0.001,
      "loss": 1672.4491,
      "step": 19248
    },
    {
      "epoch": 0.3949081085506119,
      "grad_norm": 696.0,
      "learning_rate": 0.001,
      "loss": 1668.8294,
      "step": 19296
    },
    {
      "epoch": 0.3958904670295936,
      "grad_norm": 740.0,
      "learning_rate": 0.001,
      "loss": 1652.0028,
      "step": 19344
    },
    {
      "epoch": 0.39687282550857517,
      "grad_norm": 900.0,
      "learning_rate": 0.001,
      "loss": 1670.4543,
      "step": 19392
    },
    {
      "epoch": 0.3978551839875568,
      "grad_norm": 832.0,
      "learning_rate": 0.001,
      "loss": 1674.2799,
      "step": 19440
    },
    {
      "epoch": 0.3988375424665384,
      "grad_norm": 896.0,
      "learning_rate": 0.001,
      "loss": 1637.7557,
      "step": 19488
    },
    {
      "epoch": 0.39981990094552006,
      "grad_norm": 772.0,
      "learning_rate": 0.001,
      "loss": 1659.3999,
      "step": 19536
    },
    {
      "epoch": 0.39998362735868365,
      "eval_loss": 1588.186767578125,
      "eval_runtime": 9.0185,
      "eval_samples_per_second": 110.884,
      "eval_steps_per_second": 1.441,
      "step": 19544
    },
    {
      "epoch": 0.40080225942450165,
      "grad_norm": 812.0,
      "learning_rate": 0.001,
      "loss": 1663.3135,
      "step": 19584
    },
    {
      "epoch": 0.4017846179034833,
      "grad_norm": 820.0,
      "learning_rate": 0.001,
      "loss": 1648.4126,
      "step": 19632
    },
    {
      "epoch": 0.4027669763824649,
      "grad_norm": 828.0,
      "learning_rate": 0.001,
      "loss": 1629.137,
      "step": 19680
    },
    {
      "epoch": 0.40374933486144654,
      "grad_norm": 840.0,
      "learning_rate": 0.001,
      "loss": 1675.1574,
      "step": 19728
    },
    {
      "epoch": 0.40473169334042813,
      "grad_norm": 892.0,
      "learning_rate": 0.001,
      "loss": 1651.4735,
      "step": 19776
    },
    {
      "epoch": 0.4057140518194098,
      "grad_norm": 732.0,
      "learning_rate": 0.001,
      "loss": 1659.3854,
      "step": 19824
    },
    {
      "epoch": 0.4066964102983914,
      "grad_norm": 808.0,
      "learning_rate": 0.001,
      "loss": 1676.7249,
      "step": 19872
    },
    {
      "epoch": 0.407678768777373,
      "grad_norm": 1088.0,
      "learning_rate": 0.001,
      "loss": 1658.8581,
      "step": 19920
    },
    {
      "epoch": 0.4086611272563546,
      "grad_norm": 772.0,
      "learning_rate": 0.001,
      "loss": 1663.8711,
      "step": 19968
    },
    {
      "epoch": 0.40964348573533627,
      "grad_norm": 724.0,
      "learning_rate": 0.001,
      "loss": 1657.056,
      "step": 20016
    },
    {
      "epoch": 0.41062584421431786,
      "grad_norm": 920.0,
      "learning_rate": 0.001,
      "loss": 1637.993,
      "step": 20064
    },
    {
      "epoch": 0.4116082026932995,
      "grad_norm": 636.0,
      "learning_rate": 0.001,
      "loss": 1644.8833,
      "step": 20112
    },
    {
      "epoch": 0.4125905611722811,
      "grad_norm": 956.0,
      "learning_rate": 0.001,
      "loss": 1656.4339,
      "step": 20160
    },
    {
      "epoch": 0.41357291965126275,
      "grad_norm": 792.0,
      "learning_rate": 0.001,
      "loss": 1652.4888,
      "step": 20208
    },
    {
      "epoch": 0.41455527813024434,
      "grad_norm": 832.0,
      "learning_rate": 0.001,
      "loss": 1642.5142,
      "step": 20256
    },
    {
      "epoch": 0.415537636609226,
      "grad_norm": 724.0,
      "learning_rate": 0.001,
      "loss": 1668.1183,
      "step": 20304
    },
    {
      "epoch": 0.4165199950882076,
      "grad_norm": 796.0,
      "learning_rate": 0.001,
      "loss": 1660.486,
      "step": 20352
    },
    {
      "epoch": 0.41750235356718923,
      "grad_norm": 788.0,
      "learning_rate": 0.001,
      "loss": 1655.4614,
      "step": 20400
    },
    {
      "epoch": 0.4184847120461708,
      "grad_norm": 844.0,
      "learning_rate": 0.001,
      "loss": 1650.6449,
      "step": 20448
    },
    {
      "epoch": 0.4194670705251525,
      "grad_norm": 1080.0,
      "learning_rate": 0.001,
      "loss": 1639.0168,
      "step": 20496
    },
|
{ |
|
"epoch": 0.42044942900413407, |
|
"grad_norm": 944.0, |
|
"learning_rate": 0.001, |
|
"loss": 1655.6951, |
|
"step": 20544 |
|
}, |
|
{ |
|
"epoch": 0.4214317874831157, |
|
"grad_norm": 716.0, |
|
"learning_rate": 0.001, |
|
"loss": 1644.007, |
|
"step": 20592 |
|
}, |
|
{ |
|
"epoch": 0.4224141459620973, |
|
"grad_norm": 896.0, |
|
"learning_rate": 0.001, |
|
"loss": 1614.1253, |
|
"step": 20640 |
|
}, |
|
{ |
|
"epoch": 0.42339650444107896, |
|
"grad_norm": 736.0, |
|
"learning_rate": 0.001, |
|
"loss": 1637.6761, |
|
"step": 20688 |
|
}, |
|
{ |
|
"epoch": 0.42437886292006055, |
|
"grad_norm": 752.0, |
|
"learning_rate": 0.001, |
|
"loss": 1635.8405, |
|
"step": 20736 |
|
}, |
|
{ |
|
"epoch": 0.4253612213990422, |
|
"grad_norm": 768.0, |
|
"learning_rate": 0.001, |
|
"loss": 1653.1647, |
|
"step": 20784 |
|
}, |
|
{ |
|
"epoch": 0.42634357987802385, |
|
"grad_norm": 832.0, |
|
"learning_rate": 0.001, |
|
"loss": 1657.5373, |
|
"step": 20832 |
|
}, |
|
{ |
|
"epoch": 0.42732593835700544, |
|
"grad_norm": 832.0, |
|
"learning_rate": 0.001, |
|
"loss": 1649.7998, |
|
"step": 20880 |
|
}, |
|
{ |
|
"epoch": 0.4283082968359871, |
|
"grad_norm": 848.0, |
|
"learning_rate": 0.001, |
|
"loss": 1639.043, |
|
"step": 20928 |
|
}, |
|
{ |
|
"epoch": 0.4292906553149687, |
|
"grad_norm": 716.0, |
|
"learning_rate": 0.001, |
|
"loss": 1656.0706, |
|
"step": 20976 |
|
}, |
|
{ |
|
"epoch": 0.43027301379395033, |
|
"grad_norm": 924.0, |
|
"learning_rate": 0.001, |
|
"loss": 1640.4744, |
|
"step": 21024 |
|
}, |
|
{ |
|
"epoch": 0.4312553722729319, |
|
"grad_norm": 832.0, |
|
"learning_rate": 0.001, |
|
"loss": 1640.8403, |
|
"step": 21072 |
|
}, |
|
{ |
|
"epoch": 0.4322377307519136, |
|
"grad_norm": 828.0, |
|
"learning_rate": 0.001, |
|
"loss": 1638.4172, |
|
"step": 21120 |
|
}, |
|
{ |
|
"epoch": 0.43322008923089517, |
|
"grad_norm": 732.0, |
|
"learning_rate": 0.001, |
|
"loss": 1662.0506, |
|
"step": 21168 |
|
}, |
|
{ |
|
"epoch": 0.4342024477098768, |
|
"grad_norm": 752.0, |
|
"learning_rate": 0.001, |
|
"loss": 1634.0928, |
|
"step": 21216 |
|
}, |
|
{ |
|
"epoch": 0.4351848061888584, |
|
"grad_norm": 976.0, |
|
"learning_rate": 0.001, |
|
"loss": 1650.1844, |
|
"step": 21264 |
|
}, |
|
{ |
|
"epoch": 0.43616716466784006, |
|
"grad_norm": 864.0, |
|
"learning_rate": 0.001, |
|
"loss": 1637.9504, |
|
"step": 21312 |
|
}, |
|
{ |
|
"epoch": 0.43714952314682165, |
|
"grad_norm": 660.0, |
|
"learning_rate": 0.001, |
|
"loss": 1651.0807, |
|
"step": 21360 |
|
}, |
|
{ |
|
"epoch": 0.4381318816258033, |
|
"grad_norm": 884.0, |
|
"learning_rate": 0.001, |
|
"loss": 1639.6099, |
|
"step": 21408 |
|
}, |
|
{ |
|
"epoch": 0.4391142401047849, |
|
"grad_norm": 752.0, |
|
"learning_rate": 0.001, |
|
"loss": 1636.7961, |
|
"step": 21456 |
|
}, |
|
{ |
|
"epoch": 0.44009659858376654, |
|
"grad_norm": 704.0, |
|
"learning_rate": 0.001, |
|
"loss": 1630.2891, |
|
"step": 21504 |
|
}, |
|
{ |
|
"epoch": 0.44107895706274813, |
|
"grad_norm": 868.0, |
|
"learning_rate": 0.001, |
|
"loss": 1651.1029, |
|
"step": 21552 |
|
}, |
|
{ |
|
"epoch": 0.4420613155417298, |
|
"grad_norm": 1200.0, |
|
"learning_rate": 0.001, |
|
"loss": 1658.1079, |
|
"step": 21600 |
|
}, |
|
{ |
|
"epoch": 0.4430436740207114, |
|
"grad_norm": 820.0, |
|
"learning_rate": 0.001, |
|
"loss": 1650.035, |
|
"step": 21648 |
|
}, |
|
{ |
|
"epoch": 0.444026032499693, |
|
"grad_norm": 628.0, |
|
"learning_rate": 0.001, |
|
"loss": 1651.446, |
|
"step": 21696 |
|
}, |
|
{ |
|
"epoch": 0.4450083909786746, |
|
"grad_norm": 936.0, |
|
"learning_rate": 0.001, |
|
"loss": 1651.8545, |
|
"step": 21744 |
|
}, |
|
{ |
|
"epoch": 0.44599074945765627, |
|
"grad_norm": 724.0, |
|
"learning_rate": 0.001, |
|
"loss": 1633.7931, |
|
"step": 21792 |
|
}, |
|
{ |
|
"epoch": 0.44697310793663786, |
|
"grad_norm": 868.0, |
|
"learning_rate": 0.001, |
|
"loss": 1643.6271, |
|
"step": 21840 |
|
}, |
|
{ |
|
"epoch": 0.4479554664156195, |
|
"grad_norm": 884.0, |
|
"learning_rate": 0.001, |
|
"loss": 1628.9087, |
|
"step": 21888 |
|
}, |
|
{ |
|
"epoch": 0.4489378248946011, |
|
"grad_norm": 808.0, |
|
"learning_rate": 0.001, |
|
"loss": 1633.3311, |
|
"step": 21936 |
|
}, |
|
{ |
|
"epoch": 0.44992018337358275, |
|
"grad_norm": 776.0, |
|
"learning_rate": 0.001, |
|
"loss": 1636.3483, |
|
"step": 21984 |
|
}, |
|
{ |
|
"epoch": 0.45090254185256434, |
|
"grad_norm": 740.0, |
|
"learning_rate": 0.001, |
|
"loss": 1627.1842, |
|
"step": 22032 |
|
}, |
|
{ |
|
"epoch": 0.451884900331546, |
|
"grad_norm": 832.0, |
|
"learning_rate": 0.001, |
|
"loss": 1632.6536, |
|
"step": 22080 |
|
}, |
|
{ |
|
"epoch": 0.4528672588105276, |
|
"grad_norm": 920.0, |
|
"learning_rate": 0.001, |
|
"loss": 1663.3418, |
|
"step": 22128 |
|
}, |
|
{ |
|
"epoch": 0.45384961728950923, |
|
"grad_norm": 752.0, |
|
"learning_rate": 0.001, |
|
"loss": 1635.7738, |
|
"step": 22176 |
|
}, |
|
{ |
|
"epoch": 0.4548319757684908, |
|
"grad_norm": 752.0, |
|
"learning_rate": 0.001, |
|
"loss": 1636.0459, |
|
"step": 22224 |
|
}, |
|
{ |
|
"epoch": 0.4558143342474725, |
|
"grad_norm": 892.0, |
|
"learning_rate": 0.001, |
|
"loss": 1627.4956, |
|
"step": 22272 |
|
}, |
|
{ |
|
"epoch": 0.45679669272645407, |
|
"grad_norm": 860.0, |
|
"learning_rate": 0.001, |
|
"loss": 1628.1626, |
|
"step": 22320 |
|
}, |
|
{ |
|
"epoch": 0.4577790512054357, |
|
"grad_norm": 776.0, |
|
"learning_rate": 0.001, |
|
"loss": 1628.0701, |
|
"step": 22368 |
|
}, |
|
{ |
|
"epoch": 0.45876140968441737, |
|
"grad_norm": 792.0, |
|
"learning_rate": 0.001, |
|
"loss": 1644.1922, |
|
"step": 22416 |
|
}, |
|
{ |
|
"epoch": 0.45974376816339896, |
|
"grad_norm": 776.0, |
|
"learning_rate": 0.001, |
|
"loss": 1608.5988, |
|
"step": 22464 |
|
}, |
|
{ |
|
"epoch": 0.4607261266423806, |
|
"grad_norm": 860.0, |
|
"learning_rate": 0.001, |
|
"loss": 1637.166, |
|
"step": 22512 |
|
}, |
|
{ |
|
"epoch": 0.4617084851213622, |
|
"grad_norm": 924.0, |
|
"learning_rate": 0.001, |
|
"loss": 1626.6854, |
|
"step": 22560 |
|
}, |
|
{ |
|
"epoch": 0.46269084360034385, |
|
"grad_norm": 876.0, |
|
"learning_rate": 0.001, |
|
"loss": 1639.1245, |
|
"step": 22608 |
|
}, |
|
{ |
|
"epoch": 0.46367320207932544, |
|
"grad_norm": 936.0, |
|
"learning_rate": 0.001, |
|
"loss": 1634.3815, |
|
"step": 22656 |
|
}, |
|
{ |
|
"epoch": 0.4646555605583071, |
|
"grad_norm": 912.0, |
|
"learning_rate": 0.001, |
|
"loss": 1606.1912, |
|
"step": 22704 |
|
}, |
|
{ |
|
"epoch": 0.4656379190372887, |
|
"grad_norm": 952.0, |
|
"learning_rate": 0.001, |
|
"loss": 1620.5391, |
|
"step": 22752 |
|
}, |
|
{ |
|
"epoch": 0.46662027751627033, |
|
"grad_norm": 960.0, |
|
"learning_rate": 0.001, |
|
"loss": 1612.0667, |
|
"step": 22800 |
|
}, |
|
{ |
|
"epoch": 0.4676026359952519, |
|
"grad_norm": 832.0, |
|
"learning_rate": 0.001, |
|
"loss": 1651.9868, |
|
"step": 22848 |
|
}, |
|
{ |
|
"epoch": 0.4685849944742336, |
|
"grad_norm": 712.0, |
|
"learning_rate": 0.001, |
|
"loss": 1629.237, |
|
"step": 22896 |
|
}, |
|
{ |
|
"epoch": 0.46956735295321517, |
|
"grad_norm": 864.0, |
|
"learning_rate": 0.001, |
|
"loss": 1618.2004, |
|
"step": 22944 |
|
}, |
|
{ |
|
"epoch": 0.4705497114321968, |
|
"grad_norm": 728.0, |
|
"learning_rate": 0.001, |
|
"loss": 1625.5379, |
|
"step": 22992 |
|
}, |
|
{ |
|
"epoch": 0.4715320699111784, |
|
"grad_norm": 836.0, |
|
"learning_rate": 0.001, |
|
"loss": 1622.8146, |
|
"step": 23040 |
|
}, |
|
{ |
|
"epoch": 0.47251442839016006, |
|
"grad_norm": 1064.0, |
|
"learning_rate": 0.001, |
|
"loss": 1623.9705, |
|
"step": 23088 |
|
}, |
|
{ |
|
"epoch": 0.47349678686914165, |
|
"grad_norm": 860.0, |
|
"learning_rate": 0.001, |
|
"loss": 1626.2383, |
|
"step": 23136 |
|
}, |
|
{ |
|
"epoch": 0.4744791453481233, |
|
"grad_norm": 1120.0, |
|
"learning_rate": 0.001, |
|
"loss": 1634.2668, |
|
"step": 23184 |
|
}, |
|
{ |
|
"epoch": 0.4754615038271049, |
|
"grad_norm": 796.0, |
|
"learning_rate": 0.001, |
|
"loss": 1642.5649, |
|
"step": 23232 |
|
}, |
|
{ |
|
"epoch": 0.47644386230608654, |
|
"grad_norm": 1072.0, |
|
"learning_rate": 0.001, |
|
"loss": 1633.4873, |
|
"step": 23280 |
|
}, |
|
{ |
|
"epoch": 0.47742622078506813, |
|
"grad_norm": 904.0, |
|
"learning_rate": 0.001, |
|
"loss": 1604.186, |
|
"step": 23328 |
|
}, |
|
{ |
|
"epoch": 0.4784085792640498, |
|
"grad_norm": 852.0, |
|
"learning_rate": 0.001, |
|
"loss": 1608.5158, |
|
"step": 23376 |
|
}, |
|
{ |
|
"epoch": 0.4793909377430314, |
|
"grad_norm": 860.0, |
|
"learning_rate": 0.001, |
|
"loss": 1624.3991, |
|
"step": 23424 |
|
}, |
|
{ |
|
"epoch": 0.480373296222013, |
|
"grad_norm": 820.0, |
|
"learning_rate": 0.001, |
|
"loss": 1640.4948, |
|
"step": 23472 |
|
}, |
|
{ |
|
"epoch": 0.4813556547009946, |
|
"grad_norm": 700.0, |
|
"learning_rate": 0.001, |
|
"loss": 1593.3931, |
|
"step": 23520 |
|
}, |
|
{ |
|
"epoch": 0.48233801317997627, |
|
"grad_norm": 824.0, |
|
"learning_rate": 0.001, |
|
"loss": 1622.2378, |
|
"step": 23568 |
|
}, |
|
{ |
|
"epoch": 0.48332037165895786, |
|
"grad_norm": 936.0, |
|
"learning_rate": 0.001, |
|
"loss": 1612.6034, |
|
"step": 23616 |
|
}, |
|
{ |
|
"epoch": 0.4843027301379395, |
|
"grad_norm": 732.0, |
|
"learning_rate": 0.001, |
|
"loss": 1624.4165, |
|
"step": 23664 |
|
}, |
|
{ |
|
"epoch": 0.4852850886169211, |
|
"grad_norm": 820.0, |
|
"learning_rate": 0.001, |
|
"loss": 1634.2424, |
|
"step": 23712 |
|
}, |
|
{ |
|
"epoch": 0.48626744709590275, |
|
"grad_norm": 896.0, |
|
"learning_rate": 0.001, |
|
"loss": 1616.617, |
|
"step": 23760 |
|
}, |
|
{ |
|
"epoch": 0.48724980557488434, |
|
"grad_norm": 732.0, |
|
"learning_rate": 0.001, |
|
"loss": 1629.8065, |
|
"step": 23808 |
|
}, |
|
{ |
|
"epoch": 0.488232164053866, |
|
"grad_norm": 876.0, |
|
"learning_rate": 0.001, |
|
"loss": 1611.0832, |
|
"step": 23856 |
|
}, |
|
{ |
|
"epoch": 0.4892145225328476, |
|
"grad_norm": 1004.0, |
|
"learning_rate": 0.001, |
|
"loss": 1596.9705, |
|
"step": 23904 |
|
}, |
|
{ |
|
"epoch": 0.49019688101182923, |
|
"grad_norm": 812.0, |
|
"learning_rate": 0.001, |
|
"loss": 1612.6437, |
|
"step": 23952 |
|
}, |
|
{ |
|
"epoch": 0.4911792394908109, |
|
"grad_norm": 764.0, |
|
"learning_rate": 0.001, |
|
"loss": 1612.0643, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 0.4921615979697925, |
|
"grad_norm": 944.0, |
|
"learning_rate": 0.001, |
|
"loss": 1620.1268, |
|
"step": 24048 |
|
}, |
|
{ |
|
"epoch": 0.4931439564487741, |
|
"grad_norm": 920.0, |
|
"learning_rate": 0.001, |
|
"loss": 1619.8875, |
|
"step": 24096 |
|
}, |
|
{ |
|
"epoch": 0.4941263149277557, |
|
"grad_norm": 808.0, |
|
"learning_rate": 0.001, |
|
"loss": 1608.6463, |
|
"step": 24144 |
|
}, |
|
{ |
|
"epoch": 0.49510867340673737, |
|
"grad_norm": 784.0, |
|
"learning_rate": 0.001, |
|
"loss": 1609.1462, |
|
"step": 24192 |
|
}, |
|
{ |
|
"epoch": 0.49609103188571896, |
|
"grad_norm": 796.0, |
|
"learning_rate": 0.001, |
|
"loss": 1610.4935, |
|
"step": 24240 |
|
}, |
|
{ |
|
"epoch": 0.4970733903647006, |
|
"grad_norm": 896.0, |
|
"learning_rate": 0.001, |
|
"loss": 1622.8371, |
|
"step": 24288 |
|
}, |
|
{ |
|
"epoch": 0.4980557488436822, |
|
"grad_norm": 768.0, |
|
"learning_rate": 0.001, |
|
"loss": 1617.0732, |
|
"step": 24336 |
|
}, |
|
{ |
|
"epoch": 0.49903810732266385, |
|
"grad_norm": 920.0, |
|
"learning_rate": 0.001, |
|
"loss": 1615.2142, |
|
"step": 24384 |
|
}, |
|
{ |
|
"epoch": 0.49997953419835456, |
|
"eval_loss": 1546.7662353515625, |
|
"eval_runtime": 9.0072, |
|
"eval_samples_per_second": 111.022, |
|
"eval_steps_per_second": 1.443, |
|
"step": 24430 |
|
} |
|
], |
|
"logging_steps": 48, |
|
"max_steps": 48862, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 9223372036854775807, |
|
"save_steps": 4886, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.7859053283033743e+18, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|