|
{ |
|
"best_metric": 0.746105968952179, |
|
"best_model_checkpoint": "./vit-base-hate-meme/checkpoint-1064", |
|
"epoch": 8.0, |
|
"eval_steps": 500, |
|
"global_step": 4256, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.893226146697998, |
|
"learning_rate": 3.6e-06, |
|
"loss": 0.6079, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.8366069793701172, |
|
"learning_rate": 7.2e-06, |
|
"loss": 0.6672, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 3.005919933319092, |
|
"learning_rate": 1.1200000000000001e-05, |
|
"loss": 0.6246, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.47935283184051514, |
|
"learning_rate": 1.52e-05, |
|
"loss": 0.6929, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 3.6089649200439453, |
|
"learning_rate": 1.9200000000000003e-05, |
|
"loss": 0.6635, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 1.0585061311721802, |
|
"learning_rate": 2.32e-05, |
|
"loss": 0.626, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.16031716763973236, |
|
"learning_rate": 2.7200000000000004e-05, |
|
"loss": 0.6391, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.742310106754303, |
|
"learning_rate": 3.12e-05, |
|
"loss": 0.6518, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 1.0254610776901245, |
|
"learning_rate": 3.52e-05, |
|
"loss": 0.6408, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 1.074569821357727, |
|
"learning_rate": 3.9200000000000004e-05, |
|
"loss": 0.6446, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.3864551782608032, |
|
"learning_rate": 4.32e-05, |
|
"loss": 0.6591, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 1.1516258716583252, |
|
"learning_rate": 4.72e-05, |
|
"loss": 0.6288, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 1.2458890676498413, |
|
"learning_rate": 5.1200000000000004e-05, |
|
"loss": 0.6325, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.7807619571685791, |
|
"learning_rate": 5.520000000000001e-05, |
|
"loss": 0.662, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 4.871969699859619, |
|
"learning_rate": 5.88e-05, |
|
"loss": 0.6877, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 2.10189151763916, |
|
"learning_rate": 6.280000000000001e-05, |
|
"loss": 0.6344, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 5.75629186630249, |
|
"learning_rate": 6.680000000000001e-05, |
|
"loss": 0.6487, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.9313517212867737, |
|
"learning_rate": 7.08e-05, |
|
"loss": 0.5723, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 3.9287912845611572, |
|
"learning_rate": 7.48e-05, |
|
"loss": 0.6724, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 4.517734527587891, |
|
"learning_rate": 7.88e-05, |
|
"loss": 0.647, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 1.1182656288146973, |
|
"learning_rate": 8.28e-05, |
|
"loss": 0.6423, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 3.048739194869995, |
|
"learning_rate": 8.680000000000001e-05, |
|
"loss": 0.5778, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 1.1923900842666626, |
|
"learning_rate": 9.080000000000001e-05, |
|
"loss": 0.6203, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 2.969618082046509, |
|
"learning_rate": 9.48e-05, |
|
"loss": 0.6514, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.6900493502616882, |
|
"learning_rate": 9.88e-05, |
|
"loss": 0.6581, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 2.1176040172576904, |
|
"learning_rate": 0.0001028, |
|
"loss": 0.6104, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 1.029479742050171, |
|
"learning_rate": 0.00010680000000000001, |
|
"loss": 0.5896, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.8468737602233887, |
|
"learning_rate": 0.00011080000000000001, |
|
"loss": 0.6067, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 1.5066865682601929, |
|
"learning_rate": 0.0001148, |
|
"loss": 0.6, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.6331378817558289, |
|
"learning_rate": 0.0001188, |
|
"loss": 0.6437, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 2.0694966316223145, |
|
"learning_rate": 0.0001228, |
|
"loss": 0.6304, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.4095279574394226, |
|
"learning_rate": 0.00012680000000000002, |
|
"loss": 0.5867, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 3.9158384799957275, |
|
"learning_rate": 0.0001308, |
|
"loss": 0.6397, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 1.578360915184021, |
|
"learning_rate": 0.00013480000000000002, |
|
"loss": 0.6616, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.3877038359642029, |
|
"learning_rate": 0.00013879999999999999, |
|
"loss": 0.6161, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 1.0857969522476196, |
|
"learning_rate": 0.0001428, |
|
"loss": 0.6281, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 4.571174621582031, |
|
"learning_rate": 0.00014680000000000002, |
|
"loss": 0.6338, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 1.5768461227416992, |
|
"learning_rate": 0.0001508, |
|
"loss": 0.6602, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 1.675762414932251, |
|
"learning_rate": 0.00015480000000000002, |
|
"loss": 0.6649, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.6675221920013428, |
|
"learning_rate": 0.0001588, |
|
"loss": 0.6342, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 3.4393932819366455, |
|
"learning_rate": 0.0001628, |
|
"loss": 0.6678, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 4.380407810211182, |
|
"learning_rate": 0.0001668, |
|
"loss": 0.6496, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 1.5338869094848633, |
|
"learning_rate": 0.0001708, |
|
"loss": 0.6416, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 4.133732318878174, |
|
"learning_rate": 0.00017480000000000002, |
|
"loss": 0.6142, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 1.7674825191497803, |
|
"learning_rate": 0.0001788, |
|
"loss": 0.5776, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.7282730937004089, |
|
"learning_rate": 0.00018280000000000003, |
|
"loss": 0.6721, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 3.9304282665252686, |
|
"learning_rate": 0.00018680000000000001, |
|
"loss": 0.6354, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 2.655442476272583, |
|
"learning_rate": 0.0001908, |
|
"loss": 0.6835, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 3.454685926437378, |
|
"learning_rate": 0.0001948, |
|
"loss": 0.6148, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 2.250034809112549, |
|
"learning_rate": 0.0001988, |
|
"loss": 0.6026, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.33308330178260803, |
|
"learning_rate": 0.0001996272630457934, |
|
"loss": 0.6638, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 2.148402214050293, |
|
"learning_rate": 0.00019909478168264112, |
|
"loss": 0.6184, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 4.5404582023620605, |
|
"learning_rate": 0.00019856230031948884, |
|
"loss": 0.6483, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_accuracy": 0.5, |
|
"eval_loss": 0.7933283448219299, |
|
"eval_runtime": 5.2926, |
|
"eval_samples_per_second": 94.472, |
|
"eval_steps_per_second": 3.023, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 0.8757955431938171, |
|
"learning_rate": 0.00019802981895633654, |
|
"loss": 0.56, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 2.5476624965667725, |
|
"learning_rate": 0.00019749733759318424, |
|
"loss": 0.7584, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 4.550638675689697, |
|
"learning_rate": 0.00019696485623003196, |
|
"loss": 0.6235, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"grad_norm": 5.46172571182251, |
|
"learning_rate": 0.00019643237486687966, |
|
"loss": 0.6473, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"grad_norm": 2.1486427783966064, |
|
"learning_rate": 0.00019589989350372736, |
|
"loss": 0.6259, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"grad_norm": 4.4022955894470215, |
|
"learning_rate": 0.0001953674121405751, |
|
"loss": 0.6031, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"grad_norm": 1.3872004747390747, |
|
"learning_rate": 0.0001948349307774228, |
|
"loss": 0.5789, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"grad_norm": 1.193093180656433, |
|
"learning_rate": 0.0001943024494142705, |
|
"loss": 0.6409, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"grad_norm": 4.4846415519714355, |
|
"learning_rate": 0.0001937699680511182, |
|
"loss": 0.6433, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 1.3275984525680542, |
|
"learning_rate": 0.00019323748668796594, |
|
"loss": 0.6853, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 0.5516121983528137, |
|
"learning_rate": 0.00019270500532481363, |
|
"loss": 0.6577, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"grad_norm": 0.6763603091239929, |
|
"learning_rate": 0.00019217252396166133, |
|
"loss": 0.6181, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 0.7651398181915283, |
|
"learning_rate": 0.00019164004259850906, |
|
"loss": 0.6698, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"grad_norm": 6.038695335388184, |
|
"learning_rate": 0.00019110756123535678, |
|
"loss": 0.6425, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 1.9138271808624268, |
|
"learning_rate": 0.00019057507987220448, |
|
"loss": 0.6428, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 2.1620535850524902, |
|
"learning_rate": 0.00019004259850905218, |
|
"loss": 0.6643, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 0.9087329506874084, |
|
"learning_rate": 0.0001895101171458999, |
|
"loss": 0.6042, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"grad_norm": 3.8417811393737793, |
|
"learning_rate": 0.0001889776357827476, |
|
"loss": 0.7502, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"grad_norm": 3.6006288528442383, |
|
"learning_rate": 0.0001884451544195953, |
|
"loss": 0.6141, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"grad_norm": 1.9083914756774902, |
|
"learning_rate": 0.00018791267305644303, |
|
"loss": 0.6221, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"grad_norm": 2.8606581687927246, |
|
"learning_rate": 0.00018738019169329076, |
|
"loss": 0.593, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"grad_norm": 1.3666915893554688, |
|
"learning_rate": 0.00018684771033013845, |
|
"loss": 0.6307, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"grad_norm": 0.7459832429885864, |
|
"learning_rate": 0.00018631522896698615, |
|
"loss": 0.6774, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 0.8331255912780762, |
|
"learning_rate": 0.00018578274760383388, |
|
"loss": 0.6637, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"grad_norm": 4.606655120849609, |
|
"learning_rate": 0.00018525026624068158, |
|
"loss": 0.5936, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 0.44328516721725464, |
|
"learning_rate": 0.00018471778487752928, |
|
"loss": 0.6676, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 4.314159393310547, |
|
"learning_rate": 0.00018418530351437703, |
|
"loss": 0.7248, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 1.6179046630859375, |
|
"learning_rate": 0.00018365282215122473, |
|
"loss": 0.673, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"grad_norm": 4.103793144226074, |
|
"learning_rate": 0.00018312034078807243, |
|
"loss": 0.6637, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 0.7133169770240784, |
|
"learning_rate": 0.00018258785942492015, |
|
"loss": 0.6193, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"grad_norm": 1.252029299736023, |
|
"learning_rate": 0.00018205537806176785, |
|
"loss": 0.594, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 0.786902904510498, |
|
"learning_rate": 0.00018152289669861555, |
|
"loss": 0.5598, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 5.060324668884277, |
|
"learning_rate": 0.00018099041533546325, |
|
"loss": 0.6586, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 2.4381039142608643, |
|
"learning_rate": 0.000180457933972311, |
|
"loss": 0.6487, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 2.3903331756591797, |
|
"learning_rate": 0.0001799254526091587, |
|
"loss": 0.624, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 1.2366740703582764, |
|
"learning_rate": 0.0001793929712460064, |
|
"loss": 0.5997, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 1.69, |
|
"grad_norm": 3.3629353046417236, |
|
"learning_rate": 0.00017886048988285412, |
|
"loss": 0.5978, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 2.4150805473327637, |
|
"learning_rate": 0.00017832800851970182, |
|
"loss": 0.6342, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 0.2921897768974304, |
|
"learning_rate": 0.00017779552715654952, |
|
"loss": 0.655, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 2.8717799186706543, |
|
"learning_rate": 0.00017726304579339722, |
|
"loss": 0.6383, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 1.2297135591506958, |
|
"learning_rate": 0.00017673056443024497, |
|
"loss": 0.6154, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 2.8782236576080322, |
|
"learning_rate": 0.00017619808306709267, |
|
"loss": 0.6469, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 3.708266019821167, |
|
"learning_rate": 0.00017566560170394037, |
|
"loss": 0.6462, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 0.30842864513397217, |
|
"learning_rate": 0.0001751331203407881, |
|
"loss": 0.5934, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"grad_norm": 0.9341667294502258, |
|
"learning_rate": 0.0001746006389776358, |
|
"loss": 0.5741, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 1.86, |
|
"grad_norm": 1.7768970727920532, |
|
"learning_rate": 0.0001740681576144835, |
|
"loss": 0.6028, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 4.887693881988525, |
|
"learning_rate": 0.0001735356762513312, |
|
"loss": 0.6171, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"grad_norm": 2.1910202503204346, |
|
"learning_rate": 0.00017300319488817894, |
|
"loss": 0.6039, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 5.127533912658691, |
|
"learning_rate": 0.00017247071352502664, |
|
"loss": 0.6404, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 4.148878574371338, |
|
"learning_rate": 0.00017193823216187434, |
|
"loss": 0.6367, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"grad_norm": 0.9742037057876587, |
|
"learning_rate": 0.00017140575079872207, |
|
"loss": 0.6539, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"grad_norm": 7.3390913009643555, |
|
"learning_rate": 0.00017087326943556976, |
|
"loss": 0.6369, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 1.99, |
|
"grad_norm": 0.8897514343261719, |
|
"learning_rate": 0.00017034078807241746, |
|
"loss": 0.6203, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.498, |
|
"eval_loss": 0.746105968952179, |
|
"eval_runtime": 5.0077, |
|
"eval_samples_per_second": 99.846, |
|
"eval_steps_per_second": 3.195, |
|
"step": 1064 |
|
}, |
|
{ |
|
"epoch": 2.01, |
|
"grad_norm": 5.566906452178955, |
|
"learning_rate": 0.0001698083067092652, |
|
"loss": 0.7104, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 2.03, |
|
"grad_norm": 4.016073703765869, |
|
"learning_rate": 0.00016927582534611291, |
|
"loss": 0.6763, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 2.05, |
|
"grad_norm": 0.48662686347961426, |
|
"learning_rate": 0.0001687433439829606, |
|
"loss": 0.6283, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 2.07, |
|
"grad_norm": 3.80304217338562, |
|
"learning_rate": 0.0001682108626198083, |
|
"loss": 0.5875, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 2.09, |
|
"grad_norm": 3.6678249835968018, |
|
"learning_rate": 0.00016767838125665604, |
|
"loss": 0.6383, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 2.11, |
|
"grad_norm": 0.7282447814941406, |
|
"learning_rate": 0.00016714589989350374, |
|
"loss": 0.5975, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 2.12, |
|
"grad_norm": 5.8518829345703125, |
|
"learning_rate": 0.00016661341853035143, |
|
"loss": 0.6223, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 2.14, |
|
"grad_norm": 0.6626885533332825, |
|
"learning_rate": 0.00016608093716719916, |
|
"loss": 0.6059, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 2.16, |
|
"grad_norm": 2.4838600158691406, |
|
"learning_rate": 0.00016554845580404689, |
|
"loss": 0.6218, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 2.18, |
|
"grad_norm": 3.6144859790802, |
|
"learning_rate": 0.00016501597444089458, |
|
"loss": 0.601, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"grad_norm": 0.9720486998558044, |
|
"learning_rate": 0.00016448349307774228, |
|
"loss": 0.5471, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 2.22, |
|
"grad_norm": 5.14749002456665, |
|
"learning_rate": 0.00016395101171459, |
|
"loss": 0.6341, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"grad_norm": 4.361828804016113, |
|
"learning_rate": 0.0001634185303514377, |
|
"loss": 0.5915, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 2.26, |
|
"grad_norm": 1.6119446754455566, |
|
"learning_rate": 0.0001628860489882854, |
|
"loss": 0.658, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 2.27, |
|
"grad_norm": 0.6508371233940125, |
|
"learning_rate": 0.00016235356762513313, |
|
"loss": 0.5626, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 2.29, |
|
"grad_norm": 0.7702106237411499, |
|
"learning_rate": 0.00016182108626198086, |
|
"loss": 0.6118, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 2.31, |
|
"grad_norm": 0.45979470014572144, |
|
"learning_rate": 0.00016128860489882855, |
|
"loss": 0.6097, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 2.33, |
|
"grad_norm": 1.6143121719360352, |
|
"learning_rate": 0.00016075612353567625, |
|
"loss": 0.6675, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 2.35, |
|
"grad_norm": 2.4905166625976562, |
|
"learning_rate": 0.00016022364217252398, |
|
"loss": 0.5858, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"grad_norm": 0.7425104379653931, |
|
"learning_rate": 0.00015969116080937168, |
|
"loss": 0.5901, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 2.39, |
|
"grad_norm": 1.7030450105667114, |
|
"learning_rate": 0.00015915867944621938, |
|
"loss": 0.5834, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 2.41, |
|
"grad_norm": 3.350250720977783, |
|
"learning_rate": 0.0001586261980830671, |
|
"loss": 0.554, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 2.42, |
|
"grad_norm": 2.625653028488159, |
|
"learning_rate": 0.00015809371671991483, |
|
"loss": 0.5784, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 2.44, |
|
"grad_norm": 1.9641324281692505, |
|
"learning_rate": 0.00015756123535676253, |
|
"loss": 0.6203, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 2.46, |
|
"grad_norm": 1.912189245223999, |
|
"learning_rate": 0.00015702875399361022, |
|
"loss": 0.6344, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 2.48, |
|
"grad_norm": 2.581179141998291, |
|
"learning_rate": 0.00015649627263045795, |
|
"loss": 0.6706, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 0.7273750901222229, |
|
"learning_rate": 0.00015596379126730565, |
|
"loss": 0.6587, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 2.52, |
|
"grad_norm": 0.5495343804359436, |
|
"learning_rate": 0.00015543130990415335, |
|
"loss": 0.6005, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 2.54, |
|
"grad_norm": 2.7823123931884766, |
|
"learning_rate": 0.00015489882854100107, |
|
"loss": 0.5477, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 2.56, |
|
"grad_norm": 2.560255527496338, |
|
"learning_rate": 0.0001543663471778488, |
|
"loss": 0.6153, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 2.58, |
|
"grad_norm": 4.055157661437988, |
|
"learning_rate": 0.0001538338658146965, |
|
"loss": 0.6093, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 2.59, |
|
"grad_norm": 1.9480986595153809, |
|
"learning_rate": 0.0001533013844515442, |
|
"loss": 0.653, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 2.61, |
|
"grad_norm": 0.6847740411758423, |
|
"learning_rate": 0.00015276890308839192, |
|
"loss": 0.5842, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 2.63, |
|
"grad_norm": 5.935375690460205, |
|
"learning_rate": 0.00015223642172523962, |
|
"loss": 0.5934, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 2.65, |
|
"grad_norm": 2.9190566539764404, |
|
"learning_rate": 0.00015170394036208732, |
|
"loss": 0.5746, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 2.67, |
|
"grad_norm": 1.8768565654754639, |
|
"learning_rate": 0.00015117145899893504, |
|
"loss": 0.5792, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 2.69, |
|
"grad_norm": 0.8437117338180542, |
|
"learning_rate": 0.00015063897763578277, |
|
"loss": 0.6118, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 2.71, |
|
"grad_norm": 2.211174488067627, |
|
"learning_rate": 0.00015010649627263047, |
|
"loss": 0.5693, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 2.73, |
|
"grad_norm": 4.7668609619140625, |
|
"learning_rate": 0.00014957401490947817, |
|
"loss": 0.6926, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"grad_norm": 1.8787531852722168, |
|
"learning_rate": 0.0001490415335463259, |
|
"loss": 0.5893, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"grad_norm": 3.1121323108673096, |
|
"learning_rate": 0.0001485090521831736, |
|
"loss": 0.6001, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 2.78, |
|
"grad_norm": 1.5263334512710571, |
|
"learning_rate": 0.0001479765708200213, |
|
"loss": 0.6538, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 1.4740121364593506, |
|
"learning_rate": 0.00014744408945686902, |
|
"loss": 0.633, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 2.82, |
|
"grad_norm": 2.5917043685913086, |
|
"learning_rate": 0.00014691160809371674, |
|
"loss": 0.619, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 2.84, |
|
"grad_norm": 1.5224074125289917, |
|
"learning_rate": 0.00014637912673056444, |
|
"loss": 0.5376, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 2.86, |
|
"grad_norm": 1.7697654962539673, |
|
"learning_rate": 0.00014584664536741214, |
|
"loss": 0.5838, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"grad_norm": 4.307724475860596, |
|
"learning_rate": 0.00014531416400425986, |
|
"loss": 0.5861, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 2.89, |
|
"grad_norm": 2.4162418842315674, |
|
"learning_rate": 0.00014478168264110756, |
|
"loss": 0.7074, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 2.91, |
|
"grad_norm": 3.3249616622924805, |
|
"learning_rate": 0.00014424920127795526, |
|
"loss": 0.6251, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 2.93, |
|
"grad_norm": 3.486440420150757, |
|
"learning_rate": 0.000143716719914803, |
|
"loss": 0.6949, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 2.95, |
|
"grad_norm": 1.9070874452590942, |
|
"learning_rate": 0.0001431842385516507, |
|
"loss": 0.6384, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 2.97, |
|
"grad_norm": 0.9392743706703186, |
|
"learning_rate": 0.0001426517571884984, |
|
"loss": 0.6292, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 2.99, |
|
"grad_norm": 1.008888840675354, |
|
"learning_rate": 0.0001421192758253461, |
|
"loss": 0.5955, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_accuracy": 0.518, |
|
"eval_loss": 0.7538835406303406, |
|
"eval_runtime": 5.5893, |
|
"eval_samples_per_second": 89.456, |
|
"eval_steps_per_second": 2.863, |
|
"step": 1596 |
|
}, |
|
{ |
|
"epoch": 3.01, |
|
"grad_norm": 1.014275074005127, |
|
"learning_rate": 0.00014158679446219384, |
|
"loss": 0.6254, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 3.03, |
|
"grad_norm": 3.639158248901367, |
|
"learning_rate": 0.00014105431309904153, |
|
"loss": 0.5718, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 3.05, |
|
"grad_norm": 3.8647546768188477, |
|
"learning_rate": 0.00014052183173588923, |
|
"loss": 0.5798, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 3.06, |
|
"grad_norm": 2.522571086883545, |
|
"learning_rate": 0.00013998935037273696, |
|
"loss": 0.596, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 3.08, |
|
"grad_norm": 0.9586620330810547, |
|
"learning_rate": 0.00013945686900958468, |
|
"loss": 0.546, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 3.1, |
|
"grad_norm": 1.2226234674453735, |
|
"learning_rate": 0.00013892438764643238, |
|
"loss": 0.5209, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 3.12, |
|
"grad_norm": 1.6508853435516357, |
|
"learning_rate": 0.00013839190628328008, |
|
"loss": 0.5763, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 3.14, |
|
"grad_norm": 15.09642505645752, |
|
"learning_rate": 0.0001378594249201278, |
|
"loss": 0.5614, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 3.16, |
|
"grad_norm": 2.6017839908599854, |
|
"learning_rate": 0.0001373269435569755, |
|
"loss": 0.6123, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 3.18, |
|
"grad_norm": 2.848982572555542, |
|
"learning_rate": 0.0001367944621938232, |
|
"loss": 0.5747, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 3.2, |
|
"grad_norm": 2.5202741622924805, |
|
"learning_rate": 0.00013626198083067093, |
|
"loss": 0.5505, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 3.21, |
|
"grad_norm": 1.2070810794830322, |
|
"learning_rate": 0.00013572949946751866, |
|
"loss": 0.4952, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 3.23, |
|
"grad_norm": 1.999449610710144, |
|
"learning_rate": 0.00013519701810436635, |
|
"loss": 0.4874, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 3.25, |
|
"grad_norm": 1.2801100015640259, |
|
"learning_rate": 0.00013466453674121405, |
|
"loss": 0.568, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 3.27, |
|
"grad_norm": 1.8408716917037964, |
|
"learning_rate": 0.00013413205537806178, |
|
"loss": 0.5894, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 3.29, |
|
"grad_norm": 2.0327749252319336, |
|
"learning_rate": 0.00013359957401490948, |
|
"loss": 0.5749, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 3.31, |
|
"grad_norm": 1.4758570194244385, |
|
"learning_rate": 0.00013306709265175718, |
|
"loss": 0.5308, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 3.33, |
|
"grad_norm": 1.3630428314208984, |
|
"learning_rate": 0.00013253461128860493, |
|
"loss": 0.5475, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 3.35, |
|
"grad_norm": 0.8149157166481018, |
|
"learning_rate": 0.00013200212992545263, |
|
"loss": 0.5295, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 3.36, |
|
"grad_norm": 5.804959297180176, |
|
"learning_rate": 0.00013146964856230033, |
|
"loss": 0.5317, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 3.38, |
|
"grad_norm": 1.0680649280548096, |
|
"learning_rate": 0.00013093716719914802, |
|
"loss": 0.5561, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 3.4, |
|
"grad_norm": 2.4818127155303955, |
|
"learning_rate": 0.00013040468583599575, |
|
"loss": 0.6085, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 3.42, |
|
"grad_norm": 1.4538164138793945, |
|
"learning_rate": 0.00012987220447284345, |
|
"loss": 0.5614, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 3.44, |
|
"grad_norm": 2.130237102508545, |
|
"learning_rate": 0.00012933972310969115, |
|
"loss": 0.5255, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 3.46, |
|
"grad_norm": 4.228801250457764, |
|
"learning_rate": 0.0001288072417465389, |
|
"loss": 0.592, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 3.48, |
|
"grad_norm": 1.1142246723175049, |
|
"learning_rate": 0.0001282747603833866, |
|
"loss": 0.5262, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"grad_norm": 2.6029281616210938, |
|
"learning_rate": 0.0001277422790202343, |
|
"loss": 0.541, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 3.52, |
|
"grad_norm": 3.902634382247925, |
|
"learning_rate": 0.000127209797657082, |
|
"loss": 0.5725, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 3.53, |
|
"grad_norm": 5.952896595001221, |
|
"learning_rate": 0.00012667731629392972, |
|
"loss": 0.597, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 3.55, |
|
"grad_norm": 1.5133850574493408, |
|
"learning_rate": 0.00012614483493077742, |
|
"loss": 0.4974, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 3.57, |
|
"grad_norm": 1.7601635456085205, |
|
"learning_rate": 0.00012561235356762512, |
|
"loss": 0.5825, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 3.59, |
|
"grad_norm": 2.166303873062134, |
|
"learning_rate": 0.00012507987220447287, |
|
"loss": 0.4884, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 3.61, |
|
"grad_norm": 1.2939203977584839, |
|
"learning_rate": 0.00012454739084132057, |
|
"loss": 0.5576, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 3.63, |
|
"grad_norm": 3.2266693115234375, |
|
"learning_rate": 0.00012401490947816827, |
|
"loss": 0.5598, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 3.65, |
|
"grad_norm": 1.2390942573547363, |
|
"learning_rate": 0.00012348242811501597, |
|
"loss": 0.507, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 3.67, |
|
"grad_norm": 1.1552023887634277, |
|
"learning_rate": 0.0001229499467518637, |
|
"loss": 0.5781, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 3.68, |
|
"grad_norm": 1.372052788734436, |
|
"learning_rate": 0.0001224174653887114, |
|
"loss": 0.5315, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 3.7, |
|
"grad_norm": 2.0448787212371826, |
|
"learning_rate": 0.0001218849840255591, |
|
"loss": 0.5402, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 3.72, |
|
"grad_norm": 1.5772067308425903, |
|
"learning_rate": 0.00012135250266240683, |
|
"loss": 0.518, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 3.74, |
|
"grad_norm": 2.118274450302124, |
|
"learning_rate": 0.00012082002129925454, |
|
"loss": 0.5661, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 3.76, |
|
"grad_norm": 2.9228522777557373, |
|
"learning_rate": 0.00012028753993610224, |
|
"loss": 0.5419, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 3.78, |
|
"grad_norm": 3.01971697807312, |
|
"learning_rate": 0.00011975505857294995, |
|
"loss": 0.5731, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 3.8, |
|
"grad_norm": 1.2747687101364136, |
|
"learning_rate": 0.00011922257720979766, |
|
"loss": 0.5621, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 3.82, |
|
"grad_norm": 2.164107084274292, |
|
"learning_rate": 0.00011869009584664536, |
|
"loss": 0.5818, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 3.83, |
|
"grad_norm": 1.2156935930252075, |
|
"learning_rate": 0.00011815761448349307, |
|
"loss": 0.561, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 3.85, |
|
"grad_norm": 1.5158003568649292, |
|
"learning_rate": 0.0001176251331203408, |
|
"loss": 0.5255, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 3.87, |
|
"grad_norm": 1.795130729675293, |
|
"learning_rate": 0.00011709265175718851, |
|
"loss": 0.5451, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 3.89, |
|
"grad_norm": 2.8883111476898193, |
|
"learning_rate": 0.00011656017039403621, |
|
"loss": 0.539, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 3.91, |
|
"grad_norm": 2.8420214653015137, |
|
"learning_rate": 0.00011602768903088392, |
|
"loss": 0.5655, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 3.93, |
|
"grad_norm": 1.114376425743103, |
|
"learning_rate": 0.00011549520766773163, |
|
"loss": 0.4676, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 3.95, |
|
"grad_norm": 1.6993155479431152, |
|
"learning_rate": 0.00011496272630457933, |
|
"loss": 0.5308, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 3.97, |
|
"grad_norm": 1.809404969215393, |
|
"learning_rate": 0.00011443024494142705, |
|
"loss": 0.4535, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 3.98, |
|
"grad_norm": 3.3252310752868652, |
|
"learning_rate": 0.00011389776357827477, |
|
"loss": 0.4796, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.536, |
|
"eval_loss": 0.9386767745018005, |
|
"eval_runtime": 5.6156, |
|
"eval_samples_per_second": 89.037, |
|
"eval_steps_per_second": 2.849, |
|
"step": 2128 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 1.6969894170761108, |
|
"learning_rate": 0.00011336528221512248, |
|
"loss": 0.5216, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 4.02, |
|
"grad_norm": 3.2442750930786133, |
|
"learning_rate": 0.00011283280085197018, |
|
"loss": 0.4074, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 4.04, |
|
"grad_norm": 2.2770748138427734, |
|
"learning_rate": 0.0001123003194888179, |
|
"loss": 0.3409, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 4.06, |
|
"grad_norm": 2.7340738773345947, |
|
"learning_rate": 0.0001117678381256656, |
|
"loss": 0.4351, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 4.08, |
|
"grad_norm": 2.154231071472168, |
|
"learning_rate": 0.0001112353567625133, |
|
"loss": 0.4451, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 4.1, |
|
"grad_norm": 2.5511295795440674, |
|
"learning_rate": 0.00011070287539936102, |
|
"loss": 0.4674, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 4.12, |
|
"grad_norm": 1.8929212093353271, |
|
"learning_rate": 0.00011017039403620874, |
|
"loss": 0.4184, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 4.14, |
|
"grad_norm": 1.6195924282073975, |
|
"learning_rate": 0.00010963791267305645, |
|
"loss": 0.3456, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 4.15, |
|
"grad_norm": 2.4593372344970703, |
|
"learning_rate": 0.00010910543130990417, |
|
"loss": 0.3398, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 4.17, |
|
"grad_norm": 4.4898457527160645, |
|
"learning_rate": 0.00010857294994675187, |
|
"loss": 0.3741, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 4.19, |
|
"grad_norm": 3.513038158416748, |
|
"learning_rate": 0.00010804046858359958, |
|
"loss": 0.5035, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 4.21, |
|
"grad_norm": 2.14664626121521, |
|
"learning_rate": 0.00010750798722044728, |
|
"loss": 0.3854, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 4.23, |
|
"grad_norm": 1.8807299137115479, |
|
"learning_rate": 0.00010697550585729502, |
|
"loss": 0.3503, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 4.25, |
|
"grad_norm": 3.9768781661987305, |
|
"learning_rate": 0.00010644302449414271, |
|
"loss": 0.4615, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 4.27, |
|
"grad_norm": 4.187206745147705, |
|
"learning_rate": 0.00010591054313099043, |
|
"loss": 0.3798, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 4.29, |
|
"grad_norm": 5.973431587219238, |
|
"learning_rate": 0.00010537806176783814, |
|
"loss": 0.4394, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 4.3, |
|
"grad_norm": 2.4600276947021484, |
|
"learning_rate": 0.00010484558040468584, |
|
"loss": 0.4936, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 4.32, |
|
"grad_norm": 2.6891305446624756, |
|
"learning_rate": 0.00010431309904153355, |
|
"loss": 0.4825, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 4.34, |
|
"grad_norm": 3.481198787689209, |
|
"learning_rate": 0.00010378061767838125, |
|
"loss": 0.5209, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 4.36, |
|
"grad_norm": 1.2965519428253174, |
|
"learning_rate": 0.0001033013844515442, |
|
"loss": 0.4865, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 4.38, |
|
"grad_norm": 1.5025973320007324, |
|
"learning_rate": 0.00010276890308839191, |
|
"loss": 0.4439, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 4.4, |
|
"grad_norm": 3.154419422149658, |
|
"learning_rate": 0.00010223642172523961, |
|
"loss": 0.4129, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 4.42, |
|
"grad_norm": 3.1447813510894775, |
|
"learning_rate": 0.00010170394036208732, |
|
"loss": 0.4973, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 4.44, |
|
"grad_norm": 2.3173677921295166, |
|
"learning_rate": 0.00010117145899893504, |
|
"loss": 0.4538, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 4.45, |
|
"grad_norm": 1.7222325801849365, |
|
"learning_rate": 0.00010063897763578276, |
|
"loss": 0.4326, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 4.47, |
|
"grad_norm": 1.9806593656539917, |
|
"learning_rate": 0.00010010649627263047, |
|
"loss": 0.4568, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 4.49, |
|
"grad_norm": 1.6622909307479858, |
|
"learning_rate": 9.957401490947817e-05, |
|
"loss": 0.4504, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 4.51, |
|
"grad_norm": 1.593268871307373, |
|
"learning_rate": 9.904153354632588e-05, |
|
"loss": 0.3772, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 4.53, |
|
"grad_norm": 2.6867918968200684, |
|
"learning_rate": 9.850905218317358e-05, |
|
"loss": 0.4217, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 4.55, |
|
"grad_norm": 2.0145435333251953, |
|
"learning_rate": 9.797657082002131e-05, |
|
"loss": 0.3476, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 4.57, |
|
"grad_norm": 2.0304441452026367, |
|
"learning_rate": 9.744408945686902e-05, |
|
"loss": 0.3252, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 4.59, |
|
"grad_norm": 4.424630165100098, |
|
"learning_rate": 9.691160809371672e-05, |
|
"loss": 0.5274, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 4.61, |
|
"grad_norm": 1.801672101020813, |
|
"learning_rate": 9.637912673056444e-05, |
|
"loss": 0.4023, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 4.62, |
|
"grad_norm": 2.912360906600952, |
|
"learning_rate": 9.584664536741214e-05, |
|
"loss": 0.3901, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 4.64, |
|
"grad_norm": 2.676485538482666, |
|
"learning_rate": 9.531416400425986e-05, |
|
"loss": 0.4589, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 4.66, |
|
"grad_norm": 2.1935267448425293, |
|
"learning_rate": 9.478168264110757e-05, |
|
"loss": 0.5248, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 4.68, |
|
"grad_norm": 2.977781295776367, |
|
"learning_rate": 9.424920127795528e-05, |
|
"loss": 0.4278, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 4.7, |
|
"grad_norm": 4.009756088256836, |
|
"learning_rate": 9.371671991480299e-05, |
|
"loss": 0.4132, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 4.72, |
|
"grad_norm": 3.5155155658721924, |
|
"learning_rate": 9.318423855165069e-05, |
|
"loss": 0.3934, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 4.74, |
|
"grad_norm": 2.687786817550659, |
|
"learning_rate": 9.265175718849842e-05, |
|
"loss": 0.3679, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 4.76, |
|
"grad_norm": 2.9000587463378906, |
|
"learning_rate": 9.211927582534611e-05, |
|
"loss": 0.483, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 4.77, |
|
"grad_norm": 2.172851085662842, |
|
"learning_rate": 9.158679446219383e-05, |
|
"loss": 0.3782, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 4.79, |
|
"grad_norm": 4.356163024902344, |
|
"learning_rate": 9.105431309904154e-05, |
|
"loss": 0.4852, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 4.81, |
|
"grad_norm": 3.4460835456848145, |
|
"learning_rate": 9.052183173588925e-05, |
|
"loss": 0.4019, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 4.83, |
|
"grad_norm": 3.1064889430999756, |
|
"learning_rate": 8.998935037273696e-05, |
|
"loss": 0.397, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 4.85, |
|
"grad_norm": 2.628739833831787, |
|
"learning_rate": 8.945686900958466e-05, |
|
"loss": 0.385, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 4.87, |
|
"grad_norm": 1.598198413848877, |
|
"learning_rate": 8.892438764643239e-05, |
|
"loss": 0.2814, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 4.89, |
|
"grad_norm": 4.517331123352051, |
|
"learning_rate": 8.839190628328009e-05, |
|
"loss": 0.3773, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 4.91, |
|
"grad_norm": 3.0325510501861572, |
|
"learning_rate": 8.78594249201278e-05, |
|
"loss": 0.3999, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 4.92, |
|
"grad_norm": 2.6737074851989746, |
|
"learning_rate": 8.732694355697551e-05, |
|
"loss": 0.4072, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 4.94, |
|
"grad_norm": 2.7080023288726807, |
|
"learning_rate": 8.679446219382322e-05, |
|
"loss": 0.4635, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 4.96, |
|
"grad_norm": 2.1088662147521973, |
|
"learning_rate": 8.626198083067093e-05, |
|
"loss": 0.4483, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 4.98, |
|
"grad_norm": 1.194525957107544, |
|
"learning_rate": 8.572949946751863e-05, |
|
"loss": 0.4549, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 12.147368431091309, |
|
"learning_rate": 8.519701810436636e-05, |
|
"loss": 0.425, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_accuracy": 0.538, |
|
"eval_loss": 0.9824263453483582, |
|
"eval_runtime": 5.1851, |
|
"eval_samples_per_second": 96.431, |
|
"eval_steps_per_second": 3.086, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 5.02, |
|
"grad_norm": 1.088959813117981, |
|
"learning_rate": 8.466453674121406e-05, |
|
"loss": 0.295, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 5.04, |
|
"grad_norm": 1.6955100297927856, |
|
"learning_rate": 8.413205537806177e-05, |
|
"loss": 0.2041, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 5.06, |
|
"grad_norm": 2.099268913269043, |
|
"learning_rate": 8.359957401490948e-05, |
|
"loss": 0.3164, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 5.08, |
|
"grad_norm": 3.751316547393799, |
|
"learning_rate": 8.30670926517572e-05, |
|
"loss": 0.2167, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 5.09, |
|
"grad_norm": 0.7041321396827698, |
|
"learning_rate": 8.25346112886049e-05, |
|
"loss": 0.3246, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 5.11, |
|
"grad_norm": 1.6721878051757812, |
|
"learning_rate": 8.20021299254526e-05, |
|
"loss": 0.2365, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 5.13, |
|
"grad_norm": 3.7711753845214844, |
|
"learning_rate": 8.146964856230033e-05, |
|
"loss": 0.22, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 5.15, |
|
"grad_norm": 5.828824520111084, |
|
"learning_rate": 8.093716719914803e-05, |
|
"loss": 0.3559, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 5.17, |
|
"grad_norm": 2.093899726867676, |
|
"learning_rate": 8.040468583599574e-05, |
|
"loss": 0.2061, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 5.19, |
|
"grad_norm": 4.466729164123535, |
|
"learning_rate": 7.987220447284345e-05, |
|
"loss": 0.3012, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 5.21, |
|
"grad_norm": 2.4526288509368896, |
|
"learning_rate": 7.933972310969116e-05, |
|
"loss": 0.2209, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 5.23, |
|
"grad_norm": 4.241008281707764, |
|
"learning_rate": 7.880724174653888e-05, |
|
"loss": 0.2233, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 5.24, |
|
"grad_norm": 2.828065872192383, |
|
"learning_rate": 7.827476038338658e-05, |
|
"loss": 0.1914, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 5.26, |
|
"grad_norm": 4.489712715148926, |
|
"learning_rate": 7.77422790202343e-05, |
|
"loss": 0.2209, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 5.28, |
|
"grad_norm": 7.091925621032715, |
|
"learning_rate": 7.7209797657082e-05, |
|
"loss": 0.2372, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 5.3, |
|
"grad_norm": 3.1179749965667725, |
|
"learning_rate": 7.667731629392971e-05, |
|
"loss": 0.3655, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 5.32, |
|
"grad_norm": 2.333217144012451, |
|
"learning_rate": 7.614483493077744e-05, |
|
"loss": 0.2942, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 5.34, |
|
"grad_norm": 3.904360771179199, |
|
"learning_rate": 7.561235356762514e-05, |
|
"loss": 0.2113, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 5.36, |
|
"grad_norm": 3.5696709156036377, |
|
"learning_rate": 7.507987220447285e-05, |
|
"loss": 0.1951, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 5.38, |
|
"grad_norm": 2.9150383472442627, |
|
"learning_rate": 7.454739084132055e-05, |
|
"loss": 0.2142, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 5.39, |
|
"grad_norm": 3.8467230796813965, |
|
"learning_rate": 7.401490947816827e-05, |
|
"loss": 0.2315, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 5.41, |
|
"grad_norm": 2.5664279460906982, |
|
"learning_rate": 7.348242811501597e-05, |
|
"loss": 0.3081, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 5.43, |
|
"grad_norm": 2.859978199005127, |
|
"learning_rate": 7.294994675186368e-05, |
|
"loss": 0.2426, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 5.45, |
|
"grad_norm": 2.69465970993042, |
|
"learning_rate": 7.241746538871141e-05, |
|
"loss": 0.3018, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 5.47, |
|
"grad_norm": 3.34781551361084, |
|
"learning_rate": 7.188498402555911e-05, |
|
"loss": 0.1438, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 5.49, |
|
"grad_norm": 4.8066182136535645, |
|
"learning_rate": 7.135250266240682e-05, |
|
"loss": 0.224, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 5.51, |
|
"grad_norm": 0.3521786630153656, |
|
"learning_rate": 7.082002129925452e-05, |
|
"loss": 0.2222, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 5.53, |
|
"grad_norm": 2.7981643676757812, |
|
"learning_rate": 7.028753993610224e-05, |
|
"loss": 0.3419, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 5.55, |
|
"grad_norm": 4.798647403717041, |
|
"learning_rate": 6.975505857294996e-05, |
|
"loss": 0.2757, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 5.56, |
|
"grad_norm": 4.175738334655762, |
|
"learning_rate": 6.922257720979765e-05, |
|
"loss": 0.2388, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 5.58, |
|
"grad_norm": 3.5461745262145996, |
|
"learning_rate": 6.869009584664538e-05, |
|
"loss": 0.1961, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 5.6, |
|
"grad_norm": 2.714567184448242, |
|
"learning_rate": 6.815761448349308e-05, |
|
"loss": 0.2225, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 5.62, |
|
"grad_norm": 4.761994361877441, |
|
"learning_rate": 6.762513312034079e-05, |
|
"loss": 0.23, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 5.64, |
|
"grad_norm": 2.0303542613983154, |
|
"learning_rate": 6.70926517571885e-05, |
|
"loss": 0.1996, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 5.66, |
|
"grad_norm": 3.595022678375244, |
|
"learning_rate": 6.656017039403621e-05, |
|
"loss": 0.2089, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 5.68, |
|
"grad_norm": 4.19135856628418, |
|
"learning_rate": 6.602768903088393e-05, |
|
"loss": 0.2199, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 5.7, |
|
"grad_norm": 6.599343299865723, |
|
"learning_rate": 6.549520766773163e-05, |
|
"loss": 0.2658, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 5.71, |
|
"grad_norm": 4.5252685546875, |
|
"learning_rate": 6.496272630457935e-05, |
|
"loss": 0.1459, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 5.73, |
|
"grad_norm": 2.3499250411987305, |
|
"learning_rate": 6.443024494142705e-05, |
|
"loss": 0.2447, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 5.75, |
|
"grad_norm": 4.795304298400879, |
|
"learning_rate": 6.389776357827476e-05, |
|
"loss": 0.1987, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 5.77, |
|
"grad_norm": 2.328916311264038, |
|
"learning_rate": 6.336528221512247e-05, |
|
"loss": 0.2369, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 5.79, |
|
"grad_norm": 5.857803821563721, |
|
"learning_rate": 6.283280085197019e-05, |
|
"loss": 0.2668, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 5.81, |
|
"grad_norm": 6.459561347961426, |
|
"learning_rate": 6.23003194888179e-05, |
|
"loss": 0.2438, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 5.83, |
|
"grad_norm": 3.3628289699554443, |
|
"learning_rate": 6.17678381256656e-05, |
|
"loss": 0.2694, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 5.85, |
|
"grad_norm": 3.036752223968506, |
|
"learning_rate": 6.123535676251332e-05, |
|
"loss": 0.3594, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 5.86, |
|
"grad_norm": 1.0607565641403198, |
|
"learning_rate": 6.070287539936103e-05, |
|
"loss": 0.1804, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 5.88, |
|
"grad_norm": 3.0828421115875244, |
|
"learning_rate": 6.017039403620873e-05, |
|
"loss": 0.2408, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 5.9, |
|
"grad_norm": 6.958425045013428, |
|
"learning_rate": 5.963791267305645e-05, |
|
"loss": 0.2278, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 5.92, |
|
"grad_norm": 6.2445526123046875, |
|
"learning_rate": 5.910543130990416e-05, |
|
"loss": 0.1687, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 5.94, |
|
"grad_norm": 4.447784900665283, |
|
"learning_rate": 5.857294994675186e-05, |
|
"loss": 0.2019, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 5.96, |
|
"grad_norm": 2.6770880222320557, |
|
"learning_rate": 5.8040468583599575e-05, |
|
"loss": 0.2857, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 5.98, |
|
"grad_norm": 2.762964963912964, |
|
"learning_rate": 5.750798722044729e-05, |
|
"loss": 0.2092, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"grad_norm": 4.038713455200195, |
|
"learning_rate": 5.6975505857295e-05, |
|
"loss": 0.2869, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_accuracy": 0.542, |
|
"eval_loss": 1.6185880899429321, |
|
"eval_runtime": 5.2709, |
|
"eval_samples_per_second": 94.861, |
|
"eval_steps_per_second": 3.036, |
|
"step": 3192 |
|
}, |
|
{ |
|
"epoch": 6.02, |
|
"grad_norm": 4.724943161010742, |
|
"learning_rate": 5.6443024494142705e-05, |
|
"loss": 0.1294, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 6.03, |
|
"grad_norm": 0.55290687084198, |
|
"learning_rate": 5.5910543130990424e-05, |
|
"loss": 0.0765, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 6.05, |
|
"grad_norm": 2.6764791011810303, |
|
"learning_rate": 5.537806176783813e-05, |
|
"loss": 0.0919, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 6.07, |
|
"grad_norm": 0.35876473784446716, |
|
"learning_rate": 5.4845580404685834e-05, |
|
"loss": 0.1147, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 6.09, |
|
"grad_norm": 7.607916355133057, |
|
"learning_rate": 5.4313099041533546e-05, |
|
"loss": 0.1175, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 6.11, |
|
"grad_norm": 0.5861133337020874, |
|
"learning_rate": 5.3780617678381265e-05, |
|
"loss": 0.041, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 6.13, |
|
"grad_norm": 7.388729095458984, |
|
"learning_rate": 5.324813631522897e-05, |
|
"loss": 0.0932, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 6.15, |
|
"grad_norm": 0.1238517016172409, |
|
"learning_rate": 5.2715654952076676e-05, |
|
"loss": 0.1329, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 6.17, |
|
"grad_norm": 2.567272424697876, |
|
"learning_rate": 5.2183173588924395e-05, |
|
"loss": 0.0433, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 6.18, |
|
"grad_norm": 2.4135849475860596, |
|
"learning_rate": 5.16506922257721e-05, |
|
"loss": 0.068, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 6.2, |
|
"grad_norm": 0.1513521671295166, |
|
"learning_rate": 5.1118210862619806e-05, |
|
"loss": 0.0358, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 6.22, |
|
"grad_norm": 0.09611086547374725, |
|
"learning_rate": 5.058572949946752e-05, |
|
"loss": 0.1276, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 6.24, |
|
"grad_norm": 0.34191715717315674, |
|
"learning_rate": 5.0053248136315237e-05, |
|
"loss": 0.0796, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 6.26, |
|
"grad_norm": 0.03613116592168808, |
|
"learning_rate": 4.952076677316294e-05, |
|
"loss": 0.0595, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 6.28, |
|
"grad_norm": 9.316259384155273, |
|
"learning_rate": 4.8988285410010654e-05, |
|
"loss": 0.0588, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 6.3, |
|
"grad_norm": 9.937971115112305, |
|
"learning_rate": 4.845580404685836e-05, |
|
"loss": 0.1649, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 6.32, |
|
"grad_norm": 8.708675384521484, |
|
"learning_rate": 4.792332268370607e-05, |
|
"loss": 0.2051, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 6.33, |
|
"grad_norm": 1.0412534475326538, |
|
"learning_rate": 4.7390841320553784e-05, |
|
"loss": 0.117, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 6.35, |
|
"grad_norm": 0.2932884395122528, |
|
"learning_rate": 4.6858359957401496e-05, |
|
"loss": 0.1319, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 6.37, |
|
"grad_norm": 9.108260154724121, |
|
"learning_rate": 4.632587859424921e-05, |
|
"loss": 0.0514, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 6.39, |
|
"grad_norm": 5.959402084350586, |
|
"learning_rate": 4.579339723109691e-05, |
|
"loss": 0.0938, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 6.41, |
|
"grad_norm": 2.6586625576019287, |
|
"learning_rate": 4.5260915867944625e-05, |
|
"loss": 0.094, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 6.43, |
|
"grad_norm": 0.6427993178367615, |
|
"learning_rate": 4.472843450479233e-05, |
|
"loss": 0.0934, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 6.45, |
|
"grad_norm": 4.278239727020264, |
|
"learning_rate": 4.419595314164004e-05, |
|
"loss": 0.1062, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 6.47, |
|
"grad_norm": 12.555079460144043, |
|
"learning_rate": 4.3663471778487755e-05, |
|
"loss": 0.0382, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 6.48, |
|
"grad_norm": 7.286933422088623, |
|
"learning_rate": 4.313099041533547e-05, |
|
"loss": 0.07, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 6.5, |
|
"grad_norm": 0.022136323153972626, |
|
"learning_rate": 4.259850905218318e-05, |
|
"loss": 0.0973, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 6.52, |
|
"grad_norm": 13.952143669128418, |
|
"learning_rate": 4.2066027689030885e-05, |
|
"loss": 0.0956, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 6.54, |
|
"grad_norm": 5.638768672943115, |
|
"learning_rate": 4.15335463258786e-05, |
|
"loss": 0.0594, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 6.56, |
|
"grad_norm": 0.06895623356103897, |
|
"learning_rate": 4.10010649627263e-05, |
      "loss": 0.1302,
      "step": 3490
    },
    {
      "epoch": 6.58,
      "grad_norm": 0.5818095207214355,
      "learning_rate": 4.0468583599574014e-05,
      "loss": 0.1488,
      "step": 3500
    },
    {
      "epoch": 6.6,
      "grad_norm": 0.13191522657871246,
      "learning_rate": 3.9989350372736954e-05,
      "loss": 0.0878,
      "step": 3510
    },
    {
      "epoch": 6.62,
      "grad_norm": 18.15912628173828,
      "learning_rate": 3.9456869009584666e-05,
      "loss": 0.1311,
      "step": 3520
    },
    {
      "epoch": 6.64,
      "grad_norm": 13.288291931152344,
      "learning_rate": 3.892438764643238e-05,
      "loss": 0.2676,
      "step": 3530
    },
    {
      "epoch": 6.65,
      "grad_norm": 2.8228518962860107,
      "learning_rate": 3.839190628328009e-05,
      "loss": 0.0771,
      "step": 3540
    },
    {
      "epoch": 6.67,
      "grad_norm": 0.11967118829488754,
      "learning_rate": 3.7859424920127795e-05,
      "loss": 0.047,
      "step": 3550
    },
    {
      "epoch": 6.69,
      "grad_norm": 0.17968201637268066,
      "learning_rate": 3.732694355697551e-05,
      "loss": 0.0551,
      "step": 3560
    },
    {
      "epoch": 6.71,
      "grad_norm": 3.643249273300171,
      "learning_rate": 3.679446219382322e-05,
      "loss": 0.0916,
      "step": 3570
    },
    {
      "epoch": 6.73,
      "grad_norm": 1.9196984767913818,
      "learning_rate": 3.6261980830670925e-05,
      "loss": 0.0497,
      "step": 3580
    },
    {
      "epoch": 6.75,
      "grad_norm": 14.608271598815918,
      "learning_rate": 3.572949946751864e-05,
      "loss": 0.2575,
      "step": 3590
    },
    {
      "epoch": 6.77,
      "grad_norm": 0.6130942106246948,
      "learning_rate": 3.519701810436635e-05,
      "loss": 0.1304,
      "step": 3600
    },
    {
      "epoch": 6.79,
      "grad_norm": 0.11332878470420837,
      "learning_rate": 3.466453674121406e-05,
      "loss": 0.0753,
      "step": 3610
    },
    {
      "epoch": 6.8,
      "grad_norm": 0.6603028178215027,
      "learning_rate": 3.413205537806177e-05,
      "loss": 0.0852,
      "step": 3620
    },
    {
      "epoch": 6.82,
      "grad_norm": 4.780795097351074,
      "learning_rate": 3.359957401490948e-05,
      "loss": 0.0702,
      "step": 3630
    },
    {
      "epoch": 6.84,
      "grad_norm": 2.858586549758911,
      "learning_rate": 3.306709265175719e-05,
      "loss": 0.1232,
      "step": 3640
    },
    {
      "epoch": 6.86,
      "grad_norm": 4.370979309082031,
      "learning_rate": 3.2534611288604896e-05,
      "loss": 0.1168,
      "step": 3650
    },
    {
      "epoch": 6.88,
      "grad_norm": 0.04837312549352646,
      "learning_rate": 3.2002129925452615e-05,
      "loss": 0.0296,
      "step": 3660
    },
    {
      "epoch": 6.9,
      "grad_norm": 0.31512418389320374,
      "learning_rate": 3.146964856230032e-05,
      "loss": 0.1051,
      "step": 3670
    },
    {
      "epoch": 6.92,
      "grad_norm": 3.6252450942993164,
      "learning_rate": 3.093716719914803e-05,
      "loss": 0.1216,
      "step": 3680
    },
    {
      "epoch": 6.94,
      "grad_norm": 0.5809699892997742,
      "learning_rate": 3.0404685835995745e-05,
      "loss": 0.0501,
      "step": 3690
    },
    {
      "epoch": 6.95,
      "grad_norm": 0.15533213317394257,
      "learning_rate": 2.987220447284345e-05,
      "loss": 0.0621,
      "step": 3700
    },
    {
      "epoch": 6.97,
      "grad_norm": 0.04270392283797264,
      "learning_rate": 2.9339723109691166e-05,
      "loss": 0.0353,
      "step": 3710
    },
    {
      "epoch": 6.99,
      "grad_norm": 0.01716708578169346,
      "learning_rate": 2.880724174653887e-05,
      "loss": 0.0337,
      "step": 3720
    },
    {
      "epoch": 7.0,
      "eval_accuracy": 0.538,
      "eval_loss": 2.5670464038848877,
      "eval_runtime": 5.0132,
      "eval_samples_per_second": 99.737,
      "eval_steps_per_second": 3.192,
      "step": 3724
    },
    {
      "epoch": 7.01,
      "grad_norm": 0.029269874095916748,
      "learning_rate": 2.8274760383386583e-05,
      "loss": 0.0997,
      "step": 3730
    },
    {
      "epoch": 7.03,
      "grad_norm": 0.11788934469223022,
      "learning_rate": 2.7742279020234292e-05,
      "loss": 0.0082,
      "step": 3740
    },
    {
      "epoch": 7.05,
      "grad_norm": 0.01990850828588009,
      "learning_rate": 2.7209797657082004e-05,
      "loss": 0.0251,
      "step": 3750
    },
    {
      "epoch": 7.07,
      "grad_norm": 0.38145899772644043,
      "learning_rate": 2.6677316293929716e-05,
      "loss": 0.013,
      "step": 3760
    },
    {
      "epoch": 7.09,
      "grad_norm": 2.5921037197113037,
      "learning_rate": 2.6144834930777425e-05,
      "loss": 0.0601,
      "step": 3770
    },
    {
      "epoch": 7.11,
      "grad_norm": 0.03015085868537426,
      "learning_rate": 2.5612353567625137e-05,
      "loss": 0.0365,
      "step": 3780
    },
    {
      "epoch": 7.12,
      "grad_norm": 9.706396102905273,
      "learning_rate": 2.5079872204472842e-05,
      "loss": 0.0722,
      "step": 3790
    },
    {
      "epoch": 7.14,
      "grad_norm": 0.02435947395861149,
      "learning_rate": 2.4547390841320554e-05,
      "loss": 0.0082,
      "step": 3800
    },
    {
      "epoch": 7.16,
      "grad_norm": 0.030334001407027245,
      "learning_rate": 2.4014909478168267e-05,
      "loss": 0.011,
      "step": 3810
    },
    {
      "epoch": 7.18,
      "grad_norm": 3.0113189220428467,
      "learning_rate": 2.3482428115015975e-05,
      "loss": 0.0485,
      "step": 3820
    },
    {
      "epoch": 7.2,
      "grad_norm": 0.021948257461190224,
      "learning_rate": 2.2949946751863684e-05,
      "loss": 0.0384,
      "step": 3830
    },
    {
      "epoch": 7.22,
      "grad_norm": 0.02243644930422306,
      "learning_rate": 2.2417465388711396e-05,
      "loss": 0.0103,
      "step": 3840
    },
    {
      "epoch": 7.24,
      "grad_norm": 0.03432391956448555,
      "learning_rate": 2.188498402555911e-05,
      "loss": 0.0436,
      "step": 3850
    },
    {
      "epoch": 7.26,
      "grad_norm": 0.05325201898813248,
      "learning_rate": 2.1352502662406817e-05,
      "loss": 0.0519,
      "step": 3860
    },
    {
      "epoch": 7.27,
      "grad_norm": 0.01565093919634819,
      "learning_rate": 2.0820021299254526e-05,
      "loss": 0.0225,
      "step": 3870
    },
    {
      "epoch": 7.29,
      "grad_norm": 0.14941458404064178,
      "learning_rate": 2.0287539936102238e-05,
      "loss": 0.0014,
      "step": 3880
    },
    {
      "epoch": 7.31,
      "grad_norm": 0.012375161051750183,
      "learning_rate": 1.9755058572949947e-05,
      "loss": 0.0155,
      "step": 3890
    },
    {
      "epoch": 7.33,
      "grad_norm": 0.004966201260685921,
      "learning_rate": 1.9222577209797655e-05,
      "loss": 0.0131,
      "step": 3900
    },
    {
      "epoch": 7.35,
      "grad_norm": 0.013580994680523872,
      "learning_rate": 1.869009584664537e-05,
      "loss": 0.0054,
      "step": 3910
    },
    {
      "epoch": 7.37,
      "grad_norm": 0.006246180739253759,
      "learning_rate": 1.815761448349308e-05,
      "loss": 0.0953,
      "step": 3920
    },
    {
      "epoch": 7.39,
      "grad_norm": 0.05573183670639992,
      "learning_rate": 1.762513312034079e-05,
      "loss": 0.0454,
      "step": 3930
    },
    {
      "epoch": 7.41,
      "grad_norm": 0.023609144613146782,
      "learning_rate": 1.70926517571885e-05,
      "loss": 0.0313,
      "step": 3940
    },
    {
      "epoch": 7.42,
      "grad_norm": 0.01226514670997858,
      "learning_rate": 1.656017039403621e-05,
      "loss": 0.0448,
      "step": 3950
    },
    {
      "epoch": 7.44,
      "grad_norm": 20.82164764404297,
      "learning_rate": 1.6027689030883918e-05,
      "loss": 0.1159,
      "step": 3960
    },
    {
      "epoch": 7.46,
      "grad_norm": 0.018167337402701378,
      "learning_rate": 1.549520766773163e-05,
      "loss": 0.0377,
      "step": 3970
    },
    {
      "epoch": 7.48,
      "grad_norm": 0.031441397964954376,
      "learning_rate": 1.4962726304579342e-05,
      "loss": 0.0299,
      "step": 3980
    },
    {
      "epoch": 7.5,
      "grad_norm": 0.2981891632080078,
      "learning_rate": 1.4430244941427051e-05,
      "loss": 0.0301,
      "step": 3990
    },
    {
      "epoch": 7.52,
      "grad_norm": 0.052271194756031036,
      "learning_rate": 1.3897763578274761e-05,
      "loss": 0.0014,
      "step": 4000
    },
    {
      "epoch": 7.54,
      "grad_norm": 0.07910721004009247,
      "learning_rate": 1.3365282215122472e-05,
      "loss": 0.0029,
      "step": 4010
    },
    {
      "epoch": 7.56,
      "grad_norm": 1.7630151510238647,
      "learning_rate": 1.283280085197018e-05,
      "loss": 0.0421,
      "step": 4020
    },
    {
      "epoch": 7.58,
      "grad_norm": 0.013812140561640263,
      "learning_rate": 1.2300319488817893e-05,
      "loss": 0.0221,
      "step": 4030
    },
    {
      "epoch": 7.59,
      "grad_norm": 0.013484804891049862,
      "learning_rate": 1.1767838125665601e-05,
      "loss": 0.0011,
      "step": 4040
    },
    {
      "epoch": 7.61,
      "grad_norm": 0.031141534447669983,
      "learning_rate": 1.1235356762513314e-05,
      "loss": 0.0963,
      "step": 4050
    },
    {
      "epoch": 7.63,
      "grad_norm": 0.1125926822423935,
      "learning_rate": 1.0702875399361024e-05,
      "loss": 0.0206,
      "step": 4060
    },
    {
      "epoch": 7.65,
      "grad_norm": 1.0204988718032837,
      "learning_rate": 1.0170394036208733e-05,
      "loss": 0.0019,
      "step": 4070
    },
    {
      "epoch": 7.67,
      "grad_norm": 0.020539429038763046,
      "learning_rate": 9.637912673056443e-06,
      "loss": 0.0474,
      "step": 4080
    },
    {
      "epoch": 7.69,
      "grad_norm": 0.02552681416273117,
      "learning_rate": 9.105431309904154e-06,
      "loss": 0.0263,
      "step": 4090
    },
    {
      "epoch": 7.71,
      "grad_norm": 0.0711381658911705,
      "learning_rate": 8.572949946751864e-06,
      "loss": 0.0225,
      "step": 4100
    },
    {
      "epoch": 7.73,
      "grad_norm": 0.031245287507772446,
      "learning_rate": 8.040468583599574e-06,
      "loss": 0.0012,
      "step": 4110
    },
    {
      "epoch": 7.74,
      "grad_norm": 0.014275019988417625,
      "learning_rate": 7.507987220447285e-06,
      "loss": 0.0434,
      "step": 4120
    },
    {
      "epoch": 7.76,
      "grad_norm": 0.060072433203458786,
      "learning_rate": 6.975505857294995e-06,
      "loss": 0.0846,
      "step": 4130
    },
    {
      "epoch": 7.78,
      "grad_norm": 2.3613195419311523,
      "learning_rate": 6.443024494142705e-06,
      "loss": 0.0444,
      "step": 4140
    },
    {
      "epoch": 7.8,
      "grad_norm": 0.03658520057797432,
      "learning_rate": 5.910543130990415e-06,
      "loss": 0.0022,
      "step": 4150
    },
    {
      "epoch": 7.82,
      "grad_norm": 0.14258913695812225,
      "learning_rate": 5.378061767838126e-06,
      "loss": 0.0537,
      "step": 4160
    },
    {
      "epoch": 7.84,
      "grad_norm": 0.017876233905553818,
      "learning_rate": 4.845580404685836e-06,
      "loss": 0.0654,
      "step": 4170
    },
    {
      "epoch": 7.86,
      "grad_norm": 0.039430875331163406,
      "learning_rate": 4.3130990415335465e-06,
      "loss": 0.0241,
      "step": 4180
    },
    {
      "epoch": 7.88,
      "grad_norm": 0.07222764194011688,
      "learning_rate": 3.780617678381257e-06,
      "loss": 0.0018,
      "step": 4190
    },
    {
      "epoch": 7.89,
      "grad_norm": 0.2034127116203308,
      "learning_rate": 3.2481363152289674e-06,
      "loss": 0.0322,
      "step": 4200
    },
    {
      "epoch": 7.91,
      "grad_norm": 32.39055633544922,
      "learning_rate": 2.7156549520766774e-06,
      "loss": 0.0278,
      "step": 4210
    },
    {
      "epoch": 7.93,
      "grad_norm": 0.02537735551595688,
      "learning_rate": 2.183173588924388e-06,
      "loss": 0.0016,
      "step": 4220
    },
    {
      "epoch": 7.95,
      "grad_norm": 5.338490009307861,
      "learning_rate": 1.650692225772098e-06,
      "loss": 0.0305,
      "step": 4230
    },
    {
      "epoch": 7.97,
      "grad_norm": 0.04093256965279579,
      "learning_rate": 1.1182108626198083e-06,
      "loss": 0.0439,
      "step": 4240
    },
    {
      "epoch": 7.99,
      "grad_norm": 0.01651330478489399,
      "learning_rate": 5.857294994675187e-07,
      "loss": 0.0166,
      "step": 4250
    },
    {
      "epoch": 8.0,
      "eval_accuracy": 0.538,
      "eval_loss": 2.806075096130371,
      "eval_runtime": 5.1896,
      "eval_samples_per_second": 96.347,
      "eval_steps_per_second": 3.083,
      "step": 4256
    },
    {
      "epoch": 8.0,
      "step": 4256,
      "total_flos": 5.269455293792256e+18,
      "train_loss": 0.40350076897398923,
      "train_runtime": 1386.2467,
      "train_samples_per_second": 49.053,
      "train_steps_per_second": 3.07
    }
  ],
  "logging_steps": 10,
  "max_steps": 4256,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 8,
  "save_steps": 500,
  "total_flos": 5.269455293792256e+18,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}