{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.56,
  "eval_steps": 500,
  "global_step": 1000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03,
      "grad_norm": 8.571518898010254,
      "learning_rate": 0.0003965811965811966,
      "loss": 6.0892,
      "step": 10
    },
    {
      "epoch": 0.05,
      "grad_norm": 2.477086067199707,
      "learning_rate": 0.00039316239316239317,
      "loss": 1.4485,
      "step": 20
    },
    {
      "epoch": 0.08,
      "grad_norm": 0.7137540578842163,
      "learning_rate": 0.00038974358974358975,
      "loss": 0.9732,
      "step": 30
    },
    {
      "epoch": 0.1,
      "grad_norm": 0.5362057685852051,
      "learning_rate": 0.0003863247863247863,
      "loss": 0.7804,
      "step": 40
    },
    {
      "epoch": 0.13,
      "grad_norm": 0.6810179948806763,
      "learning_rate": 0.00038290598290598296,
      "loss": 0.665,
      "step": 50
    },
    {
      "epoch": 0.15,
      "grad_norm": 0.4986821115016937,
      "learning_rate": 0.0003794871794871795,
      "loss": 0.6091,
      "step": 60
    },
    {
      "epoch": 0.18,
      "grad_norm": 0.4309682250022888,
      "learning_rate": 0.00037606837606837606,
      "loss": 0.5502,
      "step": 70
    },
    {
      "epoch": 0.2,
      "grad_norm": 0.3824257552623749,
      "learning_rate": 0.0003726495726495727,
      "loss": 0.5164,
      "step": 80
    },
    {
      "epoch": 0.23,
      "grad_norm": 0.3188970386981964,
      "learning_rate": 0.00036923076923076927,
      "loss": 0.4883,
      "step": 90
    },
    {
      "epoch": 0.26,
      "grad_norm": 0.3359103202819824,
      "learning_rate": 0.00036581196581196584,
      "loss": 0.4612,
      "step": 100
    },
    {
      "epoch": 0.28,
      "grad_norm": 0.4327464699745178,
      "learning_rate": 0.0003623931623931624,
      "loss": 0.4351,
      "step": 110
    },
    {
      "epoch": 0.31,
      "grad_norm": 0.4424777030944824,
      "learning_rate": 0.000358974358974359,
      "loss": 0.4217,
      "step": 120
    },
    {
      "epoch": 0.33,
      "grad_norm": 0.520322322845459,
      "learning_rate": 0.00035555555555555557,
      "loss": 0.4076,
      "step": 130
    },
    {
      "epoch": 0.36,
      "grad_norm": 0.48572778701782227,
      "learning_rate": 0.00035213675213675215,
      "loss": 0.3948,
      "step": 140
    },
    {
      "epoch": 0.38,
      "grad_norm": 0.2985605001449585,
      "learning_rate": 0.0003487179487179487,
      "loss": 0.3823,
      "step": 150
    },
    {
      "epoch": 0.41,
      "grad_norm": 0.28738752007484436,
      "learning_rate": 0.00034529914529914536,
      "loss": 0.375,
      "step": 160
    },
    {
      "epoch": 0.44,
      "grad_norm": 0.29423144459724426,
      "learning_rate": 0.0003418803418803419,
      "loss": 0.3591,
      "step": 170
    },
    {
      "epoch": 0.46,
      "grad_norm": 0.26430046558380127,
      "learning_rate": 0.00033846153846153846,
      "loss": 0.3494,
      "step": 180
    },
    {
      "epoch": 0.49,
      "grad_norm": 0.2734215259552002,
      "learning_rate": 0.0003350427350427351,
      "loss": 0.3396,
      "step": 190
    },
    {
      "epoch": 0.51,
      "grad_norm": 0.3005197048187256,
      "learning_rate": 0.00033162393162393166,
      "loss": 0.3352,
      "step": 200
    },
    {
      "epoch": 0.54,
      "grad_norm": 0.2822723686695099,
      "learning_rate": 0.0003282051282051282,
      "loss": 0.3241,
      "step": 210
    },
    {
      "epoch": 0.56,
      "grad_norm": 0.2792316973209381,
      "learning_rate": 0.0003247863247863248,
      "loss": 0.3208,
      "step": 220
    },
    {
      "epoch": 0.59,
      "grad_norm": 0.2761669158935547,
      "learning_rate": 0.0003213675213675214,
      "loss": 0.3148,
      "step": 230
    },
    {
      "epoch": 0.61,
      "grad_norm": 0.2733113467693329,
      "learning_rate": 0.0003179487179487179,
      "loss": 0.311,
      "step": 240
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.3393694758415222,
      "learning_rate": 0.00031452991452991455,
      "loss": 0.3056,
      "step": 250
    },
    {
      "epoch": 0.67,
      "grad_norm": 0.29316985607147217,
      "learning_rate": 0.0003111111111111111,
      "loss": 0.2975,
      "step": 260
    },
    {
      "epoch": 0.69,
      "grad_norm": 0.29134783148765564,
      "learning_rate": 0.0003076923076923077,
      "loss": 0.2898,
      "step": 270
    },
    {
      "epoch": 0.72,
      "grad_norm": 0.41234660148620605,
      "learning_rate": 0.0003042735042735043,
      "loss": 0.2895,
      "step": 280
    },
    {
      "epoch": 0.74,
      "grad_norm": 0.26693716645240784,
      "learning_rate": 0.00030085470085470086,
      "loss": 0.2835,
      "step": 290
    },
    {
      "epoch": 0.77,
      "grad_norm": 0.2862294614315033,
      "learning_rate": 0.00029743589743589743,
      "loss": 0.2747,
      "step": 300
    },
    {
      "epoch": 0.79,
      "grad_norm": 0.2596249282360077,
      "learning_rate": 0.00029401709401709406,
      "loss": 0.2752,
      "step": 310
    },
    {
      "epoch": 0.82,
      "grad_norm": 0.2555866241455078,
      "learning_rate": 0.0002905982905982906,
      "loss": 0.2662,
      "step": 320
    },
    {
      "epoch": 0.84,
      "grad_norm": 0.3845195472240448,
      "learning_rate": 0.0002871794871794872,
      "loss": 0.2625,
      "step": 330
    },
    {
      "epoch": 0.87,
      "grad_norm": 0.23550209403038025,
      "learning_rate": 0.0002837606837606838,
      "loss": 0.256,
      "step": 340
    },
    {
      "epoch": 0.9,
      "grad_norm": 0.2434936910867691,
      "learning_rate": 0.0002803418803418803,
      "loss": 0.2545,
      "step": 350
    },
    {
      "epoch": 0.92,
      "grad_norm": 0.23562268912792206,
      "learning_rate": 0.00027692307692307695,
      "loss": 0.2536,
      "step": 360
    },
    {
      "epoch": 0.95,
      "grad_norm": 0.3110085427761078,
      "learning_rate": 0.0002735042735042735,
      "loss": 0.2497,
      "step": 370
    },
    {
      "epoch": 0.97,
      "grad_norm": 0.2646142244338989,
      "learning_rate": 0.0002700854700854701,
      "loss": 0.2448,
      "step": 380
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.22812116146087646,
      "learning_rate": 0.0002666666666666667,
      "loss": 0.2409,
      "step": 390
    },
    {
      "epoch": 1.02,
      "grad_norm": 0.21481893956661224,
      "learning_rate": 0.00026324786324786326,
      "loss": 0.225,
      "step": 400
    },
    {
      "epoch": 1.05,
      "grad_norm": 0.2561526298522949,
      "learning_rate": 0.00025982905982905983,
      "loss": 0.2194,
      "step": 410
    },
    {
      "epoch": 1.08,
      "grad_norm": 0.2297515720129013,
      "learning_rate": 0.00025641025641025646,
      "loss": 0.216,
      "step": 420
    },
    {
      "epoch": 1.1,
      "grad_norm": 0.25526463985443115,
      "learning_rate": 0.000252991452991453,
      "loss": 0.2171,
      "step": 430
    },
    {
      "epoch": 1.13,
      "grad_norm": 0.24202637374401093,
      "learning_rate": 0.00024957264957264956,
      "loss": 0.2149,
      "step": 440
    },
    {
      "epoch": 1.15,
      "grad_norm": 0.20644807815551758,
      "learning_rate": 0.0002461538461538462,
      "loss": 0.209,
      "step": 450
    },
    {
      "epoch": 1.18,
      "grad_norm": 0.2795998454093933,
      "learning_rate": 0.00024273504273504272,
      "loss": 0.2071,
      "step": 460
    },
    {
      "epoch": 1.2,
      "grad_norm": 0.306149959564209,
      "learning_rate": 0.00023931623931623932,
      "loss": 0.208,
      "step": 470
    },
    {
      "epoch": 1.23,
      "grad_norm": 0.2355523407459259,
      "learning_rate": 0.00023589743589743593,
      "loss": 0.2051,
      "step": 480
    },
    {
      "epoch": 1.25,
      "grad_norm": 0.2909263074398041,
      "learning_rate": 0.0002324786324786325,
      "loss": 0.2023,
      "step": 490
    },
    {
      "epoch": 1.28,
      "grad_norm": 0.5298261642456055,
      "learning_rate": 0.00022905982905982905,
      "loss": 0.2018,
      "step": 500
    },
    {
      "epoch": 1.28,
      "eval_cer": 0.9319083335386478,
      "eval_loss": 0.17038685083389282,
      "eval_runtime": 138.0008,
      "eval_samples_per_second": 14.493,
      "eval_steps_per_second": 0.457,
      "step": 500
    },
    {
      "epoch": 1.31,
      "grad_norm": 0.23548871278762817,
      "learning_rate": 0.00022564102564102566,
      "loss": 0.2008,
      "step": 510
    },
    {
      "epoch": 1.33,
      "grad_norm": 0.20162977278232574,
      "learning_rate": 0.00022222222222222223,
      "loss": 0.1975,
      "step": 520
    },
    {
      "epoch": 1.36,
      "grad_norm": 0.2593408524990082,
      "learning_rate": 0.00021880341880341884,
      "loss": 0.1959,
      "step": 530
    },
    {
      "epoch": 1.38,
      "grad_norm": 0.21452312171459198,
      "learning_rate": 0.0002153846153846154,
      "loss": 0.194,
      "step": 540
    },
    {
      "epoch": 1.41,
      "grad_norm": 0.2637544274330139,
      "learning_rate": 0.000211965811965812,
      "loss": 0.1909,
      "step": 550
    },
    {
      "epoch": 1.43,
      "grad_norm": 0.24357128143310547,
      "learning_rate": 0.00020854700854700857,
      "loss": 0.19,
      "step": 560
    },
    {
      "epoch": 1.46,
      "grad_norm": 0.2084117978811264,
      "learning_rate": 0.00020512820512820512,
      "loss": 0.1857,
      "step": 570
    },
    {
      "epoch": 1.48,
      "grad_norm": 0.23245294392108917,
      "learning_rate": 0.00020170940170940172,
      "loss": 0.1858,
      "step": 580
    },
    {
      "epoch": 1.51,
      "grad_norm": 0.23836293816566467,
      "learning_rate": 0.0001982905982905983,
      "loss": 0.183,
      "step": 590
    },
    {
      "epoch": 1.54,
      "grad_norm": 0.19184565544128418,
      "learning_rate": 0.00019487179487179487,
      "loss": 0.183,
      "step": 600
    },
    {
      "epoch": 1.56,
      "grad_norm": 0.20401564240455627,
      "learning_rate": 0.00019145299145299148,
      "loss": 0.1829,
      "step": 610
    },
    {
      "epoch": 1.59,
      "grad_norm": 0.21579188108444214,
      "learning_rate": 0.00018803418803418803,
      "loss": 0.1812,
      "step": 620
    },
    {
      "epoch": 1.61,
      "grad_norm": 0.23108145594596863,
      "learning_rate": 0.00018461538461538463,
      "loss": 0.1781,
      "step": 630
    },
    {
      "epoch": 1.64,
      "grad_norm": 0.2311713844537735,
      "learning_rate": 0.0001811965811965812,
      "loss": 0.1755,
      "step": 640
    },
    {
      "epoch": 1.66,
      "grad_norm": 0.19794794917106628,
      "learning_rate": 0.00017777777777777779,
      "loss": 0.1768,
      "step": 650
    },
    {
      "epoch": 1.69,
      "grad_norm": 0.2516119182109833,
      "learning_rate": 0.00017435897435897436,
      "loss": 0.1737,
      "step": 660
    },
    {
      "epoch": 1.72,
      "grad_norm": 0.20975567400455475,
      "learning_rate": 0.00017094017094017094,
      "loss": 0.1712,
      "step": 670
    },
    {
      "epoch": 1.74,
      "grad_norm": 0.22168505191802979,
      "learning_rate": 0.00016752136752136754,
      "loss": 0.1693,
      "step": 680
    },
    {
      "epoch": 1.77,
      "grad_norm": 0.22844062745571136,
      "learning_rate": 0.0001641025641025641,
      "loss": 0.168,
      "step": 690
    },
    {
      "epoch": 1.79,
      "grad_norm": 0.22804197669029236,
      "learning_rate": 0.0001606837606837607,
      "loss": 0.1721,
      "step": 700
    },
    {
      "epoch": 1.82,
      "grad_norm": 0.22620578110218048,
      "learning_rate": 0.00015726495726495727,
      "loss": 0.1703,
      "step": 710
    },
    {
      "epoch": 1.84,
      "grad_norm": 0.21445313096046448,
      "learning_rate": 0.00015384615384615385,
      "loss": 0.1673,
      "step": 720
    },
    {
      "epoch": 1.87,
      "grad_norm": 0.207479327917099,
      "learning_rate": 0.00015042735042735043,
      "loss": 0.1648,
      "step": 730
    },
    {
      "epoch": 1.89,
      "grad_norm": 0.22134087979793549,
      "learning_rate": 0.00014700854700854703,
      "loss": 0.1629,
      "step": 740
    },
    {
      "epoch": 1.92,
      "grad_norm": 0.20121484994888306,
      "learning_rate": 0.0001435897435897436,
      "loss": 0.1638,
      "step": 750
    },
    {
      "epoch": 1.95,
      "grad_norm": 0.2002618908882141,
      "learning_rate": 0.00014017094017094016,
      "loss": 0.1621,
      "step": 760
    },
    {
      "epoch": 1.97,
      "grad_norm": 0.19750453531742096,
      "learning_rate": 0.00013675213675213676,
      "loss": 0.1667,
      "step": 770
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.22286508977413177,
      "learning_rate": 0.00013333333333333334,
      "loss": 0.1642,
      "step": 780
    },
    {
      "epoch": 2.02,
      "grad_norm": 0.21668635308742523,
      "learning_rate": 0.00012991452991452992,
      "loss": 0.1482,
      "step": 790
    },
    {
      "epoch": 2.05,
      "grad_norm": 0.233961820602417,
      "learning_rate": 0.0001264957264957265,
      "loss": 0.1453,
      "step": 800
    },
    {
      "epoch": 2.07,
      "grad_norm": 0.1865084022283554,
      "learning_rate": 0.0001230769230769231,
      "loss": 0.1455,
      "step": 810
    },
    {
      "epoch": 2.1,
      "grad_norm": 0.1853141337633133,
      "learning_rate": 0.00011965811965811966,
      "loss": 0.1442,
      "step": 820
    },
    {
      "epoch": 2.12,
      "grad_norm": 0.17371739447116852,
      "learning_rate": 0.00011623931623931625,
      "loss": 0.1382,
      "step": 830
    },
    {
      "epoch": 2.15,
      "grad_norm": 0.19631154835224152,
      "learning_rate": 0.00011282051282051283,
      "loss": 0.1415,
      "step": 840
    },
    {
      "epoch": 2.18,
      "grad_norm": 0.194850891828537,
      "learning_rate": 0.00010940170940170942,
      "loss": 0.141,
      "step": 850
    },
    {
      "epoch": 2.2,
      "grad_norm": 0.18121449649333954,
      "learning_rate": 0.000105982905982906,
      "loss": 0.1388,
      "step": 860
    },
    {
      "epoch": 2.23,
      "grad_norm": 0.2176773101091385,
      "learning_rate": 0.00010256410256410256,
      "loss": 0.1399,
      "step": 870
    },
    {
      "epoch": 2.25,
      "grad_norm": 0.19013133645057678,
      "learning_rate": 9.914529914529915e-05,
      "loss": 0.137,
      "step": 880
    },
    {
      "epoch": 2.28,
      "grad_norm": 0.22148679196834564,
      "learning_rate": 9.572649572649574e-05,
      "loss": 0.139,
      "step": 890
    },
    {
      "epoch": 2.3,
      "grad_norm": 0.20861493051052094,
      "learning_rate": 9.230769230769232e-05,
      "loss": 0.139,
      "step": 900
    },
    {
      "epoch": 2.33,
      "grad_norm": 0.17541790008544922,
      "learning_rate": 8.888888888888889e-05,
      "loss": 0.1362,
      "step": 910
    },
    {
      "epoch": 2.36,
      "grad_norm": 0.1971459984779358,
      "learning_rate": 8.547008547008547e-05,
      "loss": 0.1346,
      "step": 920
    },
    {
      "epoch": 2.38,
      "grad_norm": 0.20883004367351532,
      "learning_rate": 8.205128205128205e-05,
      "loss": 0.1351,
      "step": 930
    },
    {
      "epoch": 2.41,
      "grad_norm": 0.18058577179908752,
      "learning_rate": 7.863247863247864e-05,
      "loss": 0.1363,
      "step": 940
    },
    {
      "epoch": 2.43,
      "grad_norm": 0.19193512201309204,
      "learning_rate": 7.521367521367521e-05,
      "loss": 0.1359,
      "step": 950
    },
    {
      "epoch": 2.46,
      "grad_norm": 0.17777132987976074,
      "learning_rate": 7.17948717948718e-05,
      "loss": 0.1363,
      "step": 960
    },
    {
      "epoch": 2.48,
      "grad_norm": 0.1730206310749054,
      "learning_rate": 6.837606837606838e-05,
      "loss": 0.1339,
      "step": 970
    },
    {
      "epoch": 2.51,
      "grad_norm": 0.172698512673378,
      "learning_rate": 6.495726495726496e-05,
      "loss": 0.1317,
      "step": 980
    },
    {
      "epoch": 2.53,
      "grad_norm": 0.1746242642402649,
      "learning_rate": 6.153846153846155e-05,
      "loss": 0.132,
      "step": 990
    },
    {
      "epoch": 2.56,
      "grad_norm": 0.1631608009338379,
      "learning_rate": 5.8119658119658126e-05,
      "loss": 0.1292,
      "step": 1000
    },
    {
      "epoch": 2.56,
      "eval_cer": 0.9320702386692806,
      "eval_loss": 0.11450555920600891,
      "eval_runtime": 136.638,
      "eval_samples_per_second": 14.637,
      "eval_steps_per_second": 0.461,
      "step": 1000
    }
  ],
  "logging_steps": 10,
  "max_steps": 1170,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "total_flos": 6.737241075941376e+16,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}