{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.56, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03, "grad_norm": 8.571518898010254, "learning_rate": 0.0003965811965811966, "loss": 6.0892, "step": 10 }, { "epoch": 0.05, "grad_norm": 2.477086067199707, "learning_rate": 0.00039316239316239317, "loss": 1.4485, "step": 20 }, { "epoch": 0.08, "grad_norm": 0.7137540578842163, "learning_rate": 0.00038974358974358975, "loss": 0.9732, "step": 30 }, { "epoch": 0.1, "grad_norm": 0.5362057685852051, "learning_rate": 0.0003863247863247863, "loss": 0.7804, "step": 40 }, { "epoch": 0.13, "grad_norm": 0.6810179948806763, "learning_rate": 0.00038290598290598296, "loss": 0.665, "step": 50 }, { "epoch": 0.15, "grad_norm": 0.4986821115016937, "learning_rate": 0.0003794871794871795, "loss": 0.6091, "step": 60 }, { "epoch": 0.18, "grad_norm": 0.4309682250022888, "learning_rate": 0.00037606837606837606, "loss": 0.5502, "step": 70 }, { "epoch": 0.2, "grad_norm": 0.3824257552623749, "learning_rate": 0.0003726495726495727, "loss": 0.5164, "step": 80 }, { "epoch": 0.23, "grad_norm": 0.3188970386981964, "learning_rate": 0.00036923076923076927, "loss": 0.4883, "step": 90 }, { "epoch": 0.26, "grad_norm": 0.3359103202819824, "learning_rate": 0.00036581196581196584, "loss": 0.4612, "step": 100 }, { "epoch": 0.28, "grad_norm": 0.4327464699745178, "learning_rate": 0.0003623931623931624, "loss": 0.4351, "step": 110 }, { "epoch": 0.31, "grad_norm": 0.4424777030944824, "learning_rate": 0.000358974358974359, "loss": 0.4217, "step": 120 }, { "epoch": 0.33, "grad_norm": 0.520322322845459, "learning_rate": 0.00035555555555555557, "loss": 0.4076, "step": 130 }, { "epoch": 0.36, "grad_norm": 0.48572778701782227, "learning_rate": 0.00035213675213675215, "loss": 0.3948, "step": 140 }, { "epoch": 0.38, "grad_norm": 0.2985605001449585, "learning_rate": 0.0003487179487179487, "loss": 0.3823, "step": 150 }, { "epoch": 0.41, "grad_norm": 0.28738752007484436, "learning_rate": 0.00034529914529914536, "loss": 0.375, "step": 160 }, { "epoch": 0.44, "grad_norm": 0.29423144459724426, "learning_rate": 0.0003418803418803419, "loss": 0.3591, "step": 170 }, { "epoch": 0.46, "grad_norm": 0.26430046558380127, "learning_rate": 0.00033846153846153846, "loss": 0.3494, "step": 180 }, { "epoch": 0.49, "grad_norm": 0.2734215259552002, "learning_rate": 0.0003350427350427351, "loss": 0.3396, "step": 190 }, { "epoch": 0.51, "grad_norm": 0.3005197048187256, "learning_rate": 0.00033162393162393166, "loss": 0.3352, "step": 200 }, { "epoch": 0.54, "grad_norm": 0.2822723686695099, "learning_rate": 0.0003282051282051282, "loss": 0.3241, "step": 210 }, { "epoch": 0.56, "grad_norm": 0.2792316973209381, "learning_rate": 0.0003247863247863248, "loss": 0.3208, "step": 220 }, { "epoch": 0.59, "grad_norm": 0.2761669158935547, "learning_rate": 0.0003213675213675214, "loss": 0.3148, "step": 230 }, { "epoch": 0.61, "grad_norm": 0.2733113467693329, "learning_rate": 0.0003179487179487179, "loss": 0.311, "step": 240 }, { "epoch": 0.64, "grad_norm": 0.3393694758415222, "learning_rate": 0.00031452991452991455, "loss": 0.3056, "step": 250 }, { "epoch": 0.67, "grad_norm": 0.29316985607147217, "learning_rate": 0.0003111111111111111, "loss": 0.2975, "step": 260 }, { "epoch": 0.69, "grad_norm": 0.29134783148765564, "learning_rate": 0.0003076923076923077, "loss": 0.2898, "step": 270 }, { "epoch": 0.72, "grad_norm": 0.41234660148620605, "learning_rate": 
0.0003042735042735043, "loss": 0.2895, "step": 280 }, { "epoch": 0.74, "grad_norm": 0.26693716645240784, "learning_rate": 0.00030085470085470086, "loss": 0.2835, "step": 290 }, { "epoch": 0.77, "grad_norm": 0.2862294614315033, "learning_rate": 0.00029743589743589743, "loss": 0.2747, "step": 300 }, { "epoch": 0.79, "grad_norm": 0.2596249282360077, "learning_rate": 0.00029401709401709406, "loss": 0.2752, "step": 310 }, { "epoch": 0.82, "grad_norm": 0.2555866241455078, "learning_rate": 0.0002905982905982906, "loss": 0.2662, "step": 320 }, { "epoch": 0.84, "grad_norm": 0.3845195472240448, "learning_rate": 0.0002871794871794872, "loss": 0.2625, "step": 330 }, { "epoch": 0.87, "grad_norm": 0.23550209403038025, "learning_rate": 0.0002837606837606838, "loss": 0.256, "step": 340 }, { "epoch": 0.9, "grad_norm": 0.2434936910867691, "learning_rate": 0.0002803418803418803, "loss": 0.2545, "step": 350 }, { "epoch": 0.92, "grad_norm": 0.23562268912792206, "learning_rate": 0.00027692307692307695, "loss": 0.2536, "step": 360 }, { "epoch": 0.95, "grad_norm": 0.3110085427761078, "learning_rate": 0.0002735042735042735, "loss": 0.2497, "step": 370 }, { "epoch": 0.97, "grad_norm": 0.2646142244338989, "learning_rate": 0.0002700854700854701, "loss": 0.2448, "step": 380 }, { "epoch": 1.0, "grad_norm": 0.22812116146087646, "learning_rate": 0.0002666666666666667, "loss": 0.2409, "step": 390 }, { "epoch": 1.02, "grad_norm": 0.21481893956661224, "learning_rate": 0.00026324786324786326, "loss": 0.225, "step": 400 }, { "epoch": 1.05, "grad_norm": 0.2561526298522949, "learning_rate": 0.00025982905982905983, "loss": 0.2194, "step": 410 }, { "epoch": 1.08, "grad_norm": 0.2297515720129013, "learning_rate": 0.00025641025641025646, "loss": 0.216, "step": 420 }, { "epoch": 1.1, "grad_norm": 0.25526463985443115, "learning_rate": 0.000252991452991453, "loss": 0.2171, "step": 430 }, { "epoch": 1.13, "grad_norm": 0.24202637374401093, "learning_rate": 0.00024957264957264956, "loss": 0.2149, "step": 440 }, { "epoch": 1.15, "grad_norm": 0.20644807815551758, "learning_rate": 0.0002461538461538462, "loss": 0.209, "step": 450 }, { "epoch": 1.18, "grad_norm": 0.2795998454093933, "learning_rate": 0.00024273504273504272, "loss": 0.2071, "step": 460 }, { "epoch": 1.2, "grad_norm": 0.306149959564209, "learning_rate": 0.00023931623931623932, "loss": 0.208, "step": 470 }, { "epoch": 1.23, "grad_norm": 0.2355523407459259, "learning_rate": 0.00023589743589743593, "loss": 0.2051, "step": 480 }, { "epoch": 1.25, "grad_norm": 0.2909263074398041, "learning_rate": 0.0002324786324786325, "loss": 0.2023, "step": 490 }, { "epoch": 1.28, "grad_norm": 0.5298261642456055, "learning_rate": 0.00022905982905982905, "loss": 0.2018, "step": 500 }, { "epoch": 1.28, "eval_cer": 0.9319083335386478, "eval_loss": 0.17038685083389282, "eval_runtime": 138.0008, "eval_samples_per_second": 14.493, "eval_steps_per_second": 0.457, "step": 500 }, { "epoch": 1.31, "grad_norm": 0.23548871278762817, "learning_rate": 0.00022564102564102566, "loss": 0.2008, "step": 510 }, { "epoch": 1.33, "grad_norm": 0.20162977278232574, "learning_rate": 0.00022222222222222223, "loss": 0.1975, "step": 520 }, { "epoch": 1.36, "grad_norm": 0.2593408524990082, "learning_rate": 0.00021880341880341884, "loss": 0.1959, "step": 530 }, { "epoch": 1.38, "grad_norm": 0.21452312171459198, "learning_rate": 0.0002153846153846154, "loss": 0.194, "step": 540 }, { "epoch": 1.41, "grad_norm": 0.2637544274330139, "learning_rate": 0.000211965811965812, "loss": 0.1909, "step": 550 }, { "epoch": 1.43, "grad_norm": 
0.24357128143310547, "learning_rate": 0.00020854700854700857, "loss": 0.19, "step": 560 }, { "epoch": 1.46, "grad_norm": 0.2084117978811264, "learning_rate": 0.00020512820512820512, "loss": 0.1857, "step": 570 }, { "epoch": 1.48, "grad_norm": 0.23245294392108917, "learning_rate": 0.00020170940170940172, "loss": 0.1858, "step": 580 }, { "epoch": 1.51, "grad_norm": 0.23836293816566467, "learning_rate": 0.0001982905982905983, "loss": 0.183, "step": 590 }, { "epoch": 1.54, "grad_norm": 0.19184565544128418, "learning_rate": 0.00019487179487179487, "loss": 0.183, "step": 600 }, { "epoch": 1.56, "grad_norm": 0.20401564240455627, "learning_rate": 0.00019145299145299148, "loss": 0.1829, "step": 610 }, { "epoch": 1.59, "grad_norm": 0.21579188108444214, "learning_rate": 0.00018803418803418803, "loss": 0.1812, "step": 620 }, { "epoch": 1.61, "grad_norm": 0.23108145594596863, "learning_rate": 0.00018461538461538463, "loss": 0.1781, "step": 630 }, { "epoch": 1.64, "grad_norm": 0.2311713844537735, "learning_rate": 0.0001811965811965812, "loss": 0.1755, "step": 640 }, { "epoch": 1.66, "grad_norm": 0.19794794917106628, "learning_rate": 0.00017777777777777779, "loss": 0.1768, "step": 650 }, { "epoch": 1.69, "grad_norm": 0.2516119182109833, "learning_rate": 0.00017435897435897436, "loss": 0.1737, "step": 660 }, { "epoch": 1.72, "grad_norm": 0.20975567400455475, "learning_rate": 0.00017094017094017094, "loss": 0.1712, "step": 670 }, { "epoch": 1.74, "grad_norm": 0.22168505191802979, "learning_rate": 0.00016752136752136754, "loss": 0.1693, "step": 680 }, { "epoch": 1.77, "grad_norm": 0.22844062745571136, "learning_rate": 0.0001641025641025641, "loss": 0.168, "step": 690 }, { "epoch": 1.79, "grad_norm": 0.22804197669029236, "learning_rate": 0.0001606837606837607, "loss": 0.1721, "step": 700 }, { "epoch": 1.82, "grad_norm": 0.22620578110218048, "learning_rate": 0.00015726495726495727, "loss": 0.1703, "step": 710 }, { "epoch": 1.84, "grad_norm": 0.21445313096046448, "learning_rate": 0.00015384615384615385, "loss": 0.1673, "step": 720 }, { "epoch": 1.87, "grad_norm": 0.207479327917099, "learning_rate": 0.00015042735042735043, "loss": 0.1648, "step": 730 }, { "epoch": 1.89, "grad_norm": 0.22134087979793549, "learning_rate": 0.00014700854700854703, "loss": 0.1629, "step": 740 }, { "epoch": 1.92, "grad_norm": 0.20121484994888306, "learning_rate": 0.0001435897435897436, "loss": 0.1638, "step": 750 }, { "epoch": 1.95, "grad_norm": 0.2002618908882141, "learning_rate": 0.00014017094017094016, "loss": 0.1621, "step": 760 }, { "epoch": 1.97, "grad_norm": 0.19750453531742096, "learning_rate": 0.00013675213675213676, "loss": 0.1667, "step": 770 }, { "epoch": 2.0, "grad_norm": 0.22286508977413177, "learning_rate": 0.00013333333333333334, "loss": 0.1642, "step": 780 }, { "epoch": 2.02, "grad_norm": 0.21668635308742523, "learning_rate": 0.00012991452991452992, "loss": 0.1482, "step": 790 }, { "epoch": 2.05, "grad_norm": 0.233961820602417, "learning_rate": 0.0001264957264957265, "loss": 0.1453, "step": 800 }, { "epoch": 2.07, "grad_norm": 0.1865084022283554, "learning_rate": 0.0001230769230769231, "loss": 0.1455, "step": 810 }, { "epoch": 2.1, "grad_norm": 0.1853141337633133, "learning_rate": 0.00011965811965811966, "loss": 0.1442, "step": 820 }, { "epoch": 2.12, "grad_norm": 0.17371739447116852, "learning_rate": 0.00011623931623931625, "loss": 0.1382, "step": 830 }, { "epoch": 2.15, "grad_norm": 0.19631154835224152, "learning_rate": 0.00011282051282051283, "loss": 0.1415, "step": 840 }, { "epoch": 2.18, "grad_norm": 
0.194850891828537, "learning_rate": 0.00010940170940170942, "loss": 0.141, "step": 850 }, { "epoch": 2.2, "grad_norm": 0.18121449649333954, "learning_rate": 0.000105982905982906, "loss": 0.1388, "step": 860 }, { "epoch": 2.23, "grad_norm": 0.2176773101091385, "learning_rate": 0.00010256410256410256, "loss": 0.1399, "step": 870 }, { "epoch": 2.25, "grad_norm": 0.19013133645057678, "learning_rate": 9.914529914529915e-05, "loss": 0.137, "step": 880 }, { "epoch": 2.28, "grad_norm": 0.22148679196834564, "learning_rate": 9.572649572649574e-05, "loss": 0.139, "step": 890 }, { "epoch": 2.3, "grad_norm": 0.20861493051052094, "learning_rate": 9.230769230769232e-05, "loss": 0.139, "step": 900 }, { "epoch": 2.33, "grad_norm": 0.17541790008544922, "learning_rate": 8.888888888888889e-05, "loss": 0.1362, "step": 910 }, { "epoch": 2.36, "grad_norm": 0.1971459984779358, "learning_rate": 8.547008547008547e-05, "loss": 0.1346, "step": 920 }, { "epoch": 2.38, "grad_norm": 0.20883004367351532, "learning_rate": 8.205128205128205e-05, "loss": 0.1351, "step": 930 }, { "epoch": 2.41, "grad_norm": 0.18058577179908752, "learning_rate": 7.863247863247864e-05, "loss": 0.1363, "step": 940 }, { "epoch": 2.43, "grad_norm": 0.19193512201309204, "learning_rate": 7.521367521367521e-05, "loss": 0.1359, "step": 950 }, { "epoch": 2.46, "grad_norm": 0.17777132987976074, "learning_rate": 7.17948717948718e-05, "loss": 0.1363, "step": 960 }, { "epoch": 2.48, "grad_norm": 0.1730206310749054, "learning_rate": 6.837606837606838e-05, "loss": 0.1339, "step": 970 }, { "epoch": 2.51, "grad_norm": 0.172698512673378, "learning_rate": 6.495726495726496e-05, "loss": 0.1317, "step": 980 }, { "epoch": 2.53, "grad_norm": 0.1746242642402649, "learning_rate": 6.153846153846155e-05, "loss": 0.132, "step": 990 }, { "epoch": 2.56, "grad_norm": 0.1631608009338379, "learning_rate": 5.8119658119658126e-05, "loss": 0.1292, "step": 1000 }, { "epoch": 2.56, "eval_cer": 0.9320702386692806, "eval_loss": 0.11450555920600891, "eval_runtime": 136.638, "eval_samples_per_second": 14.637, "eval_steps_per_second": 0.461, "step": 1000 } ], "logging_steps": 10, "max_steps": 1170, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "total_flos": 6.737241075941376e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }
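
The JSON above is the trainer state written by the Hugging Face transformers Trainer: log_history holds one entry every logging_steps (10) with loss, learning_rate and grad_norm, plus an evaluation entry (eval_loss, eval_cer, runtime stats) every eval_steps (500), up to global_step 1000 of max_steps 1170. Below is a minimal sketch of how such a log can be inspected offline; it assumes the JSON is saved locally as trainer_state.json (the path is an assumption for illustration) and uses only the standard json module plus matplotlib for an optional plot.

# Minimal sketch, not part of the original training code.
# Assumes the JSON above has been saved as "trainer_state.json" in the working directory.
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

# Split log_history into training entries (carry "loss") and evaluation entries (carry "eval_loss").
train_logs = [e for e in state["log_history"] if "loss" in e]
eval_logs = [e for e in state["log_history"] if "eval_loss" in e]

steps = [e["step"] for e in train_logs]
losses = [e["loss"] for e in train_logs]
lrs = [e["learning_rate"] for e in train_logs]

print(f"logged {len(train_logs)} training points, {len(eval_logs)} evaluations")
for e in eval_logs:
    print(f'step {e["step"]}: eval_loss={e["eval_loss"]:.4f}, eval_cer={e["eval_cer"]:.4f}')

# Plot the training-loss curve and the linear learning-rate decay side by side.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
ax1.plot(steps, losses)
ax1.set_xlabel("step")
ax1.set_ylabel("training loss")
ax2.plot(steps, lrs)
ax2.set_xlabel("step")
ax2.set_ylabel("learning rate")
fig.tight_layout()
fig.savefig("training_curves.png")

As recorded in the log itself, the training loss falls from 6.09 at step 10 to 0.129 at step 1000, the learning rate decays linearly from roughly 4e-4 toward zero, and eval_loss improves from 0.170 at step 500 to 0.115 at step 1000.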